diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 4c49bd78f1d08f2bc09fa0bd8191ed38b7dce5e3..0000000000000000000000000000000000000000 --- a/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.env diff --git a/.gitmodules b/.gitmodules index f0372d9b826d75fd51e14f4177e8bab4ecaff0b9..3bc190175db8837a22f2b255a00f66176415ec9c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,15 @@ -[submodule "paddle"] - path = paddle - url = https://github.com/PaddlePaddle/Paddle.git -[submodule "book"] - path = book - url = https://github.com/PaddlePaddle/book.git +[submodule "external/Paddle"] + path = external/Paddle + url = https://github.com/PaddlePaddle/Paddle +[submodule "external/book"] + path = external/book + url = https://github.com/PaddlePaddle/book +[submodule "external/Anakin"] + path = external/Anakin + url = https://github.com/PaddlePaddle/Anakin +[submodule "external/paddle-mobile"] + path = external/paddle-mobile + url = https://github.com/PaddlePaddle/paddle-mobile +[submodule "external/models"] + path = external/models + url = https://github.com/PaddlePaddle/models diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000000000000000000000000000000000..3ee2d1d6610c80c3416991345b459acbff6c38fc --- /dev/null +++ b/.travis.yml @@ -0,0 +1,44 @@ +language: cpp +cache: + bundler: true + directories: + - $HOME/.ccache + - $HOME/.cache/pip + - $HOME/docker + - $TRAVIS_BUILD_DIR/external/Paddle/build/third_party + +sudo: required +dist: trusty +services: + - docker +os: + - linux + +addons: + apt: + packages: + - git + - python + - python-pip + - python2.7-dev + ssh_known_hosts: 13.229.163.131 +before_install: + - sudo pip install pylint pytest astroid isort + +before_install: + - sudo pip install pylint pytest astroid isort + # Force the script to be timed out after certain duration + - function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } + +jobs: + include: + # Force the deploy_docs.sh to time out after 40 minutes. + # Travis CI will terminate the build completely after 50 minutes and won't allow caching to happen. + # Time out the build preemptively to cache built libraries. + - script: timeout 2400 scripts/deploy_docs.sh full + name: Generate Docs + +notifications: + email: + on_success: change + on_failure: always diff --git a/Makefile b/Makefile deleted file mode 100644 index 36ee0a07dbab2ea7c9c1e7031ab3a871bcf1a008..0000000000000000000000000000000000000000 --- a/Makefile +++ /dev/null @@ -1,192 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = build - -# User-friendly check for sphinx-build -ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) -$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -endif - -# Internal variables. 
-PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source - -.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext - -help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " applehelp to make an Apple Help Book" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - @echo " coverage to run coverage check of the documentation (if enabled)" - -clean: - rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PaddlePaddleFluid.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PaddlePaddleFluid.qhc" - -applehelp: - $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp - @echo - @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." - @echo "N.B. 
You won't be able to view it unless you put it in" \ - "~/Library/Documentation/Help or install it in your application" \ - "bundle." - -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/PaddlePaddleFluid" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PaddlePaddleFluid" - @echo "# devhelp" - -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." - -coverage: - $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage - @echo "Testing of coverage in the sources finished, look at the " \ - "results in $(BUILDDIR)/coverage/python.txt." - -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
diff --git a/README.md b/README.md
index e744e193bd198d3f6f6b001f0a906c580173cd38..dc02d7c7b39d036321a80218ddaa0cd8cacc92b8 100644
--- a/README.md
+++ b/README.md
@@ -1,57 +1,25 @@
-# Fluid Documentation Skeleton
+# Introduction
+FluidDoc consolidates all the documentation related to Paddle. It supplies the content to PaddlePaddle.org via CI.
-## Build
+# Architecture
+FluidDoc includes Paddle, Book, Models, Mobile and Anakin as submodules under the `external` folder. All submodules should be placed under `external` as standard practice.
-To build documentation, you need have a linux machine and have python2, virtualenv, gmake installed.
+FluidDoc then uses them as references to load the documents. FluidDoc constructs the whole doc-tree under the `FluidDoc/doc/fluid` folder. The entry points are `FluidDoc/doc/fluid/index_cn.rst` and `FluidDoc/doc/fluid/index_en.rst`.
-### Preparation
+When a release branch is pushed to GitHub, Travis CI starts automatically to compile the documents and deploy them to the server.
-You need to create a `virtualenv` instead of polute the global python library path
+## Note:
+FluidDoc needs the Paddle python module to compile the API documents. Unfortunately, compiling the Paddle python module takes longer than Travis CI permits, so Travis CI will usually fail because of the timeout. That is why there are three jobs on Travis; two of them only build the libraries. Once the libraries are cached on Travis, the next build will be a lot faster.
-```bash
-virtualenv .env
-```
+## Preview with PPO
+To preview documents constructed by FluidDoc, please follow the [regular preview steps](https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md), but replace the path to paddle with the path to FluidDoc:
+`./runserver --paddle `
-You can enter virtualenv by
+# Publish New release
+1. Checkout a new release branch. The branch name should follow `release/`
+1. Update the documentation on the submodules or within FluidDoc
+1. Make sure all the submodules are ready for release. Paddle, book, model, mobile and Anakin should all have stable commits. Note: the Paddle repo should update the API RST files accordingly if Paddle changes the included modules/classes.
+1. Update the submodules under the `external` folder and commit the changes.
+1. Push the branch to GitHub; Travis CI will start several builds to publish the documents to the PaddlePaddle.org server
+1. Please notify the PaddlePaddle.org team that the release content is ready. The PaddlePaddle.org team should enable the version and update the default version to the latest one. PaddlePaddle.org should also update the search index accordingly (until the search server is up)
-```bash
-source .env/bin/activate
-```
-
-You can exit virtualenv by
-
-```bash
-deactivate
-```
-
-### Install dependencies
-
-```bash
-# enter virtualenv
-source .env/bin/activate
-# install dependencies
-pip install -r requirements.txt
-```
-
-### Make HTML
-
-```bash
-# make clean # make clean to regenerate toctree. Just `make html` may have a cache.
-make html
-```
-and the html files will be generated to `build/html`. You can open `build/html/index.html` with your browser to see the documentation.
-
-## Edit
-
-### Edit documentation
-
-It is suggested to use `reStructuredText` because it is the only official markup language supportted by our documentation generating system, sphinx. `markdown` can also be used. However, since the `markdown` has so many dialects, there is no guarantee that the `markdown` source file can be rendered well.
- -The `reStructuredText` cheatsheet is [here](http://docutils.sourceforge.net/docs/user/rst/quickref.html). - - -### Edit structure - -The `sphinx` (our documentation generating system) uses `toctree` to organize documentation. `toctree` means `table of content tree`. - -Please see the [sphinx documentation](http://www.sphinx-doc.org/en/master/), especially [`toctree` directives](http://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html) diff --git a/book b/book deleted file mode 160000 index f4b5cc835ef77e55cfc001d51f8f77565475dc45..0000000000000000000000000000000000000000 --- a/book +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f4b5cc835ef77e55cfc001d51f8f77565475dc45 diff --git a/build/.gitignore b/build/.gitignore deleted file mode 100644 index 72e8ffc0db8aad71a934dd11e5968bd5109e54b4..0000000000000000000000000000000000000000 --- a/build/.gitignore +++ /dev/null @@ -1 +0,0 @@ -* diff --git a/doc/about/about_us.rst b/doc/about/about_us.rst new file mode 100644 index 0000000000000000000000000000000000000000..f67d8b8130030db8d7e7d10b30271a913bd6272a --- /dev/null +++ b/doc/about/about_us.rst @@ -0,0 +1,53 @@ +========= +关于我们 +========= + +什么是PaddlePaddle +-------------------- + +- PaddlePaddle是百度自主研发并开源的深度学习框架,它能够让开发者和企业安全、快速地实现自己的AI想法 + +- 项目团队汇聚了全球顶级的深度学习科学家,致力于为开发者和企业提供最好的深度学习研发体验 + +- 框架具有易学、易用、安全、高效四大特性,是最适合中国开发者和企业的深度学习工具 + +PaddlePaddle的技术特色 +------------------------- + +- 新一代深度学习框架: PaddlePaddle是基于“深度学习编程语言”的新一代深度学习框架,在保证性能的同时,极大的提升了框架对模型的表达能力,能够描述任意潜在可能出现的模型 + +- 对大规模计算更加友好:经过百度内多种大规模计算业务的打磨,PaddlePaddle在分布式计算上表现优异,基于EDL技术能够节约大量计算资源,同时也能支持大规模稀疏模型的训练 + +- 提供可视化的深度学习:通过Visual DL可以帮助开发者方便的观测训练整体趋势、数据样本质量和中间结果、参数分布和变化趋势、以及模型的结构,帮助开发者更便捷的完成编程过程 + +提供基于PaddlePaddle的教育体系 +-------------------------------- + +- 深度学习课程:百度与中国市场顶级的教育、培训机构共同开发了深度学习精品课程以及学习教材,帮助开发者从零掌握深度学习 + +- 深度学习实训:对于目的是科研和学习的用户,PaddlePaddle提供了无需安装、线上运行的开发环境,并提供算法、算力、数据支持 + +- 线下培训:提供丰富、高质量的线下教育活动,如青年教师培训、线下实战营、沙龙等多种形式的培训和交流 + + +提供基于PaddlePaddle的AI服务 +------------------------------ + +- EadyDL:可以帮助零算法基础的企业快速完成一个深度学习任务,只需少量的数据即可得到优质的模型 + +- AI市场:提供标准化的AI 能力、产品的交易机制,帮助企业快速找到所需,有效开展AI业务 + +- 深度学习竞赛: PaddlePaddle汇聚顶尖深度学习开发者,企业可以发布自己的商业问题,通过竞赛方式快速找到最优的解决方案 + +你对PaddlePaddle有任何的问题都可以通过以下方式联系到我们 +----------------------------------------------------------- + +- 学习/使用问题:可以在 `PaddlePaddle开源社区 `_,以及 `PaddlePaddle中文社区 `_ 向我们反馈 + +- 对PaddlePaddle框架发展的建议:可发送邮件至Paddle-better@baidu.com + +我们期待与你一起打造世界顶级深度学习框架,共同推动AI技术的进步 + + + +PaddlePaddle团队 diff --git a/doc/fluid/advanced_usage/deploy/anakin_arm_benchmark.md b/doc/fluid/advanced_usage/deploy/anakin_arm_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..e8701b2b54d96c104e6df13f28a0c028b1ca8d16 --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/anakin_arm_benchmark.md @@ -0,0 +1,56 @@ +# Anakin ARM 性能测试 + +## 测试环境和参数: ++ 测试模型Mobilenetv1, mobilenetv2, mobilenet-ssd ++ 采用android ndk交叉编译,gcc 4.9,enable neon, ABI: armveabi-v7a with neon -mfloat-abi=softfp ++ 测试平台 + - 荣耀v9(root): 处理器:麒麟960, 4 big cores in 2.36GHz, 4 little cores in 1.8GHz + - nubia z17:处理器:高通835, 4 big cores in 2.36GHz, 4 little cores in 1.9GHz + - 360 N5:处理器:高通653, 4 big cores in 1.8GHz, 4 little cores in 1.4GHz ++ 多线程:openmp ++ 时间:warmup10次,运行10次取均值 ++ ncnn版本:来源于github的master branch中commits ID:307a77f04be29875f40d337cfff6df747df09de6(msg:convert LogisticRegressionOutput)版本 ++ TFlite版本:来源于github的master branch中commits ID:65c05bc2ac19f51f7027e66350bc71652662125c(msg:Removed unneeded file copy that was causing failure in Pi builds)版本 + 
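下面给出一个示意性的上机测试流程(假设测试机已开启 adb 调试、`/data/local/tmp` 目录可写;`benchmark_arm` 各参数的具体含义请以不带参数运行 `./benchmark_arm` 时打印的帮助信息为准):

```bash
# 将转换好的 Anakin 模型与交叉编译出的 benchmark_arm 推送到测试机
adb push anakin_model.anakin.bin /data/local/tmp/
adb push benchmark_arm /data/local/tmp/

# 在测试机上运行基准测试(命令与参数取自下文 "How to run those Benchmark models" 一节)
adb shell "cd /data/local/tmp && chmod +x benchmark_arm && \
    ./benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1"
```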
+在BenchMark中本文将使用**`ncnn`**、**`TFlite`**和**`Anakin`**进行性能对比分析 + +## BenchMark model + +> 注意在性能测试之前,请先将测试model通过[External Converter](#10003)转换为Anakin model +> 对这些model,本文在ARM上进行多线程的单batch size测试。 + +- [Mobilenet v1](#11) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载* +- [Mobilenet v2](#22) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载* +- [mobilenet-ssd](#33) *caffe model 可以在[这儿](https://github.com/chuanqi305/MobileNet-SSD)下载* + +### mobilenetv1 + + |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| + |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| + |麒麟960|107.7ms|61.1ms|38.2ms|152.8ms|85.2ms|51.9ms|152.6ms|nan|nan| + |高通835|105.7ms|63.1ms|~~46.8ms~~|152.7ms|87.0ms|~~92.7ms~~|146.9ms|nan|nan| + |高通653|120.3ms|64.2ms|46.6ms|202.5ms|117.6ms|84.8ms|158.6ms|nan|nan| + +### mobilenetv2 + + |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| + |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| + |麒麟960|93.1ms|53.9ms|34.8ms|144.4ms|84.3ms|55.3ms|100.6ms|nan|nan| + |高通835|93.0ms|55.6ms|41.1ms|139.1ms|88.4ms|58.1ms|95.2ms|nan|nan| + |高通653|106.6ms|64.2ms|48.0ms|199.9ms|125.1ms|98.9ms|108.5ms|nan|nan| + +### mobilenet-ssd + + |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)| + |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| + |麒麟960|213.9ms|120.5ms|74.5ms|307.9ms|166.5ms|104.2ms|nan|nan|nan| + |高通835|213.0ms|125.7ms|~~98.4ms~~|292.9ms|177.9ms|~~167.8ms~~|nan|nan|nan| + |高通653|236.0ms|129.6ms|96.0ms|377.7ms|228.9ms|165.0ms|nan|nan|nan + +## How to run those Benchmark models? + + 1. 首先, 使用[External Converter](./convert_paddle_to_anakin.html)对caffe model 进行转换 + 2. 然后将转换后的Anakin model和编译好的benchmark_arm 二进制文件通过'adb push'命令上传至测试机 + 3. 接着在测试机含有Anakin model的目录中运行'./benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1' 命令 + 4. 最后,终端显示器上将会打印该模型的运行时间 + 5. 
其中运行命令的参数个数和含义可以通过运行'./benchmark_arm'看到 diff --git a/doc/fluid/advanced_usage/deploy/anakin_example.md b/doc/fluid/advanced_usage/deploy/anakin_example.md new file mode 100644 index 0000000000000000000000000000000000000000..3cd684982e96077fefa7dd7a3d8a0e79a428f5d1 --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/anakin_example.md @@ -0,0 +1,38 @@ +# Anakin 运行模型示例 + +Anakin目前只支持NCHW的格式 + +示例文件在test/framework/net下 + +## 在NV的GPU上运行CNN模型 + +示例文件为打开example_nv_cnn_net.cpp,整体流程如下: + +- 将模型的的path设置为anakin模型的路径,初始化NV平台的图对象。 anakin模型可以通过转换器转化caffe或Paddle的模型得到 +- 根据模型设置网络图的输入尺寸,进行图优化 +- 根据优化后的网络图初始化网络执行器 +- 取出网络的输入tensor,将数据拷贝到输入tensor +- 运行推导 +- 取出网络的输出tensor + +以NV平台为例演示Anakin框架的使用方法,注意编译时需要打开GPU编译开关 + +## 在X86上运行RNN模型 + +示例文件为example_x86_rnn_net.cpp + +整体流程与在NV的GPU上运行CNN模型相似,不同之处如下: + +- 使用X86标识初始化图对象和网络执行器对象 +- rnn模型的输入尺寸是可变的,初始化图时的输入维度是维度的最大值,输入维度N代表总的词的个数。还需要设置输入tensor的seq_offset来标示这些词是如何划分为句子的,如{0,5,12}表示共有12个词,其中第0到第4个词是第一句话,第5到第11个词是第二句话 + +以X86平台为例演示Anakin框架的使用方法,注意编译时需要打开X86编译开关 + +## 在NV的GPU上使用Anakin的线程池运行CNN模型 + +示例文件为example_nv_cnn_net_multi_thread.cpp ,示例使用worker的同步预测接口 + +整体流程与在NV的GPU上运行CNN模型相似,不同之处如下: + +- 用模型地址和线程池大小初始化worker对象 +- 将输入tensor注入任务队列,获得输出tensor diff --git a/doc/fluid/advanced_usage/deploy/anakin_gpu_benchmark.md b/doc/fluid/advanced_usage/deploy/anakin_gpu_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..72a5d50d99c982aa29ebb1fdbc55cd836aabce53 --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/anakin_gpu_benchmark.md @@ -0,0 +1,165 @@ +# Anakin GPU 性能测试 + +## 环境: + +> CPU: `12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz` +> GPU: `Tesla P4` +> cuDNN: `v7` + + +## anakin 对比对象: + +**`Anakin`** 将与高性能的推理引擎 **`NVIDIA TensorRT 3`** 进行比较 + +## Benchmark Model + +> 注意在性能测试之前,请先将测试model通过 `External Converter` 工具转换为Anakin model +> 对这些model,本文在GPU上进行单线程单GPU卡的性能测试。 + +- [Vgg16](#1) *caffe model 可以在[这儿](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)下载* +- [Yolo](#2) *caffe model 可以在[这儿](https://github.com/hojel/caffe-yolo-model)下载* +- [Resnet50](#3) *caffe model 可以在[这儿](https://github.com/KaimingHe/deep-residual-networks#models)下载* +- [Resnet101](#4) *caffe model 可以在[这儿](https://github.com/KaimingHe/deep-residual-networks#models)下载* +- [Mobilenet v1](#5) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载* +- [Mobilenet v2](#6) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载* +- [RNN](#7) *暂不支持* + +### VGG16 + +- Latency (`ms`) of different batch + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 8.8690 | 8.2815 | +| 2 | 15.5344 | 13.9116 | +| 4 | 26.6000 | 21.8747 | +| 8 | 49.8279 | 40.4076 | +| 32 | 188.6270 | 163.7660 | + +- GPU Memory Used (`MB`) + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 963 | 997 | +| 2 | 965 | 1039 | +| 4 | 991 | 1115 | +| 8 | 1067 | 1269 | +| 32 | 1715 | 2193 | + + +### Yolo + +- Latency (`ms`) of different batch + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 16.4596| 15.2124 | +| 2 | 26.6347| 25.0442 | +| 4 | 43.3695| 43.5017 | +| 8 | 80.9139 | 80.9880 | +| 32 | 293.8080| 310.8810 | + +- GPU Memory Used (`MB`) + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 1569 | 1775 | +| 2 | 1649 | 1815 | +| 4 | 1709 | 1887 | +| 8 | 1731 | 2031 | +| 32 | 2253 | 2907 | + +### Resnet50 + +- Latency (`ms`) of different batch + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 4.2459 | 4.1061 | +| 2 | 6.2627 | 6.5159 | +| 4 | 10.1277 | 11.3327 | +| 8 | 17.8209 | 20.6680 | +| 32 | 65.8582 | 77.8858 | + +- GPU Memory 
Used (`MB`) + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 531 | 503 | +| 2 | 543 | 517 | +| 4 | 583 | 541 | +| 8 | 611 | 589 | +| 32 | 809 | 879 | + +### Resnet101 + +- Latency (`ms`) of different batch + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 7.5562 | 7.0837 | +| 2 | 11.6023 | 11.4079 | +| 4 | 18.3650 | 20.0493 | +| 8 | 32.7632 | 36.0648 | +| 32 | 123.2550 | 135.4880 | + +- GPU Memory Used (`MB)` + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 701 | 683 | +| 2 | 713 | 697 | +| 4 | 793 | 721 | +| 8 | 819 | 769 | +| 32 | 1043 | 1059 | + +### MobileNet V1 + +- Latency (`ms`) of different batch + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 45.5156 | 1.3947 | +| 2 | 46.5585 | 2.5483 | +| 4 | 48.4242 | 4.3404 | +| 8 | 52.7957 | 8.1513 | +| 32 | 83.2519 | 31.3178 | + +- GPU Memory Used (`MB`) + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 329 | 283 | +| 2 | 345 | 289 | +| 4 | 371 | 299 | +| 8 | 393 | 319 | +| 32 | 531 | 433 | + +### MobileNet V2 + +- Latency (`ms`) of different batch + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 65.6861 | 2.9842 | +| 2 | 66.6814 | 4.7472 | +| 4 | 69.7114 | 7.4163 | +| 8 | 76.1092 | 12.8779 | +| 32 | 124.9810 | 47.2142 | + +- GPU Memory Used (`MB`) + +| BatchSize | TensorRT | Anakin | +| --- | --- | --- | +| 1 | 341 | 293 | +| 2 | 353 | 301 | +| 4 | 385 | 319 | +| 8 | 421 | 351 | +| 32 | 637 | 551 | + +## How to run those Benchmark models + +1. 首先, 使用[External Converter](./convert_paddle_to_anakin.html)对caffe model 进行转换 +2. 然后跳转至 *source_root/benchmark/CNN* 目录下,使用 'mkdir ./models'创建存放模型的目录,并将转换好的Anakin模型放在该目录下 +3. 运行脚本 `sh run.sh`,运行结束后,该模型的运行时间将会显示到终端上 +4. 如果你想获取每层OP的运行时间,你只用将 CMakeLists.txt 中的`ENABLE_OP_TIMER` 设置为 `YES` 即可 diff --git a/doc/fluid/advanced_usage/deploy/anakin_parser_design.md b/doc/fluid/advanced_usage/deploy/anakin_parser_design.md new file mode 100644 index 0000000000000000000000000000000000000000..e2ec0c68dea031bf50c3adb37a7795a7f380eca0 --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/anakin_parser_design.md @@ -0,0 +1,92 @@ +# Parser的编写指南 + + Parser是一种网络框架转换工具,将其他框架如Caffe、TensorFlow的网络结构转换为Anakin网络结构图,然后对转换后的Anakin图进行预测处理 + + 本文主要介绍Parser功能的框架结构和根据已有的网络框架改写Parser,以解析得到Anakin框架图,进行Anakin预测 + + 下文称Anakin为AK,运算操作为OP,本文参考TensorFlow的Parser编写,参考代码目录为tools/external_converter_v2/parser/tensorflow + +## Parser的功能和执行流程 + + Parser功能是将其他深度学习框架(如Caffe,TensorFlow,ONNX)的模型转换为AK的模型 + + 对AK的作用是屏蔽不同框架间的差异,这种差异包括模型存储、OP的定义、图差异 + + 因此Parser的执行流程是: + + - 将源框架的模型载入Parser + - 将原框架的图解析为AK中的OP节点和OP节点的连接关系 + - 进行OP定义的转换和图优化 + - 将符合AK标准的图写入protobuf + +## Parser的目录结构 + + Parser工具在tools/external_converter_v2/parser目录下 + + Parser的目录主要包含3部分: + + - Parser的运行配置文件包括 config.py, config.yaml, converter.py, 用户只用执行converter.py,Parser就会按照config.yaml中的声明去解析模型 + - Parser的公共定义,包括operations,pbs,proto三个目录。Parser的公共工具函数 graph*.py logger.py utils.py + - 各个框架对应的Parser,其目录的命名方式为框架名,如Caffe, TensorFlow + +## Parser的编写流程 + +### 1、声明你的Parser + + - 在config.yaml中填写你的Parser运行的必要信息,包括ProtoPath和SavePath等。OPTIONS/Framework改为你的Parser的类型,TARGET下填写对应的参数列表 + - 添加你的Parser目录,如TensorFlow,导出你的Parser符号。注意,Parser的框架默认调用你的Parser类中的__call__方法来执行解析,这个方法需要返回填写完毕的GraphProtoIO对象 + - 在config.py中Configuration下__init__函数中增加对你的Parser的调用,将yaml中读取的配置信息传给你的Parser,此处调用你的Parser中的__init__方法 + +### 2、添加你的Parser主体 + + 可以参考parser_tf.py + + - 你需要在Parser主体构造时获取模型路径,input,ouput名字等解析必须的信息 + - 在__call__中返回填写好的GraphProtoIO对象,该对象为填写protobuf的辅助工具 + - 
建议Parser的解析过程分成三部分,先将原框架的模型载入并转换为一种便于修改的中间的图形式;对中间图修改使得图满足AK的要求;将满足要求的中间图利用NodeProtoIO和GraphProtoIO这两个辅助类填入protobuf,具体细节可以参考parser_tf + +### 3、读取原始模型,并将模型转换为中间类型 + + 可以参考parse_tf_2_med.py + + - 这一步与原始框架结合紧密,你可能需要import原始框架的工具函数来完成模型的裁剪、固定、加载等操作 + - 大部分的框架都是使用tensor来连接OP的,但AK中是OP直接相连,这点需要注意 + - AK的shape默认是4维的,有的参数的shape不足4维,需要Parser补全 + +### 4、对中间类型的图进行优化 + + 可以参考med_graph.py + + - 由于AK不支持普通OP多输出的情况,需要在多输出的OP后面补上Splite类型的OP节点 + - 对于Convlution后接Batchnorm这种可以合并又不会导致OP定义改变的情况,需要Parser在这一步做掉 + - AK规定所有的输入类型OP的名字必须是input_x这种命名方式,其中x为从0开始的数字 + +### 5、将中间类型的图以GraphProtoIO的方式保存 + + 可以参考parse_med_2_ak.py 和 parser_tf.py + + - 你首先需要构造Node节点,Node节点的名字是OP的名字(如conv2d_1_a_0),Node节点中OP成员变量的名字是Node节点的类型(如Convlution) + - Node节点需要按照输入的顺序用Node的add_in方法填写输入Node的名字,add_out方法按顺序填写输出Node的名字 + - 通过调用GraphProtoIO的add_node方法将构造好的Node的__call__方法的返回值作为参数,将Node节点加入AK的graph中 + - 调用GraphProtoIO的add_in_edge和add_out_edge完成AK图中OP间关系的构建。如果Node中的in和out填写正确,你也可以通过调用GraphProtoIO的format_edge_from_nodes方法完成这个工作 + - AK的模型需要Parser给出输出Node的名字,使用GraphProtoIO的add_out方法填写输出Node的名字 + +### 6、检查模型解析的正确性 + + - 默认的config.yaml配置会在解析结束后启动一个web服务器展示解析后的AK模型图,你需要对比原框架的模型图进行验证。这里最容易出现的错误是边关系的错误,表现为图非常乱,你需要逐条边地检查错误;第二个容易出错的地方是参数漏填,需要你检查OP中的属性 + - 将解析后的模型放入AK中执行,使用相同的输入,原框架与AK有相同的输出。若果输出不一致可以开启AK的DEBUG模式,在net.cpp中将没层的输出打印;如果AK在解析阶段陷入死循环,大概率是边的关系出错 + +## 如何添加新OP + + - 需要在AK代码中加入该OP的实现,包括对应设备Saber的OP,Saber单测和Framework中的OP + - 根据Framework的OP在ops.py中添加Parser公共的OP定义 + - 从原框架的模型中解析出该OP的节点,并在AK的graph中填入该OP节点 + +## AK模型与其他框架模型的不同之处 + + + AK模型与caffe的模型相似,因此与其他模型有很多不同的地方,需要Parser在解析过程中处理掉 + + 最大的不同是与PaddlePaddle或TensorFlow的模型中OP粒度很细,而AK的模型中OP的粒度很粗(目的是为了节省访存开销)。这会导致解析这些框架的模型时存在大量的合并操作 + + 其次是OP的行为不同,如TensorFlow中Pooling默认都是exclusive的,而AK中是inclusive的。TensorFlow的Padding,如果是奇数pad,则在右方和下方多pad,而AK是在左方和上方多Pad + + AK默认的布局是NCHW,如果其他框架的OP是其他形式的,需要在Parser中做weights的布局转换,并处理reshape的问题 + + AK中有的weights是需要预先做布局转换的(如GRU,LSTM),AK中也支持同一OP的不同算法,如(GRU,Pooling) + diff --git a/doc/fluid/advanced_usage/deploy/anakin_run_on_arm.md b/doc/fluid/advanced_usage/deploy/anakin_run_on_arm.md new file mode 100644 index 0000000000000000000000000000000000000000..cdebd4ae090668ea2f4d417da99f7e50e34e323e --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/anakin_run_on_arm.md @@ -0,0 +1,193 @@ +## ARM 源码编译 Anakin ## + +目前Anakin支持ARM Android平台,采用Android NDK交叉编译工具链,已在mac os和centos上编译和测试通过。 + +### 安装概览 ### + +* [系统需求](#0001) +* [安装第三方依赖](#0002) +* [Anakin源码编译](#0003) +* [验证安装](#0004) + + +### 1. 系统需求 ### + +* 宿主机: linux, mac +* cmake 3.8.2+ +* Android NDK r14, Linux 版本[从这里下载](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip) + +### 2. 
安装第三方依赖 ### + +- 2.1 protobuf3.4.0 + + 源码从这里[下载](https://github.com/google/protobuf/releases/tag/v3.4.0) + + - 2.1.1 为宿主机编译protobuf + + ```bash + $ tar -xzf protobuf-3.4.0.tar.gz + $ cd protobuf-3.4.0 + $ ./autogen.sh + $ ./configure + $ make + $ make check + $ make install + ``` + + 上述 $make install 执行后,可在 `/usr/local/include/google` 找到 libprotobuf 所需的头文件,将整个google文件夹拷贝至Anakin/third-party/arm-android/protobuf/下, 然后将已经生成文件清除。 + + 如有问题,请点[这里](https://github.com/google/protobuf/blob/v3.4.0/src/README.md)。 + + ```bash + $ make distclean + ``` + + - 2.1.1 交叉编译Android`armeabi-v7a`的protobuf,注意设置ANDROID_NDK的路径,以及ARCH_ABI、HOSTOSN的值 + + ```bash + + $ export ANDROID_NDK=your_ndk_path + $ ARCH_ABI="arm-linux-androideabi-4.9" + $ HOSTOSN="darwin-x86_64" + $ export SYSROOT=$ANDROID_NDK/platforms/android-9/arch-arm + $ export PREBUILT=$ANDROID_NDK/toolchains/$ARCH_ABI + $ export LDFLAGS="--sysroot=$SYSROOT" + $ export LD="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/arm-linux-androideabi/bin/ld $LDFLAGS" + $ export LIBS="-llog $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/libgnustl_static.a" + $ export CPPFLAGS="" + $ export INCLUDES="-I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/include/ -I$ANDROID_NDK/platforms/android-9/arch-arm/usr/include/ -I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/include/" + $ export CXXFLAGS="-march=armv7-a -mfloat-abi=softfp -DGOOGLE_PROTOBUF_NO_RTTI --sysroot=$SYSROOT" + $ export CCFLAGS="$CXXFLAGS" + $ export CXX="$PREBUILT/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-g++ $CXXFLAGS" + $ export CC="$CXX" + $ export RANLIB="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-ranlib" + $ ./autogen.sh + $ ./configure --host=arm-linux-androideabi --with-sysroot=$SYSROOT --enable-cross-compile --with-protoc=protoc --disable-shared CXX="$CXX" CC="$CC" LD="$LD" + $ make + ``` + + 编译生成 *.a 静态库,若希望编译*.so 动态链接库 ,请在./configure参数中改--disable-shared为--disable-static --enable-shared + + 生成文件在`src/.libs/`下,将生成的文件拷贝至`Anakin/third-party/arm-android/protobuf/lib`下 + + 在[cmake](../../cmake/find_modules.cmake)中更新`ARM_RPOTO_ROOT`的路径。 + + ```cmake + set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf") + ``` + +- 2.2 opencv 2.4.3+(optional) + + Anakin只在examples示例中使用opencv + + Android系统的opencv从[这里下载](https://opencv.org/releases.html) + + 解压后将 `3rdparty/libs/armeabi-v7a`中的库文件拷贝到`libs/armeabi-v7a` + + 在[cmake](../../cmake/find_modules.cmake)中搜索`anakin_find_opencv` + + 并设置 `include_directories` 和 `LINK_DIRECTORIES`为自己安装的库的路径 + + ```cmake + include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/) + LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/) + ``` + +### 3. 
Anakin源码编译 ### + +#### 编译Android版本 + +克隆[源码](https://github.com/PaddlePaddle/Anakin/tree/arm) + +```bash + cd your_dir + git clone https://github.com/PaddlePaddle/Anakin.git + cd Anakin + git fetch origin arm + git checkout arm +``` + +修改`android_build.sh` + + - 修改NDK路径 + + ```bash + #modify "your_ndk_path" to your NDK path + export ANDROID_NDK=your_ndk_path + ``` + + - 修改ARM 处理器架构 + + 对于32位ARM处理器, 将ANDROID_ABI 设置为 `armeabi-v7a with NEON` + 对于64位ARM处理器, 可以将ANDROID_ABI 设置为 `armeabi-v7a with NEON`或者`arm64-v8a` + 目前我们只支持 `armeabi-v7a with NEON`;`arm64-v8a` 还在开发中 + + ```bash + -DANDROID_ABI="armeabi-v7a with NEON" + ``` + +- 设置Android API + + 根据Android系统的版本设置API level, 例如API Level 21 -> Android 5.0.1 + + ```bash + -DANDROID_NATIVE_API_LEVEL=21 + ``` + +- 选择编译静态库或动态库 + + 设置`BUILD_SHARED=NO`编译静态库 + 设置`BUILD_SHARED=YES`编译动态库 + + ```bash + -DBUILD_SHARED=NO + ``` + +- OpenMP多线程支持 + + 设置`USE_OPENMP=YES`开启OpenMP多线程 + + ```bash + -DUSE_OPENMP=YES + ``` + +- 编译单测文件 + + 设置`BUILD_WITH_UNIT_TEST=YES`将会编译单测文件 + + ```bash + -DBUILD_WITH_UNIT_TEST=YES + ``` + +- 编译示例文件 + + 设置`BUILD_EXAMPLES=YES`将会编译示例文件 + ```bash + -DBUILD_EXAMPLES=YES + ``` + +- 开启opencv + + 如果使用opencv,设置`USE_OPENCV=YES` + + ```bash + -DUSE_OPENCV=YES + ``` + +- 开始编译 + + 运行脚本 `android_build.sh` 将自动编译Anakin + + ```bash + ./android_build.sh + ``` + +### 4. 验证安装 ### + +编译好的库会放在目录`${Anakin_root}/output`下; + +编译好的单测文件会放在`${Anakin_root}/output/unit_test`目录下; + +编译好的示例文件会放在`${Anakin_root}/output/examples`目录下。 + +对于Android系统,打开设备的调试模式,通过ADB可以访问的目录是`data/local/tmp`,通过ADB push将测试文件、模型和数据发送到设备目录, 运行测试文件。 diff --git a/doc/fluid/advanced_usage/deploy/anakin_tutorial.md b/doc/fluid/advanced_usage/deploy/anakin_tutorial.md new file mode 100644 index 0000000000000000000000000000000000000000..befc969d99e23d0b27c9e56643175a1dd6e298f1 --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/anakin_tutorial.md @@ -0,0 +1,642 @@ +# Anakin 使用教程 ## + +本教程将会简略的介绍Anakin的工作原理,一些基本的Anakin API,以及如何调用这些API。 + +## 内容 ### + +- [Anakin的工作原理](#principle) +- [Anakin APIs](#api) +- [示例代码](#example) + +## Anakin的工作原理 ### + +![Anakin_principle](../pics/anakin_fm_ch.png) + +用Anakin来进行前向计算主要分为三个步骤: + + - 将外部模型通过[Anakin Parser](./convert_paddle_to_anakin.html)解析为Anakin模型 + 在使用Anakin之前,用户必须将所有其他模型转换成Anakin模型,我们提供了转换脚本,用户可通过[Anakin Parser](./convert_paddle_to_anakin.html)进行模型转换。 + - 生成Anakin计算图 + 加载Anakin模型生成原始计算图,然后需要对原始计算图进行优化。你只需要调用相应的API优化即可。 + - 执行计算图 + Anakin会选择不同硬件平台执行计算图。 + + +## Anakin APIs ### + +### Tensor #### + +`Tensor`提供基础的数据操作和管理,为ops提供统一的数据接口。`Tensor`包含以下几个属性: + +- Buffer + 数据存储区 +- Shape + 数据的维度信息 +- Event + 用于异步计算的同步 + +`Tensor`类包含三个`Shape`对象, 分别是`_shape`, `_valid_shape`和 `offset` + + - `_shape`为`tensor`真正空间信息 + - `_valid_shape`表示当前`tensor`使用的空间信息 + - `tensor`使用的空间信息 + - `_offset`表示当前`tensor`数据指针相对于真正数据空间的信息 + +`Tensor`不同维度与分别与数学中的向量、矩阵等相对应如下表所示 + +Dimentions | Math entity | +:----: | :----: +1 | vector +2 | matrix +3 | 3-tensor +n | n-tensor + +#### 声明tensor对象 + +`Tensor`接受三个模板参数: + + +```c++ + template + class Tensor .../* Inherit other class */{ + //some implements + ... + }; +``` + +TargetType是平台类型,如X86,GPU等等,在Anakin内部有相应的标识与之对应;datatype是普通的数据类型,在Anakin内部也有相应的标志与之对应 + +[LayOutType](#layout)是数据分布类型,如batch x channel x height x width [NxCxHxW], 在Anakin内部用一个struct来标识 + +Anakin中数据类型与基本数据类型的对应如下: + + 1. TargetType + + Anakin TargetType | platform + :----: | :----: + NV | NVIDIA GPU + ARM | ARM + AMD | AMD GPU + X86 | X86 + NVHX86 | NVIDIA GPU with Pinned Memory + + 2. 
DataType + + Anakin DataType | C++ | Description + :---: | :---: | :---: + AK_HALF | short | fp16 + AK_FLOAT | float | fp32 + AK_DOUBLE | double | fp64 + AK_INT8 | char | int8 + AK_INT16 | short | int16 + AK_INT32 | int | int32 + AK_INT64 | long | int64 + AK_UINT8 | unsigned char | uint8 + AK_UINT16 | unsigned short | uint8 + AK_UINT32 | unsigned int | uint32 + AK_STRING | std::string | / + AK_BOOL | bool | / + AK_SHAPE | / | Anakin Shape + AK_TENSOR | / | Anakin Tensor + + 3. LayOutType + + Anakin LayOutType ( Tensor LayOut ) | Tensor Dimention | Tensor Support | Op Support + :---: | :---: | :---: | :---: + W | 1-D | YES | NO + HW | 2-D | YES | NO + WH | 2-D | YES | NO + NW | 2-D | YES | YES + NHW | 3-D | YES |YES + NCHW ( default ) | 4-D | YES | YES + NHWC | 4-D | YES | NO + NCHW_C4 | 5-D | YES | YES + + 理论上,Anakin支持申明1维以上的tensor,但是对于Anakin中的Op来说,只支持NW、NHW、NCHW、NCHW_C4这四种LayOut,其中NCHW是默认的LayOuteType,NCHW_C4是专门针对于int8这种数据类型的。 + + **例子:** + +下面的代码将展示如何使用tensor, 我们建议先看看这些示例。 + +要想获得更多关于tensor的信息, 请参考 *soure_path/core/tensor.h* + +1. 使用shape对象初始化tensor + + ```cpp + //create a null tensor. A null tensor holds for nothing. + //tensor's buffer is resident at CPU and its datatype is AK_FLOAT. + //tensor's Layout is NCHW(default) + Tensor mytensor; + + //1. using shape object to create a tensor. + Shape shape1(NUM); //1-D shape. NUM is the number of dimention. + Tensor mytensor1(shape1); //1-D tensor. + + // A 4-D shape + Shape shape2(N, C, H, W); // batch x channel x height x width + ``` + + `注意:Shape的维度必须和tensor的`[LayoutType](#layout)`相同,比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW,否则会出错。如下列代码所示` + + ```c++ + // A 4-D tensor. + Tensor mytensor2(shape2); //right + + //A 4-D tensor which is resident at GPU and its datatype is AK_INT8 + Tensor mytensor3(shape2); //right + + Tensor mytensor4(shape2); //wrong!! shape's dimetion must be equal to tensor's Layout. + Tensor mytensor5(shape2); //wrong!!!! + ``` + +2. 使用现有的数据和shape初始化tensor + + ```c++ + /** + * A construtor of Tensor. + * data_ptr is a pointer to any data type of data + * TargetType is type of a platform [Anakin TargetType] + * id : device id + * shape: a Anakin shape + */ + Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape); + + //using existing data feed to a tensor + Tensor mytensor(data_ptr, TargetType, device_id, shape); //shape must has dimention (N, C, H, W). + ``` + +3. 使用tensor初始化tensor + + ```c++ + Tensor tensor(exist_tensor); + ``` + +>提示: 你可以用` typedef Tensor Tensor4d_X86 `方便定义tensor + +#### 填充tensor数据区 + +填充数据区得看你申明tensor的方式, 下面展示了如何填充tensor的数据区。 + +首先来看看tensor的四种声明方式: + +```c++ + 1. Tensor mytensor; + 2. Tensor mytensor1(shape1); + 3. Tensor mytensor(data_ptr, TargetType, device_id, shape); + 4. Tensor tensor(exist_tensor); +``` + +相关的声明方式的数据填充方法如下: + +- 声明一个空的tensor,此时没有为其分配内存,所以,我们需要手动的为其分配内存。 + +```c++ + + //parama shape + mytensor.re_alloc(Shape shape); + + //Get writable pointer to mytensor. + //parama index (int): where you start to write. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor.mutable_data(index/*=0*/); + //write data to mytensor + for(int i = 0; i < mytensor.size(); i++){ + p[i] = 1.0f; + } + //do something ... +``` + +- 这种声明方式会自动分配内存 + +```c++ + //Get writable pointer to mytensor. + //parama index (int): where you start to write. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor1.mutable_data(index/*=0*/); + //write data to mytensor + for(int i = 0; i < mytensor.size(); i++){ + p[i] = 1.0f; + } + //do something ... 
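    // 补充:size() 返回 d1 x d2 x ... x dn(对默认的 NCHW 布局即 N*C*H*W),
    // 因此上面的循环会写满整个有效数据区。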
+``` + +- 在该种声明方式中,我们仍不需要手动为其分配内存。但在构造函数内部是否为其分配内存,得依情况而定。如果data_ptr和申明的 + tensor都在都一个目标平台上,那么该tensor就会与data_ptr共享内存空间,相反,如果他们不在同一个平台上(如data_ptr在X86上,而 + tensor在GPU上),那么此时tensor就会开辟一个新的内存空间,并将data_ptr所指向的数据拷贝到tensor的buffer中。 + +```c++ + //Get writable pointer to mytensor. + //parama index (int): where you start to write. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor.mutable_data(index/*=0*/); + //write data to mytensor + for(int i = 0; i < mytensor.size(); i++){ + p[i] = 1.0f; + } + //do something ... +``` + +- 该种方式仍不需要手动分配内存 + +```c++ + //Get writable pointer to mytensor. + //parama index (int): where you start to write. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor.mutable_data(index/*=0*/); + //write data to mytensor + for(int i = 0; i < mytensor.size(); i++){ + p[i] = 1.0f; + } + //do something ... +``` + +- 另外,你还可以获取一个tensor的可读指针,示例如下: + +```c++ + //Get read-only pointer to mytensor. + //parama index (int): where you start to read. + //Dtype is your data type such int, float or double. + Dtype *p = mytensor.data(index/*=0*/); + //do something ... +``` + +如果想更详细的了解tensor,请查阅*soure_path/saber/core/tensor.h* + +#### 获取tensor的shape + +```c++ + //some declarations + // ... + Shape shape = mytensor.shape(); + + //Get a first dimetion size of tesor, if it has. + int d1 = shape[0]; + + //Get a second dimention size of tensor, if it has. + int d2 = shape[1]; + + ... + + //Get a n-th dimention size of tensor, if it has. + int dn = shape[n-1]; + + + //Get a tensor's dimention + int dims = mytensor.dims(); + + //Get the size of tensor. + //size = d1 x d2 x ... x dn. + int size = mytensor.size(); + + //Get the size of tensor at interval [Di, Dj) + // form i-th dimention to j-th dimention, but not including the j-th dimention. + // which means di x (di+1) x ... x (dj -1) + int size = mytensor.count(start, end); +``` + +#### 设置tensor的shape + +我们可以用tensor的成员函数set_shape来设置tensor的shape。 下面是set_shape的定义 + +```c++ + /** + * \brief set a tensor's shape + * \param valid_shape [a Shape object] + * \param shape [a Shape object] + * \param offset [a Shape object] + * \return the status of this operation, that means whether it success * or not. + */ + SaberStatus set_shape(Shape valid_shape, Shape shape = Shape::zero(TensorAPI::layout_dims::value), Shape offset = Shape::minusone(TensorAPI::layout_dims::value)); +``` + +这个成员函数只设置tensor的shape。这些shape对象(valid_shape, shape, offset)的[LayOutType](#layout)必须和当前的tensor的相应三个shape对象的LayOutType相同,如果不同就会出错,返回SaberInvalidValue。 如果相同,那么将成功设置tensor的shape。 + +```c++ + + // some declarations + // ... + //valid_shape, shape , offset are Shape object; + //All these Shape object's LayOutType must be equal to mytensor's. + mytensor.set_shape(valid_shape, shape, offset); + +``` + +#### 重置 tensor的shape + +```c++ + //some declarations + Shape shape, valid_shape, offset; + + //do some initializations + ... + mytensor.reshape(valid_shape, shape, offset); +``` + +注意: Reshape操作仍然需要shape的[LayOutType](#layout) 与tensor的相同 + +### Graph ### + +`Graph`类负责加载Anakin模型生成计算图、对图进行优化、存储模型等操作。 + +#### 图的声明 + +与`Tensor`一样,graph也接受三个模板参数。 + +```c++ + + template + class Graph ... /* inherit other class*/{ + + //some implements + ... + + }; +``` + +前面已经介绍过[TargetType](#target)和[DataType](#datatype)是Anakin内部自定义数据类型。[TargetType](#target)表示平台类型 (如NV、X86), [DataType](#datatype)是Anakin基本数据类型与C++/C中的基本数据类型相对应。 [Precision](#precision)为op所支持的精度类型, 稍后我们在介绍它。 + +```c++ + + //Create a empty graph object. 
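    // 三个模板参数依次为 TargetType、DataType、Precision,
    // 例如(示意,以 NV 设备、FP32 精度为例):Graph<NV, AK_FLOAT, Precision::FP32> graph;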
+ Graph graph = Graph tmp(); + + //Create a pointer to a empty graph. + Graph *graph = new Graph(); + + //Create a pointer to a empty graph. + auto graph = new Graph(); + +``` + +#### 加载 Anakin 模型 + +```c++ + //some declarations + ... + auto graph = new Graph(); + std::string model_path = "the/path/to/where/your/models/are"; + const char *model_path1 = "the/path/to/where/your/models/are"; + + //Loading Anakin model to generate a compute graph. + auto status = graph->load(model_path); + + //Or this way. + auto status = graph->load(model_path1); + //Check whether load operation success. + if(!status){ + std::cout << "error" << endl; + //do something... + } + +``` + +#### 优化计算图 + +```c++ + //some declarations + ... + //Load graph. + ... + //According to the ops of loaded graph, optimize compute graph. + graph->Optimize(); + +``` + +> 注意: 第一次加载原始图,必须要优化。 + +#### 保存模型 + +你可以在任何时候保存模型, 特别的, 你可以保存一个优化的模型,这样,下次再加载模型时,就不必进行优化操作。 + +```c++ + //some declarations + ... + //Load graph. + ... + // save a model + //save_model_path: the path to where your model is. + auto status = graph->save(save_model_path); + + //Checking + if(!status){ + cout << "error" << endl; + //do somethin... + } +``` + +#### 重新设置计算图里的tensor的shape + +```c++ + //some declarations + ... + //Load graph. + ... + vector shape{10, 256, 256, 10}; + //input_name : std::string. + //Reshape a tensor named input_name. + graph->Reshape(input_name, shape);//Note: shape is a vector, not a Shape object. +``` + +#### 设置 batch size + +`Graph` 支持重新设置batch size的大小。 + +```c++ + //some declarations + ... + //Load graph. + ... + //input_name : std::string. + //Reset a tensor named input_name. + int new_batch_size = 4; + graph->ResetBatchSize(input_name, new_batch_size); +``` + +### Net ### + + +`Net` 是计算图的执行器。你可以通过Net对象获得输入和输出 +#### Creating a graph executor + +`Net`接受四个模板参数。 + + +```c++ + template + class Net{ + //some implements + ... + + }; +``` +由于有些Op可能支持多种精度,我们可以通过Precision来指定。OpRunType表示同步或异步类型,异步是默认类型。OpRunType::SYNC表示同步,在GPU上只有单个流;OpRunType::ASYNC表示异步,在GPU上有多个流并以异步方式执行。实际上,Precision和OpRunType都是enum class, 详细设计请参考*source_root/framework/core/types.h*. + + +1. Precision + + Precision | Op support + :---: | :---: + Precision::INT4 | NO + Precision::INT8 | NO + Precision::FP16 | NO + Precision::FP32 | YES + Precision::FP64 | NO + +现在Op的精度只支持FP32, 但在将来我们会支持剩下的Precision. + +2. OpRunType + + OpRunType | Sync/Aync |Description + :---: | :---: | :---: + OpRunType::SYNC | Synchronization | single-stream on GPU + OpRunType::ASYNC | Asynchronization | multi-stream on GPU + +用graph对象创建一个执行器 + +```c++ + //some declarations + ... + //Create a pointer to a graph. + auto graph = new Graph(); + //do something... + ... + + //create a executor + Net executor(*graph); + +``` + +#### 获取输入输出tensor + +获取输入输出tensor,并填充输入tensor的buffer。如果想要获取输入和输出tensor,那么必须指定输入的名字,如"input_0", "input_1", "input_2", ..., 必须传入如上字符串才能够获得输入tensor。另外,如果想知道input_i对应哪个输入,你需要去dash board查看,如何使用dash board请看[Anakin Parser](./convert_paddle_to_anakin.html)。请看如下示例代码 + +```c++ + //some declaratinos + ... + + //create a executor + //TargetType is NV [NVIDIA GPU] + Net executor(*graph); + + //Get the first input tensor. + //The following tensors(tensor_in0, tensor_in2 ...) are resident at GPU. + //Note: Member function get_in returns an pointer to tensor. + Tensor* tensor_in0 = executor.get_in("input_0"); + + //If you have multiple input tensors + //You just type this code below. + Tensor* tensor_in1 = executor.get_in("input_1"); + ... 
+ auto tensor_inn = executor.get_in("input_n"); +``` + +当得到输入tensor之后,就可以填充它的数据区了。 + +```c++ + //This tensor is resident at GPU. + auto tensor_d_in = executor.get_in("input_0"); + + //If we want to feed above tensor, we must feed the tensor which is resident at host. And then copy the host tensor to the device's one. + + //using Tensor4d = Tensor; + Tensor4d tensor_h_in; //host tensor; + //Tensor tensor_h_in; + + //Allocate memory for host tensor. + tensor_h_in.re_alloc(tensor_d_in->valid_shape()); + //Get a writable pointer to tensor. + float *h_data = tensor_h_in.mutable_data(); + + //Feed your tensor. + /** example + for(int i = 0; i < tensor_h_in.size(); i++){ + h_data[i] = 1.0f; + } + */ + //Copy host tensor's data to device tensor. + tensor_d_in->copy_from(tensor_h_in); + + // And then +``` + +类似的,我们可以利用成员函数get_out来获得输出tensor。但与获得输入tensor不同的是, 我们需要指定输入tensor结点的名字,这个可以从dash board中看到,请从[Anakin Parser](./convert_paddle_to_anakin.html)中查看dash board的使用方法。假如有个输出结点叫pred_out, 那么我们可以通过如下代码获得相应的输出tensor: + +```c++ + //Note: this tensor are resident at GPU. + Tensor* tensor_out_d = executor.get_out("pred_out"); + +``` + +#### Executing graph + +当一切准备就绪后,我们就可以执行真正的计算了! +```c++ + executor.prediction(); +``` + +## 示例代码 ## + +下面的例子展示了如何调用Anakin。 + +在这儿之前, 请确保你已经有了Anakin模型。如果还没有,那么请使用[Anakin Parser](./convert_paddle_to_anakin.html)转换你的模型。 + +### Single-thread + +单线程例子在 *`source_root/test/framework/net/net_exec_test.cpp`* + +```c++ + + std::string model_path = "your_Anakin_models/xxxxx.anakin.bin"; + // Create an empty graph object. + auto graph = new Graph(); + // Load Anakin model. + auto status = graph->load(model_path); + if(!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + // Reshape + graph->Reshape("input_0", {10, 384, 960, 10}); + // You must optimize graph for the first time. + graph->Optimize(); + // Create a executer. + Net net_executer(*graph); + + //Get your input tensors through some specific string such as "input_0", "input_1", and + //so on. + //And then, feed the input tensor. + //If you don't know Which input do these specific string ("input_0", "input_1") correspond with, you can launch dash board to find out. + auto d_tensor_in_p = net_executer.get_in("input_0"); + Tensor4d h_tensor_in; + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i=0; icopy_from(h_tensor_in); + + //Do inference. + net_executer.prediction(); + + //Get result tensor through the name of output node. + //And also, you need to see the dash board again to find out how many output nodes are and remember their name. + + //For example, you've got a output node named obj_pre_out + //Then, you can get an output tensor. + auto d_tensor_out_0_p = net_executer.get_out("obj_pred_out"); //get_out returns a pointer to output tensor. + auto d_tensor_out_1_p = net_executer.get_out("lc_pred_out"); //get_out returns a pointer to output tensor. + //...... + // do something else ... + //... + //save model. + //You might not optimize the graph when you load the saved model again. 
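    //(示意)之后重新加载该保存的模型时,只需调用 graph->load(save_model_path),
    //无需再次调用 graph->Optimize()。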
+ std::string save_model_path = model_path + std::string(".saved"); + auto status = graph->save(save_model_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + +``` diff --git a/doc/fluid/advanced_usage/deploy/convert_paddle_to_anakin.md b/doc/fluid/advanced_usage/deploy/convert_paddle_to_anakin.md new file mode 100644 index 0000000000000000000000000000000000000000..8a35875404ce460705de7559fd5eea1247fb69f5 --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/convert_paddle_to_anakin.md @@ -0,0 +1,73 @@ +# 模型转换指南 + +Anakin 支持不同框架的模型预测。但由于格式的差别,Anakin 需要您预先转换模型, 本文档介绍如何转换模型。 + +## 简介 + +Anakin 模型转换器输入支持 Caffe 和 Paddle 两种格式的预测模型,模型包含网络结构(model 或 prototxt)和权重参数(param 或 caffemodel)。 + +模型转换的输出是一个 bin 文件,它作为 Anakin 框架的 graph 参数导入。 + +您还可以使用模型转换器的 launch board 功能生成网络结构的 HTML 预览。 + + +## 系统要求 + +- python 2.7+ +- pyyaml +- flask +- protobuf 3.5+ + + +## 用法 + +### 1、环境 +转换器所需的依赖标注于*系统要求*一节。 + +### 2、配置 +您需要对 *config.yaml* 文件进行修改以告知您的需求。工程中给出了 *config.yaml* 示例,下面作进一步说明。 + +#### config.yaml +```bash +OPTIONS: + Framework: CAFFE # 依框架类型填写 CAFFE 或 Paddle + SavePath: ./output # 转换结束后模型的保存位置 + ResultName: googlenet # 输出模型的名字 + Config: + LaunchBoard: ON # 是否生成网络结构预览页面 + Server: + ip: 0.0.0.0 + port: 8888 # 从一个可用端口访问预览页面 + OptimizedGraph: # 当您使用了 Anakin 框架的 Optimized 功能时,才应该打开此项 + enable: OFF + path: /path/to/anakin_optimized_anakin_model/googlenet.anakin.bin.saved + LOGGER: + LogToPath: ./log/ # 生成日志的路径 + WithColor: ON + +TARGET: + CAFFE: + # 当 Framework 为 CAFFE 时需填写 + ProtoPaths: + - /path/to/caffe/src/caffe/proto/caffe.proto + PrototxtPath: /path/to/your/googlenet.prototxt + ModelPath: /path/to/your/googlenet.caffemodel + + Paddle: + # 当 Framework 为 Paddle 时需填写 + Debug: NULL + ProtoPaths: + - / + PrototxtPath: /path/to/paddle/inference_model + ModelPath: /path/to/paddle/inference_model + # ... 
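  # (示意)转换成功后,SavePath 目录下会生成以 ResultName 命名的 .anakin.bin 模型文件,
  # 该文件即可通过 Anakin 的 graph->load() 接口加载(参见 Anakin 使用教程)。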
+``` + +### 3、转换 +在完成配置文件的修改后,您只需执行 ```python converter.py``` 就可以进行模型转换了。 + + +### 4、预览 +最后一步,就是在浏览器中查看转换结果!网址是在 *config.yaml* 中配置的,例如 http://0.0.0.0:8888 。 + +> 注意:若您使用了默认的 IP 地址 0.0.0.0,请在预览时使用真实的服务器地址 real_ip:port 替代它。 diff --git a/doc/fluid/advanced_usage/deploy/how_to_add_anakin_op.md b/doc/fluid/advanced_usage/deploy/how_to_add_anakin_op.md new file mode 100644 index 0000000000000000000000000000000000000000..f2783eb9f591a31443f2a692ce0eb1bcc9b1063a --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/how_to_add_anakin_op.md @@ -0,0 +1,405 @@ +# 如何增加新的Operator + +## 基本概念 + +简单介绍下几个同Operator相关的基本概念,详情请参考设计文档。 + +```framework```: 上层的逻辑代码,负责从parser中获取参数及weights,添加op时主要修改framework/operator目录下的内容。 + +```saber```: 底层的实现代码,Anakin通过saber封装了不同的backends,不同的实现(impl)分别特化出自己的实现,外层framework通过不同的template进入各自的impl完成调用。各个op的parameter放在saber/saber_funcs_param.h文件中,增加op主要修改saber/funcs下的内容。 + +saber的文件结构: +* saber/funcs下的是各个funcs的外部接口,这一层的op与具体的设备实现无关,只与各op完成的功能有关。由于跟实现(impl)无关,本层文件明均不带impl。 +* saber/funcs/impl下是各个op的impl声明,特定设备需要完成该层声明的特化版本,如saber/funcs/impl/x86实现了上一层impl声明的x86特化版本,saber/funcs/impl/cuda实现了上一层impl声明的NV特化版本。当增加新的backends时需要特化出新的实现。本层代码同实现相关,均带有```impl_```前缀。 +* saber/funcs/impl/cuda/base/cuda_c内有cuda```.cu```扩展名的文件,添加cuda的kernel需要在该文件目录下添加。 +* saber/funcs/impl/cuda/base/sass 内有不同架构的汇编代码编译的静态库。 + +### 涉及到的基类及各个类之前的关系 + +简单介绍相关的基类 + +* ```anakin::Operator```: framework的operator基类,位于framework/core/operator/operator.h + +* ```anakin::saber::BaseFunc```: saber对外的op接口基类,提供统一的对外接口,位于saber/funcs/base.h。BaseFunc的```compute_output_shape```接口只根据input的shape和param的参数计算输出的shape,并通过```tensor```的```set_shape```接口(只设置shape,不分配空间)设置到output中。```operator()```接口为各个op的计算接口。 + +* ```ankain::saber::ImplBase```: saber设备实现的op的接口,所有设备相关实现的基类。位于saber/funcs/impl/impl_base.h。实现版本中这里分为两类,一类以```vender_```为前缀,带有```vender_```代码意为使用第三方库来实现该op,如cudnn的conv,或mkl的conv等等,这类op的性能我们难以调优,因此单独列为一类。另一类是带有源码的saber实现,这些实现都带有```saber_```为前缀,此类实现带有源码,能够通过后续优化不断提升性能,实现起名时需要注意这一点。 + +## 添加operator + +添加一个新的op需要以下几步: + +1. 添加saber的param +2. 定义saber的Operator类 +3. 定义新的impl声明 +3. 完成新的impl实现 +4. 
增加framework的实现或特化 + +接下来就针对这几步,以一个简单例子为例介绍实现。 + +例如我们要添加新的Mul op。给出计算公式如下:$$Out = alpha \dot X * Y$$ + +### 为operator增加param + +涉及到的文件:```saber/saber_funcs_param.h```。如果之前已经存在需要添加的op的param,这一步可以跳过。 +这里```XXXParam```是一个```struct```。包含一个无参数的构造函数,含参数的构造函数,复制构造函数,```operator=()```及```operator==()```。 +``` +template // 能够获得target, datatype, layout +struct MulParam{ + MulParam() + : alpha(0) + {} + MulParam(float alpha_in) + : alpha(alpha_in) + {} + MulParam(const MulParam& right) + : alpha(right.alpha) + {} + MulParam &operator=(const MulParam &right) { + alpha = right.alpha; + } + bool operator==(const MulParam &right) { + return alpha == right.alpha; + } + float alpha; +}; +``` + +### 定义Operator类 +涉及到的文件:```saber/funcs/mul.h```。如果之前定义过该op的类,这里需要修改输入的impl定义头文件。 +下面给出一个相对完整的定义结构供参考。 +``` +//不同的设备需要包含对应的operator实现.[详见](#impl) +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_mul.h" +#include "saber/funcs/impl/cuda/vender_mul.h" +#endif +//如果一个设备现在还没有对应的operator实现,需要包含声明。[详见](#declare) +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/impl_mul.h" +#endif +namespace anakin { +namespace saber { +template +class Mul : public BaseFunc< + Tensor, + Tensor, + Tensor, + ImplBase, MulParam> { +public: + using BaseFunc< + Tensor, + Tensor, + Tensor, + ImplBase, MulParam>::BaseFunc; + Mul() = default; + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef MulParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + //计算输出的shape, + Shape output_shape = (input[0]->valid_shape()); + /* code */ + return output[0]->set_shape(output_shape); + } + virtual SaberStatus init_impl(ImplEnum implenum) override { + // 不同设备均使用此init_impl, 此接口创建对应impl的实现。 + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderMul ); + return SaberSuccess; + case SABER_IMPL: + this->_impl.push_back(new SaberMul ); + return SaberSuccess; + default: + return SaberUnImplError; + } + } +private: + virtual void pick_best_static() override { + if (true) // some condition? 
+ this->_best_impl = this->_impl[0]; + } + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } +}; +} // namespace saber +} // namespace anakin +``` + +### 为operator增加新的impl声明 + +涉及的文件:```saber/funcs/impl/impl_mul.h```。不同的设备都特化同一个声明,特化版本放在对应的文件夹下,这里的声明就是给出所有设备的统一声明。下面给出一个参考。 +``` +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ +namespace saber{ +DEFINE_OP_CLASS(Mul, MulParam); // 第一个参数是op的名字,第二个是对应param的名字 +} +} +``` + +### 完成新的operator特定后端实现 + +涉及的文件:```saber/funcs/impl/xxx/vender_mul.h```或```saber/funcs/impl/xxx/saber_mul.h``` +这里```xxx```指代特定的一种设备。```vender```是指的使用第三方库实现的op,```saber```指的源码实现的op。这里以cuda的vender实现为例,简单介绍一下特化出的函数的几个基本接口。 + +``` +// include 对应的声明 +#include "saber/funcs/impl/impl_mul.h" + +namespace anakin{ +namespace saber{ +template +class VenderMul : + public ImplBase< + Tensor, + Tensor, + Tensor, + MulParam > > +{ +public: + typedef Tensor DataTensor_in; + typedef Tensor DataTensor_out; + typedef Tensor OpTensor; + typedef typename DataTensor_in::Dtype InDataType; + typedef typename DataTensor_out::Dtype OutDataType; + typedef typename OpTensor::Dtype OpDataType; + VenderMul(){} + ~VenderMul() {} + + virtual SaberStatus init(const std::vector& inputs, + std::vector& outputs, + MulParam& param, Context& ctx) { + this->_ctx = ctx; + create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector& inputs, + std::vector& outputs, + MulParam& param, Context& ctx) { + // set内部参数 + } + + virtual SaberStatus dispatch(const std::vector& inputs, + std::vector& outputs, + MulParam& param) { + // dispatch kernel. + } + +private: +}; +} +} +``` +```init```和```create```的区别:```init```接口是第一次初始化op的时候进入的接口,此函数只在第一次初始化op时调用,这个接口一般放一些只需要执行一次的代码,如malloc或者create之类的函数。```create```函数除了第一次init执行外,在输入发生变化或者param发生变化时会再次触发,create一般放置set函数,设置内部变量,当input发生变化时这里执行一些同input或weights直接相关的代码。但create因为触发位置在网络内,如果```create```函数执行了一些严重耗时的操作,这里会拖慢整个op的执行时间,需要慎重选择操作放置的位置。 +### 添加framework的特化 + +涉及的文件:```framework/operators/mul.h```和```framework/operators/mul.cpp```。 +这里简单介绍下如果添加或修改framework内的operator + +``` +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/mul.h" // 需要包对应的saber头文件 +namespace anakin { +namespace ops { +template +class MulHelper; + +template +class Mul : public Operator { +public: + Mul() {} + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">"; + } + friend class MulHelper; +}; +template +class MulHelper : public OperatorHelper { +public: + MulHelper() = default; + ~MulHelper(); + Status InitParam() override; + + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + saber::MulParam> _param_mul; + saber::Mul _funcs_mul; +}; +} +} /* namespace anakin */ +``` +对应的```.cpp```文件如下: +``` +#include "framework/operators/mul.h" + +namespace anakin { +namespace ops { + +#ifdef USE_CUDA +template<> +void Mul::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = + static_cast*>(this->_helper); + auto& param = + static_cast*>(this->_helper)->_param_mul; + impl->_funcs_mul(ins, outs, param, ctx); +} +#endif + +template +Status MulHelper::InitParam() { + auto alpha = GET_PARAMETER(float, alpha); + 
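    // GET_PARAMETER 从 parser 解析出的模型属性中读取 alpha,
    // 参数名需与下文 ANAKIN_REGISTER_OP(Mul) 中 .Args("alpha", ...) 注册的名字一致。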
MulParam> param_mul(alpha); + _param_mul = param_mul; + return Status::OK(); +} + +template +Status MulHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + + SABER_CHECK(_funcs_mul.init(ins, outs, _param_mul, SPECIFY, VENDER_IMPL, ctx)); + return Status::OK(); +} + +template +Status MulHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_mul.compute_output_shape(ins, outs, _param_mul)); + return Status::OK(); +} + +#ifdef USE_CUDA +template class MulHelper; +#endif +#ifdef USE_ARM_PLACE +template class MulHelper; +#endif +// register helper +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, NV, AK_FLOAT, Precision::FP32); +#endif +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, ARM, AK_FLOAT, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(Mul) +.Doc("Mul operator") +#ifdef USE_CUDA +.__alias__("mul") +#endif +#ifdef USE_ARM_PLACE +.__alias__("mul") +#endif +.num_in(1) +.num_out(1) +.Args("alpha", " alpha of Mul "); //注册 + +} /* namespace ops */ + +} /* namespace anakin */ +``` + +## 实现单元测试 +涉及的文件:```test/saber/xxx/test_saber_funcs_mul_xxx.cpp``` +在对应的test下需要添加新的单元测试 + +``` +TEST(TestSaberFuncNV, test_depthwise_conv) { + + // init tensors and some param. + + // start Reshape & doInfer + Context ctx1(0, 1, 1); + + // create param + MulParam > param(alpha); + + std::vector*> input; + std::vector*> output; + + // create saber op + Mul mul; + + // compute output shape + mul.compute_output_shape(input, output, param); + + // re_alloc output tensors memory based on output shape + output[0]->re_alloc(output[0]->shape()); + + // init saber op(calling init and create) + mul.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1); + + // call operator() + mul(input, output, param, ctx1); + + // cuda specified, record events + cudaStream_t cuda_stream = ctx1.get_compute_stream(); + output[0]->record_event(cuda_stream); + output_dev.sync(); + + // param changed + param.alpha = 2.0; + // auto calling saber op(create and dispatch) + mul(input, output, param, ctx1); + + cudaDeviceSynchronize(); + CUDA_CHECK(cudaPeekAtLastError()); +} + +int main(int argc, const char** argv){ + anakin::saber::Env::env_init(); + + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + +``` +## 调试及注意事项 + +一个op需要有对外的op接口和内部实现,由于存在saber/funcs/impl的非特化版本声明,当有op在某种设备下没有对应实现时,也能够编译,但此时是没有任何实现的空实现, diff --git a/doc/fluid/advanced_usage/deploy/how_to_support_new_device_in_anakin.md b/doc/fluid/advanced_usage/deploy/how_to_support_new_device_in_anakin.md new file mode 100644 index 0000000000000000000000000000000000000000..da2c64cf4d842b3136adc21872e66f6101a9fbc7 --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/how_to_support_new_device_in_anakin.md @@ -0,0 +1,459 @@ +# 如何支持一个新的设备 + +## 概览 + +添加一个新的设备需要以下3个步骤: + +* [在`CMakeList`中添加设备的支持](#0001) +* [在`saber`中添加设备的实现](#0002) +* [在`framework`中添加设备的具体化或实例化](#0003) + +假设新设备的名称为`TNEW`, 以下将以这个设备名称进行演示。 + +## 在`CMakeList`中添加设备的支持 ## + +* 修改根目录`CMakeList.txt` +```cmake +#select the plantform to build +anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." NO) +anakin_option(USE_X86_PLACE "Select the build mode for X86 place." NO) +anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO) +anakin_option(USE_TNEW_PLACE "Select the build mode for ARM place." 
YES) +``` + +* 修改`saber/CMakeList.txt` + +根据新增设备的目录完善`saber`目录下的`CMakeList.txt`。 +```cmake +if(USE_TNEW_PLACE) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC) +endif() +``` + +* 修改`test/CMakeList.txt` + +新增设备的单测文件放在`test/saber/tnew`目录下,修改`test`目录下的`CMakeList.txt`。 +```cmake +if(USE_TNEW_PLACE) + anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/tnew "cpp" ANAKIN_TEST_CASE_SRC) +endif() +``` + +* 修改`cmake/anakin_config.h.in` +```c++ +// plantform to use +#cmakedefine USE_GPU_PLACE + +#cmakedefine USE_X86_PLACE + +#cmakedefine USE_ARM_PLACE + +#cmakedefine USE_TNEW_PLACE +``` + +* 其他依赖和编译选项 +修改`cmake`目录下的`compiler_options.cmake`和`find_modules.cmake` + + +## 在`saber`中添加设备的实现 ## +`saber`是`Anakin`的基础计算库,对外提供设备无关的统一的API,设备相关的实现都会封装到`TargetWrapper`中。 + +### 在`saber/saber_types.h`中添加设备 + +```c++ +enum TargetTypeEnum { + eINVALID = -1, + eNV = 1, + eAMD = 2, + eARM = 3, + eX86 = 4, + eNVHX86 = 5, + eTNEW = 6 +}; + +typedef TargetType NV; +typedef TargetType ARM; +typedef TargetType AMD; +typedef TargetType X86; +typedef TargetType TNEW; + +``` + +### 在`saber/core`中添加设备的实现 + +1. 在`target_traits.h`中添加新设备 + +* 增加设备类型 +```c++ +struct __cuda_device{}; +struct __arm_device{}; +struct __amd_device{}; +struct __x86_device{}; +struct __tnew_device{}; +``` + +* `TargetTypeTraits`模板具体化 +```c++ +template <> +struct TargetTypeTraits { + typedef __xxx_target target_category;//根据实际设备是host端还是device端进行选择 + typedef __tnew_device target_type; +}; +``` + +2. 在`data_traits.h`中特化`DataTrait`模板类 + +如果设备需要特殊的数据类型,则特化出设备的`DataTrait`类的实现,例如opencl数据类型的实现如下: +```c++ +#ifdef USE_OPENCL +struct ClMem{ + ClMem(){ + dmem = nullptr; + offset = 0; + } + + ClMem(cl_mem* mem_in, int offset_in = 0) { + dmem = mem_in; + offset = offset_in; + } + + ClMem(ClMem& right) { + dmem = right.dmem; + offset = right.offset; + } + + ClMem& operator=(ClMem& right) { + this->dmem = right.dmem; + this->offset = right.offset; + return *this; + } + + ClMem& operator+(int offset_in) { + this->offset += offset_in; + return *this; + } + + int offset{0}; + cl_mem* dmem; +}; + +template <> +struct DataTrait { + typedef ClMem Dtype; + typedef float dtype; +}; + +template <> +struct DataTrait { + typedef ClMem Dtype; + typedef double dtype; +}; + +template <> +struct DataTrait { + typedef ClMem Dtype; + typedef char dtype; +}; +#endif //use_opencl +``` + +3. 
在`target_wrapper.h`中特化`TargetWrapper`模板类 + +特化`TargetWrapper`模板类,在`target_wrapper.h`中声明函数,具体如下: +```c++ +template <> +struct TargetWrapper { //根据TNEW的具体类型修改__xxx_target,__host_target或者__device_target + + typedef xxx_event event_t; //根据设备实现xxx_event + typedef xxx_stream stream_t; //根据设备实现xxx_stream + + static void get_device_count(int& count); + + static void set_device(int id); + + //We should add strategy to avoid malloc directly + static void mem_alloc(void** ptr, size_t n); + + static void mem_free(void* ptr); + + static void mem_set(void* ptr, int value, size_t n); + + static void create_event(event_t& event, bool flag = false); + + static void create_stream(stream_t& stream); + + static void create_stream_with_flag(stream_t& stream, unsigned int flag); + + static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority); + + static void destroy_stream(stream_t& stream); + + static void destroy_event(event_t& event); + + static void record_event(event_t& event, stream_t stream); + + static void query_event(event_t& event); + + static void sync_event(event_t& event); + + static void sync_stream(event_t& event, stream_t& stream); + + static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __DtoD); + + static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, stream_t& stream, __DtoD); + + static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __HtoD); + + static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, stream_t& stream, __HtoD); + + static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, __DtoH); + + static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \ + size_t count, stream_t& stream, __DtoH); + + static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \ + int src_dev, size_t count); + + static void async_memcpy_p2p(void* dst, int dst_dev, const void* src, \ + int src_dev, size_t count, stream_t& stream); + + static int get_device_id(); +}; + +``` + +4. 在`impl/`目录下添加设备目录和实现 + +在`saber/core/impl`目录下添加设备目录`tnew`。 +* 实现`TargetWrapper`结构体中各函数的定义。 +如果`TargetWrapper`的实现与默认的模板类一致,则不用特化出该类。 + +```c++ +typedef TargetWrapper TNEW_API; +void TNEW_API::get_device_count(int &count) { + // add implementation +} + +void TNEW_API::set_device(int id){ + // add implementation +} + +void TNEW_API::mem_alloc(void** ptr, size_t n){ + // add implementation +} + +void TNEW_API::mem_free(void* ptr){ + if(ptr != nullptr){ + // add implementation + } +} +... 
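+// 其余静态接口(mem_set、event/stream 的创建与销毁、各类 memcpy 等)需按照 target_wrapper.h 中的声明逐一给出定义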
+ +``` + +* 特化实现`device.h`中的`Device` + +```c++ +template <> +void Device::create_stream() { + // add implementation +} + +template <> +void Device::get_info() { + + // add implementation +} + +``` + +### 在`saber/funcs`中实现设备相关的op + +参考[如何增加新的Operator](./how_to_add_anakin_op.html) + + +## 在`framework`中添加设备的具体化或实例化 ## + +### `framework/core` + +* `net.cpp`中添加实例化 + +```c++ +#ifdef USE_TNEW_PLACE +template class Net; +template class Net; +#endif +``` + +* `operator_func.cpp`中添加实例化 + +```c++ +#ifdef USE_TNEW_PLACE +template class OperatorFunc; +#endif +``` + +* `worker.cpp`中添加实例化 + +```c++ +#ifdef USE_TNEW_PLACE +template class Worker; +template class Worker; +#endif +``` + +* `operator_attr.cpp`中添加实例化 + +```c++ +template +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +template +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +template +OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name); +``` + +* `parameter.h`中添加设备的实现 + +```c++ +#ifdef USE_TNEW_PLACE +template +class PBlock { +public: + typedef Tensor4d::type> type; + + PBlock() { + _inner_tensor = std::make_shared(); + } + ... +} +#endif //TNEW +``` + +* `type_traits_extend.h`中添加设备的实现 + +```c++ +template<> +struct target_host { + typedef saber::X86 type; //根据TNEW选择正确的host type +}; +``` + +### `framework/graph` + +* `graph.cpp`中添加实例化 + +```c++ + #ifdef USE_TNEW_PLACE + template class Graph; + template class Graph; + template class Graph; + #endif +``` + +### `framework/model_parser` + +* `parser.cpp`中添加实例化 + +```c++ + #ifdef USE_TNEW_PLACE + template + Status load(graph::Graph* graph, + const char* model_path); + template + Status load(graph::Graph* graph, + const char* model_path); + template + Status load(graph::Graph* graph, + const char* model_path); + + template + Status save(graph::Graph* graph, + std::string& model_path); + template + Status save(graph::Graph* graph, + std::string& model_path); + template + Status save(graph::Graph* graph, + std::string& model_path); + + template + Status load(graph::Graph* graph, + std::string& model_path); + template + Status load(graph::Graph* graph, + std::string& model_path); + template + Status load(graph::Graph* graph, + std::string& model_path); + + template + Status save(graph::Graph* graph, + const char* model_path); + template + Status save(graph::Graph* graph, + const char* model_path); + template + Status save(graph::Graph* graph, + const char* model_path); + #endif +``` + +* `model_io.cpp`中添加实例化 + +```c++ +#ifdef USE_TNEW_PLACE +template class NodeIO; +template class NodeIO; +template class NodeIO; +#endif +``` + +### `framework/operators` + +为`framework/operators`目录下所有op添加实例化或具体化 +以`activation.cpp`为例,实例化如下: + +```c++ +#ifdef USE_TNEW_PLACE +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32); +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16); +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8); +template class ActivationHelper; +ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32); +#endif +``` + +如果TNEW设备函数的实现与现有模板实现不一致,可以特化实现如下(以init()为例): +```c++ +#ifdef USE_TNEW_PLACE +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32); +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16); +INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8); +template <> +Status ActivationHelper::Init(OpContext &ctx,\ + const std::vector >& ins, \ + std::vector >& outs) { + SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, SPECIFY, SABER_IMPL, ctx)); //在这里选择实现方式 + return Status::OK(); +} 
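+// 特化版本完成后,注册方式与通用实现相同,仍需调用下面的注册宏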
+ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32); +#endif +``` + +在`ANAKIN_REGISTER_OP(Activation)`中添加TNEW的注册 + +```c++ +#ifdef USE_TNEW_PLACE +.__alias__("activation") +#endif +``` + +## 注意事项 +不要修改`Tensor`/`Buffer`/`Env`/`Context`这些类函数的接口和实现 diff --git a/doc/fluid/advanced_usage/deploy/index_anakin.rst b/doc/fluid/advanced_usage/deploy/index_anakin.rst new file mode 100644 index 0000000000000000000000000000000000000000..32d26156aed1d340482dbe2cb5a273c0679395cd --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/index_anakin.rst @@ -0,0 +1,28 @@ +Anakin 预测引擎 +####################### + + +使用文档 +~~~~~~~ + +.. toctree:: + :maxdepth: 1 + + install_anakin.md + convert_paddle_to_anakin.md + anakin_tutorial.md + anakin_run_on_arm.md + anakin_example.md + anakin_gpu_benchmark.md + anakin_arm_benchmark.md + + +开发文档 +~~~~~~~ + +.. toctree:: + :maxdepth: 1 + + how_to_add_anakin_op.md + how_to_support_new_device_in_anakin.md + anakin_parser_design.md diff --git a/doc/fluid/advanced_usage/deploy/index_mobile.rst b/doc/fluid/advanced_usage/deploy/index_mobile.rst new file mode 100644 index 0000000000000000000000000000000000000000..c7a338d9be2b5c7cc6adf046072a592383f0be1a --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/index_mobile.rst @@ -0,0 +1,8 @@ +移动端部署 +########## + +.. toctree:: + :maxdepth: 2 + + mobile_readme.md + mobile_build.md diff --git a/doc/fluid/advanced_usage/deploy/install_anakin.md b/doc/fluid/advanced_usage/deploy/install_anakin.md new file mode 100644 index 0000000000000000000000000000000000000000..0b44a6be3baa51598fa8b2f2af863bed6c9c64e9 --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/install_anakin.md @@ -0,0 +1,76 @@ +## 源码编译安装Anakin ## + +我们已经在CentOS 7.3上成功的安装和测试了Anakin,对于其他操作系统,我们将很快支持。 + +### 安装概览 ### + +* [在CentOS上安装 Anakin]() +* [在Ubuntu上安装 Anakin]() +* [在ARM上安装 Anakin](./anakin_run_on_arm.html) +* [验证安装]() + + +### 在CentOS上安装 Anakin ### +#### 1. 系统要求 #### + +* make 3.82+ +* cmake 2.8.12+ +* gcc 4.8.2+ +* g++ 4.8.2+ + +#### 2. 编译CPU版Anakin #### + +暂时不支持 + +#### 3. 编译支持NVIDIA GPU的Anakin #### + +- 3.1. 安装依赖 + + - 3.1.1 protobuf + + ``` + > git clone https://github.com/google/protobuf + > cd protobuf + > git submodule update --init --recursive + > ./autogen.sh + > ./configure --prefix=/path/to/your/insall_dir + > make + > make check + > make install + > sudo ldconfig + ``` + + 如安装protobuf遇到任何问题,请访问[这里](https://github.com/google/protobuf/blob/master/src/README.md) + +- 3.2 CUDA Toolkit + + - [CUDA 8.0](https://developer.nvidia.com/cuda-zone) or higher, 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/). + - [cuDNN v7](https://developer.nvidia.com/cudnn), 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/). + +- 3.3 编译Anakin + + ``` + > git clone https:/xxxxx + > cd anakin + > mkdir build + > camke .. + > make + ``` + +#### 4. 编译支持AMD GPU的Anakin #### + +暂时还不支持 + + +### 在Ubuntu上安装 Anakin ### + +暂时还不支持 + + +### 在ARM上安装 Anakin ### + +请参考[ARM安装文档](./anakin_run_on_arm.html) + +### 验证安装 ### + +安装完成后,如果没有报错信息,你可以通过运行 `output/unit_test`路径下的单测示例验证是否编译成功。 diff --git a/doc/fluid/advanced_usage/deploy/mobile_build.md b/doc/fluid/advanced_usage/deploy/mobile_build.md new file mode 100644 index 0000000000000000000000000000000000000000..e51593164987d548e256ddebbc5fa8d960fb5255 --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/mobile_build.md @@ -0,0 +1,59 @@ +# 环境搭建 +## 使用 docker +### 1. 
安装 docker +安装 docker 的方式,参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/) +### 2. 使用 docker 搭建构建环境 +首先进入 paddle-mobile 的目录下,执行 `docker build` +以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行) +``` +$ docker build -t paddle-mobile:dev - < Dockerfile +``` +使用 `docker images` 可以看到我们新建的 image +``` +$ docker images +REPOSITORY TAG IMAGE ID CREATED SIZE +paddle-mobile dev 33b146787711 45 hours ago 372MB +``` +### 3. 使用 docker 构建 +进入 paddle-mobile 目录,执行 docker run +``` +$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev +root@5affd29d4fc5:/ # cd /paddle-mobile +# 生成构建 android 产出的 Makefile +root@5affd29d4fc5:/ # rm CMakeCache.txt +root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake +# 生成构建 linux 产出的 Makefile +root@5affd29d4fc5:/ # rm CMakeCache.txt +root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake +``` +### 4. 设置编译选项 +可以通过 ccmake 设置编译选项 +``` +root@5affd29d4fc5:/ # ccmake . + Page 1 of 1 + CMAKE_ASM_FLAGS + CMAKE_ASM_FLAGS_DEBUG + CMAKE_ASM_FLAGS_RELEASE + CMAKE_BUILD_TYPE + CMAKE_INSTALL_PREFIX /usr/local + CMAKE_TOOLCHAIN_FILE /paddle-mobile/tools/toolchains/arm-android-neon.cmake + CPU ON + DEBUGING ON + FPGA OFF + LOG_PROFILE ON + MALI_GPU OFF + NET googlenet + USE_EXCEPTION ON + USE_OPENMP OFF +``` +修改选项后,按 `c`, `g` 更新 Makefile +### 5. 构建 +使用 make 命令进行构建 +``` +root@5affd29d4fc5:/ # make +``` +### 6. 查看构建产出 +构架产出可以在 host 机器上查看,在 paddle-mobile 的目录下,build 以及 test/build 下,可以使用 adb 指令或者 scp 传输到 device 上执行 + +## 不使用 docker +不使用 docker 的方法,可以直接用 cmake 生成 makefile 后构建。使用 ndk 构建 android 应用需要正确设置 NDK_ROOT。构建 linux 应用需要安装 arm-linux-gnueabi-gcc 或者类似的交叉编译工具,可能需要设置 CC,CXX 环境变量,或者在 tools/toolchains/ 中修改 arm-linux-gnueabi.cmake,或者增加自己需要的 toolchain file。 diff --git a/doc/fluid/advanced_usage/deploy/mobile_readme.md b/doc/fluid/advanced_usage/deploy/mobile_readme.md new file mode 100644 index 0000000000000000000000000000000000000000..10fde9c70b16ec74b060f88ca661851f06bf55f8 --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/mobile_readme.md @@ -0,0 +1,87 @@ +# 项目简介 + + + + +欢迎来到 Paddle-Mobile GitHub 项目。Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平台的深度学习的框架 + +## Features + +- 高性能支持ARM CPU +- 支持Mali GPU +- 支持Andreno GPU +- 支持苹果设备的GPU Metal实现 +- 支持ZU5、ZU9等FPGA开发板 +- 支持树莓派等arm-linux开发板 + +## Demo +[ANDROID](https://github.com/xiebaiyuan/paddle-mobile-demo) + +### 原Domo目录 + +请参考这里[这里](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo) + +## 文档 + +### 设计文档 + +关于paddle-mobile设计文档请参考[这里](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/design_doc.md),如果想了解更多内容,[Issue](https://github.com/PaddlePaddle/paddle-mobile/issues)中会有很多早期的设计和讨论过程 + + +### 开发文档 + +开发文档主要是关于编译、运行等问题。作为开发者,它可以和贡献文档共同结合使用 + +[iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md) + +[Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md) + +[FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md) + +[ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md) + +### 贡献代码 + +- [贡献代码](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md) + +- 上面文档中涵盖了主要的贡献代码流程,如果在实践中您还遇到了其他问题,可以发[Issue](https://github.com/PaddlePaddle/paddle-mobile/issues)。我们看到后会尽快处理 + + +## 模型获得 +目前Paddle-Mobile仅支持Paddle fluid训练的模型。如果你手中的模型是不同种类的模型,需要进行模型转换才可以运行 + +### 1. 直接使用Paddle Fluid训练 + +该方式最为可靠,推荐方式 + +### 2. 
Caffe转为Paddle Fluid模型 + +请参考这里[这里](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification/caffe2fluid) + +### 3. ONNX + +ONNX全称为“Open Neural Network Exchange”,即“开放的神经网络切换”,该项目的目的是让不同的神经网络开发框架做到互通互用 + +除直接使用PaddlePaddle训练fluid版本的模型外,还可以通过onnx转换得到个别Paddle Fluid模型 + +目前,百度也在做onnx支持工作。相关转换项目在[这里](https://github.com/PaddlePaddle/paddle-onnx) + +### 4. 部分测试模型和测试图片下载 + +[下载链接](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip) + +如下gif是简单搜索app的线上主体检测应用效果 + +![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif) + +## 问题解决 + +欢迎提出或解决我们的问题,有疑问可以发[Issue](https://github.com/PaddlePaddle/paddle-mobile/issues) + +## Copyright and License +Paddle-Mobile 提供相对宽松的Apache-2.0开源协议 [Apache-2.0 license](LICENSE) + + +## 旧版 Mobile-Deep-Learning +原MDL(Mobile-Deep-Learning)工程被迁移到了这里 [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning) diff --git a/doc/fluid/advanced_usage/deploy/run_anakin_on_arm.md b/doc/fluid/advanced_usage/deploy/run_anakin_on_arm.md new file mode 100644 index 0000000000000000000000000000000000000000..f61beca7ef21198fc992f0dafd9bfc464b4a60f5 --- /dev/null +++ b/doc/fluid/advanced_usage/deploy/run_anakin_on_arm.md @@ -0,0 +1,185 @@ +## ARM 源码编译 Anakin ## + +目前Anakin支持ARM Android平台,采用Android NDK交叉编译工具链,已在mac os和centos上编译和测试通过。 + +### 安装概览 ### + +* [系统需求](#0001) +* [安装第三方依赖](#0002) +* [Anakin源码编译](#0003) +* [验证安装](#0004) + + +### 1. 系统需求 ### + +* 宿主机: linux, mac +* cmake 3.8.2+ +* Android NDK r14, Linux 版本[从这里下载](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip) + +### 2. 安装第三方依赖 ### + +- 2.1 protobuf3.4.0 + + 源码从这里[下载](https://github.com/google/protobuf/releases/tag/v3.4.0) + + - 2.1.1 为宿主机编译protobuf + +```bash + $ tar -xzf protobuf-3.4.0.tar.gz + $ cd protobuf-3.4.0 + $ ./autogen.sh + $ ./configure + $ make + $ make check + $ make install +``` + +上述 $make install 执行后,可在 /usr/local/include/google 找到 libprotobuf 所需的头文件,将整个google文件夹拷贝至Anakin/third-party/arm-android/protobuf/下 + +如有问题,请点[这里](https://github.com/google/protobuf/blob/v3.4.0/src/README.md),然后将已经生成文件清除。 + +```bash + $ make distclean +``` + + - 2.1.1 交叉编译Android`armeabi-v7a`的protobuf,注意设置ANDROID_NDK的路径,以及ARCH_ABI、HOSTOSN的值 + + ```bash + + $ export ANDROID_NDK=your_ndk_path + $ ARCH_ABI="arm-linux-androideabi-4.9" + $ HOSTOSN="darwin-x86_64" + $ export SYSROOT=$ANDROID_NDK/platforms/android-9/arch-arm + $ export PREBUILT=$ANDROID_NDK/toolchains/$ARCH_ABI + $ export LDFLAGS="--sysroot=$SYSROOT" + $ export LD="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/arm-linux-androideabi/bin/ld $LDFLAGS" + $ export LIBS="-llog $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/libgnustl_static.a" + $ export CPPFLAGS="" + $ export INCLUDES="-I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/include/ -I$ANDROID_NDK/platforms/android-9/arch-arm/usr/include/ -I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/include/" + $ export CXXFLAGS="-march=armv7-a -mfloat-abi=softfp -DGOOGLE_PROTOBUF_NO_RTTI --sysroot=$SYSROOT" + $ export CCFLAGS="$CXXFLAGS" + $ export CXX="$PREBUILT/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-g++ $CXXFLAGS" + $ export CC="$CXX" + $ export RANLIB="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-ranlib" + $ ./autogen.sh + $ ./configure --host=arm-linux-androideabi --with-sysroot=$SYSROOT --enable-cross-compile --with-protoc=protoc --disable-shared CXX="$CXX" CC="$CC" LD="$LD" + $ make +``` + +编译生成 *.a 静态库,若希望编译*.so 动态链接库 
,请在./configure参数中改--disable-shared为--disable-static --enable-shared。 +生成文件在src/.libs/下,将生成的文件拷贝至Anakin/third-party/arm-android/protobuf/lib下。 +在[cmake](../../cmake/find_modules.cmake)中更新`ARM_RPOTO_ROOT`的路径。 + +```cmake + set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf") +``` + +- 2.2 opencv 2.4.3+(optional) + + Anakin只在examples示例中使用opencv + Android系统的opencv从[这里下载](https://opencv.org/releases.html) + 解压后将 `3rdparty/libs/armeabi-v7a`中的库文件拷贝到`libs/armeabi-v7a` + 在[cmake](../../cmake/find_modules.cmake)中搜索`anakin_find_opencv`, + 并设置 `include_directories` 和 `LINK_DIRECTORIES`为自己安装的库的路径。 + + ```cmake + include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/) + LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/) + ``` +### 3. Anakin源码编译 ### + +#### 编译Android版本 + + 克隆[源码](https://github.com/PaddlePaddle/Anakin/tree/arm) + +```bash + cd your_dir + git clone https://github.com/PaddlePaddle/Anakin.git + cd Anakin + git fetch origin arm + git checkout arm + ``` + + 修改`android_build.sh` + +- 修改NDK路径 + + ```bash + #modify "your_ndk_path" to your NDK path + export ANDROID_NDK=your_ndk_path + ``` + +- 修改ARM 处理器架构 + + 对于32位ARM处理器, 将ANDROID_ABI 设置为 `armeabi-v7a with NEON`, + 对于64位ARM处理器, 可以将ANDROID_ABI 设置为 `armeabi-v7a with NEON`或者`arm64-v8a`。 + 目前我们只支持 `armeabi-v7a with NEON`;`arm64-v8a` 还在开发中。 + + ```bash + -DANDROID_ABI="armeabi-v7a with NEON" + ``` + +- 设置Android API + + 根据Android系统的版本设置API level, 例如API Level 21 -> Android 5.0.1 + ```bash + -DANDROID_NATIVE_API_LEVEL=21 + ``` + +- 选择编译静态库或动态库 + + 设置`BUILD_SHARED=NO`编译静态库 + 设置`BUILD_SHARED=YES`编译动态库 + + ```bash + -DBUILD_SHARED=NO + ``` +- OpenMP多线程支持 + + 设置`USE_OPENMP=YES`开启OpenMP多线程 + + ```bash + -DUSE_OPENMP=YES + ``` + +- 编译单测文件 + + 设置`BUILD_WITH_UNIT_TEST=YES`将会编译单测文件 + + ```bash + -DBUILD_WITH_UNIT_TEST=YES + ``` + +- 编译示例文件 + + 设置`BUILD_EXAMPLES=YES`将会编译示例文件 + + ```bash + -DBUILD_EXAMPLES=YES + ``` + +- 开启opencv + + 如果使用opencv,设置`USE_OPENCV=YES` + + ```bash + -DUSE_OPENCV=YES + ``` + +- 开始编译 + + 运行脚本 `android_build.sh` 将自动编译Anakin + + ```bash + ./android_build.sh + ``` + +### 4. 
验证安装 ### + + 编译好的库会放在目录`${Anakin_root}/output`下 + + 编译好的单测文件会放在`${Anakin_root}/output/unit_test`目录下 + + 编译好的示例文件会放在`${Anakin_root}/output/examples`目录下 + + 对于Android系统,打开设备的调试模式,通过ADB可以访问的目录是`data/local/tmp`,通过ADB push将测试文件、模型和数据发送到设备目录,运行测试文件。 diff --git a/doc/fluid/advanced_usage/development/contribute_to_paddle.md b/doc/fluid/advanced_usage/development/contribute_to_paddle.md new file mode 100644 index 0000000000000000000000000000000000000000..b3c45c72301a0275b6f6b5300000b02950f62a7f --- /dev/null +++ b/doc/fluid/advanced_usage/development/contribute_to_paddle.md @@ -0,0 +1,249 @@ +# 如何贡献代码 + +我们真诚地感谢您的贡献,欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。 + +## 代码要求 +- 代码注释请遵守 [Doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。 +- 确保编译器选项 `WITH_STYLE_CHECK` 已打开,并且编译能通过代码样式检查。 +- 所有代码必须具有单元测试。 +- 通过所有单元测试。 +- 请遵守[提交代码的一些约定](#提交代码的一些约定)。 + +以下教程将指导您提交代码。 +## [Fork](https://help.github.com/articles/fork-a-repo/) + +跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页,然后单击 `Fork` 按钮,生成自己目录下的仓库,比如 。 + +## 克隆(Clone) + +将远程仓库 clone 到本地: + +```bash +➜ git clone https://github.com/USERNAME/Paddle +➜ cd Paddle +``` + + +## 创建本地分支 + +Paddle 目前使用[Git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发,测试,发行和维护,具体请参考 [Paddle 分支规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范)。 + +所有的 feature 和 bug fix 的开发工作都应该在一个新的分支上完成,一般从 `develop` 分支上创建新分支。 + +使用 `git checkout -b` 创建并切换到新分支。 + +```bash +➜ git checkout -b my-cool-stuff +``` + +值得注意的是,在 checkout 之前,需要保持当前分支目录 clean,否则会把 untracked 的文件也带到新分支上,这可以通过 `git status` 查看。 + +## 使用 `pre-commit` 钩子 + +Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码(C++,Python),在提交(commit)前自动检查一些基本事宜(如每个文件只有一个 EOL,Git 中不要添加大文件等)。 + +`pre-commit`测试是 Travis-CI 中单元测试的一部分,不满足钩子的 PR 不能被提交到 Paddle,首先安装并在当前目录运行它: + +```bash +➜ pip install pre-commit +➜ pre-commit install +``` + +Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式,请确保 `clang-format` 版本在 3.8 以上。 + +注:通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同的,Paddle 开发人员使用的是`pip install pre-commit`。 + +## 开始开发 + +在本例中,我删除了 README.md 中的一行,并创建了一个新文件。 + +通过 `git status` 查看当前状态,这会提示当前目录的一些变化,同时也可以通过 `git diff` 查看文件具体被修改的内容。 + +```bash +➜ git status +On branch test +Changes not staged for commit: + (use "git add ..." to update what will be committed) + (use "git checkout -- ..." to discard changes in working directory) + + modified: README.md + +Untracked files: + (use "git add ..." to include in what will be committed) + + test + +no changes added to commit (use "git add" and/or "git commit -a") +``` + +## 构建和测试 + +编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:latest-dev`来代替。 + +如要build这个开发镜像,在源码目录树的根目录中运行: + +```bash +➜ docker build -t paddle:latest-dev . 
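+# 构建可能耗时较长;完成后可用 docker images 确认 paddle:latest-dev 镜像已生成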
+``` + +随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以: + +```bash +➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev +``` + +如果你需要在此基础上编译基于Python3的PaddlePaddle,可以: + +```bash +➜ docker run -v $(pwd):/paddle -e "PY_VERSION=3.5" -e "WITH_FLUID_ONLY=ON" -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev +``` + +这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`,并且输出一个 `./build/paddle.deb`文件之外,还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*(`paddle:prod`): + +```bash +➜ docker build -t paddle:prod -f build/Dockerfile . +``` + +如果要运行所有的单元测试,可以用如下命令: + +```bash +➜ docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest" +``` + +关于构建和测试的更多信息,请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。 + +## 提交(commit) + +接下来我们取消对 README.md 文件的改变,然后提交新添加的 test 文件。 + +```bash +➜ git checkout -- README.md +➜ git status +On branch test +Untracked files: + (use "git add ..." to include in what will be committed) + + test + +nothing added to commit but untracked files present (use "git add" to track) +➜ git add test +``` + +Git 每次提交代码,都需要写提交说明,这可以让其他人知道这次提交做了哪些改变,这可以通过`git commit` 完成。 + +```bash +➜ git commit +CRLF end-lines remover...............................(no files to check)Skipped +yapf.................................................(no files to check)Skipped +Check for added large files..............................................Passed +Check for merge conflicts................................................Passed +Check for broken symlinks................................................Passed +Detect Private Key...................................(no files to check)Skipped +Fix End of Files.....................................(no files to check)Skipped +clang-formater.......................................(no files to check)Skipped +[my-cool-stuff c703c041] add test file + 1 file changed, 0 insertions(+), 0 deletions(-) + create mode 100644 233 +``` + +## 保持本地仓库最新 + +在准备发起 Pull Request 之前,需要同步原仓库()最新的代码。 + +首先通过 `git remote` 查看当前远程仓库的名字。 + +```bash +➜ git remote +origin +➜ git remote -v +origin https://github.com/USERNAME/Paddle (fetch) +origin https://github.com/USERNAME/Paddle (push) +``` + +这里 origin 是我们 clone 的远程仓库的名字,也就是自己用户名下的 Paddle,接下来我们创建一个原始 Paddle 仓库的远程主机,命名为 upstream。 + +```bash +➜ git remote add upstream https://github.com/PaddlePaddle/Paddle +➜ git remote +origin +upstream +``` + +获取 upstream 的最新代码并更新当前分支。 + +```bash +➜ git fetch upstream +➜ git pull upstream develop +``` + +## Push 到远程仓库 + +将本地的修改推送到 GitHub 上,也就是 https://github.com/USERNAME/Paddle。 + +```bash +# 推送到远程仓库 origin 的 my-cool-stuff 分支上 +➜ git push origin my-cool-stuff +``` + +## 建立 Issue 并完成 Pull Request + +建立一个 Issue 描述问题,并记录它的编号。 + +切换到所建分支,然后点击 `New pull request`。 + +screen shot 2017-04-26 at 9 09 28 pm + +选择目标分支: + +screen shot 2017-04-26 at 9 11 52 pm + +在 PR 的描述说明中,填写 `resolve #Issue编号` 可以在这个 PR 被 merge 后,自动关闭对应的 Issue,具体请见 。 + +接下来等待 review,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。 + +## 删除远程分支 + +在 PR 被 merge 进主仓库后,我们可以在 PR 的页面删除远程仓库的分支。 + +screen shot 2017-04-26 at 9 18 24 pm + +也可以使用 `git push origin :分支名` 删除远程分支,如: + +```bash +➜ git push origin :my-cool-stuff +``` + +## 删除本地分支 + +最后,删除本地分支。 + +```bash +# 切换到 develop 分支 +➜ git checkout develop + +# 删除 my-cool-stuff 分支 +➜ git branch -D my-cool-stuff +``` + +至此,我们就完成了一次代码贡献的过程。 + +## 提交代码的一些约定 + +为了使评审人在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定: 
+ +1. 请保证Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,评审人一般不做评审。 +2. 提交PUll Request前: + - 请注意commit的数量: + - 原因:如果仅仅修改一个文件但提交了十几个commit,每个commit只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改,且不排除commit之间的修改存在相互覆盖的情况。 + - 建议:每次提交时,保持尽量少的commit,可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit,可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。 + - 请注意每个commit的名称:应能反映当前commit的内容,不能太随意。 +3. 如果解决了某个Issue的问题,请在该PUll Request的**第一个**评论框中加上:`fix #issue_number`,这样当该PUll Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。 + +此外,在回复评审人意见时,请您遵守以下约定: + +1. 评审人的每个意见都必须回复(这是开源社区的基本礼貌,别人帮了忙,应该说谢谢): + - 对评审意见同意且按其修改完的,给个简单的`Done`即可; + - 对评审意见不同意的,请给出您自己的反驳理由。 +2. 如果评审意见比较多: + - 请给出总体的修改情况。 + - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复,而非直接回复的方式。原因是每个回复都会发送一封邮件,会造成邮件灾难。 diff --git a/doc/fluid/advanced_usage/development/new_op.md b/doc/fluid/advanced_usage/development/new_op.md new file mode 120000 index 0000000000000000000000000000000000000000..a2b0044b0138522dab0fbf1531abdc391f10c6db --- /dev/null +++ b/doc/fluid/advanced_usage/development/new_op.md @@ -0,0 +1 @@ +../../dev/new_op_cn.md \ No newline at end of file diff --git a/doc/fluid/advanced_usage/development/profiling/benchmark.rst b/doc/fluid/advanced_usage/development/profiling/benchmark.rst new file mode 100644 index 0000000000000000000000000000000000000000..7854263bf8f64c840492550fb22152582c7d2361 --- /dev/null +++ b/doc/fluid/advanced_usage/development/profiling/benchmark.rst @@ -0,0 +1,120 @@ +################# +如何进行基准测试 +################# + +本文介绍如何给深度学习框架做基准测试。基准测试主要包含验证模型的精度和性能两方面,下文包含搭建测试环境,选择基准测试模型,验证测试结果等几方面内容。 + +验证深度学习框架,可分为训练和测试两个阶段, 验证指标略有不同,本文只介绍训练阶段的指标验证。训练阶段关注的是模型训练集上的精度,训练集是完备的,因此关注大batch\_size下的训练速度,关注吞吐量,例如图像模型常用的batch\_size=128, 多卡情况下会加大;预测阶段关注的是在测试集上的精度,线上服务测试数据不能提前收集,因此关注小batch\_size下的预测速度,关注延迟,例如预测服务常用的batch\_size=1, 4等。 + +`Fluid `__ 是PaddlePaddle从0.11.0版本开始引入的设计,本文的基准测试在该版本上完成。 + + +环境搭建 +"""""""""""" + +基准测试中模型精度和硬件、框架无关,由模型结构和数据共同决定;性能方面由测试硬件和框架性能决定。框架基准测试为了对比框架之间的差异,控制硬件环境,系统库等版本一致。下文中的对比实验都在相同的硬件条件和系统环境条件下进行. 
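+
+开始测试前,可先用类似下面的命令记录当前的硬件与系统库信息(示意命令,具体检查项以实际测试环境为准):
+
+.. code:: bash
+
+    # 记录 GPU 型号与驱动版本
+    nvidia-smi
+    # 记录 CPU 型号
+    cat /proc/cpuinfo | grep "model name" | uniq
+    # 记录 CUDA 编译器版本
+    nvcc --version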
+ + +不同架构的GPU卡性能差异巨大,在验证模型在GPU上训练性能时,可使用NVIDIA提供的工具:code `nvidia-smi` 检验当前使用的GPU型号,如果测试多卡训练性能,需确认硬件连接是 `nvlink `__ 或 `PCIe `__ 。 同样地,CPU型号会极大影响模型在CPU上的训练性能。可读取`/proc/cpuinfo`中的参数,确认当前正在使用的CPU型号。 + +下载GPU对应的Cuda Tool Kit和 Cudnn,或者使用NVIDIA官方发布的nvidia-docker镜像 `nvidia-docker `__, 镜像内包含了Cuda和Cudnn,本文采用这种方式。 Cuda Tool Kit包含了GPU代码使用到的基础库,影响在此基础上编译出的Fluid二进制运行性能。 + +准备好Cuda环境后,从github上的下载Paddle并源码编译,会生成对应的最适合当前GPU的sm\_arch二进制\ `sm\_arch `__\ 。另外,cudnn对卷积类任务影响巨大,在基准测试中需要小版本一致,例如Cudnn7.0.2与Cudnn7.1.4在Resnet上有5%以上差异。 + + +选择基准模型 +"""""""""""" + +对框架做基准测试,需要覆盖不同训练任务和不同大小的模型,本文中选取了图像和NLP的最为常用的5个模型。 + +============ ============ ================= ============ +任务种类 模型名称 网络结构 数据集 +============ ============ ================= ============ +图像分类 mnist Lenet mnist +图像分类 VGG VGG-16 Flowers102 +图像分类 Resnet Resnet-50 Flowers102 +文本分类 Stacked-LSTM Stacked-LSTM IMDB +机器翻译 seq-seq Stacked-LSTM wmt14 +============ ============ ================= ============ + +其中mnist, VGG, Resnet属于CNN模型, stacked-lstm, seq2seq代表RNN模型。 +`benchmark `__ +基准模型测试脚本中,均跳过了前几个batch的训练过程,原因是加载数据和分配显存受系统当前运行情况影响,会导致统计性能不准确。运行完若干个轮次后,统计对应指标。 + + +基准模型的数据的选择方面,数据量大且验证效果多的公开数据集为首选。图像模型VGG和resnet, 本文选择了 `flowers102 `__ ,图像大小预处理为和Imagenet相同大小,因此性能可直接对比 +NLP模型的公开且影响力大数据集较少,seq2seq模型选择了wmt14数据,stacked-lstm模型中选择了 `imdb `__ 数据。 + + +注意,图像模型每条样本大小相同,图像经过变换后大小一致,因此经过的计算路径基本相同,计算速度和显存占用波动较小,可以从若干个batch的数据中采样得到当前的训练性能数据。而NLP模型由于样本长度不定,计算路径和显存占用也不相同,因此只能完整运行若干个轮次后,统计速度和显存消耗。 +显存分配是特别耗时的操作,因此Fluid默认会占用所有可用显存空间形成显存池,用以加速计算过程中的显存分配。如果需要统计模型真实显存消耗,可设置环境变量`FLAGS_fraction_of_gpu_memory_to_use=0.0`,观察最大显存开销。 + + +测试过程 +"""""""""""" + +- CPU 单机单线程测试 + +测试CPU上单线程的性能,先设置CUDA的环境变量为空,``CUDA_VISIBLE_DEVICES=``,并通过环境变量关闭OpenMP和MKL的多线程 ``OMP_NUM_THREADS=1``, ``MKL_NUM_THREADS=1;``。 +然后代码中设置为使用CPUPlace,如果使用Paddle代码库中的脚本,只需要命令行参数传入 use_gpu=False即可。 + +.. code-block:: python + + >>> import paddle.fluid as fluid + >>> place = fluid.CPUPlace() + +.. code:: bash + + docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark paddlepaddle/paddle:latest-dev /bin/bash + + +- GPU 单机单卡测试 + +本教程使用了Cuda8, Cudnn7.0.1。来源为:code `nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04` + +.. code:: bash + + nvidia-docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark -v /usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu paddlepaddle/paddle:latest-dev /bin/bash +在单卡上测试,设置CUDA的环境变量使用一块GPU,``CUDA_VISIBLE_DEVICES=0`` +然后代码中设置为使用CUDAPlace,如果使用Paddle代码库中的脚本,只需要命令行参数传入 use_gpu=True即可。 + +.. 
code-block:: python + + >>> import paddle.fluid as fluid + >>> place = fluid.CUDAPlace(0) // 0 指第0块GPU + + +测试结果 +"""""""""""" + +本教程对比相同环境下的Fluid0.12.0和TensorFlow1.4.0的性能表现。 +硬件环境为 CPU: Intel(R) Xeon(R) CPU E5-2660 v4 @ 2.00GHz, GPU: TITAN X(Pascal) 12G x 1, Nvidia-Driver 384.90。 +系统环境为Ubuntu 16.04.3 LTS, 本文中采用了docker环境,系统版本为nvidia-docker17.05.0-ce。 +测试的Fluid版本为\ `v.0.12.0 `__ 。 +TensorFlow版本为\ `v.1.4.0-rc1 `__ 。 +使用的脚本和配置见\ `benchmark `__ 。 +图表中统计单位为samples/秒。 + +- CPU 单机单线程测试结果 + + ================ ==================== =================== + Speed Fluid CPU TensorFlow CPU + ================ ==================== =================== + mnist 1298.75 samples/s 637.57 samples/s + VGG-16 0.4147 images/s 0.1229 images/s + Resnet-50 1.6935 images/s 0.3657 images/s + Stacked-LSTM 472.3225 words/s 48.2293words/s + Seq2Seq 217.1655 words/s 28.6164 words/s + ================ ==================== =================== + +- GPU 单机单卡测试结果 + + =============== ===================== ================= + Speed Fluid GPU TensorFlow GPU + =============== ===================== ================= + mnist 19710.90 samples/s 15576.3 samples/s + VGG-16 59.83327 images/s 40.9967 images/s + Resnet-50 105.84412 97.8923 images/s + Stacked-LSTM 1319.99315 1608.2526 words/s + Seq2Seq 7147.89081 6845.1161 words/s + =============== ===================== ================= diff --git a/doc/fluid/advanced_usage/development/profiling/cpu_profiling_cn.md b/doc/fluid/advanced_usage/development/profiling/cpu_profiling_cn.md new file mode 120000 index 0000000000000000000000000000000000000000..1381a3b05f6761c60742eb9365708d94ad8a2642 --- /dev/null +++ b/doc/fluid/advanced_usage/development/profiling/cpu_profiling_cn.md @@ -0,0 +1 @@ +../../../howto/optimization/cpu_profiling_cn.md \ No newline at end of file diff --git a/doc/fluid/advanced_usage/development/profiling/gpu_profiling_cn.rst b/doc/fluid/advanced_usage/development/profiling/gpu_profiling_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..481fc916652b7f20c2223c1b30b6eaac78659d55 --- /dev/null +++ b/doc/fluid/advanced_usage/development/profiling/gpu_profiling_cn.rst @@ -0,0 +1,239 @@ +============ +GPU性能调优 +============ + +.. contents:: + +此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。 + +- 什么是性能分析? +- 为什么需要性能分析? +- 如何进行性能分析? +- 性能分析工具介绍 +- 详细教程 +- 性能分析小技巧 + +什么是性能分析? +================ +在软件工程的范畴里,性能分析(Profiling)是一个动态程序分析的术语,它可以指测量一个程序的空间(内存)复杂度或时间复杂度, +也可以说是某些特定指令的使用情况,或者是函数调用的频率和耗时等。通常情况下,分析得到的信息用于协助进行程序的优化。 + +简单来说,性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为,那程序分析工具是必不可少的利器。简单的性能分析,可以告诉您某个操作到底花了多长时间?而更深入的分析,甚至能解释为什么某个操作花了很长时间? + +为什么需要性能分析? +============================ +训练好一个深层神经网络通常要耗费非常长的时间,所以性能也就逐步变成了深度学习领域最重要的指标。 +而优化性能的首要任务,是需要了解哪些步骤拖慢了整体。 +如果某一块根本就不怎么耗时,那也就不需要急着优化性能啦! + +如何进行性能分析? +======================== +为了达到性能最优,您可以采用下面五个步骤: + +- 对代码进行性能分析 +- 找到运行慢的部分 +- 找到运行慢的原因 +- 修改成更快的版本 +- 再次对代码进行性能分析 + +Usually, processor has two key performance limits include float point throughput and +memory throughput. For GPU, it also need more parallelism to fulfill its potential. +This is why they can be so fast. + +通常情况下,处理器有两个关键性能限制:一个是浮点计算量,另一个是内存操作量。 +GPU则还需要高并行性,才能发挥其全部能力。这正是它们速度快的原因。 + +性能分析工具介绍 +====================== +就通常的GPU性能分析来说,市面上已经有NVIDIA或第三方提供的众多工具。 + +**nvprof** 是Nvidia性能分析工具, **nvvp** 则是带GUI的Nvidia可视化性能分析工具。 +在这个教程中,我们主要会介绍nvprof和nvvp。 + +:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate +above profilers. 
+ +:code:`paddle/legacy/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。 + +.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :linenos: + +上述的代码片段包含了两种方法,您可以任意使用一个或两个来对感兴趣的代码段做性能分析。 + +1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装,可以用来计算CPU函数或cuda内核的时间消耗。 +2. :code:`REGISTER_GPU_PROFILER` 是 :code:`cudaProfilerStart` 和 :code:`cudaProfilerStop` 的通用包装对象,避免当CPU版本的PaddlePaddle调用它们时程序崩溃。 +3. :code:`REGISTER_GPU_PROFILER` 是一个封装对象,封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfileStop` 两个操作;同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。 + +您会在接下来的部分中获得更多的细节介绍。 + +详细教程 +============ + +内置定时器 +------------ + +如果想要启用PaddlePaddle的内置定时器,您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。 +接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。 +下面举个简单的例子: + +1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数(如高亮部分)。 + + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :emphasize-lines: 8-12,14 + :linenos: + +2. cmake配置中将 **WITH_TIMER** 打开,重新编译PaddlePaddle。 + + .. code-block:: bash + + cmake .. -DWITH_TIMER=ON + make + +3. 执行您的代码,并观察结果(如高亮部分)。 + + .. code-block:: bash + :emphasize-lines: 1,12-15 + + > ./paddle/legacy/math/tests/test_GpuProfiler + I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler + I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions + I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. + [==========] Running 1 test from 1 test case. + [----------] Global test environment set-up. + [----------] 1 test from Profiler + [ RUN ] Profiler.BilinearFwdBwd + I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im + gSizeX = 64, imgSizeY = 64" + I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751 + I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ====== + I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1 + I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ====== + I1117 11:13:42.981575 2522362816 Stat.cpp:154] -------------------------------------------------- + [ OK ] Profiler.BilinearFwdBwd (136 ms) + [----------] 1 test from Profiler (136 ms total) + + [----------] Global test environment tear-down + [==========] 1 test from 1 test case ran. (136 ms total) + [ PASSED ] 1 test. + +nvprof 工具 +---------------- + +要使用命令行分析工具 **nvprof**,您按如下步骤操作即可: + +1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中(参考强调部分)。 + + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :emphasize-lines: 6-7 + :linenos: + +2. cmake中将 **WITH_PROFILER** 配置打开,重新编译PaddlePaddle。 + + .. code-block:: bash + + cmake .. -DWITH_PROFILER=ON + make + +3. 使用 **nvprof** 来分析执行文件。 + + .. code-block:: bash + + nvprof ./paddle/legacy/math/tests/test_GpuProfiler + +然后,您就能获得如下的分析结果: + +.. 
code-block:: bash + + ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler + ==78544== Profiling result: + Time(%) Time Calls Avg Min Max Name + 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] + 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw + 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw + 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH] + + ==78544== API calls: + Time(%) Time Calls Avg Min Max Name + 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags + 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree + 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate + 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy + 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize + 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc + 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc + 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice + 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags + 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute + 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount + 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties + 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch + 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName + 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem + 0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice + 0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate + 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute + 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart + 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall + 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError + 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument + 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet + 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount + 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion + 0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit + 0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion + + +nvvp 工具 +-------------- + +如果想使用可视化的分析器 **nvvp**,您可以导入 :code:`nvprof -o ...` 的输出,或者从工具的界面里运行您的应用。 + +**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启) + +.. image:: nvvp1.png + :align: center + :scale: 33% + +从内核函数的角度, **nvvp** 可以精确说明一个长耗时操作的具体原因。 +同时,如下图所示, **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。 + + +.. image:: nvvp2.png + :align: center + :scale: 33% + +而从应用的角度, **nvvp** 可以帮您提供一些定位性能瓶颈的建议。 +例如,下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议,为您做性能调优提供了方向。 + +.. image:: nvvp3.png + :align: center + :scale: 33% + +.. image:: nvvp4.png + :align: center + :scale: 33% + +性能分析小技巧 +================== + +- 开始阶段,从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。 +- 接下来可以考虑下时间线的分析。 +- 如果真想挖掘内核深处的某个秘密,您最好先确认:这一块的耗时比例真的太高,值得深入分析。 +- 可能的情况下,试着让输出的分析数据和理论值对应。 + + 1) 例如,如果我知道内核花了10ms来移动1GB数据,那我会期望分析工具统计到速度是100GB/s。 + 2) 若有不一致之处,很有可能实际应用就是没有按照您的预期情况运行。 +- 了解您的硬件:如果您的GPU理论可以达到6 TFLOPs(6万亿次浮点运算每秒),而当前已经有5.5 TFLOPs了,那估计这里的潜力就没啥好挖的了…… + +性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果! 
+当然,具体情况因人而异。 + +参考资料 +=========== +Jeremy Appleyard, `GPU Profiling for Deep Learning `_, 2015 diff --git a/doc/fluid/advanced_usage/development/profiling/host_memory_profiling_cn.md b/doc/fluid/advanced_usage/development/profiling/host_memory_profiling_cn.md new file mode 120000 index 0000000000000000000000000000000000000000..904968ba4a8d6cc6489c91a0a751e0a33dcc873c --- /dev/null +++ b/doc/fluid/advanced_usage/development/profiling/host_memory_profiling_cn.md @@ -0,0 +1 @@ +../../../howto/optimization/host_memory_profiling_cn.md \ No newline at end of file diff --git a/doc/fluid/advanced_usage/development/profiling/index.rst b/doc/fluid/advanced_usage/development/profiling/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..1d2eedc439d948396a5372197e66f00f0fa75a56 --- /dev/null +++ b/doc/fluid/advanced_usage/development/profiling/index.rst @@ -0,0 +1,10 @@ +########## +性能调优 +########## +.. toctree:: + + benchmark.rst + cpu_profiling_cn.md + gpu_profiling_cn.rst + host_memory_profiling_cn.md + timeline_cn.md diff --git a/doc/fluid/advanced_usage/development/profiling/nvvp1.png b/doc/fluid/advanced_usage/development/profiling/nvvp1.png new file mode 100644 index 0000000000000000000000000000000000000000..1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77 Binary files /dev/null and b/doc/fluid/advanced_usage/development/profiling/nvvp1.png differ diff --git a/doc/fluid/advanced_usage/development/profiling/nvvp2.png b/doc/fluid/advanced_usage/development/profiling/nvvp2.png new file mode 100644 index 0000000000000000000000000000000000000000..177c9db708da6863d1075f3e615f5962dbe18b29 Binary files /dev/null and b/doc/fluid/advanced_usage/development/profiling/nvvp2.png differ diff --git a/doc/fluid/advanced_usage/development/profiling/nvvp3.png b/doc/fluid/advanced_usage/development/profiling/nvvp3.png new file mode 100644 index 0000000000000000000000000000000000000000..d8f393667d6569b6f1e61ffccac43fae5888b6db Binary files /dev/null and b/doc/fluid/advanced_usage/development/profiling/nvvp3.png differ diff --git a/doc/fluid/advanced_usage/development/profiling/nvvp4.png b/doc/fluid/advanced_usage/development/profiling/nvvp4.png new file mode 100644 index 0000000000000000000000000000000000000000..51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01 Binary files /dev/null and b/doc/fluid/advanced_usage/development/profiling/nvvp4.png differ diff --git a/doc/fluid/advanced_usage/development/profiling/pprof_1.png b/doc/fluid/advanced_usage/development/profiling/pprof_1.png new file mode 100644 index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb Binary files /dev/null and b/doc/fluid/advanced_usage/development/profiling/pprof_1.png differ diff --git a/doc/fluid/advanced_usage/development/profiling/pprof_2.png b/doc/fluid/advanced_usage/development/profiling/pprof_2.png new file mode 100644 index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b Binary files /dev/null and b/doc/fluid/advanced_usage/development/profiling/pprof_2.png differ diff --git a/doc/fluid/advanced_usage/development/profiling/timeline.jpeg b/doc/fluid/advanced_usage/development/profiling/timeline.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..38ec3f80c982857531f30a8bb0fa26ea5bf05385 Binary files /dev/null and b/doc/fluid/advanced_usage/development/profiling/timeline.jpeg differ diff --git a/doc/fluid/advanced_usage/development/profiling/timeline_cn.md b/doc/fluid/advanced_usage/development/profiling/timeline_cn.md new file mode 
120000 index 0000000000000000000000000000000000000000..a05540e82a7fa795dcd8e7306261ef9bef57426f --- /dev/null +++ b/doc/fluid/advanced_usage/development/profiling/timeline_cn.md @@ -0,0 +1 @@ +../../../howto/optimization/timeline_cn.md \ No newline at end of file diff --git a/doc/fluid/advanced_usage/development/tracing.jpeg b/doc/fluid/advanced_usage/development/tracing.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..3a49fc4f8a401a9463b0157e2f38c164ca02dcc5 Binary files /dev/null and b/doc/fluid/advanced_usage/development/tracing.jpeg differ diff --git a/doc/fluid/advanced_usage/development/write_docs_cn.md b/doc/fluid/advanced_usage/development/write_docs_cn.md new file mode 120000 index 0000000000000000000000000000000000000000..ea03c90b51c19db2e9aa37b6091ef92c8ccf1790 --- /dev/null +++ b/doc/fluid/advanced_usage/development/write_docs_cn.md @@ -0,0 +1 @@ +../../dev/write_docs_cn.md \ No newline at end of file diff --git a/doc/fluid/advanced_usage/index.rst b/doc/fluid/advanced_usage/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..c8f6521af18e269966d375ad2f0d13f6b97a0452 --- /dev/null +++ b/doc/fluid/advanced_usage/index.rst @@ -0,0 +1,49 @@ +######## +进阶使用 +######## + +===================== + 概览 +===================== +.. todo:: + +如果您非常熟悉 Fluid,期望获得更高效的模型或者定义自己的Operator,请阅读: + + - `移动端部署 <../advanced_usage/deploy/index_mobile.html>`_:介绍了 PaddlePaddle 组织下的嵌入式平台深度学习框架——Paddle-Mobile,包括: + + - `简介 <../advanced_usage/deploy/mobile_readme.html>`_:简要介绍了 Paddle-Mobile 的应用效果,特点以及使用说明 + - `环境搭建 <../advanced_usage/deploy/mobile_build.html>`_:从使用 Docker 和不使用 Docker 两种方法下分别介绍如何搭建环境 + - `ios开发文档 <../advanced_usage/deploy/mobile_dev.html>`_:介绍如何在 ios 系统下运用 Paddle-Mobile 进行开发 + + - `Anakin预测引擎 <../advanced_usage/deploy/index_anakin.html>`_:介绍如何使用 Anakin 在不同硬件平台实现深度学习的高速预测 + + - `如何写新的Operator <../advanced_usage/development/new_op.html>`_ :介绍如何在 Fluid 中添加新的 Operator + + - `性能调优 <../advanced_usage/development/profiling/index.html>`_ :介绍 Fluid 使用过程中的调优方法,包括: + + - `如何进行基准测试 <../advanced_usage/development/profiling/benchmark.html>`_:介绍如何选择基准模型,从而验证模型的精度和性能 + - `CPU性能调优 <../advanced_usage/development/profiling/cpu_profiling_cn.html>`_:介绍如何使用 cProfile 包、yep库、Google perftools 进行性能分析与调优 + - `GPU性能调优 <../advanced_usage/development/profiling/gpu_profiling_cn.html>`_:介绍如何使用 Fluid 内置的定时工具、nvprof 或 nvvp 进行性能分析和调优 + - `堆内存分析和优化 <../advanced_usage/development/profiling/host_memory_profiling_cn.html>`_:介绍如何使用 gperftool 进行堆内存分析和优化,以解决内存泄漏的问题 + - `Timeline工具简介 <../advanced_usage/development/profiling/timeline_cn.html>`_ :介绍如何使用 Timeline 工具进行性能分析和调优 + + +非常欢迎您为我们的开源社区做出贡献,关于如何贡献您的代码或文档,请阅读: + + - `如何贡献代码 <../advanced_usage/development/contribute_to_paddle.html>`_:介绍如何向 PaddlePaddle 开源社区贡献代码 + + - `如何贡献文档 <../advanced_usage/development/write_docs_cn.html>`_:介绍如何向 PaddlePaddle 开源社区贡献文档 + +===================== + 目录 +===================== + +.. 
toctree:: + :maxdepth: 2 + + deploy/index_mobile.rst + deploy/index_anakin.rst + development/contribute_to_paddle.md + development/write_docs_cn.md + development/new_op.md + development/profiling/index.rst diff --git a/doc/fluid/advanced_usage/pics/anakin_fm_ch.png b/doc/fluid/advanced_usage/pics/anakin_fm_ch.png new file mode 100644 index 0000000000000000000000000000000000000000..52d4992a22397119af949aa7c11a9ea6365c167c Binary files /dev/null and b/doc/fluid/advanced_usage/pics/anakin_fm_ch.png differ diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..435d6e10fb02e9b2a8147f37da33e8848cc9b98a --- /dev/null +++ b/doc/fluid/api/CMakeLists.txt @@ -0,0 +1,25 @@ +# configured documentation tools and intermediate build results +set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build") + +# Sphinx cache with pickled ReST documents +set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") + +# HTML output director +set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") + +set(IMPORT_PADDLE_STRING "import paddle") +set(IMPORT_PADDLEV2_STRING "import paddle.v2") + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in" + "${BINARY_BUILD_DIR_EN}/conf.py" + @ONLY) + +sphinx_add_target(paddle_fluid_apis + html + ${BINARY_BUILD_DIR_EN} + ${SPHINX_CACHE_DIR_EN} + ${CMAKE_CURRENT_SOURCE_DIR} + ${SPHINX_HTML_DIR_EN}) + +add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python) diff --git a/doc/fluid/api/api_guides/high_low_level_api.md b/doc/fluid/api/api_guides/high_low_level_api.md new file mode 100644 index 0000000000000000000000000000000000000000..8ce2695737c45306c9a0944c35b2e865ce380f05 --- /dev/null +++ b/doc/fluid/api/api_guides/high_low_level_api.md @@ -0,0 +1,15 @@ +## High/Low-level API简介 + +Paddle目前有2套API接口: + +- Low-level(底层) API: + + - 灵活性强并且已经相对成熟,使用它训练的模型,能直接支持C++预测上线。 + - 提供了大量的模型作为使用示例,包括[Book](https://github.com/PaddlePaddle/book)中的第7和8章,以及[models](https://github.com/PaddlePaddle/models)中的所有章节。 + - 适用人群:对深度学习有一定了解,需要自定义网络进行训练/预测/上线部署的用户。 + +- High-level(高层)API: + + - 使用简单,[Book](https://github.com/PaddlePaddle/book)中前六章提供了示例。 + - 尚未成熟,接口暂时在[paddle.fluid.contrib](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/contrib)下面。 + - 适用人群:想通过Book课程进行深度学习基础知识学习的初级用户。 diff --git a/doc/fluid/api/api_guides/index.rst b/doc/fluid/api/api_guides/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..05009c3607c6071d71edac61c0d27f3e86395b2b --- /dev/null +++ b/doc/fluid/api/api_guides/index.rst @@ -0,0 +1,16 @@ +=========== +API使用指南 +=========== + +.. toctree:: + :titlesonly: + + high_low_level_api.md + low_level/layers/index.rst + low_level/executor.rst + low_level/optimizer.rst + low_level/metrics.rst + low_level/model_save_reader.rst + low_level/inference.rst + + diff --git a/doc/fluid/api/api_guides/low_level/executor.rst b/doc/fluid/api/api_guides/low_level/executor.rst new file mode 100644 index 0000000000000000000000000000000000000000..5617d14c143e17f59042eff2509eda024adefd40 --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/executor.rst @@ -0,0 +1,20 @@ +.. 
_api_guide_executor: + +########## +执行引擎 +########## + +:code:`Executor` 即 :code:`执行器` 。PaddlePaddle Fluid中有两种执行器可以选择。 +:code:`Executor` 实现了一个简易的执行器,所有Operator会被顺序执行。用户可以使用 +Python脚本驱动 :code:`Executor` 执行。默认情况下 :code:`Executor` 是单线程的,如果 +想使用数据并行,请参考另一个执行器, :ref:`api_guide_parallel_executor` 。 + +:code:`Executor` 的代码逻辑非常简单。建议用户在调试过程中,先使用 +:code:`Executor` 跑通模型,再切换到多设备计算,甚至多机计算。 + +:code:`Executor` 在构造的时候接受一个 :code:`Place`, 它们可以是 :ref:`api_fluid_CPUPlace` +或 :ref:`api_fluid_CUDAPlace` 。 :code:`Executor` 在执行的时候可以选择执行的 +:ref:`api_guide_low_level_program` 。 + +简单的使用方法,请参考 `quick_start_fit_a_line `_ , API Reference 请参考 +:ref:`api_fluid_Executor` 。 diff --git a/doc/fluid/api/api_guides/low_level/inference.rst b/doc/fluid/api/api_guides/low_level/inference.rst new file mode 100644 index 0000000000000000000000000000000000000000..2a61fb307534ec5b5bbc4b89219be9fc31961430 --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/inference.rst @@ -0,0 +1,55 @@ +.. _api_guide_inference: + +######### +预测引擎 +######### + +预测引擎提供了存储预测模型 :ref:`api_fluid_io_save_inference_model` 和加载预测模型 :ref:`api_fluid_io_load_inference_model` 两个接口。 + +预测模型的存储格式 +================= + +预测模型的存储格式有两种,由上述两个接口中的 :code:`model_filename` 和 :code:`params_filename` 变量控制: + +- 参数保存到各个独立的文件,如设置 :code:`model_filename` 为 :code:`None` 、:code:`params_filename` 为 :code:`None` + + .. code-block:: bash + + ls recognize_digits_conv.inference.model/* + __model__ conv2d_1.w_0 conv2d_2.w_0 fc_1.w_0 conv2d_1.b_0 conv2d_2.b_0 fc_1.b_0 + +- 参数保存到同一个文件,如设置 :code:`model_filename` 为 :code:`None` 、:code:`params_filename` 为 :code:`__params__` + + .. code-block:: bash + + ls recognize_digits_conv.inference.model/* + __model__ __params__ + +存储预测模型 +=========== + +.. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./infer_model" + fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'], + target_vars=[predict_var], executor=exe) + +在这个示例中,:code:`fluid.io.save_inference_model` 接口对默认的 :code:`fluid.Program` 进行裁剪,只保留预测 :code:`predict_var` 所需部分。 +裁剪后的 :code:`program` 会保存在 :code:`./infer_model/__model__` 下,参数会保存到 :code:`./infer_model` 下的各个独立文件。 + +加载预测模型 +=========== + +.. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./infer_model" + [inference_program, feed_target_names, fetch_targets] = + fluid.io.load_inference_model(dirname=path, executor=exe) + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + +在这个示例中,首先调用 :code:`fluid.io.load_inference_model` 接口,获得预测的 :code:`program` 、输入数据的 :code:`variable` 名称和输出结果的 :code:`variable` ; +然后调用 :code:`executor` 执行预测的 :code:`program` 获得预测结果。 diff --git a/doc/fluid/api/api_guides/low_level/layers/activations.rst b/doc/fluid/api/api_guides/low_level/layers/activations.rst new file mode 100644 index 0000000000000000000000000000000000000000..615e364d525b483adcdcad89272a06fc5ade70e7 --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/layers/activations.rst @@ -0,0 +1,28 @@ +.. 
_api_guide_activations: + +#### +激活函数 +#### + +激活函数将非线性的特性引入到神经网络当中。 + +PaddlePaddle Fluid 对大部分的激活函数进行了支持,其中有: + +:ref:`api_fluid_layers_relu`, :ref:`api_fluid_layers_tanh`, :ref:`api_fluid_layers_sigmoid`, :ref:`api_fluid_layers_elu`, :ref:`api_fluid_layers_relu6`, :ref:`api_fluid_layers_pow`, :ref:`api_fluid_layers_stanh`, :ref:`api_fluid_layers_hard_sigmoid`, :ref:`api_fluid_layers_swish`, :ref:`api_fluid_layers_prelu`, :ref:`api_fluid_layers_brelu`, :ref:`api_fluid_layers_leaky_relu`, :ref:`api_fluid_layers_soft_relu`, :ref:`api_fluid_layers_thresholded_relu`, :ref:`api_fluid_layers_maxout`, :ref:`api_fluid_layers_logsigmoid`, :ref:`api_fluid_layers_hard_shrink`, :ref:`api_fluid_layers_softsign`, :ref:`api_fluid_layers_softplus`, :ref:`api_fluid_layers_tanh_shrink`, :ref:`api_fluid_layers_softshrink`, :ref:`api_fluid_layers_exp`。 + + +**Fluid提供了两种使用激活函数的方式:** + +- 如果一个层的接口提供了 :code:`act` 变量(默认值为None),我们可以通过该变量指定该层的激活函数类型。该方式支持常见的激活函数: :code:`relu`, :code:`tanh`, :code:`sigmoid`, :code:`identity`。 + +.. code-block:: python + + conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu") + + +- Fluid为每个Activation提供了接口,我们可以显式的对它们进行调用。 + +.. code-block:: python + + conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3) + relu1 = fluid.layers.relu(conv2d) diff --git a/doc/fluid/api/api_guides/low_level/layers/control_flow.rst b/doc/fluid/api/api_guides/low_level/layers/control_flow.rst new file mode 100644 index 0000000000000000000000000000000000000000..c2192b498cb335c29e1e56dedc1c248cada47198 --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/layers/control_flow.rst @@ -0,0 +1,53 @@ +.. api_guide_control_flow: + +###### +控制流 +###### + +在程序语言中,控制流(control flow)决定了语句的执行顺序,常见的控制流包括顺序执行、分支和循环等。PaddlePaddle Fluid继承了这一概念,提供了多种控制流API, 以控制深度学习模型在训练或者预测过程中的执行逻辑。 + +IfElse +====== + +条件分支,允许对同一个batch的输入,根据给定的条件,分别选择 :code:`true_block` 或 :code:`false_block` 中的逻辑进行执行,执行完成之后再将两个分支的输出合并为同一个输出。通常,条件表达式可由 :ref:`api_fluid_layers_less_than`, :ref:`api_fluid_layers_equal` 等逻辑比较 API 产生。 + +请参考 :ref:`api_fluid_layers_IfElse` + + +Switch +====== + +多分支选择结构,如同程序语言中常见的 :code:`switch-case` 声明, 其根据输入表达式的取值不同,选择不同的分支执行。具体来说,Fluid 所定义的 :code:`Switch` 控制流有如下特性: + +* case的条件是个bool类型的值,即在Program中是一个张量类型的Variable; +* 依次检查逐个case,选择第一个满足条件的case执行,完成执行后即退出所属的block; +* 如果所有case均不满足条件,会选择默认的case进行执行。 + +请参考 :ref:`api_fluid_layers_Switch` + +While +===== + +While 循环,当条件判断为真时,循环执行 :code:`While` 控制流所属 :code:`block` 内的逻辑,条件判断为假时退出循环。与之相关的API有 + +* :ref:`api_fluid_layers_increment` :累加API,通常用于对循环次数进行计数; +* :ref:`api_fluid_layers_array_read` :从 :code:`LOD_TENSOR_ARRAY` 中指定的位置读入Variable,进行计算; +* :ref:`api_fluid_layers_array_write` :将 Variable 写回到 :code:`LOD_TENSOR_ARRAY` 指定的位置,存储计算结果。 + +请参考 :ref:`api_fluid_layers_While` + +DynamicRNN +========== + +即动态RNN,可处理一个batch不等长的序列数据,其接受 :code:`lod_level=1` 的 Variable 作为输入,在 :code:`DynamicRNN` 的 :code:`block` 内,用户需自定义RNN的单步计算逻辑。在每一个时间步,用户可将需记忆的状态写入到 :code:`DynamicRNN` 的 :code:`memory` 中,并将需要的输出写出到其 :code:`output` 中。 + +:ref:`api_fluid_layers_sequence_last_step` 可获取 :code:`DynamicRNN` 最后一个时间步的输出。 + +请参考 :ref:`api_fluid_layers_DynamicRNN` + +StaticRNN +========= + +即静态RNN,只能处理固定长度的序列数据,接受 :code:`lod_level=0` 的 Variable 作为输入。与 :code:`DynamicRNN` 类似,在RNN的每单个时间步,用户需自定义计算逻辑,并可将状态和输出写出。 + +请参考 :ref:`api_fluid_layers_StaticRNN` diff --git a/doc/fluid/api/api_guides/low_level/layers/conv.rst b/doc/fluid/api/api_guides/low_level/layers/conv.rst new file mode 100644 index 0000000000000000000000000000000000000000..018c3d56ef96f6101f43b82b5e68ced24ce4942c --- 
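下面给出一个使用 :code:`While` 控制流的最小示意(其中的计数逻辑与变量名均为演示用的假设,并非唯一写法):

.. code-block:: python

    import paddle.fluid as fluid

    # 计数器与循环上限(变量名仅为示意)
    i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
    limit = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
    cond = fluid.layers.less_than(x=i, y=limit)

    while_op = fluid.layers.While(cond=cond)
    with while_op.block():
        # 循环体:计数器加一,并把新的比较结果写回循环条件
        i = fluid.layers.increment(x=i, in_place=True)
        fluid.layers.less_than(x=i, y=limit, cond=cond)

注意循环条件 :code:`cond` 必须在循环体内被显式更新,否则循环不会终止。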
/dev/null +++ b/doc/fluid/api/api_guides/low_level/layers/conv.rst @@ -0,0 +1,64 @@ +.. _api_guide_conv: + +##### +卷积 +##### + +卷积有两组输入:特征图和卷积核,依据输入特征和卷积核的形状、Layout不同、计算方式的不同,在Fluid里,有针对变长序列特征的一维卷积,有针对定长图像特征的二维(2D Conv)、三维卷积(3D Conv),同时也有卷积计算的逆向过程,下面先介绍Fluid里的2D/3D卷积,再来介绍序列卷积。 + + +2D/3D卷积 +============== + +1. 卷积输入参数: +--------------------- + +卷积需要依据滑动步长(stride)、填充长度(padding)、卷积核窗口大小(filter size)、分组数(groups)、扩张系数(dilation rate)来决定如何计算。groups最早在 `AlexNet `_ 中引入, 可以理解为将原始的卷积分为独立若干组卷积计算。 + + **注意**: 同cuDNN的方式,Fluid目前只支持在特征图上下填充相同的长度,左右也是。 + +- 输入输出Layout: + + 2D卷积输入特征的Layout为[N, C, H, W]或[N, H, W, C], N即batch size,C是通道数,H、W是特征的高度和宽度,输出特征和输入特征的Layout一致。(相应的3D卷积输入特征的Layout为[N, C, D, H, W]或[N, D, H, W, C],但**注意**,Fluid的卷积当前只支持[N, C, H, W],[N, C, D, H, W]。) + +- 卷积核的Layout: + + Fluid中2D卷积的卷积核(也称权重)的Layout为[C_o, C_in / groups, f_h, f_w],C_o、C_in表示输出、输入通道数,f_h、f_w表示卷积核窗口的高度和宽度,按行序存储。(相应的2D卷积的卷积核Layout为[C_o, C_in / groups, f_d, f_h, d_w],同样按行序存储。) + +- 深度可分离卷积(depthwise separable convolution): + + 在深度可分离卷积中包括depthwise convolution和pointwise convolution两组,这两个卷积的接口和上述普通卷积接口相同。前者可以通过给普通卷积设置groups来做,后者通过设置卷积核filters的大小为1x1,深度可分离卷积减少参数的同时减少了计算量。 + + 对于depthwise convolution,可以设置groups等于输入通道数,此时,2D卷积的卷积核形状为[C_o, 1, f_h, f_w]。 + 对于pointwise convolution,卷积核的形状为[C_o, C_in, 1, 1]。 + + **注意**:Fluid针对depthwise convolution的GPU计算做了高度优化,您可以通过在 :code:`fluid.layers.conv2d`接口设置 :code:`use_cudnn=False`来使用Fluid自身优化的CUDA程序。 + +- 空洞卷积(dilated convolution): + + 空洞卷积相比普通卷积而言,卷积核在特征图上取值时不在连续,而是间隔的,这个间隔数称作dilation,等于1时,即为普通卷积,空洞卷积相比普通卷积的感受野更大。 + +- API汇总: + - :ref:`api_fluid_layers_conv2d` + - :ref:`api_fluid_layers_conv3d` + - :ref:`api_fluid_layers_conv2d_transpose` + - :ref:`api_fluid_layers_conv3d_transpose` + + +1D序列卷积 +============== + +Fluid可以表示变长的序列结构,这里的变长是指不同样本的时间步(step)数不一样,通常是一个2D的Tensor和一个能够区分的样本长度的辅助结构来表示。假定,2D的Tensor的形状是shape,shape[0]是所有样本的总时间步数,shape[1]是序列特征的大小。 + +基于此数据结构的卷积在Fluid里称作序列卷积,也表示一维卷积。同图像卷积,序列卷积的输入参数有卷积核大小、填充大小、滑动步长,但与2D卷积不同的是,这些参数个数都为1。**注意**,目前仅支持stride为1的情况,输出序列的时间步数和输入序列相同。 + +假如:输入序列形状为(T, N), T即该序列的时间步数,N是序列特征大小;卷积核的上下文步长为K,输出序列长度为M,则卷积核权重形状为(K * N, M),输出序列形状为(T, M)。 + +另外,参考DeepSpeech,Fluid实现了行卷积row convolution, 或称 +`look ahead convolution `_ , +该卷积相比上述普通序列卷积可以减少参数。 + + +- API汇总: + - :ref:`api_fluid_layers_sequence_conv` + - :ref:`api_fluid_layers_row_conv` diff --git a/doc/fluid/api/api_guides/low_level/layers/data_in_out.rst b/doc/fluid/api/api_guides/low_level/layers/data_in_out.rst new file mode 100644 index 0000000000000000000000000000000000000000..ca12c41526553a7acbbcdb3b2c104d59b01c3780 --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/layers/data_in_out.rst @@ -0,0 +1,33 @@ +.. _api_guide_data_in_out: + +数据输入输出 +############### + + +数据输入 +------------- + +Fluid支持两种数据输入方式,包括: + +1. Python Reader: 纯Python的Reader。用户在Python端定义 :code:`fluid.layers.data` 层构建网络,并通过 +:code:`executor.run(feed=...)` 的方式读入数据。数据读取和模型训练/预测的过程是同步进行的。 + +2. 
PyReader: 高效灵活的C++ Reader接口。PyReader内部维护容量为 :code:`capacity` 的队列(队列容量由 +:code:`fluid.layers.py_reader` 接口中的 :code:`capacity` 参数设置),Python端调用队列的 :code:`push` +方法送入训练/预测数据,C++端的训练/预测程序调用队列的 :code:`pop` 方法取出Python端送入的数据。PyReader可与 +:code:`double_buffer` 配合使用,实现数据读取和训练/预测的异步执行。 + +具体使用方法请参考 :ref:`user_guide_use_py_reader`。 + + +数据输出 +------------ + +Fluid支持在训练/预测阶段获取当前batch的数据。 + +用户可通过 :code:`executor.run(fetch_list=[...], return_numpy=...)` 的方式 +fetch期望的输出变量,通过设置 :code:`return_numpy` 参数设置是否将输出数据转为numpy array。 +若 :code:`return_numpy` 为 :code:`False` ,则返回 :code:`LoDTensor` 类型数据。 + +具体使用方式请参考相关API文档 :ref:`api_fluid_executor_Executor` 和 +:ref:`api_fluid_ParallelExecutor`。 \ No newline at end of file diff --git a/doc/fluid/api/api_guides/low_level/layers/detection.rst b/doc/fluid/api/api_guides/low_level/layers/detection.rst new file mode 100644 index 0000000000000000000000000000000000000000..f277c27ceaa5e1c5649f5212d377442e3fd73860 --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/layers/detection.rst @@ -0,0 +1,63 @@ +.. _api_guide_detection: + + +图像检测 +######### + +PaddlePaddle Fluid在图像检测任务中实现了多个特有的操作。以下分模型介绍各个api: + +通用操作 +------------- + +图像检测中的一些通用操作,是对检测框的一系列操作,其中包括: + +* 对检测框的编码,解码(box_coder):实现两种框之间编码和解码的转换。例如训练阶段对先验框和真实框进行编码得到训练目标值。API Reference 请参考 :ref:`api_fluid_layers_box_coder` + +* 比较两个检测框并进行匹配: + + * iou_similarity:计算两组框的IOU值。API Reference 请参考 :ref:`api_fluid_layers_iou_similarity` + + * bipartite_match:通过贪心二分匹配算法得到每一列中距离最大的一行。API Reference 请参考 :ref:`api_fluid_layers_bipartite_match` + +* 根据检测框和标签得到分类和回归目标值(target_assign):通过匹配索引和非匹配索引得到目标值和对应权重。API Reference 请参考 :ref:`api_fluid_layers_target_assign` + + +Faster RCNN +------------- + +`Faster RCNN `_ 是典型的两阶段目标检测器,相较于传统提取区域的方法,Faster RCNN中RPN网络通过共享卷积层参数大幅提高提取区域的效率,并提出高质量的候选区域。RPN网络需要对输入anchor和真实值进行比较生成初选候选框,并对初选候选框分配分类和回归值,>需要如下四个特有api: + +* rpn_target_assign:通过anchor和真实框为anchor分配RPN网络的分类和回归目标值。API Reference 请参考 :ref:`api_fluid_layers_rpn_target_assign` + +* anchor_generator:为每个位置生成一系列anchor。API Reference 请参考 :ref:`api_fluid_layers_anchor_generator` + +* generate_proposal_labels: 通过generate_proposals得到的候选框和真实框得到RCNN部分的分类和回归的目标值。API Reference 请参考 :ref:`api_fluid_layers_generate_proposal_labels` + +* generate_proposals: 对RPN网络输出box解码并筛选得到新的候选框。API Reference 请参考 :ref:`api_fluid_layers_generate_proposals` + + +SSD +---------------- + +`SSD `_ 全称Single Shot MultiBox Detector,是目标检测领域较新且效果较好的检测算法之一,具有检测速度快且检测精度高的特点。与两阶段的检测方法不同,单阶段目标检测并不进行区域推荐,而是直接从特征图回归出目标的边界框和分类概率。SSD网络对六个尺度特>征图计算损失,进行预测,需要如下五种特有api: + +* Prior Box:根据不同参数为每个输入位置生成一系列候选框。API Reference 请参考 :ref:`api_fluid_layers_prior_box` + +* multi_box_head :得到不同prior box的位置和置信度。API Reference 请参考 :ref:`api_fluid_layers_multi_box_head` + +* detection_output:对prioir box解码,通过多分类NMS得到检测结果。API Reference 请参考 :ref:`api_fluid_layers_detection_output` + +* ssd_loss:通过位置偏移预测值,置信度,检测框位置和真实框位置和标签计算损失。API Reference 请参考 :ref:`api_fluid_layers_ssd_loss` + +* detection map: 利用mAP评估SSD网络模型。API Reference 请参考 :ref:`api_fluid_layers_detection_map` + +OCR +--------- + +场景文字识别是在图像背景复杂、分辨率低下、字体多样、分布随意等情况下,将图像信息转化为文字序列的过程,可认为是一种特别的翻译过程:将图像输入翻译为自然语言输出。OCR任务中需要对检测框进行不规则变换,其中需要如下两个api: + +* roi_perspective_transform:对输入roi做透视变换。API Reference 请参考 :ref:`api_fluid_layers_roi_perspective_transform` + +* polygon_box_transform:对不规则检测框进行坐标变换。API Reference 请参考 :ref:`api_fluid_layers_polygon_box_transform` + + diff --git a/doc/fluid/api/api_guides/low_level/layers/index.rst b/doc/fluid/api/api_guides/low_level/layers/index.rst new file mode 100644 index 
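下面以 :code:`iou_similarity` 为例,给出检测框通用操作的一个简单示意(输入形状与变量名均为演示用的假设):

.. code-block:: python

    import paddle.fluid as fluid

    # 每个检测框用 [xmin, ymin, xmax, ymax] 四个坐标表示
    boxes_a = fluid.layers.data(name='boxes_a', shape=[4], dtype='float32')
    boxes_b = fluid.layers.data(name='boxes_b', shape=[4], dtype='float32')

    # 计算两组检测框两两之间的 IoU
    iou = fluid.layers.iou_similarity(x=boxes_a, y=boxes_b)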
0000000000000000000000000000000000000000..a74e681aa49dced14a885d75891359edafaf43ba --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/layers/index.rst @@ -0,0 +1,17 @@ +============= +神经网络层 +============= + +.. toctree:: + :maxdepth: 1 + + conv.rst + pooling.rst + detection.rst + sequence.rst + math.rst + activations.rst + loss_function.rst + data_in_out.rst + control_flow.rst + diff --git a/doc/fluid/api/api_guides/low_level/layers/loss_function.rst b/doc/fluid/api/api_guides/low_level/layers/loss_function.rst new file mode 100644 index 0000000000000000000000000000000000000000..5802fc4b934183572279f2d5d0d260eda5346710 --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/layers/loss_function.rst @@ -0,0 +1,60 @@ +.. _api_guide_loss_function: + +####### +损失函数 +####### + +损失函数定义了拟合结果和真实结果之间的差异,作为优化的目标直接关系模型训练的好坏,很多研究工作的内容也集中在损失函数的设计优化上。 +Paddle Fluid 中提供了面向多种任务的多种类型的损失函数,以下列出了一些 Paddle Fluid 中包含的较为常用的损失函数。 + +回归 +==== + +平方误差损失(squared error loss)使用预测值和真实值之间误差的平方作为样本损失,是回归问题中最为基本的损失函数。 +API Reference 请参考 :ref:`api_fluid_layers_square_error_cost`。 + +平滑 L1 损失(smooth_l1 loss)是一种分段的损失函数,较平方误差损失其对异常点相对不敏感,因而更为鲁棒。 +API Reference 请参考 :ref:`api_fluid_layers_smooth_l1`。 + + +分类 +==== + +`交叉熵(cross entropy) `_ 是分类问题中使用最为广泛的损失函数,Paddle Fluid 中提供了接受归一化概率值和非归一化分值输入的两种交叉熵损失函数的接口,并支持 soft label 和 hard label 两种样本类别标签。 +API Reference 请参考 :ref:`api_fluid_layers_cross_entropy` 和 :ref:`api_fluid_layers_softmax_with_cross_entropy`。 + +多标签分类 +--------- +对于多标签分类问题,如一篇文章同属于政治、科技等多个类别的情况,需要将各类别作为独立的二分类问题计算损失,Paddle Fluid 中为此提供了 sigmoid_cross_entropy_with_logits 损失函数, +API Reference 请参考 :ref:`api_fluid_layers_sigmoid_cross_entropy_with_logits`。 + +大规模分类 +--------- +对于大规模分类问题,通常需要特殊的方法及相应的损失函数以加速训练,常用的方法有 `噪声对比估计(Noise-contrastive estimation,NCE) `_ 和 `层级 sigmoid `_ 。 + +* 噪声对比估计通过将多分类问题转化为学习分类器来判别数据来自真实分布和噪声分布的二分类问题,基于二分类来进行极大似然估计,避免在全类别空间计算归一化因子从而降低了计算复杂度。 +* 层级 sigmoid 通过二叉树进行层级的二分类来实现多分类,每个样本的损失对应了编码路径上各节点二分类交叉熵的和,避免了归一化因子的计算从而降低了计算复杂度。 +这两种方法对应的损失函数在 Paddle Fluid 中均有提供,API Reference 请参考 :ref:`api_fluid_layers_nce` 和 :ref:`api_fluid_layers_hsigmoid`。 + +序列分类 +------- +序列分类可以分为以下三种: + +* 序列分类(Sequence Classification)问题,整个序列对应一个预测标签,如文本分类。这种即是普通的分类问题,可以使用 cross entropy 作为损失函数。 +* 序列片段分类(Segment Classification)问题,序列中的各个片段对应有自己的类别标签,如命名实体识别。对于这种序列标注问题,`(线性链)条件随机场(Conditional Random Field,CRF) `_ 是一种常用的模型方法,其使用句子级别的似然概率,序列中不同位置的标签不再是条件独立,能够有效解决标记偏置问题。Paddle Fluid 中提供了 CRF 对应损失函数的支持,API Reference 请参考 :ref:`api_fluid_layers_linear_chain_crf`。 +* 时序分类(Temporal Classification)问题,需要对未分割的序列进行标注,如语音识别。对于这种时序分类问题,`CTC(Connectionist Temporal Classification) `_ 损失函数不需要对齐输入数据及标签,可以进行端到端的训练,Paddle Fluid 提供了 warpctc 的接口来计算相应的损失,API Reference 请参考 :ref:`api_fluid_layers_warpctc`。 + +排序 +==== + +`排序问题 `_ 可以使用 Pointwise、Pairwise 和 Listwise 的学习方法,不同的方法需要使用不同的损失函数: + +* Pointwise 的方法通过近似为回归问题解决排序问题,可以使用回归问题的损失函数。 +* Pairwise 的方法需要特殊设计的损失函数,其通过近似为分类问题解决排序问题,使用两篇文档与 query 的相关性得分以偏序作为二分类标签来计算损失。Paddle Fluid 中提供了两种常用的 Pairwise 方法的损失函数,API Reference 请参考 :ref:`api_fluid_layers_rank_loss` 和 :ref:`api_fluid_layers_margin_rank_loss`。 + +更多 +==== + +对于一些较为复杂的损失函数,可以尝试使用其他损失函数组合实现;Paddle Fluid 中提供的用于图像分割任务的 :ref:`api_fluid_layers_dice_loss` 即是使用其他 OP 组合(计算各像素位置似然概率的均值)而成;多目标损失函数也可看作这样的情况,如 Faster RCNN 就使用 cross entropy 和 smooth_l1 loss 的加权和作为损失函数。 + +**注意**,在定义损失函数之后为能够使用 :ref:`api_guide_optimizer` 进行优化,通常需要使用 :ref:`api_fluid_layers_mean` 或其他操作将损失函数返回的高维 Tensor 转换为 Scalar 值。 \ No newline at end of file diff --git a/doc/fluid/api/api_guides/low_level/layers/math.rst b/doc/fluid/api/api_guides/low_level/layers/math.rst new file mode 100644 
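结合上面关于损失函数的说明,下面给出"先计算逐样本损失、再归约为标量并交给优化器"的最小示意(网络结构与变量名均为演示用的假设):

.. code-block:: python

    import paddle.fluid as fluid

    img = fluid.layers.data(name='img', shape=[784], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    prediction = fluid.layers.fc(input=img, size=10, act='softmax')

    # cross_entropy 返回每个样本的损失,是一个高维 Tensor
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    # 先用 mean 归约为标量,才能作为优化目标
    avg_cost = fluid.layers.mean(cost)

    fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_cost)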
index 0000000000000000000000000000000000000000..6ae121ccd5540b3f9f7a9df5f9815de1889c06df --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/layers/math.rst @@ -0,0 +1,211 @@ +.. _api_guide_math: + + +数学操作 +######### + +Paddle提供了丰富的数学操作,以下列出的数学操作都是对目标张量进行逐元素的操作。其中,如果二元操作的两个输入有不同形状,会先进行 :code:`broadcast`. 部分数学操作还支持数学操作符,比如: :code:`+`, :code:`-`, :code:`*`, :code:`/` 等。数学操作符不仅支持张量,还支持标量。 + + +一元操作 +================== + +exp +------------------ + +对输入 :code:`Tensor` 逐元素做 :code:`exp` 操作。 + +API Reference 请参考 :ref:`api_fluid_layers_exp` + +tanh +------------------ + +对输入 :code:`Tensor` 逐元素取正切。 + +API Reference 请参考 :ref:`api_fluid_layers_tanh` + +sqrt +------------------ + +对输入 :code:`Tensor` 逐元素取平方根。 + +API Reference 请参考 :ref:`api_fluid_layers_sqrt` + +abs +------------------ + +对输入 :code:`Tensor` 逐元素取绝对值。 + +API Reference 请参考 :ref:`api_fluid_layers_abs` + +ceil +------------------ + +对输入 :code:`Tensor` 逐元素向上取整。 + +API Reference 请参考 :ref:`api_fluid_layers_ceil` + +floor +------------------ + +对输入 :code:`Tensor` 逐元素向下取整。 + +API Reference 请参考 :ref:`api_fluid_layers_floor` + +sin +------------------ + +对输入 :code:`Tensor` 逐元素取正玄。 + +API Reference 请参考 :ref:`api_fluid_layers_sin` + +cos +------------------ + +对输入 :code:`Tensor` 逐元素取余玄。 + +API Reference 请参考 :ref:`api_fluid_layers_cos` + +round +------------------ + +对输入 :code:`Tensor` 逐元素四舍五入取整。 + +API Reference 请参考 :ref:`api_fluid_layers_round` + +square +------------------ + +对输入 :code:`Tensor` 逐元素取平方。 + +API Reference 请参考 :ref:`api_fluid_layers_square` + +reciprocal +------------------ + +对输入 :code:`Tensor` 逐元素取倒数。 + +API Reference 请参考 :ref:`api_fluid_layers_reciprocal` + + +reduce +------------------ + +对输入 :code:`Tensor` 在指定的若干轴上做reduce操作,包括:min, max, sum, mean, product + +API Reference 请参考: +:ref:`api_fluid_layers_reduce_min` +:ref:`api_fluid_layers_reduce_max` +:ref:`api_fluid_layers_reduce_sum` +:ref:`api_fluid_layers_reduce_mean` +:ref:`api_fluid_layers_reduce_prod` + + +二元操作 +================== + +elementwise_add +------------------ + +对两个 :code:`Tensor` 逐元素相加,对应的数学操作符为 :code:`+` + +API Reference 请参考 :ref:`api_fluid_layers_elementwise_add` + +elementwise_sub +------------------ + +对两个 :code:`Tensor` 逐元素相减,对应数学操作符 :code:`-` + +API Reference 请参考 :ref:`api_fluid_layers_elementwise_sub` + +elementwise_mul +------------------ + +对两个 :code:`Tensor` 逐元素相乘, 对应数学操作符 :code:`*` + +API Reference 请参考 :ref:`api_fluid_layers_elementwise_mul` + +elementwise_div +------------------ + +对两个 :code:`Tensor` 逐元素相除, 对应数学操作符 :code:`/` 或 :code:`//` + +API Reference 请参考 :ref:`api_fluid_layers_elementwise_div` + + +elementwise_pow +------------------ + +对两个 :code:`Tensor` 逐元素做次幂操作, 对应数学操作符 :code:`**` + +API Reference 请参考 :ref:`api_fluid_layers_elementwise_pow` + +equal +------------------ + +对两个 :code:`Tensor` 逐元素判断是否相等, 对应数学操作符 :code:`==` + +API Reference 请参考 :ref:`api_fluid_layers_equal` + +not_equal +------------------ + +对两个 :code:`Tensor` 逐元素判断是否不等, 对应数学操作符 :code:`!=` + +API Reference 请参考 :ref:`api_fluid_layers_elementwise_not_equal` + +less_than +------------------ + +对两个 :code:`Tensor` 逐元素判断是否满足小于关系, 对应数学操作符 :code:`<` + +API Reference 请参考 :ref:`api_fluid_layers_less_than` + +less_equal +------------------ + +对两个 :code:`Tensor` 逐元素判断是否满足小于或等于关系, 对应数学操作符 :code:`<=` + +API Reference 请参考 :ref:`api_fluid_layers_less_equal` + +greater_than +------------------ + +对两个 :code:`Tensor` 逐元素判断是否满足大于关系, 对应数学操作符 :code:`>` + +API Reference 请参考 :ref:`api_fluid_layers_greater_than` + +greater_equal +------------------ + +对两个 :code:`Tensor` 逐元素判断是否满足大于或等于关系, 对应数学操作符 :code:`>=` 
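下面是一个逐元素操作与数学操作符混用的简单示意(张量名称仅为演示用的假设):

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[3], dtype='float32')
    y = fluid.layers.data(name='y', shape=[3], dtype='float32')

    # 逐元素相加,与数学操作符 x + y 等价
    add_out = fluid.layers.elementwise_add(x, y)
    sub_out = x - y

    # 逐元素比较,输出为 bool 型 Tensor
    cmp_out = fluid.layers.less_than(x, y)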
+ +API Reference 请参考 :ref:`api_fluid_layers_greater_equal` + +sum +------------------ + +对两个 :code:`Tensor` 逐元素相加。 + +API Reference 请参考 :ref:`api_fluid_layers_sum` + +min +------------------ + +对两个 :code:`Tensor` 逐元素进行 :code:`min(x, y)` 操作。 + +API Reference 请参考 :ref:`api_fluid_layers_min` + +max +------------------ + +对两个 :code:`Tensor` 逐元素进行 :code:`max(x, y)` 操作。 + +API Reference 请参考 :ref:`api_fluid_layers_max` + +matmul +------------------ + +对两个 :code:`Tensor` 进行矩阵乘操作。 + +API Reference 请参考 :ref:`api_fluid_layers_matmul` diff --git a/doc/fluid/api/api_guides/low_level/layers/pooling.rst b/doc/fluid/api/api_guides/low_level/layers/pooling.rst new file mode 100644 index 0000000000000000000000000000000000000000..de4a1bd82cc144b6237d4ec171576d93b1d8aaf9 --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/layers/pooling.rst @@ -0,0 +1,80 @@ +.. _api_guide_pool: + +##### +池化 +##### + +池化的作用是对输入特征做下采样和降低过拟合。降低过拟合是减小输出大小的结果,它同样也减少了后续层中的参数的数量。 + +池化通常只需要将前一层的特征图作为输入,此外需要一些参数来确定池化具体的操作。在PaddlePaddle中我们同样通过设定池化的大小,方式,步长,是否是全局池化,是否使用cudnn,是否使用ceil函数计算输出等参数来选择具体池化的方式。 +PaddlePaddle中有针对定长图像特征的二维(pool2d)、三维卷积(pool3d),RoI池化(roi_pool),以及针对序列的序列池化(sequence_pool),同时也有池化计算的反向过程,下面先介绍2D/3D池化,以及RoI池化,再来介绍序列池化。 + +-------------- + +1. pool2d/pool3d +------------------------ + +- ``input`` : 池化操作接收任何符合layout是:\ ``N(batch size)* C(channel size) * H(height) * W(width)``\ 格式的\ ``Tensor``\ 类型作为输入。 + +- ``pool_size``\ : 用来确定池化\ ``filter``\ 的大小,即将多大范围内的数据池化为一个值。 + +- ``num_channels``\ : 用来确定输入的\ ``channel``\ 数量,如果未设置参数或设置为\ ``None``\ ,其实际值将自动设置为输入的\ ``channel``\ 数量。 + +- ``pooling_type``\ : 接收\ ``avg``\ 和\ ``max``\ 2种类型之一作为pooling的方式,默认值为\ ``max``\ 。其中\ ``max``\ 意为最大池化,即计算池化\ ``filter``\ 区域内的数据的最大值作为输出;而\ ``avg``\ 意为平均池化,即计算池化\ ``filter``\ 区域内的数据的平均值作为输出。 + +- ``pool_stride``\ : 意为池化的\ ``filter``\ 在输入特征图上移动的步长。 + +- ``pool_padding``\ : 用来确定池化中\ ``padding``\ 的大小,\ ``padding``\ 的使用是为了对于特征图边缘的特征进行池化,选择不同的\ ``pool_padding``\ 大小确定了在特征图边缘增加多大区域的补零。从而决定边缘特征被池化的程度。 + +- ``global_pooling``\ : 意为是否使用全局池化,全局池化是指使用和特征图大小相同的\ ``filter``\ 来进行池化,同样这个过程也可以使用平均池化或者最大池化来做为池化的方式,全局池化通常会用来替换全连接层以大量减少参数防止过拟合。 + +- ``use_cudnn``\ : 选项可以来选择是否使用cudnn来优化计算池化速度。 + +- ``ceil_mode``\ : 是否使用ceil函数计算输出高度和宽度。\ ``ceil mode``\ 意为天花板模式,是指会把特征图中不足\ ``filter size``\ 的边给保留下来,单独另算,或者也可以理解为在原来的数据上补充了值为-NAN的边。而floor模式则是直接把不足\ ``filter size``\ 的边给舍弃了。具体计算公式如下: + + - 非\ ``ceil_mode``\ 下:\ ``输出大小 = (输入大小 - filter size + 2 * padding) / stride(步长) + 1`` + + - ``ceil_mode``\ 下:\ ``输出大小 = (输入大小 - filter size + 2 * padding + stride - 1) / stride + 1`` + + + +api汇总: + +- :ref:`api_fluid_layers_pool2d` +- :ref:`api_fluid_layers_pool3d` + + +2. roi_pool +------------------ + +``roi_pool``\ 一般用于检测网络中,将输入特征图依据候选框池化到特定的大小。 + +- ``rois``\ : 接收\ ``LoDTensor``\ 类型来表示需要池化的 Regions of Interest,关于RoI的解释请参考\ `论文 `__ + +- ``pooled_height`` 和 ``pooled_width``\ : 这里可以接受非正方的池化窗口大小 + +- ``spatial_scale``\ : 用作设定缩放RoI和原图缩放的比例,注意,这里的设定需要用户自行计算RoI和原图的实际缩放比例。 + + +api汇总: + +- :ref:`api_fluid_layers_roi_pool` + + +3. 
sequence_pool +-------------------- + +``sequence_pool``\ 是一个用作对于不等长序列进行池化的接口,它将每一个实例的全部时间步的特征进行池化,它同样支持 +``average``, ``sum``, ``sqrt`` 和\ ``max``\ 4种类型之一作为pooling的方式。 其中: + +- ``average``\ 是对于每一个时间步内的数据求和后分别取平均值做为池化的结果。 + +- ``sum``\ 则是对每一个时间步内的数据分别求和作为池化的结果。 + +- ``sqrt``\ 则是对每一个时间步内的数据分别求和再分别取平方根作为池化的结果。 + +- ``max``\ 则是对每一个时间步内的数据分别求取最大值作为池化的结果。 + +api汇总: + +- :ref:`api_fluid_layers_sequence_pool` \ No newline at end of file diff --git a/doc/fluid/api/api_guides/low_level/layers/sequence.rst b/doc/fluid/api/api_guides/low_level/layers/sequence.rst new file mode 100644 index 0000000000000000000000000000000000000000..455e8e9b51c4196a2adc450a0f14e87512381480 --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/layers/sequence.rst @@ -0,0 +1,112 @@ +.. _api_guide_sequence: + +######## +序列 +######## + +在深度学习领域许多问题涉及到对 `序列(sequence) `_ 的处理。 +从Wiki上的释义可知,序列可以表征多种物理意义,但在深度学习中,最常见的仍然是"时间序列"——一个序列包含多个时间步的信息。 + +在Paddle Fluid中,我们将序列表示为 :ref:`api_fluid_LoDTensor` 。 +因为一般进行神经网络计算时都是一个batch一个batch地计算,所以我们用一个LoDTensor来存储一个mini batch的序列。 +一个LoDTensor的第0维包含该mini batch中所有序列的所有时间步,并且用LoD来记录各个序列的长度,区分不同序列。 +而在运算时,还需要根据LoD信息将LoDTensor中一个mini batch的第0维拆开成多个序列。(具体请参考上述LoD相关的文档。) +所以,对这类LoDTensor第0维的操作不能简单地使用一般的layer来进行,针对这一维的操作必须要结合LoD的信息。 +(例如,你不能用 :code:`layers.reshape` 来对一个序列的第0维进行reshape)。 + +为了实行各类针对序列的操作,我们设计了一系列序列相关的API,专门用于正确处理序列相关的操作。 +实践中,由于一个LoDTensor包括一个mini batch的序列,同一个mini batch中不同的序列通常属于多个sample,它们彼此之间不会也不应该发生相互作用。 +因此,若一个layer以两个(或多个)LoDTensor为输入(或者以一个list的LoDTensor为输入),每一个LoDTensor代表一个mini batch的序列,则第一个LoDTensor中的第一个序列只会和第二个LoDTensor中的第一个序列发生计算, +第一个LoDTensor中的第二个序列只会和第二个LoDTensor中的第二个序列发生计算,第一个LoDTensor中的第i个序列只会和第二个LoDTensor中第i个序列发生计算,依此类推。 + +**总而言之,一个LoDTensor存储一个mini batch的多个序列,其中的序列个数为batch size;多个LoDTensor间发生计算时,每个LoDTensor中的第i个序列只会和其他LoDTensor中第i个序列发生计算。理解这一点对于理解接下来序列相关的操作会至关重要。** + +1. sequence_softmax +------------------- +这个layer以一个mini batch的序列为输入,在每个序列内做softmax操作。其输出为一个mini batch相同shape的序列,但在序列内是经softmax归一化过的。 +这个layer往往用于在每个sequence内做softmax归一化。 + +API Reference 请参考 :ref:`api_fluid_layers_sequence_softmax` + + +2. sequence_concat +------------------ +这个layer以一个list为输入,该list中可以含有多个LoDTensor,每个LoDTensor为一个mini batch的序列。 +该layer会将每个batch中第i个序列在时间维度上拼接成一个新序列,作为返回的batch中的第i个序列。 +理所当然地,list中每个LoDTensor的序列必须有相同的batch size。 + +API Reference 请参考 :ref:`api_fluid_layers_sequence_concat` + + +3. sequence_first_step +---------------------- +这个layer以一个LoDTensor作为输入,会取出每个序列中的第一个元素(即第一个时间步的元素),并作为返回值。 + +API Reference 请参考 :ref:`api_fluid_layers_sequence_first_step` + + +4. sequence_last_step +--------------------- +同 :code:`sequence_first_step` ,除了本layer是取每个序列中最后一个元素(即最后一个时间步)作为返回值。 + +API Reference 请参考 :ref:`api_fluid_layers_sequence_last_step` + + +5. sequence_expand +------------------ +这个layer有两个LoDTensor的序列作为输入,并按照第二个LoDTensor中序列的LoD信息来扩展第一个batch中的序列。 +通常用来将只有一个时间步的序列(例如 :code:`sequence_first_step` 的返回结果)延展成有多个时间步的序列,以此方便与有多个时间步的序列进行运算。 + +API Reference 请参考 :ref:`api_fluid_layers_sequence_expand` + + +6. sequence_expand_as +--------------------- +这个layer需要两个LoDTensor的序列作为输入,然后将第一个Tensor序列中的每一个序列延展成和第二个Tensor中对应序列等长的序列。 +不同于 :code:`sequence_expand` ,这个layer会将第一个LoDTensor中的序列严格延展为和第二个LoDTensor中的序列等长。 +如果无法延展成等长的(例如第二个batch中的序列长度不是第一个batch中序列长度的整数倍),则会报错。 + +API Reference 请参考 :ref:`api_fluid_layers_sequence_expand_as` + + +7. sequence_enumerate +--------------------- +这个layer需要一个LoDTensor的序列作为输入,同时需要指定一个 :code:`win_size` 的长度。这个layer将依次取所有序列中长度为 :code:`win_size` 的子序列,并组合成新的序列。 + +API Reference 请参考 :ref:`api_fluid_layers_sequence_enumerate` + + +8. 
sequence_reshape +------------------- +这个layer需要一个LoDTensor的序列作为输入,同时需要指定一个 :code:`new_dim` 作为新的序列的维度。 +该layer会将mini batch内每个序列reshape为new_dim给定的维度。注意,每个序列的长度会改变(因此LoD信息也会变),以适应新的形状。 + +API Reference 请参考 :ref:`api_fluid_layers_sequence_reshape` + + +9. sequence_scatter +------------------- +这个layer可以将一个序列的数据scatter到另一个tensor上。这个layer有三个input,一个要被scatter的目标tensor :code:`input`; +一个是序列的数据 :code:`update` ,一个是目标tensor的上坐标 :code:`index` 。Output为scatter后的tensor,形状和 :code:`input` 相同。 + +API Reference 请参考 :ref:`api_fluid_layers_sequence_scatter` + + +10. sequence_pad +---------------- +这个layer可以将不等长的序列补齐成等长序列。使用这个layer需要提供一个 :code:`PadValue` 和一个 :code:`padded_length`。 +前者是用来补齐序列的元素,可以是一个数也可以是一个tensor;后者是序列补齐的目标长度。 +这个layer会返回补齐后的序列,以及一个记录补齐前各个序列长度的tensor :code:`Length`。 + +API Reference 请参考 :ref:`api_fluid_layers_sequence_pad` + + +11. sequence_mask +----------------- +这个layer会根据 :code:`input` 生成一个mask,:code:`input` 是一个记录了每个序列长度的tensor。 +此外这个layer还需要一个参数 :code:`maxlen` 用于指定序列中最长的序列长度。 +通常这个layer用于生成一个mask,将被pad后的序列中pad的部分过滤掉。 +:code:`input` 的长度tensor通常可以直接用 :code:`sequence_pad` 返回的 :code:`Length`。 + +API Reference 请参考 :ref:`api_fluid_layers_sequence_mask` + diff --git a/doc/fluid/api/api_guides/low_level/metrics.rst b/doc/fluid/api/api_guides/low_level/metrics.rst new file mode 100644 index 0000000000000000000000000000000000000000..90531d61635a7f35dd2101e554bdbaaa0fe31a17 --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/metrics.rst @@ -0,0 +1,51 @@ +.. _api_guide_metrics: + + +评价指标 +######### +在神经网络训练过程中或者训练完成后,需要评价模型的训练效果。评价的方法一般是计算全体预测值和全体真值(label)之间的距离,不同类型的任务会使用不同的评价方法,或者综合使用多个评价方法。在具体的任务中,可以选用一种或者多种评价方法。下面对常用的评价方法按照任务类型做介绍。 + +分类任务评价 +------------------ +分类任务中最常用的是二分类,而多分类任务也可以转化为多个二分类任务的组合,二分类任务常用的评价指标有准确率、正确率、召回率、AUC和平均准确度。 + +- 准确率: :code:`Precision` ,用来衡量二分类中召回真值和召回值的比例。 + + API Reference 请参考 :ref:`api_fluid_metrics_Precision` + +- 正确率: :code:`Accuracy` ,用来衡量二分类中召回真值和总样本数的比例。需要注意的是,准确率和正确率的定义是不同的,可以类比于误差分析中的 :code:`Variance` 和 :code:`Bias` 。 + + API Reference 请参考 :ref:`api_fluid_metrics_Accuracy` + + +- 召回率: :code:`Recall` ,用来衡量二分类中召回值和总样本数的比例。准确率和召回率的选取相互制约,实际模型中需要进行权衡,可以参考文档 `Precision_and_recall `_ 。 + + API Reference 请参考 :ref:`api_fluid_metrics_Recall` + +- AUC: :code:`Area Under Curve`, 适用于二分类的分类模型评估,用来计算 `ROC曲线的累积面积 `_。:code:`Auc` 通过python计算实现,如果关注性能,可以使用 :code:`fluid.layers.auc` 代替。 + + API Reference 请参考 :ref:`api_fluid_metrics_Auc` + +- 平均准确度: :code:`Average Precision` ,常用在Faster R-CNN和SSD等物体检测任务中。在不同召回条件下,计算了准确率的平均值,具体可以参考文档 `Average-precision `_ 和 `SSD: Single Shot MultiBox Detector `_。 + + API Reference 请参考 :ref:`api_fluid_metrics_DetectionMAP` + + + +序列标注任务评价 +------------------ +序列标注任务中,token的分组称为语块(chunk),模型会同时将输入的token分组和分类,常用的评估方法是语块评估方法。 + +- 语块评估方法: :code:`ChunkEvaluator` ,接收 :code:`chunk_eval` 接口的输出,累积每一个minibatch的语块统计值,最后计算准确率、召回率和F1值。:code:`ChunkEvaluator` 支持IOB, IOE, IOBES和IO四种标注模式。可以参考文档 `Chunking with Support Vector Machines `_ 。 + + API Reference 请参考 :ref:`api_fluid_metrics_ChunkEvaluator` + + +生成任务评价 +------------------ +生成任务会依据输入直接产生输出。对应NLP任务中(比如语音识别),则生成新字符串。评估生成字符串和目标字符串之间距离的方法也有多种,比如多分类评估方法,而另外一种常用的方法叫做编辑距离。 + +- 编辑距离: :code:`EditDistance` ,用来衡量两个字符串的相似度。可以参考文档 `Edit_distance `_。 + + API Reference 请参考 :ref:`api_fluid_metrics_EditDistance` + diff --git a/doc/fluid/api/api_guides/low_level/model_save_reader.rst b/doc/fluid/api/api_guides/low_level/model_save_reader.rst new file mode 100644 index 0000000000000000000000000000000000000000..8edae985082970cc94c867bc7802e7714af90670 --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/model_save_reader.rst 
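以分类正确率为例,评价指标对象的典型用法如下(其中的数值仅为演示用的假设):

.. code-block:: python

    import paddle.fluid as fluid

    # 在多个 mini-batch 上累积正确率,最后取加权平均
    accuracy = fluid.metrics.Accuracy()
    accuracy.update(value=0.8, weight=128)   # 第一个 batch:正确率 0.8,样本数 128
    accuracy.update(value=0.9, weight=128)   # 第二个 batch
    print(accuracy.eval())                   # 输出累积后的平均正确率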
@@ -0,0 +1,59 @@ +.. _api_guide_model_save_reader: + +######### +模型保存与加载 +######### + +模型的保存与加载主要涉及到如下八个API: +:code:`fluid.io.save_vars`、:code:`fluid.io.save_params`、:code:`fluid.io.save_persistables`、:code:`fluid.io.save_inference_model`、:code:`fluid.io.load_vars`、:code:`fluid.io.load_params`、:code:`fluid.io.load_persistables` 和 :code:`fluid.io.load_inference_model`。 + +变量、持久性变量和参数 +==================== + +在 :code:`Paddle` 中,算子(:code:`Operator`)的每一个输入和输出都是一个变量(:code:`Variable`),而参数(:code:`Parameter`)是变量(:code:`Variable`)的子类。持久性变量(:code:`Persistables`)是一种在每次迭代结束后均不会被删除的变量。参数是一种持久性变量,其在每次迭代后都会被优化器(:ref:`api_guide_optimizer`)更新。训练神经网络本质上就是在更新参数。 + +模型保存API介绍 +==================== + +- :code:`fluid.io.save_vars`:通过执行器(:ref:`api_guide_executor`)保存变量到指定的目录中。保存变量的方式有两种: + + 1)通过接口中的 :code:`vars` 指定需要保存的变量列表。 + + 2)将一个已经存在的程序(:code:`Program`)赋值给接口中的 :code:`main_program`,然后这个程序中的所有变量都将被保存下来。 + + 第一种保存方式的优先级要高于第二种。 + + API Reference 请参考 :ref:`api_fluid_io_save_vars`。 + +- :code:`fluid.io.save_params`:通过接口中的 :code:`main_program` 指定好程序(:code:`Program`),该接口会将所指定程序中的全部参数(:code:`Parameter`)过滤出来,并将它们保存到 :code:`dirname` 指定的文件夹或 :code:`filename` 指定的文件中。 + + API Reference 请参考 :ref:`api_fluid_io_save_params`。 + +- :code:`fluid.io.save_persistables`:通过接口中的 :code:`main_program` 指定好程序(:code:`Program`),该接口会将所指定程序中的全部持久性变量(:code:`persistable==True`)过滤出来,并将它们保存到 :code:`dirname` 指定的文件夹或 :code:`filename` 指定的文件中。 + + API Reference 请参考 :ref:`api_fluid_io_save_persistables`。 + +- :code:`fluid.io.save_inference_model`:请参考 :ref:`api_guide_inference`。 + +模型加载API介绍 +==================== + +- :code:`fluid.io.load_vars`:通过执行器(:code:`Executor`)加载指定目录中的变量。加载变量的方式有两种: + + 1)通过接口中的 :code:`vars` 指定需要加载的变量列表。 + + 2)将一个已经存在的程序(:code:`Program`)赋值给接口中的 :code:`main_program`,然后这个程序中的所有变量都将被加载。 + + 第一种加载方式的优先级要高于第二种。 + + API Reference 请参考 :ref:`api_fluid_io_load_vars`。 + +- :code:`fluid.io.load_params`:该接口从 :code:`main_program` 指定的程序中过滤出全部参数(:code:`Parameter`),并试图从 :code:`dirname` 指定的文件夹或 :code:`filename` 指定的文件中加载这些参数。 + + API Reference 请参考 :ref:`api_fluid_io_load_params`。 + +- :code:`fluid.io.load_persistables`:该接口从 :code:`main_program` 指定的程序中过滤出全部持久性变量(:code:`persistable==True`),并试图从 :code:`dirname` 指定的文件夹或 :code:`filename` 指定的文件中加载这些持久性变量。 + + API Reference 请参考 :ref:`api_fluid_io_load_persistables`。 + +- :code:`fluid.io.load_inference_model`:请参考 :ref:`api_guide_inference`。 diff --git a/doc/fluid/api/api_guides/low_level/optimizer.rst b/doc/fluid/api/api_guides/low_level/optimizer.rst new file mode 100644 index 0000000000000000000000000000000000000000..e068bd4801081e129ca24a9208699f1f0d5bf927 --- /dev/null +++ b/doc/fluid/api/api_guides/low_level/optimizer.rst @@ -0,0 +1,92 @@ +.. _api_guide_optimizer: + +########### +优化器 +########### + +神经网络最终是一个 `最优化问题 `_ , +在经过 `前向计算和反向传播 `_ 后, +:code:`Optimizer` 使用反向传播梯度,优化神经网络中的参数。 + +1.SGD/SGDOptimizer +------------------ + +:code:`SGD` 是实现 `随机梯度下降 `_ 的一个 :code:`Optimizer` 子类,是 `梯度下降 `_ 大类中的一种方法。 +当需要训练大量样本的时候,往往选择 :code:`SGD` 来使损失函数更快的收敛。 + +API Reference 请参考 :ref:`api_fluid_optimizer_SGDOptimizer` + + +2.Momentum/MomentumOptimizer +---------------------------- + +:code:`Momentum` 优化器在 :code:`SGD` 基础上引入动量,减少了随机梯度下降过程中存在的噪声问题。 +用户在使用时可以将 :code:`ues_nesterov` 参数设置为False或True,分别对应传统 `Momentum(论文4.1节) +`_ 算法和 `Nesterov accelerated gradient(论文4.2节) +`_ 算法。 + +API Reference 请参考 :ref:`api_fluid_optimizer_MomentumOptimizer` + + +3. 
Adagrad/AdagradOptimizer +--------------------------- +`Adagrad `_ 优化器可以针对不同参数样本数不平均的问题,自适应地为各个参数分配不同的学习率。 + +API Reference 请参考 :ref:`api_fluid_optimizer_AdagradOptimizer` + + +4.RMSPropOptimizer +------------------ +`RMSProp优化器 `_ ,是一种自适应调整学习率的方法, +主要解决使用Adagrad后,模型训练中后期学习率急剧下降的问题。 + +API Reference 请参考 :ref:`api_fluid_optimizer_RMSPropOptimizer` + + + +5.Adam/AdamOptimizer +-------------------- +`Adam `_ 的优化器是一种自适应调整学习率的方法, +适用于大多非 `凸优化 `_ 、大数据集和高维空间的场景。在实际应用中,:code:`Adam` 是最为常用的一种优化方法。 + +API Reference 请参考 :ref:`api_fluid_optimizer_AdamOptimizer` + + + +6.Adamax/AdamaxOptimizer +------------------------ + +`Adamax `_ 是 :code:`Adam` 算法的一个变体,对学习率的上限提供了一个更简单的范围,使学习率的边界范围更简单。 + +API Reference 请参考 :ref:`api_fluid_optimizer_AdamaxOptimizer` + + + +7.DecayedAdagrad/ DecayedAdagradOptimizer +------------------------------------------- + +`DecayedAdagrad `_ 优化器,可以看做是引入了衰减速率的 :code:`Adagrad` 算法,解决使用Adagrad后,模型训练中后期学习率急剧下降的问题。 + +API Reference 请参考 :ref:`api_fluid_optimizer_DecayedAdagrad` + + + + +8. Ftrl/FtrlOptimizer +---------------------- + +`FtrlOptimizer `_ 优化器结合了 `FOBOS算法 `_ 的高精度与 `RDA算法 +`_ 的稀疏性,是目前效果非常好的一种 `Online Learning `_ 算法。 + +API Reference 请参考 :ref:`api_fluid_optimizer_FtrlOptimizer` + + + +9.ModelAverage +----------------- + +:code:`ModelAverage` 优化器,在训练中通过窗口来累计历史 parameter,在预测时使用取平均值后的paramet,整体提高预测的精度。 + +API Reference 请参考 :ref:`api_fluid_optimizer_ModelAverage` + + diff --git a/doc/fluid/api/average.rst b/doc/fluid/api/average.rst new file mode 100644 index 0000000000000000000000000000000000000000..496f5b29875443f0c44f50fcb3ca837f4e7bcd12 --- /dev/null +++ b/doc/fluid/api/average.rst @@ -0,0 +1,16 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +============= +fluid.average +============= + +.. _api_fluid_average_WeightedAverage: + +WeightedAverage +--------------- + +.. autoclass:: paddle.fluid.average.WeightedAverage + :members: + :noindex: + diff --git a/doc/fluid/api/backward.rst b/doc/fluid/api/backward.rst new file mode 100644 index 0000000000000000000000000000000000000000..0076394543c2f87e90fa1ea989d7b5cbf468a6f7 --- /dev/null +++ b/doc/fluid/api/backward.rst @@ -0,0 +1,15 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +============== +fluid.backward +============== + +.. _api_fluid_backward_append_backward: + +append_backward +--------------- + +.. autofunction:: paddle.fluid.backward.append_backward + :noindex: + diff --git a/source/api_reference/clip.rst b/doc/fluid/api/clip.rst similarity index 70% rename from source/api_reference/clip.rst rename to doc/fluid/api/clip.rst index 3ba096388fc87dda3096a9030fe5749e61112c06..8597d40ca4e6a80abb85a82c9748b91bb7d4bafb 100644 --- a/source/api_reference/clip.rst +++ b/doc/fluid/api/clip.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -==== -clip -==== +========== +fluid.clip +========== + +.. _api_fluid_clip_ErrorClipByValue: ErrorClipByValue ---------------- @@ -12,13 +14,17 @@ ErrorClipByValue :members: :noindex: -GradientClipByValue -------------------- +.. _api_fluid_clip_GradientClipByGlobalNorm: -.. autoclass:: paddle.fluid.clip.GradientClipByValue +GradientClipByGlobalNorm +------------------------ + +.. autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm :members: :noindex: +.. _api_fluid_clip_GradientClipByNorm: + GradientClipByNorm ------------------ @@ -26,22 +32,12 @@ GradientClipByNorm :members: :noindex: -GradientClipByGlobalNorm ------------------------- - -.. 
autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm - :members: - :noindex: - -append_gradient_clip_ops ------------------------- +.. _api_fluid_clip_GradientClipByValue: -.. autofunction:: paddle.fluid.clip.append_gradient_clip_ops - :noindex: - -error_clip_callback +GradientClipByValue ------------------- -.. autofunction:: paddle.fluid.clip.error_clip_callback +.. autoclass:: paddle.fluid.clip.GradientClipByValue + :members: :noindex: diff --git a/doc/fluid/api/data/data_reader.rst b/doc/fluid/api/data/data_reader.rst new file mode 100644 index 0000000000000000000000000000000000000000..1a35d0bbc8f9d751f49c7e1fc26feb1bcb3ae7f0 --- /dev/null +++ b/doc/fluid/api/data/data_reader.rst @@ -0,0 +1,72 @@ +===================== +Data Reader Interface +===================== + + +DataTypes +========= + +.. autofunction:: paddle.v2.data_type.dense_array + :noindex: + +.. autofunction:: paddle.v2.data_type.integer_value + :noindex: + +.. autofunction:: paddle.v2.data_type.integer_value_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_binary_vector + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_float_vector + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_non_value_slot + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_value_slot + :noindex: + +.. autoclass:: paddle.v2.data_type.InputType + :members: + :noindex: + +DataFeeder +========== + +.. automodule:: paddle.v2.data_feeder + :members: + :noindex: + +Reader +====== + +.. automodule:: paddle.reader + :members: + :noindex: + +.. automodule:: paddle.reader.creator + :members: + :noindex: + +minibatch +========= + +.. automodule:: paddle.v2.minibatch + :members: + :noindex: diff --git a/doc/fluid/api/data/dataset.rst b/doc/fluid/api/data/dataset.rst new file mode 100644 index 0000000000000000000000000000000000000000..e7c8be4452bf55e0967d750c2e624e8e316e9330 --- /dev/null +++ b/doc/fluid/api/data/dataset.rst @@ -0,0 +1,82 @@ +Dataset +======= + +.. automodule:: paddle.dataset + :members: + :noindex: + +mnist ++++++ + +.. automodule:: paddle.dataset.mnist + :members: + :noindex: + +cifar ++++++ + +.. automodule:: paddle.dataset.cifar + :members: + :noindex: + +conll05 ++++++++ + +.. automodule:: paddle.dataset.conll05 + :members: get_dict,get_embedding,test + :noindex: + +imdb +++++ + +.. automodule:: paddle.dataset.imdb + :members: + :noindex: + +imikolov +++++++++ + +.. automodule:: paddle.dataset.imikolov + :members: + :noindex: + +movielens ++++++++++ + +.. automodule:: paddle.dataset.movielens + :members: + :noindex: + +.. autoclass:: paddle.dataset.movielens.MovieInfo + :noindex: + +.. autoclass:: paddle.dataset.movielens.UserInfo + :noindex: + +sentiment ++++++++++ + +.. automodule:: paddle.dataset.sentiment + :members: + :noindex: + +uci_housing ++++++++++++ + +.. automodule:: paddle.dataset.uci_housing + :members: + :noindex: + +wmt14 ++++++ + +.. automodule:: paddle.dataset.wmt14 + :members: + :noindex: + +wmt16 ++++++ + +.. 
automodule:: paddle.dataset.wmt16 + :members: + :noindex: diff --git a/doc/fluid/api/data/image.rst b/doc/fluid/api/data/image.rst new file mode 100644 index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49 --- /dev/null +++ b/doc/fluid/api/data/image.rst @@ -0,0 +1,5 @@ +Image Interface +=============== + +.. automodule:: paddle.v2.image + :members: diff --git a/source/api_reference/data_feeder.rst b/doc/fluid/api/data_feeder.rst similarity index 67% rename from source/api_reference/data_feeder.rst rename to doc/fluid/api/data_feeder.rst index 3df5c0307ffed9d101da58b385840b115920e906..11d2890f5b3446e37c3ef31e5a17ebebe169dbc8 100644 --- a/source/api_reference/data_feeder.rst +++ b/doc/fluid/api/data_feeder.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -=========== -data_feeder -=========== +================= +fluid.data_feeder +================= + +.. _api_fluid_data_feeder_DataFeeder: DataFeeder ---------- diff --git a/source/api_reference/executor.rst b/doc/fluid/api/executor.rst similarity index 58% rename from source/api_reference/executor.rst rename to doc/fluid/api/executor.rst index f67a14c49f372e67d18ec8e6f87da01109376d22..eab798b3d6866dd140b87d468ed5e26ea883623a 100644 --- a/source/api_reference/executor.rst +++ b/doc/fluid/api/executor.rst @@ -1,9 +1,19 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -======== -executor -======== +============== +fluid.executor +============== + +.. _api_fluid_executor__switch_scope: + +_switch_scope +------------- + +.. autofunction:: paddle.fluid.executor._switch_scope + :noindex: + +.. _api_fluid_executor_Executor: Executor -------- @@ -12,27 +22,19 @@ Executor :members: :noindex: +.. _api_fluid_executor_global_scope: + global_scope ------------ .. autofunction:: paddle.fluid.executor.global_scope :noindex: +.. _api_fluid_executor_scope_guard: + scope_guard ----------- .. autofunction:: paddle.fluid.executor.scope_guard :noindex: -switch_scope ------------- - -.. autofunction:: paddle.fluid.executor.switch_scope - :noindex: - -fetch_var ---------- - -.. autofunction:: paddle.fluid.executor.fetch_var - :noindex: - diff --git a/doc/fluid/api/fluid.rst b/doc/fluid/api/fluid.rst new file mode 100644 index 0000000000000000000000000000000000000000..b6ed4bdff823c362ef47554f3657ab7bb33ae715 --- /dev/null +++ b/doc/fluid/api/fluid.rst @@ -0,0 +1,248 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +===== +fluid +===== + +.. _api_fluid__switch_scope: + +_switch_scope +------------- + +.. autofunction:: paddle.fluid._switch_scope + :noindex: + +.. _api_fluid_BuildStrategy: + +BuildStrategy +------------- + +.. autoclass:: paddle.fluid.BuildStrategy + :members: + :noindex: + +.. _api_fluid_CPUPlace: + +CPUPlace +-------- + +.. autoclass:: paddle.fluid.CPUPlace + :members: + :noindex: + +.. _api_fluid_create_lod_tensor: + +create_lod_tensor +----------------- + +.. autofunction:: paddle.fluid.create_lod_tensor + :noindex: + +.. _api_fluid_create_random_int_lodtensor: + +create_random_int_lodtensor +--------------------------- + +.. autofunction:: paddle.fluid.create_random_int_lodtensor + :noindex: + +.. _api_fluid_CUDAPinnedPlace: + +CUDAPinnedPlace +--------------- + +.. autoclass:: paddle.fluid.CUDAPinnedPlace + :members: + :noindex: + +.. _api_fluid_CUDAPlace: + +CUDAPlace +--------- + +.. autoclass:: paddle.fluid.CUDAPlace + :members: + :noindex: + +.. 
_api_fluid_DataFeeder: + +DataFeeder +---------- + +.. autoclass:: paddle.fluid.DataFeeder + :members: + :noindex: + +.. _api_fluid_default_main_program: + +default_main_program +-------------------- + +.. autofunction:: paddle.fluid.default_main_program + :noindex: + +.. _api_fluid_default_startup_program: + +default_startup_program +----------------------- + +.. autofunction:: paddle.fluid.default_startup_program + :noindex: + +.. _api_fluid_DistributeTranspiler: + +DistributeTranspiler +-------------------- + +.. autoclass:: paddle.fluid.DistributeTranspiler + :members: + :noindex: + +.. _api_fluid_DistributeTranspilerConfig: + +DistributeTranspilerConfig +-------------------------- + +.. autoclass:: paddle.fluid.DistributeTranspilerConfig + :members: + :noindex: + +.. _api_fluid_ExecutionStrategy: + +ExecutionStrategy +----------------- + +.. autoclass:: paddle.fluid.ExecutionStrategy + :members: + :noindex: + +.. _api_fluid_Executor: + +Executor +-------- + +.. autoclass:: paddle.fluid.Executor + :members: + :noindex: + +.. _api_fluid_global_scope: + +global_scope +------------ + +.. autofunction:: paddle.fluid.global_scope + :noindex: + +.. _api_fluid_LoDTensor: + +LoDTensor +--------- + +.. autoclass:: paddle.fluid.LoDTensor + :members: + :noindex: + +.. _api_fluid_LoDTensorArray: + +LoDTensorArray +-------------- + +.. autoclass:: paddle.fluid.LoDTensorArray + :members: + :noindex: + +.. _api_fluid_memory_optimize: + +memory_optimize +--------------- + +.. autofunction:: paddle.fluid.memory_optimize + :noindex: + +.. _api_fluid_name_scope: + +name_scope +---------- + +.. autofunction:: paddle.fluid.name_scope + :noindex: + +.. _api_fluid_ParallelExecutor: + +ParallelExecutor +---------------- + +.. autoclass:: paddle.fluid.ParallelExecutor + :members: + :noindex: + +.. _api_fluid_ParamAttr: + +ParamAttr +--------- + +.. autoclass:: paddle.fluid.ParamAttr + :members: + :noindex: + +.. _api_fluid_Program: + +Program +------- + +.. autoclass:: paddle.fluid.Program + :members: + :noindex: + +.. _api_fluid_program_guard: + +program_guard +------------- + +.. autofunction:: paddle.fluid.program_guard + :noindex: + +.. _api_fluid_release_memory: + +release_memory +-------------- + +.. autofunction:: paddle.fluid.release_memory + :noindex: + +.. _api_fluid_Scope: + +Scope +----- + +.. autoclass:: paddle.fluid.Scope + :members: + :noindex: + +.. _api_fluid_scope_guard: + +scope_guard +----------- + +.. autofunction:: paddle.fluid.scope_guard + :noindex: + +.. _api_fluid_Tensor: + +Tensor +------ + +.. autoclass:: paddle.fluid.Tensor + :members: + :noindex: + +.. _api_fluid_WeightNormParamAttr: + +WeightNormParamAttr +------------------- + +.. autoclass:: paddle.fluid.WeightNormParamAttr + :members: + :noindex: + diff --git a/doc/fluid/api/gen_doc.py b/doc/fluid/api/gen_doc.py new file mode 100644 index 0000000000000000000000000000000000000000..2b9979a9f1a628a7a72951ee4cb81003f62b8d62 --- /dev/null +++ b/doc/fluid/api/gen_doc.py @@ -0,0 +1,125 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import argparse +import sys +import types + +import paddle.fluid as fluid + + +def parse_arg(): + parser = argparse.ArgumentParser() + parser.add_argument('--submodules', nargs="*") + parser.add_argument( + 'module', type=str, help='Generate the documentation of which module') + return parser.parse_args() + + +class DocGenerator(object): + def __init__(self, module_name=None, stream=sys.stdout): + if module_name == "": + module_name = None + self.stream = stream + if module_name is None: + self.module_name = "fluid" + else: + self.module_name = "fluid." + module_name + if module_name is None: + self.module = fluid + else: + if not hasattr(fluid, module_name): + raise ValueError("Cannot find fluid.{0}".format(module_name)) + else: + self.module = getattr(fluid, module_name) + self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +''') + + self._print_header_(self.module_name, dot='=', is_title=True) + + def print_submodule(self, submodule_name): + submodule = getattr(self.module, submodule_name) + if submodule is None: + raise ValueError("Cannot find submodule {0}".format(submodule_name)) + self.print_section(submodule_name) + + for item in sorted(submodule.__all__,key=str.lower): + self.print_item(item) + + def print_current_module(self): + for item in sorted(self.module.__all__,key=str.lower): + self.print_item(item) + + def print_section(self, name): + self._print_header_(name, dot='=', is_title=False) + + def print_item(self, name): + item = getattr(self.module, name, None) + if item is None: + return + if isinstance(item, types.TypeType): + self.print_class(name) + elif isinstance(item, types.FunctionType): + self.print_method(name) + else: + pass + + def print_class(self, name): + self._print_ref_(name) + self._print_header_(name, dot='-', is_title=False) + self.stream.write('''.. autoclass:: paddle.{0}.{1} + :members: + :noindex: + +'''.format(self.module_name, name)) + + def print_method(self, name): + self._print_ref_(name) + self._print_header_(name, dot='-', is_title=False) + self.stream.write('''.. autofunction:: paddle.{0}.{1} + :noindex: + +'''.format(self.module_name, name)) + + def _print_header_(self, name, dot, is_title): + dot_line = dot * len(name) + if is_title: + self.stream.write(dot_line) + self.stream.write('\n') + self.stream.write(name) + self.stream.write('\n') + self.stream.write(dot_line) + self.stream.write('\n') + self.stream.write('\n') + + def _print_ref_(self, name): + self.stream.write(".. 
_api_{0}_{1}:\n\n".format("_".join( + self.module_name.split(".")), name)) + + +def main(): + args = parse_arg() + gen = DocGenerator(args.module) + if args.submodules is None: + gen.print_current_module() + else: + for submodule_name in args.submodules: + gen.print_submodule(submodule_name) + + +if __name__ == '__main__': + main() diff --git a/source/api_reference/gen_doc.sh b/doc/fluid/api/gen_doc.sh similarity index 51% rename from source/api_reference/gen_doc.sh rename to doc/fluid/api/gen_doc.sh index 85315fce40f9b6677160dc1c617a700cae151cbe..b5d2169d459141ea9eb715c6ec692f481a34fa43 100755 --- a/source/api_reference/gen_doc.sh +++ b/doc/fluid/api/gen_doc.sh @@ -1,7 +1,9 @@ #!/bin/bash -python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection > layers.rst +python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op > layers.rst -for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer +for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler backward average profiler do python gen_doc.py ${module} > ${module}.rst done + +python gen_doc.py "" > fluid.rst diff --git a/doc/fluid/api/gen_index.py b/doc/fluid/api/gen_index.py new file mode 100644 index 0000000000000000000000000000000000000000..e6b86270964aec33bf1c546a52afeee1d81a4dc1 --- /dev/null +++ b/doc/fluid/api/gen_index.py @@ -0,0 +1,21 @@ +import os.path, time +import exceptions +import glob +import os +if __name__ == '__main__': + + file_object = open('index_en.rst', 'w') + file_object.write('''============= +API Reference +============= + +.. toctree:: + :maxdepth: 1 + +''') + file_object.write(' api_guides/index.rst'+'\n') + file_object.write(' fluid.rst'+'\n') + for file_name in sorted(glob.glob("*.rst")): + if file_name != ('index_en.rst' and 'fluid.rst'): + file_object.write(' '+file_name + "\n") + file_object.close( ) diff --git a/source/api_reference/index.rst b/doc/fluid/api/index_en.rst similarity index 75% rename from source/api_reference/index.rst rename to doc/fluid/api/index_en.rst index 691d43074ee902ed4aef0fef2f974873f1aa209d..46b8fa067c8c4724565c61378e4fb789eab48a06 100644 --- a/source/api_reference/index.rst +++ b/doc/fluid/api/index_en.rst @@ -5,16 +5,23 @@ API Reference .. toctree:: :maxdepth: 1 - layers.rst + api_guides/index.rst + + fluid.rst + average.rst + backward.rst + clip.rst + data.rst data_feeder.rst executor.rst initializer.rst + io.rst + layers.rst metrics.rst nets.rst - clip.rst optimizer.rst param_attr.rst profiler.rst regularizer.rst - io.rst - data.rst + transpiler.rst + diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst new file mode 100644 index 0000000000000000000000000000000000000000..ce2036675581b81b8dff6492ed2046e6f656f644 --- /dev/null +++ b/doc/fluid/api/initializer.rst @@ -0,0 +1,149 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +================= +fluid.initializer +================= + +.. _api_fluid_initializer_Bilinear: + +Bilinear +-------- + +.. autoclass:: paddle.fluid.initializer.Bilinear + :members: + :noindex: + +.. _api_fluid_initializer_BilinearInitializer: + +BilinearInitializer +------------------- + +.. autoclass:: paddle.fluid.initializer.BilinearInitializer + :members: + :noindex: + +.. _api_fluid_initializer_Constant: + +Constant +-------- + +.. 
autoclass:: paddle.fluid.initializer.Constant + :members: + :noindex: + +.. _api_fluid_initializer_ConstantInitializer: + +ConstantInitializer +------------------- + +.. autoclass:: paddle.fluid.initializer.ConstantInitializer + :members: + :noindex: + +.. _api_fluid_initializer_force_init_on_cpu: + +force_init_on_cpu +----------------- + +.. autofunction:: paddle.fluid.initializer.force_init_on_cpu + :noindex: + +.. _api_fluid_initializer_init_on_cpu: + +init_on_cpu +----------- + +.. autofunction:: paddle.fluid.initializer.init_on_cpu + :noindex: + +.. _api_fluid_initializer_MSRA: + +MSRA +---- + +.. autoclass:: paddle.fluid.initializer.MSRA + :members: + :noindex: + +.. _api_fluid_initializer_MSRAInitializer: + +MSRAInitializer +--------------- + +.. autoclass:: paddle.fluid.initializer.MSRAInitializer + :members: + :noindex: + +.. _api_fluid_initializer_Normal: + +Normal +------ + +.. autoclass:: paddle.fluid.initializer.Normal + :members: + :noindex: + +.. _api_fluid_initializer_NormalInitializer: + +NormalInitializer +----------------- + +.. autoclass:: paddle.fluid.initializer.NormalInitializer + :members: + :noindex: + +.. _api_fluid_initializer_TruncatedNormal: + +TruncatedNormal +--------------- + +.. autoclass:: paddle.fluid.initializer.TruncatedNormal + :members: + :noindex: + +.. _api_fluid_initializer_TruncatedNormalInitializer: + +TruncatedNormalInitializer +-------------------------- + +.. autoclass:: paddle.fluid.initializer.TruncatedNormalInitializer + :members: + :noindex: + +.. _api_fluid_initializer_Uniform: + +Uniform +------- + +.. autoclass:: paddle.fluid.initializer.Uniform + :members: + :noindex: + +.. _api_fluid_initializer_UniformInitializer: + +UniformInitializer +------------------ + +.. autoclass:: paddle.fluid.initializer.UniformInitializer + :members: + :noindex: + +.. _api_fluid_initializer_Xavier: + +Xavier +------ + +.. autoclass:: paddle.fluid.initializer.Xavier + :members: + :noindex: + +.. _api_fluid_initializer_XavierInitializer: + +XavierInitializer +----------------- + +.. autoclass:: paddle.fluid.initializer.XavierInitializer + :members: + :noindex: + diff --git a/source/api_reference/io.rst b/doc/fluid/api/io.rst similarity index 50% rename from source/api_reference/io.rst rename to doc/fluid/api/io.rst index 21334c9edaada4398ec53455e31625d29f67dc54..b3dc7f83ca7c604dbf87b06b756e66e172f0463f 100644 --- a/source/api_reference/io.rst +++ b/doc/fluid/api/io.rst @@ -1,33 +1,19 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -== -io -== +======== +fluid.io +======== -save_vars ---------- - -.. autofunction:: paddle.fluid.io.save_vars - :noindex: - -save_params ------------ - -.. autofunction:: paddle.fluid.io.save_params - :noindex: +.. _api_fluid_io_load_inference_model: -save_persistables ------------------ +load_inference_model +-------------------- -.. autofunction:: paddle.fluid.io.save_persistables +.. autofunction:: paddle.fluid.io.load_inference_model :noindex: -load_vars ---------- - -.. autofunction:: paddle.fluid.io.load_vars - :noindex: +.. _api_fluid_io_load_params: load_params ----------- @@ -35,63 +21,51 @@ load_params .. autofunction:: paddle.fluid.io.load_params :noindex: +.. _api_fluid_io_load_persistables: + load_persistables ----------------- .. autofunction:: paddle.fluid.io.load_persistables :noindex: -save_inference_model --------------------- +.. _api_fluid_io_load_vars: -.. 
autofunction:: paddle.fluid.io.save_inference_model - :noindex: - -load_inference_model --------------------- +load_vars +--------- -.. autofunction:: paddle.fluid.io.load_inference_model +.. autofunction:: paddle.fluid.io.load_vars :noindex: -get_inference_program ---------------------- +.. _api_fluid_io_save_inference_model: -.. autofunction:: paddle.fluid.io.get_inference_program - :noindex: - -save_checkpoint ---------------- +save_inference_model +-------------------- -.. autofunction:: paddle.fluid.io.save_checkpoint +.. autofunction:: paddle.fluid.io.save_inference_model :noindex: -load_checkpoint ---------------- - -.. autofunction:: paddle.fluid.io.load_checkpoint - :noindex: +.. _api_fluid_io_save_params: -clean_checkpoint ----------------- +save_params +----------- -.. autofunction:: paddle.fluid.io.clean_checkpoint +.. autofunction:: paddle.fluid.io.save_params :noindex: -load_persist_vars_without_grad ------------------------------- +.. _api_fluid_io_save_persistables: -.. autofunction:: paddle.fluid.io.load_persist_vars_without_grad - :noindex: - -save_persist_vars_without_grad ------------------------------- +save_persistables +----------------- -.. autofunction:: paddle.fluid.io.save_persist_vars_without_grad +.. autofunction:: paddle.fluid.io.save_persistables :noindex: -get_latest_checkpoint_serial ----------------------------- +.. _api_fluid_io_save_vars: + +save_vars +--------- -.. autofunction:: paddle.fluid.io.get_latest_checkpoint_serial +.. autofunction:: paddle.fluid.io.save_vars :noindex: diff --git a/source/api_reference/layers.rst b/doc/fluid/api/layers.rst similarity index 61% rename from source/api_reference/layers.rst rename to doc/fluid/api/layers.rst index 1d6108244e91352063c2626c501f2171c0aa1465..6aa5a4e24b35b412ab3a33e762b9be88c6a9ba65 100644 --- a/source/api_reference/layers.rst +++ b/doc/fluid/api/layers.rst @@ -1,90 +1,72 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -====== -layers -====== +============ +fluid.layers +============ control_flow ============ -split_lod_tensor ----------------- - -.. autofunction:: paddle.fluid.layers.split_lod_tensor - :noindex: +.. _api_fluid_layers_array_length: -merge_lod_tensor ----------------- +array_length +------------ -.. autofunction:: paddle.fluid.layers.merge_lod_tensor +.. autofunction:: paddle.fluid.layers.array_length :noindex: -BlockGuard +.. _api_fluid_layers_array_read: + +array_read ---------- -.. autoclass:: paddle.fluid.layers.BlockGuard - :members: +.. autofunction:: paddle.fluid.layers.array_read :noindex: -BlockGuardWithCompletion ------------------------- +.. _api_fluid_layers_array_write: -.. autoclass:: paddle.fluid.layers.BlockGuardWithCompletion - :members: +array_write +----------- + +.. autofunction:: paddle.fluid.layers.array_write :noindex: -StaticRNNMemoryLink -------------------- +.. _api_fluid_layers_create_array: -.. autoclass:: paddle.fluid.layers.StaticRNNMemoryLink - :members: +create_array +------------ + +.. autofunction:: paddle.fluid.layers.create_array :noindex: -WhileGuard +.. _api_fluid_layers_DynamicRNN: + +DynamicRNN ---------- -.. autoclass:: paddle.fluid.layers.WhileGuard +.. autoclass:: paddle.fluid.layers.DynamicRNN :members: :noindex: -While +.. _api_fluid_layers_equal: + +equal ----- -.. autoclass:: paddle.fluid.layers.While - :members: +.. autofunction:: paddle.fluid.layers.equal :noindex: -Switch +.. _api_fluid_layers_IfElse: + +IfElse ------ -.. autoclass:: paddle.fluid.layers.Switch +.. 
autoclass:: paddle.fluid.layers.IfElse :members: :noindex: -lod_rank_table --------------- - -.. autofunction:: paddle.fluid.layers.lod_rank_table - :noindex: - -max_sequence_len ----------------- - -.. autofunction:: paddle.fluid.layers.max_sequence_len - :noindex: - -lod_tensor_to_array -------------------- - -.. autofunction:: paddle.fluid.layers.lod_tensor_to_array - :noindex: - -array_to_lod_tensor -------------------- - -.. autofunction:: paddle.fluid.layers.array_to_lod_tensor - :noindex: +.. _api_fluid_layers_increment: increment --------- @@ -92,244 +74,252 @@ increment .. autofunction:: paddle.fluid.layers.increment :noindex: -array_write ------------ - -.. autofunction:: paddle.fluid.layers.array_write - :noindex: +.. _api_fluid_layers_is_empty: -create_array ------------- +is_empty +-------- -.. autofunction:: paddle.fluid.layers.create_array +.. autofunction:: paddle.fluid.layers.is_empty :noindex: +.. _api_fluid_layers_less_than: + less_than --------- .. autofunction:: paddle.fluid.layers.less_than :noindex: -equal +.. _api_fluid_layers_Print: + +Print ----- -.. autofunction:: paddle.fluid.layers.equal +.. autofunction:: paddle.fluid.layers.Print :noindex: -array_read ----------- - -.. autofunction:: paddle.fluid.layers.array_read - :noindex: +.. _api_fluid_layers_reorder_lod_tensor_by_rank: -shrink_memory -------------- +reorder_lod_tensor_by_rank +-------------------------- -.. autofunction:: paddle.fluid.layers.shrink_memory +.. autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank :noindex: -array_length ------------- - -.. autofunction:: paddle.fluid.layers.array_length - :noindex: +.. _api_fluid_layers_StaticRNN: -IfElse ------- +StaticRNN +--------- -.. autoclass:: paddle.fluid.layers.IfElse +.. autoclass:: paddle.fluid.layers.StaticRNN :members: :noindex: -DynamicRNN ----------- - -.. autoclass:: paddle.fluid.layers.DynamicRNN - :members: - :noindex: +.. _api_fluid_layers_Switch: -ConditionalBlock ----------------- +Switch +------ -.. autoclass:: paddle.fluid.layers.ConditionalBlock +.. autoclass:: paddle.fluid.layers.Switch :members: :noindex: -StaticRNN ---------- +.. _api_fluid_layers_While: -.. autoclass:: paddle.fluid.layers.StaticRNN +While +----- + +.. autoclass:: paddle.fluid.layers.While :members: :noindex: -reorder_lod_tensor_by_rank --------------------------- - -.. autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank - :noindex: +device +====== -ParallelDo ----------- +io +== -.. autoclass:: paddle.fluid.layers.ParallelDo - :members: - :noindex: +.. _api_fluid_layers_batch: -Print +batch ----- -.. autofunction:: paddle.fluid.layers.Print +.. autofunction:: paddle.fluid.layers.batch :noindex: -is_empty --------- +.. _api_fluid_layers_data: -.. autofunction:: paddle.fluid.layers.is_empty +data +---- + +.. autofunction:: paddle.fluid.layers.data :noindex: -device -====== +.. _api_fluid_layers_double_buffer: -get_places ----------- +double_buffer +------------- -.. autofunction:: paddle.fluid.layers.get_places +.. autofunction:: paddle.fluid.layers.double_buffer :noindex: -io -== +.. _api_fluid_layers_load: -data +load ---- -.. autofunction:: paddle.fluid.layers.data +.. autofunction:: paddle.fluid.layers.load :noindex: -BlockGuardServ --------------- +.. _api_fluid_layers_open_files: -.. autoclass:: paddle.fluid.layers.BlockGuardServ - :members: +open_files +---------- + +.. autofunction:: paddle.fluid.layers.open_files :noindex: -ListenAndServ -------------- +.. _api_fluid_layers_Preprocessor: -.. 
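The control_flow entries above (``While``, ``Switch``, ``IfElse``, ``increment``, ``less_than``) are used imperatively while the program is being built. A sketch of the usual ``While`` pattern, assuming ``paddle.fluid``; the loop bound of 10 is arbitrary.

.. code-block:: python

    import paddle.fluid as fluid

    # Loop counter and bound; While conditions are typically kept on CPU.
    i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0, force_cpu=True)
    limit = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10, force_cpu=True)

    cond = fluid.layers.less_than(x=i, y=limit)
    while_op = fluid.layers.While(cond=cond)
    with while_op.block():
        # ... loop body ...
        fluid.layers.increment(x=i, value=1, in_place=True)
        # Re-evaluate the condition into the same variable.
        fluid.layers.less_than(x=i, y=limit, cond=cond)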
autoclass:: paddle.fluid.layers.ListenAndServ +Preprocessor +------------ + +.. autoclass:: paddle.fluid.layers.Preprocessor :members: :noindex: -Send ----- - -.. autofunction:: paddle.fluid.layers.Send - :noindex: +.. _api_fluid_layers_py_reader: -open_recordio_file ------------------- +py_reader +--------- -.. autofunction:: paddle.fluid.layers.open_recordio_file +.. autofunction:: paddle.fluid.layers.py_reader :noindex: -open_files ----------- +.. _api_fluid_layers_random_data_generator: -.. autofunction:: paddle.fluid.layers.open_files +random_data_generator +--------------------- + +.. autofunction:: paddle.fluid.layers.random_data_generator :noindex: +.. _api_fluid_layers_read_file: + read_file --------- .. autofunction:: paddle.fluid.layers.read_file :noindex: +.. _api_fluid_layers_shuffle: + shuffle ------- .. autofunction:: paddle.fluid.layers.shuffle :noindex: -batch ------ +nn +== -.. autofunction:: paddle.fluid.layers.batch +.. _api_fluid_layers_autoincreased_step_counter: + +autoincreased_step_counter +-------------------------- + +.. autofunction:: paddle.fluid.layers.autoincreased_step_counter :noindex: -double_buffer -------------- +.. _api_fluid_layers_batch_norm: -.. autofunction:: paddle.fluid.layers.double_buffer +batch_norm +---------- + +.. autofunction:: paddle.fluid.layers.batch_norm :noindex: -random_data_generator ---------------------- +.. _api_fluid_layers_beam_search: -.. autofunction:: paddle.fluid.layers.random_data_generator +beam_search +----------- + +.. autofunction:: paddle.fluid.layers.beam_search :noindex: -Preprocessor ------------- +.. _api_fluid_layers_beam_search_decode: -.. autoclass:: paddle.fluid.layers.Preprocessor - :members: +beam_search_decode +------------------ + +.. autofunction:: paddle.fluid.layers.beam_search_decode :noindex: -load ----- +.. _api_fluid_layers_brelu: -.. autofunction:: paddle.fluid.layers.load +brelu +----- + +.. autofunction:: paddle.fluid.layers.brelu :noindex: -nn -== +.. _api_fluid_layers_chunk_eval: -fc --- +chunk_eval +---------- -.. autofunction:: paddle.fluid.layers.fc +.. autofunction:: paddle.fluid.layers.chunk_eval :noindex: -embedding ---------- +.. _api_fluid_layers_clip: -.. autofunction:: paddle.fluid.layers.embedding +clip +---- + +.. autofunction:: paddle.fluid.layers.clip :noindex: -dynamic_lstm +.. _api_fluid_layers_clip_by_norm: + +clip_by_norm ------------ -.. autofunction:: paddle.fluid.layers.dynamic_lstm +.. autofunction:: paddle.fluid.layers.clip_by_norm :noindex: -dynamic_lstmp -------------- +.. _api_fluid_layers_conv2d: -.. autofunction:: paddle.fluid.layers.dynamic_lstmp +conv2d +------ + +.. autofunction:: paddle.fluid.layers.conv2d :noindex: -dynamic_gru ------------ +.. _api_fluid_layers_conv2d_transpose: -.. autofunction:: paddle.fluid.layers.dynamic_gru +conv2d_transpose +---------------- + +.. autofunction:: paddle.fluid.layers.conv2d_transpose :noindex: -gru_unit --------- +.. _api_fluid_layers_conv3d: -.. autofunction:: paddle.fluid.layers.gru_unit +conv3d +------ + +.. autofunction:: paddle.fluid.layers.conv3d :noindex: -linear_chain_crf +.. _api_fluid_layers_conv3d_transpose: + +conv3d_transpose ---------------- -.. autofunction:: paddle.fluid.layers.linear_chain_crf +.. autofunction:: paddle.fluid.layers.conv3d_transpose :noindex: -crf_decoding ------------- - -.. autofunction:: paddle.fluid.layers.crf_decoding - :noindex: +.. _api_fluid_layers_cos_sim: cos_sim ------- @@ -337,554 +327,890 @@ cos_sim .. autofunction:: paddle.fluid.layers.cos_sim :noindex: +.. 
_api_fluid_layers_crf_decoding: + +crf_decoding +------------ + +.. autofunction:: paddle.fluid.layers.crf_decoding + :noindex: + +.. _api_fluid_layers_crop: + +crop +---- + +.. autofunction:: paddle.fluid.layers.crop + :noindex: + +.. _api_fluid_layers_cross_entropy: + cross_entropy ------------- .. autofunction:: paddle.fluid.layers.cross_entropy :noindex: -square_error_cost ------------------ +.. _api_fluid_layers_ctc_greedy_decoder: -.. autofunction:: paddle.fluid.layers.square_error_cost +ctc_greedy_decoder +------------------ + +.. autofunction:: paddle.fluid.layers.ctc_greedy_decoder :noindex: -chunk_eval ----------- +.. _api_fluid_layers_dice_loss: -.. autofunction:: paddle.fluid.layers.chunk_eval +dice_loss +--------- + +.. autofunction:: paddle.fluid.layers.dice_loss :noindex: -sequence_conv -------------- +.. _api_fluid_layers_dropout: -.. autofunction:: paddle.fluid.layers.sequence_conv +dropout +------- + +.. autofunction:: paddle.fluid.layers.dropout :noindex: -conv2d ------- +.. _api_fluid_layers_dynamic_gru: -.. autofunction:: paddle.fluid.layers.conv2d +dynamic_gru +----------- + +.. autofunction:: paddle.fluid.layers.dynamic_gru :noindex: -sequence_pool +.. _api_fluid_layers_dynamic_lstm: + +dynamic_lstm +------------ + +.. autofunction:: paddle.fluid.layers.dynamic_lstm + :noindex: + +.. _api_fluid_layers_dynamic_lstmp: + +dynamic_lstmp ------------- -.. autofunction:: paddle.fluid.layers.sequence_pool +.. autofunction:: paddle.fluid.layers.dynamic_lstmp :noindex: -sequence_softmax ----------------- +.. _api_fluid_layers_edit_distance: -.. autofunction:: paddle.fluid.layers.sequence_softmax +edit_distance +------------- + +.. autofunction:: paddle.fluid.layers.edit_distance :noindex: -softmax -------- +.. _api_fluid_layers_elementwise_add: -.. autofunction:: paddle.fluid.layers.softmax +elementwise_add +--------------- + +.. autofunction:: paddle.fluid.layers.elementwise_add :noindex: -pool2d ------- +.. _api_fluid_layers_elementwise_div: -.. autofunction:: paddle.fluid.layers.pool2d +elementwise_div +--------------- + +.. autofunction:: paddle.fluid.layers.elementwise_div :noindex: -batch_norm ----------- +.. _api_fluid_layers_elementwise_max: -.. autofunction:: paddle.fluid.layers.batch_norm +elementwise_max +--------------- + +.. autofunction:: paddle.fluid.layers.elementwise_max :noindex: -beam_search_decode ------------------- +.. _api_fluid_layers_elementwise_min: -.. autofunction:: paddle.fluid.layers.beam_search_decode +elementwise_min +--------------- + +.. autofunction:: paddle.fluid.layers.elementwise_min :noindex: -conv2d_transpose ----------------- +.. _api_fluid_layers_elementwise_mul: -.. autofunction:: paddle.fluid.layers.conv2d_transpose +elementwise_mul +--------------- + +.. autofunction:: paddle.fluid.layers.elementwise_mul :noindex: -sequence_expand +.. _api_fluid_layers_elementwise_pow: + +elementwise_pow --------------- -.. autofunction:: paddle.fluid.layers.sequence_expand +.. autofunction:: paddle.fluid.layers.elementwise_pow :noindex: -lstm_unit +.. _api_fluid_layers_elementwise_sub: + +elementwise_sub +--------------- + +.. autofunction:: paddle.fluid.layers.elementwise_sub + :noindex: + +.. _api_fluid_layers_elu: + +elu +--- + +.. autofunction:: paddle.fluid.layers.elu + :noindex: + +.. _api_fluid_layers_embedding: + +embedding --------- -.. autofunction:: paddle.fluid.layers.lstm_unit +.. autofunction:: paddle.fluid.layers.embedding :noindex: -reduce_sum ----------- +.. _api_fluid_layers_expand: -.. 
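Most of the ``nn`` entries in this section compose declaratively into a network. For orientation, a small convolutional classifier built from layers indexed here (``conv2d``, ``pool2d``, ``fc``, ``cross_entropy``, ``mean``); a sketch assuming ``paddle.fluid``, with MNIST-like shapes chosen only for illustration.

.. code-block:: python

    import paddle.fluid as fluid

    # Single-channel 28x28 images with integer class labels.
    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    conv = fluid.layers.conv2d(input=img, num_filters=20, filter_size=5, act='relu')
    pool = fluid.layers.pool2d(input=conv, pool_size=2, pool_stride=2)
    predict = fluid.layers.fc(input=pool, size=10, act='softmax')

    loss = fluid.layers.cross_entropy(input=predict, label=label)
    avg_loss = fluid.layers.mean(loss)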
autofunction:: paddle.fluid.layers.reduce_sum +expand +------ + +.. autofunction:: paddle.fluid.layers.expand :noindex: -reduce_mean ------------ +.. _api_fluid_layers_fc: -.. autofunction:: paddle.fluid.layers.reduce_mean +fc +-- + +.. autofunction:: paddle.fluid.layers.fc :noindex: -reduce_max ----------- +.. _api_fluid_layers_flatten: -.. autofunction:: paddle.fluid.layers.reduce_max +flatten +------- + +.. autofunction:: paddle.fluid.layers.flatten :noindex: -reduce_min ----------- +.. _api_fluid_layers_gather: -.. autofunction:: paddle.fluid.layers.reduce_min +gather +------ + +.. autofunction:: paddle.fluid.layers.gather :noindex: -reduce_prod ------------ +.. _api_fluid_layers_gaussian_random: -.. autofunction:: paddle.fluid.layers.reduce_prod +gaussian_random +--------------- + +.. autofunction:: paddle.fluid.layers.gaussian_random :noindex: -sequence_first_step -------------------- +.. _api_fluid_layers_gaussian_random_batch_size_like: -.. autofunction:: paddle.fluid.layers.sequence_first_step +gaussian_random_batch_size_like +------------------------------- + +.. autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like :noindex: -sequence_last_step ------------------- +.. _api_fluid_layers_gru_unit: -.. autofunction:: paddle.fluid.layers.sequence_last_step +gru_unit +-------- + +.. autofunction:: paddle.fluid.layers.gru_unit :noindex: -dropout -------- +.. _api_fluid_layers_hard_sigmoid: -.. autofunction:: paddle.fluid.layers.dropout +hard_sigmoid +------------ + +.. autofunction:: paddle.fluid.layers.hard_sigmoid :noindex: -split ------ +.. _api_fluid_layers_hsigmoid: -.. autofunction:: paddle.fluid.layers.split +hsigmoid +-------- + +.. autofunction:: paddle.fluid.layers.hsigmoid :noindex: -ctc_greedy_decoder ------------------- +.. _api_fluid_layers_im2sequence: -.. autofunction:: paddle.fluid.layers.ctc_greedy_decoder +im2sequence +----------- + +.. autofunction:: paddle.fluid.layers.im2sequence :noindex: -edit_distance -------------- +.. _api_fluid_layers_image_resize: -.. autofunction:: paddle.fluid.layers.edit_distance +image_resize +------------ + +.. autofunction:: paddle.fluid.layers.image_resize + :noindex: + +.. _api_fluid_layers_image_resize_short: + +image_resize_short +------------------ + +.. autofunction:: paddle.fluid.layers.image_resize_short :noindex: +.. _api_fluid_layers_l2_normalize: + l2_normalize ------------ .. autofunction:: paddle.fluid.layers.l2_normalize :noindex: -matmul ------- +.. _api_fluid_layers_label_smooth: -.. autofunction:: paddle.fluid.layers.matmul +label_smooth +------------ + +.. autofunction:: paddle.fluid.layers.label_smooth :noindex: -topk ----- +.. _api_fluid_layers_layer_norm: -.. autofunction:: paddle.fluid.layers.topk +layer_norm +---------- + +.. autofunction:: paddle.fluid.layers.layer_norm :noindex: -warpctc -------- +.. _api_fluid_layers_leaky_relu: -.. autofunction:: paddle.fluid.layers.warpctc +leaky_relu +---------- + +.. autofunction:: paddle.fluid.layers.leaky_relu :noindex: -sequence_reshape +.. _api_fluid_layers_linear_chain_crf: + +linear_chain_crf ---------------- -.. autofunction:: paddle.fluid.layers.sequence_reshape +.. autofunction:: paddle.fluid.layers.linear_chain_crf :noindex: -transpose +.. _api_fluid_layers_lod_reset: + +lod_reset --------- -.. autofunction:: paddle.fluid.layers.transpose +.. autofunction:: paddle.fluid.layers.lod_reset :noindex: -im2sequence +.. _api_fluid_layers_log: + +log +--- + +.. autofunction:: paddle.fluid.layers.log + :noindex: + +.. 
_api_fluid_layers_logical_and: + +logical_and ----------- -.. autofunction:: paddle.fluid.layers.im2sequence +.. autofunction:: paddle.fluid.layers.logical_and :noindex: -nce ---- +.. _api_fluid_layers_logical_not: -.. autofunction:: paddle.fluid.layers.nce +logical_not +----------- + +.. autofunction:: paddle.fluid.layers.logical_not :noindex: -beam_search +.. _api_fluid_layers_logical_or: + +logical_or +---------- + +.. autofunction:: paddle.fluid.layers.logical_or + :noindex: + +.. _api_fluid_layers_logical_xor: + +logical_xor ----------- -.. autofunction:: paddle.fluid.layers.beam_search +.. autofunction:: paddle.fluid.layers.logical_xor :noindex: -row_conv --------- +.. _api_fluid_layers_lrn: -.. autofunction:: paddle.fluid.layers.row_conv +lrn +--- + +.. autofunction:: paddle.fluid.layers.lrn :noindex: -multiplex +.. _api_fluid_layers_lstm_unit: + +lstm_unit --------- -.. autofunction:: paddle.fluid.layers.multiplex +.. autofunction:: paddle.fluid.layers.lstm_unit :noindex: -layer_norm ----------- +.. _api_fluid_layers_matmul: -.. autofunction:: paddle.fluid.layers.layer_norm +matmul +------ + +.. autofunction:: paddle.fluid.layers.matmul :noindex: -softmax_with_cross_entropy --------------------------- +.. _api_fluid_layers_maxout: -.. autofunction:: paddle.fluid.layers.softmax_with_cross_entropy +maxout +------ + +.. autofunction:: paddle.fluid.layers.maxout :noindex: -smooth_l1 +.. _api_fluid_layers_mean: + +mean +---- + +.. autofunction:: paddle.fluid.layers.mean + :noindex: + +.. _api_fluid_layers_mean_iou: + +mean_iou +-------- + +.. autofunction:: paddle.fluid.layers.mean_iou + :noindex: + +.. _api_fluid_layers_mul: + +mul +--- + +.. autofunction:: paddle.fluid.layers.mul + :noindex: + +.. _api_fluid_layers_multiplex: + +multiplex --------- -.. autofunction:: paddle.fluid.layers.smooth_l1 +.. autofunction:: paddle.fluid.layers.multiplex :noindex: +.. _api_fluid_layers_nce: + +nce +--- + +.. autofunction:: paddle.fluid.layers.nce + :noindex: + +.. _api_fluid_layers_one_hot: + one_hot ------- .. autofunction:: paddle.fluid.layers.one_hot :noindex: -autoincreased_step_counter --------------------------- +.. _api_fluid_layers_pad: -.. autofunction:: paddle.fluid.layers.autoincreased_step_counter +pad +--- + +.. autofunction:: paddle.fluid.layers.pad :noindex: -reshape -------- +.. _api_fluid_layers_pad2d: -.. autofunction:: paddle.fluid.layers.reshape +pad2d +----- + +.. autofunction:: paddle.fluid.layers.pad2d :noindex: -lod_reset ---------- +.. _api_fluid_layers_pad_constant_like: -.. autofunction:: paddle.fluid.layers.lod_reset +pad_constant_like +----------------- + +.. autofunction:: paddle.fluid.layers.pad_constant_like :noindex: -lrn ---- +.. _api_fluid_layers_pool2d: -.. autofunction:: paddle.fluid.layers.lrn +pool2d +------ + +.. autofunction:: paddle.fluid.layers.pool2d :noindex: -pad +.. _api_fluid_layers_pool3d: + +pool3d +------ + +.. autofunction:: paddle.fluid.layers.pool3d + :noindex: + +.. _api_fluid_layers_pow: + +pow --- -.. autofunction:: paddle.fluid.layers.pad +.. autofunction:: paddle.fluid.layers.pow :noindex: -label_smooth ------------- +.. _api_fluid_layers_prelu: -.. autofunction:: paddle.fluid.layers.label_smooth +prelu +----- + +.. autofunction:: paddle.fluid.layers.prelu :noindex: -roi_pool --------- +.. _api_fluid_layers_random_crop: -.. autofunction:: paddle.fluid.layers.roi_pool +random_crop +----------- + +.. autofunction:: paddle.fluid.layers.random_crop :noindex: -dice_loss +.. _api_fluid_layers_rank_loss: + +rank_loss --------- -.. 
autofunction:: paddle.fluid.layers.dice_loss +.. autofunction:: paddle.fluid.layers.rank_loss :noindex: -image_resize ------------- +.. _api_fluid_layers_reduce_max: -.. autofunction:: paddle.fluid.layers.image_resize +reduce_max +---------- + +.. autofunction:: paddle.fluid.layers.reduce_max :noindex: -image_resize_short ------------------- +.. _api_fluid_layers_reduce_mean: -.. autofunction:: paddle.fluid.layers.image_resize_short +reduce_mean +----------- + +.. autofunction:: paddle.fluid.layers.reduce_mean :noindex: -resize_bilinear ---------------- +.. _api_fluid_layers_reduce_min: -.. autofunction:: paddle.fluid.layers.resize_bilinear +reduce_min +---------- + +.. autofunction:: paddle.fluid.layers.reduce_min :noindex: -gather ------- +.. _api_fluid_layers_reduce_prod: -.. autofunction:: paddle.fluid.layers.gather +reduce_prod +----------- + +.. autofunction:: paddle.fluid.layers.reduce_prod :noindex: -random_crop ------------ +.. _api_fluid_layers_reduce_sum: -.. autofunction:: paddle.fluid.layers.random_crop +reduce_sum +---------- + +.. autofunction:: paddle.fluid.layers.reduce_sum :noindex: -ops -=== +.. _api_fluid_layers_relu: -mean +relu ---- -.. autofunction:: paddle.fluid.layers.mean +.. autofunction:: paddle.fluid.layers.relu :noindex: -mul ---- +.. _api_fluid_layers_relu6: -.. autofunction:: paddle.fluid.layers.mul +relu6 +----- + +.. autofunction:: paddle.fluid.layers.relu6 + :noindex: + +.. _api_fluid_layers_reshape: + +reshape +------- + +.. autofunction:: paddle.fluid.layers.reshape + :noindex: + +.. _api_fluid_layers_resize_bilinear: + +resize_bilinear +--------------- + +.. autofunction:: paddle.fluid.layers.resize_bilinear + :noindex: + +.. _api_fluid_layers_roi_pool: + +roi_pool +-------- + +.. autofunction:: paddle.fluid.layers.roi_pool + :noindex: + +.. _api_fluid_layers_row_conv: + +row_conv +-------- + +.. autofunction:: paddle.fluid.layers.row_conv + :noindex: + +.. _api_fluid_layers_sampling_id: + +sampling_id +----------- + +.. autofunction:: paddle.fluid.layers.sampling_id :noindex: +.. _api_fluid_layers_scale: + scale ----- .. autofunction:: paddle.fluid.layers.scale :noindex: -sigmoid_cross_entropy_with_logits ---------------------------------- +.. _api_fluid_layers_scatter: -.. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits +scatter +------- + +.. autofunction:: paddle.fluid.layers.scatter :noindex: -elementwise_add +.. _api_fluid_layers_sequence_concat: + +sequence_concat --------------- -.. autofunction:: paddle.fluid.layers.elementwise_add +.. autofunction:: paddle.fluid.layers.sequence_concat :noindex: -elementwise_div ---------------- +.. _api_fluid_layers_sequence_conv: -.. autofunction:: paddle.fluid.layers.elementwise_div +sequence_conv +------------- + +.. autofunction:: paddle.fluid.layers.sequence_conv :noindex: -elementwise_sub ---------------- +.. _api_fluid_layers_sequence_enumerate: -.. autofunction:: paddle.fluid.layers.elementwise_sub +sequence_enumerate +------------------ + +.. autofunction:: paddle.fluid.layers.sequence_enumerate :noindex: -elementwise_mul +.. _api_fluid_layers_sequence_expand: + +sequence_expand --------------- -.. autofunction:: paddle.fluid.layers.elementwise_mul +.. autofunction:: paddle.fluid.layers.sequence_expand :noindex: -elementwise_max ---------------- +.. _api_fluid_layers_sequence_expand_as: -.. autofunction:: paddle.fluid.layers.elementwise_max +sequence_expand_as +------------------ + +.. 
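The ``elementwise_*`` and ``reduce_*`` functions listed in this section operate on whole tensors, broadcasting along a chosen axis and reducing along a chosen dimension. A short sketch assuming ``paddle.fluid``; the shapes are made up.

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[3, 4], dtype='float32')
    y = fluid.layers.data(name='y', shape=[3, 4], dtype='float32')

    z = fluid.layers.elementwise_add(x, y)          # same-shape addition
    z = fluid.layers.elementwise_mul(z, x)          # element-wise product
    total = fluid.layers.reduce_sum(z)              # sum over all elements
    row_mean = fluid.layers.reduce_mean(z, dim=1)   # reduce along dim 1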
autofunction:: paddle.fluid.layers.sequence_expand_as :noindex: -elementwise_min ---------------- +.. _api_fluid_layers_sequence_first_step: -.. autofunction:: paddle.fluid.layers.elementwise_min +sequence_first_step +------------------- + +.. autofunction:: paddle.fluid.layers.sequence_first_step :noindex: -elementwise_pow ---------------- +.. _api_fluid_layers_sequence_last_step: -.. autofunction:: paddle.fluid.layers.elementwise_pow +sequence_last_step +------------------ + +.. autofunction:: paddle.fluid.layers.sequence_last_step :noindex: -clip ----- +.. _api_fluid_layers_sequence_mask: -.. autofunction:: paddle.fluid.layers.clip +sequence_mask +------------- + +.. autofunction:: paddle.fluid.layers.sequence_mask :noindex: -clip_by_norm +.. _api_fluid_layers_sequence_pad: + +sequence_pad ------------ -.. autofunction:: paddle.fluid.layers.clip_by_norm +.. autofunction:: paddle.fluid.layers.sequence_pad :noindex: -logical_and ------------ +.. _api_fluid_layers_sequence_pool: -.. autofunction:: paddle.fluid.layers.logical_and +sequence_pool +------------- + +.. autofunction:: paddle.fluid.layers.sequence_pool :noindex: -logical_or ----------- +.. _api_fluid_layers_sequence_reshape: -.. autofunction:: paddle.fluid.layers.logical_or +sequence_reshape +---------------- + +.. autofunction:: paddle.fluid.layers.sequence_reshape :noindex: -logical_xor ------------ +.. _api_fluid_layers_sequence_scatter: -.. autofunction:: paddle.fluid.layers.logical_xor +sequence_scatter +---------------- + +.. autofunction:: paddle.fluid.layers.sequence_scatter :noindex: -logical_not ------------ +.. _api_fluid_layers_sequence_softmax: -.. autofunction:: paddle.fluid.layers.logical_not +sequence_softmax +---------------- + +.. autofunction:: paddle.fluid.layers.sequence_softmax :noindex: -uniform_random --------------- +.. _api_fluid_layers_shape: -.. autofunction:: paddle.fluid.layers.uniform_random +shape +----- + +.. autofunction:: paddle.fluid.layers.shape :noindex: -uniform_random_batch_size_like ------------------------------- +.. _api_fluid_layers_sigmoid_cross_entropy_with_logits: -.. autofunction:: paddle.fluid.layers.uniform_random_batch_size_like +sigmoid_cross_entropy_with_logits +--------------------------------- + +.. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits :noindex: -gaussian_random ---------------- +.. _api_fluid_layers_slice: -.. autofunction:: paddle.fluid.layers.gaussian_random +slice +----- + +.. autofunction:: paddle.fluid.layers.slice :noindex: -gaussian_random_batch_size_like -------------------------------- +.. _api_fluid_layers_smooth_l1: -.. autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like +smooth_l1 +--------- + +.. autofunction:: paddle.fluid.layers.smooth_l1 :noindex: -cumsum ------- +.. _api_fluid_layers_soft_relu: -.. autofunction:: paddle.fluid.layers.cumsum +soft_relu +--------- + +.. autofunction:: paddle.fluid.layers.soft_relu :noindex: -scatter +.. _api_fluid_layers_softmax: + +softmax ------- -.. autofunction:: paddle.fluid.layers.scatter +.. autofunction:: paddle.fluid.layers.softmax :noindex: -sum ---- - -.. autofunction:: paddle.fluid.layers.sum - :noindex: +.. _api_fluid_layers_softmax_with_cross_entropy: -polygon_box_transform ---------------------- +softmax_with_cross_entropy +-------------------------- -.. autofunction:: paddle.fluid.layers.polygon_box_transform +.. autofunction:: paddle.fluid.layers.softmax_with_cross_entropy :noindex: -shape +.. _api_fluid_layers_split: + +split ----- -.. 
autofunction:: paddle.fluid.layers.shape +.. autofunction:: paddle.fluid.layers.split :noindex: -maxout ------- +.. _api_fluid_layers_square_error_cost: -.. autofunction:: paddle.fluid.layers.maxout +square_error_cost +----------------- + +.. autofunction:: paddle.fluid.layers.square_error_cost :noindex: -sigmoid +.. _api_fluid_layers_squeeze: + +squeeze ------- -.. autofunction:: paddle.fluid.layers.sigmoid +.. autofunction:: paddle.fluid.layers.squeeze :noindex: -logsigmoid ----------- +.. _api_fluid_layers_stack: -.. autofunction:: paddle.fluid.layers.logsigmoid +stack +----- + +.. autofunction:: paddle.fluid.layers.stack :noindex: -exp +.. _api_fluid_layers_stanh: + +stanh +----- + +.. autofunction:: paddle.fluid.layers.stanh + :noindex: + +.. _api_fluid_layers_sum: + +sum --- -.. autofunction:: paddle.fluid.layers.exp +.. autofunction:: paddle.fluid.layers.sum :noindex: -relu ----- +.. _api_fluid_layers_swish: -.. autofunction:: paddle.fluid.layers.relu +swish +----- + +.. autofunction:: paddle.fluid.layers.swish :noindex: -tanh +.. _api_fluid_layers_topk: + +topk ---- -.. autofunction:: paddle.fluid.layers.tanh +.. autofunction:: paddle.fluid.layers.topk :noindex: -tanh_shrink ------------ +.. _api_fluid_layers_transpose: -.. autofunction:: paddle.fluid.layers.tanh_shrink +transpose +--------- + +.. autofunction:: paddle.fluid.layers.transpose :noindex: -softshrink ----------- +.. _api_fluid_layers_uniform_random_batch_size_like: -.. autofunction:: paddle.fluid.layers.softshrink +uniform_random_batch_size_like +------------------------------ + +.. autofunction:: paddle.fluid.layers.uniform_random_batch_size_like :noindex: -sqrt ----- +.. _api_fluid_layers_unsqueeze: -.. autofunction:: paddle.fluid.layers.sqrt +unsqueeze +--------- + +.. autofunction:: paddle.fluid.layers.unsqueeze + :noindex: + +.. _api_fluid_layers_unstack: + +unstack +------- + +.. autofunction:: paddle.fluid.layers.unstack + :noindex: + +.. _api_fluid_layers_warpctc: + +warpctc +------- + +.. autofunction:: paddle.fluid.layers.warpctc :noindex: +ops +=== + +.. _api_fluid_layers_abs: + abs --- .. autofunction:: paddle.fluid.layers.abs :noindex: +.. _api_fluid_layers_ceil: + ceil ---- .. autofunction:: paddle.fluid.layers.ceil :noindex: -floor ------ - -.. autofunction:: paddle.fluid.layers.floor - :noindex: +.. _api_fluid_layers_cos: cos --- @@ -892,177 +1218,267 @@ cos .. autofunction:: paddle.fluid.layers.cos :noindex: -sin +.. _api_fluid_layers_cumsum: + +cumsum +------ + +.. autofunction:: paddle.fluid.layers.cumsum + :noindex: + +.. _api_fluid_layers_exp: + +exp --- -.. autofunction:: paddle.fluid.layers.sin +.. autofunction:: paddle.fluid.layers.exp :noindex: -round +.. _api_fluid_layers_floor: + +floor ----- -.. autofunction:: paddle.fluid.layers.round +.. autofunction:: paddle.fluid.layers.floor + :noindex: + +.. _api_fluid_layers_hard_shrink: + +hard_shrink +----------- + +.. autofunction:: paddle.fluid.layers.hard_shrink :noindex: +.. _api_fluid_layers_logsigmoid: + +logsigmoid +---------- + +.. autofunction:: paddle.fluid.layers.logsigmoid + :noindex: + +.. _api_fluid_layers_reciprocal: + reciprocal ---------- .. autofunction:: paddle.fluid.layers.reciprocal :noindex: -log ---- +.. _api_fluid_layers_round: -.. autofunction:: paddle.fluid.layers.log +round +----- + +.. autofunction:: paddle.fluid.layers.round :noindex: -square ------- +.. _api_fluid_layers_sigmoid: + +sigmoid +------- + +.. autofunction:: paddle.fluid.layers.sigmoid + :noindex: + +.. _api_fluid_layers_sin: -.. 
autofunction:: paddle.fluid.layers.square +sin +--- + +.. autofunction:: paddle.fluid.layers.sin :noindex: +.. _api_fluid_layers_softplus: + softplus -------- .. autofunction:: paddle.fluid.layers.softplus :noindex: +.. _api_fluid_layers_softshrink: + +softshrink +---------- + +.. autofunction:: paddle.fluid.layers.softshrink + :noindex: + +.. _api_fluid_layers_softsign: + softsign -------- .. autofunction:: paddle.fluid.layers.softsign :noindex: -brelu ------ - -.. autofunction:: paddle.fluid.layers.brelu - :noindex: +.. _api_fluid_layers_sqrt: -leaky_relu ----------- +sqrt +---- -.. autofunction:: paddle.fluid.layers.leaky_relu +.. autofunction:: paddle.fluid.layers.sqrt :noindex: -soft_relu ---------- - -.. autofunction:: paddle.fluid.layers.soft_relu - :noindex: +.. _api_fluid_layers_square: -elu ---- +square +------ -.. autofunction:: paddle.fluid.layers.elu +.. autofunction:: paddle.fluid.layers.square :noindex: -relu6 ------ - -.. autofunction:: paddle.fluid.layers.relu6 - :noindex: +.. _api_fluid_layers_tanh: -pow ---- +tanh +---- -.. autofunction:: paddle.fluid.layers.pow +.. autofunction:: paddle.fluid.layers.tanh :noindex: -stanh ------ - -.. autofunction:: paddle.fluid.layers.stanh - :noindex: +.. _api_fluid_layers_tanh_shrink: -hard_shrink +tanh_shrink ----------- -.. autofunction:: paddle.fluid.layers.hard_shrink +.. autofunction:: paddle.fluid.layers.tanh_shrink :noindex: +.. _api_fluid_layers_thresholded_relu: + thresholded_relu ---------------- .. autofunction:: paddle.fluid.layers.thresholded_relu :noindex: -hard_sigmoid ------------- - -.. autofunction:: paddle.fluid.layers.hard_sigmoid - :noindex: +.. _api_fluid_layers_uniform_random: -swish ------ +uniform_random +-------------- -.. autofunction:: paddle.fluid.layers.swish +.. autofunction:: paddle.fluid.layers.uniform_random :noindex: tensor ====== -create_tensor -------------- +.. _api_fluid_layers_argmax: -.. autofunction:: paddle.fluid.layers.create_tensor +argmax +------ + +.. autofunction:: paddle.fluid.layers.argmax :noindex: -create_parameter ----------------- +.. _api_fluid_layers_argmin: -.. autofunction:: paddle.fluid.layers.create_parameter +argmin +------ + +.. autofunction:: paddle.fluid.layers.argmin :noindex: -create_global_var ------------------ +.. _api_fluid_layers_argsort: -.. autofunction:: paddle.fluid.layers.create_global_var +argsort +------- + +.. autofunction:: paddle.fluid.layers.argsort + :noindex: + +.. _api_fluid_layers_assign: + +assign +------ + +.. autofunction:: paddle.fluid.layers.assign :noindex: +.. _api_fluid_layers_cast: + cast ---- .. autofunction:: paddle.fluid.layers.cast :noindex: +.. _api_fluid_layers_concat: + concat ------ .. autofunction:: paddle.fluid.layers.concat :noindex: -sums ----- +.. _api_fluid_layers_create_global_var: -.. autofunction:: paddle.fluid.layers.sums +create_global_var +----------------- + +.. autofunction:: paddle.fluid.layers.create_global_var :noindex: -assign ------- +.. _api_fluid_layers_create_parameter: -.. autofunction:: paddle.fluid.layers.assign +create_parameter +---------------- + +.. autofunction:: paddle.fluid.layers.create_parameter :noindex: -fill_constant_batch_size_like ------------------------------ +.. _api_fluid_layers_create_tensor: -.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like +create_tensor +------------- + +.. autofunction:: paddle.fluid.layers.create_tensor :noindex: +.. _api_fluid_layers_fill_constant: + fill_constant ------------- .. autofunction:: paddle.fluid.layers.fill_constant :noindex: +.. 
_api_fluid_layers_fill_constant_batch_size_like: + +fill_constant_batch_size_like +----------------------------- + +.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like + :noindex: + +.. _api_fluid_layers_ones: + ones ---- .. autofunction:: paddle.fluid.layers.ones :noindex: +.. _api_fluid_layers_reverse: + +reverse +------- + +.. autofunction:: paddle.fluid.layers.reverse + :noindex: + +.. _api_fluid_layers_sums: + +sums +---- + +.. autofunction:: paddle.fluid.layers.sums + :noindex: + +.. _api_fluid_layers_zeros: + zeros ----- @@ -1072,96 +1488,201 @@ zeros learning_rate_scheduler ======================= -exponential_decay ------------------ +.. _api_fluid_layers_append_LARS: -.. autofunction:: paddle.fluid.layers.exponential_decay +append_LARS +----------- + +.. autofunction:: paddle.fluid.layers.append_LARS :noindex: -natural_exp_decay +.. _api_fluid_layers_exponential_decay: + +exponential_decay ----------------- -.. autofunction:: paddle.fluid.layers.natural_exp_decay +.. autofunction:: paddle.fluid.layers.exponential_decay :noindex: +.. _api_fluid_layers_inverse_time_decay: + inverse_time_decay ------------------ .. autofunction:: paddle.fluid.layers.inverse_time_decay :noindex: -polynomial_decay ----------------- +.. _api_fluid_layers_natural_exp_decay: -.. autofunction:: paddle.fluid.layers.polynomial_decay +natural_exp_decay +----------------- + +.. autofunction:: paddle.fluid.layers.natural_exp_decay + :noindex: + +.. _api_fluid_layers_noam_decay: + +noam_decay +---------- + +.. autofunction:: paddle.fluid.layers.noam_decay :noindex: +.. _api_fluid_layers_piecewise_decay: + piecewise_decay --------------- .. autofunction:: paddle.fluid.layers.piecewise_decay :noindex: -noam_decay ----------- +.. _api_fluid_layers_polynomial_decay: -.. autofunction:: paddle.fluid.layers.noam_decay +polynomial_decay +---------------- + +.. autofunction:: paddle.fluid.layers.polynomial_decay :noindex: detection ========= -prior_box ---------- - -.. autofunction:: paddle.fluid.layers.prior_box - :noindex: +.. _api_fluid_layers_anchor_generator: -multi_box_head --------------- +anchor_generator +---------------- -.. autofunction:: paddle.fluid.layers.multi_box_head +.. autofunction:: paddle.fluid.layers.anchor_generator :noindex: +.. _api_fluid_layers_bipartite_match: + bipartite_match --------------- .. autofunction:: paddle.fluid.layers.bipartite_match :noindex: -target_assign +.. _api_fluid_layers_box_coder: + +box_coder +--------- + +.. autofunction:: paddle.fluid.layers.box_coder + :noindex: + +.. _api_fluid_layers_detection_map: + +detection_map ------------- -.. autofunction:: paddle.fluid.layers.target_assign +.. autofunction:: paddle.fluid.layers.detection_map :noindex: +.. _api_fluid_layers_detection_output: + detection_output ---------------- .. autofunction:: paddle.fluid.layers.detection_output :noindex: -ssd_loss --------- +.. _api_fluid_layers_generate_proposal_labels: -.. autofunction:: paddle.fluid.layers.ssd_loss +generate_proposal_labels +------------------------ + +.. autofunction:: paddle.fluid.layers.generate_proposal_labels :noindex: -detection_map -------------- +.. _api_fluid_layers_generate_proposals: -.. autofunction:: paddle.fluid.layers.detection_map +generate_proposals +------------------ + +.. autofunction:: paddle.fluid.layers.generate_proposals :noindex: +.. _api_fluid_layers_iou_similarity: + iou_similarity -------------- .. autofunction:: paddle.fluid.layers.iou_similarity :noindex: -box_coder +.. 
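The ``learning_rate_scheduler`` entries return a variable that can be passed to an optimizer in place of a constant learning rate. A sketch assuming ``paddle.fluid``; the decay constants are arbitrary.

.. code-block:: python

    import paddle.fluid as fluid

    # Multiply the base rate by 0.9 every 10000 steps.
    lr = fluid.layers.exponential_decay(learning_rate=0.1,
                                        decay_steps=10000,
                                        decay_rate=0.9,
                                        staircase=True)
    optimizer = fluid.optimizer.SGD(learning_rate=lr)

    # Or a step-wise schedule with explicit boundaries.
    lr2 = fluid.layers.piecewise_decay(boundaries=[10000, 20000],
                                       values=[0.1, 0.01, 0.001])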
_api_fluid_layers_multi_box_head: + +multi_box_head +-------------- + +.. autofunction:: paddle.fluid.layers.multi_box_head + :noindex: + +.. _api_fluid_layers_polygon_box_transform: + +polygon_box_transform +--------------------- + +.. autofunction:: paddle.fluid.layers.polygon_box_transform + :noindex: + +.. _api_fluid_layers_prior_box: + +prior_box --------- -.. autofunction:: paddle.fluid.layers.box_coder +.. autofunction:: paddle.fluid.layers.prior_box + :noindex: + +.. _api_fluid_layers_roi_perspective_transform: + +roi_perspective_transform +------------------------- + +.. autofunction:: paddle.fluid.layers.roi_perspective_transform + :noindex: + +.. _api_fluid_layers_rpn_target_assign: + +rpn_target_assign +----------------- + +.. autofunction:: paddle.fluid.layers.rpn_target_assign + :noindex: + +.. _api_fluid_layers_ssd_loss: + +ssd_loss +-------- + +.. autofunction:: paddle.fluid.layers.ssd_loss + :noindex: + +.. _api_fluid_layers_target_assign: + +target_assign +------------- + +.. autofunction:: paddle.fluid.layers.target_assign + :noindex: + +metric_op +========= + +.. _api_fluid_layers_accuracy: + +accuracy +-------- + +.. autofunction:: paddle.fluid.layers.accuracy + :noindex: + +.. _api_fluid_layers_auc: + +auc +--- + +.. autofunction:: paddle.fluid.layers.auc :noindex: diff --git a/source/api_reference/metrics.rst b/doc/fluid/api/metrics.rst similarity index 59% rename from source/api_reference/metrics.rst rename to doc/fluid/api/metrics.rst index ddf07775d7ea293acd421b8549d03b277ff0611d..5b5f7bc3b4155d4d859573234df722e098365099 100644 --- a/source/api_reference/metrics.rst +++ b/doc/fluid/api/metrics.rst @@ -1,31 +1,30 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -======= -metrics -======= +============= +fluid.metrics +============= -MetricBase ----------- +.. _api_fluid_metrics_Accuracy: -.. autoclass:: paddle.fluid.metrics.MetricBase - :members: - :noindex: - -CompositeMetric ---------------- +Accuracy +-------- -.. autoclass:: paddle.fluid.metrics.CompositeMetric +.. autoclass:: paddle.fluid.metrics.Accuracy :members: :noindex: -Accuracy --------- +.. _api_fluid_metrics_Auc: -.. autoclass:: paddle.fluid.metrics.Accuracy +Auc +--- + +.. autoclass:: paddle.fluid.metrics.Auc :members: :noindex: +.. _api_fluid_metrics_ChunkEvaluator: + ChunkEvaluator -------------- @@ -33,13 +32,17 @@ ChunkEvaluator :members: :noindex: -EditDistance ------------- +.. _api_fluid_metrics_CompositeMetric: -.. autoclass:: paddle.fluid.metrics.EditDistance +CompositeMetric +--------------- + +.. autoclass:: paddle.fluid.metrics.CompositeMetric :members: :noindex: +.. _api_fluid_metrics_DetectionMAP: + DetectionMAP ------------ @@ -47,10 +50,39 @@ DetectionMAP :members: :noindex: -Auc ---- +.. _api_fluid_metrics_EditDistance: -.. autoclass:: paddle.fluid.metrics.Auc +EditDistance +------------ + +.. autoclass:: paddle.fluid.metrics.EditDistance + :members: + :noindex: + +.. _api_fluid_metrics_MetricBase: + +MetricBase +---------- + +.. autoclass:: paddle.fluid.metrics.MetricBase + :members: + :noindex: + +.. _api_fluid_metrics_Precision: + +Precision +--------- + +.. autoclass:: paddle.fluid.metrics.Precision + :members: + :noindex: + +.. _api_fluid_metrics_Recall: + +Recall +------ + +.. 
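Unlike the in-graph ``metric_op`` layers (``accuracy``, ``auc``) above, the ``fluid.metrics`` classes accumulate statistics on the Python side across minibatches. A sketch assuming ``paddle.fluid``; the per-batch values are made up and would normally be fetched from the running program.

.. code-block:: python

    import paddle.fluid as fluid

    accuracy = fluid.metrics.Accuracy()
    for pass_id in range(2):
        accuracy.reset()
        for batch_acc, batch_size in [(0.85, 128), (0.90, 128), (0.95, 64)]:
            accuracy.update(value=batch_acc, weight=batch_size)
        print("pass %d accuracy: %.4f" % (pass_id, accuracy.eval()))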
autoclass:: paddle.fluid.metrics.Recall :members: :noindex: diff --git a/source/api_reference/nets.rst b/doc/fluid/api/nets.rst similarity index 61% rename from source/api_reference/nets.rst rename to doc/fluid/api/nets.rst index 7ae3187304f386a08c5cb8a4ba093423a58a7f36..f3792e62946968e5438f928e3fe50e5b188fc274 100644 --- a/source/api_reference/nets.rst +++ b/doc/fluid/api/nets.rst @@ -1,21 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -==== -nets -==== +========== +fluid.nets +========== -simple_img_conv_pool --------------------- - -.. autofunction:: paddle.fluid.nets.simple_img_conv_pool - :noindex: - -sequence_conv_pool ------------------- - -.. autofunction:: paddle.fluid.nets.sequence_conv_pool - :noindex: +.. _api_fluid_nets_glu: glu --- @@ -23,9 +13,35 @@ glu .. autofunction:: paddle.fluid.nets.glu :noindex: +.. _api_fluid_nets_img_conv_group: + +img_conv_group +-------------- + +.. autofunction:: paddle.fluid.nets.img_conv_group + :noindex: + +.. _api_fluid_nets_scaled_dot_product_attention: + scaled_dot_product_attention ---------------------------- .. autofunction:: paddle.fluid.nets.scaled_dot_product_attention :noindex: +.. _api_fluid_nets_sequence_conv_pool: + +sequence_conv_pool +------------------ + +.. autofunction:: paddle.fluid.nets.sequence_conv_pool + :noindex: + +.. _api_fluid_nets_simple_img_conv_pool: + +simple_img_conv_pool +-------------------- + +.. autofunction:: paddle.fluid.nets.simple_img_conv_pool + :noindex: + diff --git a/source/api_reference/optimizer.rst b/doc/fluid/api/optimizer.rst similarity index 62% rename from source/api_reference/optimizer.rst rename to doc/fluid/api/optimizer.rst index 6ad44bb6905b6e3f2b6e4aeb3701ced5d18e2005..db78fa169f3b0d3da95037783c88131e38f6cd2b 100644 --- a/source/api_reference/optimizer.rst +++ b/doc/fluid/api/optimizer.rst @@ -1,24 +1,21 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -========= -optimizer -========= +=============== +fluid.optimizer +=============== -SGD ---- - -.. autoclass:: paddle.fluid.optimizer.SGD - :members: - :noindex: +.. _api_fluid_optimizer_Adadelta: -Momentum +Adadelta -------- -.. autoclass:: paddle.fluid.optimizer.Momentum +.. autoclass:: paddle.fluid.optimizer.Adadelta :members: :noindex: +.. _api_fluid_optimizer_Adagrad: + Adagrad ------- @@ -26,6 +23,17 @@ Adagrad :members: :noindex: +.. _api_fluid_optimizer_AdagradOptimizer: + +AdagradOptimizer +---------------- + +.. autoclass:: paddle.fluid.optimizer.AdagradOptimizer + :members: + :noindex: + +.. _api_fluid_optimizer_Adam: + Adam ---- @@ -33,6 +41,8 @@ Adam :members: :noindex: +.. _api_fluid_optimizer_Adamax: + Adamax ------ @@ -40,6 +50,26 @@ Adamax :members: :noindex: +.. _api_fluid_optimizer_AdamaxOptimizer: + +AdamaxOptimizer +--------------- + +.. autoclass:: paddle.fluid.optimizer.AdamaxOptimizer + :members: + :noindex: + +.. _api_fluid_optimizer_AdamOptimizer: + +AdamOptimizer +------------- + +.. autoclass:: paddle.fluid.optimizer.AdamOptimizer + :members: + :noindex: + +.. _api_fluid_optimizer_DecayedAdagrad: + DecayedAdagrad -------------- @@ -47,48 +77,62 @@ DecayedAdagrad :members: :noindex: -SGDOptimizer ------------- +.. _api_fluid_optimizer_DecayedAdagradOptimizer: -.. autoclass:: paddle.fluid.optimizer.SGDOptimizer +DecayedAdagradOptimizer +----------------------- + +.. autoclass:: paddle.fluid.optimizer.DecayedAdagradOptimizer :members: :noindex: -MomentumOptimizer ------------------ +.. _api_fluid_optimizer_Ftrl: -.. 
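The optimizer classes appear both under their short aliases (``SGD``, ``Adam``, ``Momentum``) and the underlying ``*Optimizer`` names; either form is constructed once and asked to minimize a loss variable, which appends the backward pass and parameter updates to the program. A sketch assuming ``paddle.fluid``; the regression network is only a stand-in.

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    avg_loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

    optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
    optimizer.minimize(avg_loss)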
autoclass:: paddle.fluid.optimizer.MomentumOptimizer +Ftrl +---- + +.. autoclass:: paddle.fluid.optimizer.Ftrl :members: :noindex: -AdagradOptimizer ----------------- +.. _api_fluid_optimizer_FtrlOptimizer: -.. autoclass:: paddle.fluid.optimizer.AdagradOptimizer +FtrlOptimizer +------------- + +.. autoclass:: paddle.fluid.optimizer.FtrlOptimizer :members: :noindex: -AdamOptimizer -------------- +.. _api_fluid_optimizer_ModelAverage: -.. autoclass:: paddle.fluid.optimizer.AdamOptimizer +ModelAverage +------------ + +.. autoclass:: paddle.fluid.optimizer.ModelAverage :members: :noindex: -AdamaxOptimizer ---------------- +.. _api_fluid_optimizer_Momentum: -.. autoclass:: paddle.fluid.optimizer.AdamaxOptimizer +Momentum +-------- + +.. autoclass:: paddle.fluid.optimizer.Momentum :members: :noindex: -DecayedAdagradOptimizer ------------------------ +.. _api_fluid_optimizer_MomentumOptimizer: -.. autoclass:: paddle.fluid.optimizer.DecayedAdagradOptimizer +MomentumOptimizer +----------------- + +.. autoclass:: paddle.fluid.optimizer.MomentumOptimizer :members: :noindex: +.. _api_fluid_optimizer_RMSPropOptimizer: + RMSPropOptimizer ---------------- @@ -96,24 +140,30 @@ RMSPropOptimizer :members: :noindex: -Adadelta --------- +.. _api_fluid_optimizer_RMSPropOptimizer: -.. autoclass:: paddle.fluid.optimizer.Adadelta +RMSPropOptimizer +---------------- + +.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer :members: :noindex: -ModelAverage ------------- +.. _api_fluid_optimizer_SGD: -.. autoclass:: paddle.fluid.optimizer.ModelAverage +SGD +--- + +.. autoclass:: paddle.fluid.optimizer.SGD :members: :noindex: -Optimizer ---------- +.. _api_fluid_optimizer_SGDOptimizer: -.. autoclass:: paddle.fluid.optimizer.Optimizer +SGDOptimizer +------------ + +.. autoclass:: paddle.fluid.optimizer.SGDOptimizer :members: :noindex: diff --git a/source/api_reference/param_attr.rst b/doc/fluid/api/param_attr.rst similarity index 70% rename from source/api_reference/param_attr.rst rename to doc/fluid/api/param_attr.rst index 8e4ddb2b0492d0fcfcade199fdd6dfe43faa7075..33035bbc7ca5c8d000adeaf1cb79806a3ea64604 100644 --- a/source/api_reference/param_attr.rst +++ b/doc/fluid/api/param_attr.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -========== -param_attr -========== +================ +fluid.param_attr +================ + +.. _api_fluid_param_attr_ParamAttr: ParamAttr --------- @@ -12,6 +14,8 @@ ParamAttr :members: :noindex: +.. _api_fluid_param_attr_WeightNormParamAttr: + WeightNormParamAttr ------------------- diff --git a/source/api_reference/profiler.rst b/doc/fluid/api/profiler.rst similarity index 70% rename from source/api_reference/profiler.rst rename to doc/fluid/api/profiler.rst index 39fda65863471a78895503184848a754828b71a1..cff8e8c2428f8a75b0f145605c8eec536fefc58e 100644 --- a/source/api_reference/profiler.rst +++ b/doc/fluid/api/profiler.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -======== -profiler -======== +============== +fluid.profiler +============== + +.. _api_fluid_profiler_cuda_profiler: cuda_profiler ------------- @@ -11,11 +13,7 @@ cuda_profiler .. autofunction:: paddle.fluid.profiler.cuda_profiler :noindex: -reset_profiler --------------- - -.. autofunction:: paddle.fluid.profiler.reset_profiler - :noindex: +.. _api_fluid_profiler_profiler: profiler -------- @@ -23,12 +21,24 @@ profiler .. autofunction:: paddle.fluid.profiler.profiler :noindex: +.. 
_api_fluid_profiler_reset_profiler: + +reset_profiler +-------------- + +.. autofunction:: paddle.fluid.profiler.reset_profiler + :noindex: + +.. _api_fluid_profiler_start_profiler: + start_profiler -------------- .. autofunction:: paddle.fluid.profiler.start_profiler :noindex: +.. _api_fluid_profiler_stop_profiler: + stop_profiler ------------- diff --git a/source/api_reference/regularizer.rst b/doc/fluid/api/regularizer.rst similarity index 63% rename from source/api_reference/regularizer.rst rename to doc/fluid/api/regularizer.rst index 756bc53baa0625aef48dad0c35e7ae57421a70d0..dcf69eba8a8db2e02cfcc04472995376a579ccbe 100644 --- a/source/api_reference/regularizer.rst +++ b/doc/fluid/api/regularizer.rst @@ -1,30 +1,30 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -=========== -regularizer -=========== +================= +fluid.regularizer +================= -append_regularization_ops -------------------------- +.. _api_fluid_regularizer_L1Decay: -.. autofunction:: paddle.fluid.regularizer.append_regularization_ops - :noindex: - -WeightDecayRegularizer ----------------------- +L1Decay +------- -.. autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer +.. autoclass:: paddle.fluid.regularizer.L1Decay :members: :noindex: -L1Decay -------- +.. _api_fluid_regularizer_L1DecayRegularizer: -.. autoclass:: paddle.fluid.regularizer.L1Decay +L1DecayRegularizer +------------------ + +.. autoclass:: paddle.fluid.regularizer.L1DecayRegularizer :members: :noindex: +.. _api_fluid_regularizer_L2Decay: + L2Decay ------- @@ -32,12 +32,7 @@ L2Decay :members: :noindex: -L1DecayRegularizer ------------------- - -.. autoclass:: paddle.fluid.regularizer.L1DecayRegularizer - :members: - :noindex: +.. _api_fluid_regularizer_L2DecayRegularizer: L2DecayRegularizer ------------------ diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst new file mode 100644 index 0000000000000000000000000000000000000000..7764f60deda2fac9b9c0e65e03dc7a3bf0f2f439 --- /dev/null +++ b/doc/fluid/api/transpiler.rst @@ -0,0 +1,59 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +================ +fluid.transpiler +================ + +.. _api_fluid_transpiler_DistributeTranspiler: + +DistributeTranspiler +-------------------- + +.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler + :members: + :noindex: + +.. _api_fluid_transpiler_DistributeTranspilerConfig: + +DistributeTranspilerConfig +-------------------------- + +.. autoclass:: paddle.fluid.transpiler.DistributeTranspilerConfig + :members: + :noindex: + +.. _api_fluid_transpiler_HashName: + +HashName +-------- + +.. autoclass:: paddle.fluid.transpiler.HashName + :members: + :noindex: + +.. _api_fluid_transpiler_memory_optimize: + +memory_optimize +--------------- + +.. autofunction:: paddle.fluid.transpiler.memory_optimize + :noindex: + +.. _api_fluid_transpiler_release_memory: + +release_memory +-------------- + +.. autofunction:: paddle.fluid.transpiler.release_memory + :noindex: + +.. _api_fluid_transpiler_RoundRobin: + +RoundRobin +---------- + +.. 
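The regularizer classes (``L1Decay``/``L2Decay`` and their ``*Regularizer`` aliases) are usually attached globally to an optimizer, or per parameter through ``ParamAttr``. A sketch assuming ``paddle.fluid``; the coefficients are arbitrary.

.. code-block:: python

    import paddle.fluid as fluid

    # Global weight decay applied to all trainable parameters.
    optimizer = fluid.optimizer.SGD(
        learning_rate=0.01,
        regularization=fluid.regularizer.L2Decay(regularization_coeff=1e-4))

    # Or per parameter, via its attribute.
    w_attr = fluid.ParamAttr(regularizer=fluid.regularizer.L1Decay(1e-4))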
autoclass:: paddle.fluid.transpiler.RoundRobin + :members: + :noindex: + diff --git a/doc/fluid/beginners_guide/basics/image_classification/.gitignore b/doc/fluid/beginners_guide/basics/image_classification/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..dc7c62b06287ad333dd41082e566b0553d3a5341 --- /dev/null +++ b/doc/fluid/beginners_guide/basics/image_classification/.gitignore @@ -0,0 +1,8 @@ +*.pyc +train.log +output +data/cifar-10-batches-py/ +data/cifar-10-python.tar.gz +data/*.txt +data/*.list +data/mean.meta diff --git a/doc/fluid/beginners_guide/basics/image_classification/image b/doc/fluid/beginners_guide/basics/image_classification/image new file mode 120000 index 0000000000000000000000000000000000000000..557ef69573cf03e06c1053970cf22695f3674ace --- /dev/null +++ b/doc/fluid/beginners_guide/basics/image_classification/image @@ -0,0 +1 @@ +../../../../../external/book/03.image_classification/image \ No newline at end of file diff --git a/doc/fluid/beginners_guide/basics/image_classification/index.md b/doc/fluid/beginners_guide/basics/image_classification/index.md new file mode 120000 index 0000000000000000000000000000000000000000..18ab749ec38e835f14299c09c03192919bda41bb --- /dev/null +++ b/doc/fluid/beginners_guide/basics/image_classification/index.md @@ -0,0 +1 @@ +../../../../../external/book/03.image_classification/README.cn.md \ No newline at end of file diff --git a/doc/fluid/beginners_guide/basics/index.rst b/doc/fluid/beginners_guide/basics/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..f293320907005d36d1141a6a0106e28bfd9b43e1 --- /dev/null +++ b/doc/fluid/beginners_guide/basics/index.rst @@ -0,0 +1,18 @@ +################ +深度学习基础知识 +################ + + +.. todo:: + + 概述 + +.. 
toctree:: + :titlesonly: + + image_classification/index.md + word2vec/index.md + recommender_system/index.md + understand_sentiment/index.md + label_semantic_roles/index.md + machine_translation/index.md diff --git a/doc/fluid/beginners_guide/basics/label_semantic_roles/.gitignore b/doc/fluid/beginners_guide/basics/label_semantic_roles/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..29b5622a53a1b0847e9f53febf1cc50dcf4f044a --- /dev/null +++ b/doc/fluid/beginners_guide/basics/label_semantic_roles/.gitignore @@ -0,0 +1,12 @@ +data/train.list +data/test.* +data/conll05st-release.tar.gz +data/conll05st-release +data/predicate_dict +data/label_dict +data/word_dict +data/emb +data/feature +output +predict.res +train.log diff --git a/doc/fluid/beginners_guide/basics/label_semantic_roles/image b/doc/fluid/beginners_guide/basics/label_semantic_roles/image new file mode 120000 index 0000000000000000000000000000000000000000..524699b463e79ccef236b5b82e75520411f19f3c --- /dev/null +++ b/doc/fluid/beginners_guide/basics/label_semantic_roles/image @@ -0,0 +1 @@ +../../../../../external/book/07.label_semantic_roles/image \ No newline at end of file diff --git a/doc/fluid/beginners_guide/basics/label_semantic_roles/index.md b/doc/fluid/beginners_guide/basics/label_semantic_roles/index.md new file mode 120000 index 0000000000000000000000000000000000000000..8e482e13129cade3153b79fc4c334a8bff858af5 --- /dev/null +++ b/doc/fluid/beginners_guide/basics/label_semantic_roles/index.md @@ -0,0 +1 @@ +../../../../../external/book/07.label_semantic_roles/README.cn.md \ No newline at end of file diff --git a/doc/fluid/beginners_guide/basics/learning_materials.md b/doc/fluid/beginners_guide/basics/learning_materials.md new file mode 100644 index 0000000000000000000000000000000000000000..a27499c6ed8d1149c6d519006086febbcae943fa --- /dev/null +++ b/doc/fluid/beginners_guide/basics/learning_materials.md @@ -0,0 +1,54 @@ +# 学习资料 + +## 要读的第一本书 +基础理论习得的最直接来源就是书本。按机器学习理论、深度学习理论、编程语言三方面划分,这里推荐如下书籍辅助您。 + + +### 机器学习理论 + +在开启深度学习之前,您需要先行掌握机器学习的理论。深度学习是机器学习中的一个分支,两者内在的理论基础存在强关联。 +机器学习理论的书籍教材比较多,这里推荐一本易懂易学的书籍,可以重点关注神经网络部分。 + +书名:《机器学习》(周志华著,清华大学出版社,2016年版) + +### 深度学习理论 + +打好机器学习的理论功底后,您可以开始钻研深度学习的理论。通常深度学习理论会给人留下抽象难懂的印象,且和数学结合紧密。 +为了让您能够顺利入门,这里推荐一份易学易用的教材,无论深度学习理论还是数学理论即可一本搞定。 + +书名:《Deep Learning(深度学习)》(Goodfellow, Bengio, Courville合著,赵申剑、黎彧君、符天凡和李凯合译,人民邮电出版社,2017年版) +此书电子版在Github上已经开源,详情可参考此链接 [《深度学习》](https://github.com/exacity/deeplearningbook-chinese) + +### 编程语言 + +Python方向:这里推荐您学习Python,一方面各大主流深度学习框架的主力支撑编程语言均为Python;另一方面,对比其他语言,Python较为简单易学。 +Python的教材种类较多,这里推荐一本实操和理论性都兼顾的教材,只要完成书中52个习题,跑代码然后发现问题解决,就能逐步上手。 + +书名:《“笨办法”学Python》(Zed Shaw著,王巍巍译,人民邮电出版社,2014年11月版) + + +C++方向:C++语言在底层框架中使用较多,您逐步掌握开源框架的基本操作后,在更高阶的框架应用中会用到这个技能点。 +同前面提到的Python一样,学习C++时需要多上手操作。这里推荐迅速上手C++的书籍,不但能够学习功能和结构,还提供了解决方案的示例。 + +书名:《Essential C++》【美】李普曼(Lippman,S.B.)著,侯捷译,电子工业出版社2013年8月版 + + + +## 要看的视频公开课 + +在学习一门新技术的同时,除了看书,如果有老师面对面教授,可以更快更好的学会知识。相比于线下授课,视频公开课能够在省钱省力的同时,达到易学易掌握的效果。 +目前深度学习的课程多是公开免费的,通过学习您可以更轻松的理解深度学习中的抽象理论,并在实操方面不绕弯路。 +综合课程生动性、可操作性、紧凑性、连续性这些特点,这里推荐如下课程,同步附上网址,便于您查找学习。 + +### 理论知识详解视频课 +[机器学习](http://open.163.com/special/opencourse/machinelearning.html) 斯坦福大学教授吴恩达公开课程,包含相关算法的详细讲解。 + +[AI技术](https://ai.baidu.com/paddlepaddle/player?id=13) 百度推出的“AI核心技术掌握”课程,每节课在20-30分钟左右,从AI技术到深度学习进行全面细致的解读。 + +[深度学习](http://speech.ee.ntu.edu.tw/~tlkagk/courses_ML17_2.html) 台湾李宏毅教授的在线课程,其中是英文课程,会结合国外的科研成果,但也适合新手入门和理解深度学习。 + +[编程语言](https://ai.baidu.com/paddlepaddle/openCourses) Python操作课程,从基础到进阶操作都提供详细说明,每节课时长20分钟左右。 + +### 
PaddlePaddle实操视频课 +掌握好理论基础,具备编程能力后,您可以开始使用PaddlePaddle Fluid进行实操,从初阶开始学习,向着中高阶努力。 +目前已有PaddlePaddle官方视频公开课在官网呈现,内含PaddlePaddle实战、PaddlePaddle应用场景和机器学习模型讲解课程,帮助开发者从零开始使用PaddlePaddle,从简单场景逐步过渡到工业级应用。[点击这里](http://ai.baidu.com/paddlepaddle/openCourses)您即可开始视频课的学习之旅。 diff --git a/doc/fluid/beginners_guide/basics/machine_translation/.gitignore b/doc/fluid/beginners_guide/basics/machine_translation/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6129b9e8645010fcb8372d9dc3dbb568dfa80907 --- /dev/null +++ b/doc/fluid/beginners_guide/basics/machine_translation/.gitignore @@ -0,0 +1,9 @@ +data/wmt14 +data/pre-wmt14 +pretrained/wmt14_model +gen.log +gen_result +train.log +dataprovider_copy_1.py +*.pyc +multi-bleu.perl diff --git a/doc/fluid/beginners_guide/basics/machine_translation/image b/doc/fluid/beginners_guide/basics/machine_translation/image new file mode 120000 index 0000000000000000000000000000000000000000..0101c21f5870c3a796cda5f1eaaaa61855a7442f --- /dev/null +++ b/doc/fluid/beginners_guide/basics/machine_translation/image @@ -0,0 +1 @@ +../../../../../external/book/08.machine_translation/image \ No newline at end of file diff --git a/doc/fluid/beginners_guide/basics/machine_translation/index.md b/doc/fluid/beginners_guide/basics/machine_translation/index.md new file mode 120000 index 0000000000000000000000000000000000000000..fad1225ac49b1084e9d9a6e8e1df9367053c346b --- /dev/null +++ b/doc/fluid/beginners_guide/basics/machine_translation/index.md @@ -0,0 +1 @@ +../../../../../external/book/08.machine_translation/README.cn.md \ No newline at end of file diff --git a/doc/fluid/beginners_guide/basics/recommender_system/.gitignore b/doc/fluid/beginners_guide/basics/recommender_system/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f23901aeb3a9e7cd12611fc556742670d04a9bb5 --- /dev/null +++ b/doc/fluid/beginners_guide/basics/recommender_system/.gitignore @@ -0,0 +1,2 @@ +.idea +.ipynb_checkpoints diff --git a/doc/fluid/beginners_guide/basics/recommender_system/image b/doc/fluid/beginners_guide/basics/recommender_system/image new file mode 120000 index 0000000000000000000000000000000000000000..af4f41218de1544bcbb7709e44146e615e4f9804 --- /dev/null +++ b/doc/fluid/beginners_guide/basics/recommender_system/image @@ -0,0 +1 @@ +../../../../../external/book/05.recommender_system/image \ No newline at end of file diff --git a/doc/fluid/beginners_guide/basics/recommender_system/index.md b/doc/fluid/beginners_guide/basics/recommender_system/index.md new file mode 120000 index 0000000000000000000000000000000000000000..2bbbdc54e0b27d2a437530b255091312390371d0 --- /dev/null +++ b/doc/fluid/beginners_guide/basics/recommender_system/index.md @@ -0,0 +1 @@ +../../../../../external/book/05.recommender_system/README.cn.md \ No newline at end of file diff --git a/doc/fluid/beginners_guide/basics/understand_sentiment/.gitignore b/doc/fluid/beginners_guide/basics/understand_sentiment/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..667762d327cb160376a4119fa9df9db41b6443b2 --- /dev/null +++ b/doc/fluid/beginners_guide/basics/understand_sentiment/.gitignore @@ -0,0 +1,10 @@ +data/aclImdb +data/imdb +data/pre-imdb +data/mosesdecoder-master +*.log +model_output +dataprovider_copy_1.py +model.list +*.pyc +.DS_Store diff --git a/doc/fluid/beginners_guide/basics/understand_sentiment/image b/doc/fluid/beginners_guide/basics/understand_sentiment/image new file mode 120000 index 
0000000000000000000000000000000000000000..13bacf9fb90da1516d1f8163e3705458966c284a --- /dev/null +++ b/doc/fluid/beginners_guide/basics/understand_sentiment/image @@ -0,0 +1 @@ +../../../../../external/book/06.understand_sentiment/image \ No newline at end of file diff --git a/doc/fluid/beginners_guide/basics/understand_sentiment/index.md b/doc/fluid/beginners_guide/basics/understand_sentiment/index.md new file mode 120000 index 0000000000000000000000000000000000000000..db728d7ba2f547d759dd9854546cb818974920d5 --- /dev/null +++ b/doc/fluid/beginners_guide/basics/understand_sentiment/index.md @@ -0,0 +1 @@ +../../../../../external/book/06.understand_sentiment/README.cn.md \ No newline at end of file diff --git a/doc/fluid/beginners_guide/basics/word2vec/.gitignore b/doc/fluid/beginners_guide/basics/word2vec/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..a620e0279c310d213d4e6d8e99e666962c11e352 --- /dev/null +++ b/doc/fluid/beginners_guide/basics/word2vec/.gitignore @@ -0,0 +1,3 @@ +data/train.list +data/test.list +data/simple-examples* diff --git a/doc/fluid/beginners_guide/basics/word2vec/image b/doc/fluid/beginners_guide/basics/word2vec/image new file mode 120000 index 0000000000000000000000000000000000000000..fe0098012579714af6fa6fdf27afd370021cd29d --- /dev/null +++ b/doc/fluid/beginners_guide/basics/word2vec/image @@ -0,0 +1 @@ +../../../../../external/book/04.word2vec/image \ No newline at end of file diff --git a/doc/fluid/beginners_guide/basics/word2vec/index.md b/doc/fluid/beginners_guide/basics/word2vec/index.md new file mode 120000 index 0000000000000000000000000000000000000000..19186f4fee4a763bc1e4efcfa812694ca3975372 --- /dev/null +++ b/doc/fluid/beginners_guide/basics/word2vec/index.md @@ -0,0 +1 @@ +../../../../../external/book/04.word2vec/README.cn.md \ No newline at end of file diff --git a/doc/fluid/beginners_guide/image/tensor.jpg b/doc/fluid/beginners_guide/image/tensor.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e45c28bd4b250c54e1ea380b55ee6507d4859a7f Binary files /dev/null and b/doc/fluid/beginners_guide/image/tensor.jpg differ diff --git a/doc/fluid/beginners_guide/index.rst b/doc/fluid/beginners_guide/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..daa189b432e92b9ece62e239bb591c2008ec27f3 --- /dev/null +++ b/doc/fluid/beginners_guide/index.rst @@ -0,0 +1,37 @@ +######## +新手入门 +######## + +========= + 概览 +========= + +请您首先阅读以下文档,了解安装方法: + + - `安装说明 <../beginners_guide/install/Start.html>`_:我们支持在Ubunt/CentOS/Windows/MacOS环境上的安装 + +如果您初次接触深度学习,在学习PaddlePaddle之前建议您先阅读以下资料: + + - `学习资料 <../beginners_guide/basics/learning_materials.html>`_:推荐机器学习、深度学习和编程语言三个方面的书籍与视频公开课 + +如果您已经具备一定的深度学习基础,第一次使用 Fluid 时,可以跟随下列简单的模型案例供您快速上手: + + - `Fluid编程指南 <../beginners_guide/programming_guide/programming_guide.html>`_:介绍 Fluid 的基本概念和使用方法 + + - `快速入门 <../beginners_guide/quick_start/index.html>`_:提供线性回归和识别数字两个入门级模型,帮助您快速上手训练网络 + + - `深度学习基础知识 <../beginners_guide/basics/index.html>`_:覆盖图像分类、个性化推荐、机器翻译等多个深度领域的基础知识,提供 Fluid 实现案例 + + +========= + 目录 +========= + +.. 
toctree:: + :maxdepth: 2 + + install/Start.rst + quick_start/index.rst + basics/index.rst + basics/learning_materials.md + programming_guide/programming_guide.md diff --git a/doc/fluid/beginners_guide/install/FAQ.md b/doc/fluid/beginners_guide/install/FAQ.md new file mode 100644 index 0000000000000000000000000000000000000000..80a5dd92fbdf4b2f6188c705741ddd0a10b10d9c --- /dev/null +++ b/doc/fluid/beginners_guide/install/FAQ.md @@ -0,0 +1,132 @@ +*** + +# **FAQ** +- CentOS6下如何编译python2.7为共享库? + + > 使用以下指令: + + ./configure --prefix=/usr/local/python2.7 --enable-shared + make && make install + + + +- Ubuntu18.04下libidn11找不到? + + > 使用以下指令: + + apt install libidn11 + +- Ubuntu编译时出现大量的代码段不能识别? + + > 这可能是由于cmake版本不匹配造成的,请在gcc的安装目录下使用以下指令: + + apt install gcc-4.8 g++-4.8 + cp gcc gcc.bak + cp g++ g++.bak + rm gcc + rm g++ + ln -s gcc-4.8 gcc + ln -s g++-4.8 g++ + + + + +- 遇到paddlepaddle*.whl is not a supported wheel on this platform? + + > 出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。 请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准, 需要使用最新的pip (>9.0.0) 才可以安装。您可以执行以下指令更新您的pip: + + pip install --upgrade pip + 或者 + + python -c "import pip; print(pip.pep425tags.get_supported())" + + > 如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包 (本地)是 linux_x86_64, 可以重命名这个whl包为 manylinux1_x86_64 再安装。 + +- 使用Docker编译出现问题? + + > 请参照GitHub上[Issue12079](https://github.com/PaddlePaddle/Paddle/issues/12079) + +- 什么是 Docker? + + > 如果您没有听说 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。 + +- Docker 还是虚拟机? + + > 有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 + +- 为什么用 Docker? + + > 把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。 + 另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。 + +- 可以选择不用Docker吗? + + > 当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。 + +- 学习 Docker 有多难? + + > 理解 Docker 并不难,大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。 + 这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 + +- 可以用 IDE 吗? + + > 当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。 + 很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行 + `global-set-key "\C-cc" 'compile` + `setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev"` + 就可以按 `Ctrl-C` 和 `c` 键来启动编译了。 + +- 可以并行编译吗? + + > 是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/paddle/scripts/paddle_build.sh)。这个脚本调用`make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 + +- Docker 需要 sudo? + + > 如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术,比如 rkt。 + +- 在 Windows/MacOS 上编译很慢? + + > Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考[issue627](https://github.com/PaddlePaddle/Paddle/issues/627)。 + +- 磁盘不够? + + > 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 [这篇文章](https://zaiste.net/posts/removing_docker_containers) 来清理这些内容。 + +- 在DockerToolbox下使用book时`http://localhost:8888/`无法打开? 
+ + > 需要将localhost替换成虚拟机ip,一般需要在浏览器中输入:`http://192.168.99.100:8888/` + +- pip install gpu版本的PaddlePaddle后运行出现SegmentFault如下: + + @ 0x7f6c8d214436 paddle::platform::EnforceNotMet::EnforceNotMet() + + @ 0x7f6c8dfed666 paddle::platform::GetCUDADeviceCount() + + @ 0x7f6c8d2b93b6 paddle::framework::InitDevices() + + + > 出现这个问题原因主要是由于您的显卡驱动低于对应CUDA版本的要求,请保证您的显卡驱动支持所使用的CUDA版本 + + + + +- MacOS下安装PaddlePaddle后import paddle.fluid出现`Fatal Python error: PyThreadState_Get: no current thread running`错误 + + - For Python2.7.x (install by brew): 请使用`export LD_LIBRARY_PATH=/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7 && export DYLD_LIBRARY_PATH=/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7` + - For Python2.7.x (install by Python.org): 请使用`export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 && export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7` + - For Python3.5.x (install by Python.org): 请使用`export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/ && export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/` + + + +- MACOS下使用自定义的openblas 详见issue: + + > [ISSUE 13217](https://github.com/PaddlePaddle/Paddle/issues/13721) + +- 已经安装swig但是仍旧出现swig找不到的问题 详见issue: + + > [ISSUE 13759](https://github.com/PaddlePaddle/Paddle/issues/13759) + +- 出现 “target pattern contain no '%'.”的问题 详见issue: + + > [ISSUE 13806](https://github.com/PaddlePaddle/Paddle/issues/13806) + diff --git a/doc/fluid/beginners_guide/install/Start.rst b/doc/fluid/beginners_guide/install/Start.rst new file mode 100644 index 0000000000000000000000000000000000000000..65c7762a3c21a90ca3bdc1ab9e0dd18b7575b91e --- /dev/null +++ b/doc/fluid/beginners_guide/install/Start.rst @@ -0,0 +1,53 @@ +========== + 安装说明 +========== +本说明将指导您在64位台式机或笔记本电脑上, 使用Python2.7或者Python3.5编译和安装PaddlePaddle,目前PaddlePaddle支持以下环境: + +* *Ubuntu 14.04 /16.04 /18.04* +* *CentOS 7 / 6* +* *MacOS 10.11 / 10.12 / 10.13 / 10.14* +* *Windows7 / 8/ 10(专业版/企业版)* + + +请确保您的环境满足以上条件,我们默认提供的安装同时需要您的计算机拥有64位操作系统,处理器支持AVX2指令集,否则请选择 `多版本whl包安装列表 `_ 中 :code:`no_avx` 的版本 + +- 如果您希望使用 `pip `_ 进行安装PaddlePaddle可以直接使用以下命令: + +:code:`pip install paddlepaddle` (CPU版本最新) + +:code:`pip install paddlepaddle-gpu` (GPU版本最新) + +:code:`pip install paddlepaddle==[pip版本号]` + + 其中[pip版本号]请查阅 `PyPi.org `_ + +- 如果您希望使用 `docker `_ 安装PaddlePaddle可以直接使用以下命令: +:code:`docker run --name [Name of container] -it -v $PWD:/paddle hub.baidubce.com/paddlepaddle/paddle:[docker版本号] /bin/bash` + + 其中[docker版本号]请查阅 `DockerHub `_ + + +如果对上面的指令有疑问或者不能正常使用,请参见以下内容 + +安装PaddlePaddle +----------------------- + +.. toctree:: + :maxdepth:1 + + install_Ubuntu.md + install_CentOS.md + install_MacOS.md + install_Windows.md + compile/fromsource.rst + +参考信息 +----------------------- + +如在安装或编译过程中遇到问题请参见 + +.. toctree:: + :maxdepth:1 + + FAQ.md + Tables.md diff --git a/doc/fluid/beginners_guide/install/Tables.md b/doc/fluid/beginners_guide/install/Tables.md new file mode 100644 index 0000000000000000000000000000000000000000..ef7f9d0e5e423c44d2d66b9d199ccc858296bdbc --- /dev/null +++ b/doc/fluid/beginners_guide/install/Tables.md @@ -0,0 +1,461 @@ +*** + +# 附录 + +## **编译依赖表** + +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| 依赖包名称 | 版本 | 说明 | 安装命令 |
| --- | --- | --- | --- |
| CMake | 3.4 | | |
| GCC | 4.8 / 5.4 | 推荐使用CentOS的devtools2 | |
| Python | 2.7.x | 依赖libpython2.7.so | `apt install python-dev` 或 `yum install python-devel` |
| SWIG | 最低 2.0 | | `apt install swig` 或 `yum install swig` |
| wget | any | | `apt install wget` 或 `yum install wget` |
| openblas | any | | |
| pip | 最低9.0.1 | | `apt install python-pip` 或 `yum install python-pip` |
| numpy | >=1.12.0 | | `pip install numpy==1.14.0` |
| protobuf | 3.1.0 | | `pip install protobuf==3.1.0` |
| wheel | any | | `pip install wheel` |
| patchELF | any | | `apt install patchelf`,或参见github patchELF官方文档 |
| go | >=1.8 | 可选 | |
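可以用下面的小脚本粗略检查上表中与 Python 相关的依赖是否已安装及其版本(仅为示意脚本,非官方工具,包名为对应的 PyPI 包名):

```python
# 示意:检查编译依赖表中 Python 相关依赖的安装情况
import pkg_resources

for pkg in ["pip", "numpy", "protobuf", "wheel"]:
    try:
        print("%s %s" % (pkg, pkg_resources.get_distribution(pkg).version))
    except pkg_resources.DistributionNotFound:
        print("%s 未安装" % pkg)
```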

+ + +*** + +

+## **编译选项表** + +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| 选项 | 说明 | 默认值 |
| --- | --- | --- |
| WITH_GPU | 是否支持GPU | ON |
| WITH_C_API | 是否仅编译CAPI | OFF |
| WITH_DOUBLE | 是否使用双精度浮点数 | OFF |
| WITH_DSO | 是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库 | ON |
| WITH_AVX | 是否编译含有AVX指令集的PaddlePaddle二进制文件 | ON |
| WITH_PYTHON | 是否内嵌PYTHON解释器 | ON |
| WITH_STYLE_CHECK | 是否编译时进行代码风格检查 | ON |
| WITH_TESTING | 是否开启单元测试 | OFF |
| WITH_DOC | 是否编译中英文文档 | OFF |
| WITH_SWIG_PY | 是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练 | Auto |
| WITH_GOLANG | 是否编译go语言的可容错parameter server | OFF |
| WITH_MKL | 是否使用MKL数学库,如果关闭则使用OpenBLAS | ON |

+**BLAS** + +PaddlePaddle支持 [MKL](https://software.intel.com/en-us/mkl) 和 [OpenBLAS](http://www.openblas.net) 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集,还会下载MKL-DNN数学库,详细参考[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake)。 + +如果关闭MKL,则会使用OpenBLAS作为BLAS库。 + +**CUDA/cuDNN** + +PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。 使用参数 `-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。 + +PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。 我们推荐使用最新版本的cuDNN。 + +**编译选项的设置** + +PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( `/usr/lib` 和 `/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用`-D`命令可以设置,例如: + +> `cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5` + +**注意**:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录( rm -rf )后,再指定。 + + +*** +
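上文提到只有在机器含有AVX2指令集时才会下载MKL-DNN数学库,下面的小脚本与文中 `cat /proc/cpuinfo | grep avx2` 的检查方式等价(仅为示意,假设在Linux环境下运行):

```python
# 示意:检查本机 CPU 是否支持 avx2 指令集(仅适用于 Linux)
with open("/proc/cpuinfo") as f:
    flags = f.read()
print("支持 avx2" if "avx2" in flags else "不支持 avx2")
```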

+## **安装包列表** + +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| 版本号 | 版本说明 |
| --- | --- |
| paddlepaddle==[版本号] 如 paddlepaddle==1.1.0(下载1.1.0版本只支持CPU的PaddlePaddle) | 只支持CPU对应版本的PaddlePaddle,具体版本请参见Pypi |
| paddlepaddle-gpu==1.1.0 | 使用CUDA 9.0和cuDNN 7编译的1.1.0版本 |
| paddlepaddle-gpu==1.1.0.post87 | 使用CUDA 8.0和cuDNN 7编译的1.1.0版本 |
| paddlepaddle-gpu==1.1.0.post85 | 使用CUDA 8.0和cuDNN 5编译的1.1.0版本 |
| paddlepaddle-gpu==1.0.0 | 使用CUDA 9.0和cuDNN 7编译的1.0.0版本 |
| paddlepaddle-gpu==1.0.0.post87 | 使用CUDA 8.0和cuDNN 7编译的1.0.0版本 |
| paddlepaddle-gpu==1.0.0.post85 | 使用CUDA 8.0和cuDNN 5编译的1.0.0版本 |
| paddlepaddle-gpu==0.15.0 | 使用CUDA 9.0和cuDNN 7编译的0.15.0版本 |
| paddlepaddle-gpu==0.15.0.post87 | 使用CUDA 8.0和cuDNN 7编译的0.15.0版本 |
| paddlepaddle-gpu==0.15.0.post85 | 使用CUDA 8.0和cuDNN 5编译的0.15.0版本 |
| paddlepaddle-gpu==0.14.0 | 使用CUDA 9.0和cuDNN 7编译的0.14.0版本 |
| paddlepaddle-gpu==0.14.0.post87 | 使用CUDA 8.0和cuDNN 7编译的0.14.0版本 |
| paddlepaddle-gpu==0.14.0.post85 | 使用CUDA 8.0和cuDNN 5编译的0.14.0版本 |
| paddlepaddle-gpu==0.13.0 | 使用CUDA 9.0和cuDNN 7编译的0.13.0版本 |
| paddlepaddle-gpu==0.12.0 | 使用CUDA 8.0和cuDNN 5编译的0.12.0版本 |
| paddlepaddle-gpu==0.11.0.post87 | 使用CUDA 8.0和cuDNN 7编译的0.11.0版本 |
| paddlepaddle-gpu==0.11.0.post85 | 使用CUDA 8.0和cuDNN 5编译的0.11.0版本 |
| paddlepaddle-gpu==0.11.0 | 使用CUDA 7.5和cuDNN 5编译的0.11.0版本 |

+ + +您可以在 [Release History](https://pypi.org/project/paddlepaddle-gpu/#history) 中找到PaddlePaddle-gpu的各个发行版本。 + +*** + +
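安装指定版本后,可以用下面的小脚本确认当前环境实际安装的版本号(示意脚本;这里假设所装版本提供 `paddle.__version__` 属性):

```python
# 示意:查询当前环境中实际安装的 PaddlePaddle 版本号
import paddle
print(paddle.__version__)
```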

+## **安装镜像表及简介** +

+ + + + + + + + + + + + + + + + + + + + + + + + + +
| 版本号 | 版本说明 |
| --- | --- |
| hub.baidubce.com/paddlepaddle/paddle:latest | 最新的预先安装好PaddlePaddle CPU版本的镜像 |
| hub.baidubce.com/paddlepaddle/paddle:latest-dev | 最新的PaddlePaddle的开发环境 |
| hub.baidubce.com/paddlepaddle/paddle:[Version] | 将version换成具体的版本,历史版本的预安装好PaddlePaddle的镜像 |
| hub.baidubce.com/paddlepaddle/paddle:latest-gpu | 最新的预先安装好PaddlePaddle GPU版本的镜像 |

+ + +您可以在 [DockerHub](https://hub.docker.com/r/paddlepaddle/paddle/tags/) 中找到PaddlePaddle的各个发行的版本的docker镜像。 + + + +*** + +

+## **多版本whl包列表** +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| 版本说明 | cp27-cp27mu | cp27-cp27m |
| --- | --- | --- |
| cpu_avx_mkl | paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl | paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl |
| cpu_avx_openblas | paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl | paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl |
| cpu_noavx_openblas | paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl | paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl |
| cuda8.0_cudnn5_avx_mkl | paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl | paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl |
| cuda8.0_cudnn7_avx_mkl | paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl | paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl |
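选择 cp27-cp27mu 还是 cp27-cp27m 取决于本机 Python 的编译方式,可以沿用 FAQ 中提到的方法查看本机 pip 支持的 whl 标签(示意脚本,假设 pip 版本低于 10,`pep425tags` 接口仍然可用):

```python
# 示意:打印本机 pip 支持的 whl 标签,以决定应下载 cp27mu 还是 cp27m 的包
import pip
print(pip.pep425tags.get_supported())
```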

+ + + + + + + + + + + +

+ +## 在Docker中执行PaddlePaddle训练程序 + +*** + +假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序: `train.py` (可以参考 +[PaddlePaddleBook](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html) +编写,本节末尾也附有一个最小的 train.py 示意),就可以使用下面的命令开始执行训练: + + cd /home/work + docker run -it -v $PWD:/work hub.baidubce.com/paddlepaddle/paddle /work/train.py + +上述命令中,`-it` 参数说明容器以交互式方式运行;`-v $PWD:/work` +指定将当前路径(Linux中PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 `/work` +目录;`hub.baidubce.com/paddlepaddle/paddle` 指定需要使用的镜像;最后的 `/work/train.py` 为容器内执行的命令,即运行训练程序。 + +当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码: + + docker run -it -v $PWD:/work hub.baidubce.com/paddlepaddle/paddle /bin/bash + cd /work + python train.py + +**注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** `apt-get install -y vim` **安装后,在容器中编辑代码。** +
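下面给出一个可以保存为 `train.py` 的最小训练脚本示意(仅为示例:写法参考本次文档中 Fluid 线性回归的用法,使用人造数据,并非官方教程的完整代码):

```python
# train.py —— 最小线性回归训练脚本示意(示例代码,使用人造数据)
import numpy
import paddle.fluid as fluid

# 定义网络:输入 x 经过一个单输出的全连接层得到预测值
x = fluid.layers.data(name='x', shape=[1], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)

# 损失函数与优化器
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_cost)

# 初始化参数并迭代训练
exe = fluid.Executor(fluid.core.CPUPlace())
exe.run(fluid.default_startup_program())

train_x = numpy.array([[1.0], [2.0], [3.0], [4.0]]).astype('float32')
train_y = numpy.array([[2.0], [4.0], [6.0], [8.0]]).astype('float32')
for i in range(100):
    loss, = exe.run(feed={'x': train_x, 'y': train_y}, fetch_list=[avg_cost.name])
print("final loss: %f" % loss[0])
```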

+ +## 使用Docker启动PaddlePaddle Book教程 + +*** + +使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 +PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 +如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。 +大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。 + +我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行: + +`docker run -p 8888:8888 hub.baidubce.com/paddlepaddle/book` + +以上命令使用的 hub.baidubce.com 即为国内镜像源,国内用户可借此获得更快的访问速度。 + +然后在浏览器中输入以下网址: + +`http://localhost:8888/` + +就这么简单,享受您的旅程!如有其他问题请参见[FAQ](#FAQ) +

+## 使用Docker执行GPU训练 + +*** + +为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用 +[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。 +请不要忘记提前在物理机上安装GPU最新驱动。 + +`nvidia-docker run -it -v $PWD:/work hub.baidubce.com/paddlepaddle/paddle:latest-gpu /bin/bash` + +**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:** + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') \ + $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} \ + ${DEVICES} -it hub.baidubce.com/paddlepaddle/paddle:latest-gpu + + +**关于AVX:** + +AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认 +是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独[编译](/build_from_source_cn.html) PaddlePaddle为no-avx版本。 + +以下指令能检查Linux电脑是否支持AVX: + +`if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi` + +如果输出是No,就需要选择使用no-AVX的镜像 diff --git a/doc/fluid/beginners_guide/install/compile/compile_CentOS.md b/doc/fluid/beginners_guide/install/compile/compile_CentOS.md new file mode 100644 index 0000000000000000000000000000000000000000..914b0649212c11c59bad7e3309a7409d49e015aa --- /dev/null +++ b/doc/fluid/beginners_guide/install/compile/compile_CentOS.md @@ -0,0 +1,226 @@ +*** +# **CentOS下从源码编译** + +本说明将介绍如何在*64位台式机或笔记本电脑*以及CentOS系统下编译PaddlePaddle,我们支持的CentOS系统需满足以下要求: + +* CentOS 7 / 6(这涉及到相关工具是否能被正常安装) + +## 确定要编译的版本 +* **仅支持CPU的PaddlePaddle**。 + + + +## 选择如何编译 +我们在CentOS的系统下提供2种编译方式: + +* Docker源码编译(不支持CentOS 6 / 7的GPU版本) +* 直接本机源码编译(不支持CentOS 6的全部版本以及CentOS 7的GPU版本) + +我们更加推荐**使用Docker进行编译**,因为我们在把工具和配置都安装在一个 Docker image 里。这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 + + + +同样对于那些出于各种原因不能够安装Docker的用户我们也提供了可以从**本机直接源码编译**的方法,但是由于在本机上的情况更加复杂,因此我们只支持特定的系统。 + + + +

+### ***使用Docker编译*** + +为了更好的使用Docker并避免发生问题,我们推荐使用**最高版本的Docker**,关于**安装和使用Docker**的细节请参阅Docker[官方文档](https://docs.docker.com/install/)。 + + + + +当您已经**正确安装Docker**后你就可以开始**使用Docker编译PaddlePaddle**啦: + +1. 请首先选择您希望储存PaddlePaddle的路径,然后在该路径下使用以下命令将PaddlePaddle的源码从github克隆到本地当前目录下名为Paddle的文件夹中: + + `git clone https://github.com/PaddlePaddle/Paddle.git` + +2. 进入Paddle目录下: `cd Paddle` + +3. 利用我们提供的镜像(使用该命令您可以不必提前下载镜像): + + `docker run --name paddle-test -v $PWD:/paddle --network=host -it hub.baidubce.com/paddlepaddle/paddle:latest-dev /bin/bash` + + > --name paddle-test为您创建的Docker容器命名为paddle-test,-v $PWD:/paddle 将当前目录挂载到Docker容器中的/paddle目录下(Linux中PWD变量会展开为当前路径的[绝对路径](https://baike.baidu.com/item/绝对路径/481185)),-it 与宿主机保持交互状态,`hub.baidubce.com/paddlepaddle/paddle` 使用名为`hub.baidubce.com/paddlepaddle/paddle:latest-dev`的镜像创建Docker容器,/bin/bash 进入容器后启动/bin/bash命令。 + +4. 进入Docker后进入paddle目录下:`cd paddle` + +5. 切换到较稳定版本下进行编译: + + `git checkout v1.1.0` + +6. 创建并进入/paddle/build路径下: + + `mkdir -p /paddle/build && cd /paddle/build` + +7. 使用以下命令安装相关依赖: + + For Python2: pip install protobuf==3.1.0 + For Python3: pip install protobuf==3.1.0 + + + > 安装protobuf 3.1.0。 + + `apt install patchelf` + + > 安装patchelf,PatchELF 是一个小而实用的程序,用于修改ELF可执行文件的动态链接器和RPATH。 + +8. 执行cmake: + + >具体编译选项含义请参见[编译选项表](../Tables.html/#Compile) + + * 对于需要编译**CPU版本PaddlePaddle**的用户: + + `cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF` + + >> 我们目前不支持CentOS下GPU版本PaddlePaddle的编译 + +9. 执行编译: + + `make -j$(nproc)` + + > 使用多核编译 + +10. 编译成功后进入`/paddle/build/python/dist`目录下找到生成的`.whl`包: `cd /paddle/build/python/dist` + +11. 在当前机器或目标机器安装编译好的`.whl`包: + + For Python2: pip install (whl包的名字) + For Python3: pip3 install (whl包的名字) + + +至此您已经成功使用Docker安装PaddlePaddle,您只需要进入Docker容器后运行PaddlePaddle即可,更多Docker使用请参见[Docker官方文档](https://docs.docker.com)。 + +> 注:PaddlePaddle Docker镜像为了减小体积,默认没有安装`vim`,您可以在容器中执行 `apt-get install -y vim` 安装后,在容器中编辑代码。 + +恭喜您,现在您已经完成使用Docker编译PaddlePaddle的过程。 + + + + + + + + +

+### ***本机编译*** + +**请严格按照以下指令顺序执行** + + + +1. 检查您的计算机和操作系统是否符合我们支持的编译标准: `uname -m && cat /etc/*release` + +2. 更新`yum`的源: `yum update`, 并添加必要的yum源:`yum install -y epel-release` + +3. 安装必要的工具`bzip2`以及`make`: `yum install -y bzip2` , `yum install -y make` + +4. 我们支持使用virtualenv进行编译安装,首先请使用以下命令创建一个名为`paddle-venv`的虚环境: + + * a. 安装Python-dev: + + For Python2: yum install python-devel + For Python3: (这里由于python3.5的编译安装过程较为复杂,请参照Python官方流程安装) + + * b. 安装pip: + + For Python2: yum install python-pip (请保证拥有9.0.1及以上的pip版本) + For Python3: (这里由于pip3的编译安装过程较为复杂,请参照Python官方流程安装)(请保证拥有9.0.1及以上的pip3版本) + + + * c.(Only For Python3)设置Python3相关的环境变量: + + 1. 首先使用``` find `dirname $(dirname + $(which python3))` -name "libpython3.so"```找到Pythonlib的路径,然后(下面[python-lib-path]替换为找到文件路径) + + 2. 设置PYTHON_LIBRARIES:`export PYTHON_LIBRARY=[python-lib-path]` + + 3. 其次使用```find `dirname $(dirname + $(which python3))`/include -name "python3.5m"```找到PythonInclude的路径,然后(下面[python-include-path]替换为找到文件路径) + 4. 设置PYTHON_INCLUDE_DIR: `export PYTHON_INCLUDE_DIRS=[python-include-path]` + + 5. 设置系统环境变量路径:`export PATH=[python-lib-path]:$PATH` (这里将[python-lib-path]的最后两级目录替换为/bin/) + + + + * d. 安装虚环境`virtualenv`以及`virtualenvwrapper`并创建名为`paddle-venv`的虚环境: + + 1. `pip install virtualenv` 或 `pip3 install virtualenv` + 2. `pip install virtualenvwrapper` 或 `pip3 install virtualenvwrapper` + 3. 找到`virtualenvwrapper.sh`: `find / -name virtualenvwrapper.sh`(请找到对应Python版本的`virtualenvwrapper.sh` + 4. 查看`virtualenvwrapper.sh`中的安装方法: `cat vitualenvwrapper.sh` + 5. 安装`virtualwrapper` + 6. 创建名为`paddle-venv`的虚环境: `mkvirtualenv paddle-venv` + + +5. 进入虚环境:`workon paddle-venv` + + +6. **执行编译前**请您确认在虚环境中安装有[编译依赖表](../Tables.html/#third_party)中提到的相关依赖: + + * 这里特别提供`patchELF`的安装方法,其他的依赖可以使用`yum install`或者`pip install`/`pip3 install` 后跟依赖名称和版本安装: + + `yum install patchelf` + + > 不能使用apt安装的用户请参见patchElF github[官方文档](https://gist.github.com/ruario/80fefd174b3395d34c14) + +7. 将PaddlePaddle的源码clone在当下目录下的Paddle的文件夹中,并进入Padde目录下: + + - `git clone https://github.com/PaddlePaddle/Paddle.git` + + - `cd Paddle` + +8. 切换到较稳定release分支下进行编译: + + `git checkout release/1.0.0` + +9. 并且请创建并进入一个叫build的目录下: + + `mkdir build && cd build` + +10. 执行cmake: + + >具体编译选项含义请参见[编译选项表](../Tables.html/#Compile) + + + * 对于需要编译**CPU版本PaddlePaddle**的用户: + + For Python2: cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF + For Python3: cmake .. -DPY_VERSION=3.5 -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIRS} \ + -DPYTHON_LIBRARY=${PYTHON_LIBRARY} -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF + + + > 如果遇到`Could NOT find PROTOBUF (missing: PROTOBUF_LIBRARY PROTOBUF_INCLUDE_DIR)`可以重新执行一次cmake指令 + + + +11. 使用以下命令来编译: + + `make -j$(nproc)` + +12. 编译成功后进入`/paddle/build/python/dist`目录下找到生成的`.whl`包: `cd /paddle/build/python/dist` + +13. 在当前机器或目标机器安装编译好的`.whl`包: + + `pip install (whl包的名字)`或`pip3 install (whl包的名字)` + +恭喜您,现在您已经完成使本机编译PaddlePaddle的过程了。 + + + +
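对于上面第 4 步中需要手动设置 `PYTHON_LIBRARY`、`PYTHON_INCLUDE_DIRS` 等环境变量的情况,也可以用下面的小脚本直接打印当前 Python 对应的库目录与头文件目录作为参考(仅为辅助示意,非官方脚本,变量名沿用上文):

```python
# 示意:打印当前 Python 的 libpython 所在目录与头文件目录,
# 便于填写上文的 PYTHON_LIBRARY / PYTHON_INCLUDE_DIRS
import sysconfig
print("LIBDIR:  %s" % sysconfig.get_config_var("LIBDIR"))
print("INCLUDE: %s" % sysconfig.get_paths()["include"])
```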

+## ***验证安装*** +安装完成后您可以使用:`python` 进入Python解释器,然后使用`import paddle.fluid` 验证是否安装成功。 + +

+## ***如何卸载*** +请使用以下命令卸载PaddlePaddle: + +* ***CPU版本的PaddlePaddle***: `pip uninstall paddlepaddle` 或 `pip3 uninstall paddlepaddle` diff --git a/doc/fluid/beginners_guide/install/compile/compile_MacOS.md b/doc/fluid/beginners_guide/install/compile/compile_MacOS.md new file mode 100644 index 0000000000000000000000000000000000000000..f9a1110c110d1c12c609549e2ec42621e85f5212 --- /dev/null +++ b/doc/fluid/beginners_guide/install/compile/compile_MacOS.md @@ -0,0 +1,228 @@ +*** +# **MacOS下从源码编译** + +本说明将介绍如何在*64位台式机或笔记本电脑*以及MacOS系统下编译PaddlePaddle,我们支持的MacOS系统需满足#以下要求: + +* MacOS 10.12/10.13/10.14(这涉及到相关工具是否能被正常安装) + +## 确定要编译的版本 +* **仅支持CPU的PaddlePaddle**。 + + + +## 选择如何编译 +在MacOS 10.12/10.13/10.14 的系统下我们提供2种编译方式: + + +* Docker源码编译 +* 直接本机源码编译 + + + + + +我们更加推荐**使用Docker进行编译**,因为我们在把工具和配置都安装在一个 Docker image 里。这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 + +同样对于那些出于各种原因不能够安装Docker的用户我们也提供了可以从**本机直接源码编译**的方法,但是由于在本机上的情况更加复杂,因此我们只支持特定的系统。 + + + + + + + + + + +

+### ***使用Docker编译*** + +为了更好的使用Docker并避免发生问题,我们推荐使用**最高版本的Docker**,关于**安装和使用Docker**的细节请参阅Docker[官方文档](https://docs.docker.com/install/)。 + +> 请注意,在MacOS系统下登陆docker需要使用您的dockerID进行登录,否则将出现`Authenticate Failed`错误。 + + +当您已经**正确安装Docker**后你就可以开始**使用Docker编译PaddlePaddle**啦: + +1. 进入Mac的终端 + +2. 请选择您希望储存PaddlePaddle的路径,然后在该路径下使用以下命令将PaddlePaddle的源码从github克隆到本地当前目录下名为Paddle的文件夹中: + + `git clone https://github.com/PaddlePaddle/Paddle.git` + +3. 进入Paddle目录下: `cd Paddle` + +4. 利用我们提供的镜像(使用该命令您可以不必提前下载镜像): + + `docker run --name paddle-test -v $PWD:/paddle --network=host -it hub.baidubce.com/paddlepaddle/paddle:latest-dev /bin/bash` + + > --name paddle-test为您创建的Docker容器命名为paddle-test,-v $PWD:/paddle 将当前目录挂载到Docker容器中的/paddle目录下(Linux中PWD变量会展开为当前路径的[绝对路径](https://baike.baidu.com/item/绝对路径/481185)),-it 与宿主机保持交互状态,`hub.baidubce.com/paddlepaddle/paddle:latest-dev` 使用名为`hub.baidubce.com/paddlepaddle/paddle:latest-dev`的镜像创建Docker容器,/bin/bash 进入容器后启动/bin/bash命令。 + +5. 进入Docker后进入paddle目录下:`cd paddle` + +6. 切换到较稳定版本下进行编译: + + `git checkout v1.1.0` + +7. 创建并进入/paddle/build路径下: + + `mkdir -p /paddle/build && cd /paddle/build` + +8. 使用以下命令安装相关依赖: + + For Python2: pip install protobuf==3.1.0 + For Python3: pip install protobuf==3.1.0 + + + > 安装protobuf 3.1.0。 + + `apt install patchelf` + + > 安装patchelf,PatchELF 是一个小而实用的程序,用于修改ELF可执行文件的动态链接器和RPATH。 + +9. 执行cmake: + + >具体编译选项含义请参见[编译选项表](../Tables.html/#Compile) + + + * 对于需要编译**CPU版本PaddlePaddle**的用户: + + `cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF` + + > 我们目前不支持CentOS下GPU版本PaddlePaddle的编译 + + + + +10. 执行编译: + + `make -j$(nproc)` + + > 使用多核编译 + +11. 编译成功后进入`/paddle/build/python/dist`目录下找到生成的`.whl`包: `cd /paddle/build/python/dist` + +12. 在当前机器或目标机器安装编译好的`.whl`包: + + For Python2: pip install (whl包的名字) + For Python3: pip3 install (whl包的名字) + + +至此您已经成功使用Docker安装PaddlePaddle,您只需要进入Docker容器后运行PaddlePaddle即可,更多Docker使用请参见[Docker官方文档](https://docs.docker.com)。 + +> 注:PaddlePaddle Docker镜像为了减小体积,默认没有安装`vim`,您可以在容器中执行 `apt-get install -y vim` 安装后,在容器中编辑代码。 + +恭喜您,现在您已经完成使用Docker编译PaddlePaddle的过程。 + + +

+### ***本机编译*** + +**请严格按照以下指令顺序执行** + + +1. 检查您的计算机和操作系统是否符合我们支持的编译标准: `uname -m` 并且在`关于本机`中查看系统版本。 + +2. 安装python以及pip: + + > **请不要使用MacOS中自带python**,我们强烈建议您使用[Homebrew](https://brew.sh)安装python(对于**Python3**请使用python[官方下载](https://www.python.org/downloads/mac-osx/)python3.5.x), pip以及其他的依赖,这会大大降低您安装编译的难度。 + + For python2: brew install python@2 + For python3: 使用Python官网安装 + + > 请注意,当您的mac上安装有多个python时请保证您正在使用的python是您希望使用的python。 + +3. (Only For Python2)设置Python相关的环境变量: + + - 请使用`find / -name libpython2.7.dylib`找到您当前使用python的`libpython2.7.dylib`路径,并使用`export LD_LIBRARY_PATH=[libpython2.7.dylib的路径] && export DYLD_LIBRARY_PATH=[libpython2.7.dylib所在的目录的上两级目录]` + +4. (Only For Python3)设置Python相关的环境变量: + + - a. 首先使用 + ```find `dirname $(dirname + $(which python3))` -name "libpython3.*.dylib"``` + 找到Pythonlib的路径(弹出的第一个对应您需要使用的python的dylib路径),然后(下面[python-lib-path]替换为找到文件路径) + + - b. 设置PYTHON_LIBRARIES:`export PYTHON_LIBRARY=[python-lib-path]` + + - c. 其次使用找到PythonInclude的路径(通常是找到[python-lib-path]的上一级目录为同级目录的include,然后找到该目录下python3.x或者python2.x的路径),然后(下面[python-include-path]替换为找到路径) + - d. 设置PYTHON_INCLUDE_DIR: `export PYTHON_INCLUDE_DIRS=[python-include-path]` + + - e. 设置系统环境变量路径:`export PATH=[python-bin-path]:$PATH` (这里[python-bin-path]为将[python-lib-path]的最后两级目录替换为/bin/后的目录) + + - f. 设置动态库链接: `export LD_LIBRARY_PATH=[python-ld-path]` 以及 `export DYLD_LIBRARY_PATH=[python-ld-path]` (这里[python-ld-path]为[python-bin-path]的上一级目录) + + - g. (可选)如果您是在MacOS 10.14上编译PaddlePaddle,请保证您已经安装了[对应版本](http://developer.apple.com/download)的Xcode。 + + + + +5. **执行编译前**请您确认您的环境中安装有[编译依赖表](../Tables.html/#third_party)中提到的相关依赖,否则我们强烈推荐使用`Homebrew`安装相关依赖。 + + > MacOS下如果您未自行修改或安装过“编译依赖表”中提到的依赖,则仅需要使用`pip`安装`numpy,protobuf,wheel`,使用`homebrew`安装`wget,swig`,另外安装`cmake`即可 + + - a. 这里特别说明一下**CMake**的安装: + + 由于我们使用的是CMake3.4请根据以下步骤: + + 1. 从CMake[官方网站](https://cmake.org/files/v3.4/cmake-3.4.3-Darwin-x86_64.dmg)下载CMake镜像并安装 + 2. 在控制台输入`sudo "/Applications/CMake.app/Contents/bin/cmake-gui" –install` + + - b. 如果您不想使用系统默认的blas而希望使用自己安装的OPENBLAS请参见[FAQ](../FAQ.html/#OPENBLAS) + + +6. 将PaddlePaddle的源码clone在当下目录下的Paddle的文件夹中,并进入Padde目录下: + + - `git clone https://github.com/PaddlePaddle/Paddle.git` + + - `cd Paddle` + +7. 切换到较稳定release分支下进行编译: + + `git checkout release/1.0.0` + +8. 并且请创建并进入一个叫build的目录下: + + `mkdir build && cd build` + +9. 执行cmake: + + >具体编译选项含义请参见[编译选项表](../Tables.html/#Compile) + + + * 对于需要编译**CPU版本PaddlePaddle**的用户: + + For Python2: cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF + For Python3: cmake .. -DPY_VERSION=3.5 -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIRS} \ + -DPYTHON_LIBRARY=${PYTHON_LIBRARY} -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF + + +10. 使用以下命令来编译: + + `make -j4` + +11. 编译成功后进入`/paddle/build/python/dist`目录下找到生成的`.whl`包: `cd /paddle/build/python/dist` + +12. 在当前机器或目标机器安装编译好的`.whl`包: + + `pip install (whl包的名字)`或`pip3 install (whl包的名字)` + + > 如果您的电脑上安装有多个python环境以及pip请参见[FAQ](../Tables.html/#MACPRO) + +恭喜您,现在您已经完成使用本机编译PaddlePaddle的过程了。 + + + + +

+## ***验证安装*** +安装完成后您可以使用:`python` 进入Python解释器,然后使用`import paddle.fluid` 验证是否安装成功。 + +

+## ***如何卸载*** +请使用以下命令卸载PaddlePaddle: + +* ***CPU版本的PaddlePaddle***: `pip uninstall paddlepaddle` 或 `pip3 uninstall paddlepaddle` diff --git a/doc/fluid/beginners_guide/install/compile/compile_Ubuntu.md b/doc/fluid/beginners_guide/install/compile/compile_Ubuntu.md new file mode 100644 index 0000000000000000000000000000000000000000..5aafbef07c83ad4488228ff5d7aa605b4c0daecd --- /dev/null +++ b/doc/fluid/beginners_guide/install/compile/compile_Ubuntu.md @@ -0,0 +1,224 @@ +*** + +# **Ubuntu下从源码编译** + +本说明将介绍如何在*64位台式机或笔记本电脑*以及Ubuntu系统下编译PaddlePaddle,我们支持的Ubuntu系统需满足以下要求: + +* Ubuntu 14.04/16.04/18.04(这涉及到相关工具是否能被正常安装) + +## 确定要编译的版本 +* **仅支持CPU的PaddlePaddle**,如果您的系统没有 NVIDIA® GPU,则必须安装此版本。而此版本较GPU版本更加容易安 +因此即使您的计算机上拥有GPU我们也推荐您先安装CPU版本的PaddlePaddle来检测您本地的环境是否适合。 + +* **支持GPU的PaddlePaddle**,为了使得PaddlePaddle程序运行的更加迅速,我们通常使用GPU对PaddlePaddle程序进行加速,但安装GPU版本的PaddlePaddle需要先拥有满足以下条件的NVIDIA® GPU(具体安装流程和配置请务必参见NVIDIA官方文档:[For CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/),[For cuDNN](https://docs.nvidia.com/deeplearning/sdk/cudnn-install/)) + * *CUDA 工具包9.0配合cuDNN v7* + * *CUDA 工具包8.0配合cuDNN v7* + * *GPU运算能力超过1.0的硬件设备* + +## 选择如何编译 +在Ubuntu的系统下我们提供2种编译方式: + +* Docker源码编译 +* 直接本机源码编译 + +我们更加推荐**使用Docker进行编译**,因为我们在把工具和配置都安装在一个 Docker image 里。这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 + + + +我们也提供了可以从**本机直接源码编译**的方法,但是由于在本机上的情况更加复杂,我们只对特定系统提供了支持。 + + + +

+### ***使用Docker编译*** +为了更好的使用Docker并避免发生问题,我们推荐使用**最高版本的Docker**,关于**安装和使用Docker**的细节请参阅Docker[官方文档](https://docs.docker.com/install/) + + +> 请注意,要安装和使用支持 GPU 的PaddlePaddle版本,您必须先安装[nvidia-docker](https://github.com/NVIDIA/nvidia-docker) + + + +当您已经**正确安装Docker**后你就可以开始**使用Docker编译PaddlePaddle**: + +1. 请首先选择您希望储存PaddlePaddle的路径,然后在该路径下使用以下命令将PaddlePaddle的源码从github克隆到本地当前目录下名为Paddle的文件夹中: + + `git clone https://github.com/PaddlePaddle/Paddle.git` + +2. 进入Paddle目录下: `cd Paddle` + +3. 利用我们提供的镜像(使用该命令您可以不必提前下载镜像): + + `docker run --name paddle-test -v $PWD:/paddle --network=host -it hub.baidubce.com/paddlepaddle/paddle:latest-dev /bin/bash` + + > --name paddle-test为您创建的Docker容器命名为paddle-test,-v $PWD:/paddle 将当前目录挂载到Docker容器中的/paddle目录下(Linux中PWD变量会展开为当前路径的[绝对路径](https://baike.baidu.com/item/绝对路径/481185)),-it 与宿主机保持交互状态,`hub.baidubce.com/paddlepaddle/paddle:latest-dev` 使用名为`hub.baidubce.com/paddlepaddle/paddle:latest-dev`的镜像创建Docker容器,/bin/bash 进入容器后启动/bin/bash命令。 + +4. 进入Docker后进入paddle目录下:`cd paddle` + +5. 切换到较稳定版本下进行编译: + + `git checkout v1.1.0` + +6. 创建并进入/paddle/build路径下: + + `mkdir -p /paddle/build && cd /paddle/build` + +7. 使用以下命令安装相关依赖: + + For Python2: pip install protobuf==3.1.0 + For Python3: pip install protobuf==3.1.0 + + + > 安装protobuf 3.1.0。 + + `apt install patchelf` + + > 安装patchelf,PatchELF 是一个小而实用的程序,用于修改ELF可执行文件的动态链接器和RPATH。 + +8. 执行cmake: + + >具体编译选项含义请参见[编译选项表](../Tables.html/#Compile) + + + * 对于需要编译**CPU版本PaddlePaddle**的用户: + + `cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF` + + + * 对于需要编译**GPU版本PaddlePaddle**的用户: + + `cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=ON -DWITH_TESTING=OFF` + + +9. 执行编译: + + `make -j$(nproc)` + + > 使用多核编译 + +10. 编译成功后进入`/paddle/build/python/dist`目录下找到生成的`.whl`包: `cd /paddle/build/python/dist` + +11. 在当前机器或目标机器安装编译好的`.whl`包: + + For Python2: pip install (whl包的名字) + For Python3: pip3 install (whl包的名字) + + +至此您已经成功使用Docker安装PaddlePaddle,您只需要进入Docker容器后运行PaddlePaddle即可,更多Docker使用请参见[Docker官方文档](https://docs.docker.com)。 + +> 注:PaddlePaddle Docker镜像为了减小体积,默认没有安装`vim`,您可以在容器中执行 `apt-get install -y vim` 安装后,在容器中编辑代码。 + +恭喜您,现在您已经完成使用Docker编译PaddlePaddle的过程。 + + + +

+### ***本机编译*** + + +**请严格按照以下指令顺序执行** + + +1. 检查您的计算机和操作系统是否符合我们支持的编译标准: `uname -m && cat /etc/*release` + +2. 更新`apt`的源: `apt update` + +2. 我们支持使用virtualenv进行编译安装,首先请使用以下命令创建一个名为`paddle-venv`的虚环境: + + + * a. 安装Python-dev: + + + For Python2: apt install python-dev + For Python3: apt install python3.5-dev + + + * b. 安装pip: (请保证拥有9.0.1及以上版本的pip): + + + For Python2: apt install python-pip + For Python3: apt install curl && curl https://bootstrap.pypa.io/get-pip.py -o - | python3.5 && easy_install pip + + + * c. 安装虚环境`virtualenv`以及`virtualenvwrapper`并创建名为`paddle-venv`的虚环境: + + 1. `apt install virtualenv` 或 `pip install virtualenv` 或 `pip3 install virtualenv` + 2. `apt install virtualenvwrapper` 或 `pip install virtualenvwrapper` 或 `pip3 install virtualenvwrapper` + 3. 找到`virtualenvwrapper.sh`: `find / -name virtualenvwrapper.sh` + 4. (Only for Python3) 设置虚环境的解释器路径:`export VIRTUALENVWRAPPER_PYTHON=/usr/bin/python3.5` + 5. 查看`virtualenvwrapper.sh`中的安装方法: `cat virtualenvwrapper.sh` + 6. 按照`virtualenvwrapper.sh`中的安装方法安装`virtualwrapper` + 7. 创建名为`paddle-venv`的虚环境: `mkvirtualenv paddle-venv` + + +3. 进入虚环境:`workon paddle-venv` + + +4. **执行编译前**请您确认在虚环境中安装有[编译依赖表](../Tables.html/#third_party)中提到的相关依赖: + + * 这里特别提供`patchELF`的安装方法,其他的依赖可以使用`apt install`或者`pip install` 后跟依赖名称和版本安装: + + `apt install patchelf` + + > 不能使用apt安装的用户请参见patchElF github[官方文档](https://gist.github.com/ruario/80fefd174b3395d34c14) + +5. 将PaddlePaddle的源码clone在当下目录下的Paddle的文件夹中,并进入Padde目录下: + + - `git clone https://github.com/PaddlePaddle/Paddle.git` + + - `cd Paddle` + +6. 切换到较稳定release分支下进行编译,将中括号以及其中的内容替换为**目标分支名**: + + `git checkout [name of target branch]` + +7. 并且请创建并进入一个叫build的目录下: + + `mkdir build && cd build` + +8. 执行cmake: + + >具体编译选项含义请参见[编译选项表](../Tables.html/#Compile) + + + * 对于需要编译**CPU版本PaddlePaddle**的用户: + + For Python2: cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF + For Python3: cmake .. -DPY_VERSION=3.5 -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF + + + * 对于需要编译**GPU版本PaddlePaddle**的用户:(*仅支持ubuntu16.04/14.04*) + + 1. 请确保您已经正确安装nccl2,或者按照以下指令安装nccl2(这里提供的是ubuntu 16.04,CUDA9,cuDNN7下nccl2的安装指令),更多版本的安装信息请参考NVIDIA[官方网站](https://developer.nvidia.com/nccl/nccl-download): + i. `wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb` + ii. `dpkg -i nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb` + iii. `sudo apt-get install -y libnccl2=2.2.13-1+cuda9.0 libnccl-dev=2.2.13-1+cuda9.0` + + 2. 如果您已经正确安装了`nccl2`,就可以开始cmake了: + + For Python2: cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=ON -DWITH_TESTING=OFF + For Python3: cmake .. -DPY_VERSION=3.5 -DWITH_FLUID_ONLY=ON -DWITH_GPU=ON -DWITH_TESTING=OFF + +9. 使用以下命令来编译: + + `make -j$(nproc)` + +10. 编译成功后进入`/paddle/build/python/dist`目录下找到生成的`.whl`包: `cd /paddle/build/python/dist` + +11. 在当前机器或目标机器安装编译好的`.whl`包: + + `pip install (whl包的名字)`或`pip3 install (whl包的名字)` + +恭喜您,现在您已经完成使本机编译PaddlePaddle的过程了。 + +

+## ***验证安装*** +安装完成后您可以使用:`python` 或 `python3` 进入Python解释器,然后使用`import paddle.fluid` 验证是否安装成功。 + +

+## ***如何卸载*** +请使用以下命令卸载PaddlePaddle: + +* ***CPU版本的PaddlePaddle***: `pip uninstall paddlepaddle` 或 `pip3 uninstall paddlepaddle` + +* ***GPU版本的PaddlePaddle***: `pip uninstall paddlepaddle-gpu` 或 `pip3 uninstall paddlepaddle-gpu` + diff --git a/doc/fluid/beginners_guide/install/compile/fromsource.rst b/doc/fluid/beginners_guide/install/compile/fromsource.rst new file mode 100644 index 0000000000000000000000000000000000000000..20914c4fea3584b7002a3cae0081f1059f7ef543 --- /dev/null +++ b/doc/fluid/beginners_guide/install/compile/fromsource.rst @@ -0,0 +1,13 @@ +=========================== +**从源码编译PaddlePaddle** +=========================== + +您也可以选择源码编译的方式编译安装PaddlePaddle,但由于本机环境的多样性,在编译源码时易出现复杂问题,可能会造成您安装失败。为保证您顺利安装,推荐您优先选择普通安装方式。 + +**编译PaddlePaddle** +--------------------- + +.. toctree:: + compile_Ubuntu.md + compile_CentOS.md + compile_MacOS.md diff --git a/doc/fluid/beginners_guide/install/install_CentOS.md b/doc/fluid/beginners_guide/install/install_CentOS.md new file mode 100644 index 0000000000000000000000000000000000000000..359ef9a88eae776489dc5804d88a6025599ae405 --- /dev/null +++ b/doc/fluid/beginners_guide/install/install_CentOS.md @@ -0,0 +1,186 @@ +*** + +# **CentOS下安装** + +本说明将介绍如何在*64位台式机或笔记本电脑*以及CentOS系统下安装PaddlePaddle,我们支持的CentOS系统需满足以下要求: + + + +请注意:在其他系统上的尝试可能会导致安装失败。请确保您的环境满足以上条件,我们默认提供的安装同时需要您的计算机处理器支持AVX2指令集,否则请选择[多版本whl包安装列表](Tables.html/#ciwhls)中`no_avx`的版本。 + +CentOS系统下您可以使用`cat /proc/cpuinfo | grep avx2`来检测您的处理器是否支持avx2指令集 + +* *CentOS 6 / 7* + +## 确定要安装的版本 +* 仅支持CPU的PaddlePaddle。如果您的计算机没有 NVIDIA® GPU,则只能安装此版本。如果您的计算机有GPU, +推荐您先安装CPU版本的PaddlePaddle,来检测您本地的环境是否适合。 + +* 支持GPU的PaddlePaddle,为了使PaddlePaddle程序运行的更加迅速,我们通过GPU对PaddlePaddle程序进行加速,但安装GPU版本的PaddlePaddle需要先拥有满足以下条件的NVIDIA® GPU(具体安装流程和配置请务必参见NVIDIA官方文档:[For CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/),[For cuDNN](https://docs.nvidia.com/deeplearning/sdk/cudnn-install/)) + * *CUDA 工具包9.0配合cuDNN v7* + * *CUDA 工具包8.0配合cuDNN v7* + * *GPU运算能力超过1.0的硬件设备* + + + +## 选择如何安装 +在CentOS的系统下我们提供4种安装方式: + +* pip安装 +* Docker安装(不支持GPU版本) +* 源码编译安装(不支持CentOS 6的所有版本以及CentOS 7的GPU版本) +* Docker源码编译安装(不支持GPU版本) + + + + +**使用pip安装**(最便捷的安装方式),我们为您提供pip安装方法,但它更依赖您的本机环境,可能会出现和您本机环境相关的一些问题。 + + +**使用Docker进行安装**(最保险的安装方式),因为我们在把工具和配置都安装在一个 Docker image 里,这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 + + +从[**源码编译安装**](#ct_source)以及[**使用Docker进行源码编译安装**](#ct_docker),这是一种通过将PaddlePaddle源代码编译成为二进制文件,然后在安装这个二进制文件的过程,相比使用我们为您编译过的已经通过测试的二进制文件形式的PaddlePaddle,手动编译更为复杂,我们将在说明的最后详细为您解答。 + + + + +

+### ***使用pip安装PaddlePaddle*** + +您可以直接粘贴以下命令到命令行来安装PaddlePaddle(适用于CentOS7安装CPU-ONLY的版本),如果出现问题,您可以参照后面的解释对命令作出适应您系统的更改: + +Python2.7: + + yum update && yum install -y epel-release && yum install -y python-devel python-pip && pip install paddlepaddle + + +Python3.5: (由于在CentOS下安装Python3的方法较为复杂,我们提供默认您已经正确安装python3.5已经pip3之后的安装命令) + + yum update && yum install -y epel-release && pip3 install paddlepaddle + + +首先,我们使用以下指令来**检测本机的环境**是否适合安装PaddlePaddle: + +`uname -m && cat /etc/*release` + +> 上面的命令将会显示本机的操作系统和位数信息,请确保您的计算机和本教程的要求一致。 + + +其次,您的计算机需要满足以下要求: + +* Python2.7.x (devel),Pip >= 9.0.1 + + > CentOS6需要编译Python2.7成[共享库](./FAQ.html/#FAQ)。 + +* Python3.5.x (devel),Pip3 >= 9.0.1 + + + > 您的CentOS上可能已经安装pip请使用pip -V来确认我们建议使用pip 9.0.1或更高版本来安装。 + + 更新yum的源: `yum update` 并安装拓展源以安装pip: `yum install -y epel-release` + + 使用以下命令安装或升级Python和pip到需要的版本: + + + - For Python2: `sudo yum install python-devel python-pip` + - For Python3: (这里由于python3.5的编译安装过程较为复杂,请参照Python官方流程安装) + + > 即使您的环境中已经有`Python`也需要安装`python develop`套装。 + +下面将说明如何安装PaddlePaddle: + +1. 使用pip install来安装PaddlePaddle: + + * 对于需要**CPU版本PaddlePaddle**的用户:`pip install paddlepaddle` 或 `pip3 install paddlepaddle` + + + * 对于需要**GPU版本PaddlePaddle**的用户: `pip install paddlepaddle-gpu` 或 `pip3 install paddlepaddle-gpu` + + > 1. 为防止出现nccl.h找不到的问题请首先按照NVIDIA[官方网站](https://developer.nvidia.com/nccl/nccl-download)的指示正确安装nccl2 + > 2. 如果您不规定pypi包版本号,我们默认为您提供支持Cuda 9/cuDNN v7的PaddlePaddle版本。 + + 对于出现`Cannot uninstall 'six'.`问题的用户,可是由于您的系统中已有的Python安装问题造 成的,请使用`pip install paddlepaddle --ignore-installed six`(CPU)或`pip install paddlepaddle-gpu --ignore-installed six`(GPU)解决。 + + * 对于有**其他要求**的用户:`pip install paddlepaddle==[版本号]` 或 `pip3 install paddlepaddle==[版本号]` + + > `版本号`参见[安装包列表](./Tables.html/#whls)或者您如果需要获取并安装**最新的PaddlePaddle开发分支**,可以从[多版本whl包列表](./Tables.html/#ciwhls)或者我们的[CI系统](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) 中下载最新的whl安装包和c-api开发包并安装。如需登录,请点击“Log in as guest”。 + + + + + + +现在您已经完成通过`pip install` 来安装的PaddlePaddle的过程。 + + + +

+### ***使用Docker进行安装*** + + + +为了更好的使用Docker并避免发生问题,我们推荐使用**最高版本的Docker**,关于**安装和使用Docker**的细节请参阅Docker[官方文档](https://docs.docker.com/install/) + + +> 请注意,要安装和使用支持 GPU 的PaddlePaddle版本,您必须先安装[nvidia-docker](https://github.com/NVIDIA/nvidia-docker) + + + + + +当您已经**正确安装Docker**后你就可以开始**使用Docker安装PaddlePaddle** + +1. 使用以下指令拉取我们为您预安装好PaddlePaddle的镜像: + + + * 对于需要**CPU版本的PaddlePaddle**的用户请使用以下指令拉取我们为您预安装好*PaddlePaddle For CPU*的镜像: + + `docker pull hub.baidubce.com/paddlepaddle/paddle:1.1.0` + + + + + * 您也可以通过以下指令拉取任意的我们提供的Docker镜像: + + `docker pull hub.baidubce.com/paddlepaddle/paddle:[tag]` + + > (请把[tag]替换为[镜像表](./Tables.html/#dockers)中的内容) + + + +2. 使用以下指令用已经拉取的镜像构建并进入Docker容器: + + `docker run --name [Name of container] -it -v $PWD:/paddle /bin/bash` + + > 上述命令中,--name [Name of container] 设定Docker的名称;-it 参数说明容器已和本机交互式运行; -v $PWD:/paddle 指定将当前路径(Linux中PWD变量会展开为当前路径的[绝对路径](https://baike.baidu.com/item/绝对路径/481185))挂载到容器内部的 /paddle 目录; `` 指定需要使用的image名称,如果您需要使用我们的镜像请使用`hub.baidubce.com/paddlepaddle/paddle:[tag]` 注:tag的意义同第二步,/bin/bash是在Docker中要执行的命令。 + +3. (可选:当您需要第二次进入Docker容器中)使用如下命令使用PaddlePaddle: + + `docker start [Name of container]` + + > 启动之前创建的容器。 + + `docker attach [Name of container]` + + > 进入启动的容器。 + +至此您已经成功使用Docker安装PaddlePaddle,您只需要进入Docker容器后运行PaddlePaddle即可,更多Docker使用请参见[Docker官方文档](https://docs.docker.com)。 + +> 注:PaddlePaddle Docker镜像为了减小体积,默认没有安装`vim`,您可以在容器中执行 `apt-get install -y vim` 安装后,在容器中编辑代码。 + + + +

+## ***验证安装*** +安装完成后您可以使用:`python` 或 `python3` 进入Python解释器,然后使用`import paddle.fluid` 验证是否安装成功。 + +
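除了 `import paddle.fluid` 之外,也可以运行下面这个最小程序做一次完整的前向计算自检(示意脚本,写法沿用本次文档中 Fluid 的加法示例,并非官方必需步骤):

```python
# 示意:安装自检,跑通一次最小的 Fluid 计算
import numpy
import paddle.fluid as fluid

a = fluid.layers.data(name='a', shape=[1], dtype='float32')
b = fluid.layers.data(name='b', shape=[1], dtype='float32')
result = fluid.layers.elementwise_add(a, b)

exe = fluid.Executor(fluid.core.CPUPlace())
exe.run(fluid.default_startup_program())
out, = exe.run(feed={'a': numpy.array([[1.0]], dtype='float32'),
                     'b': numpy.array([[2.0]], dtype='float32')},
               fetch_list=[result.name])
print(out)  # 预期输出 [[3.]]
```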

+## ***如何卸载*** +请使用以下命令卸载PaddlePaddle: + +* ***CPU版本的PaddlePaddle***: `pip uninstall paddlepaddle` 或 `pip3 uninstall paddlepaddle` + +* ***GPU版本的PaddlePaddle***: `pip uninstall paddlepaddle-gpu` 或 `pip3 uninstall paddlepaddle-gpu` + + diff --git a/doc/fluid/beginners_guide/install/install_MacOS.md b/doc/fluid/beginners_guide/install/install_MacOS.md new file mode 100644 index 0000000000000000000000000000000000000000..5a9e37da83f23441811922935060874bab3ea701 --- /dev/null +++ b/doc/fluid/beginners_guide/install/install_MacOS.md @@ -0,0 +1,137 @@ +*** + +# **MacOS下安装** + +本说明将介绍如何在*64位台式机或笔记本电脑*以及MacOS系统下安装PaddlePaddle,我们支持的MacOS系统需满足以下要求。 + +请注意:在其他系统上的尝试可能会导致安装失败。 + +* MacOS 10.11/10.12/10.13/10.14 + +## 确定要安装的版本 + +* 仅支持CPU的PaddlePaddle。 + + + +## 选择如何安装 +在MacOS的系统下我们提供3种安装方式: + +* pip安装(不支持GPU版本)(python3下不支持分布式) +* Docker安装(不支持GPU版本) +* Docker源码编译安装(不支持GPU版本) + + +**使用pip安装**(最便捷的安装方式),我们为您提供pip安装方法,但它更依赖您的本机环境,可能会出现和您本机环境相关的一些问题。 + + +**使用Docker进行安装**(最保险的安装方式),因为我们在把工具和配置都安装在一个 Docker image 里,这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 + + +

+### ***使用pip安装*** + +由于在MacOS中的Python情况差别较大我们暂不提供快速安装的命令,请您按照以下步骤进行安装 + +首先,**检查您的计算机和操作系统**是否符合我们支持的编译标准: `uname -m` 并且在`关于本机`中查看系统版本。 + +其次,您的计算机需要满足以下要求: + +> **请不要使用MacOS中自带python**,对于**Python2**,建议您使用[Homebrew](https://brew.sh)或[Python.org](https://www.python.org/ftp/python/2.7.15/python-2.7.15-macosx10.9.pkg)提供的python2.7.15;对于**Python3**,请使用[Python.org](https://www.python.org/downloads/mac-osx/)提供的python3.5.x。 + + For python2: brew install python@2 或 使用Python官方下载的python2.7.15 + For python3: 使用Python官方下载的python3.5.x + +* Python2.7.x,Pip >= 9.0.1 +* Python3.5.x,Pip3 >= 9.0.1 + + > 注: 您的MacOS上可能已经安装pip请使用pip -V来确认我们建议使用pip 9.0.1或更高版本来安装。 + +下面将说明如何安装PaddlePaddle: + +1. 设置python环境变量: + + - For Python2.7.x: 请使用`find / -name libpython2.7.dylib`找到您当前使用python的`libpython2.7.dylib`路径,并使用`export LD_LIBRARY_PATH=[libpython2.7.dylib所在的目录的上两级目录] && export DYLD_LIBRARY_PATH=[libpython2.7.dylib所在的目录的上两级目录]` + + - For Python3.5.x: 请使用`find / -name libpython3.5.dylib`找到您当前使用python的`libpython3.5.dylib`路径,并使用`export LD_LIBRARY_PATH=[libpython3.5.dylib所在的目录的上两级目录] && export DYLD_LIBRARY_PATH=[libpython3.5.dylib所在的目录的上两级目录]` + +2. 使用pip install来安装PaddlePaddle: + + * 对于需要**CPU版本PaddlePaddle**的用户:`pip install paddlepaddle` 或 `pip3 install paddlepaddle` + + + + * 对于有**其他要求**的用户:`pip install paddlepaddle==[版本号]` 或 `pip3 install paddlepaddle==[版本号]` + + > `版本号`参见[安装包列表](./Tables.html/#whls)或者您如果需要获取并安装**最新的PaddlePaddle开发分支**,可以从[多版本whl包列表](./Tables.html/#ciwhls)或者我们的[CI系统](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) 中下载最新的whl安装包和c-api开发包并安装。如需登录,请点击“Log in as guest”。 + + + + + + +现在您已经完成通过`pip install` 来安装的PaddlePaddle的过程。 + + + + +

+### ***使用Docker安装*** + + + +为了更好的使用Docker并避免发生问题,我们推荐使用**最高版本的Docker**,关于**安装和使用Docker**的细节请参阅Docker[官方文档](https://docs.docker.com/install/)。 + +> 请注意,在MacOS系统下登陆docker需要使用您的dockerID进行登录,否则将出现`Authenticate Failed`错误。 + +如果已经**正确安装Docker**,即可以开始**使用Docker安装PaddlePaddle** + +1. 使用以下指令拉取我们为您预安装好PaddlePaddle的镜像: + + + * 对于需要**CPU版本的PaddlePaddle**的用户请使用以下指令拉取我们为您预安装好*PaddlePaddle For CPU*的镜像: + + `docker pull hub.baidubce.com/paddlepaddle/paddle:1.1.0` + + + * 您也可以通过以下指令拉取任意的我们提供的Docker镜像: + + `docker pull hub.baidubce.com/paddlepaddle/paddle:[tag]` + + > (请把[tag]替换为[镜像表](./Tables.html/#dockers)中的内容) + +2. 使用以下指令用已经拉取的镜像构建并进入Docker容器: + + `docker run --name [Name of container] -it -v $PWD:/paddle /bin/bash` + + > 上述命令中,--name [Name of container] 设定Docker的名称;-it 参数说明容器已和本机交互式运行; -v $PWD:/paddle 指定将当前路径(Linux中PWD变量会展开为当前路径的[绝对路径](https://baike.baidu.com/item/绝对路径/481185))挂载到容器内部的 /paddle 目录; `` 指定需要使用的image名称,如果您需要使用我们的镜像请使用`hub.baidubce.com/paddlepaddle/paddle:[tag]` 注:tag的意义同第二步;/bin/bash是在Docker中要执行的命令。 + +3. (可选:当您需要第二次进入Docker容器中)使用如下命令使用PaddlePaddle: + + `docker start [Name of container]` + + > 启动之前创建的容器。 + + `docker attach [Name of container]` + + > 进入启动的容器。 + + +至此您已经成功使用Docker安装PaddlePaddle,您只需要进入Docker容器后运行PaddlePaddle即可,更多Docker使用请参见[Docker官方文档](https://docs.docker.com)。 + +> 注:PaddlePaddle Docker镜像为了减小体积,默认没有安装`vim`,您可以在容器中执行 `apt-get install -y vim` 安装后,在容器中编辑代码。 + + + + +

+## ***验证安装*** +安装完成后您可以使用:`python` 或 `python3` 进入python解释器,然后使用`import paddle.fluid` 验证是否安装成功。 + +

+## ***如何卸载*** +请使用以下命令卸载PaddlePaddle: + +* ***CPU版本的PaddlePaddle***: `pip uninstall paddlepaddle` 或 `pip3 install paddlepaddle` + diff --git a/doc/fluid/beginners_guide/install/install_Ubuntu.md b/doc/fluid/beginners_guide/install/install_Ubuntu.md new file mode 100644 index 0000000000000000000000000000000000000000..c821db273964147876fddcbd9e8551a3b94a8e2f --- /dev/null +++ b/doc/fluid/beginners_guide/install/install_Ubuntu.md @@ -0,0 +1,181 @@ +*** + +# **Ubuntu下安装** + +本说明将介绍如何在*64位台式机或笔记本电脑*以及Ubuntu系统下安装PaddlePaddle,我们支持的Ubuntu系统需满足以下要求: + + + +请注意:在其他系统上的尝试可能会导致安装失败。请确保您的环境满足以上条件,我们默认提供的安装同时需要您的计算机处理器支持AVX2指令集,否则请选择[多版本whl包安装列表](Tables.html/#ciwhls)中`no_avx`的版本。 + +Ubuntu系统下您可以使用`cat /proc/cpuinfo | grep avx2`来检测您的处理器是否支持avx2指令集 + +* *Ubuntu 14.04 /16.04 /18.04* + +## 确定要安装的版本 + +* 仅支持CPU的PaddlePaddle。如果您的计算机没有 NVIDIA® GPU,则只能安装此版本。如果您的计算机有GPU, +也推荐您先安装CPU版本的PaddlePaddle,来检测您本地的环境是否适合。 + +* 支持GPU的PaddlePaddle。为了使PaddlePaddle程序运行更加迅速,我们通过GPU对PaddlePaddle程序进行加速,但安装GPU版本的PaddlePaddle需要先拥有满足以下条件的NVIDIA® GPU(具体安装流程和配置请务必参见NVIDIA官方文档:[For CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/),[For cuDNN](https://docs.nvidia.com/deeplearning/sdk/cudnn-install/)) + * *CUDA 工具包9.0配合cuDNN v7* + * *CUDA 工具包8.0配合cuDNN v7* + * *GPU运算能力超过1.0的硬件设备* + + + +## 选择如何安装 +在Ubuntu的系统下我们提供4种安装方式: + +* pip安装 +* Docker安装 +* 源码编译安装 +* Docker源码编译安装 + + + +**使用pip安装**(最便捷的安装方式),我们为您提供pip安装方法,但它更依赖您的本机环境,可能会出现和您本机环境相关的一些问题。 + +**使用Docker进行安装**(最保险的安装方式),因为我们在把工具和配置都安装在一个 Docker image 里,这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 + + + +从[**源码编译安装**](#ubt_source)以及[**使用Docker进行源码编译安装**](#ubt_docker),这是一种通过将PaddlePaddle源代码编译成为二进制文件,然后在安装这个二进制文件的过程,相比使用我们为您编译过的已经通过测试的二进制文件形式的PaddlePaddle,手动编译更为复杂,我们将在说明的最后详细为您解答。 + + + +

+### ***使用pip安装*** + +您可以直接粘贴以下命令到命令行来安装PaddlePaddle(适用于ubuntu16.04及以上安装CPU-ONLY的版本),如果出现问题,您可以参照后面的解释对命令作出适应您系统的更改: + +Python2.7: + + apt update && apt install -y python-dev python-pip && pip install paddlepaddle + +Python3.5(该指令适用于本机未安装python2的用户,否则,请卸载python2之后再使用本指令): + + apt-get install -y curl python3.5 python3.5-dev wget vim git && curl https://bootstrap.pypa.io/get-pip.py -o - | python3.5 && easy_install pip && pip3 install paddlepaddle + + +首先,我们使用以下指令来**检测本机的环境**是否适合安装PaddlePaddle: + +`uname -m && cat /etc/*release` + +> 上面的命令将会显示本机的操作系统和位数信息,请确保您的计算机和本教程的要求一致。 + + +其次,您的电脑需要满足以下任一要求: + +* Python2.7.x (dev),Pip >= 9.0.1 +* Python3.5.x (dev),Pip3 >= 9.0.1 + + > 您的Ubuntu上可能已经安装pip请使用pip -V或pip3 -V来确认我们建议使用pip 9.0.1或更高版本来安装 + + 更新apt的源: `apt update` + + 使用以下命令安装或升级Python和pip到需要的版本: + + - For python2: `sudo apt install python-dev python-pip` + - For python3:`sudo apt install python3.5-dev` and `curl https://bootstrap.pypa.io/get-pip.py -o - | python3.5 && easy_install pip` + + > 即使您的环境中已经有Python2或Python3也需要安装Python-dev或Python3.5-dev。 + +现在,让我们来安装PaddlePaddle: + +1. 使用pip install来安装PaddlePaddle + + * 对于需要**CPU版本PaddlePaddle**的用户:`pip install paddlepaddle` 或 `pip3 install paddlepaddle` + + + * 对于需要**GPU版本PaddlePaddle**的用户:`pip install paddlepaddle-gpu` 或 `pip3 install paddlepaddle` + + > 1. 为防止出现nccl.h找不到的问题请首先按照以下命令安装nccl2(这里提供的是ubuntu 16.04,CUDA9,cuDNN v7下nccl2的安装指令),更多版本的安装信息请参考NVIDIA[官方网站](https://developer.nvidia.com/nccl/nccl-download): + i. `wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb` + ii. `dpkg -i nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb` + iii. `sudo apt-get install -y libnccl2=2.2.13-1+cuda9.0 libnccl-dev=2.2.13-1+cuda9.0` + > 2. 如果您不规定pypi包版本号,我们默认为您提供支持Cuda 9/cuDNN v7的PaddlePaddle版本。 + + + 对于出现`Cannot uninstall 'six'.`问题的用户,可是由于您的系统中已有的Python安装问题造成的,请使用`pip install paddlepaddle --ignore-installed six`(CPU)或`pip install paddlepaddle --ignore-installed six`(GPU)解决。 + + * 对于有**其他要求**的用户:`pip install paddlepaddle==[版本号]` 或 `pip3 install paddlepaddle==[版本号]` + + > `版本号`参见[安装包列表](./Tables.html/#whls)或者您如果需要获取并安装**最新的PaddlePaddle开发分支**,可以从我们的[CI系统](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) 中下载最新的whl安装包和c-api开发包并安装。如需登录,请点击“Log in as guest”。 + + + + + +现在您已经完成使用`pip install` 来安装的PaddlePaddle的过程。 + + +

+### ***使用Docker安装*** + + + +为了更好的使用Docker并避免发生问题,我们推荐使用**最高版本的Docker**,关于**安装和使用Docker**的细节请参阅Docker[官方文档](https://docs.docker.com/install/)。 + + + +> 请注意,要安装和使用支持 GPU 的PaddlePaddle版本,您必须先安装[nvidia-docker](https://github.com/NVIDIA/nvidia-docker) + + + +如果已经**正确安装Docker**,即可以开始**使用Docker安装PaddlePaddle** + +1. 使用以下指令拉取我们为您预安装好PaddlePaddle的镜像: + + + * 对于需要**CPU版本的PaddlePaddle**的用户请使用以下指令拉取我们为您预安装好*PaddlePaddle For CPU*的镜像: + + `docker pull hub.baidubce.com/paddlepaddle/paddle:1.1.0` + + + * 对于需要**GPU版本的PaddlePaddle**的用户请使用以下指令拉取我们为您预安装好*PaddlePaddle For GPU*的镜像: + + `docker pull hub.baidubce.com/paddlepaddle/paddle:1.1.0-gpu-cuda9.0-cudnn7` + + + * 您也可以通过以下指令拉取任意的我们提供的Docker镜像: + + `docker pull hub.baidubce.com/paddlepaddle/paddle:[tag]` + + > (请把[tag]替换为[镜像表](./Tables.html/#dockers)中的内容) + +2. 使用以下指令用已经拉取的镜像构建并进入Docker容器: + + `docker run --name [Name of container] -it -v $PWD:/paddle /bin/bash` + + > 上述命令中,--name [Name of container] 设定Docker的名称;-it 参数说明容器已和本机交互式运行; -v $PWD:/paddle 指定将当前路径(Linux中PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 /paddle 目录; `` 指定需要使用的image名称,如果您需要使用我们的镜像请使用`hub.baidubce.com/paddlepaddle/paddle:[tag]` 注:tag的意义同第二步;/bin/bash是在Docker中要执行的命令。 + +3. (可选:当您需要第二次进入Docker容器中)使用如下命令使用PaddlePaddle: + + `docker start [Name of container]` + + > 启动之前创建的容器。 + + `docker attach [Name of container]` + + > 进入启动的容器。 + +至此您已经成功使用Docker安装PaddlePaddle,您只需要进入Docker容器后运行PaddlePaddle即可,更多Docker使用请参见[Docker官方文档](https://docs.docker.com)。 + +> 注:PaddlePaddle Docker镜像为了减小体积,默认没有安装`vim`,您可以在容器中执行 `apt-get install -y vim` 安装后,在容器中编辑代码。 + + + + +

+## ***验证安装*** +安装完成后您可以使用:`python` 或 `python3` 进入python解释器,然后使用`import paddle.fluid` 验证是否安装成功。 + +

+## ***如何卸载*** +请使用以下命令卸载PaddlePaddle: + +* ***CPU版本的PaddlePaddle***: `pip uninstall paddlepaddle` 或 `pip3 uninstall paddlepaddle` + +* ***GPU版本的PaddlePaddle***: `pip uninstall paddlepaddle-gpu` 或 `pip3 uninstall paddlepaddle-gpu` + diff --git a/doc/fluid/beginners_guide/install/install_Windows.md b/doc/fluid/beginners_guide/install/install_Windows.md new file mode 100644 index 0000000000000000000000000000000000000000..a869de87b755d4199facb6681e03dd9115acd6c3 --- /dev/null +++ b/doc/fluid/beginners_guide/install/install_Windows.md @@ -0,0 +1,44 @@ +*** + +# **Windows下安装** + +本说明将介绍如何在*64位台式机或笔记本电脑*以及Windows系统下安装PaddlePaddle,我们支持的Windows系统需满足以下要求。 + + + +请注意:在其他系统上的尝试可能会导致安装失败。 请确保您的环境满足以上条件,我们默认提供的安装同时需要您的计算机处理器支持AVX2指令集,否则请选择[多版本whl包安装列表](Tables.html/#ciwhls) 中`no_avx`的版本: + +Windows系统下可使用`cpu-z`这类软件来检测您的处理器是否支持AVX2指令集 + +* *Windows 7/8 and Windows 10 专业版/企业版* + +## 确定要安装的版本 + +* Windows下我们目前仅提供支持CPU的PaddlePaddle。 + + +## 选择如何安装 +在Windows系统下请使用我们为您提供的[一键安装包](http://paddle-windows.bj.bcebos.com/1.1.0/PaddlePaddle-windows-1.1.0.zip)进行安装 + +> 我们提供的一键安装包将基于Docker为您进行便捷的安装流程 + + +我们之所以使用**基于Docker的安装方式**,是因为我们在把工具和配置都安装在一个 Docker image 里,这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 + + + + + + +

+## ***验证安装*** +安装完成后您可以使用:`python` 或 `python3` 进入python解释器,然后使用`import paddle.fluid` 验证是否安装成功。 + +

+## ***如何卸载*** +请使用以下命令卸载PaddlePaddle: + +* ***CPU版本的PaddlePaddle***: `pip uninstall paddlepaddle` 或 `pip3 uninstall paddlepaddle` + + + diff --git a/doc/fluid/beginners_guide/programming_guide/programming_guide.md b/doc/fluid/beginners_guide/programming_guide/programming_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..04d542c93c8df86a7c604e1d3c0c5282b5ecb497 --- /dev/null +++ b/doc/fluid/beginners_guide/programming_guide/programming_guide.md @@ -0,0 +1,422 @@ + +# Fluid编程指南 + +本文档将指导您如何用Fluid API编程并搭建一个简单的神经网络。阅读完本文档,您将掌握: + +- Fluid有哪些核心概念 +- 如何在fluid中定义运算过程 +- 如何使用executor运行fluid操作 +- 如何从逻辑层对实际问题建模 +- 如何调用API(层,数据集,损失函数,优化方法等等) + +在进行模型搭建之前,首先需要明确几个Fluid核心使用概念: + +## 使用Tensor表示数据 + +Fluid和其他主流框架一样,使用Tensor数据结构来承载数据。 + +在神经网络中传递的数据都是Tensor,Tensor可以简单理解成一个多维数组,一般而言可以有任意多的维度。不同的Tensor可以具有自己的数据类型和形状,同一Tensor中每个元素的数据类型是一样的,Tensor的形状就是Tensor的维度。 + +下图直观地表示1~6维的Tensor: +

+ +

+ + +在 Fluid 中存在三种特殊的 Tensor: + +**1. 模型中的可学习参数** + +模型中的可学习参数(包括网络权重、偏置等)生存期和整个训练任务一样长,会接受优化算法的更新,在 Fluid 中以 Variable 的子类 Parameter 表示。 + +在Fluid中可以通过`fluid.layers.create_parameter`来创建可学习参数: + +```python +w = fluid.layers.create_parameter(name="w",shape=[1],dtype='float32') +``` + + +一般情况下,您不需要自己来创建网络中的可学习参数,Fluid 为大部分常见的神经网络基本计算模块都提供了封装。以最简单的全连接模型为例,下面的代码片段会直接为全连接层创建连接权值(W)和偏置( bias )两个可学习参数,无需显式地调用 Parameter 相关接口来创建。 + +```python +import paddle.fluid as fluid +y = fluid.layers.fc(input=x, size=128, bias_attr=True) +``` + + +**2. 输入输出Tensor** + +整个神经网络的输入数据也是一个特殊的 Tensor,在这个 Tensor 中,一些维度的大小在定义模型时无法确定(通常包括:batch size,如果 mini-batch 之间数据可变,也会包括图片的宽度和高度等),在定义模型时需要占位。 + + +Fluid 中使用 `fluid.layers.data` 来接收输入数据, `fluid.layers.data` 需要提供输入 Tensor 的形状信息,当遇到无法确定的维度时,相应维度指定为 None ,如下面的代码片段所示: + +```python +import paddle.fluid as fluid + +#定义x的维度为[3,None],其中我们只能确定x的第一的维度为3,第二个维度未知,要在程序执行过程中才能确定 +x = fluid.layers.data(name="x", shape=[3,None], dtype="int64") + +#batch size无需显示指定,框架会自动补充第0维为batch size,并在运行时填充正确数值 +a = fluid.layers.data(name="a",shape=[3,4],dtype='int64') + +#若图片的宽度和高度在运行时可变,将宽度和高度定义为None。 +#shape的三个维度含义分别是:channel、图片的宽度、图片的高度 +b = fluid.layers.data(name="image",shape=[3,None,None],dtpye="float32") +``` + +其中,dtpye=“int64”表示有符号64位整数数据类型,更多Fluid目前支持的数据类型请查看:[Fluid目前支持的数据类型](../../user_guides/howto/prepare_data/feeding_data.html#fluid)。 + +**3. 常量 Tensor** + +Fluid 通过 `fluid.layers.fill_constant` 来实现常量Tensor,用户可以指定Tensor的形状,数据类型和常量值。代码实现如下所示: + +```python +import paddle.fluid as fluid +data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64') +``` + +需要注意的是,上述定义的tensor并不具有值,它们仅表示将要执行的操作,如您直接打印data将会得到描述该data的一段信息: + +```python +print data +``` +输出结果: + +``` +name: "fill_constant_0.tmp_0" +type { + type: LOD_TENSOR + lod_tensor { + tensor { + data_type: INT64 + dims: 1 + } + } +} +persistable: false +``` + +具体输出数值将在Executor运行时得到,详细过程会在后文展开描述。 + +## 数据传入 + +Fluid有特定的数据传入方式: + +您需要使用 `fluid.layers.data` 配置数据输入层,并在 `fluid.Executor` 或 `fluid.ParallelExecutor` 中,使用 executor.run(feed=...) 传入训练数据。 + +具体的数据准备过程,请阅读[准备数据](../../user_guides/howto/prepare_data/index.html) + + +## 使用Operator表示对数据的操作 + +在Fluid中,所有对数据的操作都由Operator表示,您可以使用内置指令来描述他们的神经网络。 + +为了便于用户使用,在Python端,Fluid中的Operator被一步封装入`paddle.fluid.layers`,`paddle.fluid.nets` 等模块。 + +这是因为一些常见的对Tensor的操作可能是由更多基础操作构成,为了提高使用的便利性,框架内部对基础 Operator 进行了一些封装,包括创建 Operator 依赖可学习参数,可学习参数的初始化细节等,减少用户重复开发的成本。 + +例如用户可以利用`paddle.fluid.layers.elementwise_add()`实现两个输入Tensor的加法运算: + +```python +#定义网络 +import paddle.fluid as fluid +a = fluid.layers.data(name="a",shape=[1],dtype='float32') +b = fluid.layers.data(name="b",shape=[1],dtype='float32') + +result = fluid.layers.elementwise_add(a,b) + +#定义Exector +cpu = fluid.core.CPUPlace() #定义运算场所,这里选择在CPU下训练 +exe = fluid.Executor(cpu) #创建执行器 +exe.run(fluid.default_startup_program()) #网络参数初始化 + +#准备数据 +import numpy +data_1 = input("a=") +data_2 = input("b=") +x = numpy.array([[data_1]]) +y = numpy.array([[data_2]]) + +#执行计算 +outs = exe.run( +feed={'a':x,'b':y}, +fetch_list=[result.name]) + +#验证结果 +print "%d+%d=%d" % (data_1,data_2,outs[0][0]) +``` + +输出结果: +``` +a=7 +b=3 +7+3=10 +``` + +本次运行时,输入a=7,b=3,得到outs=10。 + +您可以复制这段代码在本地执行,根据指示输入其他数值观察计算结果。 + +如果想获取网络执行过程中的a,b的具体值,可以将希望查看的变量添加在fetch_list中。 + +```python +... 
+#执行计算 +outs = exe.run( + feed={'a':x,'b':y}, + fetch_list=[a,b,result.name] +#查看输出结果 +print outs +``` + +输出结果: +``` +[array([[7]]), array([[3]]), array([[10]])] +``` + +## 使用Program描述神经网络模型 + +Fluid不同于其他大部分深度学习框架,去掉了静态计算图的概念,代之以Program的形式动态描述计算过程。这种动态的计算描述方式,兼具网络结构修改的灵活性和模型搭建的便捷性,在保证性能的同时极大地提高了框架对模型的表达能力。 + +开发者的所有 Operator 都将写入 Program ,在Fluid内部将自动转化为一种叫作 ProgramDesc 的描述语言,Program 的定义过程就像在写一段通用程序,有开发经验的用户在使用 Fluid 时,会很自然的将自己的知识迁移过来。 + +其中,Fluid通过提供顺序、分支和循环三种执行结构的支持,让用户可以通过组合描述任意复杂的模型。 + +**顺序执行:** + +用户可以使用顺序执行的方式搭建网络: + +```python +x = fluid.layers.data(name='x',shape=[13], dtype='float32') +y_predict = fluid.layers.fc(input=x, size=1, act=None) +y = fluid.layers.data(name='y', shape=[1], dtype='float32') +cost = fluid.layers.square_error_cost(input=y_predict, label=y) +``` + +**条件分支——switch、if else:** + +Fluid 中有 switch 和 if-else 类来实现条件选择,用户可以使用这一执行结构在学习率调节器中调整学习率或其他希望的操作: + +```python +lr = fluid.layers.tensor.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate") + +one_var = fluid.layers.fill_constant( + shape=[1], dtype='float32', value=1.0) +two_var = fluid.layers.fill_constant( + shape=[1], dtype='float32', value=2.0) + +with fluid.layers.control_flow.Switch() as switch: + with switch.case(global_step == zero_var): + fluid.layers.tensor.assign(input=one_var, output=lr) + with switch.default(): + fluid.layers.tensor.assign(input=two_var, output=lr) +``` + + +关于 Fluid 中 Program 的详细设计思想,可以参考阅读[Fluid设计思想](../../user_guides/design_idea/fluid_design_idea.html) + +更多 Fluid 中的控制流,可以参考阅读[API文档](http://www.paddlepaddle.org/documentation/api/zh/1.0.0/layers.html#permalink-1-control_flow) + + +## 使用Executor执行Program + +Fluid的设计思想类似于高级编程语言C++和JAVA等。程序的执行过程被分为编译和执行两个阶段。 + +用户完成对 Program 的定义后,Executor 接受这段 Program 并转化为C++后端真正可执行的 FluidProgram,这一自动完成的过程叫做编译。 + +编译过后需要 Executor 来执行这段编译好的 FluidProgram。 + +例如上文实现的加法运算,当构建好 Program 后,需要创建 Executor,进行初始化 Program 和训练 Program: + +```python +#定义Exector +cpu = fluid.core.CPUPlace() #定义运算场所,这里选择在CPU下训练 +exe = fluid.Executor(cpu) #创建执行器 +exe.run(fluid.default_startup_program()) #初始化Program + +#训练Program,开始计算 +#feed以字典的形式定义了数据传入网络的顺序 +#fetch_list定义了网络的输出 +outs = exe.run( + feed={'a':x,'b':y}, + fetch_list=[result.name]) +``` + +## 代码实例 + +至此,您已经对Fluid核心概念有了初步认识了,不妨尝试配置一个简单的网络吧。如果感兴趣的话可以跟随本部分,完成一个非常简单的数据预测。已经掌握这部分内容的话,可以跳过本节阅读[What's next](#what_next)。 + +从逻辑层面明确了输入数据格式、模型结构、损失函数以及优化算法后,需要使用 PaddlePaddle 提供的 API 及算子来实现模型逻辑。一个典型的模型主要包含4个部分,分别是:输入数据格式定义,模型前向计算逻辑,损失函数以及优化算法。 + +1. 问题描述 + + 给定一组数据 $$,求解出函数 $f$,使得 $y=f(x)$,其中$X$,$Y$均为一维张量。最终网络可以依据输入$x$,准确预测出$y_{\_predict}$。 + +2. 定义数据 + + 假设输入数据X=[1 2 3 4],Y=[2,4,6,8],在网络中定义: + + ```python + #定义X数值 + train_data=numpy.array([[1.0],[2.0],[3.0],[4.0]]).astype('float32') + #定义期望预测的真实值y_true + y_true = numpy.array([[2.0],[4.0],[6.0],[8.0]]).astype('float32') + ``` + +3. 
搭建网络(定义前向计算逻辑) + + 接下来需要定义预测值与输入的关系,本次使用一个简单的线性回归函数进行预测: + + ```python + #定义输入数据类型 + x = fluid.layers.data(name="x",shape=[1],dtype='float32') + #搭建全连接网络 + y_predict = fluid.layers.fc(input=x,size=1,act=None) + ``` + + 这样的网络就可以进行预测了,虽然输出结果只是一组随机数,离预期结果仍相差甚远: + + ```python + #加载库 + import paddle.fluid as fluid + import numpy + #定义数据 + train_data=numpy.array([[1.0],[2.0],[3.0],[4.0]]).astype('float32') + y_true = numpy.array([[2.0],[4.0],[6.0],[8.0]]).astype('float32') + #定义预测函数 + x = fluid.layers.data(name="x",shape=[1],dtype='float32') + y_predict = fluid.layers.fc(input=x,size=1,act=None) + #参数初始化 + cpu = fluid.core.CPUPlace() + exe = fluid.Executor(cpu) + exe.run(fluid.default_startup_program()) + #开始训练 + outs = exe.run( + feed={'x':train_data}, + fetch_list=[y_predict.name]) + #观察结果 + print outs + ``` + + 输出结果: + + ``` + [array([[0.74079144], + [1.4815829 ], + [2.2223744 ], + [2.9631658 ]], dtype=float32)] + ``` + +4. 添加损失函数 + + 完成模型搭建后,如何评估预测结果的好坏呢?我们通常在设计的网络中添加损失函数,以计算真实值与预测值的差。 + + 在本例中,损失函数采用[均方差函数](https://en.wikipedia.org/wiki/Mean_squared_error): + ```python + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + ``` + 输出一轮计算后的预测值和损失函数: + + ```python + #加载库 + import paddle.fluid as fluid + import numpy + #定义数据 + train_data=numpy.array([[1.0],[2.0],[3.0],[4.0]]).astype('float32') + y_true = numpy.array([[2.0],[4.0],[6.0],[8.0]]).astype('float32') + #定义网络 + x = fluid.layers.data(name="x",shape=[1],dtype='float32') + y = fluid.layers.data(name="y",shape=[1],dtype='float32') + y_predict = fluid.layers.fc(input=x,size=1,act=None) + #定义损失函数 + cost = fluid.layers.square_error_cost(input=y_predict,label=y) + avg_cost = fluid.layers.mean(cost) + #参数初始化 + cpu = fluid.core.CPUPlace() + exe = fluid.Executor(cpu) + exe.run(fluid.default_startup_program()) + #开始训练 + outs = exe.run( + feed={'x':train_data,'y':y_true}, + fetch_list=[y_predict.name,avg_cost.name]) + #观察结果 + print outs + ``` + 输出结果: + + ``` + [array([[0.9010564], + [1.8021128], + [2.7031693], + [3.6042256]], dtype=float32), array([9.057577], dtype=float32)] + ``` + + 可以看到第一轮计算后的损失函数为9.0,仍有很大的下降空间。 + +5. 
网络优化 + + 确定损失函数后,可以通过前向计算得到损失值,然后通过链式求导法则得到参数的梯度值。 + + 获取梯度值后需要更新参数,最简单的算法是随机梯度下降法:w=w−η⋅g,由`fluid.optimizer.SGD`实现: + ```python + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) + ``` + 让我们的网络训练100次,查看结果: + + ```python + #加载库 + import paddle.fluid as fluid + import numpy + #定义数据 + train_data=numpy.array([[1.0],[2.0],[3.0],[4.0]]).astype('float32') + y_true = numpy.array([[2.0],[4.0],[6.0],[8.0]]).astype('float32') + #定义网络 + x = fluid.layers.data(name="x",shape=[1],dtype='float32') + y = fluid.layers.data(name="y",shape=[1],dtype='float32') + y_predict = fluid.layers.fc(input=x,size=1,act=None) + #定义损失函数 + cost = fluid.layers.square_error_cost(input=y_predict,label=y) + avg_cost = fluid.layers.mean(cost) + #参数初始化 + cpu = fluid.core.CPUPlace() + exe = fluid.Executor(cpu) + exe.run(fluid.default_startup_program()) + ##开始训练,迭代100次 + for i in range(100): + outs = exe.run( + feed={'x':train_data,'y':y_true}, + fetch_list=[y_predict.name,avg_cost.name]) + #观察结果 + print outs + ``` + + 输出结果: + ``` + [array([[2.2075021], + [4.1005487], + [5.9935956], + [7.8866425]], dtype=float32), array([0.01651453], dtype=float32)] + ``` + 可以看到100次迭代后,预测值已经非常接近真实值了,损失值也从初始值9.05下降到了0.01。 + + 恭喜您!已经成功完成了第一个简单网络的搭建,想尝试线性回归的进阶版——房价预测模型,请阅读:[线性回归](../../beginners_guide/quick_start/fit_a_line/README.cn.html)。更多丰富的模型实例可以在[模型库](../../user_guides/models/index.html)中找到。 + + +## What's next + +如果您已经掌握了基本操作,可以进行下一阶段的学习了: + +跟随这一教程将学习到如何对实际问题建模并使用fluid构建模型:[配置简单的网络](../../user_guides/howto/configure_simple_model/index.html)。 + +完成网络搭建后,可以开始在单机或多机上训练您的网络了,详细步骤请参考[训练神经网络](../../user_guides/howto/training/index.html)。 + +除此之外,使用文档模块根据开发者的不同背景划分了三个学习阶段:[新手入门](../../beginners_guide/index.html)、[使用指南](../../user_guides/index.html)和[进阶使用](../../advanced_usage/index.html)。 + +如果您希望阅读更多场景下的应用案例,可以跟随导航栏进入[快速入门](../../beginners_guide/quick_start/index.html)和[深度学习基础知识](../../beginners_guide/basics/index.html)。已经具备深度学习基础知识的用户,可以从[使用指南](../../user_guides/index.html)开始阅读。 diff --git a/doc/fluid/beginners_guide/quick_start/fit_a_line/README.cn.md b/doc/fluid/beginners_guide/quick_start/fit_a_line/README.cn.md new file mode 120000 index 0000000000000000000000000000000000000000..0074b2df726b61a02f9a8e98116b639ab7e562e4 --- /dev/null +++ b/doc/fluid/beginners_guide/quick_start/fit_a_line/README.cn.md @@ -0,0 +1 @@ +../../../../../external/book/01.fit_a_line/README.cn.md \ No newline at end of file diff --git a/doc/fluid/beginners_guide/quick_start/fit_a_line/image b/doc/fluid/beginners_guide/quick_start/fit_a_line/image new file mode 120000 index 0000000000000000000000000000000000000000..ae7c57fe36c2e50f67f81b6797af80df03455c12 --- /dev/null +++ b/doc/fluid/beginners_guide/quick_start/fit_a_line/image @@ -0,0 +1 @@ +../../../../../external/book/01.fit_a_line/image \ No newline at end of file diff --git a/doc/fluid/beginners_guide/quick_start/index.rst b/doc/fluid/beginners_guide/quick_start/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..2d93e45b904f532e44be9c0bac5d4e30587e4ae8 --- /dev/null +++ b/doc/fluid/beginners_guide/quick_start/index.rst @@ -0,0 +1,13 @@ +######## +快速入门 +######## + +.. todo:: + + 概述 + +.. 
toctree:: + :titlesonly: + + fit_a_line/README.cn.md + recognize_digits/README.cn.md diff --git a/doc/fluid/beginners_guide/quick_start/recognize_digits/README.cn.md b/doc/fluid/beginners_guide/quick_start/recognize_digits/README.cn.md new file mode 120000 index 0000000000000000000000000000000000000000..c8b9a16180e19dabfebdbc07f8145e7e4c873a63 --- /dev/null +++ b/doc/fluid/beginners_guide/quick_start/recognize_digits/README.cn.md @@ -0,0 +1 @@ +../../../../../external/book/02.recognize_digits/README.cn.md \ No newline at end of file diff --git a/doc/fluid/beginners_guide/quick_start/recognize_digits/image b/doc/fluid/beginners_guide/quick_start/recognize_digits/image new file mode 120000 index 0000000000000000000000000000000000000000..2343a4bf23c308fcd0fe7fad0894f8c346aef07c --- /dev/null +++ b/doc/fluid/beginners_guide/quick_start/recognize_digits/image @@ -0,0 +1 @@ +../../../../../external/book/02.recognize_digits/image \ No newline at end of file diff --git a/doc/fluid/build_and_install/build_from_source_cn.rst b/doc/fluid/build_and_install/build_from_source_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..d0dacb104f148c2aeb323365cbd6f014ae00ed5a --- /dev/null +++ b/doc/fluid/build_and_install/build_from_source_cn.rst @@ -0,0 +1,225 @@ +从源码编译 +====================== + +.. _requirements: + +需要的软硬件 +---------------- + +为了编译PaddlePaddle,我们需要 + +1. 一台电脑,可以装的是 Linux, Windows 或者 MacOS 操作系统 +2. Docker + +不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker 镜像里。 + +.. _build_step: + +编译方法 +---------------- + +PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像 +可以在 `这里 `__ 找到,您也可以 +在 `这里 `__ 找到 paddle_manylinux_devel +镜像的编译以及使用方法。或者参考下述可选步骤,从源码中构建用于编译PaddlePaddle的Docker镜像。 + +如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 :ref:`编译依赖 <_compile_deps>` 之后才能开始编译的步骤。 + +编译PaddlePaddle,需要执行: + +.. code-block:: bash + + # 1. 获取源码 + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # 2. 可选步骤:源码中构建用于编译PaddlePaddle的Docker镜像 + docker build -t paddle:dev . + # 3. 执行下面的命令编译CPU-Only的二进制 + docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build + # 4. 或者也可以使用为上述可选步骤构建的镜像(必须先执行第2步) + docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build + +注: + +- 上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。 + +- 如果您使用的是 manylinux 的镜像进行编译, 那么您需要通过环境变量 :code:`PYTHON_ABI` 来指定一个 `Python ABI `__. +PaddlePaddle目前支持的 Python ABI 有 :code:`cp27-cp27m` 和 :code:`cp27-cp27mu`. + +编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: + +.. code-block:: bash + + pip install build/python/dist/*.whl + +如果机器中已经安装过PaddlePaddle,有两种方法: + +.. code-block:: bash + + 1. 先卸载之前的版本,再重新安装 + pip uninstall paddlepaddle + pip install build/python/dist/*.whl + + 2. 直接升级到更新的版本 + pip install build/python/dist/*.whl -U + +.. _run_test: + +执行单元测试 +---------------- + +如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法: + +设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。 +开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。 + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test + +如果期望执行其中一个单元测试,(比如 :code:`test_sum_op` ): + +.. 
code-block:: bash + + docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash + ./paddle/scripts/paddle_build.sh build + cd build + ctest -R test_sum_op -V + +.. _faq_docker: + +常见问题 +---------------- + +- 什么是 Docker? + + 如果您没有听说 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。 + +- Docker 还是虚拟机? + + 有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 + +- 为什么用 Docker? + + 把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。 + + 另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。 + +- 我可以选择不用Docker吗? + + 当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。 + +- 学习 Docker 有多难? + + 理解 Docker 并不难,大概花十分钟看一下 `如何使用Docker `_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 + +- 我可以用 IDE 吗? + + 当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。 + + 很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行 + + .. code-block:: emacs + + (global-set-key "\C-cc" 'compile) + (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") + + 就可以按 `Ctrl-C` 和 `c` 键来启动编译了。 + +- 可以并行编译吗? + + 是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 `_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 + +- Docker 需要 sudo + + 如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术,比如 rkt。 + +- 在 Windows/MacOS 上编译很慢 + + Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 `_ 。 + +- 磁盘不够 + + 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `如何删除Docker Container `_ 来清理这些内容。 + + +.. _compile_deps: + +附录:编译依赖 +---------------- + +PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。 + +.. csv-table:: PaddlePaddle编译依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "CMake", ">=3.2", "" + "GCC", "4.8.2", "推荐使用CentOS的devtools2" + "Python", "2.7.x", "依赖libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" + "SWIG", ">=2.0", "" + "Go", ">=1.8", "可选" + + +.. _build_options: + +附录:编译选项 +---------------- + +PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。 +用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 +`官方文档 `_ 。 + +在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如: + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. 
csv-table:: 编译选项说明 + :header: "选项", "说明", "默认值" + :widths: 1, 7, 2 + + "WITH_GPU", "是否支持GPU", "ON" + "WITH_C_API", "是否仅编译CAPI", "OFF" + "WITH_DOUBLE", "是否使用双精度浮点数", "OFF" + "WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON" + "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" + "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" + "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" + "WITH_TESTING", "是否开启单元测试", "OFF" + "WITH_DOC", "是否编译中英文文档", "OFF" + "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" + "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF" + "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON" + +BLAS ++++++ + +PaddlePaddle支持 `MKL `_ 和 +`OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, +还会下载MKL-DNN数学库,详细参考 `mkldnn设计文档 `_ 。 + +如果关闭MKL,则会使用OpenBLAS作为BLAS库。 + +CUDA/cuDNN ++++++++++++ + +PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。 +使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。 + +PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。 +我们推荐使用最新版本的cuDNN。 + +编译选项的设置 +++++++++++++++ + +PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(** :code:`rm -rf` )**后,再指定。** diff --git a/doc/fluid/build_and_install/build_from_source_en.rst b/doc/fluid/build_and_install/build_from_source_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..664b68da8b7dd3e005ebf3ec34de77729e5ab355 --- /dev/null +++ b/doc/fluid/build_and_install/build_from_source_en.rst @@ -0,0 +1,237 @@ +Build from Sources +========================== + +.. _requirements: + +Requirements +---------------- + +To build PaddlePaddle, you need + +1. A computer -- Linux, Windows, MacOS. +2. Docker. + +Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. +We run all the tools by running this image. + +.. _build_step: + +How To Build +---------------- + +You need to use Docker to build PaddlePaddle +to avoid installing dependencies by yourself. We have several pre-built +Docker images `here `_ , +you can also find how to build and use paddle_manylinux_devel Docker image from +`here `__ +Or you can build your own image from source as the optional step below: + +If you don't wish to use docker,you need to install several compile dependencies manually as :ref:`Compile Dependencies <_compile_deps>` shows to start compilation. + +.. code-block:: bash + + # 1. clone the source code + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # 2. Optional: build development docker image from source + docker build -t paddle:dev . + # 3. Run the following command to build a CPU-Only binaries + docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build + # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2) + docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build + +NOTE: + +- The above command try to mount the current working directory (root directory of source code) +into :code:`/paddle` directory inside docker container. + +- You need to pass in the required environment variable :code:`PYTHON_ABI` to specify a `Python ABI `__. 
+Currently PaddlePaddle supported Python ABIs include :code:`cp27-cp27m` and :code:`cp27-cp27mu` . + +When the compile finishes, you can get the output whl package under +build/python/dist, then you can choose to install the whl on local +machine or copy it to the target machine. + +.. code-block:: bash + + pip install build/python/dist/*.whl + +If the machine has installed PaddlePaddle before, there are two methods: + +.. code-block:: bash + + 1. uninstall and reinstall + pip uninstall paddlepaddle + pip install build/python/dist/*.whl + + 2. upgrade directly + pip install build/python/dist/*.whl -U + +.. _run_test: + +Run Tests +---------------- + +If you wish to run the tests, you may follow the below steps: + +When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build. +Set :code:`WITH_GPU=ON` Can also run tests on GPU. + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test + +If you wish to run only one unit test, like :code:`test_sum_op`: + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash + ./paddle/scripts/paddle_build.sh build + cd build + ctest -R test_sum_op -V + +.. _faq_docker: + +Frequently Asked Questions +--------------------------- + +- What is Docker? + + If you haven't heard of it, consider it something like Python's virtualenv. + +- Docker or virtual machine? + + Some people compare Docker with VMs, but Docker doesn't virtualize any hardware nor running a guest OS, which means there is no compromise on the performance. + +- Why Docker? + + Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help. + + Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want. + +- Can I choose not to use Docker? + + Sure, you don't have to install build tools into a Docker image; instead, you can install them on your local computer. This document exists because Docker would make the development way easier. + +- How difficult is it to learn Docker? + + It takes you ten minutes to read `an introductory article `_ and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools. Not even to mention the time saved when other people trying to reproduce the issue you have. + +- Can I use my favorite IDE? + + Yes, of course. The source code resides on your local computer, and you can edit it using whatever editor you like. + + Many PaddlePaddle developers are using Emacs. They add the following few lines into their `~/.emacs` configure file: + + .. code-block:: emacs + + (global-set-key "\C-cc" 'compile) + (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") + + so they could type `Ctrl-C` and `c` to build PaddlePaddle from source. + +- Does Docker do parallel building? + + Our building Docker image runs a `Bash script `_ , which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores. 
+ +- Docker requires sudo + + An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly. If you use a shared computer for development, please ask the administrator to install and configure Docker. We will do our best to support rkt, another container technology that doesn't require sudo. + +- Docker on Windows/MacOS builds slowly + + On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so to make the building efficient. Please refer to `this issue `_ for details. + +- Not enough disk space + + Examples in this article use option `--rm` with the `docker run` command. This option ensures that stopped containers do not exist on hard disks. We can use `docker ps -a` to list all containers, including stopped. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to `this article `_ . + +.. _compile_deps: + +Appendix: Compile Dependencies +------------------------------- + +PaddlePaddle need the following dependencies when compiling, other dependencies +will be downloaded automatically. + +.. csv-table:: PaddlePaddle Compile Dependencies + :header: "Dependency", "Version", "Description" + :widths: 10, 15, 30 + + "CMake", ">=3.2", "" + "GCC", "4.8.2", "Recommend devtools2 for CentOS" + "Python", "2.7.x", "Need libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" + "SWIG", ">=2.0", "" + "Go", ">=1.8", "Optional" + + +.. _build_options: + +Appendix: Build Options +------------------------- + +Build options include whether build binaries for CPU or GPU, which BLAS +library to use etc. You may pass these settings when running cmake. +For detailed cmake tutorial please refer to `here `__ 。 + + +You can add :code:`-D` argument to pass such options, like: + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. csv-table:: Bool Type Options + :header: "Option", "Description", "Default" + :widths: 1, 7, 2 + + "WITH_GPU", "Build with GPU support", "ON" + "WITH_C_API", "Build only CAPI", "OFF" + "WITH_DOUBLE", "Build with double precision", "OFF" + "WITH_DSO", "Dynamically load CUDA libraries", "ON" + "WITH_AVX", "Build with AVX support", "ON" + "WITH_PYTHON", "Build with integrated Python interpreter", "ON" + "WITH_STYLE_CHECK", "Check code style when building", "ON" + "WITH_TESTING", "Build unit tests", "OFF" + "WITH_DOC", "Build documentations", "OFF" + "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto" + "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "OFF" + "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON" + + +BLAS ++++++ + +PaddlePaddle supports `MKL `_ and +`OpenBlAS `_ as BLAS library。By default it uses MKL. +If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded +and used, for more `details `_ . + +If you choose not to use MKL, then OpenBlAS will be used. + +CUDA/cuDNN ++++++++++++ + +PaddlePaddle will automatically find CUDA and cuDNN when compiling and running. +parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture +automatically in order to speed up the build. + +PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to +keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN +you built. + +Pass Compile Options +++++++++++++++++++++++ + +You can pass compile options to use intended BLAS/CUDA/Cudnn libraries. 
+When running cmake command, it will search system paths like +:code:`/usr/lib:/usr/local/lib` and then search paths that you +passed to cmake, i.e. + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.** diff --git a/doc/fluid/build_and_install/docker_install_cn.rst b/doc/fluid/build_and_install/docker_install_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..106c86bace075764c84bc2a7f7cb09d466fa8794 --- /dev/null +++ b/doc/fluid/build_and_install/docker_install_cn.rst @@ -0,0 +1,146 @@ +使用Docker安装运行 +================================ + +使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。 +您可以在 `Docker官网 `_ 获得基本的Docker安装和使用方法。 + +如果您在使用Windows,可以参考 +`这篇 `_ +教程,完成在Windows上安装和使用Docker。 + +在了解Docker的基本使用方法之后,即可开始下面的步骤: + +.. _docker_pull: + +获取PaddlePaddle的Docker镜像 +------------------------------ + +执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl: + + .. code-block:: bash + + docker pull paddlepaddle/paddle + +对于国内用户,我们提供了加速访问的镜像源: + + .. code-block:: bash + + docker pull docker.paddlepaddlehub.com/paddle + +下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像: + + .. code-block:: bash + + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddlehub.com/paddle:latest-gpu + +选择下载使用不同的BLAS库的Docker镜像: + + .. code-block:: bash + + # 默认是使用MKL的镜像 + docker pull paddlepaddle/paddle + # 使用OpenBLAS的镜像 + docker pull paddlepaddle/paddle:latest-openblas + +下载指定版本的Docker镜像,可以从 `DockerHub网站 `_ 获取可选的tag,并执行下面的命令: + + .. code-block:: bash + + docker pull paddlepaddle/paddle:[tag] + # 比如: + docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu + +.. _docker_run: + +在Docker中执行PaddlePaddle训练程序 +---------------------------------- + +假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考 +`PaddlePaddleBook `_ +编写),就可以使用下面的命令开始执行训练: + + .. code-block:: bash + + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py + +上述命令中, :code:`-it` 参数说明容器已交互式运行; :code:`-v $PWD:/work` +指定将当前路径(Linux中$PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 :code:`/work` +目录; :code:`paddlepaddle/paddle` 指定需要使用的容器; 最后 :code:`/work/train.py` +为容器内执行的命令,即运行训练程序。 + +当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码: + + .. code-block:: bash + + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py + +**注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** :code:`apt-get install -y vim` **安装后,在容器中编辑代码。** + +.. _docker_run_book: + +使用Docker启动PaddlePaddle Book教程 +----------------------------------- + +使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 +PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 +如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。 +大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。 + +我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行: + + .. code-block:: bash + + docker run -p 8888:8888 paddlepaddle/book + +国内用户可以使用下面的镜像源来加速访问: + + .. code-block:: bash + + docker run -p 8888:8888 docker.paddlepaddlehub.com/book + +然后在浏览器中输入以下网址: + + .. code-block:: text + + http://localhost:8888/ + +就这么简单,享受您的旅程! + +.. _docker_run_gpu: + +使用Docker执行GPU训练 +------------------------------ + +为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用 +`nvidia-docker `_ 来运行镜像。 +请不要忘记提前在物理机上安装GPU最新驱动。 + + .. 
code-block:: bash + + nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash + +**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:** + + .. code-block:: bash + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu + +**关于AVX:** + +AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认 +是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独 +`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。 + +以下指令能检查Linux电脑是否支持AVX: + + .. code-block:: bash + + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi + +如果输出是No,就需要选择使用no-AVX的镜像 diff --git a/doc/fluid/build_and_install/docker_install_en.rst b/doc/fluid/build_and_install/docker_install_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..25aecb8d0da9feb00006da6259b529b7011d91cb --- /dev/null +++ b/doc/fluid/build_and_install/docker_install_en.rst @@ -0,0 +1,153 @@ +Run in Docker Containers +================================= + +Run PaddlePaddle in Docker container so that you don't need to care about +runtime dependencies, also you can run under Windows system. You can get +tutorials at `here `_ . + +If you are using Windows, please refer to +`this `_ +tutorial to start running docker under windows. + +After you've read above tutorials you may proceed the following steps. + +.. _docker_pull: + +Pull PaddlePaddle Docker Image +------------------------------ + +Run the following command to download the latest Docker images, the version is cpu_avx_mkl: + + .. code-block:: bash + + docker pull paddlepaddle/paddle + +For users in China, we provide a faster mirror: + + .. code-block:: bash + + docker pull docker.paddlepaddlehub.com/paddle + +Download GPU version (cuda8.0_cudnn5_avx_mkl) images: + + .. code-block:: bash + + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddlehub.com/paddle:latest-gpu + +Choose between different BLAS version: + + .. code-block:: bash + + # image using MKL by default + docker pull paddlepaddle/paddle + # image using OpenBLAS + docker pull paddlepaddle/paddle:latest-openblas + + +If you want to use legacy versions, choose a tag from +`DockerHub `_ +and run: + + .. code-block:: bash + + docker pull paddlepaddle/paddle:[tag] + # i.e. + docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu + +.. _docker_run: + +Launch your training program in Docker +-------------------------------------- + +Assume that you have already written a PaddlePaddle program +named :code:`train.py` under directory :code:`/home/work` (refer to +`PaddlePaddleBook `_ +for more samples), then run the following command: + + .. code-block:: bash + + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py + +In the above command, :code:`-it` means run the container interactively; +:code:`-v $PWD:/work` means mount the current directory ($PWD will expand +to current absolute path in Linux) under :code:`/work` in the container. +:code:`paddlepaddle/paddle` to specify image to use; finnally +:code:`/work/train.py` is the command to run inside docker. + +Also, you can go into the container shell, run or debug your code +interactively: + + .. 
code-block:: bash + + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py + +**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.** + +.. _docker_run_book: + +PaddlePaddle Book +------------------ + +You can create a container serving PaddlePaddle Book using Jupyter Notebook in +one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook +for users and developers.If you want to +dig deeper into deep learning, PaddlePaddle Book definitely is your best choice. + +We provide a packaged book image, simply issue the command: + + .. code-block:: bash + + docker run -p 8888:8888 paddlepaddle/book + +For users in China, we provide a faster mirror: + + .. code-block:: bash + + docker run -p 8888:8888 docker.paddlepaddlehub.com/book + +Then, you would back and paste the address into the local browser: + + .. code-block:: text + + http://localhost:8888/ + +That's all. Enjoy your journey! + +.. _docker_run_gpu: + +Train with Docker with GPU +------------------------------ + +We recommend using +`nvidia-docker `_ +to run GPU training jobs. Please ensure you have latest +GPU driver installed before move on. + + .. code-block:: bash + + nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash + +**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.** + + .. code-block:: bash + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu + +**About AVX:** + +AVX is a kind of CPU instruction can accelerate PaddlePaddle's calculations. +The latest PaddlePaddle Docker image turns AVX on by default, so, if your +computer doesn't support AVX, you'll probably need to +`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`. + +The following command will tell you whether your computer supports AVX. + + .. code-block:: bash + + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..1a9305ac4b6578c14a962f223c647a71e3b8a72b --- /dev/null +++ b/doc/fluid/build_and_install/index_cn.rst @@ -0,0 +1,56 @@ +安装与编译 +========== + +.. _install_steps: + +PaddlePaddle针对不同的用户群体提供了多种安装方式。 + +专注深度学习模型开发 +-------------------- + +PaddlePaddle提供了多种python wheel包,可通过pip一键安装: + +.. toctree:: + :maxdepth: 1 + + pip_install_cn.rst + +这是最便捷的安装方式,请根据机器配置和系统选择对应的安装包。 + +关注底层框架 +------------- + +PaddlePaddle提供了基于Docker的安装方式,请参照以下教程: + +.. toctree:: + :maxdepth: 1 + + docker_install_cn.rst + +我们推荐在Docker中运行PaddlePaddle,该方式具有以下优势: + +- 无需单独安装第三方依赖 +- 方便分享运行时环境,易于问题的复现 + +对于有定制化二进制文件需求的用户,我们同样提供了从源码编译安装PaddlePaddle的方法: + +.. toctree:: + :maxdepth: 1 + + build_from_source_cn.rst + +.. 
warning:: + + 需要提醒的是,这种安装方式会涉及到一些第三方库的下载、编译及安装,整个安装过程耗时较长。 + + +常见问题汇总 +-------------- + +如果在安装过程中遇到了问题,请先尝试在下面的页面寻找答案: + +:ref:`常见问题解答 ` + +如果问题没有得到解决,欢迎向PaddlePaddle社区反馈问题: + +`创建issue `_ diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..7990bacbd6966e88e8763e9c5709e410f7e9fed4 --- /dev/null +++ b/doc/fluid/build_and_install/index_en.rst @@ -0,0 +1,56 @@ +install and Compile +====================== + +.. _install_steps: + +PaddlePaddle provides various methods of installation for many different users + +Focus on Deep Learning Model Development +---------------------------------------- + +PaddlePaddle provides lots of packages of python wheel , that pip can install: + +.. toctree:: + :maxdepth: 1 + + pip_install_en.rst + +This is the most convenient way of installation. Please choose the right installation package with machine configure and system. + +Follow the Bottom Frame +------------------------ + +PaddlePaddle also supports installation using Docker. Please refer to the tutorial below: + +.. toctree:: + :maxdepth: 1 + + docker_install_en.rst + +We recommend running PaddlePaddle in Docker. This method has the following advantages: + +- Does not require installation of third-party dependencies. +- Easy to share runtime environment. + +Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below: + +.. toctree:: + :maxdepth: 1 + + build_from_source_en.rst + +.. warning:: + + One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus this process of installation is more time consuming. + + +FAQ +----------- + +For any problems during installation, please refer to the page below for answers: + +:ref:`常见问题解答 ` + +If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community: + +`创建issue `_ diff --git a/doc/fluid/build_and_install/paddleci.png b/doc/fluid/build_and_install/paddleci.png new file mode 100644 index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825 Binary files /dev/null and b/doc/fluid/build_and_install/paddleci.png differ diff --git a/doc/fluid/build_and_install/pip_install_cn.rst b/doc/fluid/build_and_install/pip_install_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..095da19cd41d29bfa72ab23abd24bec45f925a86 --- /dev/null +++ b/doc/fluid/build_and_install/pip_install_cn.rst @@ -0,0 +1,105 @@ +使用pip安装 +================================ + +PaddlePaddle可以使用常用的Python包管理工具 +`pip `_ +完成安装,并可以在大多数主流的Linux操作系统以及MacOS上执行。 + +.. _pip_install: + +使用pip安装 +------------------------------ + +执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。 + + .. code-block:: bash + + pip install paddlepaddle + +当前的默认版本为0.12.0,cpu_avx_openblas,您可以通过指定版本号来安装其它版本,例如: + + .. code-block:: bash + + pip install paddlepaddle==0.11.0 + + +如果需要安装支持GPU的版本(cuda8.0_cudnn5_avx_openblas),需要执行: + + .. 
code-block:: bash + + pip install paddlepaddle-gpu + +当前的默认版本也是0.12.0,PaddlePaddle针对不同需求提供了更多版本的安装包,部分列表如下: + +================================= ======================================== +版本号 版本说明 +================================= ======================================== +paddlepaddle-gpu==0.12.0 使用CUDA 8.0和cuDNN 5编译的0.12.0版本 +paddlepaddle-gpu==0.11.0.post87 使用CUDA 8.0和cuDNN 7编译的0.11.0版本 +paddlepaddle-gpu==0.11.0.post8 使用CUDA 8.0和cuDNN 5编译的0.11.0版本 +paddlepaddle-gpu==0.11.0 使用CUDA 7.5和cuDNN 5编译的0.11.0版本 +================================= ======================================== + +您可以在 `Release History `_ 中找到paddlepaddle-gpu的各个发行版本。 + +如果需要获取并安装最新的(开发分支)PaddlePaddle,可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装, +您可以从下面的表格中找到需要的版本: + +如果在点击下面链接时出现如下登陆界面,点击“Log in as guest”即可开始下载: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. csv-table:: 各个版本最新的whl包 + :header: "版本说明", "cp27-cp27mu", "cp27-cp27m" + :widths: 1, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + +.. _pip_dependency: + +运行环境依赖 +------------------------------ + +PaddlePaddle安装包由于不仅仅包含.py程序,而且包含了C++编写的部分,所以我们确保发布的二进制包可以支持主流的Linux操作系统,比如CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上。 + +PaddlePaddle发布的安装包会尽量对齐 `manylinux1 `_ 标准,通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上,而且CentOS 5即将停止维护,所以我们默认使用CentOS 6作为标准编译环境。 + +.. csv-table:: PaddlePaddle环境依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "操作系统", "Linux, MacOS", "CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上" + "Python", "2.7.x", "暂时不支持Python3" + "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号" + "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号" + "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号" + +.. _pip_faq: + +安装常见问题和解决方法 +------------------------------ + +- paddlepaddle*.whl is not a supported wheel on this platform. + + 出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准,需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip: + + .. code-block:: bash + + pip install --upgrade pip + + 如果仍然存在问题,可以执行: + + .. 
code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + + 获取当前系统支持的安装包格式,并检查和需安装的包是否匹配。pypi安装包可以在 `这个 `_ 链接中找到。 + + 如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。 diff --git a/doc/fluid/build_and_install/pip_install_en.rst b/doc/fluid/build_and_install/pip_install_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..8406e4aa1fbb953c3b615b10d1bcb2c45974dde0 --- /dev/null +++ b/doc/fluid/build_and_install/pip_install_en.rst @@ -0,0 +1,123 @@ +Install using pip +================================ + +You can use current widely used Python package management +tool `pip `_ +to install PaddlePaddle. This method can be used in +most of current Linux systems or MacOS. + +.. _pip_install: + +Install using pip +------------------------------ + +Run the following command to install PaddlePaddle on the current +machine, it will also download requirements. + + .. code-block:: bash + + pip install paddlepaddle + +the default version is 0.12.0, cpu_avx_openblas, you can specify the versions to satisfy your demands, like: + + .. code-block:: bash + + pip install paddlepaddle==0.11.0 + +If you need to install a GPU-enabled version (cuda8.0_cudnn5_avx_openblas), you need to run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +The default version is also 0.12.0, PaddlePaddle provides several versions of packages for different needs, as shown in the table: + +================================= ======================================== +版本号 版本说明 +================================= ======================================== +paddlepaddle-gpu==0.12.0 0.12.0 built with CUDA 8.0 and cuDNN 5 +paddlepaddle-gpu==0.11.0.post87 0.11.0 built with CUDA 8.0 and cuDNN 7 +paddlepaddle-gpu==0.11.0.post8 0.11.0 built with CUDA 8.0 and cuDNN 5 +paddlepaddle-gpu==0.11.0 0.11.0 built with CUDA 7.5 and cuDNN 5 +================================= ======================================== + +You can find all versions released of paddlepaddle-gpu in `Release History `_ . + +If you wish to install the latest develop branch PaddlePaddle, +you can download the latest whl package from our CI system. Access +the below links, log in as guest, then click at the "Artifact" +tab, you'll find the download link of whl packages. + +If the links below shows up the login form, just click "Log in as guest" to start the download: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. csv-table:: whl package of each version + :header: "version", "cp27-cp27mu", "cp27-cp27m" + :widths: 1, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + +.. 
_pip_dependency: + +Runtime Dependency +------------------------------ + +PaddlePaddle installation packages (whl) does not only contain .py files, +but also binaries built from C++ code. We ensure that PaddlePaddle can +run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04 +and MacOS 10.12. + +PaddlePaddle whl packages are trying to satisfy +`manylinux1 `_ +standard, which uses CentOS 5 as default build environment. But CUDA libraries +seems only run on CentOS 6 at least, also, CentOS 5 is about to end its lifetime, +so we use CentOS 6 as default build environment. + +.. csv-table:: PaddlePaddle Runtime Deps + :header: "Dependency", "version", "description" + :widths: 10, 15, 30 + + "OS", "Linux, MacOS", "CentOS 6 or later,Ubuntu 14.04 or later,MacOS 10.12 or later" + "Python", "2.7.x", "Currently Python3 is not supported" + "libc.so", "GLIBC_2.7", "glibc at least include GLIBC_2.7 symbols" + "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols" + "libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols" + +.. _pip_faq: + +FAQ +------------------------------ + +- paddlepaddle*.whl is not a supported wheel on this platform. + + The main cause of this issue is that your current platform is + not supported. Please check that you are using Python 2.7 series. + Besides, pypi only supports manylinux1 standard, you'll need to + upgrade your pip to >9.0.0. Then run the below command: + + .. code-block:: bash + + pip install --upgrade pip + + If the problem still exists, run the following command: + + .. code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + + Then you'll get supported package suffixes, then check if it matches + the file name of the whl package. You can find default whl package at + `here `_ + + If your system supports linux_x86_64 but the whl package is manylinux1_x86_64, + you'll need to update pip to the latest version; If your system supports + manylinux1_x86_64 but the whl package is linux_x86_64 you can rename the + file to manylinux1_x86_64 suffix and then install. diff --git a/doc/fluid/design/algorithm/images/asgd.gif b/doc/fluid/design/algorithm/images/asgd.gif new file mode 100644 index 0000000000000000000000000000000000000000..4a0da7bf6df9326a2aab1638b77c5455c18b8c4e Binary files /dev/null and b/doc/fluid/design/algorithm/images/asgd.gif differ diff --git a/doc/fluid/design/algorithm/images/theta_star.gif b/doc/fluid/design/algorithm/images/theta_star.gif new file mode 100644 index 0000000000000000000000000000000000000000..dd24d33e124396be3fc410c9b12f33148f64efe2 Binary files /dev/null and b/doc/fluid/design/algorithm/images/theta_star.gif differ diff --git a/doc/fluid/design/algorithm/index_cn.rst b/doc/fluid/design/algorithm/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..0883a9dc9c457f393ac1bdc930cb47ebcb0a25d9 --- /dev/null +++ b/doc/fluid/design/algorithm/index_cn.rst @@ -0,0 +1,7 @@ +梯度更新算法 +------------ + +.. toctree:: + :maxdepth: 1 + + parameter_average.md diff --git a/doc/fluid/design/algorithm/index_en.rst b/doc/fluid/design/algorithm/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..59fe68dcf79ce2ef90b9adc829a0db45a4f0b3dc --- /dev/null +++ b/doc/fluid/design/algorithm/index_en.rst @@ -0,0 +1,7 @@ +Gradient Update Algorithm +-------------------------------------- + +.. 
toctree:: + :maxdepth: 1 + + parameter_average.md diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md new file mode 100644 index 0000000000000000000000000000000000000000..28ad6495d97515442eb8af2050158829814acd33 --- /dev/null +++ b/doc/fluid/design/algorithm/parameter_average.md @@ -0,0 +1,74 @@ +# Averaging Parameter in PaddlePaddle + +## Why Averaging +In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible. + +Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset. + +Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for
$\theta^*$. The averaging is done as follows:
+
+$$\bar{\theta}_{t} = \frac{1}{t} \sum_{k=1}^{t} \theta_{k}$$
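
To make the averaging concrete, here is a small NumPy sketch (illustrative only, not the PaddlePaddle implementation) that maintains the same average incrementally, which is how a running estimate avoids storing every parameter snapshot:

```python
import numpy as np

def update_running_average(theta_bar, theta, t):
    """Update the running average after the t-th SGD step (t starts at 1)."""
    return theta_bar + (theta - theta_bar) / t

# Toy example: pretend these are parameter values after successive SGD steps.
snapshots = [np.array([1.0, 2.0, 3.0]),
             np.array([2.0, 2.0, 2.0]),
             np.array([3.0, 2.0, 1.0])]

theta_bar = np.zeros(3)
for t, theta in enumerate(snapshots, start=1):
    theta_bar = update_running_average(theta_bar, theta, t)

print(theta_bar)  # [2. 2. 2.], the simple average of the three snapshots
```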

+ +We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above. + +### How to perform Parameter Averaging in PaddlePaddle + +Parameter Averaging in PaddlePaddle works in the following way during training : +1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer +2. The optimizer itself is responsible for updating the parameters. +3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself: + 1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches. + 2. However, saving all N instances of the parameters in memory is not feasible. + 3. Therefore, an approximation algorithm is used. + +Hence, overall we have have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved. + +During the testing/saving the model phase, we perform the following steps: +1. Perform the delayed operations. +2. Save current values of the parameters to a temporary variable. +3. Replace the values of the parameters with the averaged values. +4. Perform testing and/or save the parameters. +5. Restore the values of the parameters once done. + +### How to implement Averaging of Parameter in PaddlePaddle + +We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training. + + **Advantages**: + - Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op. + - Makes it easy for the users to customize and extend the framework. + + **Disadvantages**: + - Implementation requires re-writing the averaging methodology in Python. + +### Low-Level implementation + +In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input: +- the optimizer +- the window_size to keep the updates + +The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU. + +The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API. + +### Python API implementation for ParameterAverageOptimizer + +Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following: +- Any optimizer (RMSProp , AdaGrad etc.) +- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. 
Move the averaged value to a buffer when window is full to avoid loss of precision. + +Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions. +We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.cc) + +#### Creation of the ParameterAverageOptimizer operator +There are two ways for creating the ParameterAverageOptimizer op: +1. We create the op immediately while building the computation graph. +2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added. + +The proposal is to add the op immediately while building the computation graph. + +#### High-level API + +In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions. diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8ded0ad22f4013a521bf3bee260565dc5cf855ae --- /dev/null +++ b/doc/fluid/design/concepts/README.md @@ -0,0 +1,174 @@ +A few months ago when we were trying to replace CMake with Bazel, @emailweixu suggested that we rewrite those handy Bazel functions using CMake. Now it seems that it's the right time to get this done, as we are facing problems from the porting of Majel and the development of new the parameter server using Go and C++. + +Here are some initial thoughts. Your comments are welcome! + +# Required CMake Function + +I think we need only the following few CMake functions to make a project description mean and clean: + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| C++ | CUDA C++ | Go |
+| --- | --- | --- |
+| cc_library | nv_library | go_library |
+| cc_binary | nv_binary | go_binary |
+| cc_test | nv_test | go_test |
+ + +- The `_library` functions generate .a files from source code. +- The `_binary` functions generate executable binary files. +- The `_test` functions generate executable unit test files. They work like `_binary` but links `-lgtest` and `-lgtest_main`. + +The difference between `nv_` functions and `cc_` functions is that the former use `nvcc` instead of the system-default C++ compiler. + +Both `nv_` and `cc_` functions enables C++11 (-std=c++11). + +Also, + +- to describe external dependencies, we need `external_library`. +- to build shared libraries, we need `shared_library`. + +## An Example Project + +Suppose that we have aforementioned functions defined in our `/cmake` directory. The following example `CMakeLists.txt` describes a project including the following source files: + +- tensor.h +- tensor.cc +- tensor_test.cc +- ops.h +- ops.cu +- ops_test.cu +- api.go +- api_test.go + +Suppose that ops.cu depends on CUDNN. + +```cmake +# cc_binary parses tensor.cc and figures out that target also depend +# on tensor.h. +cc_binary(tensor + SRCS + tensor.cc) + +# The dependency to target tensor implies that if any of +# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built. +cc_test(tensor_test + SRCS + tensor_test.cc + DEPS + tensor) + +# I don't have a clear idea what parameters external_library need to +# have. @gangliao as a CMake expert would have better ideas. +external_library(cudnn + ....) + +# Suppose that ops.cu depends on external target CUDNN. Also, ops.cu +# include global functions that take Tensor as their parameters, so +# ops depend on tensor. This implies that if any of tensor.{h.cc}, +# ops.{h,cu} is changed, ops need to be re-built. +nv_library(ops + SRCS + ops.cu + DEPS + tensor + cudnn) # cudnn is defined later. + +nv_test(ops_test + SRCS + ops_test.cu + DEPS + ops) + +# Because api.go defines a GO wrapper to ops and tensor, it depends on +# both. This implies that if any of tensor.{h,cc}, ops.{h,cu}, or +# api.go is changed, api need to be re-built. +go_library(api + SRCS + api.go + DEPS + tensor # Because ops depend on tensor, this line is optional. + ops) + +go_test(api_test + SRCS + api_test.go + DEPS + api) + + +# This builds libapi.so. shared_library might use CMake target +# api_shared so to distinguish it from above target api. +shared_library(api + DEPS + api) + +``` + +## Implementation + +As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`. + +## Using Package Manager For Go + +Building Go binaries and libraries need to satisfy their dependencies, generally +we can do `go get ./...` to download and compile all external dependencies. The +problems are: + +1. `go get` will always get the latest code from the default branch of the + remote repo, so changes of dependents might break the build. This is very + different with what we already have in `cmake/external` which download a + specific version or commit id of the dependency. +1. Some locations can not access external dependencies through the internet, as mentioned + in https://github.com/PaddlePaddle/Paddle/issues/2605. Using package management + tools can package the dependencies as a "vendor" package, which can be mirrored + at many cloud file hosting, so users what to compile paddle by themselves can + download this "vendor" package from a mirror site. 
+ +### Choose A Suitable Tool + +As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools) +list dozens of Go package managers. We choose the tool using following principles: + +- Most "active" projects with more stars, more pull requests or commits +- Widely used project + +After comparing all these projects, we shall choose between the most popular +tools: Godep and Glide. + +Here's a brief comparison between Godep and Glide +: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are +also many complaints about using `Godep`. There's also a new "official" pakcage +management tool has been started at: https://github.com/golang/dep to resolve +such problems, but it's currently at Alpha stage. So the best choice now is +glide obviously. + +### Manage Go Packages + +- Dependencies: `go/glide.yaml` will store the dependencies and their versions which + is directly imported by paddle. `go/glide.lock` will store all dependencies recursively + with their commit id. Builds will "lock" to these packages if we don't `glide up` + them +- Vendor package: `go/vendor` directory will generated when running `cmake` command. `cmake` + will download the code corresponding to `go/glide.lock`. If we put a vendor folder + under `go/`, cmake will just check the commit id to the packages under the folder, + if commit id matches, there will be no download at all. diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md new file mode 100644 index 0000000000000000000000000000000000000000..3757cd055c818be1e63ee8c0f000f4dd299b59f4 --- /dev/null +++ b/doc/fluid/design/concepts/block.md @@ -0,0 +1,375 @@ +# Design Doc: Block and Scope + +## The Representation of Computation + +Both deep learning systems and programming languages help users describe computation procedures. These systems use various representations of computation: + +- Caffe, Torch, and Paddle: sequences of layers. +- TensorFlow, Caffe2, Mxnet: graph of operators. +- PaddlePaddle: nested blocks, like C++ and Java programs. + +## Block in Programming Languages and Deep Learning + +In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators. + +Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning: + + + + + + + + + + + + + + + + + + + + + + +
| programming languages | PaddlePaddle |
|-----------------------|--------------|
| for, while loop       | RNN, WhileOp |
| if, if-else, switch   | IfElseOp, SwitchOp |
| sequential execution  | a sequence of layers |
+ + +A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes. + +## Stack Frames and the Scope Hierarchy + +The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs: + + + + + + + + + + + + + + + + + + + + + + + + + + +
| programming languages   | PaddlePaddle |
|--------------------------|--------------|
| stack                    | scope hierarchy |
| stack frame              | scope |
| push at entering block   | push at entering block |
| pop at leaving block     | destroy when minibatch completes |
+ + +1. In traditional programs: + + - When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables. + - After the execution leaves the right curly brace, the runtime pops the frame. + - The maximum number of frames in the stack is the maximum depth of nested blocks. + +1. In PaddlePaddle + + - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables. + - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass. So it has a stack forest known as a *scope hierarchy*. + - The height of the highest tree is the maximum depth of nested blocks. + - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy. + +## Use Blocks in C++ and PaddlePaddle Programs + +Let us consolidate the discussion by presenting some examples. + +### Blocks with `if-else` and `IfElseOp` + +The following C++ programs shows how blocks are used with the `if-else` structure: + +```c++ +namespace pd = paddle; + +int x = 10; +int y = 1; +int z = 10; +bool cond = false; +int o1, o2; +if (cond) { + int z = x + y; + o1 = z; + o2 = pd::layer::softmax(z); +} else { + int d = pd::layer::fc(z); + o1 = d; + o2 = d+1; +} + +``` + +An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](../execution/if_else_op.md) is as follows: + +```python +import paddle as pd + +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() +with ie.true_block(): + d = pd.layer.add_scalar(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` + +In both examples, the left branch computes `x+y` and `softmax(x+y)`, the right branch computes `fc(x)` and `x+1` . + +The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances. + + +### Blocks with `for` and `RNNOp` + +The following RNN model in PaddlePaddle from the [RNN design doc](../dynamic_rnn/rnn.md) : + +```python +x = sequence([10, 20, 30]) # shape=[None, 1] +m = var(0) # shape=[1] +W = var(0.314, param=true) # shape=[1] +U = var(0.375, param=true) # shape=[1] + +rnn = pd.rnn() +with rnn.step(): + h = rnn.memory(init = m) + h_prev = rnn.previous_memory(h) + a = layer.fc(W, x) + b = layer.fc(U, h_prev) + s = pd.add(a, b) + act = pd.sigmoid(s) + rnn.update_memory(h, act) + rnn.output(a, b) +o1, o2 = rnn() +``` +has its equivalent C++ program as follows + +```c++ +int* x = {10, 20, 30}; +int* m = {0}; +int* W = {0.314}; +int* U = {0.375}; + +int mem[sizeof(x) / sizeof(x[0]) + 1]; +int o1[sizeof(x) / sizeof(x[0]) + 1]; +int o2[sizeof(x) / sizeof(x[0]) + 1]; +for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) { + int x = x[i-1]; + if (i == 1) mem[0] = m; + int a = W * x; + int b = Y * mem[i-1]; + int s = fc_out + hidden_out; + int act = sigmoid(sum); + mem[i] = act; + o1[i] = act; + o2[i] = hidden_out; +} +``` + +## Compilation and Execution + +Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference. + +The generation of this protobuf message is similar to how a compiler generates a binary executable file. 
The execution of the message is similar to how the OS executes the binary file. + +## The "Binary Executable File Format" + +The definition of the protobuf message is as follows: + +```protobuf +message BlockDesc { + repeated VarDesc vars = 1; + repeated OpDesc ops = 2; +} +``` + +The step net in above RNN example would look like + +``` +BlockDesc { + vars = { + VarDesc {...} // x + VarDesc {...} // h + VarDesc {...} // fc_out + VarDesc {...} // hidden_out + VarDesc {...} // sum + VarDesc {...} // act + } + ops = { + OpDesc {...} // matmul + OpDesc {...} // add_two + OpDesc {...} // sigmoid + } +}; +``` + +Also, the RNN operator in above example is serialized into a protobuf message of type `OpDesc` and would look like: + +``` +OpDesc { + inputs = {0} // the index of x in vars of BlockDesc above + outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above + attrs { + "states" : {1} // the index of h + "step_net" : + } +}; +``` + +This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block. + + +## The Compilation of Blocks + +During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator). + +VarDesc in a block should have its name scope to avoid local variables affecting parent block's name scope. +Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that is stored in the parent block. For example: + +```python +a = pd.Variable(shape=[20, 20]) +b = pd.fc(a, params=["fc.w", "fc.b"]) + +rnn = pd.create_rnn() +with rnn.stepnet(): + x = a.as_step_input() + # reuse fc's parameter + fc_without_b = pd.get_variable("fc.w") + rnn.output(fc_without_b) + +out = rnn() +``` +The method `pd.get_variable` can help retrieve a Variable by the name. The Variable may be stored in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance. + +In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc. + +To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers. + +`SymbolTable` can do the following: + +- store the definitions (some names and attributes) of variables and operators, +- verify if a variable was declared, +- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers). + + +```c++ +// Information in SymbolTable is enough to trace the dependency graph. So maybe +// the Eval() interface takes a SymbolTable is enough. +class SymbolTable { + public: + SymbolTable(SymbolTable* parent) : parent_(parent) {} + + OpDesc* NewOp(const string& name=""); + + // TODO determine whether name is generated by python or C++. + // Currently assume that a unique name will be generated by C++ if the + // argument name is left default. + VarDesc* Var(const string& name=""); + + // find a VarDesc by name, if recursive is true, find parent's SymbolTable + // recursively. + // this interface is introduced to support InferShape, find protobuf messages + // of variables and operators, pass pointers into InferShape. + // + // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should + // be proposed and embedded into pybind to enable python operation on C++ pointers. 
+ VarDesc* FindVar(const string& name, bool recursive=true); + + OpDesc* FindOp(const string& name); + + BlockDesc Compile() const; + + private: + SymbolTable* parent_; + + map ops_; + map vars_; +}; +``` + +After all the description of variables and operators is added into SymbolTable, +the block has enough information to run. + +The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions. + + +```c++ +namespace { + +class Block : OperatorBase { +public: + Block(const BlockDesc& desc) desc_(desc) {} + + void InferShape(const framework::Scope& scope) const override { + if (!symbols_ready_) { + CreateVariables(scope); + CreateOperators(); + } + // should run InferShape first. + for (auto& op : runtime_table_.ops()) { + op->InferShape(scope); + } + } + + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first."); + for (auto& op : runtime_table_.ops()) { + op->Run(scope, place); + } + } + + void CreateVariables(const framework::Scope& scope); + void CreateOperators(); + + // some other necessary interfaces of NetOp are listed below + // ... + +private: + BlockDesc desc_; + bool symbols_ready_{false}; +}; +``` + +## The Execution of Blocks + +Block inherits from OperatorBase, which has a Run method. +Block's Run method will run its operators sequentially. + +There is another important interface called `Eval`, which takes some arguments called targets and generates a minimal graph which treats targets as the end points and creates a new Block. After `Run`, `Eval` will get the latest value and return the targets. + +The definition of Eval is as follows: + +```c++ +// clean a block description by targets using the corresponding dependency graph. +// return a new BlockDesc with minimal number of operators. +// NOTE: The return type is not a Block but the block's description so that this can be distributed +// to a cluster. +BlockDesc Prune(const BlockDesc& desc, vector targets); + +void Block::Eval(const vector& targets, + const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) { + BlockDesc min_desc = Prune(desc_, targets); + Block min_block(min_desc); + min_block.Run(scope, dev_ctx); +} +``` diff --git a/doc/fluid/design/concepts/cpp_data_feeding.md b/doc/fluid/design/concepts/cpp_data_feeding.md new file mode 100644 index 0000000000000000000000000000000000000000..aabc1ba75a67c5767d409bd6e7e6240dec86b16c --- /dev/null +++ b/doc/fluid/design/concepts/cpp_data_feeding.md @@ -0,0 +1,204 @@ +# C++ Data Feeding + +While using Paddle V2 API for training, data feeding completely depends on the Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required. + +In this document, we show the fundamental design of a C++ data feeding process, which includes data reading, shuffling and batching. + +## Overview + +![](images/readers.png) + +## Reader + +In order to handle the above-mentioned problem, a new concept called 'Reader' is introduced. `Reader` is a series of inherited classes which can be held by our `Variable` and they are used to read or process file data. + + +### ReaderBase + +`ReaderBase` is the abstract base class for all readers. It defines the interface for all readers. + +```cpp +class ReaderBase { + public: + // Reads the next batch of data. 
(A 'batch' can be only one instance) + // If the next batch doesn't exist, it throws an exception + virtual void ReadNext(std::vector* out) = 0; + + // Checks whether the next instance exists. + virtual bool HasNext() = 0; + + // Reinitializes the reader and read the file from the beginning. + virtual void ReInit() = 0; + + virtual ~ReaderBase(); +}; +``` + +### FileReader + +`FileReader` is derived from the `ReaderBase`. It is still an abstract class and will further be derived by Readers of respective specific format. + +```cpp +class FileReader : public ReaderBase { + public: + explicit FileReader(const std::vector& dims); + + void ReadNext(std::vector* out) override; + + protected: + virtual void ReadNextImpl(std::vector* out) = 0; + + private: + std::vector dims_; +}; +``` + +A file reader binds with a single file and reads one data instance at a time. Each type of file reader shall implement its own `ReadNextImpl()`, `HasNext()` and `ReInit()`. + +The `ReadNextImpl()` is invoked by `ReadNext()`. Besides invoking `ReadNextImpl()`, `ReadNext()` is also responsible for checking the output, making sure that each shape of `LoDTensor` in `*out` is consistent with the one in `dims_`. + +### DecoratedReader + +A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some processing on them(shuffling, batching or something else), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers. + +```cpp +class DecoratedReader : public ReaderBase { + public: + explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) { + PADDLE_ENFORCE_NOT_NULL(reader_); + } + + void ReInit() override { reader_->ReInit(); } + + bool HasNext() const override { return reader_->HasNext(); } + + protected: + ReaderBase* reader_; +}; +``` + +Both the `FileReader` and `DecoratedReader` share exactly the same interface as defined in `ReaderBase`. So they can be decorated for multiple times: We can **shuffle** a reader's outputs and then **batch** the shuffled outputs. The interface consistency also allows related ops use readers without knowing their underlying type. + +### MultipleReader + +All `FileReader` binds with a single file and are single-threaded. However, sometimes we need to read data from more than one file. In this case, it's not enough to only have `FileReader` and `DecoratedReader`. + +So `MultipleReader` is introduced. It is also derived from `ReaderBase`. A `MultipleReader` holds several prefetching `FileReaders` and these readers run concurrently. Another pivotal part of a `MultipleReader` is a buffer channel. The channel collects data yield by all prefetching readers and makes subsequent OPs or decorated readers be able to fetch data without concerning about multiple readers scheduling. + +![](images/multiple_reader.png) + +This graph shows how a `MultipleReader` works with three prefetching file readers and two GPUs. There is a queue of files which are going to be read. Each time when a prefetching file reader is free(complete reading from one file), it fetches a new file from the queue. Each prefetching file reader runs in a separated prefetch thread and dumps their outputs to the same channel. + +To the subsequent two decorated readers, the `MultipleReader` is **a single reader**. They don't need to concern about how prefetch readers are scheduled. 
They only need to invoke `MultipleReader::ReadNext()` to get the next data from the buffer channel. + +### ReaderHolder + +Different readers belong to different class types. This leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code: + +```cpp +var->Get("batch_reader"); +``` + +We would have to write: + +```cpp +var->Get("batch_reader"); +``` + +This requires that in order to get a reader from a variable, every time, we must know the reader's type exactly. This is nearly impossible. + +To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which hides reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get("...")` and regard the obtained object as a reader. + +## Related Operators + +To create and invoke readers, some new ops are introduced: + +### Operators That Create Readers + +Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers. + +However, direct usage of file readers' creation ops is not recommended because a file reader can only read one file via a single thread. Using `OpenFilesOp` is a better choice. + +### OpenFilesOp + +The `OpenFilesOp` is the creation op of `MultipleReader`. It takes no input but requires a list of file names as one of its attributes. The newly created `MultipleReader` then creates its own prefetching readers according to given file names. + +To make sure that created prefetching readers match file formats, we need a name prefix rule to append file format tags to file names, as well as a file reader registry mechanism to map file format tags to their corresponding file readers' constructors. + +### HasNextOp + +`HasNextOp` is used to check whether the next data batch exists via the reader's `HasNext()` interface. + +### ResetOp + +`ResetOp` is used to reset a reader via its `ReInit()` interface. + +### ReadOp + +A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader‘s `ReadNext()` function and gets a new batch of data(or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables. + +## Program with Readers + +A `Program` holds readers as its persistable variables. These variables are created by `CreateReaderOp` or `OpenFilesOp`. These ops shall run only once. So they shall be settled in the `startup_program`. `HasNextOp`, `ResetOp` and `ReadOp` are required by training loop, so they shall be in the `main_program`. + +The ops of a `startup_program` with readers would be like this: + +``` +multiple_reader = open_files_op(...) +batch_reader = create_batch_reader_op(multiple_reader) +double_buffer_reader = create_double_buffer_op(batch_reader) +... (other initializers) +``` + +The forwarding ops of the corresponding `main_program` would be like this: + +``` +not_completed = true +pass_count = 0 +while_op(not_completed) { + has_next = has_next_op(double_buffer_reader) + if_else_op(has_next) { + batch_data = read_op(double_buffer_reader) + ... 
(subsequent training ops) + } else { + reset_op(double_buffer_reader) + increase_op(pass_count) + not_completed = less_than_op(pass_count, reqiured_pass_num) + } +} +``` + +A few important considerations for these programs are as follows: + +1. `not_completed`, `pass_count` and other variables shown above are all Fluid Variables. + +2. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader. + +3. All readers exist in both `startup_program` and `main_program`. And they are persistable. + +### Simplify Configuration by MultiPassReader + +The Program configuration mentioned above is complicated. Users need to be very familiar to concepts of Program and Block to prevent making mistakes in their code. To make the usage of C++ readers more friendly to new users, we introduce `MultiPassReader`. + +`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several training passes. It takes the number of passes to run as one of its attributes('pass_num') and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches the EOF, the multi-pass reader checks whether it has completed the training of given number of pass. If not, the underlying reader will be re-initialized and starts a new pass automatically. Before completing the whole training, the return of MultiPassReader's `HasNext()` will always be `true`. + +With `MultiPassReader`, the startup program would be like this: + +``` +multiple_reader = open_files_op(...) +batch_reader = create_batch_reader_op(multiple_reader) +multi_pass_reader = create_multi_pass_reader_op(batch_reader) +double_buffer_reader = create_double_buffer_op(multi_pass_reader) +... (other initializers) +``` + +The forwarding part of the corresponding `main_program` would be like this: + +``` +not_completed = true +while_op(not_completed) { + batch_data = read_op(double_buffer_reader) + ... (subsequent training ops) + not_completed = has_next_op(double_buffer_reader) +} +``` diff --git a/doc/fluid/design/concepts/executor.md b/doc/fluid/design/concepts/executor.md new file mode 100644 index 0000000000000000000000000000000000000000..3fcddf4dd90f826ee1a16713f4371fb010f8eac5 --- /dev/null +++ b/doc/fluid/design/concepts/executor.md @@ -0,0 +1,29 @@ +# Executor Design Doc + +## Motivation +In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message +[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree). + +The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used, executor explicitly executes the stored precompiled code. + +## Overview + +An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. 
The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs. + +## Executor + +The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one. +It is very similar to how a push stack frame works when entering a block, following which it cleans up all the temporary variables when a mini-batch is finished. It does not however, have the stack frame pop process. + +### The interface +```c++ + Executor(places); +``` +A executor does not own any computing resources, a user can only construct an executor using the specified places. + +### Running an Executor + +``` + void Run(ProgramDesc, Scope, block_id, create_local_scope); +``` +An `Executor` only provides a unified way to execute `ProgramDesc`. `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished. diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md new file mode 100644 index 0000000000000000000000000000000000000000..1f86b99e5197c3e0b85fd76fe704520ef21b06d3 --- /dev/null +++ b/doc/fluid/design/concepts/functions_operators_layers.md @@ -0,0 +1,128 @@ +# Design Doc: Functions, Operators, and Layers + +In a DL system, we can compose one or more fine grained operators into a coarse grained one. For example, the FC layer can be composed of a multiplication operator and an add operator. + +Historically, some fine grained operations are known as operators, and some coarse level ones are known as layers. But we need a well-defined separation. + +In general, operators are those very fine grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions: + +```c++ +template T add(T x, T y) { return x + y; } +template T mul(T x, T y) { return x * y; } +``` + +Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name. A C macro can do this. For example, the following macro invocation + +```c++ +#define MAKE_FUNCTION_OPERATOR(mul); +``` + +generates + +```c++ +template class mulOp : public OperatorBase {...}; +REGISTER_OP(mulOp, "mul"); +``` + +so that in Python we can create operator mul by: + +```python +X1 = Var() +X2 = Var() +Y = Var() +paddle.cpp.create_operator("mul", input=[X1, X2], output=Y) +``` + +Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`: + +```c++ +template +class FCOp : public OperatorBase { + public: + void Run(...) { + add(mul(Input("X"), Input("W")), Input("b")); + } +}; +REGISTER_OP(FCOp, "fc"); +``` + +We need to support such composition in Python as well. To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`. This higher level operator API should be compatible with the layer API. + +Let's explain using an example. 
Suppose that we are going to compose the FC using mul and add in Python, we'd like to have Python functions `mul` and `add` defined in module `operator`: + +```python +def operator.mul(X1, X2): + O = Var() + paddle.cpp.create_operator("mul", input={X1, Y1}, output=O) + return O + +def operator.add(X1, X2): + O = Var() + paddle.cpp.create_operator("add", input={X1, X2}, output=O) + return O +``` + +Above code snippets are automatically generated. Given them, users can define + +```python +def layer.fc(X): + W = Var() + b = Var() + return operator.add(operator.mul(X, W), b) +``` + +If we don't have `operator.mul` and `operator.add`, the definiton of `layer.fc` would be complicated: + +```python +def layer.fc(X): + W = Var() + b = Var() + O1 = Var() + paddle.cpp.create_operator("mul", input=[X, W], output=O1) + O2 = Var() + paddle.cpp.create_operator("add", input=[O1, b], output=O2) + return O2 +``` + +We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in above illustrative example: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| C++ functions/functors | mul          | add          |             |          |
|-------------------------|--------------|--------------|-------------|----------|
| C++ operator class      | mulOp        | addOp        | FCOp        |          |
| Python binding          | operator.mul | operator.add | operator.fc |          |
| Python function         |              |              |             | layer.fc |
+ + +This is how we differentiate layer and operators in PaddlePaddle: + +- those defined in C++ and have a lightweighted Python wrapper in module `operators` are operators; whereas +- those who don't have C++ implementations but a Python implementation that compose C++ operators are known as layers. diff --git a/doc/fluid/design/concepts/images/multiple_reader.png b/doc/fluid/design/concepts/images/multiple_reader.png new file mode 100644 index 0000000000000000000000000000000000000000..b22126b31db4982c13fc3a0827805e6aaf955046 Binary files /dev/null and b/doc/fluid/design/concepts/images/multiple_reader.png differ diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.dot b/doc/fluid/design/concepts/images/parallel_executor_overview.dot new file mode 100644 index 0000000000000000000000000000000000000000..40753cb140540c08d9d4c449b8d377e315280436 --- /dev/null +++ b/doc/fluid/design/concepts/images/parallel_executor_overview.dot @@ -0,0 +1,83 @@ +digraph G { + subgraph cluster_init { + label="Initialization" + startup_program [label="startup", shape=box] + node_w_g0 [label="W\nGPU0"] + startup_program -> node_w_g0 [label="Initialize"] + node_w_g1 [label="W\nGPU1"] + node_w_g0 -> node_w_g1 [label="broadcast"] + } + + subgraph cluster_train { + label="forward_backward" + + subgraph cluster_gpu0 { + label="GPU0" + fc_0 [label="fc\nGPU0", shape=box] + hidden_0 [label="hidden\nGPU0"] + node_w_g0 -> fc_0 + fc_0 -> hidden_0 + loss0 [label="loss\nGPU0"] + hidden_0 -> loss0 [label="many ops omitted"] + scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box] + loss_g0 [label="loss_grad\nGPU0"] + scale_loss_0->loss_g0 + + fc_g_0 [label="w_grad\nGPU0", shape=box] + loss0 -> fc_g_0 + loss_g0 -> fc_g_0 + hidden_0 -> fc_g_0 + } + + subgraph cluster_gpu1 { + label="GPU1" + fc_1 [label="fc\nGPU1", shape=box] + hidden_1 [label="hidden\nGPU1"] + node_w_g1 -> fc_1 + fc_1 -> hidden_1 + loss1 [label="loss\nGPU1"] + hidden_1 -> loss1 [label="many ops omitted"] + scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box] + loss_g1 [label="loss_grad\nGPU1"] + scale_loss_1->loss_g1 + + fc_g_1 [label="w_grad\nGPU1", shape=box] + loss1 -> fc_g_1 + loss_g1 -> fc_g_1 + hidden_1 -> fc_g_1 + } + } + + all_reduce_w [label="Merge Gradients(AllReduce)", shape=box] + fc_g_0 -> all_reduce_w + fc_g_1 -> all_reduce_w + + fc_g_0_merged [label="w_grad\nMerged\nGPU0"] + fc_g_1_merged [label="w_grad\nMerged\nGPU1"] + all_reduce_w -> fc_g_0_merged + all_reduce_w -> fc_g_1_merged + + subgraph cluster_optimization { + label="Optimization" + subgraph cluster_opt_gpu0 { + label="GPU0" + sgd_0 [label="SGD Op\nGPU0", shape=box] + + fc_g_0_merged -> sgd_0 + node_w_g0 -> sgd_0 + optimized_w_0 [label="Optimized W\nGPU0"] + sgd_0 -> optimized_w_0 + } + subgraph cluster_opt_gpu1 { + label="GPU1" + sgd_1 [label="SGD Op\nGPU1", shape=box] + + fc_g_1_merged -> sgd_1 + node_w_g1 -> sgd_1 + optimized_w_1 [label="Optimized W\nGPU0"] + sgd_1 -> optimized_w_1 + } + } + + +} diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.png b/doc/fluid/design/concepts/images/parallel_executor_overview.png new file mode 100644 index 0000000000000000000000000000000000000000..d890c0ffee3b38dc7cb74a2b56c2ab4831532211 Binary files /dev/null and b/doc/fluid/design/concepts/images/parallel_executor_overview.png differ diff --git a/doc/fluid/design/concepts/images/readers.png b/doc/fluid/design/concepts/images/readers.png new file mode 100644 index 0000000000000000000000000000000000000000..fd59168ce16c9e2a0ef45303c28c997cfd7740be 
Binary files /dev/null and b/doc/fluid/design/concepts/images/readers.png differ diff --git a/doc/fluid/design/concepts/index_cn.rst b/doc/fluid/design/concepts/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..dcdc894937ff328e6002623275ca3c65e87b2bb0 --- /dev/null +++ b/doc/fluid/design/concepts/index_cn.rst @@ -0,0 +1,19 @@ +核心概念 +------------- + +.. toctree:: + :maxdepth: 1 + + README.md + cpp_data_feeding.md + functions_operators_layers.md + program.md + variable.md + var_desc.md + tensor.md + tensor_array.md + lod_tensor.md + block.md + scope.md + executor.md + parallel_executor.md diff --git a/doc/fluid/design/concepts/index_en.rst b/doc/fluid/design/concepts/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..b85a3055746facaa642e8fc899976b58435f1ef2 --- /dev/null +++ b/doc/fluid/design/concepts/index_en.rst @@ -0,0 +1,19 @@ +Core Concepts +-------------------------------------- + +.. toctree:: + :maxdepth: 1 + + README.md + cpp_data_feeding.md + functions_operators_layers.md + program.md + variable.md + var_desc.md + tensor.md + tensor_array.md + lod_tensor.md + block.md + scope.md + executor.md + parallel_executor.md diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md new file mode 100644 index 0000000000000000000000000000000000000000..748488f6d5f2f1272e87b89047570632418da8dc --- /dev/null +++ b/doc/fluid/design/concepts/lod_tensor.md @@ -0,0 +1,211 @@ +# Design Doc: LoD (Level-of-Detail) Tensor + +Like other deep learning systems, PaddlePaddle supports training models from sequence data. Also, like other systems, PaddlePaddle represent a mini-batch of sequences as a Tensor. What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus no need for padding zeros. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
|                 | TensorFlow | PaddlePaddle |
|-----------------|------------|--------------|
| RNN             | Support    | Support      |
| recursive RNN   | Support    | Support      |
| padding zeros   | Must       | No need      |
| blob data type  | Tensor     | LoDTensor    |
+ + +PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators. The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences. This document presents the design of LoD and LoDTensor. + + +## The Challenge: Variable-length Sequences + +Most deep learning systems represent a mini-batch as a Tensor. For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor. Another example is that each mini-batch contains N sentences, where each word is a D-dimensional one-hot vector. Suppose that all sentences have the same length L, we can represent this mini-batch by a NxLxD tensor. + +Both examples show that the elements of sequences are usually of the same size. In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors. It doesn't make sense to allow variable-sized images, as that would require transformations like convolution to handle variable-sized Tensors. + +The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences. Also, sequences might consist of sub-sequences. + + +## A Solution: The LoD Index + +To understand our solution, it is best to look at some examples. + +### A Mini-Batch of Sentences + +Let's imagine a mini-batch of 3 variable lengths sentences composed of 3, 1, and 2 words, respectively. We can represent the mini-batch by a (3+1+2)xD tensor plus some index information: + +``` +3 1 2 +||| | || +``` + +where each `|` represents a D-dimensional word vector. The numbers, 3, 1, and 2, form a 1-level LoD. + +### Recursive Sequences + +Let check another example of a 2-level LoD Tensor. Consider a mini-batch of three articles with 3, 1, and 2 sentences, and each sentence consists of a variable number of words: + +``` +3 1 2 +3 2 4 1 2 3 +||| || |||| | || ||| +``` + +### A Mini-Batch of Videos + +LoD tensors generalize to the case where elements are higher dimensional objects, like images. Suppose that a mini-batch contains videos of the same frame size 640x480. Here is a mini-batch of 3 videos with 3, 1, and 2 frames, respectively. + +``` +3 1 2 +口口口 口 口口 +``` + +The underlying tensor is of size (3+1+2)x640x480, and each `口` represents a 640x480 image. + +### A Mini-Batch of Images + +In traditional cases like a mini-batch with N fixed-sized images, the LoD Tensor representation is as + +``` +1 1 1 1 1 +口口口口 ... 口 +``` + +In this case, we don't lose any information by ignoring the many 1's in the index and simply considering this LoD Tensor as a usual Tensor: + +``` +口口口口 ... 口 +``` + +### Model Parameters + +A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**. + + +## The LoD Tensor + +Let us revisit above example of the 2-level LoD Tensor + +``` +3 1 2 +3 2 4 1 2 3 +||| || |||| | || ||| +``` + +It is indeed a tree, where leaves are elementary sequences identified by **branches**. + +For example, the third sentence in above example is identified by branch <0,2>, where 0 indicates the first article with length 3, and 2 indicates the third sentence in this article with length 4. 
+ +### The LoD Index + +We can save the LoD index in the above example + +``` +3 1 2 +3 2 4 1 2 3 +``` + +in a not-full 2D matrix: + +```c++ +typedef std::vector > LoD; +``` + +where + +- `LoD.size()` is the number of levels, or the maximum length of branches, +- `LoD[i][j]` is the length of the j-th segment at the i-th level. + +## The Offset Representation + +To quickly access elementary sequences, we adopt an offset representation -- instead of saving the lengths, we save the beginning and ending elements of sequences. + +In the above example, we accumulate the length of elementary sequences: + +``` +3 2 4 1 2 3 +``` + +into offsets + +``` +0 3 5 9 10 12 15 + = = = = = = + 3 2+3 4+5 1+9 2+10 3+12 +``` + +so we know that the first sentence is from word 0 to word 3, and the second sentence from word 3 to word 5. + +Similarly, the lengths in the top level LoD + +``` +3 1 2 +``` + +are transformed into offsets of elements/words as follows: + +``` +0 3 4 6 + = = = + 3 3+1 4+2 +``` + +## Slicing of LoD Tensors + + +When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences. Here we define the sequence identified by branch as the **-slice**. + +For example, the <2>-slice of above example is + +``` +10 15 +10 12 15 + || ||| +``` + +and the <2,0>-slice of above slice is + +``` +10 12 + || +``` + +## Length Representation vs Offset Representation + +The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult. +Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API. +Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python: +```Python +# length representation of lod called recursive_sequence_lengths +recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]] +# Create a LoDTensor that has the above recursive_sequence_lengths info. +# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood. +tensor = fluid.LoDTensor(lod) + +# Set/Change the recursive_sequence_lengths info of LoDTensor +tensor.set_recursive_sequence_lengths([[3, 1, 2]]) +# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted +# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]] +new_recursive_seq_lens = tensor.recursive_sequence_lengths() +``` diff --git a/doc/fluid/design/concepts/parallel_executor.md b/doc/fluid/design/concepts/parallel_executor.md new file mode 100644 index 0000000000000000000000000000000000000000..4f88e27bed722e9f2f535e368926fe49b4e72e56 --- /dev/null +++ b/doc/fluid/design/concepts/parallel_executor.md @@ -0,0 +1,104 @@ +# ParallelExecutor + +## Background + +Neural network models are defined as a `ProgramDesc` in Fluid. The `ProgramDesc` can be executed by an interpreter(i.e. the `executor` concept in Fluid). The instructions or operators in a `Program` will be executed, and the results will be fetched in Python side. + +The executor is a very naive interpreter. It runs operators one by one. We can use `Parallel.Do` to support data parallelism, however, lacking device information in `ProgramDesc`; it is not possible to optimize the performance of `Parallel.Do`. + +We want a `ProgramDesc` can be run on different nodes. 
It is better not to contain device information in `ProgramDesc`. However, we can write a high-performance interpreter, which can hold an alternative intermediate representation of `ProgramDesc`, to take full usage of Multi-GPUs. + +ParallelExecutor is an interpreter of `ProgramDesc` which will [out-of-order execute](https://en.wikipedia.org/wiki/Out-of-order_execution) `Program` in data parallelism mode and maximise the utility of Multi-GPUs. + + +## Overview of MultiGPUs logic + +The ParallelExecutor takes the startup program and main program as inputs. The parameters will be initialised on `GPU0` by startup program and will broadcast to multi-GPUs. The main program will be duplicated into multi-GPUs. The gradient will be merged during each iteration, and each device will optimize parameters independently. Since the gradients on each device will be merged before parameter optimization, the parameters will be the same on each device and it does not need to be broadcast the parameters. + +![alt](images/parallel_executor_overview.png) + +There are several optimizations for this logic. + +1. We use an alternate representation in ParallelExecutor. It because the device information is critical for performance optimization. +2. The execution is out-of-order, i.e., an operator will be executed whenever the inputs of the operator are ready. + * GPU is a high-performance device; only one CPU thread cannot fulfil one GPU. So there is a thread pool to execute operators. + * Out-of-order also helps transpilers to generate `ProgramDesc`. It is no need to concern about the best order of performance when implementing a transpiler. +3. The streams of computation, merge gradients and fetch data are different. + +The performance of `ResNeXt152` on `TitanX` which `batch_size=12` is shown below. + +| Number of GPUs | 1 | 2 | 3 | 4| +| --- | --- | --- | --- | --- | +| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 | +| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 | + + +## Static single assignment Graph + +[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form)(`SSA` for short) is a common form for compiler optimization. To implement concurrent execution, we uses an `SSA` graph as an intermedia representation of `ProgramDesc`. + +The `Program` is a directed acyclic graph, since a variable can be assigned multiple times. We enforce a variable will be assigned once, by adding version number to varaibles. We parsing the `Program` into a `SSA` graph. Also, ProgramExecutor duplicate `Program` into multi-devices. We also add a device number to varaibles and insert `NCCLAllReduce` into Graph. + +The data structure of `SSA` graph is: + +```c++ +struct VarHandleBase { + OpHandleBase* generated_op_; + vector pending_ops_; + + string name; + Place place; + size_t version; +}; + +struct OpHandleBase { + vector inputs_; + vector outputs_; +}; + +struct SSAGraph { + // vars on each devices. + // * the vars in each map in vector is on different device. + // * the map is mapping a variable name to variable handles + // with different versions + vector>> vars_; + + // All ops + vector ops_; +}; +``` +The variable handles are the wrapper of `Variables`. The operator handles are the wrapper of `OperatorBase`. Some `OpHandle` is not an `OperatorBase`, such as `NCCLAllReduceOpHandle`, because `AllReduceOpHandle` will use new device contexts. 
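To make the versioning idea concrete, here is a minimal, self-contained sketch. The structs and op names below are illustrative stand-ins for `VarHandleBase`/`OpHandleBase`, not the real Paddle classes: two writes to the same gradient variable become two `VarHandle` versions, so the later reader depends on the correct producer.

```c++
// Illustrative stand-ins for VarHandleBase / OpHandleBase -- not the real classes.
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct OpHandle;

struct VarHandle {
  std::string name;
  size_t version;
  OpHandle* generated_op;
};

struct OpHandle {
  std::string type;
  std::vector<VarHandle*> inputs, outputs;
};

int main() {
  std::map<std::string, std::vector<std::unique_ptr<VarHandle>>> vars;
  std::vector<std::unique_ptr<OpHandle>> ops;

  // Each write to a name creates a new version instead of overwriting the old one.
  auto write = [&](const std::string& name, OpHandle* op) {
    auto& versions = vars[name];
    versions.emplace_back(new VarHandle{name, versions.size(), op});
    op->outputs.push_back(versions.back().get());
  };

  ops.emplace_back(new OpHandle{"fc_grad", {}, {}});
  write("w_grad", ops.back().get());                           // w_grad:0 on this device

  ops.emplace_back(new OpHandle{"nccl_all_reduce", {}, {}});
  ops.back()->inputs.push_back(vars["w_grad"].back().get());   // reads w_grad:0
  write("w_grad", ops.back().get());                           // w_grad:1, the merged gradient

  for (auto& kv : vars)
    for (auto& v : kv.second)
      std::cout << v->name << ":" << v->version << " generated by "
                << v->generated_op->type << "\n";
  return 0;
}
```

Because every assignment produces a distinct versioned node, an out-of-order scheduler can treat `w_grad:0` and `w_grad:1` as separate variables and always wire a reader to the producer of the version it needs.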
+ +When the `ProgramDesc` converted into an `SSA` Graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem is also need to be taken care. The dummy variables, which represent the dependency between operators, will be manually inserted into SSA graph to resolve the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem. + +## Execute SSA Graph + +The SSA graph can be out-of-order executed by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is + +1. Maintaining a map of an operator and its needed input number. +2. If a variable is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the needed input number of its pending operators. +3. If there is an operator which needed input number is decreased to zero, just run this operator. +4. After run this operator, just mark the variables are generated and repeat step 2 until all variables are generated. + +Running an operator can be asynchronized. There is a thread pool to execute an `SSA` graph. + +## Synchronize GPU Kernels + +The GPU is a non-blocking device. The different streams need be synchronized when switching streams. In current implementation, the synchronization based on the following algorithm: + +1. `OpHandle` will record `DeviceContext` that it is used. +2. In `OpHandle::Run`, if the `DeviceContext` of current operator is different from `DeviceContext` of any input variable, just wait the generate operator of this input variable. + +The `wait` are implemented by two strategies: + +1. Invoke `DeviceContext->Wait()`, It will wait all operators on this device contexts complete. +2. Uses `cudaStreamWaitEvent` to sending a event to the stream. It is a non-blocking call. The wait operators will be executed in GPU. + +Generally, the `cudaStreamWaitEvent` will have a better perforamnce. However, `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed in runtime. + +## What's next? + +* Merging gradient of dense parameters has been done. However, the merging of sparse parameters has not been done. +* The CPU version of Parallel Executor has not been implemented. The out-of-order logic will make CPU compuatation faster, too. +* A better strategy to merge gradients can be introduced. We can shrink the gradients from `float32` to `int8` or `int4` while merging. It will significantly speed up multi-GPUs training without much loss of precision. +* Combine multi-Nodes implementation. By the benifit of out-of-order, sending and recving operator can be an blocking operator, and the transpiler does not need to concern about the best position of operator. diff --git a/doc/fluid/design/concepts/program.md b/doc/fluid/design/concepts/program.md new file mode 100644 index 0000000000000000000000000000000000000000..cfcd21ecdb9d2844bf93ed98a56db09651077c40 --- /dev/null +++ b/doc/fluid/design/concepts/program.md @@ -0,0 +1,139 @@ +# Design Doc: PaddlePaddle Programs + +## Compile and Execution + +A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`. 
+ +A simple example PaddlePaddle program can be found in [graph.md](../others/graph.md): + +```python +x = layer.data("images") +l = layer.data("label") +y = layer.fc(x) +cost = layer.mse(y, l) +optimize(cost) +train(cost, reader=mnist.train()) +``` + +The first five lines of the following PaddlePaddle program generates, or, compiles, the `ProgramDesc` message. The last line runs it. + +## Programs and Blocks + +The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program. + +- program: some nested blocks +- [block](./block.md): + - some local variable definitions, and + - a sequence of operators + +The concept of block comes from usual programs. For example, the following C++ program has three blocks: + +```c++ +int main() { // block 0 + int i = 0; + if (i < 10) { // block 1 + for (int j = 0; j < 10; j++) { // block 2 + } + } + return 0; +} +``` + +The following PaddlePaddle program has three blocks: + +```python +import paddle as pd // block 0 + +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() +with ie.true_block(): // block 1 + d = pd.layer.add_scalar(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): // block 2 + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` + +## `BlockDesc` and `ProgramDesc` + +All protobuf messages are defined in `framework.proto`. + +`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`. + +```protobuf +message BlockDesc { + required int32 parent = 1; + repeated VarDesc vars = 2; + repeated OpDesc ops = 3; +} +``` + +The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks. + +All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array. + +```protobuf +message ProgramDesc { + repeated BlockDesc blocks = 1; +} +``` + + +### Global Block + +The global block is the first one in the above array. + +## Operators that Use Blocks + +In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch. + +The definition of `OpDesc` shows that an operator could have some attributes: + +```protobuf +message OpDesc { + AttrDesc attrs = 1; + ... +} +``` + +and an attribute could be of type block, which is, in fact, a block ID as described above: + +``` +message AttrDesc { + required string name = 1; + + enum AttrType { + INT = 1, + STRING = 2, + ... + BLOCK = ... + } + required AttrType type = 2; + + optional int32 block = 10; // when type == BLOCK + ... +} +``` + +## InferShape + +With this design, the InferShape function should take the following parameters: + +```c++ +void InferShape(int current_block, + int current_operator, + ProgramDesc* program // might change VarDesc values. + ) { + ... +} +``` + +where + +- `current_block` indices into `ProgramDesc::blocks`, +- `current_operator` indices into `BlockDesc::ops`. 
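As a rough illustration of how these two indices could drive shape inference, the sketch below uses hypothetical plain structs as stand-ins for the protobuf-generated classes, with `sub_block` standing for a BLOCK-typed attribute. An operator that owns a block, such as `IfElseOp`, would recurse into that block:

```c++
#include <vector>

// Hypothetical simplified mirrors of the protobuf messages -- illustration only.
struct VarDesc { std::vector<long> shape; };
struct OpDesc { int sub_block = -1; };  // -1 means the op has no BLOCK attribute
struct BlockDesc { int parent; std::vector<VarDesc> vars; std::vector<OpDesc> ops; };
struct ProgramDesc { std::vector<BlockDesc> blocks; };

void InferShape(int current_block, int current_operator, ProgramDesc* program) {
  OpDesc& op = program->blocks[current_block].ops[current_operator];
  // ... infer this op's output shapes and write them into the VarDesc of
  //     this block or of an ancestor block (found via BlockDesc::parent) ...
  if (op.sub_block >= 0) {
    // Operators such as IfElseOp carry a block ID as an attribute;
    // shape inference descends into every operator of that block.
    const BlockDesc& sub = program->blocks[op.sub_block];
    for (int i = 0; i < static_cast<int>(sub.ops.size()); ++i) {
      InferShape(op.sub_block, i, program);
    }
  }
}
```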
diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md new file mode 100644 index 0000000000000000000000000000000000000000..dffee8e02bacbc99bdfa8c54f1a146de340ad778 --- /dev/null +++ b/doc/fluid/design/concepts/python_data_feeding.md @@ -0,0 +1,130 @@ +# Python Data Feeding + +In the former implementation of Paddle Fluid, there are two ways to feed data: + +- Use `reader_op` in backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details. + +- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. + +In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `LoDTensorBlockingQueue` is designed to be shared by the Python and C++ side, while `LoDTensorArray` is pushed into the queue in Python side and `reader_op` in C++ side reads out the data from the queue. + + +## Design of LoDTensorBlockingQueue +`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` and accepts `std::vector` with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer construction of `LoDTensorBlockingQueue`. + +```C++ +class LoDTensorBlockingQueueHolder; + +class LoDTensorBlockingQueue { + friend class LoDTensorBlockingQueueHolder; + private: + // `LoDTensorBlockingQueue` can only be constructed by + // `LoDTensorBlockingQueueHolder::InitOnce()` + LoDTensorBlockingQueue(size_t capacity, const std::vector& dims); + + public: + size_t Size() const { return queue_.Size(); } // Get the current size of the queue + + size_t Cap() const { return queue_.Cap(); }// Get the capacity of the queue + + void Close() { return queue_.Close(); } + + bool IsClosed() const { return queue_.IsClosed(); } + + // Block if Size() == Cap() + // Return false only when queue_.IsClosed() == true + bool Push(const std::vector &lod_tensor_vec); + + // Block if Size() == 0. 
+ // *Success == false when queue_.IsClosed() == true + std::vector Pop(bool *success = nullptr); + + private: + // Use reader::BlockingQueue as the inner data structure + BlockingQueue> queue_; + std::vector dims_; +}; + +class LoDTensorBlockingQueueHolder { + public: + // Call the constructor of `LoDTensorBlockingQueue` to create queue_ + // `InitOnce` can only called once, otherwise an exception would raise + void InitOnce(size_t capacity, const std::vector& dims) { + PADDLE_ENFORCE(queue_ == nullptr); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + } + + const std::shared_ptr& GetQueue() const { return queue_; } + + private: + std::shared_ptr queue_; +}; +``` + +There are some major things that must be concerned: +- `LoDTensorBlockingQueueHolder` should be a `Variable` in global scope, so that `reader_op` can find it when reading data. +- A `Variable` of `LoDTensorBlockingQueueHolder` but not `VarDesc` must be created in Python code before `Executor::Run()` so that `Executor::Run()` can get the feeding data when it is called. +- `Create_reader_op` should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input. + + +## Release of the GIL in pybind +`Pybind11::gil_scoped_release` is used to release GIL (Global Interpreter Lock) when `LoDTensorBlockingQueue::Push()` or `Executor::Run()` method are invoked in Python side, making `LoDTensorBlockingQueue::Push()` and `Executor::Run()` run in parallel. + + +## Design of PyReader +`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object. +```C++ +class PyReader : public ReaderBase { + public: + explicit PyReader(const std::shared_ptr& queue); + + void ReadNext(std::vector* out) override { + bool success; + *out = queue_->Pop(&success); + if (!success) out->clear(); + } + + void ReInit() override { return; } + + private: + std::shared_ptr queue_; +}; +``` + + +## Design of CreatePyReaderOp +`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue` which indicates the name of the `LoDTensorBlockingQueueHolder` variable. +```C++ +class CreatePyReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + if (out->Get() != nullptr) return; + + const std::string& queue_name = Input("blocking_queue"); + auto* queue_holder_var = scope.FindVar(queue_name); + PADDLE_ENFORCE(queue_holder_var != nullptr); + auto* queue_holder = queue_holder_var + ->template GetMutable(); + out->Reset(new PyReader(queue_holder->GetQueue())); + } +}; +``` + +## Design of Python codes +The design of Python codes are as follows. First, we construct a variable of `LoDTensorBlockingQueueHolder` and init it with given parameters, returning the `LoDTensorBlockingQueue` object after initialization. After that, a layer of `CreatePyReaderOp` is constructed and accepts the name of the `LoDTensorBlockingQueueHolder` variable. The `LoDTensorBlockingQueue` object and result of the layer are both returned. 
+```Python +def py_reader(capacity, shapes): + queue_name = unique_name.generate("lod_tensor_blocking_queue") + var = global_scope().var(feeder_name) # create LoDTensorBlockingQueueHolder Variable + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) # init the queue + out = create_var() + create_py_reader_op_with_queue_name( + inputs={'blocking_queue': queue_name}, + outputs={'Out':[out]}) + return out, feed_queue +``` diff --git a/doc/fluid/design/concepts/scope.md b/doc/fluid/design/concepts/scope.md new file mode 100644 index 0000000000000000000000000000000000000000..dcf76649357aaef80d6bc1a933ece8c4c1063547 --- /dev/null +++ b/doc/fluid/design/concepts/scope.md @@ -0,0 +1,124 @@ +# Design of Scope in Paddle + +## Overview + +Scope is an important concept in programming languages, which defines a program region that a set of bindings between names and entities applies. In a specific scope, a valid name is uniquely associated with an entity, such as a variable. And in another scope, this name may refer to other entity or nothing at all. It clearly restricts the visibility and validity of names in a program. Hence **Scope** is introduced to PaddlePaddle to manage variables in context. But different from the original abstract concept, Scope now becomes an object with two important attributes: + +- Scope is an association of a name to variable. +- Variables in a parent scope can be retrieved from local scope. + +A detailed explanation of these two attributes goes as following. + + +## Scope is an association of a name to variable. + +Scope is an association of a name to variable. All variables belong to `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variable in the scope. + + +1. Scope only contains a map of a name to variable. + + All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state (momentum) etc. + +1. Variable can only be created by Scope and a variable can only be got from Scope. User cannot create or get a variable outside a scope. This is a constraints of our framework, and will keep our framework simple and clear. + +1. Scope only contains methods that are used to Create and Get Variables. Scope do not contain Operators and have no information to run them. + `Net` is designed to drive the computation and Scope only contains a map of variables. There is no computation logic inside a `Scope`. Scope just handles the lifetime management of variables. + - `Create` is used to create a Variable by its name and add the mapping relation. + - `Get` is used to find a Variable by name. + +1. Every variable only belongs to one certain Scope. + + Variable can not belong to many scopes. If you want to use variables from parent scope, you can use `parent scope`. + +1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else. + + Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be an invalid pointer when associated `Scope` is destroyed. 
+ +```cpp +class Scope { + public: + Variable* Var(const std::string& name); + const Variable* FindVar(const std::string& name) const; + + private: + std::unordered_map> vars_; +}; +``` + + +## Parent scope and local scope + +Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope. + +1. We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed. +2. Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when user get a variable from a scope, it will try to search this variable in current scope. If there is no such variable in the local scope, `scope` will keep searching from its parent, until the variable is found or there is no parent. + +```cpp +class Scope { + public: + Scope(const std::shared_ptr& scope): parent_(scope) {} + + Variable* FindVar(const std::string& name) const { + auto it = vars_.find(name); + if (it != vars_.end()) { + return it->second.get(); + } else if (parent_ != nullptr) { + return parent_->FindVar(name); + } else { + return nullptr; + } + } + + private: + std::shared_ptr parent_ {nullptr}; +}; +``` + +In `Scope` class, there is a private data member called `parent_`. `parent_` is a smart pointer to its parent scope. When user `Get` a variable by its `name`, the `name` will be searched inside the current scope. If the variable cannot be found locally and parent scope is not a `nullptr`, the variable will be searched inside that parent scope. `parent_` pointer's default value is `nullptr`. It means that the scope is a global scope when `parent_` is nullptr. + +A local scope is very useful when we implement Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of timestep (`StepNet` for short) should use an independent local scope. Just like variables in a while loop is inside a local scope in programming languages. By using a single `StepNet` and changing local scope, we can implement an RNN easily. + +## Interface Design + +```cpp +class Variable { + private: + Variable() = default; + friend class Scope; +}; + +class Scope { + private: + Scope(const std::shared_ptr& parent = nullptr); + + public: + static std::shared_ptr Create(const std::shared_ptr& parent = nullptr); + + // return nullptr if not found. + Variable* FindVar(const std::string& name) const; + + // return if already contains same name variable. + Variable* Var(const std::string& name); + + private: + std::shared_ptr parent_; + std::unordered_map> vars_; +}; +``` +## Only scope can create a variable + +To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `Var` can construct `Variable`. + +## When scope destroyed, all variables inside this scope should be destroyed together + +The scope hold unique pointers for all variables. User can `FindVar` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together. + +## Sharing a parent scope + +Local scope contains a `parent_` pointer. It is a linked-list for scopes. Using a `shared_ptr` because when a local scope is using, its parents cannot be destroyed. + +Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shared pointer. 
We cannot construct a scope variable, because it cannot be passed to other scope as `parent` pointer. + +## Orthogonal interface + +`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `Var` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `Var`, we can implement `Var` easily. diff --git a/doc/fluid/design/concepts/tensor.md b/doc/fluid/design/concepts/tensor.md new file mode 100644 index 0000000000000000000000000000000000000000..0a27ac9bb6b03649d42e12100fda9e80a56e7f56 --- /dev/null +++ b/doc/fluid/design/concepts/tensor.md @@ -0,0 +1,189 @@ +# Tensor: An Unified Data Type in PaddlePaddle + +## Pain Point + +In this week, we discussed several potential weaknesses of PaddlePaddle caused by rapid iteration and development to promote new business products on the line in recent four years. For instance, current Matrix/Vector implementation in PaddlePaddle are long and tedious to read, which interfered seriously with the contribution of both fresh and professional engineers. More seriously for this issue, it will also become too challenging to maintain over time. + + +## Learn from Majel + +Consequently, we decide to refactor PaddlePaddle step-by-step. First, refactor and replace Matrix/Vector to Tensor, a modern terminology in the deep learning system. Fortunately, we can learn from Majel how to define a Tensor. + +To simplify heterogeneous resource allocation in any dimensions (1-9) and types (double, float, float16), Majel consists of several primitives such as `Dim`, `Place` and `Array`, all of them are standard C++ class templates. + +1. `Place`: memory location [i.e. CPU/GPU]. +2. `Allocation`: heterogeneous resource allocator [i.e. 20MB in GPU]. +3. `Dim`: size of each dimension. [i.e. Dim<4>({10, 2, 5, 1})] +4. `Array`: dynamic array consists of `Place`, `Dim`, and a pointer to memory. + +If you dig deeper into Majel source code, you will find Majel heavily use `boost.variant`. The variant class template is a safe, generic, stack-based discriminated union container, **offering a simple solution for manipulating an object from a heterogeneous set of types in a uniform manner**. Whereas standard containers such as std::vector may be thought of as "multi-value, single type," variant is "multi-type, single value." + +As a simple example, consider the following: + +```c++ +#include "boost/variant.hpp" +#include + +class my_visitor : public boost::static_visitor +{ +public: + int operator()(int i) const + { + return i; + } + + int operator()(const std::string & str) const + { + return str.length(); + } +}; + +int main() +{ + boost::variant< int, std::string > u("hello world"); + std::cout << u; // output: hello world + + int result = boost::apply_visitor( my_visitor(), u ); + std::cout << result; // output: 11 (i.e., length of "hello world") +} +``` + +In Majel, `DDimVar` is derived from `Dim`, `DArrayVar` is from `Array`. + +```c++ +template +struct Dim { +... +int head; +Dim tail; +} +``` + +```c++ +template +class Array : public Buffer { + ... 
+private: + Dim size_; + Dim stride_; + T* ptr_; +}; +``` + +```c++ +typedef boost::variant Place; +typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, + Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar; +typedef boost::variant< + Array, + Array, + Array, + Array, + + Array, + Array, + Array, + Array, + + Array, + Array, + Array, + Array > DArrayVar; +``` + +Because `variant` may be thought of as "multi-type, single value", we can utilize it to implement unified interfaces for PaddlePaddle. + +`DDim` plays two kinds of roles in Majel. First, it is used to indicate the size of a tensor. For example, we can construct a new `DArray` by following way: + + ```c++ + DArray arr = make_darray(make_ddim({2,3}), 0.0f); + ``` + It means that `arr` will be a two-dimension tensor, or a matrix. The size of its first dimension is 2 and the second is 3. All the element value of `arr` will be initialized as 0.0 . + + The second meaning of `DDim` is tensor index. For example, if we want to access the value in the 1st row and 2nd column of `arr` and set it to 1.0, we can do like this: + + ```c++ + arr[make_ddim({0, 1})] = 1.0; + ``` + +## Implement Tensor in Paddle + +We want to create a Tensor class to replace Vector and Matrix, and to support high-dimensional data. The operations on Tensor are implemented in both CPU and GPU. We also want to make sure that the Tensor interface is friendly to its callers. + +Tensor is only responsible for describing computing. It will not take charge of memory allocation policy, handles of some CUDA library context(e.g. cublasHandle, cudnnHandle), and dispatching CUDA kernels. Paddle has realize the initialization and resources management of hardware. + +Before writing code, please make sure you already look through Majel Source Code and grabbed the design philosophy of `DArray` in Majel. + + +### Memory Management +`Allocation` manages a block of memory in device(CPU/GPU). We use `Place` to decribe memory location. The details of memory allocation and deallocation are implememted in `Allocator` and `DeAllocator`. Related low-level API such as `hl_malloc_device()` and `hl_malloc_host()` are provided by Paddle. + +### Dim and Array +#### Dim + +`Dim` decribes the dimension information of an array. + +`DDimVar` is an alias of a specializd class of boost.variant class template. + +`DDim` is introduced to represent a dynamically sized dimension. + +For example: + +``` +Dim<2> d1 = make_dim(3, 3); +DDim d2 = make_ddim({1, 2, 3}); +``` + +You must appoint a concrete sized dimension to Dim, whereas DDim can represent a dynamically sized dimension. +#### Array + +`Array` represents for a tensor with specific type and size. + +`DArrarVar` is an alias of a specialized class of boost.variant class template. + +`DArray` is introduced to represent a dynamically typed array. + +For example: + +``` +Array a1(Dim<2>(2, 2)); +DArray a2 = make_darray(make_ddim({3, 4}), 0.0, CpuPlace()); +``` + +You must appoint the type and dimension of a Array, whereas DArray can represent a dynanmically typed array. + + +Please reference the section of `Learn from Majel` for more details. + +### ArrayView + +`ViewIterator` is a class template which implements basic iterator operation, including increment(++), decrement(--), dereference(*), equality comparisons(==) and so on. + +`ArrayView` is an encapsulation of `Array`, which introduces extra iterator methods, such as `begin()` and `end()`. The `begin()` method returns an iterator pointing to the first element in the ArrayView. 
And the `end()` method returns an iterator pointing to the pass-the-end element in the ArrayView. + +`ArrayView` make the visting and manipulating an array more efficiently, flexibly and safely. + + +A global function `make_view` is provided to transform an array to corresponding arrayview. + +``` +template +ArrayView make_view(const Array& in) { + return in; +} +``` + +A global function `make_iterator` is provided to make iterator of an array. + +``` +template +ViewIterator> make_iterator(const Array& in, Dim idx) { + return make_iterator(make_view(in), idx); +} +``` + +### Basic Operations + +The operations that manipulate DArray are defined as global functions, such as `ones`, `zeros`, `reshape`, `gemm` and so on. + +An array will be trasformed into an arrayview and then passed to the operation launching on a specific device(CPU/GPU). diff --git a/doc/fluid/design/concepts/tensor_array.md b/doc/fluid/design/concepts/tensor_array.md new file mode 100644 index 0000000000000000000000000000000000000000..37e4f7b90f94fa3eb015e733999cd84c96b2239c --- /dev/null +++ b/doc/fluid/design/concepts/tensor_array.md @@ -0,0 +1,271 @@ +# Design for TensorArray +This design doc presents the necessity of a new C++ class `TensorArray`. +In addition to the very simple C++ implementation + +```c++ +class TensorArray { + public: + explicit TensorArray(const LoDTensor&); + explicit TensorArray(size_t size); + + private: + vector values_; +}; +``` + +We also need to expose it to PaddlePaddle's Python API, +because users would want to use it with our very flexible operators `WhileLoop`. +An example for a RNN based on dynamic operators is + +```python +input = pd.data(...) +num_steps = Var(12) + +TensorArray states(size=num_steps) +TensorArray step_inputs(unstack_from=input) +TensorArray step_outputs(size=num_steps) + +W = Tensor(...) +U = Tensor(...) +default_state = some_op() + +step = Var(1) + +wloop = paddle.create_whileloop(loop_vars=[step]) +with wloop.frame(): + wloop.break_if(pd.equal(step, num_steps) + pre_state = states.read(step-1, default_state) + step_input = step_inputs.read(step) + state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input)) + states.write(step, state) + step_outputs.write(step, state) # output state + step.update(state+1) + +output = step_outputs.stack() +``` + +## Background +Steps are one of the core concepts of RNN. In each time step of RNN, there should be several input segments, states, and output segments; all these components act like arrays, for example, call `states[step_id]` will get the state in `step_id`th time step. + +An RNN can be implemented with the following pseudocode + +```c++ +Array states; +Array input_segments; +Array output_segments; +Parameter W, U; + +step = 1 +seq_len = 12 +while_loop { + if (step == seq_len) break; + states[step] = sigmoid(W * states[step-1] + U * input_segments[step]); + output_segments[step] = states[step] // take state as output + step++; +} +``` +According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support. + +Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`. + + +Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with level of details (`LoDTensor` for short). 
+Segmenting the `LoDTensor` is much more complicated than splitting a tensor, that makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support. + +As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences. + +The implementation is similar to `recurrent_op`. +The key difference is the way **the original input `LoDTensors` and outupts are split to get the `input_segments` and the `output_segments`.** + + +Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly, +the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same. + +## Why `TensorArray` +The logic behind splitting the inputs to segments, states and outputs is similar and can be shared in a seperate module. + +The array of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudo codes. + +So there should be an array-like container, which can store the segments of a tensor or LoD tensor. + +**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor** . +This is where the notion of `TensorArray` comes from. + +## Introduce TensorArray to uniform all the three RNNs +TensorArray as a new concept is borrowed from TensorFlow, +it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`. + +This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers, +such as `recurrent_op`, `RecurrentGradientMachine`. + +In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), +`TensorArray` is used to segment inputs and store states in all time steps. +By providing some methods similar to a C++ array, +the definition of some state-based dynamic models such as RNN can be more natural and highly flexible. + +## Dynamic-operations on TensorArray + +`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented + +```python +# several helper operators for TensorArray +def tensor_array_stack(ta, tensor): + ''' + get a tensor array `ta`, return a packed `tensor`. + ''' + pass + +def tensor_array_unstack(tensor, ta): + ''' + get a `tensor`, unstack it and get a tensor array `ta`. + ''' + pass + +def tensor_array_write(ta, index, tensor, data_shared): + ''' + get a `tensor` and a scalar tensor `index`, write `tensor` into index-th + value of the tensor array `ta`. + `data_shared` is an attribute that specifies whether to copy or reference the tensors. + ''' + pass + +def tensor_array_read(ta, index, tensor): + ''' + get a tensor array `ta`, a scalar tensor `index`, read the index-th value of + `ta` and return as the `tensor`. + ''' + pass + +def tensor_array_size(ta, tensor): + ''' + get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`. + ''' + pass +``` + +It is trivial for users to use so many low-level operators, so some helper methods should be proposed in python wrapper to make `TensorArray` easier to use, +for example + +```python +class TensorArray: + def __init__(self, name): + self.name = name + self.desc = TensorArrayDesc() + + def stack(self, name=None): + ''' + Pack the values in a `TensorArray` into a tensor with rank one higher + than each tensor in `values`. + `stack` can be used to split tensor into time steps for RNN or whileloop. 
+ + @name: str + the name of the variable to output. + ''' + tensor = Var(name) + tensor_array_stack(self.name, tensor) + return tensor + + def unstack(self, input): + ''' + Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors. + `unstack` can be used to concatenate all the time steps for RNN or whileloop. + + @input: str + the name of input tensor + ''' + tensor_array_unstack(tensor, self.name) + + def write(self, index, value, data_shared=True): + ''' + Write value into index of the TensorArray. + If `data_shared` is set to True, than the index-th value in TensorArray will + be shared with the tensor passed in. + + @index: str + name of a scalar tensor + @value: str + name of a tensor + @data_shared: bool + ''' + tensor_array_write(self.name, index, value, data_shared) + + def read(self, index, output): + ''' + Read the value at location `index` in the `TensorArray`. + + @index: str + name of a scalar tensor + @output: + name of a output variable + ''' + tensor_array_read(self.name, index, output) + + + def size(self, output): + ''' + Return the number of values. + + @output: str + name of a scalar tensor + ''' + tensor_array_size(self.name, output) +``` + +## LoDTensor-related Supports +The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes varience-length sequences as input, and output sequences too. + +Since each step of RNN can only take a tensor-represented batch of data as input, +some preprocess should be taken on the inputs such as sorting the sentences by their length in descending order and cut each word and pack to new batches. + +Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`, +these two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formated as a LoD tensor rather than a tensor. + +Some definitions are like + +```python +def unpack(level): + ''' + Split LodTensor in some `level` and generate batches, if set `sort_by_length`, + will sort by length. + + Returns: + - a new `TensorArray`, whose values are LodTensors and represents batches + of data. + - an int32 Tensor, which stores the map from the new batch's indices to + original LoDTensor + ''' + pass + +def pack(level, indices_map): + ''' + Recover the original LoD-arranged LoDTensor with the values in a `TensorArray` + and `level` and `indices_map`. + ''' + pass +``` + +With these two methods, a varience-length sentence supported RNN can be implemented like + +```c++ +// input is the varient-length data +LodTensor sentence_input(xxx); +TensorArray ta; +Tensor indice_map; +Tensor boot_state = xxx; // to initialize rnn's first state +TensorArray::unpack(input, 1/*level*/, true/*sort_by_length*/, &ta, &indice_map); +TessorArray step_outputs; +TensorArray states; + +for (int step = 0; step = ta.size(); step++) { + auto state = states.read(step); + // rnnstep is a function which acts like a step of RNN + auto step_input = ta.read(step); + auto step_output = rnnstep(step_input, state); + step_outputs.write(step_output, true/*data_shared*/); +} + +// rnn_output is the final output of an rnn +LoDTensor rnn_output = ta.pack(ta, indice_map); +``` +the code above shows that by embedding the LoDTensor-related preprocess operations into `TensorArray`, +the implementation of a RNN that supports varient-length sentences is far more concise than `RecurrentGradientMachine` because the latter mixes all the codes together, hard to read and extend. 
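+To connect `pack`/`unpack` with the Python wrapper proposed earlier, here is a hedged, purely illustrative sketch in the same loose pseudocode style as the examples above (not a finalized API); `rnn_step`, `boot_state` and `sentence_input` are placeholder names, and `unpack`/`pack` are written as `TensorArray` methods for readability.
+
+```python
+# Hedged sketch only: a variable-length RNN written against the proposed
+# TensorArray wrapper; all names below are illustrative, not part of the design.
+ta = TensorArray("step_inputs")
+states = TensorArray("states")
+step_outputs = TensorArray("step_outputs")
+
+# Split the LoD input at level 1 into length-sorted batches; indices_map
+# records how to restore the original sequence order afterwards.
+indices_map = ta.unpack(sentence_input, level=1, sort_by_length=True)
+
+states.write(0, boot_state)
+step = Var(1)
+wloop = paddle.create_whileloop(loop_vars=[step])
+with wloop.frame():
+    wloop.break_if(pd.equal(step, ta.size()))
+    pre_state = states.read(step - 1)
+    step_input = ta.read(step)
+    state = rnn_step(step_input, pre_state)   # one step of the recurrent cell
+    states.write(step, state)
+    step_outputs.write(step, state, data_shared=True)
+    step.update(step + 1)
+
+# Restore the original LoD arrangement of the per-step outputs.
+rnn_output = step_outputs.pack(level=1, indices_map=indices_map)
+```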
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md new file mode 100644 index 0000000000000000000000000000000000000000..8db67f6703d142da71cf06bd4f7e2cb13556f9b0 --- /dev/null +++ b/doc/fluid/design/concepts/var_desc.md @@ -0,0 +1,100 @@ +# Design Doc: Var_desc + +## Background +PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations. + +PaddlePaddle uses proto message to describe compile time program because : + +1. The computation program description must be serializable and saved in a file. +1. During distributed training, the serialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on a different worker. + +The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`) and `Operations`. The concept to represent them is in the table below. + + + + + + + + + + + + + + + + + + + + + +
+| | compile time | runtime |
+|---|---|---|
+| Data | VarDesc(proto) | Variable(cpp) |
+| Operation | OpDesc(proto) | Operator(cpp) |
+ + +## Definition of VarType + +A VarDesc should have a name, type and whether or not it is persistable. There are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following: + +```proto +message VarDesc { + required string name = 1; + required VarType type = 2; + optional bool persistable = 3 [ default = false ]; +} +``` + +## Definition of TensorDesc + +```proto +message TensorDesc { + // Should only be PODType. Is enforced in C++ + required Type data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] +} +``` + +The `Type` here comes from the enum defined inside of `VarType` : + +```proto +enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + + // Other types that may need additional descriptions + LOD_TENSOR = 7; + SELECTED_ROWS = 8; + FEED_MINIBATCH = 9; + FETCH_LIST = 10; + STEP_SCOPES = 11; + LOD_RANK_TABLE = 12; + LOD_TENSOR_ARRAY = 13; + PLACE_LIST = 14; + READER = 15; + CHANNEL = 16; +} +``` + +A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please reference [`SelectedRows`](./selected_rows.md). + +## Definition of LodTensorDesc + +```proto +message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; +} +``` + +A LoDTensorDesc contains a tensor and a lod_level. + +## Definition of Variable in Python + +For Variable in Python, please reference [`Python API`](./python_api.md). diff --git a/doc/fluid/design/concepts/variable.md b/doc/fluid/design/concepts/variable.md new file mode 100644 index 0000000000000000000000000000000000000000..442ef6b718b227d79ca73031efcbb55817558252 --- /dev/null +++ b/doc/fluid/design/concepts/variable.md @@ -0,0 +1,52 @@ +# Design Doc: Variable + + +Variable is also known as *blob* in MxNet and Caffe2. It is the input and output type of operators, where a neural network is a graph of operators. + +## Requirements: Lazy Memory Allocation + +For the flexibility of a DL system, a variable should be able to contain any typed value -- a tensor in most cases, but could also be some integer IDs or a scope of other variables in the case of RNN. + +To use the minimum amount of memory, we would like that a variable allocates memory only when it has to, or, lazy memory allocation. Let's take the following example: + +```cpp +Variable vr, v1, v2; + +Tensor* t1 = new Tensor(); +Tensor* t2 = new Tensor(); + +Randomize( + /* malloc */ v1.GetMutable().mutable_data(DDim(100,200)), + /* size */ t1.Size()); + +Randomize( + /* malloc */ v2.GetMutable().mutable_data(DDim(200,300)), + /* size */ t2.Size()); + +Mult( + /*result*/ vr.GetMutable().mutable_data(SizeOfMult(v1, v2)), + /*input1*/ v1.Get().data(), + /*input2*/ v2.Get().data()); +``` + +We see that a variable holds nothing until `Variable::GetMutable()` allocates a tensor and puts it in the variable. Similarly, a tensor gets its memory until `Tensor::mutable_data()`. + +This syntax for lazy memory allocation when we call `Randomize` and `Mult`, those functions that mutate the variable, so it saves us some line of C++ code. + + +## Implementation: Type Hiding + +To make memory allocation lazy, we cannot assume that we know the type held by a variable at definition time. 
In other words, `class Variable` cannot be a template `template class Variable`. + +Because we don't know the type `T`, we cannot save a `T*` as `Variable's` data member. Instead, we save an interface object `Placeholder`, which can return the pointer to the saved object via `Placeholder::Ptr()` as `void*`. + +But anyway, Variable needs to know `T` so could it `delete(ptr)` and so could `Variable::Get` checks the expected type and the saved object's type. + +We save `T` in `PlaceholderImpl`, the implementation of `Placeholder`. Please be aware that `PlaceholderImpl` is a class template and `T` is passed in as a template parameter. + +Because `PlaceholderImpl` knows `T`, it can save and return `typeid(T)` for the type comparison in `Variable::Get` and `Variable::GetMutable`. + + +## Conclusion + +The technique type hiding utilizes C++ class templates, interface and derivation, and C++ RTTI (typeid). This combination saves us from defining something like `caffe2::TypeMeta`, which takes hundreds of lines of C++ code. diff --git a/doc/fluid/design/concurrent/channel.md b/doc/fluid/design/concurrent/channel.md new file mode 100644 index 0000000000000000000000000000000000000000..df67438bcc741ac521b00ee962fc13c93db21182 --- /dev/null +++ b/doc/fluid/design/concurrent/channel.md @@ -0,0 +1,139 @@ +# Channel Design + +## Introduction + +A Channel is a data structure that allows for synchronous interprocess +communication via message passing. It is a fundemental component of CSP +(communicating sequential processes), and allows for users to pass data +between threads without having to worry about synchronization. + +## How to use it + +Paddle offers python APIs to open and close channels, along with sending +and receiving data to/from a channel. + +### Create a channel + +Creates a new channel that takes in variables of a specific dtype. + +- **fluid.make_channel(dtype, capacity=0)** + - **dtype**: The data type of variables being sent/received through channel + - **capacity**: The capacity of the channel. A capacity of 0 represents + an unbuffered channel. Capacity > 0 represents a buffered channel + +``` +ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, 10) +``` + +### Close a channel + +Closes a channel. Any pending senders and receivers will be awoken during +this time. Receivers can still receive from a closed channel, but senders +are not allowed to send any additional data to the channel (Paddle will +raise an exception if users try to send to a closed channel.) + +- **fluid.channel_close(channel)** + +``` +fluid.channel_close(ch) +``` + +### Send data to a channel + +Sends a variable to a channel. Currently, variables of dtype `LoDTensor`, +`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and +`ChannelHolder` are supported. + +By default, the data of the Variable is moved from the sender to the receiver, +however the user can optionally copy the data before performing the send. + +- **channel_send(channel, variable, is_copy=False)** + - **channel**: The channel to send the variable to + - **variable**: The variable to send to the channel + - **is_copy**: If set to True, channel_send will perform a variable assign + to copy the source variable to a new variable to be sent. + +``` +ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) +var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=100) +fluid.channel_send(ch, var, True) +``` + +### Receive data from a channel + +Receives a variable from a channel. 
The data of the variable is moved to the +receiving variable. + +- **channel_recv(channel, return_variable)** + - **channel**: The channel to receive the variable from + - **return_variable**: The destination variable used to store the data of the + variable received from the channel + +``` +ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) +var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=-1) +fluid.channel_recv(ch, var) +``` + +## How it Works + +Channels provides a simple interface for different threads to share data. +To support the synchronization requirements, channels utilizes a series of +internal queues, locks, and conditional variables. + +### QueueMessage + +QueueMessage encapsulates the state of the channel send/receive operation to be +put in the **sendq/recvq**. It contains a condition variable used to lock the +thread (when there are no available sends/receives). In addition, it contains +a callback function to notify a thread when the QueueMessage is being +processed by the channel. + +### Queues + +- **buff_**: This queue holds the data buffer in a buffered channel. The +capacity is set to the capacity of the channel. This data buffer is not +used in an unbuffered channel. + +- **sendq**: This queue holds the QueueMessage of any pending senders of a +channel. When a thread performs a channel_send operation on the channel, the +channel_send operation will put a new QueueMessage on the sendq and block the +current thread under two conditions: + 1. The channel is buffered and is full + 2. The channel is unbuffered and does not have a receiver + +- **recvq**: This queue holds the QueueMessage of any pending receivers of a +channel. When a thread performs a channel_recv operation on the channel, the +channel_recv operation will put a new QueueMessage on the recvq and block the +current thread under two conditions: + 1. The channel is buffered and there is no data on the buff_ + 2. The channel is unbuffered and does not have a sender + +### State diagram + +#### Channel Send + +

+<p align="center">
+<img src="./images/channel_send.png"/>
+</p>
+
+#### Channel Receive
+
+<p align="center">
+<img src="./images/channel_recv.png"/>
+</p>
+ +## Limitations and Considerations + +### Variable Copy + +In golang, variables in channels are copied from the sender to the receiver. +In Paddle, the data from our variables are **moved** from sender to receiver. +As a result, these variables should not be used after they are sent. We +provide a flag in channel_send method to allow users to copy the variable to +be sent before it is sent. + +Please note that this is acheived by adding an **assign** operator and creating +a temporary variable that is sent in place of the original variable. Please +note that **assign** operator has limited support for only certain variables +datatypes. diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md new file mode 100644 index 0000000000000000000000000000000000000000..0428e74f9e00a87f6b0972057f48479b8ae56ad6 --- /dev/null +++ b/doc/fluid/design/concurrent/concurrent_programming.md @@ -0,0 +1,193 @@ +# Design Doc: Concurrent Programming with Fluid + +With PaddlePaddle Fluid, users describe a program other than a model. The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model. + +Many know that when we program TensorFlow, we can specify the device on which each operator runs. This allows us to create a concurrent/parallel AI application. An interesting questions is **how does a `ProgramDesc` represents a concurrent program?** + +The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program. So users just program a concurrent program that they do with any concurrent programming language, e.g., [Go](https://golang.org). + +## An Analogy + +The following table compares concepts in Fluid and Go + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Go | Fluid |
+|---|---|
+| user-defined functions | layers |
+| control-flow and built-in functions | intrinsics/operators |
+| goroutines, channels | class ThreadPool |
+| runtime | class Executor |
+ + +## An Example Concurrent Program + +To review all above concepts in an example, let us take a simple program and writes its distributed version. + +Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors. + +```go +import "fluid" + +func paddlepaddle() { + X = fluid.read(...) + W = fluid.Tensor(...) + Y = fluid.mult(X, W) +} +``` + +Please be aware that the Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function, which, in this case, is defined in above program and creates the following `ProgramDesc` message. + +```protobuf +message ProgramDesc { + block[0] = Block { + vars = [X, W, Y], + ops = [ + read(output = X) + assign(input = ..., output = W) + mult(input = {X, W}, output = Y) + ], + } +} +``` + +Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in above `ProgramDesc` message. + +The default `main` function is defined as follows: + +```go +func main() { + paddlepaddle() + fluid.run() +} +``` + +## The Concurrent Version + +By parallelizing the above program, we could support very big tensor X by splitting into small pieces {x_1, x_2, ...} and sent each piece to worker process/node for parallel multiplication. + +In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for worker processes/nodes. + +### The Master Program + +The master program could look like the following: + +```protobuf +message ProgramDesc { + block[0] = Block { + vars = [X, L, Y], + ops = [ + read(output = X) + kube_get_workers_addrs(output = L) + Y = tensor_array(len(L)) + parallel_for(input = X, output = Y, + attrs = {L, block_id(1)}) # referring to block 1 + ] + } + + block[1] = Block { + parent = 0, + vars = [x, y, index], + ops = [ + slice(input = [X, index], output = x) # index is initialized by parallel_for + send(input = x, attrs = L[index]) + recv(outputs = y, attrs = L[index]) + assign(input = y, output = Y[index]) + ] + } +} +``` + +The equivalent Fluid program (calling the Go binding) is: + +```go +func main() { //// block 0 + X = fluid.read(...) + L = fluid.k8s.get_worker_addrs() + Y = fluid.tensor_array(len(L)) + fluid.parallel_for(X, L, + func(index int) { //// block 1 + x = X[index] + fluid.send(L[index], x) + y = fluid.recv(L[index]) + Y[index] = y + }) +} +``` + +An explanation of the above program: + +- `fluid.k8s` is a package that provides access to Kubernetes API. +- `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod). +- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed, + + 1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and + 2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread + 1. creates an Executor instance, and + 2. 
calls `Executor.Run(block)`, where `block` is block 1 as explained above. +1. Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0. + +### The Worker Program + +The worker program looks like + +```go +func main() { + W = Tensor(...) + x = fluid.listen_and_do( + fluid.k8s.self_addr(), + func(input Tensor) { + output = fluid.mult(input, W) + }) +} +``` + +where + +- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed, + 1. listens on the current pod's IP address, as returned by `fliud.k8s.self_addr()`, + 2. once a connection is established, + 1. creates a scope of two parameters, "input" and "output", + 2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h) and saves it into "input", + 3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`. + +## Summarization + +From the above example, we see that: + +1. Fluid enables the imperative programming paradigm by: + 1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and + 2. call the `fluid.run` function that runs the program implicitly. +1. The program is described as a `ProgramDesc` protobuf message. +2. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter. +3. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message. +4. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and execute intrinsics/operators' `Run` method sequentially as they appear in the `Block.ops` array. +5. Intrinsics/operators' `Run` method might create threads. For example, the `ListenAndDo` operator creates a thread to handle each incoming request. +6. Threads are not necessarily OS thread; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by ThreadPool. Multiple green threads might run on the same OS thread. An example green threads is Go's [goroutines](https://tour.golang.org/concurrency/1). diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md new file mode 100644 index 0000000000000000000000000000000000000000..66d19f44baf861c7847e81ca83f61024ec877faf --- /dev/null +++ b/doc/fluid/design/concurrent/csp.md @@ -0,0 +1,251 @@ +# Design Doc: CSP in PaddlePaddle Fluid + +## Motivation + +Concurrent programming is important for deep learning. Few example applications are: + +1. The main thread keeps reading the next mini-batch while another thread uses the GPU for computing. +2. The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server. + +Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language. + +## Concurrent Programming Models + +There were many concurrent programming models, implemented in various forms: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| concurrent programming model | implementation |
+|---|---|
+| mutex | types and functions in standard libraries |
+| semaphore | types and functions in standard libraries |
+| communicating sequential processes (CSP) | Go programming language |
+| actor model | Erlang programming language |
+| message passing | MPI |
+| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
+ + +Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid. + +### CSP v.s. Actor Model + +A well-known implementation of Actor Model is the Erlang programming language. In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs. We can find the three ingredients, process with ID, send, and recv, in MPI too. Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code. Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv. + +## CSP in Fluid + +Fluid has two fundamental control-flows: *if-else* and *while*. If we are to implement CSP, we need the following: + +1. a new data type: *channel* and operators *send* and *recv*, +1. *goroutine* or thread, and +1. a new control-flow: select. + +We also need Python wrappers for the above components. + +The type *channel* is conceptually the blocking queue. In Go, its implemented is a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv. + +The `select` operation has been in OS kernels long before Go language. All Unix kernels implement system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll. + +It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax. + +### Type Channel + +Fluid supports many data types: + +1. Tensor, +1. Row-sparse Tensor +1. LoD Tensor, +1. Tensor array, etc + +Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value. To add a new type channel, we need to add a new type enum. + +To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file. [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example how we expose C++ class LoDTensor. + +## Syntax Design + +### Create Channel + +In Go, we create a channel by specifying the element type and buffer size: + +```go +ch := make(chan int) // a channel without buffer +ch1 := make(chan int, 100) // a channel that can buffer 100 ints. +``` + +In Fluid, we should be able to do the same: + +```python +ch = fluid.make_channel(dtype=INT) +ch1 = fluid.make_channel(dtype=INT, 100) +``` + +In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16: + +```python +ch = fluid.make_channel(dtype=Tensor, etype=float16) +``` + +or Tensors of Tensors of float16 etc. + +The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor...> >`. + +### Send and Recv + +Go's CSP implementation depends on data type *channel*. There are two types of channels: + +1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. 
The sending to buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty. +1. blocked channel, or unbuffered channel, is a blocking queue with no buffer. Both sending and receiving block with unbuffered channels. + +There are four types of actions with a channel: + +1. Create a channel + + ```go + ch := make(chan int) // this is an unbuffered channel + ch := make(chan int, 100) // this is a buffered channel of 100 ints. + ``` + +1. Send + + ```go + ch <- 111 + ``` + +1. Recv + + ```go + y, ok <- ch + ``` + +1. Close + + ```go + close(ch) + ``` + + Please be aware that a closed channel is not a nil channel, which is `var ch chan int`. + +There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms): + +1. A send to a nil channel blocks forever + +1. A receive from a nil channel blocks forever + +1. A send to a closed channel panics + +1. A receive from a closed channel returns the residual values and then zeros. + +In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h) + +The following program illustrates the Python syntax for accessing Fluid buffers. + +```python +import fluid + +buffer_size = 10 +ch = fluid.make_channel(dtype=INT, buffer_size) + +# Now write three elements to the channel +with fluid.while(steps=buffer_size): + fluid.send(ch, step) + +fluid.close_channel(ch) + +with fluid.while(steps=buffer_size): + fluid.print(fluid.recv(ch)) +``` + +The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines. + +```python +import fluid + +ch = fluid.make_channel(dtype=INT) + +with fluid.go(): + fluid.send(ch) + +y = fluid.recv(ch) + +fluid.close_channel(ch) +``` + +### Select + +In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready. + +```go + +ch1 := make(chan int) +ch2 := make(chan int, 100) + +x := 0 + +for { + select { + case ch1 <- x: + x := x + 1 + case y <- ch2: + fmt.Println("Received on channel") + default: + fmt.Println("Default") + } + } + +``` + +In Fluid, we should be able to do the same: + +```python +ch1 = fluid.make_chan(dtype=INT) +ch2 = fluid.make_chan(dtype=INT, 100) + +sel = fluid.select() + +with sel.case(ch1, 'w', X): + fluid.layers.increment(X) + +with sel.case(ch2, 'r', Y): + fluid.print("Received on Channel") + +with sel.default(): + fluid.print("Default") + +``` + +In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one. + +- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to write syntax in Python I/O. + +- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to read syntax in Python I/O. + +- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed. + +## Example Programs + +### 1. RPC between Trainers and Parameter Servers + +### 2. 
Concurrent Minibatch Loading diff --git a/doc/fluid/design/concurrent/go_op.md b/doc/fluid/design/concurrent/go_op.md new file mode 100644 index 0000000000000000000000000000000000000000..c18b788e80f432ebb2f14b15229e7823c112001e --- /dev/null +++ b/doc/fluid/design/concurrent/go_op.md @@ -0,0 +1,231 @@ +# go_op Design + +## Introduction + +The **go_op** allows user's of PaddlePaddle to run program blocks on a detached +thread. It works in conjuction with CSP operators (channel_send, +channel_receive, channel_open, channel_close, and select) to allow users to +concurrently process data and communicate easily between different threads. + +## How to use it + +``` +channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) + +with fluid.Go(): + # Send a tensor of value 99 to "channel" on a detached thread + tensor = fill_constant(shape=[1], dtype='int', value=99) + tensor.stop_gradient = True + fluid.channel_send(channel, tensor) + +# Receive sent tensor from "channel" on the main thread +result = fill_constant(shape=[1], dtype='int', value=-1) +fluid.channel_recv(ch, result) +``` + +The go operator can be accessed by using the fluid.Go() control flow. This +will create a new sub block, where the user can add additional operators +to be ran on the thread. + +**Note:** Since back propegation is currently not support in the go_op, users +should ensure that operators in the go block does not require gradient +calculations. + +## How it Works + +Similar to other control blocks, go_op will create a sub block and add it +as a child to the current block. Operators and variables defined in this +block will be added to the go sub_block. + +In addition, the go operator will create a new child scope whose parent is +the global scope. Please refer to [block captures](#block-captures) for more +information. + +When Paddle executor runs go_op, go_op will take the sub_block and pass it to +the executor.run method (along with a newly created local scope) on a detached +thread. + +An example of the generated program description is shown below. Take note of +the **go_op** in particular. It is added as an operator in the current +block (in this example, block0). The **go_op** contains a `sub_block` +attribute, which points to the id of the block that will be executed in a +detached thread. + +``` +blocks { + idx: 0 + parent_idx: -1 + vars { + name: "return_value" + type { + type: LOD_TENSOR + lod_tensor { + tensor { + data_type: INT64 + } + } + } + } + vars { + name: "status_recv" + type { + type: LOD_TENSOR + lod_tensor { + tensor { + data_type: BOOL + } + } + } + } + ... + ops { + outputs { + parameter: "Out" + arguments: "channel" + } + type: "channel_create" + attrs { + name: "data_type" + type: INT + i: 7 + } + attrs { + name: "capacity" + type: INT + i: 0 + } + } + ops { + inputs { + parameter: "X" + arguments: "channel" + } + type: "go" + attrs { + name: "sub_block" + type: BLOCK + block_idx: 1 + } + } + ops { + inputs { + parameter: "Channel" + arguments: "channel" + } + outputs { + parameter: "Out" + arguments: "return_value" + } + outputs { + parameter: "Status" + arguments: "status_recv" + } + type: "channel_recv" + } + ... +} + +blocks { + idx: 1 + parent_idx: 0 + vars { + name: "status" + type { + type: LOD_TENSOR + lod_tensor { + tensor { + data_type: BOOL + } + } + } + } + ... 
+ + ops { + outputs { + parameter: "Out" + arguments: "fill_constant_1.tmp_0" + } + type: "fill_constant" + attrs { + name: "force_cpu" + type: BOOLEAN + b: false + } + attrs { + name: "value" + type: FLOAT + f: 99.0 + } + attrs { + name: "shape" + type: INTS + ints: 1 + } + attrs { + name: "dtype" + type: INT + i: 3 + } + } + ops { + inputs { + parameter: "Channel" + arguments: "channel" + } + inputs { + parameter: "X" + arguments: "fill_constant_1.tmp_0" + } + outputs { + parameter: "Status" + arguments: "status" + } + type: "channel_send" + attrs { + name: "copy" + type: BOOLEAN + b: false + } + } +``` + +## Current Limitations + +#### Scopes and block captures: + +Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a +block. When a block is executed, a new local scope is created from the parent +scope (ie: scope derived from the parent block) and associated with the new +child block. After the block finishes executing, then the local scope and +all associated variables in the scope is deleted. + +This works well in a single threaded scenario, however with introduction of +go_op, a child block may continue to execute even after the parent block has +exited. If the go_op tries to access variables located in the parent block's +scope, it may receive a segmentation fault because the parent scope may have +been deleted. + +We need to implement block closures in order to prevent access to parent +scope variables from causing a segmentation fault. As a temporary workaround, +please ensure that all variables accessed in the go block is not destructed +before it is being accessed. Currently, the go_op will explicitly enforce +this requirement and raise an exception if a variable could not be found in +the scope. + +Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502) +for more details. + +#### Green Threads + +Golang utilizes `green threads`, which is a mechnism for the runtime library to +manage multiple threads (instead of natively by the OS). Green threads usually +allows for faster thread creation and switching, as there is less overhead +when spawning these threads. For the first version of CSP, we only support +OS threads. + + +#### Backward Propegation: + +go_op currently does not support backwards propagation. Please use go_op with +non training operators. 
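+As a concrete illustration of the scope limitation described above, the sketch below is a hedged variation of the earlier usage example (the import paths, constant values, and the explicit `channel_close` call are assumptions for illustration): the detached thread hands its result back through a channel instead of relying on variables owned by the parent block's scope.
+
+```python
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.layers import fill_constant
+
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+with fluid.Go():
+    # Everything the detached thread needs is defined inside the go block;
+    # the result is handed back through the channel rather than read from
+    # variables owned by the parent block's scope.
+    tensor = fill_constant(shape=[1], dtype='int', value=42)
+    tensor.stop_gradient = True  # go_op does not support backpropagation
+    fluid.channel_send(ch, tensor)
+
+# The main thread blocks here until the detached thread has sent its result.
+result = fill_constant(shape=[1], dtype='int', value=-1)
+fluid.channel_recv(ch, result)
+fluid.channel_close(ch)
+```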
diff --git a/doc/fluid/design/concurrent/images/channel_recv.png b/doc/fluid/design/concurrent/images/channel_recv.png new file mode 100644 index 0000000000000000000000000000000000000000..c06cd15ae7b8a8c94d5742f6675e389081fcf789 Binary files /dev/null and b/doc/fluid/design/concurrent/images/channel_recv.png differ diff --git a/doc/fluid/design/concurrent/images/channel_send.png b/doc/fluid/design/concurrent/images/channel_send.png new file mode 100644 index 0000000000000000000000000000000000000000..006ebb4a5a4bcd32c97847e9fb7729a740255f7c Binary files /dev/null and b/doc/fluid/design/concurrent/images/channel_send.png differ diff --git a/doc/fluid/design/concurrent/images/select_op_workflow.png b/doc/fluid/design/concurrent/images/select_op_workflow.png new file mode 100644 index 0000000000000000000000000000000000000000..719ed76f9d542d6c4f20c30f27656bb53325aa85 Binary files /dev/null and b/doc/fluid/design/concurrent/images/select_op_workflow.png differ diff --git a/doc/fluid/design/concurrent/index_cn.rst b/doc/fluid/design/concurrent/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..e47135e9fc42760898083710e0a6767252a0225b --- /dev/null +++ b/doc/fluid/design/concurrent/index_cn.rst @@ -0,0 +1,8 @@ +并发编程 +------------ + +.. toctree:: + :maxdepth: 1 + + concurrent_programming.md + parallel_do.md diff --git a/doc/fluid/design/concurrent/index_en.rst b/doc/fluid/design/concurrent/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..0727e75798b2a869588f80d3cce7a886554e4ffb --- /dev/null +++ b/doc/fluid/design/concurrent/index_en.rst @@ -0,0 +1,8 @@ +Concurrent Programming +------------------------- + +.. toctree:: + :maxdepth: 1 + + concurrent_programming.md + parallel_do.md diff --git a/doc/fluid/design/concurrent/parallel_do.md b/doc/fluid/design/concurrent/parallel_do.md new file mode 100644 index 0000000000000000000000000000000000000000..42bd136f825986d94fafaeaa5f58edb02848a74c --- /dev/null +++ b/doc/fluid/design/concurrent/parallel_do.md @@ -0,0 +1,163 @@ +# Design Doc: Parallel_Do in PaddlePaddle + +In PaddlePaddle, we use parallel_do primitive to represent multithread data parallel processing. + +## Design overview + +The definition of a parallel_do op looks like the following + +```c++ +AddInput(kInputs, "Inputs needed to be split onto different devices").AsDuplicable(); +AddInput(kParameters, "Parameters are duplicated over different devices") + .AsDuplicable(); +AddInput(kPlaces, "Devices used for parallel processing"); +AddOutput(kOutputs, "Outputs needed to be merged from different devices").AsDuplicable(); +AddOutput(kParallelScopes, + "Scopes for all local variables in forward pass. 
One scope for each device"); +AddAttr(kParallelBlock, + "List of operaters to be executed in parallel"); +``` + +A vanilla implementation of parallel_do can be shown as the following (`|` means single thread and +`||||` means multiple threads) + +``` +In the forward pass + | Split input onto different devices + | Copy parameter onto different devices + |||| Compute forward pass in parallel + | Merge output from different devices + +In the backward pass + | Split output@grad onto different devices + |||| Compute backward pass in parallel + | accumulate param@grad from different devices to the first device + | Merge input@grad from different devices +  | Copy param@grad to the place of parallel_do_op +``` + +This implementation allows to write mixed device program like this + +```python +W1 = fluid.tensor(size=[100,20], parameter=true) +W2 = fluid.tensor(size=[20,15], parameter=true) + +data = layers.data() + +gpu_places = layers.get_place(use_gpu=True) +# parallel processing on multiple GPUs +pd = ParallelDo(gpu_places) +with pd.do(input=data): + prediction = softmax(fc(fc(data, W1), W2)) + write_output(prediction) +prediction = pd() +loss = cross_entropy(prediction, label) +``` + +And the programDesc are like the following + +``` +# start_program will be run by executor(CPUPlace), all w1, w2 will be allocated on CPU +start_program +{ + vars: w1, w2 + ops: init(w1), init(w2) +} + +main_program +{ +block0 { + vars: data, places, w1, w2, w1_grad, w2_grad, + ops: data, get_place, parallel_do(block1), + parallel_do_grad(block2), + sgd(w2, w2_grad), + sgd(w1, w1_grad) +} +block1 { # the forward pass + parent_block: 0 + vars: data, h1, h2, loss + ops: fc, fc, softmax +} +block2 { # the backward pass + parent_block: 1 + vars: data_grad, h1_grad, h2_grad, loss_gard, local_w1_grad, local_w2_grad + ops: softmax_grad, + fc_grad + fc_grad +} +} +``` + +## Performance Imporvement + +There are serial places we can make this parallel_do faster. + +### forward: split input onto different devices + +If the input of the parallel_do is independent from any prior opeartors, we can avoid this step by +prefetching the input onto different devices in a seperate background thread. And the python code +looks like this. +```python +pd = ParallelDo(gpu_places) +with pd.do(): +    feature = get_data_from_prefetch_queue(gpu_places) + prediction = my_net(feature) + write_output(activation) +``` + +### forward: Copy parameter to onto different devices + +We can avoid this step by making each device have a copy of the parameter. This requires: + +1. `fluid.default_start_up_program()` to be run on all devices +1. In the backward, allreduce param@grad at different devices, this requires + 1. `backward.py` add `allreduce` operators at parallel_do_grad + 1. `allreduce` operators need to be called in async mode to achieve maximum throughput +1. apply gradients related op(i.e. cliping, normalization, decay, sgd) on different devices in parallel + +By doing so, we also avoided "backward: accumulate param@grad from different devices to the first device". 
+And the ProgramDesc looks like the following
+
+```
+# w1, w2 will be allocated on all GPUs
+start_program
+{
+block0 {
+  parallel_do(block1)
+}
+block1 {
+  parent_block: 0
+  vars: w1, w2
+  ops: init(w1), init(w2)
+}
+}
+
+main_program
+{
+block0 {
+  vars: data, places, w1, w2
+  ops: data, get_place, parallel_do(block1),
+       parallel_do_grad(block2),  # append_backward
+       parallel_do(block3)        # append_optimization
+
+}
+block1 {
+  parent_block: 0
+  vars: data, h1, h2, loss
+  ops: fc, fc, softmax
+}
+block2 {
+  parent_block: 1
+  vars: data_grad, h1_grad, h2_grad, loss_grad, w1_grad, w2_grad
+  ops: softmax_grad,
+       fc_grad, allreduce(places, scopes, w1_grad),
+       fc_grad, allreduce(places, scopes, w2_grad)
+}
+block3 {
+  parent_block: 0
+  vars: lr
+  ops: sgd(w2, w2_grad),
+       sgd(w1, w1_grad)
+}
+}
+```
diff --git a/doc/fluid/design/concurrent/select_op.md b/doc/fluid/design/concurrent/select_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..4fcae57cc7932cdaebe549486e7f7cebf0bd038a
--- /dev/null
+++ b/doc/fluid/design/concurrent/select_op.md
@@ -0,0 +1,265 @@
+# select_op Design
+
+## Introduction
+
+In golang, the [**select**](https://golang.org/ref/spec#Select_statements)
+statement lets a goroutine wait on multiple communication operations at the
+same time. The **select** blocks until one of its cases can run, then
+executes that case. If multiple cases are ready to run, one case is
+chosen at random to be executed.
+
+With the introduction of CSP for Paddle, we mimic this behavior by
+creating a ***select_op***.
+
+## How to use it
+
+The **select_op** is available as a C++ operator; however, most users
+will prefer to use the much simpler Python API.
+
+- **fluid.Select()**: Creates a select operator and adds it to the current
+block within the main program. Also creates a sub block and adds it to the
+main program. This sub block is used to hold all variables and operators
+used by the case statements.
+
+Within the select block, users can add cases by
+calling the **select.case** or **select.default** method.
+
+- **fluid.Select.case(channel_action, channel, result_variable)**: Represents
+a fluid channel send/recv case. This method creates a SelectCase block
+guard and adds it to the Select block. The arguments to this method tell
+the select which channel operation to listen to.
+
+- **fluid.Select.default()**: Represents the fluid default case. This default
+case is executed if none of the channel send/recv cases are available to
+execute.
+ +**Example:** +``` +ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) +quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) + +x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) +y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1) + +while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True) +while_op = While(cond=while_cond) + +with while_op.block(): + with fluid.Select() as select: + with select.case(fluid.channel_send, channel, x): + # Send x, then perform Fibonacci calculation on x and y + x_tmp = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) + assign(input=x, output=x_tmp) + assign(input=y, output=x) + assign(elementwise_add(x=x_tmp, y=y), output=y) + with select.case(fluid.channel_recv, quit_channel, result2): + # Exit out of While loop + while_false = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False) + helper = layer_helper.LayerHelper('assign') + helper.append_op( + type='assign', + inputs={'X': [while_false]}, + outputs={'Out': [while_cond]}) +``` + +## How it Works + +### Program Description + +``` +blocks { + idx: 0 + ... + // Create "case_to_execute" variable + ops { + outputs { + parameter: "Out" + arguments: "fill_constant_110.tmp_0" + } + type: "fill_constant" + attrs { + name: "force_cpu" + type: BOOLEAN + b: false + } + attrs { + name: "value" + type: FLOAT + f: -1.0 + } + attrs { + name: "shape" + type: INTS + ints: 1 + } + attrs { + name: "dtype" + type: INT + i: 2 + } + } + // Create "select" operator. + // inputs: + // X: All input variables used by operators within the select block + // case_to_execute: Variable filled in by select_op when it determines + // which case to execute. + // + // outputs: + // Out: All output variables referenced by operators within select block. + // + // attrs: + // sub_block: The block id containing the select "cases" + // cases: Serialized list of all cases in the select op. + // Each case is serialized as: ',,,' + // where type is 0 for default, 1 for send, and 2 for receive. + // No channel and values are needed for default cases. + ops { + inputs { + parameter: "X" + arguments: "fill_constant_103.tmp_0" + arguments: "fill_constant_104.tmp_0" + } + inputs { + parameter: "case_to_execute" + arguments: "fill_constant_110.tmp_0" + } + outputs { + parameter: "Out" + arguments: "fill_constant_110.tmp_0" + } + type: "select" + attrs { + name: "sub_block" + type: BLOCK + block_idx: 1 + } + attrs { + name: "cases" + type: STRINGS + strings: "0,1,channel_101,fill_constant_109.tmp_0" + strings: "1,2,channel_102,fill_constant_108.tmp_0" + } + } + ... +} +``` + +The python select API will add the **select_op** to the current block. In addition, it will +iterate through all it's case statements and add any input variables required by case statements +into **X**. It will also create a temp variable called **case_to_execute**. This variable is +filled in by the select_op after it has completed processing the case statements. + +If there are no available cases to execute (ie: all cases are blocked on channel operations, and +there is no default statement), then the select_op will block the current thread. The thread will +unblock once there is a channel operation affecting one of the case statements, at which point, the +**select_op** will set the **case_to_execute** variable to the index of the case to execute. + +Finally the select_op will call executor.run on the **sub_block**. 
+ +``` +blocks { + idx: 1 + parent_idx: 0 + ... + // Fill a tensor with the case index (ie: 0,1,2,3,ect.) + ops { + outputs { + parameter: "Out" + arguments: "fill_constant_111.tmp_0" + } + type: "fill_constant" + attrs { + name: "force_cpu" + type: BOOLEAN + b: false + } + attrs { + name: "value" + type: FLOAT + f: 0.0 + } + attrs { + name: "shape" + type: INTS + ints: 1 + } + attrs { + name: "dtype" + type: INT + i: 2 + } + } + // Create an "equal" operator to compare the case index with the "case_to_execute" + // tensor (which was filled in by the select op). + ops { + inputs { + parameter: "X" + arguments: "fill_constant_111.tmp_0" // case 0 + } + inputs { + parameter: "Y" + arguments: "fill_constant_110.tmp_0" // case_to_execute + } + outputs { + parameter: "Out" + arguments: "equal_0.tmp_0" + } + type: "equal" + attrs { + name: "axis" + type: INT + i: -1 + } + } + // Use the output of the "equal" operator as a condition for the "conditional_block". + // If the condition evaluates to true, then execute the "sub_block" (which represents + // the select case's body) + ops { + inputs { + parameter: "Params" + } + inputs { + parameter: "X" + arguments: "equal_0.tmp_0" + } + outputs { + parameter: "Out" + } + outputs { + parameter: "Scope" + arguments: "_generated_var_0" + } + type: "conditional_block" + attrs { + name: "is_scalar_condition" + type: BOOLEAN + b: true + } + attrs { + name: "sub_block" + type: BLOCK + block_idx: 4 + } + } + ... + // Repeat the above operators for each case statements inside the select body +} + +``` + +Cases are represented by a **conditional_block operator**, whose's condition is set as the output of +equal(**case_to_execute**, **case_index**). Since each case index is unique in this sub-block, +only one case will be executed. + +### select_op flow + +

+![select_op workflow](./images/select_op_workflow.png)
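+
+The decision logic sketched in the figure can be summarized roughly as the
+following plain-Python pseudo-code. This is an illustration only and is not
+the actual C++ implementation of the select_op:
+
+```python
+import random
+import time
+
+def run_select(cases, default=None):
+    """Illustrative sketch of the select_op decision logic (not Fluid code).
+
+    `cases` is a list of (can_run, run) pairs: can_run() reports whether the
+    case's channel send/recv could complete without blocking, and run()
+    performs the channel operation together with the case body.
+    """
+    while True:
+        ready = [run for can_run, run in cases if can_run()]
+        if ready:
+            # Like Go's select, pick one ready case at random.
+            return random.choice(ready)()
+        if default is not None:
+            # No case is ready but a default exists: execute it immediately.
+            return default()
+        # The real operator blocks until a channel operation affects one of
+        # the cases; this sketch simply polls.
+        time.sleep(0.001)
+```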

+ +The select algorithm is inspired by golang's select routine. Please refer to +http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information. + +## Backward Pass + +TODO diff --git a/doc/fluid/design/data_type/float16.md b/doc/fluid/design/data_type/float16.md new file mode 100644 index 0000000000000000000000000000000000000000..844d2aafcf257b85057e1ac200ed3d5cf0be2ff0 --- /dev/null +++ b/doc/fluid/design/data_type/float16.md @@ -0,0 +1,183 @@ +# Design Doc: float16 + +## Why float16 +Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of traditional 32-bit single precision format (float) and has lower precision and smaller range. + +When high precision computation is not required (which is usually the case at least in the deep learning inference stage), using float16 data type could potentially + +- reduce storage space, memory bandwidth, and power usages; +- increase the chance of data fitting into a smaller cache of lower latency; +- provide arithmetic speed up if supported by hardware. + +## Survey of current float16 support +A brief survey of float16 support on different compilers, hardwares, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info. + +The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernels. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier. + +### Compiler +- nvcc supports `__half` data type after CUDA 7.5. +- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4. +- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9. + +### Hardware +- `__half` is supported on GPU with compute capability >= 5.3. +- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above. +- `__fp16` is supported as arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which is announced in May 2017. There seems to be no application processors currently available on market that adopts this architecture. It is reported that Qualcomm Snapdragon 845 uses Cortex-A75 design and will be available in mobile devices in early 2018). + +### Libraries +- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors. +- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU). + +### CUDA version issue +There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. 
+CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows: +``` +typedef struct __align__(2) { + unsigned short x; +} __half; + +typedef __half half; +``` +This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types: +``` +__global__ void Add() { + half a, b, c; + c = __hadd(a, b); // correct + c = a + b; // compiler error: no operator "+" matches these operands +} +``` +CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp). + +Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows: +``` +typedef struct __CUDA_ALIGN__(2) { + unsigned short x; +} __half_raw; + + +struct __CUDA_ALIGN__(2) __half { +protected: + unsigned short __x; +public: + // constructors and conversion operators from/to + // __half_raw and other built-in data types +} + +typedef __half half; + +__device__ __forceinline__ +__half operator+(const __half &lh, const __half &rh) { + return __hadd(lh, rh); +} + +// Other overloaded operators +``` +This new design makes `c = a + b` work correctly for CUDA half data type. + +## Implementation +The float16 class holds a 16-bit `uint16_t` data internally. +``` +struct float16 { + uint16_t x; +}; +``` + +float16 supports the following features: + - constructors / assignment operators that take input from primitive data types including bool, integers of various length, float, and double. + - constructors / assignment operators that take input from `__half` on cuda, `float16_t` on ARM, and `Eigen::half` on Eigen. + - conversion operators to primitive data types and half precision data types on cuda, ARM and Eigen. + - overloaded arithmetic operators for cuda, arm, and non-arm cpu, respectively. These operators will take advantage of the cuda and ARM intrinsics on the corresponding hardware. + +To support the above features, two fundamental conversion functions are provided: +``` +float16 float_to_half_rn(float f); // convert to half precision in round-to-nearest-even mode +float half_to_float(float16 h); +``` +which provides one-to-one conversion between float32 and float16. These twos functions will do different conversion routines based on the current hardware. CUDA/ARM instrinsics will be used when the corresonding hardware is available. If the hardware or compiler level does not support float32 to float16 conversion, software emulation will be performed to do the conversion. + +## float16 inference +In Fluid, a neural network is represented as a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), whose Python wrapper is a [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program). 
The basic structure of a program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program desc by executing the sequence of operators in the entrance block of the program one by one. + +### Operator level requirement +Each operator has many kernels for different data types, devices, and library types. The operator will select the appropriate kernel to run based on, among other things, the data type of the input variables. By default, every Fluid operator has a float data type kernel that takes float variables as input and generates float output. + +This means that if we provide float input to the first operator in a program, then each opeartor will use float kernel to compute float output and send it as input to the next operator to trigger the float kernel. Overall, the program will run in float mode and give us a final output of float data type. + +The same principle applies if we want a program to run in float16 mode. We provide input variable of float16 data type to the first operator, and then one by one, each operator in the program will run the float16 kernel (provided that each operator in this program has float16 kernels registered) until we finally obtain a float16 output variable. + +So the preliminary requirement for float16 inference is to add float16 kernel to operators that are needed in a specific kind of program. For example, float16 inference on an image classification neural network like Vgg or Resnet, typically requires the following operators to have float16 kernels: convolution, pooling, multiplication, addition, batch norm, dropout, relu, and softmax. Please refer to [new_op_en](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) for details of how to add new kernels to an operator. + +### Variable level requirement +Operators including convolution and multiplication (used in fully-connected layers) takes as input not only the variables generated by the preceding operators but also [parameter](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#parameter) variables, which contains the trained weights to apply to the input data. These weights are obtained in the Fluid training process and are by default of float data type. + +When these operators are running in float16 mode, the float16 kernel requires those parameter variables to contain weights of Fluid float16 data type. Thus, we need a convenient way to convert the original float weights to float16 weights. + +In Fluid, we use tensor to hold actual data for a variable on the c++ end. [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h) is used to bind c++ tensors of certain data type with numpy array of the correponding numpy data type on the Python end. Each common c++ built-in data type has a corresponding numpy data type of the same name. However, since there is no built-in float16 type in c++, we cannot directly bind numpy float16 data type with the Fluid float16 class. 
Since both Fluid float16 and numpy float16 use uint16 as the internal data storage type, we use c++ built-in type `uint16_t` and the corresponding numpy uint16 data type to bridge the gap via [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h). + +The following code demonstrates how to do the tensor conversion. +```Python +# var is the variable of float weights +# tensor is a numpy array of data copied from the tensor data in var +# fp16_var is the variable that will contain float16 weights converted from var +tensor = numpy.array(var.get_tensor()) +fp16_tensor = fp16_var.get_tensor() + +# After the original tensor data is converted to numpy float16 data type, +# view(numpy.uint16) is used so that the internal memory of the numpy array +# will be reinterpreted to be of uint16 data type, which is binded to +# Fluid float16 class via pybind with the help of uint16_t built-in c++ type +fp16_tensor.set(tensor.astype(numpy.float16).view(numpy.uint16), GPUPlace) +``` + +### Consistent API requirement +The basic inference in float16 mode requires users to feed input and obtain output both of float16 data type. However, in this way, the inference APIs are not consistent between float16 mode and float mode, and users may find it confusing and diffcult to use float16 inference since they need to do extra steps to provide float16 input data and convert float16 output data back to float. To have consistent API for different inference modes, we need to transpile the program desc in some way so that we can run float16 inference by feeding and fetching variables of float data type. + +This problem can be solved by introducing a type-casting operator which takes an input variable of certain data type, cast it to another specified data type, and put the casted data into the output variable. Insert cast operator where needed can make a program internally run in float16 mode. + +### float16 transpiler +Put all the above requirements in mind, we designed a float16 inference transpiler that can tranpile a float32 mode inference program desc to a float16 mode one. + +Given a float inference program and the corresponding variables of float32 weights in the [scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md), +this transpiler mainly does the following modifications: + +1. Insert cast operators at the beginning of the program so that the input float data will be converted to float16 data type before feeding to subsequent operators to invoke the float16 kernel. + +2. Insert cast operators at the end of the program so that the output float16 data will be converted back to float data type before users obtain the result. + +3. For each parameter variable of float weights, create in the scope a corresponding variable of float16 weights which are converted from the corresponding float weights and add this new float16 variable to the program. + +4. Update the operator information in the program so that each relevant operator use the newly created float16 variable instead of its float counterpart. 
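+
+Conceptually, the transpiled program then follows a cast-in / float16-compute /
+cast-out pattern. The numpy-only sketch below illustrates that data flow; it is
+not Fluid code and omits the ProgramDesc rewriting that the transpiler actually
+performs:
+
+```python
+import numpy as np
+
+x = np.random.rand(1, 8).astype(np.float32)   # users still feed float32 data
+w = np.random.rand(8, 4).astype(np.float32)   # float32 weights from training
+
+x_fp16 = x.astype(np.float16)                 # cast inserted at the start (step 1)
+w_fp16 = w.astype(np.float16)                 # float16 copy of the weights (step 3)
+y_fp16 = x_fp16.dot(w_fp16)                   # operators run their float16 kernels
+y = y_fp16.astype(np.float32)                 # cast inserted at the end (step 2)
+```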
+ +Below is an example of usage: +```Python +# Get the float inference program +[float_inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + +# Prepare the float input data +tensor_img = numpy.random.rand(1, 3, 32, 32).astype(numpy.float32) + +# Running inference_program in float mode +float_results = exe.run(float_inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + +# Use float16 transpiler to speedup +float16_inference_program = float_inference_program.clone() +t = fluid.InferenceTranspiler() +t.float16_transpile(float16_inference_program, GPUPlace) + +# Running +float16_results = exe.run(float16_inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) +``` + +As we can see from the example above, users can simply use the `float16_transpile` method provided by the infernece transpiler class on an existing float inference program to run inference in float16 mode. + +### Speedup on GPU +Currently, Fluid inference in float16 mode is only supported on Nvidia GPU device. There is no motivation to support float16 inference on non-ARM CPUs because float16 is not natively supported there and float16 calculation will only be slower than its float counterpart. + +Nvidia started to support its native float16 data type (which has the same internal memory representation as Fluid float16 class) on CUDA 7.5. Moreover, float16 speedups on common computational intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cublas 7.5 and cuDNN 5.0. + +Recently, the introduction of [tensor core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in volta architecture GPUs and the support of tensor core calculation in CUDA 9.0 and cuDNN 7.0 make float16 truly superior to float in certain deep learning applications. Please refer to this [benchmark report](https://github.com/kexinzhao/Paddle_benchmark/blob/master/float16_benchmark.md) for more details. diff --git a/doc/fluid/design/data_type/index_cn.rst b/doc/fluid/design/data_type/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..b60167b6b1599df69dfc5073ebf32bdbb0a316ec --- /dev/null +++ b/doc/fluid/design/data_type/index_cn.rst @@ -0,0 +1,7 @@ +数据类型 +------------ + +.. toctree:: + :maxdepth: 1 + + float16.md diff --git a/doc/fluid/design/data_type/index_en.rst b/doc/fluid/design/data_type/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..6a88d17943f49134a2d00363845e919537ff4545 --- /dev/null +++ b/doc/fluid/design/data_type/index_en.rst @@ -0,0 +1,7 @@ +Data Type +------------ + +.. toctree:: + :maxdepth: 1 + + float16.md diff --git a/doc/fluid/design/dist_train/README.md b/doc/fluid/design/dist_train/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2dd652d8bdcb8f3b6e759347bd55b217be909386 --- /dev/null +++ b/doc/fluid/design/dist_train/README.md @@ -0,0 +1,57 @@ +## Distributed training overview doc + +Currently Paddle Fluid use parameter server architecture to support distributed training. + +For synchronous and asynchronous training, the differences are mostly in the logic of parameter server. Now we have already support synchronous training. + +### Synchronous training + +The training process of synchronous training is: + +![synchronous distributed training](./src/sync_distributed_training.png) + +1. Pserver + 1. set `barrier_condition_` to 0 and waits for trainers to send gradient. 
+1. Trainer + 1. Trainer read minibatch of data, run forward-backward with local parameter copy and get the gradients for parameters. + 1. Trainer use split op to split all the gradient into blocks. The split method is determined at compile time. + 1. Trainer use send_op to send all the split gradients to corresponding parameter server. + 1. After trainer send all the gradients, it will send a `BATCH_BARRIER_MESSAGE` to all pservers. + 1. Trainer call GetVariable to pserver and wait for `barrier_condition_` on pserver to be 1. +1. Pserver + 1. Pserver will count the number of `BATCH_BARRIER_MESSAGE`. + 1. When the count of `BATCH_BARRIER_MESSAGE` is equal to the number of Trainer. Pserver thinks it received all gradient from all trainers. + 1. Pserver will run the optimization block to optimize the parameters. + 1. After optimization, pserver set `barrier_condition_` to 1. + 1. Pserver wait for `FETCH_BARRIER_MESSAGE`. +1. Trainer. + 1. The trainer uses GetVariable to get all the parameters from pserver. + 1. Trainer sends a `FETCH_BARRIER_MESSAGE` to each pserver. +1. Pserver. + 1. when the number of `FETCH_BARRIER_MESSAGE` reach the number of all trainers. Pserver think all the parameters have been got. it will go back to 1. to set `barrier_condition_` to 0. + +### Asynchronous training +In the above process. There are two barriers for all trainers to synchronize with each other. In asynchronous training, these two barriers are not needed. The trainer can just send gradients to pserver and then get parameters back. + +The training process of asynchronous training can be: + +![asynchronous distributed training](./src/async_distributed_training.png) + +1. Pserver: + 1. Each parameter has a queue to receive its gradient from trainers. + 1. Each parameter has a thread to read data from the queue and run optimize block, using the gradient to optimize the parameter. + 1. Using an independent thread to handle RPC call `GetVariable` for trainers to get parameters back.(Maybe here we should use a thread pool to speed up fetching the parameters.) + +1. Trainer: + 1. Trainer read a batch of data. Run forward and backward with local parameter copy and get the gradients for parameters. + 1. Trainer split all gradients to blocks and then send these gradient blocks to pservers(pserver will put them into the queue). + 2. Trainer gets all parameters back from pserver. + +### Note: +There are also some conditions that need to consider. For exmaple: + +1. If trainer needs to wait for the pserver to apply it's gradient and then get back the parameters back. +1. If we need a lock between parameter update and parameter fetch. +1. If one parameter must be on one server, or it can also be split and send to multiple parameter servers. + +The above architecture of asynchronous training can support different mode, we can have a detailed test in the future for these problems. diff --git a/doc/fluid/design/dist_train/async_update.md b/doc/fluid/design/dist_train/async_update.md new file mode 100644 index 0000000000000000000000000000000000000000..248d2ec18dafdecac9184527638754b6ba4d85b8 --- /dev/null +++ b/doc/fluid/design/dist_train/async_update.md @@ -0,0 +1,61 @@ +# Design Doc: Asynchronous Update With Distributed Training + +## Background + +For the typical synchronous distributed training, some significant steps are as follows: + +1. A trainer process will compute the gradients and **send** them to the parameter server (PS) nodes. +1. 
After the PS node received gradients came from all the Trainers, It will aggregate the +gradient variables for the same parameter into one gradient variable and then apply the aggregated +gradient to the respective parameter, finally using an optimize algorithms(SGD, Monument...) +to update the parameters. +1. The Trainer would wait for the PS finished the optimize stage, and GET the parameters from PS, +so all the Trainers would get the same parameters. + +In Synchronous Distributed Training, there is a **barrier** on each PS to wait until all trainers processes +have completed running current mini-batch. After that, all trainers can continue to run the next +mini-batch. So, we can find that the overall performance of Synchronous Distributed Training depends +on the slowest node. + +In Asynchronous Distributed Training, we don't need to wait for a global mini-bach, the optimizer on +the PS will run immediately when the gradient is uploaded to the PS from one trainer. This mode would +train such models that achieve scaling, better throughput. In this design doc, we will introduce how to +implement the Asynchronous Distributed Training base on PaddlePaddle Fluid. + +## Design + + + +As the figure above, we describe a global view of the asynchronous update process and use +the parameter `w1` as an example to introduce the steps: +1. For each gradient variables, they may distribute on different GPU card and aggregate +them while they are all calculated. +1. Split the gradient variable into multiple blocks according to the number of PS +instances and then send them. +1. PS would run an `Optimize Block` using a specified optimize algorithm to update +the specified parameter. +1. The trainer will fetch the latest parameter from PS before running forward Op which depends +on the specified parameter. +1. Broadcast the received variable into multiple GPU cards and continue to run the next +mini-batch. + +### Trainer + +- For the multiple devices distributed training, we need to aggregate the gradient +variables which placed on different devices firstly and then schedule a `SendVars` Operator to +send the gradient variables to the multiple PS instances. +- Schedule `FetchVars` operator to fetch the latest parameter from PS before running +the forward ops. +- There could be a large number of gradient variables to be sent, so we need to use another +thread pool(IO Threadpool) whose a number of the schedulable threads is larger than the +computing thread pool to avoid competitive the thread resources with computing. + +### Parameter Server + + + +- There should be multiple trainer instances want to optimize the same parameter at +the same time, to avoid the racing, we need one `BlockingQueue` for each gradient +variable to process them one by one. +- We need a `Map` structure to map a gradient variable name to the `OptimizeBlock` which +can optimize the respective parameter. diff --git a/doc/fluid/design/dist_train/dist_train_nccl2.md b/doc/fluid/design/dist_train/dist_train_nccl2.md new file mode 100644 index 0000000000000000000000000000000000000000..b8b8427811cddcddf872db5badfd37c96a76c3e3 --- /dev/null +++ b/doc/fluid/design/dist_train/dist_train_nccl2.md @@ -0,0 +1,35 @@ +# Distributed Training with NCCL2 + +We design a pattern that can enable training with `ParallelExecutor` and +use [NCCL2](https://developer.nvidia.com/nccl) as it's collective +communication library. + +In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast` +to do multi GPU training. 
And if we initialize NCCL2 communicators as +ranks in a distributed environment, we can simply run the `ParallelExecutor` +as a distributed program! The only thing that may be different than in +the single node version is that we need to broadcast the NCCL unique ID +to all the nodes and initialize communicators using that ID, so NCCL2 +can know each other as ranks. + +To achieve this feature, we introduce a new operator: `gen_nccl_id` op, +so we are ***not*** "bind to" running NCCL2 with MPI, we can run it in +whatever platform you like. + +It has two running modes: + +1. Generate and broadcast mode, which should be used on trainer 0; +1. Listen and fetch mode, which should be used on trainers other than 0. + +In both two modes, this op can save the NCCL ID into current scope as a +persistable variable, Then we can insert this op at the end of +"startup program" of fluid, so that all workers can get the same ID to +initialize NCCL communicator objects. + + + +The above figure indicates the general process when training with NCCL2 +distributed. Each trainer has the number of communicators equal to the +number of GPUs, but the ranks should match the global ranks number: here +we have total 8 GPUs, so `nranks==8`, for each trainer, the ranks should +be from 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1. diff --git a/doc/fluid/design/dist_train/distributed_architecture.md b/doc/fluid/design/dist_train/distributed_architecture.md new file mode 100644 index 0000000000000000000000000000000000000000..371bbeebf7559eccc77ba0eea4f6f87a1bc5b54a --- /dev/null +++ b/doc/fluid/design/dist_train/distributed_architecture.md @@ -0,0 +1,197 @@ +# Design Doc: Fluid Distributed Training Architecture + +## Abstract + +PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has few limitations: + +1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc. + +2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers. + +3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries. + +This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations. + +## Analysis + +The assumption is that the user writes the trainer program in either Python or C++. + +### Limitation 1 + +There are two basic functionalities in the trainer program: + +1. The training logic such as loading / saving the model and printing out the logs. +2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the + optimizer. + +When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the +training logic as well as the neural network computation logic, is replicated. 
+ +The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not** +replicate the training logic, the limitation mentioned above can be avoided. + +### Limitation 2 + +Model parallelism means that a single model is partitioned into different components and each node runs one of the component separately. This comes at the extra cost of managing the +inter-model-shard communication between nodes. + +PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the +computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup. + +Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows: + + + +PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component: + + + +The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation. + +### Limitation 3 + +The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly. + +This could be fixed by making the parameter server also run an IR, which can be different to the trainer side +For a detailed explanation, refer to this document - +[Design Doc: Parameter Server](./parameter_server.md) + +## Distributed Training Architecture + +The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so: + + + +The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*. + +### Python API + +Python API is the Python library that user's Python code invokes, to read the data, build the neural network topology, and start training, etc. + +```Python +images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') +label = fluid.layers.data(name='label', shape=[1], dtype='int64') +... +predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax") +cost = fluid.layers.cross_entropy(input=predict, label=label) +avg_cost = fluid.layers.mean(x=cost) +optimizer = fluid.optimizer.Adam(learning_rate=0.01) +optimizer.minimize(avg_cost) + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +exe = fluid.Executor(place) + +for pass_id in range(10): + for data in train_reader(): + loss, acc = exe.run(trainer_prog, + feed=feeder.feed(data), + fetch_list=[avg_cost]) +``` + +The code above is a typical local training program, the "Training Program" is built using helper functions such as +`fluid.layer.fc`. The training is done by calling `Executor.run` +iteratively. + +For more details, the implementation of IR is [Program](../program.md), and `ProgramDesc` is the protobuf type. + +[Executor](../executor.md) simply runs the `ProgramDesc`. 
For local training you generally use +`Executor` to run the program locally. For any kind of distributed training, you can use +`RemoteExecutor` to specify desired distributed training method with some optional arguments. + +### Distributed Transpiler + +The Distributed Transpiler automatically converts the IR (in protobuf format) to partitioned IRs. Then +the Remote Executor dispatches the new IRs to Remote Executors across the cluster. +Below are the steps that are followed : + +1. User only need to change `Executor` to `RemoteExecutor` to change local program to distributed program. +1. `RemoteExecutor` calls `Distributed Transpiler` to "transpile" user's program to several IRs representing a + distributed training program: + 1. Parse configurations from `RemoteExecutor`. + 1. Determine the type of distributed program, can be DataParallelism, ModelParallelism or Streaming. + 1. Partition the `ProgramDesc` according to type and add `send` / `recv` OP pair on the boundaries. Take + DataParallelism type for example, it removes the optimization operators and add a `send` OP to the + "trainer" role, then add the optimization operators to the parameter server role within the `recv` OP. +1. Dispatch the partitioned graph to different `RemoteExecutor` in the cluster. +1. `RemoteExecutor` on each node run the received `ProgramDesc` utill the end. + + +### RemoteExecutor + +As shown in the graph, `RemoteExecutor.run` sends the IR to the cluster for Execution. +You can also use parameter `fetch_list` to interactively fetch variable back to local for +log printing. + +The Python `RemoteExecutor` is derived from `Executor` class. + +```python +exe = RemoteExecutor( + feed=feeder.feed(data), + fetch_list=[avg_cost], + job_desc=JobDesc( + jobname, + num_trainer, + num_pserver, + cpu_per_trainer, + gpu_per_trainer, + mem_per_trainer, + cpu_per_pserver, + mem_per_pserver + )) +for data in train_reader(): + loss, acc = exe.run(trainer_prog, + feed=feeder.feed(data), + fetch_list=[avg_cost]) +``` + +`JobDesc` object describe the distributed job resource specification to run on +Cluster environment. + + + +`RemoteExecutor.run` sends the `ProgramDesc` and +[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource) +to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible +to start the final Kubernetes Jobs to run the different role of `ProgramDesc` from `ConfigMap`. + + +### Placement Algorithm + +Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible. + +In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm. + + +### Local Training Architecture + +The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime: + + + + +### Training Data + +In PaddlePaddle v0.10.0, training data is typically read +with [data reader](./README.md) from Python. 
This approach is +no longer efficient when training distributedly since the Python +process no longer runs on the same node with the trainer processes, +the Python reader will need to read from the distributed filesystem +(assuming it has the access) and send to the trainers, doubling the +network traffic. + +When doing distributed training, the user can still use Python data +reader: the training data are sent with `Executor.run`. However, should +be used for debugging purpose only. The users are encouraged to use +the read data OPs. + + +## References: + +[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) + +[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf) diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md new file mode 100644 index 0000000000000000000000000000000000000000..e284e1ec5cdd18d0049ce3c1a8349bbe1248cb48 --- /dev/null +++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md @@ -0,0 +1,89 @@ +# Design Doc: Distributed Lookup Table Operator + +A distribute lookup table operator in PaddlePaddle where the table could be out +of the memory of a computer. + +## Background + +A lookup table operator is well-used in deep learning for learning the +representation, or the +[*embedding*](http://www.cs.toronto.edu/~fritz/absps/ieee-lre.pdf), of +symbols. + +### The Forward Algorithm + +The forward algorithm of the lookup table is a multiplication of the +input vector x and the lookup table matrix W: + +$$y = x * W$$ + +When x is a sparse vector of symbols, the above multiplication +simplifies into looking up rows in W that correspond to symbols in x, +denoted by W(x). Please be aware that W could be huge and out of the +memory, so we'd need a distributed storage service, which supports the +lookup of rows. + +The following figure illustrates the multiplication of x with two +non-zero elements, or say two symbols, and a lookup table W: + +![lookup table](./src/lookup_table.png) + +### The Backward Algorithm + +The backward algorithm computes W'(x) using W(x). W'(x) has the same +the scale of size as W(x) and is much smaller than W. + +To optimize W given W', we can do simple SGD update: + +$$W = f(W') = \lambda * W'$$ + +or some more sophisticated algorithms that rely on both W' and W: + +$$W = f(W, W')$$ + +The following figure illustrates the backward pass of the lookup +operator: ![lookup table training](./src/lookup_table_training.png) + +## Distributed Lookup Table +### Problem 1: The lookup table may be very large. + + In the condition like the search engine and recommendation system, the number of feature Id may be very large, say 100,000,000,000, then for a float value lookup table of size 8, the total size of the table is: + + ``` + 100,000,000,000 * 8 * 4(Bytes) = 2980.23 GB + ``` + +### Solution: Distributed storage + +1. Paddle use [SelectedRows](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/selected_rows.md) as the storage format for the lookup table, the lookup table parameter will be split to multi-machine according to the hash of the feature ID, and data will also be split and send to the same machine to prefetch the parameter. + +1. 
For common parameters, the trainer will get the whole parameter for training, but for the big lookup table, the trainer can not store the whole parameter. Because the input data feature is very sparse, every time we only need a few parameters for training, so we use `prefetch_op` to only prefetch the parameter needed to trainer. + +### Problem 2. The Id in the lookup table is not sure before training. + + The feature Id is calculated by the hash function because the feature data source is so large, we can not get all the Id before training. So we can not initialize the table before training. + +### Solution: Id auto growth + +At the beginning of training, paddle only malloc the memory for the lookup table at parameter server side, the Id and it's value will not be initialized. During training, when a parameter server received an Id, if it is already in the lookup table, it will return the existing parameter, if the Id does not exist, paddle will add it into the lookup table and initialize the value for it. + +### Problem 3: parameter load and save + +For common parameters, paddle use trainer to save and load them. But for distributed lookup table, trainer cannot do this because it's large size. + +### Solution: Parameter server side save and load + +Paddle support parameter server side save and load for distribute lookup table. Each machine of parameter servers will only save and load part of the whole table. + +## Architecture +The whole architecture of the distribute lookup table is as below: + +### Training steps: +1. Read a batch of data, the data is feature ids. +1. The input ids will be split by `split_ids_op` with the same hash function of the lookup table. +1. The `prefetch_op` use the split result to prefetch parameters back from the lookup table. +1. Run forward-backward to get the gradient of the lookup table. +1. `split_ids_op` split the gradient and then use `send_op` to the parameter server. +1. parameter server update the table with the received gradient. + +![distribute lookup table](./src/distributed_lookup_table.jpeg) diff --git a/doc/fluid/design/dist_train/distributed_traing_review.md b/doc/fluid/design/dist_train/distributed_traing_review.md new file mode 100644 index 0000000000000000000000000000000000000000..c09b7c99159ace9b3df989f803ede20bc3585d92 --- /dev/null +++ b/doc/fluid/design/dist_train/distributed_traing_review.md @@ -0,0 +1,44 @@ +# Parallelism, Asynchronous, Synchronous, Codistillation + + +For valuable models, it’s worth using more hardware resources to reduce the training time and improve the final model quality. This doc discuss various solutions, their empirical results and some latest researches. + +# Model Parallelism +In some situations, larger and more complex models can improve the model quality. Sometimes, such models cannot fit in one device. Sometimes, parts of the model can be executed in parallel to improve speed. Model Parallelism address the issues by partitioning a single model and place the shards on several devices for execution. + +A common way of model parallelism is partition the logic of “gradient application” to parameter servers, while leaving the forward and backward computation at training servers. + +More flexible model parallelism is challenging. For example, multi-level-single-direction LSTM can be partitioned by layers, while such solution is not helpful for bi-directional LSTM. Different models can have quite different ways of partitioning and the benefits also depend on the underlying hardware. 
Framework needs to provide flexible APIs for user to define the customized partition scheme. For example, in TensorFlow, user can use tf.device() to specify the device placement. In MxNet, mx.AttrScope(ctx_group='dev1') does similar things. Recent research proposes to automatically find the optimal partition scheme with Reinforcement Learning, which is essentially solution space search algorithm that could cost a lot of extra hardware sources. + +# Data Parallelism +Data Parallelism runs the same model on multiple devices, each taking in a partition of the input batch. It’s more commonly used for a few reasons. It generally applies to common SGD mini-batch training. Compared with model parallelism, which requires users to carefully partition their model and tune for good performance, data parallelism usually involves no more than calling an extra API and speed up is more predictable. + +# Asynchronous Training +In asynchronous training, it usually involves a set of trainers and a set of parameter servers. The parameter servers collectively hold a single copy of shared parameters. While the trainers each holds a unique copy of model and trains the model independently. Each trainer pulls parameters from parameter servers and sends gradients to the parameter servers independently. Similarly the parameter servers applies the gradients to parameters as soon as the gradients are received and sends parameters whenever they are requested. + +In theory, asynchronous training is not safe and unstable. Each trainer is very likely using stale copy of parameters and parameters are also likely to apply stale gradients. However, in practice, especially for large-scale nonconvex optimization, it is effective [1]. Compared with synchronous solution, which will be discussed later, asynchronous distributed training is easier to implement and scales to a few dozen workers without losing much performance due to network communication or other overhead. Besides, asynchronous training can make progress even in case of random trainer failure in the cluster. + +Many production models, such as [3], are trained with distributed asynchronous solutions due to its scalability and effectiveness in practice. However, asynchronous training has its limitations. Usually, it’s not as stable as synchronous training. A warm-up phase is sometimes needed. Learning rate is usually smaller compared with synchronous training and decay is also often needed. Normally, asynchronous training doesn’t scale beyond 100 trainers. In other words, when putting more trainers beyond that, the model cannot converge faster. + +# Synchronous Training +Unlike asynchronous training, synchronous training requires step barriers. Parameter servers needs to wait for gradients from all trainers before they are applied to parameters and trainers will always pull the latest parameters. + +An obvious advantage of synchronous training is that the behavior is more clearly defined. Usually, it's more stable than asynchronous training. Learning rate can be set larger and for some vision tasks, the final accuracy can be slightly higher. (In my practical experience, for some models, it can actually be worse). + +Synchronous training usually faces scalability and performance issues, if not carefully implemented or deployed. In [2], native synchronous training can be 20%~40% slower than asynchronous training. A common trick to avoid slowness, discussed in [1] and [2], is to have backups. 
N+M replicas are scheduled while only the first N is needed for the training step the proceed. + +Similar to asynchronous training, the benefit of synchronous training diminishes quickly. Depending on the models, increasing the number of trainers (effectively batch size) beyond a point won’t delivers faster converge time or better final model quality. + +# Codistillation +Codistillation is a technique that tries to scale the training further. A few training instance (each training instance can be distributed) are performed during the same period. Each training instance has extra losses that comes from the prediction of other training instances. (likey teacher and student) The training process converges faster and usually converge to a better model quality. [4] + + +# Reference + +[1] Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Andrew Senior, Paul Tucker, Ke Yang, Quoc V Le, et al. Large scale distributed deep networks. + +[2] Jianmin Chen, Rajat Monga, Samy Bengio, and Rafal Jozefowicz. Revisiting distributed synchronous SGD. + +[3] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine translation system: Bridging the gap between human and machine translation. + +[4] LARGE SCALE DISTRIBUTED NEURAL NETWORK TRAINING THROUGH ONLINE DISTILLATION diff --git a/doc/fluid/design/dist_train/index_cn.rst b/doc/fluid/design/dist_train/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..ed6f3dda271d2de58d92aa7ec804fa9e68dfc48a --- /dev/null +++ b/doc/fluid/design/dist_train/index_cn.rst @@ -0,0 +1,9 @@ +分布式训练 +------------ + +.. toctree:: + :maxdepth: 1 + + distributed_architecture.md + distributed_lookup_table_design.md + parameter_server.md diff --git a/doc/fluid/design/dist_train/index_en.rst b/doc/fluid/design/dist_train/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..f84688f168021113bd933802709bcd787b474bca --- /dev/null +++ b/doc/fluid/design/dist_train/index_en.rst @@ -0,0 +1,9 @@ +Distributed Training +--------------------- + +.. toctree:: + :maxdepth: 1 + + distributed_architecture.md + distributed_lookup_table_design.md + parameter_server.md diff --git a/doc/fluid/design/dist_train/mpi_enabled_design.md b/doc/fluid/design/dist_train/mpi_enabled_design.md new file mode 100644 index 0000000000000000000000000000000000000000..4ad3afc7b7522c60460c6f1f387f9415d3738778 --- /dev/null +++ b/doc/fluid/design/dist_train/mpi_enabled_design.md @@ -0,0 +1,46 @@ +# MPI-enabled PaddlePaddle Design doc + +# Background +When we do distribute multi GPU training, the communication overhead between servers become the major bottleneck, because of the following reasons: +1. Must copy at least once from GPU to CPU memory so that the data can be ready to transfer. And for the pserver side, copy data from CPU to GPU introduce more overhead. +2. GPU->CPU data transfer is 10 times slower than data transfer between GPUs or between PCIe devices. +3. TCP connections can not make full use of RDMA 100Gb devices. + +We will use OpenMPI API to PaddlePaddle, which can bring two benefits to PaddlePaddle: +1. Enable RDMA with PaddlePaddle, which bring high-performance low latency networks. +2. Enable GPUDriect with PaddlePaddle, which bring the highest throughput and lowest latency GPU read and write. + +# Change list +* Compile args: Need add compile args to enable MPI support. 
+* Execute args: Need to add execute args to assign when and how to use MPI operations.
+* New ops: Need new ops ```mpi_send_op``` and ```mpi_listenandserve_op``` to support MPI send and receive.
+* Transpiler optimization: The transpiler can add ```mpi_send_op``` and ```mpi_listenandserve_op``` to the running graph.
+* MPI utils package: Need an MPI utils package as the supporting low-level API.
+
+## Compile args
+Because MPI and CUDA need hardware support, we will add compile args to enable MPI support and control compiling. Add the ```WITH_MPI``` compile arg to control whether MPI is used. If ```WITH_MPI``` is ```ON```, the compile system will look for the OpenMPI installation during configuration. We should prepare the OpenMPI environment before compiling.
+
+## Execute args
+Launch the script using the ```mpirun``` launcher, for example: ```mpirun -np 3 -hosts node1,node2,node3 python train.py```. By doing this, we can number the actors (trainer/pserver/master) from 0 to (n-1). The node's number is the rank of the calling process in the communicator (an integer), and the MPI processes identify each other using this rank ID. We have to create a mapping between PaddlePaddle's nodes and their rank IDs so that we can communicate with the correct destinations when using MPI operations.
+
+## New ops
+We won't replace all the gRPC requests with MPI requests: the standard gRPC library is still used for all administrative operations, while the MPI API is used to transfer tensors or SelectedRows to pservers. Based on this idea, we create two new operators to handle sends and receives, ```mpi_send_op``` and ```mpi_listenandserve_op```. They are similar to [send_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/send_op.cc) and [listen_and_serv_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/listen_and_serv_op.cc); in addition, we will build a new module to package the MPI send and receive process.
+
+### mpi_send_op
+Very similar to ```send_op```; we will replace the gRPC code used to send gradients with ```mpi_module```, and at the same time wrap it with ```framework::Async```.
+
+### mpi_listenandserve_op
+Very similar to ```listen_and_serv_op```; we will replace the gRPC code used to receive gradients with ```mpi_module```, and at the same time wrap it with ```framework::Async```.
+
+## Transpiler optimization
+**We can read the environment variables ```OMPI_COMM_WORLD_SIZE``` and ```OMPI_COMM_WORLD_RANK``` to detect whether MPI is being used; if we launch with OpenMPI, these variables must exist in the environment.** If MPI use is confirmed, we will change ```send_op``` to ```mpi_send_op``` in distribute_transpiler, and change ```listenandserve_op``` to ```mpi_listenandserve_op``` as well.
+
+## MPI utils package
+In this package, we wrap the OpenMPI low-level API.
+The APIs included in this package are:
+* MPI send and receive module. We will build a new module to package the MPI send and receive process. MPI send and receive differ from gRPC: the MPI [receive](https://www.open-mpi.org/doc/v1.8/man3/MPI_Irecv.3.php) must know the receive buffer size and element type in advance. For this reason, we have to communicate twice: the first communication sends metadata about the gradient through gRPC, and the second is the real communication through MPI, which sends the gradient data to mpi_listenandserve_op (see the sketch at the end of this section).
+The detailed flow is below:
+![](https://github.com/seiriosPlus/Paddle/blob/mpi_enabled/doc/fluid/design/dist_train/src/mpi_module.png)
+* MPI global configurations, which store the rank ID and the mapping in global variables, for example:
+gRPC client : MPI nodes :``` 127.0.0.1:32004 : 3 ```
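The design above is about new C++ operators; purely to illustrate the two-phase protocol (metadata first, then the raw buffer), here is a small sketch using `mpi4py` and NumPy. It is not PaddlePaddle code: the pickled `send` below stands in for the gRPC metadata call, and names such as `TAG_META`/`TAG_DATA` are invented for the example.

```python
# Run with e.g.: mpirun -np 2 python two_phase_send.py
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
TAG_META, TAG_DATA = 0, 1  # illustrative tags, not real PaddlePaddle constants

if rank == 0:
    # Trainer side (the mpi_send_op role): send a gradient to rank 1.
    grad = np.random.rand(4, 5).astype(np.float32)
    # Phase 1: metadata -- in the design above this travels over gRPC.
    comm.send({"shape": grad.shape, "dtype": "float32"}, dest=1, tag=TAG_META)
    # Phase 2: the raw tensor buffer over MPI (where RDMA/GPUDirect can apply).
    req = comm.Isend([grad, MPI.FLOAT], dest=1, tag=TAG_DATA)
    req.Wait()
else:
    # Parameter-server side (the mpi_listenandserve_op role).
    meta = comm.recv(source=0, tag=TAG_META)
    buf = np.empty(meta["shape"], dtype=meta["dtype"])  # size known before Irecv
    req = comm.Irecv([buf, MPI.FLOAT], source=0, tag=TAG_DATA)
    req.Wait()
    print("received gradient with shape", buf.shape)
```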
diff --git a/doc/fluid/design/dist_train/multi_cpu.md b/doc/fluid/design/dist_train/multi_cpu.md new file mode 100644 index 0000000000000000000000000000000000000000..38222d083084ebfca3099ce96b47868c42d55101 --- /dev/null +++ b/doc/fluid/design/dist_train/multi_cpu.md @@ -0,0 +1,43 @@ +# Design Doc: Execute the Program with Multi CPU
+
+## Abstract
+
+This design doc proposes an approach to run the user-defined Op graph on multiple CPUs: an auto transpiler converts the user-defined Op graph into a multi-CPU Op graph, and a `ParallelDo` Op runs that graph.
+
+## Transpiler
+
+*(figure omitted: single-thread graph)*
+
+After conversion:
+
+*(figure omitted: multi-threads graph)*
+
+## Implementation
+
+- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph, which will be executed with multiple threads.
+- `BlockingCounter` will `Init/Decrement` an atomic counter, and blocking `Wait` until the atomic counter becomes `0`:
+  ```cpp
+  BlockingCounter bc(thread_count);
+  for (int i = 0; i < thread_count; ++i) {
+    thread_pool->Start([&bc] { bc.DecrementCount(); });
+  }
+  bc.Wait();
+  ```
+- `ParallelDo` Operator
+  - Initialize a thread pool, which is a singleton.
+  - Take a block id as the input, and run the specified Block on independent scopes with multiple threads.
+  - Initialize a `BlockingCounter` instance and wait until all threads are done.
+- `Split` Operator will split the input Tensor into a TensorArray.
+- `Merge` merges all the gradients calculated in different threads with a `mean/sum/max/min...` method, and then runs the Optimizer Op to optimize `W`.
+
+## TODO
+
+- Improve the optimizer stage with multiple threads, since we could assign the parameters to different threads and execute the optimizer with multiple threads.
diff --git a/doc/fluid/design/dist_train/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md new file mode 100644 index 0000000000000000000000000000000000000000..563b70bc0e852bec953eb40dda3c46b3d45d7e68 --- /dev/null +++ b/doc/fluid/design/dist_train/parameter_server.md @@ -0,0 +1,106 @@ +# Design Doc: Parameter Server
+
+## Abstract
+
+We propose an approach to implement the parameter server. In this approach, there is no fundamental difference between the trainer and the parameter server: they both run subgraphs, but subgraphs with different purposes.
+
+## Background
+
+The previous implementations of the parameter server do not run a fluid sub-program. Parameter initialization, optimizer computation, network communication and checkpointing are implemented twice, on both the trainer and the parameter server.
+
+It would be great if we could write code once and use it on both the trainer and the parameter server, since this reduces code duplication and improves extensibility. Given that after the current refactoring we represent everything as a computation graph on the trainer, representing everything as a computation graph on the parameter server becomes a natural extension.
+
+## Design
+
+### Distributed Transpiler
+
+The *Distributed Transpiler* converts the user-defined fluid program into sub-programs to be scheduled on different nodes with the following steps:
+
+1.
OP placement: the OPs will be placed on different nodes according + to a heuristic that minimizes the estimated total computation + time. Currently we will use a simple heuristic that puts parameter + variable on parameter server workers and everything else on trainer + workers. +1. Add communication OPs to enable the communication between nodes. + +We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*. + +Below is an example of converting the user defined graph to the +subgraphs for the trainer and the parameter server: + + + +After converting: + + + +1. The parameter variable W and its optimizer program are placed on the parameter server. +1. Operators are added to the program. + - *Send* sends data to the connected *Recv* operator. The + scheduler on the receive node will only schedule *Recv* operator + to run when the *Send* operator has ran (the *Send* OP will mark + the *Recv* OP runnable automatically). + - *Enqueue* enqueues the input variable, it can block until space + become available in the queue. + - *Dequeue* outputs configurable numbers of tensors from the + queue. It will block until the queue has the required number of + tensors. + +### Sparse Update + +For embedding layers, the gradient may have many rows containing only 0 when training, +if the gradient uses a dense tensor to do parameter optimization, +it could spend unnecessary memory, slow down the calculations and waste +the bandwidth while doing distributed training. +In Fluid, we introduce [SelectedRows](../modules/selected_rows.md) to represent a list of rows containing +non-zero gradient data. So when we do parameter optimization both locally and remotely, +we only need to send those non-zero rows to the optimizer operators: + + +### Benefits + +- Model parallelism becomes easier to implement: it is an extension to + the trainer - parameter server approach. We can have several "Transpilers" + to achieve different goals. +- User-defined optimizer is easier to add - user can now express it as + a sub-program. +- No more duplication logic inside the trainer and the parameter + server mentioned in the background section. + +### Challenges + +- It is important to balance the parameter shards on multiple + parameter servers. If a single parameter is very big (for example: some + word-embedding, fully connected, softmax layer), we need to + automatically partition the single parameter onto different + parameter servers when possible (only element-wise optimizer depends + on the parameter variable). +- In the "Async SGD" figure, the "W" variable on the parameter server + could be read and written concurrently. See + [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more + details about concurrent program in Fluid. + +### Discussion + +- Can the Enqueue OP be implemented under our current tensor design + (put the input tensor into the queue tensor)? +- *Dequeue* OP will have variable numbers of output (depending on the + `min_count` attribute), does our current design support it? 
(similar + question for the *Add* OP) + +### References + +[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) diff --git a/doc/fluid/design/dist_train/src/async_distributed_training.png b/doc/fluid/design/dist_train/src/async_distributed_training.png new file mode 100644 index 0000000000000000000000000000000000000000..3b53ab59c0cd7b44b2956f16f1adc47fe85909d3 Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_distributed_training.png differ diff --git a/doc/fluid/design/dist_train/src/async_pserver.graffle b/doc/fluid/design/dist_train/src/async_pserver.graffle new file mode 100644 index 0000000000000000000000000000000000000000..d2301611774fcb3866473e3e6470568d1e1312cf Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.graffle differ diff --git a/doc/fluid/design/dist_train/src/async_pserver.png b/doc/fluid/design/dist_train/src/async_pserver.png new file mode 100644 index 0000000000000000000000000000000000000000..7d900b0c0eb291c67537b9cf93227c671bafdc73 Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.png differ diff --git a/doc/fluid/design/dist_train/src/async_update.graffle b/doc/fluid/design/dist_train/src/async_update.graffle new file mode 100644 index 0000000000000000000000000000000000000000..3a631888688a0d564a873fcb16d943958c91223e Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.graffle differ diff --git a/doc/fluid/design/dist_train/src/async_update.png b/doc/fluid/design/dist_train/src/async_update.png new file mode 100644 index 0000000000000000000000000000000000000000..3e8db973f45d6d9ac8dcce1dc7878067e79e6dcc Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.png differ diff --git a/doc/fluid/design/dist_train/src/compiler.graffle b/doc/fluid/design/dist_train/src/compiler.graffle new file mode 100644 index 0000000000000000000000000000000000000000..8cc678fea3c820103e7ce81f7a5d625d6c1d92de Binary files /dev/null and b/doc/fluid/design/dist_train/src/compiler.graffle differ diff --git a/doc/fluid/design/dist_train/src/compiler.png b/doc/fluid/design/dist_train/src/compiler.png new file mode 100644 index 0000000000000000000000000000000000000000..65d34f841afce9756def07dd8ecb9ca44e658bfe Binary files /dev/null and b/doc/fluid/design/dist_train/src/compiler.png differ diff --git a/doc/fluid/design/dist_train/src/dist-graph.graffle b/doc/fluid/design/dist_train/src/dist-graph.graffle new file mode 100644 index 0000000000000000000000000000000000000000..941399c6ced8d5f65b6c595522b770c88259df4b Binary files /dev/null and b/doc/fluid/design/dist_train/src/dist-graph.graffle differ diff --git a/doc/fluid/design/dist_train/src/dist-graph.png b/doc/fluid/design/dist_train/src/dist-graph.png new file mode 100644 index 0000000000000000000000000000000000000000..3546b09f1c2ee3e4f60f519d5e47f823f08051a7 Binary files /dev/null and b/doc/fluid/design/dist_train/src/dist-graph.png differ diff --git a/doc/fluid/design/dist_train/src/distributed_architecture.graffle b/doc/fluid/design/dist_train/src/distributed_architecture.graffle new file mode 100644 index 0000000000000000000000000000000000000000..d1b60141342232e06227c2d430ebc60ec349a907 Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_architecture.graffle differ diff --git a/doc/fluid/design/dist_train/src/distributed_architecture.png 
b/doc/fluid/design/dist_train/src/distributed_architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356 Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_architecture.png differ diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle new file mode 100644 index 0000000000000000000000000000000000000000..65dfdbbacd219739db6ddfdf243cc16c3c4e8d1e Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle differ diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..5353a16fd329f62ff893d32706b9c3c0bcc46a07 Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg differ diff --git a/doc/fluid/design/dist_train/src/distributed_training.graffle b/doc/fluid/design/dist_train/src/distributed_training.graffle new file mode 100644 index 0000000000000000000000000000000000000000..1168801bc1fadfce310a74cb3110695bd1629f6b Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_training.graffle differ diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle new file mode 100644 index 0000000000000000000000000000000000000000..96ca6d48f43bd9f49c6861dab006e2037873db87 Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle differ diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png new file mode 100644 index 0000000000000000000000000000000000000000..afa25ab3b4e427bc595a855b12ab966478e01ed0 Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png differ diff --git a/doc/fluid/design/dist_train/src/local-graph.graffle b/doc/fluid/design/dist_train/src/local-graph.graffle new file mode 100644 index 0000000000000000000000000000000000000000..19e509bd9af3c1e9a3f5e0f16ddd281457a339c5 Binary files /dev/null and b/doc/fluid/design/dist_train/src/local-graph.graffle differ diff --git a/doc/fluid/design/dist_train/src/local-graph.png b/doc/fluid/design/dist_train/src/local-graph.png new file mode 100644 index 0000000000000000000000000000000000000000..ada51200f793a9bb18911e7d63cfdb3244b967d7 Binary files /dev/null and b/doc/fluid/design/dist_train/src/local-graph.png differ diff --git a/doc/fluid/design/dist_train/src/local_architecture.graffle b/doc/fluid/design/dist_train/src/local_architecture.graffle new file mode 100644 index 0000000000000000000000000000000000000000..49fcc663ebe3824aa234e3a67aadf285cb417877 Binary files /dev/null and b/doc/fluid/design/dist_train/src/local_architecture.graffle differ diff --git a/doc/fluid/design/dist_train/src/local_architecture.png b/doc/fluid/design/dist_train/src/local_architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122 Binary files /dev/null and b/doc/fluid/design/dist_train/src/local_architecture.png differ diff --git a/doc/fluid/design/dist_train/src/lookup_table.png b/doc/fluid/design/dist_train/src/lookup_table.png new file mode 100644 index 0000000000000000000000000000000000000000..72dfe3547f731d0d090338afb206b0549dff472e Binary files /dev/null 
and b/doc/fluid/design/dist_train/src/lookup_table.png differ diff --git a/doc/fluid/design/dist_train/src/lookup_table_training.png b/doc/fluid/design/dist_train/src/lookup_table_training.png new file mode 100644 index 0000000000000000000000000000000000000000..cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e Binary files /dev/null and b/doc/fluid/design/dist_train/src/lookup_table_training.png differ diff --git a/doc/fluid/design/dist_train/src/mpi_module.png b/doc/fluid/design/dist_train/src/mpi_module.png new file mode 100644 index 0000000000000000000000000000000000000000..e6b6a3e5d6f68baeeb67d7f71154bd8d85f32b6f Binary files /dev/null and b/doc/fluid/design/dist_train/src/mpi_module.png differ diff --git a/doc/fluid/design/dist_train/src/multi-threads.graffle b/doc/fluid/design/dist_train/src/multi-threads.graffle new file mode 100644 index 0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6 Binary files /dev/null and b/doc/fluid/design/dist_train/src/multi-threads.graffle differ diff --git a/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png b/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png new file mode 100644 index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9 Binary files /dev/null and b/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png differ diff --git a/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png b/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png new file mode 100644 index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a Binary files /dev/null and b/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png differ diff --git a/doc/fluid/design/dist_train/src/ncc2_design.graffle b/doc/fluid/design/dist_train/src/ncc2_design.graffle new file mode 100644 index 0000000000000000000000000000000000000000..7d2753bbb03bc28c7a0054bb0aa424deb072ffbf Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.graffle differ diff --git a/doc/fluid/design/dist_train/src/ncc2_design.png b/doc/fluid/design/dist_train/src/ncc2_design.png new file mode 100644 index 0000000000000000000000000000000000000000..da0d5ee81f5dfeb4ca1356601b0bb5870456e3d6 Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.png differ diff --git a/doc/fluid/design/dist_train/src/paddle-compile.graffle b/doc/fluid/design/dist_train/src/paddle-compile.graffle new file mode 100644 index 0000000000000000000000000000000000000000..a6348cc3dbcaca923c6e794681b2edb85cb9f8f6 Binary files /dev/null and b/doc/fluid/design/dist_train/src/paddle-compile.graffle differ diff --git a/doc/fluid/design/dist_train/src/paddle-compile.png b/doc/fluid/design/dist_train/src/paddle-compile.png new file mode 100644 index 0000000000000000000000000000000000000000..e0f13d551ac41afaec627a57dea79356464bf0bf Binary files /dev/null and b/doc/fluid/design/dist_train/src/paddle-compile.png differ diff --git a/doc/fluid/design/dist_train/src/remote_executor.graffle b/doc/fluid/design/dist_train/src/remote_executor.graffle new file mode 100644 index 0000000000000000000000000000000000000000..41b2067311694b56d211a4f32d1b76884eeffd2d Binary files /dev/null and b/doc/fluid/design/dist_train/src/remote_executor.graffle differ diff --git a/doc/fluid/design/dist_train/src/remote_executor.png b/doc/fluid/design/dist_train/src/remote_executor.png new file mode 100644 index 
0000000000000000000000000000000000000000..744e2fb2e0f1bbe058e991ba7b2a09000965ee79 Binary files /dev/null and b/doc/fluid/design/dist_train/src/remote_executor.png differ diff --git a/doc/fluid/design/dist_train/src/sparse_update.graffle b/doc/fluid/design/dist_train/src/sparse_update.graffle new file mode 100644 index 0000000000000000000000000000000000000000..08d689a58f83698d8c1158ee3990ed8abf3a7a9a Binary files /dev/null and b/doc/fluid/design/dist_train/src/sparse_update.graffle differ diff --git a/doc/fluid/design/dist_train/src/sparse_update.png b/doc/fluid/design/dist_train/src/sparse_update.png new file mode 100644 index 0000000000000000000000000000000000000000..8c872e6ac479f7d1b818a4a207956c43155d0ad7 Binary files /dev/null and b/doc/fluid/design/dist_train/src/sparse_update.png differ diff --git a/doc/fluid/design/dist_train/src/sync_distributed_training.png b/doc/fluid/design/dist_train/src/sync_distributed_training.png new file mode 100644 index 0000000000000000000000000000000000000000..e4f9a221fea4b7238e8a1d84e609c0371f6ef7a2 Binary files /dev/null and b/doc/fluid/design/dist_train/src/sync_distributed_training.png differ diff --git a/doc/fluid/design/dynamic_rnn/2_level_rnn.dot b/doc/fluid/design/dynamic_rnn/2_level_rnn.dot new file mode 100644 index 0000000000000000000000000000000000000000..5d77865061ca7bbbfcf254dd938f09aef5553505 --- /dev/null +++ b/doc/fluid/design/dynamic_rnn/2_level_rnn.dot @@ -0,0 +1,56 @@ +digraph G { + + rnn [label="1st level RNN" shape=box] + + subgraph cluster0 { + label = "time step 0" + + sent0 [label="sentence"] + sent1 [label="sentence"] + + rnn1 [label="2nd level RNN" shape=box] + + sent0 -> rnn1 + sent1 -> rnn1 + } + + subgraph cluster1 { + label = "time step 1" + + sent2 [label="sentence"] + sent3 [label="sentence"] + + rnn2 [label="2nd level RNN" shape=box] + + sent2 -> rnn2 + sent3 -> rnn2 + } + + subgraph cluster2 { + label = "time step 2" + + sent4 [label="sentence"] + sent5 [label="sentence"] + + rnn3 [label="2nd level RNN" shape=box] + + sent4 -> rnn3 + sent5 -> rnn3 + } + + + para0 [label="paragraph info 0"] + para1 [label="paragraph info 1"] + para2 [label="paragraph info 2"] + + rnn1 -> para0 + rnn2 -> para1 + rnn3 -> para2 + + para0 -> rnn + para1 -> rnn + para2 -> rnn + + chapter [label="chapter info"] + rnn -> chapter +} diff --git a/doc/fluid/design/dynamic_rnn/2_level_rnn.png b/doc/fluid/design/dynamic_rnn/2_level_rnn.png new file mode 100644 index 0000000000000000000000000000000000000000..0537a75beb175c0c284717421f7aa908da2a5038 Binary files /dev/null and b/doc/fluid/design/dynamic_rnn/2_level_rnn.png differ diff --git a/doc/fluid/design/dynamic_rnn/index_cn.rst b/doc/fluid/design/dynamic_rnn/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..1d224d22cf7103616f44115db01f0ae55f1cb88a --- /dev/null +++ b/doc/fluid/design/dynamic_rnn/index_cn.rst @@ -0,0 +1,8 @@ +动态RNN +------------ + +.. toctree:: + :maxdepth: 1 + + rnn.md + rnn_design.md diff --git a/doc/fluid/design/dynamic_rnn/index_en.rst b/doc/fluid/design/dynamic_rnn/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..568f496e4ffe21a5e730488aef905f7e2d98839e --- /dev/null +++ b/doc/fluid/design/dynamic_rnn/index_en.rst @@ -0,0 +1,8 @@ +Dynamic RNN +------------ + +.. 
toctree:: + :maxdepth: 1 + + rnn.md + rnn_design.md diff --git a/doc/fluid/design/dynamic_rnn/rnn.dot b/doc/fluid/design/dynamic_rnn/rnn.dot new file mode 100644 index 0000000000000000000000000000000000000000..c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5 --- /dev/null +++ b/doc/fluid/design/dynamic_rnn/rnn.dot @@ -0,0 +1,87 @@ +digraph G { + label = "simple RNN implementation" + + ranksep=2; + + //graph [nodesep=1, ranksep=1]; + + node[nodesep=1] + + subgraph cluster0 { + label = "global scope" + rankdir = TB + W + boot_memory + input + output + } + + subgraph cluster1 { + label = "step-scope 0" + rankdir = TB + memory0[label="memory"] + prememory0[label="pre-memory"] + step_input0[label="step input"] + step_output0[label="step output"] + } + + subgraph cluster2 { + label = "step-scope 1" + rankdir = TB + memory1[label="memory"] + prememory1[label="pre-memory"] + step_input1[label="step input"] + step_output1[label="step output"] + } + + subgraph cluster3 { + label = "step-scope 2" + rankdir = TB + memory2[label="memory"] + prememory2[label="pre-memory"] + step_input2[label="step input"] + step_output2[label="step output"] + } + + stepnet [shape=box] + stepnet0 [shape=box, style=dashed] + stepnet1 [shape=box, style=dashed] + stepnet2 [shape=box, style=dashed] + + + edge[color=blue] + boot_memory -> prememory0 [label="init" color="blue"] + memory0 -> prememory1 [label="copy/reference" color="blue"] + memory1 -> prememory2 [label="copy/reference" color="blue"] + + edge[color=black] + W -> stepnet0[constraint=false, style=dashed] + W -> stepnet1[constraint=false, style=dashed] + W -> stepnet2[constraint=false, style=dashed] + + memory0 -> stepnet0[style=dashed] + prememory0 -> stepnet0 -> step_output0[style=dashed] + + memory1 -> stepnet1[style=dashed] + prememory1 -> stepnet1 -> step_output1[style=dashed] + + memory2 -> stepnet2[style=dashed] + prememory2 -> stepnet2 -> step_output2[style=dashed] + + input -> step_input0 + input -> step_input1 + input -> step_input2 + + step_input0 -> stepnet0 [style=dashed] + step_input1 -> stepnet1[style=dashed] + step_input2 -> stepnet2[style=dashed] + + step_output0 -> output + step_output1 -> output + step_output2 -> output + + stepnet0 -> stepnet[style=dashed] + stepnet1 -> stepnet[style=dashed] + stepnet2 -> stepnet[style=dashed] + +} diff --git a/doc/fluid/design/dynamic_rnn/rnn.jpg b/doc/fluid/design/dynamic_rnn/rnn.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9867e404cf959df0dce6ded5222b466c788fb840 Binary files /dev/null and b/doc/fluid/design/dynamic_rnn/rnn.jpg differ diff --git a/doc/fluid/design/dynamic_rnn/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md new file mode 100644 index 0000000000000000000000000000000000000000..b39ae0675c45e56852293d97f45e91861cf31667 --- /dev/null +++ b/doc/fluid/design/dynamic_rnn/rnn.md @@ -0,0 +1,153 @@ +# RNNOp design + +This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future. + +## RNN Algorithm Implementation + +

+*(figure omitted: an RNN unrolled into a full network)*
+ +The above diagram shows an RNN unrolled into a full network. + +There are several important concepts here: + +- *step-net*: the sub-graph that runs at each step. +- *memory*, $h_t$, the state of the current step. +- *ex-memory*, $h_{t-1}$, the state of the previous step. +- *initial memory value*, the memory of the first (initial) step. + +### Step-scope + +There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step. + +

+
+Figure 2 illustrates the RNN's data flow +

+ +Please be aware that every step runs the same step-net. Each step does the following: + +1. Creates the step-scope. +2. Initializes the local variables including step-outputs, in the step-scope. +3. Runs the step-net, which uses the above mentioned variables. + +The RNN operator will compose its output from step outputs in each of the step scopes. + +### Memory and Ex-memory + +Let's give more details about memory and ex-memory using a simple example: + +$$ +h_t = U h_{t-1} + W x_t +$$, + +where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively. + +In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step, +or copy the memory value of the previous step to the current ex-memory variable. + +### Usage in Python + +For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md). + +We can define an RNN's step-net using a Block: + +```python +import paddle as pd + +X = some_op() # x is some operator's output and is a LoDTensor +a = some_op() + +# declare parameters +W = pd.Variable(shape=[20, 30]) +U = pd.Variable(shape=[20, 30]) + +rnn = pd.create_rnn_op(output_num=1) +with rnn.stepnet(): + x = rnn.add_input(X) + # declare a memory (rnn's step) + h = rnn.add_memory(init=a) + # h.pre_state(), the previous memory of rnn + new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state())) + # update current memory + h.update(new_state) + # indicate that h variables in all step scopes should be merged + rnn.add_outputs(h) + +out = rnn() +``` + +Python API functions in above example: + +- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs. +- `rnn.add_memory`: creates a variable used as the memory. +- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output. + +### Nested RNN and LoDTensor + +An RNN whose step-net includes other RNN operators is known as an *nested RNN*. + +For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level. + +The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text. + +

+*(figure omitted: 2-level RNN over sentences and paragraphs)*

+ +```python +import paddle as pd + +W = pd.Variable(shape=[20, 30]) +U = pd.Variable(shape=[20, 30]) + +W0 = pd.Variable(shape=[20, 30]) +U0 = pd.Variable(shape=[20, 30]) + +# a is output of some op +a = some_op() + +# chapter_data is a set of 128-dim word vectors +# the first level of LoD is sentence +# the second level of LoD is a chapter +chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2) + +def lower_level_rnn(paragraph): + ''' + x: the input + ''' + rnn = pd.create_rnn_op(output_num=1) + with rnn.stepnet(): + sentence = rnn.add_input(paragraph, level=0) + h = rnn.add_memory(shape=[20, 30]) + h.update( + pd.matmul(W, sentence) + pd.matmul(U, h.pre_state())) + # get the last state as sentence's info + rnn.add_outputs(h) + return rnn + +top_level_rnn = pd.create_rnn_op(output_num=1) +with top_level_rnn.stepnet(): + paragraph_data = rnn.add_input(chapter_data, level=1) + low_rnn = lower_level_rnn(paragraph_data) + paragraph_out = low_rnn() + + h = rnn.add_memory(init=a) + h.update( + pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state())) + top_level_rnn.add_outputs(h) + +# output the last step +chapter_out = top_level_rnn(output_all_steps=False) +``` + +In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences. + +By default, the `RNNOp` will concatenate the outputs from all the time steps. +If the `output_all_steps` is set to False, it will only output the final time step. + + +
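As a concrete, framework-independent illustration of how a 2-level LoD can describe the chapter → paragraph → sentence segmentation used above, the sketch below stores the per-level start offsets in plain Python lists (mirroring the `lod_start_pos` structure of the companion rnn_design doc); the particular offsets and the helper function are made up for the example.

```python
# Two LoD levels over a "chapter" of 6 word vectors (rows of the tensor):
#   level 0: paragraphs, as offsets into the sentence level
#   level 1: sentences,  as offsets into the rows of the tensor
lod = [
    [0, 2, 3],     # 2 paragraphs: sentences [0, 2) and [2, 3)
    [0, 2, 4, 6],  # 3 sentences: rows [0, 2), [2, 4) and [4, 6)
]
rows = [[0.0] * 128 for _ in range(6)]  # stand-in for the 6 word vectors

def sentence_row_ranges(lod, paragraph):
    """Row ranges of the sentences inside one paragraph."""
    begin, end = lod[0][paragraph], lod[0][paragraph + 1]
    return [(lod[1][s], lod[1][s + 1]) for s in range(begin, end)]

print(sentence_row_ranges(lod, 0))  # [(0, 2), (2, 4)]
print(sentence_row_ranges(lod, 1))  # [(4, 6)]
```

The top-level RNN would step over the paragraph ranges while the lower-level RNN steps over the sentence ranges inside each of them.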

+ +

diff --git a/doc/fluid/design/dynamic_rnn/rnn.png b/doc/fluid/design/dynamic_rnn/rnn.png new file mode 100644 index 0000000000000000000000000000000000000000..e139e373fe8396782044cfd936fdde624f8c66fe Binary files /dev/null and b/doc/fluid/design/dynamic_rnn/rnn.png differ diff --git a/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot b/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot new file mode 100644 index 0000000000000000000000000000000000000000..1d85ae2617a915ad0ad8288d848b607cc37ad297 --- /dev/null +++ b/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot @@ -0,0 +1,75 @@ +digraph G { + chapter [label="chapter"] + + subgraph cluster0 { + label = "paragraph 0" + + top_rnn0[label="top rnn step 0" shape=box] + + p0 [label="paragraph 0"] + p1 [label="paragraph 1"] + } + + subgraph cluster1{ + label = "paragraph 1" + + top_rnn1[label="top rnn step 1" shape=box] + + p2 [label="paragraph 0"] + p3 [label="paragraph 1"] + } + + subgraph cluster_p0 { + label = "sentence 0" + + low_rnn0 [label="low rnn step 0" shape=box] + s00 [label="sentence 0"] + s01 [label="sentence 1"] + + low_rnn0 -> s00 + low_rnn0 -> s01 + } + + subgraph cluster_p1 { + label = "sentence 1" + low_rnn1 [label="low rnn step 1" shape=box] + s10 [label="sentence 0"] + s11 [label="sentence 1"] + low_rnn1 -> s10 + low_rnn1 -> s11 + } + + subgraph cluster_p2 { + label = "sentence 1" + low_rnn2 [label="low rnn step 0" shape=box] + s20 [label="sentence 0"] + s21 [label="sentence 1"] + low_rnn2 -> s20 + low_rnn2 -> s21 + } + + subgraph cluster_p3 { + label = "sentence 1" + low_rnn3 [label="low rnn step 1" shape=box] + s30 [label="sentence 0"] + s31 [label="sentence 1"] + low_rnn3 -> s30 + low_rnn3 -> s31 + } + + + chapter -> top_rnn0 + chapter -> top_rnn1 + + top_rnn0 -> p0 + top_rnn0 -> p1 + top_rnn1 -> p2 + top_rnn1 -> p3 + + + p0 -> low_rnn0 + p1 -> low_rnn1 + p2 -> low_rnn2 + p3 -> low_rnn3 + +} diff --git a/doc/fluid/design/dynamic_rnn/rnn_2level_data.png b/doc/fluid/design/dynamic_rnn/rnn_2level_data.png new file mode 100644 index 0000000000000000000000000000000000000000..4be81b2430717a6a506342a09fc26899568574c6 Binary files /dev/null and b/doc/fluid/design/dynamic_rnn/rnn_2level_data.png differ diff --git a/doc/fluid/design/dynamic_rnn/rnn_design.md b/doc/fluid/design/dynamic_rnn/rnn_design.md new file mode 100644 index 0000000000000000000000000000000000000000..cecfcd3307ae4c4fa603220a360e9e124069fa58 --- /dev/null +++ b/doc/fluid/design/dynamic_rnn/rnn_design.md @@ -0,0 +1,242 @@ +# RNN 变长输入设计 +对变长序列的学习,现有主流框架比如 tensorflow, pytorch, caffe2, mxnet 等均使用了padding的方式, +即将一个mini-batch内不同长度的序列补0到固定长度参与计算。 + +现有Paddle包括 `RecurrentLayerGroup` 在内的RNN均实现了无padding的变长序列支持,本文也将基于该模块的思路,设计重构后的变长序列支持。 + +## 背景介绍 +由于tensor必须有明确的shape,因此基于tensor 的主流框架在存储变长序列时, +必须用zero-padding的方式将变长序列补全为固定shape的tensor。 + +由于padding是一种框架实现变长序列的妥协, 从用户角度,在使用RNN类模型时自然会比较介意padding的存在, +因此会有pytorch中对非padding方式变长序列支持长篇的讨论[3]。 + +由于padding对内存和计算会有额外的消耗,tensorflow和mxnet均使用了bucketing来进行优化[1][2], +但不管是padding还是bucket,对于用户都是额外的使用负担。 + +因此,**paddle原生支持变长序列的方式,能直接满足用户对变长序列的最直接的需求,在当前主流平台中可以算是一大优势**。 + +但对变长序列的支持,需要对目前框架做一些修改,下面讨论如何在最小修改下支持变长序列。 + +## 多层序列数据格式 `LODTensor` +目前 Paddle 会将一个mini-batch内的数据存储在一维的内存上, +额外使用 `Argument.sequenceStartPositions` 来存储每个句子的信息。 + +Paddle里使用 `Argument.subSequenceStartPositions` 来存储2层的序列信息,更高维度的序列则无法直接支持; + +为了支持 `N-level` 序列的存储,本文将序列信息定义成如下数据结构: + +```c++ +std::shared_ptr>> lod_start_pos_; +``` + +或者更明确的定义 + +```c++ +typedef std::vector level_t; +std::vector lod_start_pos; +``` + +这里的每一个 `level_t` 存储一个粒度(level)的偏移信息,和paddle目前做法一致。 + 
+为了更透明地传递序列信息,我们引入了一种新的tensor 称为 `LODTensor`[4], +其关于tensor相关的接口都直接继承自 `Tensor`,但另外添加了序列相关接口。 +如此,在操作一个 `LODTensor` 时,普通 `Op` 直接当成 `Tensor` 使用, +而操作序列的 `Op` 会额外操作 `LODTensor` 的变长序列操作的相关接口。 + +`LODTensor` 具体定义如下: + +```c++ +class LODTensor : public Tensor { +public: + size_t Levels() const { return seq_start_positions_.size(); } + size_t Elements(int level = 0) const { + return seq_start_positions_[level].size(); + } + // slice of level[elem_begin: elem_end] + // NOTE low performance in slice seq_start_positions_. + // TODO should call Tensor's Slice. + LODTensor LODSlice(int level, int elem_begin, int elem_end) const; + + // slice with tensor's data shared with this. + LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const; + + // copy other's lod_start_pos_, to share LOD info. + // NOTE the LOD info sould not be changed. + void ShareConstLODFrom(const LODTensor &other) { + lod_start_pos_ = other.lod_start_pos_; + } + // copy other's lod_start_pos_'s content, free to mutate. + void ShareMutableLODFrom(const LODTensor &other) { + lod_start_pos_ = std::make_shared < + std::vector>(other.lod_start_pos_.begin(), + other.lod_start_pos_.end()); + } + +private: + std::shared_ptr>> lod_start_pos_; +}; +``` + +其中, `lod_start_pos_` 使用了 `shared_ptr` 来减少存储和复制的代价, +可以认为 `LODTensor` 是 `Tensor` 的扩展,几乎完全兼容原始 `Tensor` 的使用。 + +## 框架支持 +### 框架现有的 `Tensor` 调用替换为 `LODTensor` +为了实现 `LODTensor` 的传递,框架里很多 `Tensor` 都需要变成 `LODTensor`, +简单实现,直接 **把之前所有的`Tensor` 全部替换成 `LODTensor`,这里可以直接修改 `pybind.cc` 里面创建`Tensor`的接口**。 + +此外,用户有可能需要感知序列的存在(比如序列的可视化需要解析模型中输出的序列),因此一些序列操作的API也需要暴露到 python 层。 + +### `lod_start_pos` 随着Op调用链传递 +框架需要支持下列特性,以实现`lod_start_pos`的传递: + +1. 以 `shared_ptr` 的方式实现传递 + - 不修改 `lod_start_pos` 内容的作为 consumer + - 修改 `lod_start_pos` 的作为 producer + - 约定 consumer 只需要复制传递过来的 `shared_ptr` + - producer 需要创建自己的独立的内存,以存储自己独立的修改,并暴露 `shared_ptr` 给后续 consumer + - 由于传递过程是以复制`shared_ptr`的方式实现,因此框架只需要传递一次 `lod_start_pos` + +2. 对于不感知 `lod_start_pos` 的Op足够透明 +3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据 + +具体的设计分为以下3小节 + +#### `load_start_pos` 的传递 + +- 对于不需要修改 `lod_start_pos` 的情况,调用 LODTensor的 `ShareConstLODFrom` 接口实现复制 +- 需要修改的,调用`ShareMutableLODFrom` 接口自己分配内存以存储修改 + +#### 框架透明 +传递这一步需要加入到网络跑之前的初始化操作中,并且只需要初始化一次,基于当前框架设计的初步方案如下 + +- 在 Op 的 `attrs` 中添加一项 `do_mutate_lod_info` 的属性,默认为 `false` + - 有需要修改 `lod_start_pos` 的Op需要在定义 `OpProto` 时设置为 `true` +- `OperatorBase` 的 `InferShape` 中会读取 `do_mutate_lod_info` ,并且调用 `LODTensor` 相关的方法实现 `lod_start_pos` 的复制。 +- `OperatorBase` 中添加一个 member `is_lod_inited{false}` 来保证传递只进行一次 + +一些逻辑如下 + +```c++ +class OperatorBase { +public: + // ... + void InferShape() { + if (!is_load_inited) { + bool do_mutate_lod_info = GetAttr("do_mutate_load_info"); + // find a input having LOD to copy + auto lod_input = ValidLODInput(); + for (auto &output : outputs) { + if (do_mutate_load_info) { + output.ShareMutableLODFrom(lod_input); + } else { + output.ShareConstLODFrom(load_input); + } + } + is_pod_inited = true; + } + + // call op's InferShape + // ... + } + +private: + // ... 
+ bool is_lod_inited{false}; +}; +``` + +如此,`lod_start_pos` 的信息的传递对非OLD的Op的实现是完全透明的。 + +#### `lod_start_pos` 的更新 +上一小节介绍到,对于需要修改 `load_start_pos` 的Op,`OperatorBase` 会分配一块自己的内存以存储修改, +Op在 `Run` 的实现中,操作更新自己的 `load_start_pos` , +而所有依赖其 outputs 的 op 会通过共享的指针自动获取到其更新。 + +## 根据长度排序 +按照长度排序后,从前往后的时间步的batch size会自然地递减,可以直接塞入 Net 做batch计算 + +比如原始的输入: + +``` +origin: +xxxx +xx +xxx + +-> sorted: +xxxx +xxx +xx +``` + +经过 `SegmentInputs` 之后,每个会有4个时间步,每个时间步的输入如下(纵向排列) + +``` +0 1 2 3 +x x x x +x x x +x x +``` + +为了追踪排序前后序列的变化,这里用 +```c++ +struct SortedSeqItem { + void *start{nullptr}; + void *end{nullptr}; +}; + +std::vector sorted_seqs; +``` +来追踪序列排序后的位置,并添加一个新的接口 + +```c++ +std::vector SortBySeqLen(const LODTensor& tensor); +``` + +由于输入序列的顺序变化,以下现有的接口需要针对性地修改: + +- InitMemories, memory需要根据 `sorted_seqs` 重新排列 +- SetmentInputs +- ConcatOutputs + +此外,由于 `sorted_seqs` 需要被 `RecurrentGradientOp` 复用,因此会变成 `RecurrentOp` 一个新的output输出, +之后作为 `RecurrentGradientOp` 的一个输入传入。 + +## InitMemories +由于序列顺序的变化,`boot_memories` 的batch上的element的顺序也需要对应重新排列。 + +## SegmentInputs +`SegmentInputs` 会依赖 `sorted_seqs` 的信息,将原始的序列按照排序后的序列顺序,从横向切割,转为每个step中的inputs。 + +即下面的转变: +``` +origin: +xxxx +xx +xxx + + | + | + \ / + ! +0 1 2 3 +x x x x +x x x +x x +``` +## ConcatOutputs +`ConcatOutputs` 需要 + +- 将每个时间步的输出重新还原为原始输入的序列顺序(以防止Infer阶段顺序打乱) +- 将每个序列concat 为规则的mini-batch表示 + +## 参考文献 +[Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing) + +[mxnet Bucketing](http://mxnet.io/how_to/bucketing.html) + +[variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5) + +[Level of details](https://en.wikipedia.org/wiki/Level_of_detail) diff --git a/doc/fluid/design/dynamic_rnn/rnn_design_en.md b/doc/fluid/design/dynamic_rnn/rnn_design_en.md new file mode 100644 index 0000000000000000000000000000000000000000..9493908f4f73b3e7d91f5f6364a2a3660257d508 --- /dev/null +++ b/doc/fluid/design/dynamic_rnn/rnn_design_en.md @@ -0,0 +1,175 @@ +# Varient Length supported RNN Design +For the learning of variable length sequences, the existing mainstream frameworks such as tensorflow, pytorch, caffe2, mxnet and so on all use padding. + +Different-length sequences in a mini-batch will be padded with zeros and transformed to same length. + +The existing RNN implementations of the PaddlePaddle is `RecurrentLayerGroup`, +which supports the variable length sequences without padding. +This doc will design fluid's RNN based on this idea. + +## Multi-layer sequence data format `LODTensor` +At present, Paddle stores data in one mini-batch in one-dimensional array. + +`Argument.sequenceStartPositions` is used to store information for each sentence. + +In Paddle, `Argument.subSequenceStartPositions` is used to store 2 levels of sequence information, while higher dimensional sequences can not be supported. + +In order to support the storage of `N-level` sequences, we define sequence information as the following data structure. + + +```c++ +std::shared_ptr>> lod_start_pos_; +``` + +Or more clearly defined here + +```c++ +typedef std::vector level_t; +std::vector lod_start_pos; +``` +Each `level_t` here stores a level of offset information consistent with paddle's current practice. + +In order to transmit sequence information more transparently, we have introduced a new tensor called `LODTensor`[1]. +Its tensor-related interfaces all inherit directly from `Tensor`, but it also adds serial-related interfaces. 
+Thus, when working with a `LODTensor`, ordinary `Op` is used directly as `Tensor`. +The `Op` of the operation sequence will additionally operate the relevant interface of the `LODTensor` variable-length sequence operation. + +The definition of `LODTensor` is as follows: + + +```c++ +class LODTensor : public Tensor { +public: + size_t Levels() const { return seq_start_positions_.size(); } + size_t Elements(int level = 0) const { + return seq_start_positions_[level].size(); + } + // slice of level[elem_begin: elem_end] + // NOTE low performance in slice seq_start_positions_. + // TODO should call Tensor's Slice. + LODTensor LODSlice(int level, int elem_begin, int elem_end) const; + + // slice with tensor's data shared with this. + LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const; + + // copy other's lod_start_pos_, to share LOD info. + // NOTE the LOD info sould not be changed. + void ShareConstLODFrom(const LODTensor &other) { + lod_start_pos_ = other.lod_start_pos_; + } + // copy other's lod_start_pos_'s content, free to mutate. + void ShareMutableLODFrom(const LODTensor &other) { + lod_start_pos_ = std::make_shared < + std::vector>(other.lod_start_pos_.begin(), + other.lod_start_pos_.end()); + } + +private: + std::shared_ptr>> lod_start_pos_; +}; +``` +Among them, `lod_start_pos_` uses `shared_ptr` to reduce the cost of storage and replication. +`LODTensor` can be thought as an extension of `Tensor`, which is almost completely compatible with the original `Tensor`. + +## How to support the framework +### Replace `Tensor` with `LoDTensor` +To implement the passing of `LODTensor`, most `Tensor` in the framework need to be replaced with `LODTensor`. +Simple implementation, directly **replace all previous `Tensor` with `LODTensor`** , where you can directly modify the `Tensor` interface created in `pybind.cc`. + +In addition, the user may need to perceive the existence of a sequence (such as the sequence of the visualization needs to parse the output sequence in the model), so some of the serial operation APIs also need to be exposed to the python layer. + +### Transmit `lod_start_pos` along with the Op call chain +`lod_start_pos` is passed along with the Op call chain +The framework needs to support the following features to implement the transmit of `lod_start_pos`: + +1. Implement the transfer as `shared_ptr` + - Do not modify the contents of `lod_start_pos` as a consumer + - Modify producer of `lod_start_pos` as producer + - Conventions consumer only needs to copy `shared_ptr` passed over + - producer needs to create its own independent memory to store its own independent modifications and expose `shared_ptr` to subsequent consumer + - Since the transfer process is implemented by copying `shared_ptr`, the framework only needs to pass `lod_start_pos` once. + +2. Op is transparent enough not to sense `lod_start_pos` +3. Producer Op that needs to modify `lod_start_pos` can update its `lod_start_pos` data when `Run` + +## sorted by length +After sorting by length, the batch size from the forward time step will naturally decrement, and you can directly plug it into Net to do the batch calculation. 
+ +For example, the original input: + +``` +origin: +xxxx +xx +xxx + +-> sorted: +xxxx +xxx +xx +``` + +After `SegmentInputs`, there will be 4 time steps, the input of each time step is as follows (vertical arrangement) + +``` +0 1 2 3 +x x x x +x x x +x x +``` + +In order to track the changes before and after sorting, use here + +```c++ +struct SortedSeqItem { + void *start{nullptr}; + void *end{nullptr}; +}; + +std::vector sorted_seqs; +``` +To track the position of the sequence after sorting, and add a new interface + +```c++ +std::vector SortBySeqLen(const LODTensor& tensor); +``` +Due to the sequence of input sequences, the following existing interfaces need to be modified: + +- InitMemories, memory needs to be rearranged according to `sorted_seqs` +- SetmentInputs +- ConcatOutputs + +In addition, because `sorted_seqs` needs to be multiplexed with `RecurrentGradientOp`, it will become a new output of `RecurrentOp`. +It is passed in as an input to `RecurrentGradientOp`. + +## InitMemories +Due to the sequence change, the order of the elements on the `boot_memories` batch also needs to be rearranged accordingly. + +## SegmentInputs + +`SegmentInputs` relies on the information of `sorted_seqs` to cut the original sequence from the horizontal to the input of each step in the sorted sequence order. + +the transition is as follows: +``` +origin: +xxxx +xx +xxx + + | + | + \ / + ! +0 1 2 3 +x x x x +x x x +x x +``` +## ConcatOutputs +`ConcatOutputs` needs + +- Restore the output of each time step back to the original input sequence order (to prevent the order of Infer phase from being upset) +- Concat each sequence as a regular mini-batch representation + +## references +1. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail) diff --git a/doc/fluid/design/execution/if_else_op.md b/doc/fluid/design/execution/if_else_op.md new file mode 100644 index 0000000000000000000000000000000000000000..26d140f06db4ecefa86be015eaa731ffddc6910c --- /dev/null +++ b/doc/fluid/design/execution/if_else_op.md @@ -0,0 +1,51 @@ +# The `IfElse` Operator + +PaddlePaddle's `IfElse` operator differs from TensorFlow's: + +- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas +- the PaddlePaddle version takes a vector of boolean value as the condition, and instances corresponding to true values go to the true branch, those corresponding to false values go to the false branch. + +## Example + +The following PaddlePaddle program shows the usage of the IfElse operator: + +```python +import paddle as pd + +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() +with ie.true_block(): + d = pd.layer.add(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` + +A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch. 
+ +An equivalent C++ program is as follows: + +```c++ +namespace pd = paddle; + +int x = 10; +int y = 1; +int z = 10; +bool cond = false; +int o1, o2; +if (cond) { + int d = x + y; + o1 = z; + o2 = pd::layer::softmax(z); +} else { + int d = pd::layer::fc(z); + o1 = d; + o2 = d+1; +} +``` diff --git a/doc/fluid/design/execution/index_cn.rst b/doc/fluid/design/execution/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..ed31b017429d168b2466d8f6b423f48bd5d78d1f --- /dev/null +++ b/doc/fluid/design/execution/index_cn.rst @@ -0,0 +1,8 @@ +执行流程 +------------- + +.. toctree:: + :maxdepth: 1 + + switch.md + if_else_op.md diff --git a/doc/fluid/design/execution/index_en.rst b/doc/fluid/design/execution/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..fcf846da348ff0bed707c42718e08314998fbac0 --- /dev/null +++ b/doc/fluid/design/execution/index_en.rst @@ -0,0 +1,8 @@ +Execution Process +-------------------------------------- + +.. toctree:: + :maxdepth: 1 + + switch.md + if_else_op.md diff --git a/doc/fluid/design/execution/switch.md b/doc/fluid/design/execution/switch.md new file mode 100644 index 0000000000000000000000000000000000000000..1c337bd7159b25e594c2f91f9a143b3f4bc3c8e8 --- /dev/null +++ b/doc/fluid/design/execution/switch.md @@ -0,0 +1,31 @@ +# Design Doc: Switch + +## Background + +Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid. + +The following example shows the usage of `fluid.switch`. + +```python +a = fluid.Var(10) +b = fluid.Var(0) + +with switch() as switch: + with switch.case(fluid.less_equal(a, 10)): + fluid.print("Case 1") + with switch.case(fluid.larger(a, 0)): + fluid.print("Case 2") + with switch.default(): + fluid.print("Case 3") +``` + +## The Semantics + +1. A `switch` control-flow checks cases one-by-one. +1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values. +1. It runs the first matched case, or the default case if there is one. +1. Once it matches a case, it runs the corresponding branch and only that branch. It's like there is a C's `break` keyword at the end of each case. + +The above program should print and print only "Case 1". + +The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branches. diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..31b62a5eb3cd9b5b68d51abcd001fd5b8c39a914 --- /dev/null +++ b/doc/fluid/design/index_cn.rst @@ -0,0 +1,19 @@ +设计思想 +------------ + +.. toctree:: + :maxdepth: 1 + + motivation/index_cn.rst + execution/index_cn.rst + concepts/index_cn.rst + data_type/index_cn.rst + memory/index_cn.rst + multi_devices/index_cn.rst + dynamic_rnn/index_cn.rst + concurrent/index_cn.rst + algorithm/index_cn.rst + network/index_cn.rst + modules/index_cn.rst + interface/index_cn.rst + dist_train/index_cn.rst diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..2bfee02ad4626633b08ddff747e2886faf9ba99f --- /dev/null +++ b/doc/fluid/design/index_en.rst @@ -0,0 +1,19 @@ +Design +------------ + +.. 
toctree:: + :maxdepth: 1 + + motivation/index_en.rst + execution/index_en.rst + concepts/index_en.rst + data_type/index_en.rst + memory/index_en.rst + multi_devices/index_en.rst + dynamic_rnn/index_en.rst + concurrent/index_en.rst + algorithm/index_en.rst + network/index_en.rst + modules/index_en.rst + interface/index_en.rst + dist_train/index_en.rst diff --git a/doc/fluid/design/interface/index_cn.rst b/doc/fluid/design/interface/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..69a8d9bad4fe88935b9fa87757abf0105ca8eb75 --- /dev/null +++ b/doc/fluid/design/interface/index_cn.rst @@ -0,0 +1,4 @@ +多语言接口 +------------ + +TBD diff --git a/doc/fluid/design/interface/index_en.rst b/doc/fluid/design/interface/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..22abc71f984aa5da7151d5ebf0c3bdbcc69a3624 --- /dev/null +++ b/doc/fluid/design/interface/index_en.rst @@ -0,0 +1,4 @@ +Multi-Language Interface +----------------------- + +TBD diff --git a/doc/fluid/design/ir/overview.md b/doc/fluid/design/ir/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..83ef97c99efeaf27a27f93f0cd3857c0f1bc812e --- /dev/null +++ b/doc/fluid/design/ir/overview.md @@ -0,0 +1,185 @@ +## Motivation + +There is a `gap` between the `Program` defined by +user and the `Executable` that can be scheduled +efficiently on heterogeneous hardware, either locally +or distributedly. + +Usually, the `gap` is bridged by + +* A serious transformations with defined order. + +* These transformations usually involve +`insert, delete, clustering, split, dependency analysis`. + +* Has a simple way to verify and debug each transformation. + +* Flexible to add, remove or customize transformations to fit +the requirements of various algorithms (models) and hardware secenarios. + +Some other events also push us to a better unified pattern. + +* The deep learning framework is built around the concepts of graphs. +To leverage tools such as compilation (e.g. TVM and nGraph) or +cross-framework conversion (e.g. ONNX), we also need a intermediate +representation that can be connected to the rest of the ecosystem. + + +We need a unified pattern to naturally support the requirements +described above. The pattern should fit both training, inference +and other offline serielized model transformations. +Learned from LLVM and other deep learning framework, we draft the +design below. + + +## Design + +### Major Concepts + +#### Node + +`Node` represents an operation that performs some computation or +a variable that is input or output of operation. + +`Node`s are connected to other `Node`s via inputs and outputs. + +Other properties (maybe device placement information) can be added +to `Node` in the future if it's a +common requirement of many other `Pass`es. Otherwise, it should live +in a `Node` wrapper class that is private to some `Pass` or be +a local member of a `Pass`. + +#### Graph + +`Graph` contains a list of `Node`s, which are connected to +each other via inputs and outputs. + +TODO: Better definitions for the graph. + +`Graph` can also contain `Attribute`s. `Attribute`s +can be `any` thing. For example, it can be a list of "wraper" +nodes. The `wrapper` nodes compose `Node`s and provide +helper method for execution or transformation. `Attribute` +can also contain other things that describe some properties of +the `Graph` or `Graph` nodes. `Attribute` can be passed +across `Pass`. However, it should be used with care. 
+ +```cpp +class Graph { + public: + explicit Graph(const ProgramDesc &program); + + bool Has(const std::string &attr_name) const; + + template + AttrType &Get(const std::string &attr_name) const; + + template + void Set(const std::string &attr_name, AttrType *attr); + const std::unordered_set &Nodes() const; + + // Create a normal variable with non-null VarDesc. + ir::Node *CreateVarNode(VarDesc *var_desc); + + // Create a normal runnable operator with OpDesc. + ir::Node *CreateOpNode(OpDesc *op_desc); + + // Create a control dependency var that connects 2 operations. The + // var doesn't hold any data. Other than that, it's no different from + // other var, considering dependency analysis. + ir::Node *CreateControlDepVar(); + + // A more free style way of creating a graph node. Mostly use for test + // or "copy" from another node. Avoid using it if possible. + ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type); + + // Clear all node information of the graph and return the ownership of the + // nodes. + std::vector> ReleaseNodes(); +}; +``` + +#### Pass + +`Pass` represents a transformation of `Graph`. Its input +is a `Graph` and its output is also a `Graph`. For example, +a `Pass` can simply print out the `Graph`. A `Pass` +can also fuse some `Graph`'s `Node`s. + +```cpp +class Pass { + public: + + std::unique_ptr Apply(std::unique_ptr graph) const { + // Some correctness check. + auto new_graph = ApplyImpl(std::move(graph)); + // Some correctness check. + return new_graph; + } + + // Get a reference to the attributed previously set. + template + AttrType &Get(const std::string &attr_name) const; + + // Set a pointer to the attribute. Pass takes ownership of the attribute. + template + void Set(const std::string &attr_name, AttrType *attr) ; + + // Set a pointer to the attribute. Pass doesn't take ownership. Caller + // should delete the attribute. + template + void SetNotOwned(const std::string &attr_name, AttrType *attr); + + protected: + virtual std::unique_ptr ApplyImpl(std::unique_ptr graph) const = 0; +}; + +// In my_pass.cc +class MyPass : public Pass { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const override { + // do something. + return graph; + } +} +REGISTER_PASS(my_pass, MyPass) +.RequirePassAttr("places") +.RequireGraphAttr("dep_vars"); + + +// To use the pass. +auto my_pass = ir::PassRegistry::Instance().Get("my_pass"); +graph = my_pass->Apply(std::move(graph)); +// Note: to force link my_pass.cc, in the code: +USE_PASS(my_pass); +``` + +#### Optimize + +`Optimize` contains a series of `Pass` with defined order. +`Optimize` transforms a `Graph` that only contains raw +modeling logic to a `Graph` that can be run efficiently while +maintaining the original modeling logic. + + +### Optimize Process + +* Program is first converted to Graph. +* Graph goes through a series of Pass +* Graph is transformed from raw model logic to a +form that is efficient to execute. + +``` +// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor +auto graph = Graph(program); +graph = PassRegistry::Instance().Get("op_fuse_pass").Apply(std::move(grah)); +// For more complex Pass, Optimize Process can provide Pass attributes. 
+auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass");
+int optimize_level = 1;
+mem_opt_pass->SetNotOwned("optimize_level", &optimize_level);
+graph = mem_opt_pass->Apply(std::move(graph));
+graph = PassRegistry::Instance().Get("multi_devices_pass")->Apply(std::move(graph));
+graph = PassRegistry::Instance().Get("multi_devices_check_pass")->Apply(std::move(graph));
+Executor exe;
+exe.Run(graph);
+
+```
diff --git a/doc/fluid/design/memory/README.md b/doc/fluid/design/memory/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7cf61d089b39041b7a15184e0ea9211d14a66f5e
--- /dev/null
+++ b/doc/fluid/design/memory/README.md
@@ -0,0 +1,141 @@
+# Region-based Heterogeneous Memory Management
+## Design
+
+### Usage
+
+To allocate 4KB of CPU memory:
+
+```cpp
+p = memory::Alloc(platform::CPUPlace(), 4*1024);
+```
+
+To allocate 4KB of memory on the 3rd GPU:
+
+```cpp
+p = memory::Alloc(platform::CUDAPlace(2), 4*1024);
+```
+
+To free memory and check the so-far used amount of memory on a place:
+
+```cpp
+auto pl = platform::CUDAPlace(0);
+p = memory::Alloc(pl, 4*1024);
+cout << memory::Used(pl);
+memory::Free(pl, p);
+```
+
+### API
+
+In `paddle/memory/memory.h` we have:
+
+```cpp
+namespace memory {
+template <typename Place> void* Alloc(Place, size_t);
+template <typename Place> void Free(Place, void*);
+template <typename Place> size_t Used(Place);
+} // namespace memory
+```
+
+These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`:
+
+```cpp
+template<>
+void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+```
+
+and
+
+```cpp
+template<>
+void* Alloc<CUDAPlace>(CUDAPlace p, size_t size) {
+  return GetGPUBuddyAllocator(p.id)->Alloc(size);
+}
+```
+
+Similar specializations exist for `Free` and `Used`.
+
+### Implementation
+
+`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletons.
+
+```cpp
+BuddyAllocator* GetCPUBuddyAllocator() {
+  static BuddyAllocator* a = NULL;
+  if (a == NULL) {
+    a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
+  }
+  return a;
+}
+
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    as = new BuddyAllocator*[platform::NumGPUs()];
+    for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
+      as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
+    }
+  }
+  return as[gpu_id];
+}
+```
+
+#### `BuddyAllocator`
+
+`BuddyAllocator` implements the buddy allocation algorithm. Its constructor takes parameters only related to the algorithm:
+
+```cpp
+BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
+  ...
+}
+```
+
+Please be aware that **`BuddyAllocator` always allocates aligned memory**, aligned on 32 bytes, which can hold a `BuddyAllocator::Block` object:
+
+```cpp
+class BuddyAllocator {
+ private:
+  struct Block {
+    size_t size;
+    Block *left, *right;
+    size_t index; // allocator id
+  };
+  ...
+};
+```
+
+Because `BuddyAllocator` has the meta-data of each block, it can track the used memory -- record the amount returned by `Alloc` and freed in `Free`. In contrast, `CPUAllocator` and `GPUAllocator` don't know the size of a freed memory block and cannot do this tracking.
+
+#### System Allocators
+
+`GPUAllocator` and `CPUAllocator` are called *system allocators*. They work as the fallback allocators of `BuddyAllocator`.
+
+## Justification
+
+I got inspiration from Majel and Caffe2, though the design above looks different from both.
+
+### Caffe2
+
+In Caffe2, `Tensor::mutable_data()` allocates the memory.
In particular, [`Tensor::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479). + +There are two implementations of `Context`: + +1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory. + +1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory. + +### Majel + +In Majel, there are basically two allocator types: + +1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`. +1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`. + +However, memory allocation is not via these two allocators. Instead, these two allocators are defined in hidden namespaces. + +In Majel there are hidden global variables like: + +1. `cpu::SystemAllocator g_cpu_allocator`, and +1. `vector g_gpu_allocators(NUM_GPUS)`. + +Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`. diff --git a/doc/fluid/design/memory/images/control_flow_graph.png b/doc/fluid/design/memory/images/control_flow_graph.png new file mode 100644 index 0000000000000000000000000000000000000000..3579998e58d07abc50bd3332128d4733a391cb3b Binary files /dev/null and b/doc/fluid/design/memory/images/control_flow_graph.png differ diff --git a/doc/fluid/design/memory/images/dataflow_equations.png b/doc/fluid/design/memory/images/dataflow_equations.png new file mode 100644 index 0000000000000000000000000000000000000000..c10f7f69f4007952e5b0394edaa04efa1cfbb658 Binary files /dev/null and b/doc/fluid/design/memory/images/dataflow_equations.png differ diff --git a/doc/fluid/design/memory/images/deep_learning.png b/doc/fluid/design/memory/images/deep_learning.png new file mode 100644 index 0000000000000000000000000000000000000000..026becc4d94e01e407dacb2a5314a0e5723334ff Binary files /dev/null and b/doc/fluid/design/memory/images/deep_learning.png differ diff --git a/doc/fluid/design/memory/index_cn.rst b/doc/fluid/design/memory/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..c507c638bd1a6eb428175ed2756a6ecfc6cca198 --- /dev/null +++ b/doc/fluid/design/memory/index_cn.rst @@ -0,0 +1,7 @@ +内存管理 +------------ + +.. 
toctree:: + :maxdepth: 1 + + memory_optimization.md diff --git a/doc/fluid/design/memory/index_en.rst b/doc/fluid/design/memory/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..f7526437a73a09b300f05e138084755f5528b242 --- /dev/null +++ b/doc/fluid/design/memory/index_en.rst @@ -0,0 +1,7 @@ +Memory Management +------------------- + +.. toctree:: + :maxdepth: 1 + + memory_optimization.md diff --git a/doc/fluid/design/memory/memory_optimization.md b/doc/fluid/design/memory/memory_optimization.md new file mode 100644 index 0000000000000000000000000000000000000000..285464ada728d8f7a086a26beca6cfa4418e98e4 --- /dev/null +++ b/doc/fluid/design/memory/memory_optimization.md @@ -0,0 +1,217 @@ +# Memory Optimization + + +## Problem + +In a lecture from Andrew Ng, he attributes the recent sucess of AI due to a combination of these: + +- Availability of Big Data +- Supercomputing power to process this Big Data over very large neural networks +- Modern algorithms + +Following graph shows the details: + +![](images/deep_learning.png) + +Larger model usually bring better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary in both online/mobile inference. + +## Solution + +### Basic Strategy + +There are some basic strategies to improve memory usage, including in-place operations and memory sharing. + +#### In-place Operation +In a relu activation operator: + +$y = \max(x, 0)$ + +If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x will be the same. In-place operations will save 50% memory occupancy immediately. + +#### Memory Sharing + +Not all operators support in-place operations. Memory sharing is a more general strategy. + +Following is an example: + +``` +a = op1(b, c); +d = op2(a) +e = op3(d, f) +``` + +In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a to a memory pool. Then, variable e can share the memory of variable a from the pool. + + +### Live Variable Analysis + +It's not enough to only have some basic strategies. The pre-requisite of memory optimization is to know if a variable is still "live" after an operation. + +In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation. + +In compilers, the front end of the compiler translates programs into an intermediate language with an unbounded number of temporary variables. This program must run on a machine with a bounded number of registers. Two temporary variables a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporary variables can fit in few registers; if they don't all fit, the excess tempory variables can be kept in memory. + +Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporary variables are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis. + +We can leran these techniques from compilers. 
There are mainly two stages in live variable analysis:
+
+- construct a control flow graph
+- solve the dataflow equations
+
+
+#### Control Flow Graph
+To perform analysis on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statement x can be followed by statement y, there is an edge from x to y.
+
+The following is the flow graph for a simple loop.
+
+![](images/control_flow_graph.png)
+
+#### Dataflow Analysis
+
+Liveness of a variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
+
+A simple way to perform dataflow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
+
+- Flow Graph Terminology
+
+A flow graph node has out-edges that lead to successor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of successors.
+In the former control flow graph, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}.
+
+- Uses and Defs
+
+An assignment to a variable or temporary defines that variable. An occurrence of a variable on the right-hand side of an assignment (or in other expressions) uses the variable. We can define the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and similarly for the *use* of a variable or graph node. In the former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}.
+
+- Liveness
+
+A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node.
+
+
+The calculation of liveness can be solved by iteration until a fixed point is reached. The following is the recursive formula:
+
+![](images/dataflow_equations.png)
+
+### Memory optimization transpiler
+
+At last, we take the basic strategies and liveness analysis techniques learned from compilers to implement our memory optimization transpiler.
+
+#### add in-place attribute
+
+In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator.
+
+
+#### construct control flow graph
+
+The following is the ProgramDesc protobuf of the [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py) example.
+
+- Block0:
+
+```
+lookup_table
+mul
+...
+while(sub-block idx 1)
+...
+array_to_lod_tensor
+cross_entropy
+...
+while_grad(sub-block idx 2)
+read_from_array
+array_to_lod_tensor
+...
+```
+
+- Block1
+
+```
+read_from_array
+read_from_array
+...
+write_to_array +increment +write_to_array +less_than +``` + +- Block2 + +``` +read_from_array +increment +... +write_to_array +write_to_array +``` + +We can transfer all the operators and variables in ProgramDesc to build a control flow graph. + +```python +class ControlFlowGraph(object): + def __init__(self, Program): + self._sucessors = defaultdict(set) + self._presucessors = defaultdict(set) + self._uses = defaultdict(set) + self._defs = defaultdict(set) + self._live_in = defaultdict(set) + self._live_out = defaultdict(set) + self._program = Program + + def build(self): + pass + + def dataflow_analysis(self): + pass + + def memory_optimization(self): + pass + + def get_program(self): + return self._program +``` + +#### Make dataflow analysis + +We follow the guide from compilers and try to solve the dataflow equation to get liveness of every variable. If the live-in of an operator node is different from the live-out, then we can make memory sharing. + +For example: + +``` +a = op1(b, c); +d = op2(a) +e = op3(d, f) +``` + +The dataflow analysis result is: + +``` +live_in(op1) = {b, c, f} +live_out(op1) = {a, f} + +live_in(op2) = {a, f} +live_out(op2) = {d, f} + +live_in(op3) = {d, f} +live_out(op3) = {} +``` + +After op1, we can process variable b and variable c; After op2, we can process variable a. After op3, we can process variable d and variable f. + +#### memory sharing policy + +A memory pool will be mantained in the stage of memory optimization. Each operator node will be scanned to determine memory optimization is done or not. If an operator satifies the requirement, following policy will be taken to handle input/output variables. + +``` +if op.support_inplace(): + i --> pool + pool --> o +else: + pool --> o + i --> pool +``` + + + +## Reference + +- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5) +- Modern compiler implementation in ML, by Andrew W. Appel +- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html) diff --git a/doc/fluid/design/modules/backward.md b/doc/fluid/design/modules/backward.md new file mode 100644 index 0000000000000000000000000000000000000000..20fda7a98f514a3f1c1c2d0ba7447ec954b21d5a --- /dev/null +++ b/doc/fluid/design/modules/backward.md @@ -0,0 +1,158 @@ +# Backward Building + +## Motivation + +In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So a mechanism is required by the framework which can complete the model's backward part automatically according to the given forward part. + +When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes gradients of its corresponding `op`'s outputs, and calculate gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op`, and then string them together in reverse order of forwarding part. In this way, gradients spread from the end to the beginning of the model, in another word, from the loss to parameters. + +## Challenges + +The motivation of backward building is apparent. 
However, implementation it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested. It means that the `op`s and `variable`s are scattered across different blocks rather than all be gathered in a single graph. Our backward building algorithm shall visit blocks in recursive order and be able to insert `grad_op`s and new created `variable`s into the right place. + +## Usage + +Although the whole algorithm is comprised of many functions, only one is exposed as API: + +```python +def append_backward(loss, parameter_list=None, no_grad_set=None): + """ + Append backward part to main_program + + Args: + loss(Variable): The variable generated by the cost function. + parameter_list(list): Parameters that need to be updated by optimizers. + If None, it means all parameters need to be updated. + + no_grad_set(set): Variables that have no gradients in Block 0. + If None, the set will be generated inside the function and + contains all variables with `step_gradient=True` from all blocks. + + Return: + (list[Variable]): list of (parameters, gradients) pair. + """ +``` + +By invoking this API, the framework appends backward part of the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient generated and backpropagation starts. `parameter_list` marks all parameters needs updating. If it's `None`, all parameter will be updated by optimizers. `no_grad_set` marks variables without gradient. if all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run. + +This API will be invoked automatically before optimizer building. +As a result, in most cases, users do not need to invoke the API by themselves to append backward part. + +## Implementation + +The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables. + +### Creating `grad_op`s + +The creating of `grad_op`s is implemented by: + +```python +def _append_backward_ops_(target, + block, + target_block, + no_grad_dict, + grad_to_var): + """ + Create all grad ops, and insert them into given block + + Args: + target(Variable): the target variable of forward pass + block(Block): the block where forward ops are + target_block(Block): the block which is going to hold new generated grad ops + no_grad_dict(dict): + key(int) block index + val(set) a set of varibale names. These varibales have no gradient + grad_to_var(dict)(output argument): + key(str): grad variable name + val(str): corresponding forward variable name + """ +``` + +Given a `block`, the function will traverses all `op`s in this block in reverse order, gets corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, then append it to `target_block`. + +However, some specific `op`(e.g. `while_op`, `if_else_op`) can hold its own sub-block. For these sub-blocks contains `op`s as well, the `grad_op` creating should be recursive. + +During the reverse traversal, we check each `op` whether it has an attribute named `sub_block`. If so, it means there is a sub-block and we need to deal with it first. After creating a new block whose father is the one in `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to parameter `target_block` and the one in `op`'s attribute to `block`. 
The *pseudo-code* shows this process: + +``` +******* pseudo-code ******** +for op in reversed(block.ops): + if op has an attribute named 'sub_block': + Get the sub-block(`s_block`) from op's attribute. + Create a new block(`grad_s_block`), whose father is `s_block`. + Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block` + + Invoke `core.get_grad_op_desc()` to get op's grad_op. + Insert name correspondings between variables and their gradients of the grad_op to grad_to_var + Assign grad_s_block to grad_op as it's 'sub_block' attribute. + Append grad_op to current target_block. +``` + +The first invoking of `_append_backward_ops_()` is initiated by `append_backward()`, in which parameters `block` and `target_block` are all assigned with root block(the block with index 0). + +### Corner Cases of `grad_op` Creating + +In the previous section, we show the regular process of `grad_op` creating. However, in some corner cases, the conventional algorithm is not enough to get the correct result and appending handling is required. These additional processes run after the algorithm mentioned above and do some special adjusts on its output `grad_op`s. + +#### Shared Variables + +If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op`s in the next backward pass. To make the gradient result being the sum of all `grad_op`s' outputs instead of the last running one, we assign each output with a temporary variable and then add a `sum_op` to add them up. + +For the debug convenience, if the final gradient name is `w@GRAD`, it's corresponding temporary variables will be named as `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`... + +See function `_addup_repetitive_outputs_` in `backward.py` for implementation details. + +#### No Gradient Variables + +In our framework, variables can be marked as *no_gradient*, it means that the gradient of this variable is unnecessary and can be considered as zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in backward pass. + +Another situation is all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered as zeros. For `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all gradient inputs are zeros. Therefore the `grad_op` can also be skipped. + +It should be noted that all these zero gradients still need to be creating and initialized by something, otherwise following `grad_op`s who take these gradients as inputs take the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros. + +This features are implemented in function `_remove_no_grad_branch_`. It checks new created `grad_op`s one-by-one, removes who can be skipped and inserts `fill_zeros_like_op` when its necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute(True or False). + +### Creating Backward Variables + +Up to now, we have completed all creating and adjusting jobs of `grad_op`s. However, backward variables have not been created. Now they are only represented by `grad_op`'s input and output arguments. 
The backward variable creating job will be done by: + +```python +def _append_backward_vars_(block, + start_op_idx, + grad_to_var, + grad_info_map): + """ + Create new variables required by backward pass. + + Args: + block(Block): the block where new variables will be created + start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created + grad_to_var(dict): + key(str): grad variable name + val(str): corresponding forward variable name + In most cases, this dict is generated by _append_backward_ops_() + grad_info_map(dict)(output argument): + key(str): forward variable name + val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index + """ +``` + +Given a `block`, this function traverses all the `grad_op`s in it(The argument `start_op_idx` indicates where the grad_op sequence starts.) and creates all the uncreated outputs. The *pseudo-code* shows this process: + +``` +for op in block.ops[start_op_idx : ]: + + if op has an attribute named 'sub_block': + Get the sub-block(`s_block`) from op's attribute. + Invoke _append_backward_vars_(), with `block=s_block` + + for var_name in op.all_output_names(): + if block.has_var_recursive(var_name) or var_name is the name of empty variable: + continue + create a new variable named 'var_name' in block + if grad_to_var.has_key(var_name): + set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name. block) + + do op's var type inference + do op's shape inference +``` diff --git a/doc/fluid/design/modules/batch_norm_op.md b/doc/fluid/design/modules/batch_norm_op.md new file mode 100644 index 0000000000000000000000000000000000000000..e451ffcc73b5de2b911e1c6de54b42a5d1d54c37 --- /dev/null +++ b/doc/fluid/design/modules/batch_norm_op.md @@ -0,0 +1,134 @@ +# Batch Normalization + +## What is batch normalization + +Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and make the data distribution easier for next layer's training. + +The principle of batch normalization can be summarized into a simple function: + +``` +y = (x - E[x]) / STD[x]) * scale + bias +``` + +`x` is a batch of output data of a certain layer. `E[x]` and `STD[x]` is the mean and standard deviation of `x`, respectively。 `scale` and `bias` are two trainable parameters. The training of batch normalization layer equals to the learning of best values of `scale` and `bias`. + +In our design, we use a single operator(`batch_norm_op`) to implement the whole batch normalization in C++, and wrap it as a layer in Python. + +## Differences with normal operators + +`batch_norm_op` is a single operator. However, there are a few differences between `BatchNormOp` and normal operators, which we shall take into consideration in our design. + +1. `batch_norm_op` shall behave differently in training and inferencing. For example, during inferencing, there is no batch data and it's impossible to compute `E[x]` and `STD[x]`, so we have to use an `estimated_mean` and an `estimated_variance` instead of them. These require our framework to be able to inform operators current running type (training/inferencing), then operators can switch their behaviors. + +2. `batch_norm_op` shall have the ability to maintain `estimated_mean` and `estimated_variance` across mini-batch. 
In each mini-batch, `estimated_mean` is updated by the following equations:
+
+```
+if batch_id == 0
+  estimated_mean = E[x]
+else
+  estimated_mean = estimated_mean * momentum + (1.0 - momentum) * E[x]
+```
+
+The update of `estimated_variance` is similar. `momentum` is an attribute, which controls the updating speed of `estimated_mean`.
+
+## Implementation
+
+Batch normalization is designed as a single operator in C++, and then wrapped as a layer in Python.
+
+### C++
+
+As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attributes and compute kernels.
+
+#### Inputs
+
+- `x`: The input data, which is generated by the previous layer.
+- `estimated_mean`: The estimated mean of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `E[x]`.
+- `estimated_var`: The estimated standard deviation of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `STD[x]`.
+- `scale`: trainable parameter 'scale'
+- `bias`: trainable parameter 'bias'
+
+#### Outputs
+
+- `y`: The output data.
+- `batch_mean`: The mean value of batch data.
+- `batch_var`: The standard deviation value of batch data.
+- `saved_mean`: Updated `estimated_mean` with current batch data. It's supposed to share the memory with input `estimated_mean`.
+- `saved_var`: Updated `estimated_var` with current batch data. It's supposed to share the memory with input `estimated_var`.
+
+#### Attributes
+
+- `is_infer`: *bool*. If true, run `batch_norm_op` in inferencing mode.
+- `use_global_est`: *bool*. If true, use `saved_mean` and `saved_var` instead of `E[x]` and `STD[x]` in training.
+- `epsilon`: *float*. The epsilon value to avoid division by zero.
+- `momentum`: *float*. Factor used in `estimated_mean` and `estimated_var` updating. The usage is shown above.
+
+#### Kernels
+
+The following graph shows the training computational process of `batch_norm_op`:
+
+![batch_norm_op kernel](./images/batch_norm_op_kernel.png)
+
+cuDNN provides APIs to finish the whole series of computation; we can use them in our GPU kernel.
+
+### Python
+
+`batch_norm_op` is wrapped as a layer in Python:
+
+```python
+def batch_norm_layer(net,
+                     input,
+                     output,
+                     scale,
+                     bias,
+                     use_global_est = False,
+                     epsilon = 1e-6,
+                     momentum = 0.99):
+    mean_cache = scope.new_var(name = 'estimated_mean', trainable = False)
+    var_cache = scope.new_var(name = 'estimated_var', trainable = False)
+    batch_mean = scope.new_var(name = 'batch_mean')
+    batch_var = scope.new_var(name = 'batch_var')
+    batch_norm_op = Operator('batch_norm_op',
+                             x = input,
+                             estimated_mean = mean_cache,
+                             estimated_var = var_cache,
+                             scale = scale,
+                             bias = bias,
+                             y = output,
+                             batch_mean = batch_mean,
+                             batch_var = batch_var,
+                             saved_mean = mean_cache,
+                             saved_var = var_cache,
+                             is_infer = False,
+                             use_global_est = use_global_est,
+                             epsilon = epsilon,
+                             momentum = momentum)
+    net.append_op(batch_norm_op)
+    return output
+```
+
+Because the Python API has not been finally decided, the code above can be regarded as pseudo code. There are a few key points we shall note:
+
+1. `estimated_mean` and `estimated_var` are assigned the same variables as `saved_mean` and `saved_var` respectively, so they share the same memory. The output mean and variance values (`saved_mean` and `saved_var`) of a certain batch will be the inputs (`estimated_mean` and `estimated_var`) of the next batch.
+
+2. `is_infer` decides whether `batch_norm_op` will run in training mode or inferencing mode.
However, a network may contain both training and inferencing parts, and the user may switch `batch_norm_op`'s running mode in a Python `for` loop like this:
+
+```python
+for pass_id in range(PASS_NUM):
+    # ...
+    net.train()  # run training model
+    if pass_id % 100 == 0:
+        net.infer(test_image)  # run inferencing model
+    # ...
+```
+
+`is_infer` is an attribute. Once an operator is created, its attributes cannot be changed. This suggests that we shall maintain two `batch_norm_op`s in the model, one whose `is_infer` is `True` (we call it `infer_batch_norm_op`) and the other whose `is_infer` is `False` (we call it `train_batch_norm_op`). They share all parameters and variables, but are placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one going through `train_batch_norm_op` and the other going through `infer_batch_norm_op`:
+
+![batch_norm_fork](./images/batch_norm_fork.png)
+
+
+Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will be duplicated.
+
+When the net runs in training mode, the end of the left branch will be set as the running target, so the dependency tracking process will ignore the right branch automatically. When the net runs in inferencing mode, the process is reversed.
+
+How to set a target is related to the Python API design, so I will leave it here waiting for more discussions.
diff --git a/doc/fluid/design/modules/evaluator.md b/doc/fluid/design/modules/evaluator.md
new file mode 100644
index 0000000000000000000000000000000000000000..de9605b0e67a035ab1ef1e4cafbe838f83bc5807
--- /dev/null
+++ b/doc/fluid/design/modules/evaluator.md
@@ -0,0 +1,58 @@
+# Evaluator Design
+
+## Problem Statement
+
+During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator-based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics over every N passes/batches the user wants.
+
+## Evaluator Design
+Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
+
+1. Initialize the metric state and add it into the block.
+
+2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a minibatch of data if run once.
+
+
+3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/multi-GPU training, aggregate the value from different devices.
+
+## Implementation
+This design is shown in the Python API.
+Each metric operator needs to calculate the metric statistics and return the batch-aware states. The Python side is responsible for accumulating the states for each pass.
+
+
+```python
+class Evaluator(object):
+    """
+    Evaluator Base class.
+    """
+    def __init__(self, name, **kwargs):
+        """
+        Different evaluators may have different metric states. E.g., accuracy needs two variables, total and right sample counts.
+        AUC needs four variables, `true_positives`,
+        `true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create its needed variables and append them to main_program.
+
+        The initialization of Evaluator should be responsible for:
+        create metric states and append to the main_program
+        """
+        pass
+
+    def _update_ops(self, input, label, **kwargs):
+        """
+        Add mini-batch evaluator calculate operators to the main_program.
+        Add increment operator to accumulate the metric states.
+        """
+
+
+    def reset(self, executor, reset_program=None):
+        """
+        Reset metric states at the beginning of each pass/user-specified batch number.
+        Execute the reset_program to reset the states.
+        """
+
+
+    def eval(self, executor, eval_program=None):
+        """
+        Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
+        Execute the eval_program and return the result.
+ """ + return eval_result +``` diff --git a/doc/fluid/design/modules/images/batch_norm_fork.dot b/doc/fluid/design/modules/images/batch_norm_fork.dot new file mode 100644 index 0000000000000000000000000000000000000000..4bc47713cba2cb23f1b34fffe6426ef10ac3a9df --- /dev/null +++ b/doc/fluid/design/modules/images/batch_norm_fork.dot @@ -0,0 +1,25 @@ +digraph ImageBatchNormForkGragh { + subgraph cluster_before { + Prev [label="...", shape=plaintext]; + Rnn [label="rnn_op", shape=box]; + BatchNorm [label="batch_norm_op", shape=box]; + Fc [label="fc_op", shape=box]; + After [label="...", shape=plaintext]; + Prev -> Rnn -> BatchNorm -> Fc -> After; + label="original"; + } + + subgraph cluster_after { + Prev2 [label="...", shape=plaintext]; + Rnn2 [label="rnn_op", shape=box]; + BatchNorm2_1 [label="train_batch_norm_op", shape=box]; + BatchNorm2_2 [label="infer_batch_norm_op", shape=box]; + Fc2_1 [label="fc_op", shape=box]; + Fc2_2 [label="fc_op", shape=box]; + After2_1 [label="...", shape=plaintext]; + After2_2 [label="...", shape=plaintext]; + Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1; + Rnn2 -> BatchNorm2_2 ->Fc2_2 ->After2_2 + label="forked"; + } +} diff --git a/doc/fluid/design/modules/images/batch_norm_fork.png b/doc/fluid/design/modules/images/batch_norm_fork.png new file mode 100644 index 0000000000000000000000000000000000000000..aded62bce5bc268b7a3ef4dc96c89fe21d6ea955 Binary files /dev/null and b/doc/fluid/design/modules/images/batch_norm_fork.png differ diff --git a/doc/fluid/design/modules/images/batch_norm_op_kernel.png b/doc/fluid/design/modules/images/batch_norm_op_kernel.png new file mode 100644 index 0000000000000000000000000000000000000000..a99ce81ff3bf42880ebbd6a1297de3bf038e09b2 Binary files /dev/null and b/doc/fluid/design/modules/images/batch_norm_op_kernel.png differ diff --git a/doc/fluid/design/modules/images/feed_forward.png b/doc/fluid/design/modules/images/feed_forward.png new file mode 100644 index 0000000000000000000000000000000000000000..d312371a04c26aa6cd196e0bd1f51becb425180b Binary files /dev/null and b/doc/fluid/design/modules/images/feed_forward.png differ diff --git a/doc/fluid/design/modules/images/feed_forward_regularized.png b/doc/fluid/design/modules/images/feed_forward_regularized.png new file mode 100644 index 0000000000000000000000000000000000000000..677e99bfd9f8e72ed9fe4b27127af2ced202f447 Binary files /dev/null and b/doc/fluid/design/modules/images/feed_forward_regularized.png differ diff --git a/doc/fluid/design/modules/images/l1_regularization.png b/doc/fluid/design/modules/images/l1_regularization.png new file mode 100644 index 0000000000000000000000000000000000000000..e1b9c7a44f94dc027598a98da93ddb8133190972 Binary files /dev/null and b/doc/fluid/design/modules/images/l1_regularization.png differ diff --git a/doc/fluid/design/modules/images/l2_regularization.png b/doc/fluid/design/modules/images/l2_regularization.png new file mode 100644 index 0000000000000000000000000000000000000000..d5c2fcbc2ccae75ad083162e5a2dceb0210be298 Binary files /dev/null and b/doc/fluid/design/modules/images/l2_regularization.png differ diff --git a/doc/fluid/design/modules/images/loss_equation.png b/doc/fluid/design/modules/images/loss_equation.png new file mode 100644 index 0000000000000000000000000000000000000000..14212ec8d36c803de96bde8a9a4b5591bd20434e Binary files /dev/null and b/doc/fluid/design/modules/images/loss_equation.png differ diff --git a/doc/fluid/design/modules/index_cn.rst b/doc/fluid/design/modules/index_cn.rst new file mode 100644 index 
0000000000000000000000000000000000000000..b25783f0f5120991c29ba31b7b512bd4c183eecf --- /dev/null +++ b/doc/fluid/design/modules/index_cn.rst @@ -0,0 +1,14 @@ +代码结构和重要模块 +----------------- + +.. toctree:: + :maxdepth: 1 + + backward.md + python_api.md + regularization.md + infer_var_type.md + optimizer.md + prune.md + register_grad_op.md + net_op_design.md diff --git a/doc/fluid/design/modules/index_en.rst b/doc/fluid/design/modules/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..2108156e080996916f2650448f0a56f998757204 --- /dev/null +++ b/doc/fluid/design/modules/index_en.rst @@ -0,0 +1,14 @@ +Code Structure and Important Modules +------------------------------------- + +.. toctree:: + :maxdepth: 1 + + backward.md + python_api.md + regularization.md + infer_var_type.md + optimizer.md + prune.md + register_grad_op.md + net_op_design.md diff --git a/doc/fluid/design/modules/infer_var_type.md b/doc/fluid/design/modules/infer_var_type.md new file mode 100644 index 0000000000000000000000000000000000000000..d9d5397becba2ef1806d9341cd49cd9aabbf4a6a --- /dev/null +++ b/doc/fluid/design/modules/infer_var_type.md @@ -0,0 +1,78 @@ +# Design Doc: InferVarType + +## The Problem Posed + +The variable in our design can hold variant types. Such as `LoDTensor` and `SelectedRows`. An operator should be able to inference the variable types of its output. + +For example, a `lookup table` operator takes two `LoDTensor`; one is a float tensor as the embedding table, the other is an int tensor as word ID. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs and will generate a `LoDTensor` if any of its inputs is `LoDTensor`, otherwise, the `sum` operator will generate `SelectedRows` as its output. + +The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameter) or be inferred by the operator in compile time. + +## Proposed Solution + +The `InferVarType` is a compile-time function which is registered to each operator. The inferface of that function is: + + +```c++ +using InferVarTypeFN = std::function< + void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>; +``` + +It takes an operator description as its input and will write the output variable type and store them in block description. + +The `InferVarTypeFN` will be registered in `OpInfo`, to replace `infer_var_type_` field. The `OpInfo` should be + +```cpp +struct OpInfo { + InferVarTypeFN infer_var_type_; + ... +}; +``` + +The default `InferVarType` will set output type as `LoDTensor`. It can be done by `GetInferVarType()`. + +```cpp +void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) { + // set the output type of variable as `LoDTensor`. + // ... +} + +struct OpInfo { + InferVarTypeFN infer_var_type_; + InferVarTypeFN GetInferVarType() const { + if (infer_var_type_) { + return infer_var_type_; + } else { + return DefaultInferVarType; + } + } +}; +``` + +## Register InferVarType + +We provide a thin base class for registering an `InferVarTypeFN`. To use a base class will ease the implementation of registry since we can detect the registry entry is an `InferVarTypeFN` or not. + +```cpp +class VarTypeInferer { +public: + virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0; +} +``` + +Operator developers can write the specialize `VarTypeInferer` as follow. 
+ +```cpp +class SpecialVarTypeInferer : public VarTypeInferer { +public: + virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const { + // .. own logic + } +} +``` + +Then user can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`. + +``` +REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...); +``` diff --git a/doc/fluid/design/modules/net_op_design.md b/doc/fluid/design/modules/net_op_design.md new file mode 100644 index 0000000000000000000000000000000000000000..e64ac2fb1c6898bfeb883250347da3d9a4757b97 --- /dev/null +++ b/doc/fluid/design/modules/net_op_design.md @@ -0,0 +1,250 @@ +# Network Design + +`Network` is the container and controller of a set of operators, +user can build a real network from a `NetDesc` which is a protobuf message +and use `Network.Run()` to run all the operators in the network. + +A network object knows all Operators belonging to this network. Variables, +which are inputs and outputs of these operators, +are created and managed by a hierarchy of Scope objects. + +## API + +### Net +To make the `Network` extendable, a base class is defined like this + +```c++ +// operator's index stored in a network. +typedef int OpIndex; + +// The minimum a network should be implemented. +class Net { + public: + // run all the operators and return success(true) or not, with all the + // variables are located in `scope`. `context` describes the detail execution + // environment for ops. `begin` and `end` specify the scope of `ops_` to run, + // If no positive indexes are provided, all operators in `ops_` will run. + virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1, + OpIndex end = -1) const = 0; + + // Add an Operator according to `def`. + virtual OpIndex AddOp(const proto::OpDef &def) = 0; + + // Add optimizer operators acctording to `attrs`. + virtual Error AddOptimizerOps(const OptAttrs &attrs) = 0; + + // Add backward operators. + virtual Error AddBackwardOps() = 0; + + // Infer the shapes of variables required by operators in the network. The + // `scope` will be mutated according to the inferred shapes. + + static std::unique_ptr Create(const NetDesc &def = NetDesc()); +}; +``` + +All network implementations should build networks from a protobuf message which +describes the structure of a real network; `Run` method should be implemented by +all implementations to offer a universal method to forward or backward compute a network. + +`Net::Create` is a method of factory pattern and can be implemented like + +```c++ +std::unique Net::Create(const NetDesc& def) { + switch (def.model_type()) { + case NN: + return new Network(def); + case Recursive: + return new RecursiveNet(def); + case Recurrent: + return new RecurrentNet(def); + } + return nullptr; +} +``` + +Network is designed as the container of operators. to make it more extendable, +we decouple it from the related variable resources. + +`Run(Scope* scope)` takes the scope as a argument so that it can run in different scopes. + +Finally, `Net` can be used as followed + +```c++ +Scope default_scope; +OpContext default_context; +auto net = Net::CreateNet(def); + +if (net) { + net.Run(&default_scope, &default_context); +} +``` + +### `PlainNet` as a simple implementation of `BaseNet` + +A very basic implementation is as follows. All it does is simply to run every operators in sequence. + +```c++ +class PlainNet : public Net { + public: + // Create a network describe by `def`. NetDesc is the definition of a network. 
+ PlainNet(const NetDesc &def); + + // Infer all the operators' input and output varialbes' shapes, will be called before every mini-batch + training. + virtual Error InferShape(Scope *scope) override; + + // Run all the operators with the `scope`, if no scope is provided, default + // scope will be used instead. If no OpContext is provicded, default context will be used. + virtual Error Run(Scope *scope = nullptr, OpContext *context=nullptr, OpIndex begin = -1, + OpIndex end = -1) const override; + + virtual OpIndex AddOp(const proto::OpDef &def) override; + + virtual Error AddOptimizerOps(const OptAttrs &attrs) override; + + virtual Error AddBackwardOps() override; + + protected: + // Create operators accordding to `def`, will be called by the constructor. + Error BuildNet(const NetDesc &def); + + // Add a operator which is identified as `type` and has attributes described + // in `attrs`, the `inputs` are the keys of readonly input variables, + // `outputs` are keys of mutable output variables. An `OpIndex` will be + // returned to indicate the offset of the new operator in `ops_`. + OpIndex AddOp(const std::string &type, const std::vector &inputs, + const std::vector &outputs, + const OprAttr &attrs = OprAttr()); + + private: + // the operators owned by `Network`. + std::vector ops_; +}; +``` + +`PlainNet` will create operators so that a private member `ops_` is defined, +the operators are created by `CreateNet`, and each operator is created by `AddOp`. + + +## PlainNet Usage +`PlainNet` can be used to define and run a network as follows + +```c++ +// create an empty scope located on CPU device. +Scope scope(CPUPlace()); + +// create and init variables described in `net_desc`. +scope.CreateVariables(net_desc); +scope.InitVariables(net_desc); + +// create a network according to `net_desc` +auto net = Net::CreateNet(net_desc); +// Add more operators if needed. +net->AddOp(add...); +net->AddOp(fc...); + +net->AddBackwardOps(); +net->AddOptimizerOps(); + +// run the network providing the `scope`. +net.Run(&scope); +``` + +## `NetBuilder` as a C++ syntax wrapper +This is a detailed description of the user-related C++ network API, and may not needed in the prototype development stage. + +The `NetBuilder` will give users a much simpler syntax as follows to create a network, and demonstrates how to use the `BaseNet`'s raw interfaces. + +```c++ +Variable* fc_out = builder.AddOp("fc", input=image, size=100, activation="Sigmoid"); +Variable* prediction = builder.AddOp("fc", input=fc_out, size=10, activation="Sigmoid"); +Variable* loss = builder.AddOp("cross_entropy", input=prediction, label=label); +Variable* avg_loss = builder.AddOp("mean", loss); + +builder.BackwardFrom(avg_loss) +builder.AddOptimization(1e-4, "adam"); +builder.Run(); +``` + +`NetBuilder` will call `Net` 's virtual functions to change the real network structure, here is a sample definition + +```c++ +class NetBuilder final { + public: + NetBuilder(Net* net) : net_(net) {} + + Variable* AddOp(const string& type, const vector& inputs, + size_t size, Activation act) { + // much code here. + // ... + net_->AddOp(def); + need_rebuild_net_ = true; + net_->InferShape(); + // ... + } + + Error BackwardFrom(const Variable& cost); + + Error Run(Scope* scope, OpContext* context, bool need_backward = true) { + // backward. + if (need_backward) { + if (need_rebuild_net_) { + AddBackwardOps(); + AddOptimizerOps(); + } + net_->Run(scope, context); + return; + } + // just forward. 
+ net_->Run(scope, context, 0, last_forward_op_); + } + + protected: + Error AddBackwardOps(); + Error AddOptimizerOps(); + + private: + Net* net_; + OpIndex last_forward_op_{-1}; + bool need_rebuild_net_{true}; +} +``` + +### Compatibility with RNN + +Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design, +for example we can implement a simple recurrent neural network as follows + +```c++ +// copy some `vars` form `source` to `target` +void Copy(const Scope &source, Scope &target, + const std::vector &vars); + +Scope default_scope; +// some initial mutations on `default_scope` here. + +auto rnn_step_net = PlainNet(rnn_step_net_def); + +// Create rnn's states, the last scope is used to store rnn outputs. +Scope *rnn_states = new Scope[num_states + 1]; + +for (int i = 0; i < num_states + 1; i++) { + // Initialize all rnn state scopes, copy parameters and so on. + rnn_states[i].CreateVars(rnn_step_net_def); + Copy(default_scope, rnn_states[i], rnn_related_vars); + // Prepare rnn's inlinks, just copy inlink variables to each state. + Copy(default_scope, rnn_states[i], inlink_vars); +} + +// Run the rnn. +for (int i = 0; i < num_states; i++) { + rnn_step_net.Run(rnn_states[i]); + // Copy current state's state variables to next state, the related variables + // are named like "previous_state_xxx". + Copy(rnn_states[i], rnn_states[i + 1], pre_state_vars) +} + +// Copy rnn's final outputs to `default_scope`. +Copy(rnn_states[num_states], default_scope, outlink_vars); +``` diff --git a/doc/fluid/design/modules/optimizer.md b/doc/fluid/design/modules/optimizer.md new file mode 100644 index 0000000000000000000000000000000000000000..1c25fde9cafb322f789662077d3fc6cc1d64ce38 --- /dev/null +++ b/doc/fluid/design/modules/optimizer.md @@ -0,0 +1,91 @@ +# Optimizer Design + +## The Problem + +A PaddlePaddle program, or a block, is a sequence of operators operating variables. A training program needs to do three kinds of works: + +1. the forward pass, which computes intermediate results and the cost(s), +1. the backward pass, which derives gradients from intermediate results and costs, and +1. the optimization pass, which update model parameters to optimize the cost(s). + +These works rely on three kinds of operators: + +1. forward operators, +1. gradient operators, and +1. optimization operators. + +It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically. + +In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass. + + +## High-level Python API to describe the training process + +1. User write code to describe the network: + + ```python + images = layer.data("images") + labels = layer.data("labels") + w1 = pd.var("w1") + b1 = pd.var("b1") + hidden = layer.fc(images, w=w1, b=b1) + cost = layer.mse(hidden, labels) + ``` + + The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). + + +2. Users create a certain kind of Optimizer with some argument. + + ```python + optimizer = AdagradOptimizer(learing_rate=0.001) + ``` + +3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list. 
+ + ```python + opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1]) + ``` + The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is list of optimization operators that will be run by session. + +4. Users use Session/Executor to run this opt_op_list as target to do training. + + ```python + sess.run(target= opt_op_list, ...) + ``` + +### Optimizer Python interface: + +```python +class Optimizer(object): + """Optimizer Base class. + + """ + + def __init__(self): + pass + + def create_optimization_pass(self, parameters_and_grads): + """Add optimization operators to update gradients to variables. + + Args: + parameters_and_grads: a list of (variable, gradient) pair to update. + + Returns: + optmization_op_list: a list of optimization operator that will update parameter using gradient. + """ + return None + + def minimize(self, loss, parameter_list): + """Add operations to minimize `loss` by updating `parameter_list`. + + This method combines interface `append_backward()` and + `create_optimization_pass()` into one. + """ + params_grads = self.create_backward_pass(loss, parameter_list) + update_ops = self.create_optimization_pass(params_grads) + return update_ops + +``` + +Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer. diff --git a/doc/fluid/design/modules/prune.md b/doc/fluid/design/modules/prune.md new file mode 100644 index 0000000000000000000000000000000000000000..4a5cf10c79a554779137f0cce5494fdd96ef6b7a --- /dev/null +++ b/doc/fluid/design/modules/prune.md @@ -0,0 +1,63 @@ +# Prune + +## Motivation + +We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement +`void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc` +and generate a pruned `ProgramDesc`. + +## Challenge + +Pruning need to support both variables and operators being evaluation targets. Consider the following +different situations. + +```python +# Case 1: run foward pass. +cost_np = session.run(target=cost) +# Case 2: run backward passing. +opts_np, _ = session.run(target=[cost, opt]) +# Case 3: run checkpointing +_ = session.run(target=checkpoint) +``` + +## Solution + +To support evaluation of operators, we add `is_target` field in the `OpDesc`. + +```c++ +message OpDesc { + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; + optional bool is_target = 5 [ default = false ]; +}; +``` + +To support evaluation of variables, we add [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599). +For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with `variable` being +`fetch_op`'s input. Then we also set `fetch_op` is a target. + +### Algorithm + +If an operator needs to be run, it must fall into one of the following cases: + +1. It is the target. +2. It is depended by some other ops, meaning its output is some other op's input. + +The first case can be checked by `op_desc.is_traget()` . 
The second case can be implement as + +```c++ +bool HasDependentVar(const OpDesc& op_desc, const std::set& dependent_vars) { + for (auto& var : op_desc.outputs()) { + for (auto& argu : var.arguments()) { + if (dependent_vars.count(argu) != 0) { + return true; + } + } + } + return false; +} +``` + +Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc). diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md new file mode 100644 index 0000000000000000000000000000000000000000..83af4e55485c079265d3f2b1e15070825b532c02 --- /dev/null +++ b/doc/fluid/design/modules/python_api.md @@ -0,0 +1,325 @@ +# Design Doc: Python API + +Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program. + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Python classes | Protobuf messages |
|----------------|-------------------|
| Program        | ProgramDesc       |
| Block          | BlockDesc         |
| Operator       | OpDesc            |
| Variable       | VarDesc           |
+ + +Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages. + +## Core Concepts + +### Program + +A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), which is composed of an array of `BlockDesc`s. The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks. + +Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`. + +```python +class Program(objects): + def __init__(self): + self.desc = core.NewProgram() # a C++ ProgramDesc pointer. + self.blocks = vector() + self.blocks.append(Block(self, -1)) # the global block + self.current_block = 0 # initialized to the global block + + def global_block(): + return self.blocks[0] + + def current_block(): + return self.get_block(self.current_block) + + def rollback(): + self.current_block = self.current_block().parent_idx + + def create_block(): + new_block_idx = len(self.block) + self.blocks.append(Block(self, self.current_block)) + self.current_block = new_block_idx + return current_block() +``` + +`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`. + +`Program` creates the first block as the global block in its constructor. All parameters and their initializer operators are in the global block. + +### Block + +A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md) includes + +1. a map from variable names to an instance of the Python `Variable` class, and +1. a list of `Operator` instances. + +```python +class Block(objects): + def __init__(self, program, parent_idx): + self.desc = core.NewBlock(program.desc) + self.program = program + self.vars = map() + self.ops = vector() + self.parent_idx = parent_idx + + def create_var(self, ...): + return Variable(self, ...) + + def _create_global_var(self, ...): + program.global_block().create_var(...) + + def create_parameter(self, name, ...): + # Parameter is a subclass of variable. See Parameter section for details. + self.vars[name] = Parameter(self._create_global_var(...), ...) + return self.vars[name] + + def append_operator(self, ...): + self.ops.append(Operator(self, ...)) + + def _prepend_operator(self, ...): # Parameter's ctor prepands initialize operators. + self.ops.prepend(Operator(self, ...)) +``` + +`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator. + +`_prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. + +### Operator + +The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes. 
+ +```python +class Operator(object): + def __init__(self, + block, # Block + type, # string + inputs, # dict + outputs,# dict + attrs # dict + ): + self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs) + core.infer_shape(self.desc, inputs, outputs) + + def type(self): + return self.desc.type() +``` + +`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++. + +### Variable + +Operators take Variables as its inputs and outputs. + +```python +class Variable(object): + def __init__(self, + block=None, # Block + name=None, # string + shape, # tuple + dtype="float32", # string + lod_level=None # int + ): + if name is None: + name = unique_name_generator() + self.name = name + self.block = block + self.desc = core.NewVarDesc(block.desc, name, shape, lod_level) + self.writer = None +``` + +Please be aware of `self.writer`, that tracks operator who creates the variable. It possible that there are more than one operators who write a variable, but in Python space, each write to a variable is represented by a Variable class. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**. + +### Parameter + +A parameter is a global variable with an initializer (or load) operator. + +```python +class Parameter(Variable): + def __init__(self, + block=None, # Block + name=None, # string + shape, # tuple + dtype="float32", # string + lod_level=None # int + trainable, # bool + initialize_op_attrs, + optimize_op_attrs): + super(Parameter, self).__init__(block, name, shape, dtype, lod_level) + self.trainable = trainable + self.optimize_op_attrs = optimize_op_attrs + block.prepend(Operator(block, # Block + initialize_op_attrs['type'], # string + None, # no inputs + self, # output is the parameter + initialize_op_attrs) +``` + +When users create a parameter, they can call + +```python +program.create_parameter( + ..., + init_attr={ + type: "uniform_random", + min: -1.0, + max: 1.0, + }) +) +``` + +In above example, `init_attr.type` names an initialize operator. It can also name the load operator + +```python +init_attr={ + type: "load", + filename: "something.numpy", +} +``` + +`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message. + +## Layer Function + +A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers. + +Layer functions take `Variable` and configuration parameters as its input and return the output variable(s). + +For example, `FullyConnected` take one or more variable as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable. + + +### Necessity for reusing code between layer functions + +There are a lot of code that can be reused. Such as + +* Give the default value of configuration. e.g., default initialize strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`. and default initialize strategy for bias is to fill zero. +* Append the activation operator. +* Create a temporary variable. +* Create parameter. +* Generate a unique name. +* Add a bias. +* ... 
+ +A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions. + + + +### Comparision between global functions and helper class + +The `FullyConnected` layer will be as follow when we provide global functions: + +```python +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + if name is None: + name = unique_name("fc") + input = multiple_input(input) + param_attr = default_param_attr(param_attr) + param_attr = multiple_param_attr(param_attr, len(input)) + + # mul + mul_results = [] + for ipt, attr in zip(input, param_attr): + shape = ipt.shape[1:] + [size] + w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr) + tmp = create_tmp_var(name) + g_program.current_block().append_op("mul", {ipt, w}, {tmp}) + mul_results.append(tmp) + + # add sum + ... + # add bias + ... + # add activation + ... + return out +``` + +We can provide many helpers functions for layer developers. However, there are several disadvantages for global helper functions: + +1. We need a namespace for these methods, then layer developers can quickly figure out what method they can use. +2. Global functions will force layer developers to pass its parameter time by time. + +So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` Layer will be as follow. + +```python +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + helper = LayerHelper(locals()) # pass all parameter to LayerHelper + + mul_results = [] + for ipt, param in helper.iter_multiple_input_and_param(): + w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype) + tmp = helper.create_tmp_variable() + helper.append_op('mul', {ipt, w}, {tmp}) + mul_results.append(tmp) + + pre_bias = helper.add_sum(mul_results) + pre_activation = helper.add_bias(pre_bias) + return helper.add_activation(pre_activation) +``` + +We not only use the fewer lines of code to write `fc_layer` but also make the code clearer to understand. At the same time, layer developers can figure out what function they can invoke by typing `helper.` in a python editor. + + +### Implementation of layer helper + +We just keep all parameters of a layer function as a dictionary in layer helper as a private data member. Every method of layer helper will look up the dictionary after it is invoked. In that way, we can implement a layer helper for all layer functions even some layer does not contain some operator. For example, The `activation` is used by the FullyConnected layer or convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` are: + +```python +class LayerHelper(object): + def __init__(self, **kwargs): # kwargs is short for `keyword arguments` + self.kwargs = kwargs + + def add_activation(self, input_var): + act = self.kwargs.get("act", None) # default value is None + if act is None: # do nothing if no act + return input_var + + tmp = self.create_tmp_var(self) + self.append_op(type=act, input=input_var, output=tmp) + return tmp +``` + +### Return value of layer functions + +The layer will return a Variable, which is also the output of an operator. However, outputs of a layer function have more attributes than an operator. There are parameter variables, and their gradient variables need to return. 
To return them is useful. For example, + +1. Users can debug the network by printing parameter gradients. +2. Users can append attributes to a parameter, such as, `param.stop_gradient=True` will make a parameter stop generate the gradient. We can fix the parameter value during training by using this attribute. + +However, it is good to return a Variable for layers, since all layers and operators use Variables as their parameters. We can just append a `param` field and a `grad` field for layer function since the Python is dynamic typing. + +The sample usage is + +```python +data = fluid.layers.data(...) +hidden = fluid.layers.fc(data, ...) +... + +executor.run(fetch_list=[hidden.param, hidden.param.grad], ...) +``` + + +## Optimizer + +[Optimizer Design Doc](./optimizer.md) diff --git a/doc/fluid/design/modules/register_grad_op.md b/doc/fluid/design/modules/register_grad_op.md new file mode 100644 index 0000000000000000000000000000000000000000..8d973eb53178c3e889c845144553a453e11f067c --- /dev/null +++ b/doc/fluid/design/modules/register_grad_op.md @@ -0,0 +1,92 @@ +# Design Doc: Gradient Operators Registration + + +## The Problem Posed + +Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance. + +However, we noticed two problems with the current design: + +1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and inserts corresponding `OpDesc` messages into the `ProgramDesc` message. + +1. For some operators, the gradient computation can be written in terms of existing operators. For example, the gradient of *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator. Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation. + +## The Current Implementation + +Instances of the C++ class `OpInfo` are stored an associative map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create the gradient operator by invoking `OpInfo::creator_` of the gradient operator. The pseudo code is as follows + +```cpp +struct OpInfo { + std::function creator_; + std::string grad_op_type_; + ... +}; + +map OpInfoMap; + +OperatorBase* CreateGradientOperator(const OperatorBase& op) { + return OpInfoMap.at(op.Type()).creator_(...); +} +``` + +## Proposed Solution + +The mapping relationship between an operator and its gradient operators is a function. The interface of this function is: + +```cpp +// (OpDesc) --> vector +std::function(const OpDescBind&)>; +``` + +The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for the protobuf message `OpDesc` for rapid manipulation of `OpDesc`. + +The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like + +```cpp +struct OpInfo { + std::function>(const OpDescBind&)> grad_op_maker_; + ... +}; +``` + +The `grad_op_maker_ ` is a `nullptr` if the operator does not have any associated gradient operators. + +We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. 
The public interface of that class is + +```cpp +class GradOpDescMakerBase { +public: + GradOpDescMakerBase(const OpDescBind& ); + virtual std::vector> operator()()const = 0; +}; +``` + +We can convert `GradOpDescMakerBase` to `std::function>(const OpDescBind&)>` by + +```cpp +using GradOpMaker = ...; +std::function(const OpDescBind&)> func; +func = [] (const OpDescBind& fwd_op) { + GradOpMaker maker(fwd_op); + return maker(); +}; +``` + +We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator. + +We should change register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just register one operator. If the `REGISTER_OPERATOR ` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro contains `__VA_ARGS__`. + +The user interface should be + +```cpp +vector MinusOpGradMaker(OpDesc) {...} +REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, SumOpGradMaker); +// Developers can still manually implement gradient operator. +REGISTER_OPERATOR(minus_grad, MinusGradOp); +``` + +The interface of current `REGISTER_OP` macro could not be changed. In `REGISTER_OP`, it will invoke `REGISTER_OPERATOR` two times and generate GradOpDescMaker inside. + +```cpp +REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp); +``` diff --git a/doc/fluid/design/modules/regularization.md b/doc/fluid/design/modules/regularization.md new file mode 100644 index 0000000000000000000000000000000000000000..519a9143033386678351ff78a465e5ba6e220c52 --- /dev/null +++ b/doc/fluid/design/modules/regularization.md @@ -0,0 +1,66 @@ +# Regularization in PaddlePaddle + +## Introduction to Regularization +A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. A frequently faced problem is the problem of **overfitting**, where the model does not make reliable predictions on new unseen data. **Regularization** is the process of introducing additional information in order to prevent overfitting. This is usually done by adding extra penalties to the loss function that restricts the parameter spaces that an optimization algorithm can explore. + +### Parameter Norm Penalties +Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows: + +
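The equation referenced above is an image in the original document and is not reproduced here. As a reconstruction in standard notation (following the convention of the Deep Learning book linked later in this section, with `w` denoting the parameters), the regularized objective is

$$\tilde{J}(w; X, y) = J(w; X, y) + \alpha \, \Omega(w)$$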
+ +The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`. + +The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows: + +##### L2 Regularization: +
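The L2 penalty equation is likewise an image in the original; the standard form of the penalty term (some texts omit the 1/2 factor) is

$$\Omega(w) = \frac{1}{2} \lVert w \rVert_2^2 = \frac{1}{2} \sum_i w_i^2$$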
+ +##### L1 Regularization +
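Reconstructing the missing image again, the standard form of the L1 penalty term is

$$\Omega(w) = \lVert w \rVert_1 = \sum_i \lvert w_i \rvert$$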
+ +A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html). + +## Regularization Survey + +A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey). + +## Proposal for Regularization in PaddlePaddle + +### Low-Level implementation + +In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations: +- L2_regularization_op +- L1_regularization_op + +These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties. + +The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API. + +### Computation Graph + +Below is an example of a really simple feed forward neural network. + +
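The figure itself is not reproduced here. As a rough, hypothetical sketch of such a network written against the `fluid.layers` API (the exact layers in the figure may differ):

```python
import paddle.fluid as fluid

# A minimal feed-forward network: input -> fully connected (softmax) -> cost.
image = fluid.layers.data(name="image", shape=[784], dtype="float32")
label = fluid.layers.data(name="label", shape=[1], dtype="int64")

prediction = fluid.layers.fc(input=image, size=10, act="softmax")
cost = fluid.layers.mean(fluid.layers.cross_entropy(input=prediction, label=label))
```

The parameters created by `fc` are the variables to which the regularization rewrite described next would attach penalty operators.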
+ +The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows: + +
+    +### Python API implementation for Regularization + +Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. + +#### Creation of Regularization ops +There are two possibilities for creating the regularization ops: +1. We create these ops immediately while building the computation graph. +2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added. + +The proposal is to add these ops in a lazy manner just before the backward pass. + +#### Storage of Regularization attributes + +Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters. + +#### High-level API + +In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers). diff --git a/doc/fluid/design/modules/selected_rows.md b/doc/fluid/design/modules/selected_rows.md new file mode 100644 index 0000000000000000000000000000000000000000..1a98839a957612b91b2276b58818623ecc62d1d5 --- /dev/null +++ b/doc/fluid/design/modules/selected_rows.md @@ -0,0 +1,74 @@ +# Design Doc: Selected Rows + +`SelectedRows` is a type of sparse tensor data type, which is designed to support `embedding` operators. The gradient of embedding table is a sparse tensor. Only a few rows are non-zero values in this tensor. It is straight-forward to represent a sparse tensor by the following sparse tensor data structure: + +```cpp +class SelectedRows { + private: + vector rows_; + Tensor value_; + int height_; +}; +``` + +The field `height_` is the first dimension of `SelectedRows`. The `rows` are the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The dimension of `SelectedRows` satisfies `[height_] + value_.shape[1:]`. 
+ +Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`, the `SelectedRows` representation would be: + +``` +x = SelectedRow { + rows = [73, 84], + value = [[1, 2], [3,4]] +} +``` + + +## SelectedRows in Protobuf + +`SelectedRows` is a type of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described in compile-time because the `rows_` and `value_` are dependent on the training data. +So we use `TensorDesc` to unify `data_type` and `dims`. A LodTensorDesc contains a `TensorDesc` and `lod_level`. The description of `SelectedRows` is a Tensor description. + +```proto +message TensorDesc { + required DataType data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] +} + +message LodTensorDesc { + required TensorDesc tensor = 1; + optional int lod_level = 2; +} + +message VarDesc { + required string name = 1; + enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + } + required VarType type = 2; + optional LodTensorDesc lod_desc = 3; + optional TensorDesc selected_rows_desc = 4; + optional bool persistable = 5 [ default = false ]; +} +``` + +## InferShape for Selected Rows + +Just like `LoD` information, `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or `Dense` tensor. + +For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should be like following + +```cpp +void TableLookupGrad::InferShape(context) { + ... + context.SetDataType("Embedding.Grad", kSelectedRows); +} +``` + + +## Sparse Operators + +There are several operators that need to be written to support `SelectedRows`. These are: + +1. Operators which generate `SelectedRows` gradient. e.g. Gradient of `TableLookupOp`. +2. Optimize operators which support `SelectedRows` gradient. e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator. `OpWithKernel::Run` should select a suitable kernel for both `dense` tensor or `SelectedRows`. diff --git a/doc/fluid/design/motivation/api.md b/doc/fluid/design/motivation/api.md new file mode 100644 index 0000000000000000000000000000000000000000..bc222564e3ec28e306ca0572b6a23104f6e9cbc5 --- /dev/null +++ b/doc/fluid/design/motivation/api.md @@ -0,0 +1,261 @@ +# PaddlePaddle Design Doc + +## Ingredients + +As our design principle is starting from the essence: how could we +allow users to express and solve their problems as neural networks. +Some essential concepts that our API have to provide include: + +1. A *topology* is an expression of *layers*. + +1. A layer could be any kind of computation, including *cost*. + +1. Some layers have parameters, some don't. Most costs don't have + parameters. + +1. In some topologies, layers share parameters. For + example, + [the network for training a ranking model](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850). + +1. At programming time, users specify topologies and possible sharing + of parameters. PaddlePaddle can figure out and create parameters + required (and possibly shared) by one or more topologies. + + +## Starting from Examples + +As a summarization +of +[our disucssion](https://github.com/PaddlePaddle/Paddle/issues/1315), +let us present two examples here: + + +### Example 1. 
Sharing Parameters between Layers + +We use +the +[3-branch ranking](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850) model +in this example. For your convenience, I copy-a-paste the model's +topology as follows: + +``` +A -> f -\ +Q -> f --> cost +B -> f -/ +``` + +The following program trains the topology including the cost, and then +use the sub-network in the trained topology in inference: + +```python +def f(in): + e = paddle.layer.embedding(in, parameter_name="embedding") + o = paddle.layer.softmax(e, parameter_name="semantic") + return o + +# Create 3 topologies (subnets), they share parameters because all +# correspoinding layers have the same parameter names. +fA = f(paddle.layer.data(input_name="A")) +fB = f(paddle.layer.data(input_name="B")) +fQ = f(paddle.layer.data(input_name="Q")) + +topology = paddle.layer.less_than( + paddle.layer.cross_entropy(fA, fQ), + paddle.layer.corss_entropy(fB, fQ)) + +# Derive parameters required in topology and create them in model. +parameters = paddle.parameters.create(topology) + +# Estimate parameters used in topology from data. +paddle.train(topology, parameters, reader=read_ranking_model_data) + +# Inference using fA (or fB or fC, as they share their parameters). +[testA, testB, testQ] = read_ranking_model_data() +print "The sematic-vector of testA: ", paddle.infer(fA, parameters, testA) +``` + + +### Example 2. Sharing Parameters between "Models" + +We use GAN in this example. In the following example program, `d0` and `d1` +correspond to the two networks in the following figure: + + + +```python +def G(in): + # over-simplified example as G has only one layers: + return paddle.layer.fc(in, parameter_name="G") + +def D(in); + # again, over-simplified: + return paddle.layer.fc(in, parameter_name="D") + +# Construct the first topology, which contains both D and G. +# By learning this topology, we update parameters of G. +d0 = paddle.layer.should_be_false(D(G(paddle.layer.data()))) + +# Construct a second topology d1, which contains only D. By +# training this topology, we update parameters of D. Note +# that d1 share parameters with d0. +d1 = paddle.layer.should_be_true(D(paddle.layer.data())) + +# Create parameters from a list of multiple topologies (models) for +# the chance to share parameters between these topologies. +parameters = paddle.parameters.create([d0, d1]) + +# Iterative training of GAN. +for ...: + train(d0, parameters, reader=read_from_rng, immutable_parameters={"D"}) + train(d1, parameters, reader=read_from_realistic_images) + +# Use d1 for inference: +print "D thinks a batch of images are realistic ", infer(d1, parameters, read_mnist_images) +``` + + +### Summarization + + +Above two programs reveal some important design concerns: + +1. Users describe a topology as an expression of layers. Every layer + has a *parameter name*. If the users don't specify it explicitly, it's automatically generated as a unique name. By + specifying the parameter name, users can specify the sharing of + parameters between layers and even between topologies. + +1. `paddle.parameters.create` figures out parameters required by one + or more topologies from parameter names of layers. It creates these + parameters and returns a `ParameterSet` object, which is in essence + a map from *parameter names* to *parameters*. + +1. At training and inference time, `paddle.train` and `paddle.infer` + requires both a topology and the parameter set that holds the parameters of that topology. There are some reasons: + + 1. 
This prevents users from forgetting to call + `paddle.parameters.create`. + 1. `paddle.train` needs to know which parameter set to update. + 1. Users could load another (pre-trained) parameter set and use it + with a topology in `train.infer`. + +1. By specifying the `immutable_parameters` parameter of + `paddle.train`, we can forbid the update of these parameters. + + +## Reader + +Not all programming frameworks allow users to define I/O functions. +An example is Google MapReduce, which can only read from text, +SSTable, and RecordIO files. Hadoop MapReduce allows users to define +readers and writers by deriving from base classes `Reader` and +`Writer`. The former is less flexible but also less error-prone. We +decide to provide the flexibility to users to define their readers. + + +There are some open questions here: + +1. **Should a reader return a Python dictionary?** + +1. **How to map multiple outputs from a reader to multiple data layers?** + +1. **How to easily compose some existing readers to read more data and + feed a topology with more data layers?** + + +## Training + +The recommended way to training a model is to call `paddle.train`, +which simply calls `paddle.trainer.Default`, a global variable of +type `paddle.trainer.SGD`. Equivalently, we can do + +```python +opt = paddle.trainer.SGD(..., paddle.updater.Adam(...)) +opt.train(topology, parameters, reader=read, ...) +``` + +### Updater + +Please be aware that a trainer can accept an updater as its data +member, where an updater is a class derived from +`paddle.trainer.Updater`. This is to make it easier to customize +trainers, as discussed +[here](https://github.com/PaddlePaddle/Paddle/issues/1319). + +### Event Handler + +`paddle.train` and `paddle.trainer.XXX.train` take an optional +parameter `event_handler`, which should be either `None` or a function +that handle some events: + +1. BeginTraining +1. EndTraining +1. BeginIteration +1. EndIteration +1. BeginPass +1. EndPass + +where EndPass is sent if and only if the reader yields +`end_pass=True`. + +An example as follows: + +```python +def event_handler(event): + if ininstance(event, paddle.event.EndIteration): + print paddle.test(...) + +paddle.train(topology, parameters, reader, event_handler) +``` + +If we are writing a PaddlePaddle program in and for iPython/Jypyter, +we can use metaplotlib in the event handler to plot a curve of +cost/error versus iterations, as shown +[here](https://blog.dominodatalab.com/interactive-dashboards-in-jupyter/). + +### Distributed Training + +If users want to do distributed training on a cluster, s/he should +call `paddle.dist_train` and provides access tokens to the cluster as +a parameter. 
+ +For example, if the user has a TLS certificate that allows him to +access a Kubernetes cluster, s/he should be able to call + +```python +paddle.dist_train(model, + trainer=paddle.trainer.SGD(..., + paddle.updater.Adam(...)), + reader=read, + k8s_user="yi", + k8s_token="kube_cluster_tls.pem", + k8s_job="hello", + num_parameter_servers=15) +``` + +The pseudo code of `paddle.dist_train` is as follows: + +```python +def dist_train(topology, parameters, trainer, reader, ...): + if os.getenv("KUBERNETES_SERVICE_HOST") == None: + image_name = k8s_user + '/' + k8s_job + docker_build(image_name) + docker_push() + kube_ctrl_start_job(image_name, k8s_user, k8s_token) + else: + rank = kube_list_containers_in_job_and_return_current_containers_rank() + if rank == 0: + master() + elif rank < 15: + parameter_server() + else: + trainer.train(model, reader=read) +``` + +Please be aware that if a process is running on the Kubernetes +cluster, it will have some environment variables pre-defined. + +If `dist_train` doesn't see these environment variables, it knows +that it's running on users' personal computer, and it should work as a +*launcher*. Otherwise, it knows that it's running on the cluster and +need to figure out its role as either the master, or a trainer, or a +parameter server. diff --git a/doc/fluid/design/motivation/fluid-compiler.graffle b/doc/fluid/design/motivation/fluid-compiler.graffle new file mode 100644 index 0000000000000000000000000000000000000000..c933df2cb855462c52b2d25f7f9a99b95652961d Binary files /dev/null and b/doc/fluid/design/motivation/fluid-compiler.graffle differ diff --git a/doc/fluid/design/motivation/fluid-compiler.png b/doc/fluid/design/motivation/fluid-compiler.png new file mode 100644 index 0000000000000000000000000000000000000000..1b0ffed2039c91a3a00bbb719da08c91c3acf7bb Binary files /dev/null and b/doc/fluid/design/motivation/fluid-compiler.png differ diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md new file mode 100644 index 0000000000000000000000000000000000000000..4b7696cc1bbf57ace72c4d31ffc2bfe6c1071939 --- /dev/null +++ b/doc/fluid/design/motivation/fluid.md @@ -0,0 +1,140 @@ +# Design Doc: PaddlePaddle Fluid + +## Why Fluid + +When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe. However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework? + +Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution, which describes the "process" of training or inference using the concept of a model. In fact in PyTorch, TensorFlow Eager Execution and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid is currently more extreme in the above mentioned idea than PyTorch and Eager Execution, and we are trying to push Fluid towards the directions of a compiler and a new programming language for deep learning. + +## The Evolution of Deep Learning Systems + +Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Existed since | model as sequence of layers | model as graph of operators | No model |
|---------------|-----------------------------|-----------------------------|----------|
| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+ + +From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these. + +## Deep Learning Programming Paradigms + +With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following: + +```python +x = layer.data("image") +l = layer.data("label") +f = layer.fc(x, W) +s = layer.softmax(f) +c = layer.mse(l, s) + +for i in xrange(1000): # train for 1000 iterations + m = read_minibatch() + forward({input=x, data=m}, minimize=c) + backward(...) + +print W # print the trained model parameters. +``` + +The above program includes two parts: + +1. The first part describes the model, and +2. The second part describes the training process (or inference process) for the model. + +This paradigm has a well-known problem that limits the productivity of programmers. If the programmer made a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is located blocks away from the actual error prompt. + +This problem of being hard to debug and re-iterate fast on a program is the primary reason that programmers, in general, prefer PyTorch over the older systems. Using PyTorch, we would write the above program as following: + +```python +W = tensor(...) + +for i in xrange(1000): # train for 1000 iterations + m = read_minibatch() + x = m["image"] + l = m["label"] + f = layer.fc(x, W) + s = layer.softmax(f) + c = layer.mse(l, s) + backward() + +print W # print the trained model parameters. +``` + +We can see that the main difference is the moving the model configuration part (the first step) into the training loop. This change would allow the mistakes in model configuration to be reported where they actually appear in the programming block. This change also represents the model better, or its forward pass, by keeping the configuration process in the training loop. + +## Describe Arbitrary Models for the Future + +Describing the process instead of the model also brings Fluid, the flexibility to define different non-standard models that haven't been invented yet. + +As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator. A PyTorch example would look like the following: + +```python +for i in xrange(1000): + m = read_minibatch() + x = m["sentence"] + for t in xrange x.len(): + h[t] = the_step(x[t]) +``` + +With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following: + +```python +train_loop = layers.While(cond) +with train_loop.block(): + m = read_minibatch() + x = m["sentence"] + rnn = layers.While(...) + with rnn.block(): + h[t] = the_step(input[t]) +``` + +An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/bde090a97564b9c61a6aaa38b72ccc4889d102d9/python/paddle/fluid/tests/unittests/test_while_op.py#L50-L58). 
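For readers who do not follow the link, a rough sketch of such a while-loop program is given below. It assumes the `fluid.layers` API of that time and is illustrative only; details differ from the linked test.

```python
import paddle.fluid as fluid

i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
limit = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
cond = fluid.layers.less_than(x=i, y=limit)

while_op = fluid.layers.While(cond=cond)
with while_op.block():
    # ... one step of the computation goes here ...
    i = fluid.layers.increment(x=i, value=1, in_place=True)
    fluid.layers.less_than(x=i, y=limit, cond=cond)  # refresh the loop condition
```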
+ +From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop. + +We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid. + +## Turing Completeness + +In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine. For a programming language, if it provides if-then-else and loop, it is Turing complete. From the above examples, Fluid seems to be Turing complete; however, it is noteworthy to notice that there is a slight difference between the `if-then-else` of Fluid and that of a programming language. The difference being that the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. This hasn't been researched in depth if this is equivalent to the `if-then-else` in programming languages that makes them Turing-complete. Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case but this needs to be looked into in-depth. + +## The Execution of a Fluid Program + +There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree). + +There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program. + +Fluid is moving towards the direction of a compiler, which is explain in [fluid_compiler.md](fluid_compiler.md). + +## Backward Compatibility of Fluid + +Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference. For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph). Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators. The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators. + +For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format. diff --git a/doc/fluid/design/motivation/fluid_compiler.md b/doc/fluid/design/motivation/fluid_compiler.md new file mode 100644 index 0000000000000000000000000000000000000000..6dd3840a0734e8593890dcf8044746197350c6f5 --- /dev/null +++ b/doc/fluid/design/motivation/fluid_compiler.md @@ -0,0 +1,110 @@ +# PaddlePaddle Fluid: Towards a Compiled Programming Language + +As described in [fluid.md](fluid.md), when a Fluid application program +runs, it generates a `ProgramDesc` protobuf message as an intermediate +representation of itself. 
The C++ class `Executor` can run this +protobuf message as an interpreter. This article describes the Fluid +compiler. + +![](fluid-compiler.png) + +## ProgramDesc + +Before we go deeper into the idea of compiled language, let us take a +look at a simple example Fluid application. + +```python +import "fluid" + +func paddlepaddle() { + X = fluid.read(...) + W = fluid.Tensor(...) + Y = fluid.mult(X, W) +} +``` + +This program consists of a [block](../concepts/block.md) of three operators -- +`read`, `assign`, and `mult`. Its `ProgramDesc` message looks like +the following + +```protobuf +message ProgramDesc { + block[0] = Block { + vars = [X, W, Y], + ops = [ + read(output = X) + assign(input = ..., output = W) + mult(input = {X, W}, output = Y) + ], + } +} +``` + +## Transpilers + +We can write a transpiler program that takes a `ProgramDesc`, e.g., +the above one, and outputs another `ProgramDesc`. Let us take some +examples: + +1. *Memory optimization transpiler*: We can write a transpiler that + inserts some `FreeMemoryOp`s in the above example `ProgramDesc` so + to free memory early, before the end of an iteration, so to keep a + small memory footprint. + +1. *Distributed training transpiler*: We can write a transpiler that + converts a`ProgramDesc` into its distributed version of two + `ProgramDesc`s -- one for running by the trainer processes and the + other for the parameter server. + +In the rest of this article, we talk about a special kind of +transpiler, *Native code generator*, which takes a `ProgramDesc` and +generates a `.cu` (or `.cc`) file, which could be built by C++ +compilers (gcc, nvcc, icc) into binaries. + +## Native Code Generator + +For the above example, the native code generator transpiler, say, the +CUDA code generator, should generate a `main` function: + +```c++ +void main() { + auto X = fluid_cuda_read(...); + auto W = fluid_cuda_create_tensor(...); + auto Y = fluid_cuda_mult(X, W); +} +``` + +and the definitions of functions `fluid_cuda_read`, +`fluid_cuda_create_tensor`, and `fluid_cuda_mult`. Please be aware +that each function could just define a C++ instance of an operator and +run it. For example + +```c++ +paddle::Tensor fluid_cuda_read(...) { + paddle::Tensor t; + paddle::operator::Read r(&t, ...); + r.Run(); + return t; +} +``` + +For computational operators that have multiple *kernels*, each for a +specific hardware platform, for example, the `mult` operator, the +generated code should call its CUDA kernel: + +```c++ +paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a, + const paddle::Tensor& b) { + paddle::Tensor t; + paddle::operator::Mult m(a, b, ...); + Mult.Run(cuda_context); +} +``` + +where `cuda_context` could be a global variable of type +`paddle::CUDADeviceContext`. + +## Multi-Block Code Generation + +Most Fluid application programs may have more than one blocks. To +execute them, we need to trace [scopes](../concepts/scope.md). diff --git a/doc/fluid/design/motivation/index_cn.rst b/doc/fluid/design/motivation/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..7706e73eca644ed6db772fd77da947395313237f --- /dev/null +++ b/doc/fluid/design/motivation/index_cn.rst @@ -0,0 +1,10 @@ +设计动机和目标 +------------- + +.. 
toctree:: + :maxdepth: 1 + + api.md + refactorization.md + fluid.md + fluid_compiler.md diff --git a/doc/fluid/design/motivation/index_en.rst b/doc/fluid/design/motivation/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..10b64b257c604ced6b957d6d6018e8a363f00fac --- /dev/null +++ b/doc/fluid/design/motivation/index_en.rst @@ -0,0 +1,10 @@ +Design Motivations and Goals +-------------------------------------- + +.. toctree:: + :maxdepth: 1 + + api.md + refactorization.md + fluid.md + fluid_compiler.md diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md new file mode 100644 index 0000000000000000000000000000000000000000..ad9d0f6d3f3ad9884f108826e8410871fffd51bf --- /dev/null +++ b/doc/fluid/design/motivation/refactorization.md @@ -0,0 +1,275 @@ +# Design Doc: Refactorization Overview + +The goals of refactoring include: + +1. Making it easy for external contributors to write new elementary computation operations. +1. Making the codebase clean and readable. +1. Designing a new computation representation -- a computation graph of operators and variables. +1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs. + +## Computation Graphs + +1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs. + + 1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/others/graph.md) for a concrete example. + +1. Users write Python programs to describe the graphs and run them (locally or remotely). + +1. A graph is composed of *variables* and *operators*. + +1. The description of graphs must be serializable/deserializable, so that: + + 1. It can be sent to the cloud for distributed execution, and + 1. It can be sent to clients for mobile or enterprise deployment. + +1. The Python program does two things + + 1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to + 1. the C++ library `libpaddle.so` for local execution, + 1. the master process of a distributed training job for training, or + 1. the server process of a Kubernetes serving job for distributed serving. + 1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L70), according to the protobuf message. + +## Description and Realization of Computation Graph + +At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph. + +At runtime, the C++ program realizes the graph and runs it. + + + + + + + + + + + + + + + + + + + + + + + + + + +
|           | Representation (protobuf messages) | Realization (C++ class objects) |
|-----------|------------------------------------|---------------------------------|
| Data      | VarDesc                            | Variable                        |
| Operation | OpDesc                             | Operator                        |
| Block     | BlockDesc                          | Block                           |
+ + +The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`). + +## Compilation and Execution + +1. Run a Python program to describe the graph. In particular, the Python application program does the following: + + 1. Create `VarDesc` to represent local/intermediate variables, + 1. Create operators and set attributes, + 1. Validate attribute values, + 1. Infer the type and the shape of variables, + 1. Plan memory-reuse for variables, + 1. Generate the backward graph + 1. Add optimization operators to the computation graph. + 1. Optionally, split the graph for distributed training. + +1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following: + + 1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md) for each run of a block, + 1. realize local variables defined in the BlockDesc message in the new scope, + 1. a scope is similar to the stack frame in programming languages, + + 1. Create an instance of class `Block`, in which, + 1. realize operators in the BlockDesc message, + + 1. Run the Block by calling + 1. `Block::Eval(vector* targets)` for forward and backward computations, or + 1. `Block::Eval(vector* targets)` for optimization. + + +## Intermediate Representation (IR) + +```text +Compile Time -> IR -> Runtime +``` + +### Benefits of IR + +- Optimization + ```text + Compile Time -> IR -> Optimized IR -> Runtime + ``` +- Automatically send partitioned IR to different nodes. + - Automatic Data Parallelism + ```text + Compile Time + |-> Single GPU IR + |-> [trainer-IR-0, trainer-IR-1, pserver-IR] + |-> Node-0 (runs trainer-IR-0) + |-> Node-1 (runs trainer-IR-1) + |-> Node-2 (runs pserver-IR) + ``` + - Automatic Model Parallelism (planned for future) + +--- + +## Operator/OpWithKernel/OpKernel + +![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_op_with_kern_class_diagram.dot) + +--- + +## Operator +![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op.dot) + +* `Operator` is the fundamental building block of the user interface. + * Operator stores input/output variable names and attributes. + * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables. + * Use `Run` to compute the `output` variables from the `input` variables. + +--- + +## OpWithKernel/Kernel + +![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_with_kernel.dot) + +* `OpWithKernel` inherits `Operator`. +* `OpWithKernel` contains a Kernel map. + * `OpWithKernel::Run` get device's kernel, and invoke `OpKernel::Compute`. + * `OpKernelKey` is the map key. Only device place now, but may be data type later. + +--- + +## Why separate Kernel and Operator + +* Separate GPU and CPU code. + * Make Paddle capable of running without GPU. +* Make one operator (which is a user interface) and create many implementations. + * For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel. +--- + +## Libraries for Kernel development + +* `Eigen::Tensor` contains basic math and element-wise functions. 
+ * Note that `Eigen::Tensor` has broadcast implementation. + * Limit the number of `tensor.device(dev) = ` in your code. +* `thrust::transform` and `std::transform`. + * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels. + * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`. +* Hand-writing `GPUKernel` and `CPU` code + * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.) +--- +## Operator Registration + +### Why is registration necessary? +We need a method to build mappings between Op type names and Op classes. + +### How is registration implemented? +Maintaining a map, whose key is the type name and the value is the corresponding Op constructor. + +--- +## The Registry Map + +### `OpInfoMap` + +`op_type(string)` -> `OpInfo` + +`OpInfo`: + +- **`creator`**: The Op constructor. +- **`grad_op_type`**: The type of the gradient Op. +- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes. +- **`checker`**: Used to check attributes. + +--- +## Related Concepts + +### Op_Maker +It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L37)) + +### Register Macros +```cpp +REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class) +REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) +``` + +--- +## Registration Process +1. Write an Op class and its gradient Op class, if required. +2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator. +3. Invoke the macro `REGISTER_OP`. This macro will + 1. Call maker class to complete `proto` and `checker` + 2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap` + +--- +## Backward Module (1/2) +### Create Backward Operator +- Mapping from forward Op to backward Op +![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png) + +--- +## Backward Module (2/2) +### Build Backward Network +- **Input**: a graph of forward operators +- **Output**: a graph of backward operators +- **Corner cases in construction** + - Shared Variables => insert an `Add` operator to combine gradients + - No Gradient => insert a `fill_zero_grad` operator + - Recursive NetOp => call `Backward` recursively + - RNN Op => recursively call `Backward` on stepnet + - RNN Op => recursively call `Backward` on stepnet + + +--- +## Scope, Variable, Tensor + +* `Tensor` is an n-dimension array with type. + * Only dims and data pointers are stored in `Tensor`. + * All operations on `Tensor` are written in `Operator` or global functions. + * Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md) +* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`. + * `step_scopes` in RNN is a variable and not a tensor. +* `Scope` is where variables are stored. + * map + * `Scope` has a hierarchical structure. The local scope can get variables from its parent scope. 
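As a simple illustration of the lookup rule above, here is a conceptual Python sketch of a hierarchical scope; the real `Scope` is a C++ class, so names and details here are illustrative only.

```python
class Scope(object):
    """A scope owns a name->variable map and may have a parent scope."""

    def __init__(self, parent=None):
        self.vars = {}        # name -> variable
        self.parent = parent  # enclosing scope, or None for the root scope

    def var(self, name):
        """Create (or reuse) a variable in the local scope."""
        return self.vars.setdefault(name, object())

    def find_var(self, name):
        """Look up locally first, then fall back to the parent scope."""
        if name in self.vars:
            return self.vars[name]
        return self.parent.find_var(name) if self.parent else None

    def new_scope(self):
        """Create a child scope, e.g. for one run of a block."""
        return Scope(parent=self)
```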
+ +--- +## Block (in design) +### the difference between original RNNOp and Block +- As an operator is more intuitive than `RNNOp`, +- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`, +- Fits the compile-time/ runtime separation design paradigm. + - During the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc` + - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`. + +--- +## Milestone +- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring, +- Model migration + - Framework development gives **priority support** to model migration, for example, + - the MNIST demo needs a Python interface, + - the RNN models require the framework to support `LoDTensor`. + - Determine some timelines, + - Frequently used Ops need to be migrated first, + - Different models can be migrated in parallel. +- Improve the framework at the same time +- Accept imperfection, concentrate on solving the specific problem at the right price. + +--- +## Control the migration quality +- Compare the performance of migrated models with old ones. +- Follow the google C++ style guide. +- Build the automatic workflow of generating Python/C++ documentations. + - The documentation of layers and ops should be written inside the code. + - Take the documentation quality into account when submitting pull requests. + - Preview the documentations, read and improve them from a user's perspective. diff --git a/doc/fluid/design/multi_devices/index_cn.rst b/doc/fluid/design/multi_devices/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..1f8439e8623e1c1ae9a12c24d08079f0ec3d761f --- /dev/null +++ b/doc/fluid/design/multi_devices/index_cn.rst @@ -0,0 +1,9 @@ +多设备支持 +------------ + +.. toctree:: + :maxdepth: 1 + + operator_kernel_type.md + kernel_selection.md + kernel_hint_design.md diff --git a/doc/fluid/design/multi_devices/index_en.rst b/doc/fluid/design/multi_devices/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..819e9c5d77b2abf8da0e2ce6f494ea5174c1d0a2 --- /dev/null +++ b/doc/fluid/design/multi_devices/index_en.rst @@ -0,0 +1,9 @@ +Multi-Device Support +---------------------- + +.. toctree:: + :maxdepth: 1 + + operator_kernel_type.md + kernel_selection.md + kernel_hint_design.md diff --git a/doc/fluid/design/multi_devices/kernel_hint_design.md b/doc/fluid/design/multi_devices/kernel_hint_design.md new file mode 100644 index 0000000000000000000000000000000000000000..6edc14ca73b1abf824981b59511a9aca4e0f3b47 --- /dev/null +++ b/doc/fluid/design/multi_devices/kernel_hint_design.md @@ -0,0 +1,59 @@ +# Kernel Hint Design + +## Problem +In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this. + +In the current design, we use KernelType to describe one kernel. + +```cpp +struct KernelType { + Place place_; + DataType data_type_; + LayoutType layout_; +}; +``` + `place_` `data_type_` and `layout_` can be got from the input tensors of the operator, `GetActualKernelType(inputs)` use inputs to infer the proper kernel key that fit the incoming data, but users can not directly configure it. 
+
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md) also provides a virtual method `GetExpectedKernelType` that users can overload to choose the `KernelType` they want to use.
+
+So we should send the user-defined information in the proto to `GetExpectedKernelType` so it can choose a kernel.
+
+The problem is: how should we define and send the information for `GetExpectedKernelType` to use?
+
+## Solution
+
+### Potential choices
+1. Do nothing; let the user add the information they want to the operator's attributes and read it inside `GetExpectedKernelType`. This can work, but users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose a CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose a CUDNN kernel.
+
+2. Pre-define all the needed options and use a single attribute key such as `kernel_hint` for the user. This is not so flexible if the user wants to define more kinds of hints.
+
+### Final choice
+To provide enough flexibility while avoiding confusing definitions, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn` and `use_mkldnn`, for the user to choose from.
+
+In C++:
+
+```cpp
+const std::string kForceCPU = "force_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
+KernelType GetExpectedKernelType() {
+  if (Attr(kForceCPU)) {
+    return KernelType(CPUPlace, ...)
+  } else {
+    ...
+  }
+}
+```
+
+In Python code:
+
+```python
+FORCE_CPU = core.kForceCPU()
+
+def xx_layer(..., force_cpu=False):
+  layer_helper = LayerHelper(...)
+  layer_helper.append_op(
+    type="xx",
+    attr={FORCE_CPU: force_cpu})
+```
diff --git a/doc/fluid/design/multi_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md
new file mode 100644
index 0000000000000000000000000000000000000000..4d2aab87b8cf30d03075e96cc4c67070efaf963a
--- /dev/null
+++ b/doc/fluid/design/multi_devices/kernel_selection.md
@@ -0,0 +1,101 @@
+# Kernel Selection
+
+## Background
+Every operator has many kernels because Fluid supports multiple data types, places, data layouts and library types. We use `OpKernelType` to describe the kernel types that operators can hold.
+
+The `OpKernelType` is as follows:
+
+```cpp
+struct OpKernelType {
+  Place place_;
+  DataType data_type_;
+  DataLayout data_layout_;
+  LibraryType library_type_;
+};
+```
+
+- The `place_` is a descriptor of the device, e.g., CPUPlace, CUDAPlace.
+
+- The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, it will have a major `data_type`. For example, `cross_entropy` takes `int64` as its label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float` or `double`.
+
+- The `data_layout_` is useful for some computational libraries. One example is that MKLDNN uses many kinds of layouts, such as `nChw8c`. Each kind of layout will invoke a different kernel.
+
+- The `library_type_` describes the computational library, e.g., `MKLDNN`, `CUDNN`.
+
+## Problem
+
+Ideally, we would register a kernel for every operator and every kernel type. However, this is impracticable in the following situations.
+
+1. Some operators, like CRF, are complicated and inefficient to implement on GPU. The CRF operator will only have a CPU kernel.
+2. Some operators consume too much memory.
It is better to force them onto the CPU. However, the rest of the operators in the network should still run on the GPU, i.e., a model-parallel problem.
+3. Some layouts and places are specific to one library. For example, MKLDNN uses the `nChw8c` layout, and no other library uses it.
+
+To explain one such situation in detail, suppose we have two operators, OP1 and OP2. OP1 has one output, `op1_to_op2`, and `op1_to_op2` is the input of OP2.
+
+If OP1 and OP2 run on the same place (for example, CPUPlace), then `op1_to_op2` can be used directly by OP2.
+
+```
+OP1(CPUPlace)
+     |
+ op1_to_op2
+     |
+OP2(CPUPlace)
+```
+
+If OP1 and OP2 run on different places, then OP2 cannot use `op1_to_op2` directly.
+
+Problems in these situations are similar. We can formalize the problem as follows.
+
+We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator arrive on kernel type $kt_{?}$, where $kt_{?} \notin KT$. The question is how to cast the inputs of this operator from $kt_{?}$ to some kernel type in $KT$.
+
+## Solution: data transform
+
+It is clear that transforming the inputs of an operator to fit another kernel type does not depend on the particular operator, so we should register these transformation methods as global methods.
+
+We can infer a kernel type for each input of an operator. We call this kernel type the `actual kernel type for var`, which means it is the kernel type that can process this input variable.
+
+We can also get a kernel type from 1) the configuration in the operator description (users may want to force `MKL` for the `conv` operator), and 2) the place of the current executor (the executor may be running on a GPU). This kernel type is the one we expect the operator to run with; we call it the `expected kernel type`.
+
+We transform the input data from `actual` to `expected` if the actual kernel type is not the same as the expected kernel type.
+
+The algorithm is described as follows:
+
+```cpp
+void OperatorWithKernel::Run(
+        const Scope& scope,
+        const platform::Place& place) const {
+  ExecutionContext ctx(...);
+  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
+
+  Scope& new_scope = scope.NewScope();
+
+  for (auto& var_name : this->Inputs()) {
+    auto* tensor_in = GetTensor(var_name);
+    auto kernel_type_for_var = this->GetKernelTypeForVar(...);
+    if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
+      auto* trans_var = new_scope.Var(var_name);
+      auto* out = TransformData(expected_kernel_key,
+                                kernel_type_for_var,
+                                *tensor_in);
+      SetTensorToVariable(...);
+    }
+  }
+
+  auto kernel = kernels.find(expected_kernel_key);
+  kernel->Compute(ExecutionContext(...));
+}
+```
+
+Then the actual process for the multi-device case above will be:
+
+```
+OP1(CPUPlace)
+     |
+op1_to_op2(on CPU)
+     |
+[transform](from CPU to GPU)
+     |
+op1_to_op2(on GPU)
+     |
+OP2(CUDAPlace)
+```
diff --git a/doc/fluid/design/multi_devices/operator_kernel_type.md b/doc/fluid/design/multi_devices/operator_kernel_type.md
new file mode 100644
index 0000000000000000000000000000000000000000..5e391bd62b4f4e123a9a6f35b7adf5726f205635
--- /dev/null
+++ b/doc/fluid/design/multi_devices/operator_kernel_type.md
@@ -0,0 +1,91 @@
+# Design Doc: The Keys of Operator Kernel Type
+## Problem
+An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`.
Currently, `OpKernelType` is defined as follows:
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  proto::DataType data_type_;
+};
+```
+For more details, please refer to the [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L348-L374) on GitHub.
+
+It contains two keys, `Place` and `DataType`. These two keys are hashed into a unique key that represents a certain type of kernel. However, these two keys do not provide enough information, so we need a more complete representation of `OpKernelType`.
+
+We often implement a kernel of an operator with some computing library on a certain device (place). Please note that computing libraries and devices do not have a one-to-one correspondence: a device can have many computing libraries, and a computing library can also support different devices.
+
+For example, the Eigen library supports Nvidia GPUs, AMD GPUs and CPUs, while the MKLDNN library supports Intel CPUs and Intel FPGAs. Both `Place` and `Library` should therefore be keys of `OpKernelType`.
+
+Different DataTypes, such as fp64/fp32/int8, will obviously have different kernels. But different data layouts of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration.
+
+## Solution
+
+There are four keys that determine the kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  platform::Library library_;
+  proto::DataType data_type_;
+  framework::Layout layout_;
+};
+```
+
+The details are as follows:
+
+### Place
+
+`Place` is defined as:
+
+```cpp
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+```
+
+`Place` represents the device memory where data is located.
+
+
+### Library
+
+One operator kernel is usually implemented based on one library. `Library` is defined as an enum variable:
+
+```cpp
+enum Library { Plain, MKLDNN, CUDNN };
+```
+
+We use the `Plain` enumerator to represent the default library. Since most operators in Fluid are implemented based on the `Eigen` library, we treat the `Eigen` library as the `Plain` enumerator.
+A library usually has a corresponding `DeviceContext` which contains the handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle, and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle.
+
+If we want to support a new library, a new enumerator needs to be added to `Library` and a corresponding new `LibraryDeviceContext` needs to be created.
+
+
+### DataType
+
+
+`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
+
+### Layout
+
+Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also need some other descriptions of this block of memory, such as shape (ddim), stride, and layout.
+
+Different layouts lead to different implementations of the operator kernel. There are mainly four principles we have to follow to support layout in the Fluid framework.
+
+- We take layout as a data member of Tensor. Layout is actually an enum variable. If Fluid is built with MKLDNN, then the memory formats in MKLDNN will also be added into this enum variable.
+ +- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout for generating data. Of course, we can have some default layout, like NCHW. + +- The inference of Layout is at run-time, not at compile-time. + +- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://intel.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators. + +`Layout` is also defined as a enum variable: + +```cpp +enum Layout { + kNCHW, + kNHWC, +#ifdef PADDLE_WITH_MKLDNN + knChw8c + ... +#endif +}; +``` diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md new file mode 100644 index 0000000000000000000000000000000000000000..f32a5b7e8a4d820319a666dab4c3129360e2c924 --- /dev/null +++ b/doc/fluid/design/network/deep_speech_2.md @@ -0,0 +1,235 @@ +# DeepSpeech2 on PaddlePaddle: Design Doc + +We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine, on PaddlePaddle. For the first-stage plan, we have the following short-term goals: + +- Release a basic distributed implementation of DS2 on PaddlePaddle. +- Contribute a chapter of Deep Speech to PaddlePaddle Book. + +Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan. + +## Table of Contents + +- [Tasks](#tasks) +- [Task Dependency](#task-dependency) +- [Design Details](#design-details) + - [Overview](#overview) + - [Row Convolution](#row-convolution) + - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm) +- [Future Work](#future-work) +- [References](#references) + +## Tasks + +We roughly break down the project into 14 tasks: + +1. Develop an **audio data provider**: + - Json filelist generator. + - Audio file format transformer. + - Spectrogram feature extraction, power normalization etc. + - Batch data reader with SortaGrad. + - Data augmentation (optional). + - Prepare (one or more) public English data sets & baseline. +2. Create a **simplified DS2 model configuration**: + - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*). + - With only bidirectional-GRU (otherwise need *Task 4*). + - With only greedy decoder (otherwise need *Task 5, 6*). +3. Develop to support **variable-shaped** dense-vector (image) batches of input data. + - Update `DenseScanner` in `dataprovider_converter.py`, etc. +4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details): + - Lookahead convolution windows. + - Within-row convolution, without kernels shared across rows. +5. Build KenLM **language model** (5-gram) for beam search decoder: + - Use KenLM toolkit. + - Prepare the corpus & train the model. + - Create infererence interfaces (for Task 6). +6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT: + - Beam search with CTC. + - Beam search with external custom scorer (e.g. LM). + - Try to design a more general beam search interface. +7. Develop a **Word Error Rate evaluator**: + - update `ctc_error_evaluator`(CER) to support WER. +8. Prepare internal dataset for Mandarin (optional): + - Dataset, baseline, evaluation details. + - Particular data preprocessing for Mandarin. 
+ - Might need cooperating with the Speech Department. +9. Create **standard DS2 model configuration**: + - With variable-length audio sequences (need *Task 3*). + - With unidirectional-GRU + row-convolution (need *Task 4*). + - With CTC-LM beam search decoder (need *Task 5, 6*). +10. Make it run perfectly on **clusters**. +11. Experiments and **benchmarking** (for accuracy, not efficiency): + - With public English dataset. + - With internal (Baidu) Mandarin dataset (optional). +12. Time **profiling** and optimization. +13. Prepare **docs**. +14. Prepare PaddlePaddle **Book** chapter with a simplified version. + +## Task Dependency + +Tasks parallelizable within phases: + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Roadmap | Description | Parallelizable Tasks |
+| --- | --- | --- |
+| Phase I | Simplified model & components | Task 1 ~ Task 8 |
+| Phase II | Standard model & benchmarking & profiling | Task 9 ~ Task 12 |
+| Phase III | Documentations | Task 13 ~ Task 14 |
+ + +Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed! + +## Design Details + +### Overview + +Traditional **ASR** (Automatic Speech Recognition) pipelines require great human efforts devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, accoustic model, pronuncation model and language model etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with only a single deep network architecture. With scaling up both the data and model sizes, DS2 achieves a very significant performance boost. + +Please read Deep Speech 2 \[[1](#references),[2](#references)\] paper for more background knowledge. + +The classical DS2 network contains 15 layers (from bottom to top): + +- **Two** data layers (audio spectrogram, transcription text) +- **Three** 2D convolution layers +- **Seven** uni-directional simple-RNN layers +- **One** lookahead row convolution layers +- **One** fully-connected layers +- **One** CTC-loss layer + +
+
+Figure 1. Architecture of Deep Speech 2 Network.
+
+ +We don't have to persist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar networks with different depths might also work well. As in \[[1](#references)\], authors use a different depth (e.g. 2-2-3-1-1-1) for final experiments. + +Key ingredients about the layers: + +- **Data Layers**: + - Frame sequences data of audio **spectrogram** (with FFT). + - Token sequences data of **transcription** text (labels). + - These two type of sequences do not have the same lengthes, thus a CTC-loss layer is required. +- **2D Convolution Layers**: + - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension). + - With striding for only the first convlution layer. + - No pooling for all convolution layers. +- **Uni-directional RNNs** + - Uni-directional + row convolution: for low-latency inference. + - Bi-direcitional + without row convolution: if we don't care about the inference latency. +- **Row convolution**: + - For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs. + - Not nessesary if with bi-direcitional RNNs. + - "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across. +- **Batch Normalization Layers**: + - Added to all above layers (except for data and loss layer). + - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Required Components | PaddlePaddle Support | Need to Develop |
+| --- | --- | --- |
+| Data Layer I (Spectrogram) | Not supported yet. | TBD (Task 3) |
+| Data Layer II (Transcription) | paddle.data_type.integer_value_sequence | - |
+| 2D Convolution Layer | paddle.layer.image_conv_layer | - |
+| DataType Converter (vec2seq) | paddle.layer.block_expand | - |
+| Bi-/Uni-directional RNNs | paddle.layer.recurrent_group | - |
+| Row Convolution Layer | Not supported yet. | TBD (Task 4) |
+| CTC-loss Layer | paddle.layer.warp_ctc | - |
+| Batch Normalization Layer | paddle.layer.batch_norm | - |
+| CTC-Beam search | Not supported yet. | TBD (Task 6) |
+ + +### Row Convolution + +TODO by Assignees + +### Beam Search with CTC and LM + +
+
+Figure 2. Algorithm for CTC Beam Search Decoder. +
+ +- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts: + - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths; + - 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary. +- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding. +- Such external scorer consists of language model, word count or any other custom scorers. +- The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7) +- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality. + + +## Future Work + +- Efficiency Improvement +- Accuracy Improvement +- Low-latency Inference Library +- Large-scale benchmarking + +## References + +1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016. +2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595. +3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873 diff --git a/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455 Binary files /dev/null and b/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg differ diff --git a/doc/fluid/design/network/images/beam_search.png b/doc/fluid/design/network/images/beam_search.png new file mode 100644 index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae Binary files /dev/null and b/doc/fluid/design/network/images/beam_search.png differ diff --git a/doc/fluid/design/network/images/ds2_network.png b/doc/fluid/design/network/images/ds2_network.png new file mode 100644 index 0000000000000000000000000000000000000000..1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11 Binary files /dev/null and b/doc/fluid/design/network/images/ds2_network.png differ diff --git a/doc/fluid/design/network/index_cn.rst b/doc/fluid/design/network/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..3557d55fe4dbae1f712e0760ca15111ec6f6792d --- /dev/null +++ b/doc/fluid/design/network/index_cn.rst @@ -0,0 +1,7 @@ +复杂网络设计 +------------ + +.. toctree:: + :maxdepth: 1 + + sequence_decoder.md diff --git a/doc/fluid/design/network/index_en.rst b/doc/fluid/design/network/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..73a7137236bdf0548d35721609351d6deca3013b --- /dev/null +++ b/doc/fluid/design/network/index_en.rst @@ -0,0 +1,7 @@ +Complex Network Design +------------------------ + +.. 
toctree:: + :maxdepth: 1 + + sequence_decoder.md diff --git a/doc/fluid/design/network/sequence_decoder.md b/doc/fluid/design/network/sequence_decoder.md new file mode 100644 index 0000000000000000000000000000000000000000..b95773c50ca0dcbd1b93529332e035d4de90faa8 --- /dev/null +++ b/doc/fluid/design/network/sequence_decoder.md @@ -0,0 +1,229 @@ +# Design: Sequence Decoder Generating LoDTensors +In tasks such as machine translation and visual captioning, +a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time. + +This documentation describes how to implement the sequence decoder as an operator. + +## Beam Search based Decoder +The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set. + +In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard to be customized by users. + +There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users. + +During the refactoring of PaddlePaddle, some new concepts are proposed such as: [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** . + +For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`; +the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated. + +## Changing LoD's absolute offset to relative offsets +The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level. + +The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, +let's call this format the **absolute-offset LoD** for clarity. + +The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows +```python +[[0, 3, 9] + [0, 2, 3, 3, 3, 9]] +``` +The first level tells that there are two sequences: +- the first's offset is `[0, 3)` +- the second's offset is `[3, 9)` + +while on the second level, there are several empty sequences that both begin and end at `3`. +It is impossible to tell how many empty second-level sequences exist in the first-level sequences. + +There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix. + +So let's introduce another format of LoD, +it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD. 
+ +For example, to represent the same sequences of the above data + +```python +[[0, 3, 6] + [0, 2, 3, 3, 3, 9]] +``` + +the first level represents that there are two sequences, +their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`. + +The second level is the same with the relative offset example because the lower level is a tensor. +It is easy to find out the second sequence in the first-level LoD has two empty sequences. + +The following examples are based on relative-offset LoD. + +## Usage in a simple machine translation model +Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it. + +The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences. + +**Encoder** +```python +import paddle as pd + +dict_size = 8000 +source_dict_size = dict_size +target_dict_size = dict_size +word_vector_dim = 128 +encoder_dim = 128 +decoder_dim = 128 +beam_size = 5 +max_length = 120 + +# encoder +src_word_id = pd.data( + name='source_language_word', + type=pd.data.integer_value_sequence(source_dict_dim)) +src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim) + +src_word_vec = pd.lookup(src_embedding, src_word_id) + +encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim) + +encoder_ctx = pd.last_seq(encoder_out_seq) +# encoder_ctx_proj is the learned semantic vector +encoder_ctx_proj = pd.fc( + encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None) +``` + +**Decoder** + +```python +def generate(): + decoder = pd.while_loop() + with decoder.step(): + decoder_mem = decoder.memory(init=encoder_ctx) # mark the memory + generated_ids = decoder.memory() # TODO init to batch_size s + generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s + + target_word = pd.lookup(trg_embedding, gendrated_ids) + # expand encoder_ctx's batch to fit target_word's lod + # for example + # decoder_mem.lod is + # [[0 1 3], + # [0 1 3 6]] + # its tensor content is [a1 a2 a3 a4 a5] + # which means there are 2 sentences to translate + # - the first sentence has 1 translation prefixes, the offsets are [0, 1) + # - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6) + # the target_word.lod is + # [[0, 1, 6] + # [0, 2, 4, 7, 9 12]] + # which means 2 sentences to translate, each has 1 and 5 prefixes + # the first prefix has 2 candidates + # the following has 2, 3, 2, 3 candidates + # the encoder_ctx_expanded's content will be + # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5] + encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word) + decoder_input = pd.fc( + act=pd.activation.Linear(), + input=[target_word, encoder_ctx_expanded], + size=3 * decoder_dim) + gru_out, cur_mem = pd.gru_step( + decoder_input, mem=decoder_mem, size=decoder_dim) + scores = pd.fc( + gru_out, + size=trg_dic_size, + bias=None, + act=pd.activation.Softmax()) + # K is an config + topk_scores, topk_ids = pd.top_k(scores, K) + topk_generated_scores = pd.add_scalar(topk_scores, generated_scores) + + selected_ids, selected_generation_scores = decoder.beam_search( + topk_ids, topk_generated_scores) + + # update the states + decoder_mem.update(cur_mem) # tells how to update state + generated_ids.update(selected_ids) + generated_scores.update(selected_generation_scores) + + decoder.output(selected_ids) 
+ decoder.output(selected_generation_scores) + +translation_ids, translation_scores = decoder() +``` +The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates, +returns the result of the beam search algorithm. + +In this way, users can customize anything on the input or output of beam search, for example: + +1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate. +2. Remove some specific candidate in `selected_ids`. +3. Get the final `translation_ids`, remove the translation sequence in it. + +The implementation of sequence decoder can reuse the C++ class: [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30), +so the python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop). + +Both of them are two-level `LoDTensors`: + +- The first level represents `batch_size` of (source) sentences. +- The second level represents the candidate ID sets for translation prefix. + +For example, 3 source sentences to translate, and has 2, 3, 1 candidates. + +Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state. + +For example, the previous state: + +* LoD is `[0, 1, 3][0, 2, 5, 6]` +* content of tensor is `a1 a2 b1 b2 b3 c1` + +the current state is stored in `encoder_ctx_expanded`: + +* LoD is `[0, 2, 7][0 3 5 8 9 11 11]` +* the content is + - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates) + - a2 a2 + - b1 b1 b1 + - b2 + - b3 b3 + - None (c1 has 0 candidates, so c1 is dropped) + +The benefit from the relative offset LoD is that the empty candidate set can be represented naturally. + +The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is: + +```python +decoder.output(selected_ids) +decoder.output(selected_generation_scores) +``` + +The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences. + +Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate. + +Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation. + +## LoD and shape changes during decoding +

+[Figure: LoD and shape changes during decoding]
+ +According to the image above, the only phase that changes the LoD is beam search. + +## Beam search design +The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs: + +1. `topk_ids`, the top K candidate ids for each prefix. +2. `topk_scores`, the corresponding scores for `topk_ids` +3. `generated_scores`, the score of the prefixes. + +All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix. + +It will return three variables: + +1. `selected_ids`, the final candidate beam search function selected for the next step. +2. `selected_scores`, the scores for the candidates. +3. `generated_scores`, the updated scores for each prefix (with the new candidates appended). + +## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray` +The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step, +so it is natural to store them in arrays. + +Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`. + +The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors. +It needs some extensions to support the packing or unpacking an array of `LoDTensors`. diff --git a/doc/fluid/design/onnx/images/project_structure.png b/doc/fluid/design/onnx/images/project_structure.png new file mode 100644 index 0000000000000000000000000000000000000000..ab1c2ff23cfff586516876684348bb15bd2084fc Binary files /dev/null and b/doc/fluid/design/onnx/images/project_structure.png differ diff --git a/doc/fluid/design/onnx/onnx_convertor.md b/doc/fluid/design/onnx/onnx_convertor.md new file mode 100644 index 0000000000000000000000000000000000000000..bc1665d7c33eb54cb63e5306a439c1ca67016d1e --- /dev/null +++ b/doc/fluid/design/onnx/onnx_convertor.md @@ -0,0 +1,131 @@ +# Background + +[ONNX (Open Neural Network Exchange)](https://github.com/onnx/onnx) bridges different deep learning frameworks by providing an open source graph format for models. The models trained in other frameworks can be converted into the ONNX format to execute inference by utilizing the built-in operators in ONNX - this is called a **frontend**. With the inverse conversion (called a **backend**), different frameworks can share any models supported by ONNX in principle. Now most mainstream frameworks have joined the ONNX community, e.g. Caffe2, PyTorch, and MXNet etc. And there is a momentum driving more and more vendors to begin supporting ONNX or even choose ONNX as the only machine learning runtime in their devices. + +Therefore, it is necessary to enable the conversion between PaddlePaddle and ONNX. This design doc is aimed at implementing a convertor, mainly for converting between **Fluid** models and ONNX (it is very likely that we may support older v2 models in the future). A complete convertor should be bidirectional - with a frontend AND a backend, but considering the importance, the we will start with the frontend i.e. Fluid models to ONNX models. + + +# How it works + +ONNX has a [working list of operators](https://github.com/onnx/onnx/blob/master/docs/Operators.md) which is versioned. 
+ +When prioritizing implementation of a frontend over a backend, choice of coverage of Fluid -> ONNX operators comes down to choices of models to be supported (see section `Supported models`). Eventually, this will allow us to reach a really-wide coverage of all operators. + +Here are a few major considerations when it comes to converting models: + +- **Op-level conversion**: How to map the inputs, attributes, and outputs of each Paddle operator to those of the ONNX operator. In several cases, these require transformations. For each direction (frontend vs. backend), a different conversion mapping is needed. +- **Parameters (weights) initialization**: Setting initial parameters on different nodes. +- **Tensor data type mapping** (Note: Some ONNX data types are not supported in Fluid) +- **Network representation adaption**: Fluid `ProgramDesc` include nested blocks. Since ONNX is free of nesting, the `ProgramDesc` ops need to be traversed to only include ops from the global scope in the root block. The variables used as inputs and outputs should also be in this scope. +- **Model validation**: There are two kinds of validations that are necessary: + 1. We need to ensure that the inference outputs of the ops in run inside a model are the same as those when running the ONNX converted ops through an alternative ONNX backend. + 2. Checking to see if the generated nodes on the graph are validated by the internal ONNX checkers. +- **Versioning**: ONNX versions its op listing over versions. In fact, it has versioning on 3 different levels: ops, graphs, and ONNX models. This requires that we are conscious about versioning the convertor and updating tests and op convertor logic for each release. It also implies that we release pre-trained ONNX models upon each version release. + +One thing that makes this conversion more feasible in Fluid's case is the use of a static IR - the `ProgramDesc` - as opposed to a dynamic graph, as created in the cases of frameworks like PyTorch. + + +# Project structure + +

+[Figure: project structure of the convertor]
+ +The project contains four important parts: + +* **fluid**: The directory that contains wrappers for fluid related APIs. Fluid has provided some low-level APIs to parse or generate the inference model. However, directly using these low-level APIs makes the code tediously long. This module wraps low-level APIs to provide simplified interfaces. + +* **onnx**: This is a Python package provided by ONNX containing helpers for creating nodes, graphs, and eventually binary protobuf models with initializer parameters. + +* **onnx_fluid**: Contains two-way mapping (Fluid -> ONNX ops and ONNX -> Fluid ops). Called from `convert.py`, the program uses this mapping along with modifier functions to construct ONNX nodes with the help of ONNX's `make_node` helper. It also contains mapping between datatypes and tensor deprecation / amplification logic. + +* **convert.py**: The interface exposed to users. This will traverse the global program blocks/variables and construct the write-able model. + + +# Usage +The converter should be designed to very easy-to-use. Bidirectional conversion between a Fluid inference model and an ONNX binary model will be supported. Model validation will also provided to verify the correctness of converted model. + +* Convert Fluid inference model to ONNX binary model + + ``` + python convert.py --fluid_model --onnx_model validate True + ``` + +* Validate the converted model + + ``` + python validate.py --fluid_model --onnx_model + ``` + +The conversion and model validation will be completed consecutively, finally output a readable model structure description. And for the converse conversion, users only need to exchange the input and output. + + +# Challenges and mitigation + +## Cycles + +Cycles are unsupported in ONNX. In Paddle, the `while` op is the most prominent example of a cycle. + +*Resolution*: We won't support models with `while`s which can't be substituted until ONNX adds support for such ops. + +## Sequences + +Sequence processing operators like `sequence_expand`, `sequence_reshape`, `sequence_concat`, and `sequence_pool` are not supported by ONNX as well, because they do not support non-padded datatypes like LoDTensors. + +*Resolution*: Since the runtimes using our ONNX exported graphs won't be using LoDTensors in the first place, such sequence operators should be mapped to ONNX ops that will do the necessary transposing ops with the knowledge of the padding and shape of the Tensors. + +## Ops that can't easily be mapped + +There are ops that just aren't possible to map today: + +**Control flow operators** + +Paddle supports control flow ops like `If/Else` and `Switch` (if we ignore the CSP operations like `select` for now). ONNX has `If` support in the experimental phase. + +*Resolution*: Map Paddle's `If/Else` to ONNX's `If`, but ignore other control flow operators until ONNX brings support for them. + + +**Non-existent in Fluid** + +There are several ONNX operators that are not available in Fluid today, e.g. `InstanceNormalization`, `RandomUniform`, `Unsqueeze`, etc. + +*Resolution*: For the initial phase, we can choose to not support ops that our models don't care for and are subsequently not available in Fluid. However, for ops that we think might be necessary for Fluid users also, we must implement them on our side and support the ONNX conversion to them. This list is TBD. + + +**Concurrency** + +ONNX does not have any considerations for concurrency right now. + +*Resolution*: There are two ways to approach this: + +a. 
We choose to not support concurrent models. +b. We only support `go_op`s (basically threads) shallowly. This could mean that we enqueue `go_op` ops prior to gradient calculations OR even prior to the entire graph, and that's it - since `go_op`s do not have support for backprop anyways. One of the core target use cases of `go_op`: batch reading - can be handled through this approach. + + +**Overloaded in Fluid** + +There are ops in ONNX whose job can't be accomplished by a single corresponding Paddle operator (e.g. ), but a collection of operators. + +*Resolution*: Chain multiple Paddle operators. + + +## Lack of LoDTensors + +As stated above, ONNX only supports simple Tensor values. + +*Resolution*: Deprecate to plain old numpy-able tensors. + + +## Reconstruction from deprecated ONNX ops + +For higher-level Fluid ops, such as a few offered by the `nn` layer that do not have direct corresponding mappings but can be converted to ONNX by chaining a series of ops without cycles, it would be useful to map them back to the higher-level Fluid ops once converted back from the deprecated ONNX graphs. + +*Resolution*: Graphs that have the deprecation from Paddle -> ONNX. When converting back from ONNX, if we encounter the identical graphs by doing a forward search, we can replace the subgraphs with the matching ONNX op. + + +# Supported models + +As mentioned above, potential risks may come from the conversion of sequence-related models, including the LodTensor, ```if/else``` and ```while``` operator. So a good choice is to focus on some important feedforward models first, then implement some simple recurrent models. + +- Feedforward models: common models selected in PaddleBook, e.g. VGG, ResNet and some other models proposed by application teams. +- Recurrent models: language model, stacked LSTMs etc. diff --git a/doc/fluid/design/others/auto_gradient_check.md b/doc/fluid/design/others/auto_gradient_check.md new file mode 100644 index 0000000000000000000000000000000000000000..773b7b6a767541f28c27f247c1ad8c9a8a2d0ccf --- /dev/null +++ b/doc/fluid/design/others/auto_gradient_check.md @@ -0,0 +1,150 @@ +## Auto Gradient Check Design + +## Background: +- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges: + 1. The formula for backpropagation formula should be correct according to the forward computation. + 2. The Implementation of the above shoule be correct in CPP. + 3. It is difficult to prepare an unbiased test data. + +- Auto gradient checking gets a numerical gradient using forward Operator and uses it as a reference for the backward Operator's result. It has several advantages: + 1. Numerical gradient checker only needs the forward operator. + 2. The user only needs to prepare the input data for forward Operator and not worry about the backward Operator. + +## Mathematical Theory +The following documents from Stanford have a detailed explanation of how to compute the numerical gradient and why it is useful. 
+ +- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization) +- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96) + + +## Numerical Gradient Implementation +### Python Interface +```python +def get_numerical_gradient(op, + input_values, + output_name, + input_to_check, + delta=0.005, + local_scope=None): + """ + Get Numerical Gradient for the input of an operator. + + :param op: C++ operator instance, could be an network. + :param input_values: The input variables. Should be an dictionary, whose key is + variable name, and value is a numpy array. + :param output_name: The final output variable name. + :param input_to_check: The input variable with respect to which the gradient has to be computed. + :param delta: The perturbation value for numerical gradient method. The + smaller the delta, the more accurate the result. But if the delta is too + small, it will suffer from the numerical stability problem. + :param local_scope: The local scope used for get_numeric_gradient. + :return: The gradient array in numpy format. + """ +``` + +### Explanation: + +- Why do we need an `output_name` + - An Operator may have multiple Outputs, one can compute an independent gradient from each Output. So the caller should specify the name of the output variable. + +- Why do we need `input_to_check` + - One operator can have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numerical Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times each with a different input. + + +### Core Algorithm Implementation + + +```python + # we only compute the gradient of one element a time. + # we use a for loop to compute the gradient of each element. + for i in xrange(tensor_size): + # get one input element using the index i. + original = tensor_to_check.get_float_element(i) + + # add delta to it, run the forward op and then + # get the new value of the result tensor. + x_pos = original + delta + tensor_to_check.set_float_element(i, x_pos) + y_pos = get_output() + + # Subtract delta from this element, run the op again + # and get the new value of the result tensor. + x_neg = original - delta + tensor_to_check.set_float_element(i, x_neg) + y_neg = get_output() + + # restore old value + tensor_to_check.set_float_element(i, original) + + # compute the gradient of this element and store + # it into a numpy array. + gradient_flat[i] = (y_pos - y_neg) / delta / 2 + + # reshape the gradient result to the shape of the source tensor. + return gradient_flat.reshape(tensor_to_check.get_dims()) +``` + +## Auto Gradient Check Framework + +Each Operator Kernel has three kinds of Gradient: + +1. Numerical gradient +2. CPU kernel gradient +3. GPU kernel gradient (if supported by the device) + +The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps: + +1. Calculate the numerical gradient +2. Calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient. +3. Calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient. 
(if supported) + +#### Python Interface + +```python + def check_grad(self, + forward_op, + input_vars, + inputs_to_check, + output_name, + no_grad_set=None, + only_cpu=False, + max_relative_error=0.005): + """ + :param forward_op: used to create backward_op + :param input_vars: numpy value of input variable. The following + computation will use these variables. + :param inputs_to_check: the input variable with respect to which the + gradient will be computed. + :param output_name: The final output variable name. + :param max_relative_error: The relative tolerance parameter. + :param no_grad_set: used to create backward ops + :param only_cpu: only compute and check gradient on cpu kernel. + :return: + """ +``` + +### How to check if two numpy arrays are close enough? +if `abs_numerical_grad` is nearly zero, then use absolute error for numerical_grad. + +```python +numerical_grad = ... +operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor()) + +abs_numerical_grad = numpy.abs(numerical_grad) +# if abs_numerical_grad is nearly zero, then use abs error for +# numeric_grad, instead of relative error. +abs_numerical_grad[abs_numerical_grad < 1e-3] = 1 + +diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad +max_diff = numpy.max(diff_mat) +``` + + +#### Notes: +The Input data for auto gradient checker should be reasonable to avoid numerical stability problem. + + +#### References: + +- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization) +- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96) diff --git a/doc/fluid/design/others/dcgan.png b/doc/fluid/design/others/dcgan.png new file mode 100644 index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28 Binary files /dev/null and b/doc/fluid/design/others/dcgan.png differ diff --git a/doc/fluid/design/others/gan_api.md b/doc/fluid/design/others/gan_api.md new file mode 100644 index 0000000000000000000000000000000000000000..7167470088766985fa5ad31657410309330fd725 --- /dev/null +++ b/doc/fluid/design/others/gan_api.md @@ -0,0 +1,253 @@ +# Design for GAN + +GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas. + +It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth. + +In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation. + +

+
+Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run. +

+
+The operators, layers and functions required/optional to build a GAN demo are summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
+
+

+
+Figure 2. Photo borrowed from the original DC-GAN paper. +

+
+## The Conditional-GAN might be a class.
+In this design we adopt the popular open-source designs in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains the following data structure:
+
+- DCGAN(object): contains everything required to build a GAN model. It provides the following member functions as its API:
+
+- __init__(...): Initialize hyper-parameters (like conv dimension and so forth), and declare the model parameters of the discriminator and generator as well.
+
+- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen.
+Returns a generated image.
+
+- discriminator(image):
+Given an image, decide if it is from a real source or a fake one.
+Returns a 0/1 binary label.
+
+- build_model(self):
+Build the whole GAN model and define the training losses for both the generator and the discriminator.
+
+## Discussion on Engine Functions required to build GAN
+- Trace the tensor and variable dependency in the engine executor. (Very critical; otherwise the GAN cannot be trained correctly.)
+- Different optimizers are responsible for optimizing different losses.
+
+In more detail, we introduce our design of DCGAN as follows:
+
+### Class member Function: Initializer
+- Set up hyper-parameters, including the conditional dimension, noise dimension, batch size and so forth.
+- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G.
+```python
+class DCGAN(object):
+  def __init__(self, y_dim=None):
+
+    # hyper parameters
+    self.y_dim = y_dim # conditional gan or not
+    self.batch_size = 100
+    self.z_dim = 100 # input noise dimension
+
+    # define parameters of discriminators
+    self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b0 = pd.Variable(np.zeros(128)) # a Variable also supports initialization from a numpy array
+    self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b1 = pd.Variable(np.zeros(128)) # a Variable also supports initialization from a numpy array
+    self.D_W2 = pd.Variable(np.random.rand(128, 1))
+    self.D_b2 = pd.Variable(np.zeros(128))
+    self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2]
+
+    # define parameters of generators
+    self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b0 = pd.Variable(np.zeros(128)) # a Variable also supports initialization from a numpy array
+    self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b1 = pd.Variable(np.zeros(128)) # a Variable also supports initialization from a numpy array
+    self.G_W2 = pd.Variable(np.random.rand(128, 1))
+    self.G_b2 = pd.Variable(np.zeros(128))
+    self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2]
+```
+
+### Class member Function: Generator
+- Given a noisy input z, returns a fake image.
+- Concatenation, batch-norm, FC operations required;
+- Deconv layer required, which is missing now...
+```python +class DCGAN(object): + def generator(self, z, y = None): + # input z: the random noise + # input y: input data label (optional) + # output G_im: generated fake images + + if not self.y_dim: + z = pd.layer.concat(1, [z, y]) + + G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0) + G_h0_bn = pd.layer.batch_norm(G_h0) + G_h0_relu = pd.layer.relu(G_h0_bn) + + G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1) + G_h1_bn = pd.layer.batch_norm(G_h1) + G_h1_relu = pd.layer.relu(G_h1_bn) + + G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2)) + G_im = pd.layer.tanh(G_im) + return G_im +``` + +### Class member function: Discriminator +- Given a noisy input z, returns a fake image. +- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required; +```python +class DCGAN(object): + def discriminator(self, image): + # input image: either generated images or real ones + # output D_h2: binary logit of the label + + D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0) + D_h0_bn = pd.layer.batchnorm(h0) + D_h0_relu = pd.layer.lrelu(h0_bn) + + D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1) + D_h1_bn = pd.layer.batchnorm(D_h1) + D_h1_relu = pd.layer.lrelu(D_h1_bn) + + D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2) + return D_h2 +``` + +### Class member function: Build the model +- Define data readers as placeholders to hold the data; +- Build generator and discriminators; +- Define two training losses for discriminator and generator, respectively. +If we have execution dependency engine to back-trace all tensors, the module building our GAN model will be like this: +```python +class DCGAN(object): + def build_model(self): + if self.y_dim: + self.y = pd.data(pd.float32, [self.batch_size, self.y_dim]) + self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.z = pd.data(tf.float32, [None, self.z_size]) + + # step 1: generate images by generator, classify real/fake images with discriminator + if self.y_dim: # if conditional GAN, includes label + self.G = self.generator(self.z, self.y) + self.D_t = self.discriminator(self.images) + # generated fake images + self.sampled = self.sampler(self.z, self.y) + self.D_f = self.discriminator(self.G) + else: # original version of GAN + self.G = self.generator(self.z) + self.D_t = self.discriminator(self.images) + # generate fake images + self.sampled = self.sampler(self.z) + self.D_f = self.discriminator(self.images) + + # step 2: define the two losses + self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)) + self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)) + self.d_loss = self.d_loss_real + self.d_loss_fake + + self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie)) +``` + +If we do not have dependency engine but blocks, the module building our GAN model will be like this: +```python +class DCGAN(object): + def build_model(self, default_block): + # input data in the default block + if self.y_dim: + self.y = pd.data(pd.float32, [self.batch_size, self.y_dim]) + self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.z = pd.data(tf.float32, [None, self.z_size]) + + # step 1: generate images by generator, classify real/fake images with discriminator + with pd.default_block().g_block(): + 
+      if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_g = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_g = self.discriminator(self.G)
+      self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_size)))
+
+    with pd.default_block().d_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.D_t = self.discriminator(self.images, self.y)
+        self.D_f = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.D_t = self.discriminator(self.images)
+        self.D_f = self.discriminator(self.G)
+
+      # step 2: define the two losses
+      self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)))
+      self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)))
+      self.d_loss = self.d_loss_real + self.d_loss_fake
+```
+Some remaining confusions and problems with this design:
+- D\_g and D\_f are actually the same thing, but have to be written twice; i.e., if we conceptually want to run two sub-graphs, code shared by both graphs has to be written twice.
+- It requires the ability to create a block anywhere, rather than only inside if-else or RNN constructs;
+
+## Main function for the demo:
+Generally, a GAN user just needs to do the following things:
+- Define a DCGAN object;
+- Build the DCGAN model;
+- Specify two optimizers for the two different losses, with respect to the corresponding parameters.
+```python
+# pd for short, should be more concise.
+import paddle.v2 as pd
+import numpy as np
+import logging
+
+if __name__ == "__main__":
+    # dcgan class in the default graph/block
+    # if we use a dependency engine as tensorflow does,
+    # the code will be slightly different, like:
+    # dcgan = DCGAN()
+    # dcgan.build_model()
+    with pd.block() as def_block:
+      dcgan = DCGAN()
+      dcgan.build_model(def_block)
+
+    # load mnist data
+    data_X, data_y = load_mnist()
+
+    # Two subgraphs required!!!
+    with pd.block().d_block():
+      d_optim = pd.train.Adam(lr=.001, beta=.1)
+      d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D)
+    with pd.block().g_block():
+      g_optim = pd.train.Adam(lr=.001, beta=.1)
+      g_step = g_optim.minimize(dcgan.g_loss, dcgan.theta_G)
+
+    # executor
+    sess = pd.executor()
+
+    # training
+    for epoch in xrange(10000):
+      for batch_id in range(N / batch_size):
+        idx = ...
+        # sample a batch
+        batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size]
+        # sample z
+        batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
+
+        if batch_id % 2 == 0:
+          sess.run(d_step,
+                   feed_dict = {dcgan.images: batch_im,
+                                dcgan.y: batch_label,
+                                dcgan.z: batch_z})
+        else:
+          sess.run(g_step,
+                   feed_dict = {dcgan.z: batch_z})
+```
+
+# More thinking about dependency engine vs. block design:
+- What if we just want to run an intermediate result? Do we need to run the whole block/graph?
+- Should we call eval() to get the fake images in the first stage, and then train the discriminator in the second stage?
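+
+To make the first question concrete, here is a rough sketch (reusing the hypothetical `pd` pseudo-API from the demo above; none of these calls are a confirmed Fluid API) of what fetching an intermediate result could look like:
+
+```python
+# Hypothetical sketch only: with a dependency engine, asking for dcgan.G should
+# back-trace and run just the generator sub-graph, without touching d_loss/g_loss.
+batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
+fake_images = sess.run(dcgan.G, feed_dict={dcgan.z: batch_z})
+
+# With the block design, dcgan.G lives inside g_block, so producing the same
+# intermediate result seems to require running that whole block, or defining an
+# extra, smaller block that contains only the generator.
+```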
diff --git a/doc/fluid/design/others/graph.md b/doc/fluid/design/others/graph.md new file mode 100644 index 0000000000000000000000000000000000000000..7519a65df835a39fe14f6ef45530afff170191ff --- /dev/null +++ b/doc/fluid/design/others/graph.md @@ -0,0 +1,70 @@
+# Design Doc: Computations as a Graph
+
+A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before.
+
+This document explains the construction of such a graph in three steps:
+
+- construct the forward part
+- construct the backward part
+- construct the optimization part
+
+## The Construction of a Graph
+
+Let us take the problem of image classification as a simple example. The application program that trains the model looks like:
+
+```python
+x = layer.data("images")
+l = layer.data("label")
+y = layer.fc(x)
+cost = layer.mse(y, l)
+optimize(cost)
+train(cost, reader=mnist.train())
+```
+
+### Forward Part
+
+The first four lines of the above program build the forward part of the graph.
+
+![](images/graph_construction_example_forward_only.png)
+
+In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x. `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators.
+
+Initialization operators are a kind of "run-once" operator -- the `Run` method increments a class data member counter so that it runs at most once. By doing so, a parameter wouldn't be initialized repeatedly, say, in every minibatch.
+
+In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`. These protobuf messages are saved in a `BlockDesc` protobuf message.
+
+### Backward Part
+
+The fifth line `optimize(cost)` calls two functions, `ConstructBackwardGraph` and `ConstructOptimizationGraph`.
+
+`ConstructBackwardGraph` traverses the forward graph in the `BlockDesc` protobuf message and builds the backward part.
+
+![](images/graph_construction_example_forward_backward.png)
+
+According to the chain rule of gradient computation, `ConstructBackwardGraph` would
+
+1. create a gradient operator G for each operator F,
+1. make all inputs, outputs, and outputs' gradients of F inputs of G,
+1. create gradients for all inputs of F, except for those that don't have gradients, like x and l, and
+1. make all these gradients outputs of G.
+
+### Optimization Part
+
+For each parameter, like W and b created by `layer.fc`, marked as double circles in the above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient. This results in the complete graph:
+
+![](images/graph_construction_example_all.png)
+
+## Block and Graph
+
+The words block and graph are interchangeable in the design of PaddlePaddle. A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphor for the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block.
+
+A Block keeps operators in an array `BlockDesc::ops`
+
+```protobuf
+message BlockDesc {
+  repeated OpDesc ops = 1;
+  repeated VarDesc vars = 2;
+}
+```
+
+in the order that they appear in user programs, like the Python program at the beginning of this article.
We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators. diff --git a/doc/fluid/design/others/graph_survey.md b/doc/fluid/design/others/graph_survey.md new file mode 100644 index 0000000000000000000000000000000000000000..97f395133b48a1d0ed5136f0ebc8720b8ca87ded --- /dev/null +++ b/doc/fluid/design/others/graph_survey.md @@ -0,0 +1,232 @@
+## Survey on Graph
+
+Neural network frameworks often provide a symbolic API for users to write network topologies conveniently. This doc mainly focuses on the symbolic APIs of the most popular neural network frameworks, and tries to find out how to parse a symbolic configuration into a portable file, such as protobuf or json.
+
+### Mxnet
+
+The core concept of the symbolic API is `Symbol`. Mxnet implements the `Symbol` class in C++ and exports it to Python using the C API. Please refer to the comments in Mxnet:
+
+
+`Symbol` is a helper class used to represent the operator node in a Graph.
+`Symbol` acts as an interface for building graphs from different components like Variable, Functor and Group. `Symbol` is also exported to the python front-end (while Graph is not) to enable quick test and deployment. Conceptually, a symbol is the final operation of a graph and thus includes all the information required (the graph) to evaluate its output value.
+
+
+A simple network topology written with Symbol is as follows:
+
+```python
+def get_symbol(num_classes=10, **kwargs):
+    data = mx.symbol.Variable('data')
+    data = mx.symbol.Flatten(data=data)
+    fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+    act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+    fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
+    act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+    fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes)
+    mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax')
+    return mlp
+```
+
+
+
+Variable here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own AnyAttr. There is an op field in the AnyAttr class; when a Symbol represents a Variable (often input data), the op field is null.
+
+Symbol contains a data member, `std::vector<NodeEntry> outputs`, and a NodeEntry contains a pointer to a Node. We can follow the Node pointers to recover the whole Graph.
+
+A Symbol can also be saved to a JSON file.
+
+Here is a detailed example:
+
+```
+>>> import mxnet as mx
+>>> data = mx.symbol.Variable('data')
+>>> print data.debug_str()
+Variable:data
+
+>>> data = mx.symbol.Flatten(data=data)
+>>> print data.debug_str()
+Symbol Outputs:
+	output[0]=flatten0(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+	arg[0]=data(0) version=0
+
+>>> fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+>>> print fc1.debug_str()
+Symbol Outputs:
+	output[0]=fc1(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+	arg[0]=data(0) version=0
+Variable:fc1_weight
+Variable:fc1_bias
+--------------------
+Op:FullyConnected, Name=fc1
+Inputs:
+	arg[0]=flatten0(0)
+	arg[1]=fc1_weight(0) version=0
+	arg[2]=fc1_bias(0) version=0
+Attrs:
+	num_hidden=128
+
+```
+
+
+### TensorFlow
+
+
+The core concept of the symbolic API is `Tensor`. TensorFlow defines `Tensor` in Python. Please refer to the comments in TensorFlow:
+
+A `Tensor` is a symbolic handle to one of the outputs of an `Operation`.
It does not hold the values of that operation's output, but instead provides a means of computing those values in a TensorFlow [Session](https://www.tensorflow.org/api_docs/python/tf/Session). + +A simple example is as follows: + +```python + # Build a dataflow graph. + c = tf.constant([[1.0, 2.0], [3.0, 4.0]]) + d = tf.constant([[1.0, 1.0], [0.0, 1.0]]) + e = tf.matmul(c, d) + + # Construct a `Session` to execute the graph. + sess = tf.Session() + + # Execute the graph and store the value that `e` represents in `result`. + result = sess.run(e) +``` + + +The main method of `Tensor` is as follows: + + +```python +@property +def op(self): + """The `Operation` that produces this tensor as an output.""" + return self._op + +@property +def dtype(self): + """The `DType` of elements in this tensor.""" + return self._dtype + +@property +def graph(self): + """The `Graph` that contains this tensor.""" + return self._op.graph + +@property +def name(self): + """The string name of this tensor.""" + if not self._op.name: + raise ValueError("Operation was not named: %s" % self._op) + return "%s:%d" % (self._op.name, self._value_index) + +@property +def device(self): + """The name of the device on which this tensor will be produced, or None.""" + return self._op.device +``` + + +Tensor can be taken as target to run by session. Tensor contains all the information of Graph, and tracks data dependency. + + +Here is a detailed example: + + +``` +>>> import tensorflow as tf +>>> c = tf.constant([[1.0, 2.0], [3.0, 4.0]]) +>>> print c.graph + +>>> d = tf.constant([[1.0, 1.0], [0.0, 1.0]]) +>>> print d.graph + +>>> e = tf.matmul(c, d) +>>> print e.graph + +``` + +### Dynet + + +The core concept of symbolic API is `Expression`, and Dynet defines `Expression` class in C++. + + +A simple example is as follows: + +```cpp +ComputationGraph cg; +Expression W = parameter(cg, pW); + +Expression in = input(cg, xs[i]); +Expression label = input(cg, ys[i]); +Expression pred = W * in; +Expression loss = square(pred - label); +``` + +The input data and parameter are also represented by Expression. Every basci Expression corresponds to a Node. And input data is also a Node. + +Expression has a data member ComputationGraph, and ComputationGraph will be modified in users' configuring process. Expression can be a running target, beacuse Expression contains all dependency. + + +Here is a detailed example: + +write topology in C++ + +``` +ComputationGraph cg; +Expression W = parameter(cg, pW); +cg.print_graphviz(); + +Expression pred = W * xs[i]; +cg.print_graphviz(); + +Expression loss = square(pred - ys[i]); +cg.print_graphviz(); +``` + +compile and print + +``` +# first print +digraph G { + rankdir=LR; + nodesep=.05; + N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; +} +# second print +digraph G { + rankdir=LR; + nodesep=.05; + N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; + N1 [label="v1 = v0 * -0.98"]; + N0 -> N1; +} +# third print +digraph G { + rankdir=LR; + nodesep=.05; + N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; + N1 [label="v1 = v0 * -0.98"]; + N0 -> N1; + N2 [label="v2 = -1.88387 - v1"]; + N1 -> N2; + N3 [label="v3 = -v2"]; + N2 -> N3; + N4 [label="v4 = square(v3)"]; + N3 -> N4; +} +``` + +### Conclusion + + +Actually, Symbol/Tensor/Expression in Mxnet/TensorFlow/Dynet are the same level concepts. We use a unified name Expression here, this level concept has following features: + +- Users wirte topoloy with symbolic API, and all return value is Expression, including input data and parameter. 
+- Expression corresponds with a global Graph, and Expression can also be composed. +- Expression tracks all dependency and can be taken as a run target diff --git a/doc/fluid/design/others/images/graph_construction_example.bash b/doc/fluid/design/others/images/graph_construction_example.bash new file mode 100755 index 0000000000000000000000000000000000000000..35e6997abd17588e17a82d448918fc1b3bd7220e --- /dev/null +++ b/doc/fluid/design/others/images/graph_construction_example.bash @@ -0,0 +1,11 @@ +cat ./graph_construction_example.dot | \ + sed 's/color=red/color=red, style=invis/g' | \ + sed 's/color=green/color=green, style=invis/g' | \ + dot -Tpng > graph_construction_example_forward_only.png + +cat ./graph_construction_example.dot | \ + sed 's/color=green/color=green, style=invis/g' | \ + dot -Tpng > graph_construction_example_forward_backward.png + +cat ./graph_construction_example.dot | \ + dot -Tpng > graph_construction_example_all.png diff --git a/doc/fluid/design/others/images/graph_construction_example.dot b/doc/fluid/design/others/images/graph_construction_example.dot new file mode 100644 index 0000000000000000000000000000000000000000..e115f9844bae6ad24f638c8ed4749cea8aff06a9 --- /dev/null +++ b/doc/fluid/design/others/images/graph_construction_example.dot @@ -0,0 +1,68 @@ +digraph ImageClassificationGraph { + ///////// The forward part ///////// + FeedX [label="Feed", color=blue, shape=box]; + FeedY [label="Feed", color=blue, shape=box]; + InitW [label="Init", color=blue, shape=diamond]; + Initb [label="Init", color=blue, shape=diamond]; + FC [label="FC", color=blue, shape=box]; + MSE [label="MSE", color=blue, shape=box]; + + x [label="x", color=blue, shape=oval]; + l [label="l", color=blue, shape=oval]; + y [label="y", color=blue, shape=oval]; + W [label="W", color=blue, shape=doublecircle]; + b [label="b", color=blue, shape=doublecircle]; + cost [label="cost", color=blue, shape=oval]; + + FeedX -> x -> FC -> y -> MSE -> cost [color=blue]; + FeedY -> l [color=blue]; + InitW -> W [color=blue]; + Initb -> b [color=blue]; + W -> FC [color=blue]; + b -> FC [color=blue]; + l -> MSE [color=blue]; + + ////////// The backward part ///////// + MSE_Grad [label="MSE_grad", color=red, shape=box]; + FC_Grad [label="FC_grad", color=red, shape=box]; + + d_cost [label="d cost", color=red, shape=oval]; + d_y [label="d y", color=red, shape=oval]; + d_b [label="d b", color=red, shape=oval]; + d_W [label="d W", color=red, shape=oval]; + + cost -> MSE_Grad [color=red]; + d_cost -> MSE_Grad [color=red]; + l -> MSE_Grad [color=red]; + y -> MSE_Grad -> d_y [color=red]; + + x -> FC_Grad [color=red]; + y -> FC_Grad [color=red]; + d_y -> FC_Grad [color=red]; + W -> FC_Grad -> d_W [color=red]; + b -> FC_Grad -> d_b [color=red]; + + ////////// The optimizaiton part ////////// + + OPT_W [label="SGD", color=green, shape=box]; + OPT_b [label="SGD", color=green, shape=box]; + + W -> OPT_W [color=green]; + b -> OPT_b [color=green]; + d_W -> OPT_W -> W [color=green]; + d_b -> OPT_b -> b [color=green]; + + ////////// Groupings ////////// + + subgraph clusterMSE { + style=invis; + MSE; + MSE_Grad; + } + + subgraph clusterFC { + style=invis; + FC; + FC_Grad; + } +} diff --git a/doc/fluid/design/others/images/graph_construction_example_all.png b/doc/fluid/design/others/images/graph_construction_example_all.png new file mode 100644 index 0000000000000000000000000000000000000000..261611a5721f9aa97874f7e6d897fe48cf667db2 Binary files /dev/null and b/doc/fluid/design/others/images/graph_construction_example_all.png 
differ diff --git a/doc/fluid/design/others/images/graph_construction_example_forward_backward.png b/doc/fluid/design/others/images/graph_construction_example_forward_backward.png new file mode 100644 index 0000000000000000000000000000000000000000..4c69687f4a6a181138f3df72ce5e8aa48487b5be Binary files /dev/null and b/doc/fluid/design/others/images/graph_construction_example_forward_backward.png differ diff --git a/doc/fluid/design/others/images/graph_construction_example_forward_only.png b/doc/fluid/design/others/images/graph_construction_example_forward_only.png new file mode 100644 index 0000000000000000000000000000000000000000..e668c16e0cac73acb4e5dc2b1827557ae77126b4 Binary files /dev/null and b/doc/fluid/design/others/images/graph_construction_example_forward_only.png differ diff --git a/doc/fluid/design/others/parameters_in_cpp.md b/doc/fluid/design/others/parameters_in_cpp.md new file mode 100644 index 0000000000000000000000000000000000000000..a7ac3f17c44ca94a669a8f1e283b291bceb42317 --- /dev/null +++ b/doc/fluid/design/others/parameters_in_cpp.md @@ -0,0 +1,41 @@ +# Design Doc: The C++ Class `Parameters` + +`Parameters` is a concept we designed in PaddlePaddle V2 API. `Parameters` is a container of parameters, which makes PaddlePaddle capable of sharing parameter between topologies. We described usages of `Parameter` in [api.md](./api.md). + +We used Python to implement Parameters when designing V2 API before. There are several defects for the current implementation: +* We just use `memcpy` to share Parameters between topologies, but this is very inefficient. +* We did not support sharing Parameters while training. We just trigger `memcpy` when start training. + +It is necessary that we implement Parameters in CPP side. However, it could result a code refactoring for PaddlePaddle, because PaddlePaddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current PaddlePaddle implementation, there are three concepts associated with `Parameters`: + +1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`. +It is evident that we should use `paddle::Parameter` when developing `Parameters`. +However, the `Parameter` class contains many functions and does not have a clear interface. +It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`. +When we developing `Parameters`, we only use `create/store Parameter` functionality. +We should extract functionalities of Parameter into many classes to clean PaddlePaddle CPP implementation. + +2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`. +We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies. +Also, we should handle multi-GPU/CPU training, because `forward` and `backward` would perform on multi-GPUs and multi-CPUs. +`Parameters` should dispatch the parameter value to each device, and gather the parameter gradient from each device. + +3. `paddle::ParameterUpdater`. The ParameterUpdater is used to update parameters in Paddle. +So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD). + + +The step by step approach for implementation Parameters in PaddlePaddle C++ core is listed below. Each step should be a PR and could be merged into PaddlePaddle one by one. + +1. Clean `paddle::Parameter` interface. 
Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters.
+
+2. Implement a `Parameters` class. It just stores the `paddle::Parameter` objects inside. Make `GradientMachine` use `Parameters` as a class member.
+
+3. Make `Parameters` support Multi-CPU and Multi-GPU training to prepare for sharing `Parameter` between topologies.
+Because we need to share `Parameters` between topologies, it is `Parameters`' responsibility to exchange Parameters between GPUs.
+`GradientMachine` should not handle how to exchange Parameters, because a `GradientMachine` is only used to train one topology and we need to support training many topologies in Paddle, i.e., many GradientMachines could use one `Parameters` object.
+    * We should use a global function to exchange Parameters between GPUs, not a member function in `Parameters`. The `MultiGradientMachine` invokes this function, which takes `Parameters` as its input.
+    * The MultiGradientMachine contains many functionalities. Extracting the Parameters-exchanging logic could make MultiGradientMachine clearer and simpler.
+
+4. Make `Parameters` an argument of the `forward/backward` functions, not a data member of `GradientMachine`. For example, `forward` could be `forward(const Parameters& params, ...)` and `backward` could be `backward(Parameters* params, ...)`. After this step, Paddle could share `Parameters` between topologies.
+
+5. `ParameterUpdater` is invoked by `GradientMachine` and `Trainer`, but it updates `Parameters`. At the end of this code refactoring, we can change `ParameterUpdater` to use `Parameters` directly, to make `ParameterUpdater`'s implementation clear. diff --git a/doc/fluid/design/others/simple_op_design.md b/doc/fluid/design/others/simple_op_design.md new file mode 100644 index 0000000000000000000000000000000000000000..c7aeed7f9b4637e1c29d530f37b42d12500af82f --- /dev/null +++ b/doc/fluid/design/others/simple_op_design.md @@ -0,0 +1,202 @@
+## Interaction between C++ and Python
+
+Users employ the Python API to describe their own networks; however, the network construction actually happens in C++, so Protobuf is introduced to pass messages between Python and C++.
+
+The interaction between Python and C++ can be simplified as two steps:
+
+1. C++ tells Python how many Ops there are, and what parameters users need to offer to initialize a new Op. Python then builds an API for each Op at compile time.
+
+2. Users invoke the APIs built by Python and provide the necessary parameters. These parameters will be sent to C++ to finish the Op construction task.
+
+### Message from C++ to Python
+
+We define a Protobuf message class `OpProto` to hold the message needed in the first step. What should an `OpProto` contain? This question is equivalent to “What message do we need to offer, to build a Python API which is legal and user oriented and can be used to describe a whole Op?”
+
+The following messages are necessary:
+
+1. Op's name, and its simple comment.
+2. Input and output variable number; each variable's name, type, and comment.
+3. Op's attributes; each attribute includes name, type, comment, **default value** and **value range**.
+ +So `OpProto` can be defined as follows: + +```proto +enum AttrType { + INT = 1; + FLOAT = 2; + STRING = 3; + INTS = 4; + FLOATS = 5; + STRINGS = 6; +}; + +message AttrValue { + AttrType type = 1; + optional int iv = 2; + optional float fv = 3; + optional string sv = 4; + repeated int ivs = 5; + repeated float fvs = 6; + repeated string svs = 7; +}; + +message AttrProto { + required string name = 1; + required string comment = 2; + required AttrType type = 3; +}; + +message VarProto { + required string name = 1; + required string comment = 2; + required bool is_tensor = 3; +}; + +message OpProto { + repeated VarProto inputs = 1; + repeated VarProto outputs = 2; + repeated AttrProto attrs = 3; + required string type = 4; + required string comment = 5; +}; +``` + +To generate Python code automatically: + +```python +def create_python_ops_creatation_functions(): + op_protos = paddle.framework.OpRegistry.get_all_op_proto() + for type_name in op_protos: + op_proto = op_protos[type_name] + def __impl__(**kwargs): # User must use key word args in Paddle API + inputs = [kwargs.get(ipt.name, "") for ipt in op_proto.inputs] + outputs = [kwargs.get(opt.name, "") for opt in op_proto.outputs] + attrs = [cast_to_op_attr(attr, kwargs.get(attr.name, None)) for attr in op_proto.attrs] + opdesc = (input, outputs, type_name, attrs) + return paddle.framework.OpRegistry.CreateOp(opdesc) + __impl__.__doc__ = create_doc_string(op_proto) + globals()[type_name] = __impl__ + +create_python_ops_creatation_functions() +``` + +### Message from Python to C++ + +To hold message needed in the above second step, we define Protobuf message class `OpDesc`. It is used to hold user-specified parameters in Op describing. + +```proto +message OpDesc { + required string type = 1; + repeated string inputs = 2; + repeated string outputs = 3; + map attrs = 4; +}; +``` + +## OpProto Register + +Every Op has its own `OpProto`. For using convenience, we need to register them and record all their messages. For each `Op` class, we define a corresponding `OpMaker` class, in whose constructor we implement the `OpProto`'s building process. `OpMaker`'s constructor will be invoked by another function `OpRegistry::RegisterOp()`. + +```cpp +class OpProtoMaker { +public: + OpProtoMaker(OpProto* proto): proto_(proto) {} +protected: + OpProto* proto_; + void AddInput(const std::string& name, const std::string& desc) {...} + void AddAttr(const std::string& name, const std::string& desc, TypeId type) {...} + void AddComment(const std::string& comment) { ... } +}; + +class OpRegistry { +public: + using OpCreator = std::function; + + template + static void RegisterOp(const std::string& name) { + gCreators_[name] = [](const OpDesc& desc) { + return new OpType(desc); + }; + OpProto& opProto = gProtos_[name]; + OpMaker()(&opProto); + } + + static map gCreators_; + static map gProtos_; +}; + +template +class OpRegister { + public: + OpRegister(std::string type) { + OpRegistry::RegisterOp(type); + } +}; + +#define REGISTER_OP(op_class, op_maker_class, type_name) \ + class op_class##Register { \ + private: \ + const static OpRegister<#op_class, #op_maker_class> reg; \ + }; \ + const Register op_class##Register::reg(#type_name); + +class CosineOp { +// ... 
+} + +struct CosineOpProtoMaker : public OpProtoMaker { + CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) { + AddInput("input", "input of cosine op"); + AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0); + AddType("cos"); + AddComment("This is cos op"); + } +} + +REGISTER_OP(CosineOp, CosineOpProtoMaker, cos); +``` + +In `REGISTER_OP(CosineOp, CosineOpProtoMaker, cos)`, we register not only `CosineOp` but also `CosineOpProto`. As fields of `CosineOpProto`, the default value and value range of `scale` are also registered here. + +## Python API + +Python APIs are divided into two types, high-level API and low-level API. + +### High-Level API + +High-level API is called by users directly, so it should keep its style consistent with existing V2 APIs. + +Here is a sample about how a define a fc layer: + +```python +hd = fc_layer(input=data, size=56, with_bias=True, activation="sigmoid"); +``` + +`hd` is the output of `fc_layer` and it's a `variable`. It can be further sent into other layers as input. + +The definition of `fc_layer()`: + +```python +def fc_layer(input, size, with_bias, activation): + attr_map = {"size":size} + check_attrs(attr_map) + w = make_variable('w') + if with_bias: + b = make_variable('b') + else: + b = None + fc_output = make_variable('fc_output'); + fc_op(input, w, b, fc_output, attr_map) + act_output = make_variable('sigmod_output'); + if activation == "sigmod": + sigmod_op(fc_output, act_output); + elif: + # ... + return act_output; +``` + +### Low Leval API + +In above sample, `fc_op` and `sigmod_op` are low-level API. They build `OpDesc` and invoke corresponding C++ code. + +*TODO* diff --git a/doc/fluid/design/others/test.dot b/doc/fluid/design/others/test.dot new file mode 100644 index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a --- /dev/null +++ b/doc/fluid/design/others/test.dot @@ -0,0 +1,35 @@ + +digraph Test { + z -> generator -> G_img; + G_img -> discriminator -> D_f -> d_loss_f; + label0 -> d_loss_f -> d_loss; + + img -> discriminator -> D_t -> d_loss_t; + label1 -> d_loss_t -> d_loss; + + d_loss -> d_loss_t[color=red, style=dashed]; + d_loss -> d_loss_f[color=red, style=dashed]; + d_loss_t -> D_t[color=red, style=dashed]; + d_loss_f -> D_f[color=red, style=dashed]; + D_t -> discriminator[color=red, style=dashed]; + D_f -> discriminator[color=red, style=dashed]; + + D_f -> g_loss; + label2 -> g_loss; + + g_loss -> D_f[color=green, style=dashed]; + D_f -> discriminator[color=green, style=dashed]; + discriminator -> G_img[color=green, style=dashed]; + G_img -> generator[color=green, style=dashed]; + + discriminator [color=red, shape=box]; + generator [color=green, shape=box]; + z [shape=diamond]; + img [shape=diamond]; + label0 [shape=diamond]; + label1 [shape=diamond]; + label2 [shape=diamond]; + + d_loss [color=red]; + g_loss [color=green]; +} diff --git a/doc/fluid/design/others/test.dot.png b/doc/fluid/design/others/test.dot.png new file mode 100644 index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55 Binary files /dev/null and b/doc/fluid/design/others/test.dot.png differ diff --git a/doc/fluid/design/quantization/fixed_point_quantization.md b/doc/fluid/design/quantization/fixed_point_quantization.md new file mode 100644 index 0000000000000000000000000000000000000000..085352fc5614d693e63a2f7241e868a9649456af --- /dev/null +++ b/doc/fluid/design/quantization/fixed_point_quantization.md @@ -0,0 +1,110 @@ +Fixed-point quantization uses lower bits, 
for example, 2-bit, 3-bit or 8-bit fixed point, to represent weights and activations, which are usually 32-bit single-precision floating-point values. The fixed-point representation has advantages in reducing memory bandwidth, power consumption, computational resources and model storage requirements. It is especially important for inference in embedded-device deployment.
+
+According to some experiments, directly quantizing a model trained in floating point works effectively on large models, like the VGG model which has many parameters, but the accuracy drops a lot for small models. In order to improve the tradeoff between accuracy and latency, many quantized training approaches have been proposed.
+
+This document designs a quantized training framework on Fluid. The first part introduces how to quantize, the second part describes the quantized training framework, and the last part illustrates how to calculate the quantization scale.
+
+
+### How to quantize
+
+There are many ways to quantize a float value to a fixed-point value. For example:
+
+$$ r = min(max(x, a), b)$$
+$$ s = \frac{b - a}{n - 1} $$
+$$ q = \left \lfloor \frac{r - a}{s} \right \rceil $$
+
+where $x$ is the float value to be quantized, $[a, b]$ is the quantization range, $a$ is the minimum value and $b$ is the maximum value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer. If the quantization level is $k$, $n$ is $2^k$; for example, if $k$ is 8 then $n$ is 256. $q$ is the quantized integer.
+
+
+The quantization we apply is parameterized by the number of quantization levels and the maximum absolute value:
+
+$$ M = max(abs(x)) $$
+$$ q = \left \lfloor \frac{x}{M} * (n - 1) \right \rceil $$
+
+where $x$ is the float value to be quantized and $M$ is the maximum absolute value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer. For 8-bit quantization, $n=2^{8}=256$. $q$ is the quantized integer.
+
+
+Whether *min-max* quantization or *max-abs* quantization is used, both can be represented as:
+
+$q = scale * r + b$
+
+We call *min-max* and *max-abs* the quantization arguments, also called the quantization scale or quantization range.
+
+
+How to calculate the quantization scale (or maximum absolute value) for inference will be described in the last part.
+
+
+### Training Framework
+
+#### Forward pass
+
+The forward pass is simulated quantization, see Figure 1.
+
+The training framework is shown in the following figure.
+
+Figure 1. Forward in training with simulated quantization.
+
+- Firstly, both the input and the weight are quantized to 8-bit integers.
+- Secondly, the multiplication (or convolution) operation is performed with integers.
+- Thirdly, the multiplication (or convolution) results are dequantized to 32-bit floating point.
+- Finally, bias-addition is done in 32-bit floating point. Here, the bias is not quantized.
+
+For general matrix multiplication (GEMM), quantize $X$ and $W$:
+
+$$ X_q = \left \lfloor \frac{X}{X_m} * (n - 1) \right \rceil $$
+$$ W_q = \left \lfloor \frac{W}{W_m} * (n - 1) \right \rceil $$
+
+Do GEMM:
+
+$$ Y = X_q * W_q $$
+
+
+Dequantize $Y$:
+
+$$
+\begin{align}
+Y_{dq} &=\frac{Y}{(n - 1) * (n - 1)} * X_m * W_m \\\
+ &=\frac{X_q * W_q}{(n - 1) * (n - 1)} * X_m * W_m \\\
+ &=(\frac{X_q}{n - 1} * X_m) * (\frac{W_q}{n - 1} * W_m)
+\end{align}
+$$
+
+From these formulas, the dequantization can also be moved before the GEMM: do the dequantization for $X_q$ and $W_q$ first, and then do the GEMM. The forward workflow in training is then equivalent to the following framework.
+
+Figure 2. Equivalent forward in training with simulated quantization.
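+
+To make this equivalence concrete, here is a minimal NumPy sketch (for illustration only, it is not Fluid code) of max-abs quantization and the two orderings shown in Figure 1 and Figure 2:
+
+```python
+import numpy as np
+
+def quantize_max_abs(x, num_bits=8):
+    # max-abs quantization: q = round(x / max(|x|) * (n - 1))
+    n = 2 ** num_bits
+    scale = np.max(np.abs(x))
+    return np.round(x / scale * (n - 1)), scale
+
+np.random.seed(0)
+n = 2 ** 8
+X = np.random.randn(4, 8)
+W = np.random.randn(8, 3)
+
+X_q, X_m = quantize_max_abs(X)
+W_q, W_m = quantize_max_abs(W)
+
+# Figure 1 order: "integer" GEMM first, then dequantize the result.
+Y_dq = np.dot(X_q, W_q) / ((n - 1) * (n - 1)) * X_m * W_m
+# Figure 2 order: dequantize X_q and W_q first, then a float GEMM.
+Y_dq_equiv = np.dot(X_q / (n - 1) * X_m, W_q / (n - 1) * W_m)
+
+assert np.allclose(Y_dq, Y_dq_equiv)
+```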

+
+We use this equivalent workflow in the training. In our design, there is a quantization transpiler that inserts the quantization operators and the de-quantization operators into the Fluid `ProgramDesc`. Since the outputs of the quantization and de-quantization operators are still in floating point, they are called fake quantization and de-quantization operators, and the training framework is called simulated quantization.
+
+#### Backward pass
+
+See Figure 3. The gradients are calculated with the dequantized weights and activations. All inputs and outputs are 32-bit floating point. And in the weight-updating process, the gradients are added to the original weights, not to the quantized or dequantized weights.
+
+Figure 3. Backward and weight updating in training with simulated quantization.
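+
+A minimal NumPy sketch of this update rule (for illustration only, not Fluid code; the fake_quant helper below is a hypothetical stand-in for the fake quantization/de-quantization operators):
+
+```python
+import numpy as np
+
+def fake_quant(v, num_bits=8):
+    # quantize then immediately dequantize, so the value stays in floating point
+    n = 2 ** num_bits
+    m = np.max(np.abs(v))
+    return np.round(v / m * (n - 1)) / (n - 1) * m
+
+np.random.seed(0)
+lr = 0.1
+X = np.random.randn(4, 8)
+W = np.random.randn(8, 3)        # the original float weights kept by the trainer
+Y_grad = np.random.randn(4, 3)   # gradient flowing back from the next operator
+
+# forward uses the fake-quantized (dequantized) activations and weights
+X_dq, W_dq = fake_quant(X), fake_quant(W)
+Y = np.dot(X_dq, W_dq)
+
+# backward: the weight gradient is computed from the dequantized activations ...
+W_grad = np.dot(X_dq.T, Y_grad)
+# ... but the update is applied to the original float weights, not to W_q or W_dq
+W -= lr * W_grad
+```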

+ +So the quantization transipler will change some inputs of the corresponding backward operators. + +### How to calculate quantization scale + +There are two strategies to calculate quantization scale, we call them dynamic and static strategy. The dynamic strategy calculates the quantization scale value each iteration. The static strategy keeps the quantization scale for different inputs. + +For weights, we apply the dynamic strategy in the training, that is to say, the quantization scale will be recalculated during each iteration until the traning is finished. + +For activations, the quantization scales are estimated during training, then used in inference. There are several different ways to estimate them: + + +1. Calculate the mean of maximum absolute during a window. +2. Calculate the max of maximum absolute during a window. +3. Calculate the running mean of maximum absolute during a window, as follows: + + $$ Vt = (1 - k) * V + k * V_{t-1} $$ + + where, $V$ is the maximum absolute value of current batch, $Vt$ is the running mean value. $k$ is a factor, such as 0.9. diff --git a/doc/fluid/design/quantization/quantization_backward_and_optimization.png b/doc/fluid/design/quantization/quantization_backward_and_optimization.png new file mode 100644 index 0000000000000000000000000000000000000000..84f8235ab87cb631992b691f8e05b9c0b6c93da2 Binary files /dev/null and b/doc/fluid/design/quantization/quantization_backward_and_optimization.png differ diff --git a/doc/fluid/design/quantization/quantization_equivalent_forward.png b/doc/fluid/design/quantization/quantization_equivalent_forward.png new file mode 100644 index 0000000000000000000000000000000000000000..df49c864537c047c785da12d24893e54ce0a5341 Binary files /dev/null and b/doc/fluid/design/quantization/quantization_equivalent_forward.png differ diff --git a/doc/fluid/design/quantization/quantization_forward.png b/doc/fluid/design/quantization/quantization_forward.png new file mode 100644 index 0000000000000000000000000000000000000000..0913f61621bb6533bcb10bd1d18120ccaaa96cff Binary files /dev/null and b/doc/fluid/design/quantization/quantization_forward.png differ diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..7d39b8de1e6dc502ffea5f7882bd6a42b1ed6549 --- /dev/null +++ b/doc/fluid/dev/api_doc_std_cn.md @@ -0,0 +1,221 @@ +# API注释撰写标准 + +- [API注释撰写标准](#api) + - [API注释模块](#api) + - [格式及示例](#) + - [完整示例](#) + + +## API注释模块 + +API文档须包含以下几个模块(排列顺序为文档撰写顺序): + +- Python API Definition + + API的代码定义。 + +- Function Description + + API的功能描述。描述该API的含义、作用或对输入所做的操作,及参考文献和对应链接(如果有),必要时给出公式,并解释公式中关键变量的含义。 + +- Args Description + + API参数介绍。按代码定义中的参数顺序逐个介绍,介绍内容包含数据类型、默认值(如果有)、含义等。 + +- Returns + + API返回值介绍。介绍返回值含义,必要时给出对应的形状。若返回值为包含多个参数的tuple,则按顺序逐个介绍各参数。 + +- Raises(如果有) + + 可能抛出的异常或错误及可能的产生原因,当可能抛出多种异常或错误时应分条列出。 + +- Note(如果有) + + 注意事项。当有多条注意事项时,应分条列出。 + +- Examples + + API的使用示例。 + + +## 格式及示例 + +API文档须使用reStructuredText格式撰写,该格式详情请参考[链接](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html)。API文档各模块的内容格式及示例如下(以下以fc为例进行说明): + +- Python API Definition + + - 格式: + + [Python API Definition] + + - 示例 + + ``` + fc(input, + size, + num_flatten_dims=1, + param_attr=None, + bias_attr=None, + act=None, + name=None, + main_program=None, + startup_program=None) + ``` + +- Function Description + + - 格式 + + 本模块应包含以下内容(排列顺序为文档撰写顺序): + + [Function Description] + + [Formula] + + [Symbols' Descriptions if necessary] + + [References if necessary] + + - 示例 + + [Function 
Description] + + ``` + **Fully Connected Layer** + + The fully connected layer can take multiple tensors as its inputs. It + creates a variable called weights for each input tensor, which represents + a fully connected weight matrix from each input unit to each output unit. + The fully connected layer multiplies each input tensor with its coresponding + weight to produce an output Tensor. If multiple input tensors are given, + the results of multiple multiplications will be sumed up. If bias_attr is + not None, a bias variable will be created and added to the output. Finally, + if activation is not None, it will be applied to the output as well. + ``` + + [Formula] + + ``` + This process can be formulated as follows: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) + ``` + + [Symbols' Descriptions if necessary] + + ``` + In the above equation: + + * :math:`N`: Number of the input. + * :math:`X_i`: The input tensor. + * :math:`W`: The weights created by this layer. + * :math:`b`: The bias parameter created by this layer (if needed). + * :math:`Act`: The activation function. + * :math:`Out`: The output tensor. + ``` + + [References if necessary] + + 因fc没有必要列出的参考文献,故该内容省略。其他情况下需明确给出对应的参考文献和对应连接,以 layer_norm 为例: + + ``` + Refer to `Layer Normalization `_ for more details. + ``` + + +- Args Description + + - 格式 + + \[Arg's Name\][(Data Type, Default Value)][Description] + + - 示例 + + fc的部分参数注释如下: + + ``` + Args: + input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of + the input tensor(s) is at least 2. + param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable + parameters/weights of this layer. + name (str, default None): The name of this layer. + ``` + +- Returns + + - 格式 + + [Name][Shape] + + - 示例 + + ``` + Returns: + A tensor variable storing the transformation result. + ``` + + 当返回值为包含多个参数的tuple时,应按顺序逐个介绍各参数,以dynamic_lstm为例: + + ``` + Returns: + A tuple containing: + The hidden state of LSTM whose shape is (T X D). + The cell state of LSTM whose shape is (T X D). + ``` + +- Raises + + - 格式 + + [Exception Type][Condition] + + - 示例 + + ``` + Raises: + ValueError: If the rank of the input is less than 2. + ``` + +- Note + + - 格式 + + [Note] + + - 示例 + + fc没有注意事项,故该模块省略不写。如有注意事项应明确给出,当有多条注意事项,须分条列出,以scaled\_dot\_product\_attention为例: + + ``` + Note: + 1. When num_heads > 1, three linear projections are learned respectively + to map input queries, keys and values into queries', keys' and values'. + queries', keys' and values' have the same shapes with queries, keys + and values. + 2. When num_heads == 1, scaled_dot_product_attention has no learnable + parameters. + ``` + +- Examples + + - 格式 + + \[Python Code Snipper] + + - 示例 + + ``` + Examples: + .. 
code-block:: python + + data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + fc = fluid.layers.fc(input=data, size=1000, act="tanh") + ``` + +## 完整示例 + +fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。 diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md new file mode 100644 index 0000000000000000000000000000000000000000..f175b219750d1c765a6a111c2ec3aa732fa46175 --- /dev/null +++ b/doc/fluid/dev/api_doc_std_en.md @@ -0,0 +1,227 @@ +# API Doc Standard + +- [API Doc Standard](#api-doc-standard) + - [API Doc Structure](#api-doc-structure) + - [Format and Examples](#format-and-examples) + - [Complete Example](#complete-example) + + +## API Doc Structure + +API Doc should contain the following parts(please write them in order): + +- Python API Definition + + The definition of API + +- Function Description + + Description of API's function. + The description includes: meaning, purpose and operation on input of API, reference and corresponding link(if any), formula(if necessary) and explanations of key variables in the formula. + +- Args Description + + Description of API parameters. + Introduce parameters one by one according to the order in API definition. + The introduction includes: data type, default value(if any), meaning, etc. + +- Returns + + Introduction of API returned value. + Introduce meaning of returned value, provide correspoding format if necessary. + If returned value is a tuple containing multiple parameters, then introduce parameters one by one in order. + +- Raises(if any) + + Abnormality, error that may occur, and possible reasons. If there are more than one possible abnormity or error, they should be listed in order. + +- Note(if any) + + Matters needing attention. If there are more than one matters, they should be listed in order. + +- Examples + + Examples of how to use API. + + +## Format and Examples + +API documentation must obey reStructuredText format, please refer to [here](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html). +Format and examples of each part of API documantation are as follows: (take fc for example) + +- Python API Definition + + - Format + + [Python API Definition] + + - Example + + ``` + fc(input, + size, + num_flatten_dims=1, + param_attr=None, + bias_attr=None, + act=None, + name=None, + main_program=None, + startup_program=None) + ``` + +- Function Description + + - Format + + This part contains (please write them in order): + + [Function Description] + + [Formula] + + [Symbols' Descriptions if necessary] + + [References if necessary] + + - Example + + [Function Description] + + ``` + **Fully Connected Layer** + + The fully connected layer can take multiple tensors as its inputs. It + creates a variable called weights for each input tensor, which represents + a fully connected weight matrix from each input unit to each output unit. + The fully connected layer multiplies each input tensor with its coresponding + weight to produce an output Tensor. If multiple input tensors are given, + the results of multiple multiplications will be sumed up. If bias_attr is + not None, a bias variable will be created and added to the output. Finally, + if activation is not None, it will be applied to the output as well. + ``` + + [Formula] + + ``` + This process can be formulated as follows: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) + ``` + + [Symbols' Descriptions if necessary] + + ``` + In the above equation: + + * :math:`N`: Number of the input. 
+ * :math:`X_i`: The input tensor. + * :math:`W`: The weights created by this layer. + * :math:`b`: The bias parameter created by this layer (if needed). + * :math:`Act`: The activation function. + * :math:`Out`: The output tensor. + ``` + + [References if necessary] + + Since there is no need for reference of fc, we omit them here. Under other circumstances, please provide explicit reference and link, take layer_norm for example: + + ``` + Refer to `Layer Normalization `_ for more details. + ``` + + +- Args Description + + - Format + + \[Arg's Name\][(Data Type, Default Value)][Description] + + - Example + + part of fc parameters are as follows: + + ``` + Args: + input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of + the input tensor(s) is at least 2. + param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable + parameters/weights of this layer. + name (str, default None): The name of this layer. + ``` + +- Returns + + - Format + + [Name][Shape] + + - Example + + ``` + Returns: + A tensor variable storing the transformation result. + ``` + + when returned value contain more than one tuple, please introduce every parameter in order, take dynamic_lstm for example: + + ``` + Returns: + A tuple containing: + The hidden state of LSTM whose shape is (T X D). + The cell state of LSTM whose shape is (T X D). + ``` + +- Raises + + - Format + + [Exception Type][Condition] + + - Example + + ``` + Raises: + ValueError: If the rank of the input is less than 2. + ``` + +- Note + + - Format + + [Note] + + - Example + + there is no Note in fc, so we omit this part. If there is any note, please write clearly. If there are more than one notes, please list them in order. Take scaled\_dot\_product\_attention for example: + + ``` + Note: + 1. When num_heads > 1, three linear projections are learned respectively + to map input queries, keys and values into queries', keys' and values'. + queries', keys' and values' have the same shapes with queries, keys + and values. + 2. When num_heads == 1, scaled_dot_product_attention has no learnable + parameters. + ``` + +- Examples + + - Format + + \[Python Code Snipper] + + - Example + + ``` + Examples: + .. 
code-block:: python + + data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + fc = fluid.layers.fc(input=data, size=1000, act="tanh") + ``` + +## Complete Example + +Complete Example of fc please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。 diff --git a/doc/fluid/dev/ci_build_whl.png b/doc/fluid/dev/ci_build_whl.png new file mode 100644 index 0000000000000000000000000000000000000000..232762b82a9ae3e979a1f38a7beb715c87438f40 Binary files /dev/null and b/doc/fluid/dev/ci_build_whl.png differ diff --git a/doc/fluid/dev/contribute_to_paddle_cn.md b/doc/fluid/dev/contribute_to_paddle_cn.md new file mode 120000 index 0000000000000000000000000000000000000000..955216ca62e71b4d3666e1662aa86c9495d2e7d6 --- /dev/null +++ b/doc/fluid/dev/contribute_to_paddle_cn.md @@ -0,0 +1 @@ +../../v2/dev/contribute_to_paddle_cn.md \ No newline at end of file diff --git a/doc/fluid/dev/contribute_to_paddle_en.md b/doc/fluid/dev/contribute_to_paddle_en.md new file mode 120000 index 0000000000000000000000000000000000000000..f9fc68c37e17a8a365b0d7fae86c16b0d094631f --- /dev/null +++ b/doc/fluid/dev/contribute_to_paddle_en.md @@ -0,0 +1 @@ +../../v2/dev/contribute_to_paddle_en.md \ No newline at end of file diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..37e608160db0ad5a92297987937bbbfa8f842ea8 --- /dev/null +++ b/doc/fluid/dev/index_cn.rst @@ -0,0 +1,16 @@ +开发标准 +------------ + +.. toctree:: + :maxdepth: 1 + + contribute_to_paddle_cn.md + write_docs_cn.md + api_doc_std_cn.md + new_op_cn.md + new_op_kernel.md + use_eigen_cn.md + name_convention.md + support_new_device.md + releasing_process_cn.md + op_markdown_format.md diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..d7f83035010f13c30514673ecbee301f194dc175 --- /dev/null +++ b/doc/fluid/dev/index_en.rst @@ -0,0 +1,16 @@ +Development +------------ + +.. toctree:: + :maxdepth: 1 + + contribute_to_paddle_en.md + write_docs_en.md + api_doc_std_en.md + new_op_en.md + new_op_kernel.md + use_eigen_en.md + name_convention.md + support_new_device.md + releasing_process_en.md + op_markdown_format.md diff --git a/doc/fluid/dev/name_convention.md b/doc/fluid/dev/name_convention.md new file mode 100644 index 0000000000000000000000000000000000000000..6b4244d0f506c8cd6c08739141eabad27c581ca7 --- /dev/null +++ b/doc/fluid/dev/name_convention.md @@ -0,0 +1,65 @@ +# Operator's Parameter Name Convention + +To make the operator document itself more clear, we recommend operator names obey the listing conventions. + +## OpProtoMaker names + +When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L61) , and will be used in client language to create operator. + +- Input/Output. + - Input/Output names follow the **CamelCase**. e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Input/Output much more like Variables, we prefer to meaningful English words. + - If an operator's Input/Output are tensors in math, not match to any meaningful words, input name should starts from `X`. e.g. `X`, `Y`, and output name should starts from `Out`. e.g. `Out`. 
This rule intends making operators which have few inputs/outputs unified. + +- Attribute. + - Attribute name follows the **snake_case**. e.g. `x`, `y`, `axis`, `rowwise_matrix`. Also, attribute name prefers to meaningful English words. + +- Comments. + - Input/Output/Attr comment follow the format of **(type,default value) usage**, corresponding to which type it can be and how it will be used in the operator. e.g. Attribute in Accumulator`"gamma" `,`(float, default 1.0) Accumulation multiplier`. + - Operator comment format of` R"DOC(your comment here)DOC"`. You should explain the input/output of the operator first. If there is math calculation in this operator, you should write the equation in the comment. e.g. `Out = X + Y`. + +- Order. + - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice. + +## Best Practice + +Here we give some examples to show how these rules will be used. + +- The operator has one input, one output. e.g.`relu`, inputs: `X`, outputs: `Out`. + +- The operator has two input, one output. e.g. `rowwise_add`, inputs : `X`, `Y`, outputs : `Out`. + +- The operator contains attribute. e.g. `cosine`, inputs : `X`, `axis`, outputs : `Out`. + + We give a full example of Accumulator Operator. + +```c++ +class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker { +public: + AccumulateOpMaker(OpProto *proto, + OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor. + If the output size is not the same as input size, + the output tensor is first reshaped and initialized to zero, and only then, accumulation is done."); + AddOutput("Out", "(Tensor) Accumulated output tensor"); + AddAttr("gamma", "(float, default 1.0) Accumulation multiplier").SetDefault(1.0f); + AddComment(R"DOC( +Accumulate Operator. + +This operator accumulates the input tensor to the output tensor. If the +output tensor already has the right size, we add to it; otherwise, we first +initialize the output tensor to all zeros, and then do accumulation. Any +further calls to the operator, given that no one else fiddles with the output +in the interim, will do simple accumulations. + +Accumulation is done as follows: + +Out = 1*X + gamma*Out + +where X is the input tensor, Out is the output tensor and gamma is the multiplier +argument. + +)DOC"); + } +}; +``` diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..eb6ca6012487f238ea5b84d8c43d277931220874 --- /dev/null +++ b/doc/fluid/dev/new_op_cn.md @@ -0,0 +1,452 @@ +# 如何写新的Operator + + - [概念简介](#概念简介) + - [实现C++类](#实现c类) + - [定义ProtoMaker类](#定义protomaker类) + - [定义Operator类](#定义operator类) + - [定义OpKernel类](#定义opkernel类) + - [注册Operator](#注册operator) + - [编译](#编译) + - [绑定Python](#绑定python) + - [实现单元测试](#实现单元测试) + - [前向Operator单测](#前向operator单测) + - [反向Operator单测](#反向operator单测) + - [编译和执行](#编译和执行) + - [注意事项](#注意事项) + + +## 概念简介 + +简单介绍需要用到基类,详细介绍请参考设计文档。 + +- `framework::OperatorBase`: Operator(简写,Op)基类。 +- `framework::OpKernel`: Op计算函数的基类,称作Kernel。 +- `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。 +- `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成 + +依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下: + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| 内容 | 定义位置 |
+| --- | --- |
+| OpProtoMake定义 | .cc 文件,Backward Op不需要定义OpProtoMake |
+| Op定义 | .cc 文件 |
+| Kernel实现 | CPU、CUDA共享Kernel实现在.h 文件中,否则,CPU 实现在.cc 文件中,CUDA 实现在.cu 文件中。 |
+| 注册Op | Op注册实现在.cc 文件;Kernel注册CPU实现在.cc 文件中,CUDA实现在.cu 文件中 |
+ + +实现新的op都添加至目录[paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。** + + +下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。 + + +## 实现C++类 + + +### 定义ProtoMaker类 + +矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。 + +首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释: + +```cpp +class MulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor), 2D tensor of size (M x K)"); + AddInput("Y", "(Tensor), 2D tensor of size (K x N)"); + AddOutput("Out", "(Tensor), 2D tensor of size (M x N)"); + AddComment(R"DOC( +Two Element Mul Operator. +The equation is: Out = X * Y +)DOC"); + } +}; +``` + +[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)继承自`framework::OpProtoAndCheckerMaker`,构造函数含有2个参数: + + - `framework::OpProto` : 前者存储Op的输入输出和参数属性,将用于Python API接口的生成。 + - `framework::OpAttrChecker` :后者用于检查参数属性的合法性。 + +构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。 + +上面的代码在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md)。 + + +再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55)为例: + +```cpp +template +class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor of scale operator."); + AddOutput("Out", "(Tensor) Output tensor of scale operator."); + AddComment(R"DOC( +Scale operator +$$Out = scale*X$$ +)DOC"); + AddAttr("scale", + "(float, default 1.0)" + "The scaling factor of the scale operator.") + .SetDefault(1.0); + } +}; +``` + +这个例子有`AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 + +### 定义GradProtoMaker类 +每个Op的必须有一个对应的GraProtoMaker,若未定制对应前向Op的GradProtoMaker,fluid提供了DefaultGradProtoMaker,默认注册会使用全部输入输出,包括Input, Output, Output@Grad等,使用不需要的变量的会造成显存浪费。 +下面示例定义了ScaleOp的GradProtoMaker。 + +```cpp +class ScaleGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("scale"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("scale", GetAttr("scale")); + return std::unique_ptr(grad_op); + } +}; +``` + +### 定义Operator类 + +下面实现了MulOp的定义: + +```cpp +class MulOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + //never use Input or Output if you want a to get a LoDTensor. 
+ auto dim0 = ctx.Input("X")->dims(); + auto dim1 = ctx.Input("Y")->dims(); + PADDLE_ENFORCE_EQ(dim0.size(), 2, + "input X(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("X")); + PADDLE_ENFORCE_EQ(dim1.size(), 2, + "input Y(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("Y")); + PADDLE_ENFORCE_EQ( + dim0[1], dim1[0], + "First matrix's width must be equal with second matrix's height."); + ctx.Output("Out")->Resize({dim0[0], dim1[1]}); + } +}; +``` + +[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员: + +```cpp +using framework::OperatorWithKernel::OperatorWithKernel; +``` + +这句表示使用基类`OperatorWithKernel`的构造函数,也可写成: + +```cpp +MulOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} +``` + +还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是: + + - 做检查, 尽早报错:检查输入数据维度、类型等是否合法。 + - 设置输出Tensor的形状。 + +通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`中 + +### 定义OpKernel类 + +`MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数: + +- `typename DeviceContext`: 表示设备类型,不同设备(CPU、CUDA)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。 + +- `typename T` : 表示数据类型,如`float`, `double`等。 + +需要为`MulKernel`类重写`Compute`接口。 + +- `Compute`接受一个输入参数:`const framework::ExecutionContext& context`。 + +- 与`InferShapeContext`相比,`ExecutionContext`增加了设备类型,同样可获取到输入输出和属性参数。 + +- `Compute`函数里实现`OpKernel`的具体计算逻辑。 + +Op的输入和输出可分别通过`ExecutionContext::Input()`和`ExecutionContext::Output()`获得。 + +**注意:** 若op的输入/输出的变量类型是`LoDTensor`(fluid默认所有的Tensor默认都是LoDTensor类型),请写成`ExecutionContext::Input()`和`ExecutionContext::Output()`,不要写`ExecutionContext::Input()`和`ExecutionContext::Output()`。因为若实际的变量类型为`SelectedRows`,`Input()`和`Output()`方法会将`SelectedRows`类型特化为`Tensor`,导致潜在的错误。 + +下面是 `MulKernel` `Compute`的实现: + + ```cpp + template + class MulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Input("Y"); + auto* Z = context.Output("Out"); + Z->mutable_data(context.GetPlace()); + auto& device_context = context.template device_context(); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + } + }; + ``` + +需要注意:**不同设备(CPU、CUDA)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** + +`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。 + +为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/dev/use_eigen_cn.md)。 + +到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。 +反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。 + +### 注册Operator + +- 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。 + + ```cpp + namespace ops = paddle::operators; + REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, + paddle::framework::DefaultGradOpDescMaker) + REGISTER_OPERATOR(mul_grad, ops::MulGradOp) + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + + 在上面的代码中: + + - 
`REGISTER_OPERATOR` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad`。 + + - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulGradKernel`类。 + + +- 在 `.cu`文件中注册CUDA Kernel。 + - 请注意,如果CUDA Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下: + + + ```cpp + // if use Eigen unsupported module before include head files + #define EIGEN_USE_GPU + + namespace ops = paddle::operators; + REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + +### 编译 + +运行下面命令可以进行编译: + +``` +make mul_op +``` + +## 绑定Python + +系统会对新增的op自动绑定Python,并链接到生成的lib库中。 + +## 实现单元测试 + +单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py)。 + +### 前向Operator单测 + +Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator,需要: + +1. 在`setUp`函数定义输入、输出,以及相关的属性参数。 +2. 生成随机的输入数据。 +3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。 +4. 反向计算已经自动集成进测试框架,直接调用相应接口即可。 + + + ```python + import unittest + import numpy as np + from op_test import OpTest + + + class TestMulOp(OpTest): + def setUp(self): + self.op_type = "mul" + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + ``` + + 上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: + + - `self.op_type = "mul" ` : 定义类型,与operator注册时注册的类型一致。 + - `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 + - `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 + +### 反向operator单测 + +而反向测试中: + +- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。 + - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。 + - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。 + - 第三个参数`max_relative_error`:指定检测梯度时能容忍的最大错误值。 + +- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。 + + +### 编译和执行 + +`python/paddle/fluid/tests/unittests/` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。 + +请注意,**不同于Op的编译测试,运行单元测试测时需要编译整个工程**,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后,执行下面的命令来运行单元测试: + +```bash +make test ARGS="-R test_mul_op -V" +``` + +或者: + +```bash +ctest -R test_mul_op +``` + +## 注意事项 + +- 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OPERATOR(B, ...)`等,这将会导致单元测试出错。 +- 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 +- 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。 + +### PADDLE_ENFORCE使用注意 + +实现Op时检查数据的合法性需要使用PADDLE_ENFORCE以及PADDLE_ENFORCE_EQ等宏定义,基本格式如下: + +``` +PADDLE_ENFORCE(表达式, 错误提示信息) +PADDLE_ENFORCE_EQ(比较对象A, 比较对象B, 错误提示信息) +``` + +如果表达式为真,或者比较对象A=B,则检查通过,否则会终止程序运行,向用户反馈相应的错误提示信息。 +为了确保提示友好易懂,开发者需要注意其使用方法。 + +#### 总体原则 + +任何使用了PADDLE_ENFORCE与PADDLE_ENFORCE_**检查的地方,必须有详略得当的备注解释!**错误提示信息**不能为空! + +#### 提示信息书写标准 + +1. [required] 哪里错了?为什么错了? + + - 例如:`ValueError: Mismatched label shape` + +2. [optional] 期望的输入是什么样的?实际的输入是怎样的? + + - 例如:`Expected labels dimension=1. Received 4.` + +3. [optional] 能否给出修改意见? 
+ + - 例如:`Suggested Fix:If your classifier expects one-hot encoding label,check your n_classes argument to the estimatorand/or the shape of your label.Otherwise, check the shape of your label.` + +如果并非必要或者简洁的描述即可表达清楚以上要点,根据情况书写亦可。 + +#### FAQ 典型问题 + +1. 无报错信息或报错信息过于简单,不能给用户提供有效的提示! + + 问题示例1 :未写提示信息 + ``` + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + ``` + 问题示例2 :提示信息过于简单 + ``` + PADDLE_ENFORCE(i != nullptr, "i must be set"); // i是什么? + ``` + +2. 在报错信息中使用开发人员定义的变量缩写,不易理解! + + 问题示例: + ``` + PADDLE_ENFORCE(forward_pd != nullptr, + "Fail to find eltwise_fwd_pd in device context"); //eltwise_fwd_pd用户可能看不懂 + ``` + +3. OP内部调用非法接口:Op内部如果出现Output = ShareDataWith(Input) + 问题示例: + ```cpp + auto *out = ctx.Output("Out"); + auto *in = ctx.Input("X"); + out->ShareDataWith(*in); + ``` + Op内部如果出现Output = ShareDataWith(Input),相当于operator图的中有一条隐藏边,连接了Input和Output,这条边无法在图分析中表达,引发基于图优化的错误。 + +4. OP实现的性能实践 + 调用了eigen的broadcast, chop等操作,性能会比手写cuda kernel差几倍以上。此时cpu的实现可以复用eigen,gpu实现可以实现cuda kernel. + + +#### OP InferShape检查提示信息特别说明 + +- 检查输入输出变量,请统一遵循以下格式 +`Input(变量名) of OP名 operator should not be null.` + + 正确示例: + ``` + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTMP operator should not be null."); + ``` + +- 反向Op的输入输出检查,要写明反向Op的名字 + + 正确示例: + ``` + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of LoDResetGrad opreator should not be null."); + ``` diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md new file mode 100644 index 0000000000000000000000000000000000000000..f8de271ed4e5e0fb4018478bffd4b525d4319738 --- /dev/null +++ b/doc/fluid/dev/new_op_en.md @@ -0,0 +1,352 @@ +# How to write a new operator + + - [Background](#background) + - [Implementing C++ Types](#implementing-c-types) + - [Defining ProtoMaker](#defining-protomaker) + - [Defining Operator](#defining-operator) + - [Defining OpKernel](#defining-opkernel) + - [Registering Operator and OpKernel](#registering-operator-and-opkernel) + - [Compilation](#compilation) + - [Python Binding](#python-binding) + - [Unit Tests](#unit-tests) + - [Testing Forward Operators](#testing-forward-operators) + - [Testing Backward Operators](#testing-backward-operators) + - [Compiling and Running](#compiling-and-running) + - [Remarks](#remarks) +## Background + +Here are the base types needed. For details, please refer to the design docs. + +- `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API. +- `framework::OperatorBase`: Operator (Op)base class. +- `framework::OpKernel`: Base class for Op computation kernel. +- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation kernels. + + +Operators can be categorized into two groups: operator with kernel(s) and operator without kernel(s). An operator with kernel(s) inherits from `OperatorWithKernel` while the one without kernel(s) inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information: + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Information | Where is it defined |
| --- | --- |
| OpProtoMake definition | `.cc` files, Backward Op does not need an OpProtoMake interface. |
| Op definition | `.cc` files |
| Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu` files. |
| Registering the Op | Ops are registered in `.cc` files; for Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation. |
+ + +New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.** + + +Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel. + + +## Implementing C++ Types + + +### Defining ProtoMaker + +Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and pne output. + +First, define `ProtoMaker` to describe the Operator's input, output, and additional comments: + +```cpp +class MulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor), 2D tensor of size (M x K)"); + AddInput("Y", "(Tensor), 2D tensor of size (K x N)"); + AddOutput("Out", "(Tensor), 2D tensor of size (M x N)"); + AddComment(R"DOC( +Two Element Mul Operator. +The equation is: Out = X * Y +)DOC"); + } +}; +``` + +[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor: + + - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces. + - `framework::OpAttrChecker` is used to validate variable attributes. + +The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`. + +The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md). + + +An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55) is implemented as follows: + +```cpp +template +class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of scale operator.").NotInGradient(); + AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); + AddComment(R"DOC(Scale operator +The equation is: Out = scale*X +)DOC"); + AddAttr("scale", "scale of scale operator.").SetDefault(1.0); + } +}; +``` + +Note `AddAttr("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0. 
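As a small, hedged sketch (this is illustrative code, not an excerpt from the Paddle repository; the kernel class name is made up), an attribute declared with `AddAttr(...).SetDefault(...)` can later be read inside a kernel's `Compute` method (kernels are introduced below) through the `ExecutionContext`, and it falls back to the default value when the user does not set it:

```cpp
// Illustrative only: a scale-like kernel reading the "scale" attribute.
// The attribute falls back to the default (1.0) declared via SetDefault.
template <typename DeviceContext, typename T>
class ScaleLikeKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<framework::Tensor>("X");
    auto* out = context.Output<framework::Tensor>("Out");
    out->mutable_data<T>(context.GetPlace());
    T scale = static_cast<T>(context.Attr<float>("scale"));
    // ... multiply every element of *x by `scale` and write the result into *out ...
  }
};
```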
+ + +### Defining Operator + +The following code defines the interface for MulOp: + +```cpp +class MulOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto dim0 = ctx.Input("X")->dims(); + auto dim1 = ctx.Input("Y")->dims(); + PADDLE_ENFORCE_EQ(dim0.size(), 2, + "input X(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("X")); + PADDLE_ENFORCE_EQ(dim1.size(), 2, + "input Y(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("Y")); + PADDLE_ENFORCE_EQ( + dim0[1], dim1[0], + "First matrix's width must be equal with second matrix's height."); + ctx.Output("Out")->Resize({dim0[0], dim1[1]}); + } +}; +``` + +[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L24) is inherited from `OperatorWithKernel`. Its `public` member + +```cpp +using framework::OperatorWithKernel::OperatorWithKernel; +``` + +expresses an operator constructor using base class `OperatorWithKernel`, alternatively written as + +```cpp +MulOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} +``` + +`InferShape` interface needs to be re-written.`InferShape` is a constant method and cannot modify Op's member variables, its constant member `const framework::InferShapeContext &ctx` can be used to extract input, output, and attributes. It functions to + + - 1). validate and error out early: it checks input data dimensions and types. + - 2). configures the tensor shape in the output. + +Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later. + +### Defining OpKernel + +`MulKernel` inherits `framework::OpKernel`, which includes the following templates: + +- `typename DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43). + +- `typename T` denotes data type, such as `float` or `double`. + +`MulKernel` types need to rewrite the interface for `Compute`. + +- `Compute` takes one input parameter: `const framework::ExecutionContext& context`. +- Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables. +- `Compute` implements the computation logics of an `OpKernel`. 
+ +`MulKernel`'s implementation of `Compute` is as follows: + + ```cpp + template + class MulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Input("Y"); + auto* Z = context.Output("Out"); + Z->mutable_data(context.GetPlace()); + auto& device_context = context.template device_context(); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + } + }; + ``` + +Note that **different devices (CPU, CUDA)share one Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions can support both devices.** + +`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.cc). + +To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_en.md). + + +This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file. + +The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**. + +### Registering Operator and OpKernel + +- In `.cc` files, register forward and backward operator classes and the CPU kernel. + + ```cpp + namespace ops = paddle::operators; + REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, + paddle::framework::DefaultGradOpDescMaker) + REGISTER_OPERATOR(mul_grad, ops::MulGradOp) + + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + + In that code block, + + - `REGISTER_OPERATOR` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`. + - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient. + - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`. + + +- Registering CUDA Kernel in `.cu` files + - Note that if CUDA Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as + + ```cpp + // if use Eigen unsupported module before include head files + #define EIGEN_USE_GPU + + namespace ops = paddle::operators; + REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + +### Compilation + +Run the following commands to compile. + +``` +# maybe you need to rerun cmake +make mul_op +``` + +## Python Binding + +The system will automatically bind to Python and link it to a generated library. + +## Unit Tests + +Unit tests for an operator include + +1. comparing a forward operator's implementations on different devices, + +2. comparing a backward operator's implementation on different devices, and + +3. a scaling test for the backward operator. 
+ +Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py). + +### Testing Forward Operators + +A forward operator unit test inherits `unittest.TestCase` and defines metaclass `__metaclass__ = OpTestMeta`. More concrete tests are performed in `OpTestMeta`. Testing a forward operator requires the following: + +1. Defining input, output and relevant attributes in `setUp` method. + +2. Generating random input data. + +3. Implementing the same computation logic in a Python script. + +4. Call check gradient function to check the backward operator. + + ```python + import unittest + import numpy as np + from op_test import OpTest + + + class TestMulOp(OpTest): + def setUp(self): + self.op_type = "mul" + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + ``` +Get its output, and compare it with the forward operator's own output. + +The code above first loads required packages. In addition, we have + +- `self.op_type = "mul" ` defines the type that is identical to what the operator's registered type. +- `self.inputs` defines input, with type `numpy.array` and initializes it. +- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script. + +### Testing Backward Operators + +Some key points in checking gradient above include: + +- `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods. + - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested. + - The second variable `"Out"` points to the network's final output target `Out`. + - The third variable `max_relative_error` points to the maximum relative tolerance error during scaling tests. +- `test_check_grad_ingore_x` and `test_check_grad_ingore_y`branches test the cases where there is only one scaling input. + +### Compiling and Running + + +Any new unit testing file of the format `test_*.py` added to the director `python/paddle/fluid/tests/unittests/` is automatically added to the project to compile. + +Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with flag `WITH_TESTING` on i.e. `cmake paddle_dir -DWITH_TESTING=ON`. + +After successfully compiling the project, run the following command to run unit tests: + +```bash +make test ARGS="-R test_mul_op -V" +``` + +Or, + +```bash +ctest -R test_mul_op +``` + +## Remarks + +- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OPERATOR(B, ...)` in `A_op.cc` will cause unit testing failures. +- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail. +- If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`. 
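As a hedged sketch of the last remark (the file name, namespace, and helper below are hypothetical, merely following the `gather.h` pattern), a method shared by several operators could be placed in a header that is not named `*_op.*` and included from each kernel that needs it:

```cpp
// paddle/fluid/operators/math/clip_helper.h  (hypothetical shared helper, not an *_op.* file)
#pragma once

namespace paddle {
namespace operators {
namespace math {

// Clamp `x` into the closed interval [low, high]; reusable from any operator kernel.
template <typename T>
inline T ClipValue(T x, T low, T high) {
  return x < low ? low : (x > high ? high : x);
}

}  // namespace math
}  // namespace operators
}  // namespace paddle
```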
diff --git a/doc/fluid/dev/new_op_kernel.md b/doc/fluid/dev/new_op_kernel.md new file mode 100644 index 0000000000000000000000000000000000000000..87e617d44041bde9c9051151878ffb4304689b3c --- /dev/null +++ b/doc/fluid/dev/new_op_kernel.md @@ -0,0 +1,121 @@ +# Add Kernels for a New Device + +## Background + +PaddlePaddle Fluid have hundreds of operators. Each operator could have one or more kernels. A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU. + +[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) explains how to add an operator, and its kernels. The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md). An operator chooses the right kernel at runtime. This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md). + +## Write Kernels for A New Device + +### Add A New Device + + For some historical reaons, we misuse the word *library* for *device*. For example, we call the deivce type by *library type*. An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/library_type.h#L24). We will correct this ASAP. + +To register a new device, we need to add an enum value to `LibraryType`: + +``` +enum class LibraryType { + kPlain = 0, + kMKLDNN = 1, + kCUDNN = 2, +}; +``` + + +### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53) + +If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53). For example `CUDAPlace`: + +```cpp +struct CUDAPlace { + CUDAPlace() : CUDAPlace(0) {} + explicit CUDAPlace(int d) : device(d) {} + + inline int GetDeviceId() const { return device; } + // needed for variant equality comparison + inline bool operator==(const CUDAPlace &o) const { + return device == o.device; + } + inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); } + + int device; +}; + +typedef boost::variant Place; +``` + +### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37)) +After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37) for it. + +```cpp +class DeviceContext { + public: + virtual ~DeviceContext() {} + virtual Place GetPlace() const = 0; + + virtual void Wait() const {} +}; +``` + +### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L351) for your Device. + +A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) + +```cpp +class OpKernelBase { + public: + /** + * ExecutionContext is the only parameter of Kernel Run function. + * Run will get input/output variables, state such as momentum and + * device resource such as CUDA stream, cublas handle, etc. from + * ExecutionContext. User should construct it before run the Operator. 
+ */ + + virtual void Compute(const ExecutionContext& context) const = 0; + + virtual ~OpKernelBase() = default; +}; + +template +class OpKernel : public OpKernelBase { + public: + using ELEMENT_TYPE = T; +}; +``` + + +### Register the OpKernel to framework + +After writing the components described above, we should register the kernel to the framework. + +We use `REGISTER_OP_KERNEL` to do the registration. + +```cpp +REGISTER_OP_KERNEL( + op_type, + library_type, + place_type, + kernel0, kernel1, ...) +``` + +kernel0, kernel1 are kernels that have the same `op_type`, `library_type`, `place_type` but different `data_types`. + +take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/conv_cudnn_op.cu.cc#L318)) as an example: + + ```cpp + REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace, + paddle::operators::GemmConvKernel, + paddle::operators::GemmConvKernel); + + REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); + ``` + +In the code above: + + - `conv2d` is the type/name of the operator + - `CUDNN/CPU` is `library` + - `paddle::platform::CUDAPlace/CPUPlace` is `place` + - template parameter `float/double` on `CUDNNConvOpKernel` is `data_type`. diff --git a/doc/fluid/dev/op_markdown_format.md b/doc/fluid/dev/op_markdown_format.md new file mode 100644 index 0000000000000000000000000000000000000000..4e539d7992e5f67ee7b07193b59b6b425b73c9e5 --- /dev/null +++ b/doc/fluid/dev/op_markdown_format.md @@ -0,0 +1,64 @@ +# Standard Markdown Format for Operators +The following should be the standard format for documentation for all the operators that will get rendered in the `html`: + +``` +Operator Name (In PaddlePaddle) + +Operator Name (Standard) + +Operator description. + +LaTeX equation of how the operator performs an update. + +The signature of the operator. +``` + +Each section mentioned above has been covered in further detail in the rest of the document. + +## PaddlePaddle Operator Name +This should be in all small letters, in case of multiple words, we separate them with an underscore. For example: +`array to lod tensor` should be written as `array_to_lod_tensor`. + +This naming convention should be standard across all PaddlePaddle operators. + +## Standard Operator Name +This is the standard name of the operator as used in the community. The general standard is usually: +- Standard abbreviations like `SGD` are written in all capital letters. +- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word). +- Keep numbers inside a word as is, with no boundary delimiters. +- Follow the name of the operator with the keyword: `Activation Operator.` + +## Operator description +This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section. + +## LaTeX equation +This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same word should be separated by an underscore (`_`). 
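For illustration only (this equation is a generic sketch and does not come from an existing operator document), an update rule written under this convention might look like:

$$mean\_out = momentum \times mean + (1 - momentum) \times batch\_mean$$

Here `mean_out`, `mean`, and `batch_mean` are multi-word variable names separated by underscores, as required above.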
+ +## The signature +This section describes the signature of the operator. A list of Inputs and Outputs, each of which have a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is: +`Section : +VariableName : (VariableType) VariableDescription +... +... +` + + +The following example for an `sgd` operator covers the above mentioned sections as they would ideally look like in the `html`: + +``` +sgd + +SGD operator + +This operator implements one step of the stochastic gradient descent algorithm. + +param_out = param_learning_rate * grad + +Inputs: +Param : (Tensor) Input parameter +LearningRate : (Tensor) Learning rate of SGD +Grad : (Tensor) Input gradient + +Outputs: +ParamOut : (Tensor) Output parameter +``` diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..acea9a2b5df903a958edf3683900e165670e196f --- /dev/null +++ b/doc/fluid/dev/releasing_process_cn.md @@ -0,0 +1,195 @@ +# PaddlePaddle发行规范 + +PaddlePaddle使用Trunk Based Development,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。 + +PaddlePaddle每次发新的版本,遵循以下流程: + +1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` +2. 将新分支的版本打上tag,tag为`版本号rc-Patch号`。例如,第一个tag为`0.10.0-rc0`。 +3. 新分支一般不接受新的feature和优化。QA在release分支上进行测试。研发基于最新的develop开发。 +4. QA和研发发现的bug,在develop上修复验证后,cherry-pick修复到release分支。直到release分支相对稳定。 +5. 如果有需要,在release分支最新代码上打上新的tag,比如`0.10.0-rc1`,让更多的用户加入测试。重复3-4步。 +6. release分支稳定后,打上正式的release tag,比如`0.10.0`。 +7. 将这个版本的python wheel包发布到pypi。 +8. 更新Docker镜像(参考后面的操作细节)。 + +需要注意的是: + +* bug修复需要先在develop上进行,然后进入release分支。而不是直接在release分支上开发。 + +* release分支原则上只接受修复类的修改,不接受新feature。 + +## 发布wheel包到pypi + +1. 使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) +完成自动化二进制编译,参考下图,选择需要发布的版本(通常包含一个CPU版本和一个GPU版本),点击"run"右侧的"..."按钮,可以 +弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。 + +1. 等待编译完成后可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。 +1. 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。 +1. 
上传: +``` +cd build/python +pip install twine +twine upload dist/[package to upload] +``` + +* 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux + 发型版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。 +* pypi不支持覆盖上传,所以一个版本号的wheel包发布之后,不可以更改。下一个wheel包需要更新版本号才可以上传。 + +## 发布Docker镜像 + +上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上 +版本号对应的tag即可: + +``` +docker pull [镜像]:latest +docker tag [镜像]:latest [镜像]:[version] +docker push [镜像]:[version] +``` + +需要更新的镜像tag包括: + +* `[version]`: CPU版本 +* `[version]-openblas`: openblas版本 +* `[version]-gpu`: GPU版本(CUDA 8.0 cudnn 5) +* `[version]-gpu-[cudaver]-[cudnnver]`: 不同cuda, cudnn版本的镜像 + +之后可进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看是否发布成功。 + +## PaddlePaddle 分支规范 + +PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelopment.com/) 开发规范。 + +* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试。并且会经过模型回归测试。 +* `release/版本号`分支为每一次Release时建立的临时分支。release分支主要用于测试,bug修复和最终发版。 +* `master`分支因为历史原因,已经废弃。 + +* 其他开发者fork的feature branch。 + * 建议,开发者的feature branch需要同步主版本库的`develop`分支。 + * 建议,开发者的feature branch需要基于主版本库中的`develop`分支。 + * 当feature branch开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 + * 在评审过程中,开发者修改自己的代码,可以继续在自己的feature branch提交代码。 + +## PaddlePaddle回归测试列表 + +TODO + +### PaddlePaddle Book中所有章节 + +PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练(V2和Fluid)模型正确性。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| API.V2 + Docker + GPU | | | | | | | | |
| API.V2 + Docker + CPU | | | | | | | | |
| `paddle_trainer` + Docker + GPU | | | | | | | | |
| `paddle_trainer` + Docker + CPU | | | | | | | | |
| API.V2 + Ubuntu + GPU | | | | | | | | |
| API.V2 + Ubuntu + CPU | | | | | | | | |
| `paddle_trainer` + Ubuntu + GPU | | | | | | | | |
| `paddle_trainer` + Ubuntu + CPU | | | | | | | | |
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md new file mode 100644 index 0000000000000000000000000000000000000000..00650946ff2e658cfad0e63a8f1e008902a2d36e --- /dev/null +++ b/doc/fluid/dev/releasing_process_en.md @@ -0,0 +1,228 @@ +# PaddlePaddle Releasing Process + +PaddlePaddle manages its branches using Trunk Based Development, and [Semantic Versioning](http://semver.org/) as it's version number semantics. + +Each time we release a new PaddlePaddle version, we should follow the below steps: + +1. Create a new release branch from `develop`,named `release/[version]`. E.g.,`release/0.10.0` +2. Create a new tag for the release branch, tag format: `version-rc.Patch`. E.g. the first tag is `0.10.0-rc0`。 +3. New release branch normally doesn't accept new features or optimizations. QA will test on the release branch. Developer should develop based on `develop` branch. +4. If QA or Developer find bugs. They should first fix and verify on `develop` branch. Then cherry-pick the fix to the release branch. Wait until the release branch is stable. +5. If necessary, create a new tag on the relese branch, e.g. `0.10.0-rc1`. Involve more users to try it and repeat step 3-4. +6. After release branch is stable,Create the official release tag,such as `0.10.0`. +7. Release the python wheel package to pypi. +8. Update the docker image (More details below). + +NOTE: + +* bug fix should happen on `develop` branch, then cherry-pick to relese branch. Avoid developing directly on release branch. + +* release normally only accept bug fixes. Don't add new features. + + +## Publish Wheel Packages to pypi + +1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) + to build all wheel packages needed to publish. As shown in the following picture, choose a build + version, click "..." button on the right side of "Run" button, and switch to the second tab in the +pop-up box, choose the current release branch and click "Run Build" button. You may repeat this + step to start different versions of builds. + +1. After the build succeeds, download the outputs under "Artifacts" including capi, `cp27m` and `cp27mu`. +1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we + upload the package using `twine`, we need to rename the package from `linux_x86_64` to + `manylinux1_x86_64`. +1. Start the upload: + ``` + cd build/python + pip install twine + twine upload dist/[package to upload] + ``` + +* NOTE: We use a special Docker image to build our releases to support more Linux distributions, you can + download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/, or build it using + scripts under `tools/manylinux1`. +* pypi does not allow overwrite the already uploaded version of wheel package, even if you delete the + old version. you must change the version number before upload a new one. + +### Publish wheel Packages for MacOS + +You need to build the binary wheel package for MacOS before publishing, to +make sure that the package can be used by many versions of MacOS +(10.11, 10.12, 10.13) and different python installs (python.org, homebrew, etc.), +you must build the package ***exactly*** following below steps: + +Build steps: + +1. install python from python.org downloads, and make sure it's currently in use + in your system. +1. `export MACOSX_DEPLOYMENT_TARGET=10.11`, use `10.11` is enough for recent versions. +1. 
`git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle && mkdir build && cd build` +1. `cmake -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_SYSTEM_BLAS=OFF ..`, make sure the output of `cmake` command is using the correct python interpreter installed from python.org +1. `make -j` +1. `pip install delocate` +1. `mkdir fixed_wheel && delocate-wheel -w fixed_wheel python/dist/*.whl` + +Then the whl under `fixed_wheel` is ready to upload. + +Install steps: + +1. run `pip install paddlepaddle...whl` +1. find the `libpython.dylib` that are currently in use: + - for python.org package installs, do nothing. + - for other python installs, find the path of `libpython*.dylib` and `export LD_LIBRARY_PATH=you path && DYLD_LIBRARY_PATH=your path` + +## Publish Docker Images + +Our CI tool will push latest images to DockerHub, so we only need to push a version tag like: + +``` +docker pull [image]:latest +docker tag [image]:latest [image]:[version] +docker push [image]:[version] +``` + +Tags that need to be updated are: +* `[version]`: CPU only version image +* `[version]-openblas`: openblas version image +* `[version]-gpu`: GPU version(using CUDA 8.0 cudnn 5) +* `[version]-gpu-[cudaver]-[cudnnver]`: tag for different cuda, cudnn versions + +You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/. + +## Branching Model + +PaddlePaddle uses [Trunk Based Development](https://trunkbaseddevelopment.com/) as our branching model. + +* `develop` branch is used for development. Each comment to `develop` branc goes through unit tests and model regression tests. +* `release/[version]` branch is used for each release. Release branch is used for tests, bug fix and evetual release. +* `master` branch as been deprecated for historical reasons + +* Developer's feature branch。 + * Developer's feature branch should sync with upstream `develop` branch. + * Developer's feature branch should be forked from upstream `develop` branch. + * After feature branch is ready, create a `Pull Request` against the Paddle repo and go through code review. + * In the review process, develop modify codes and push to their own feature branch. + +## PaddlePaddle Regression Test List + +TODO + +### All Chapters of PaddlePaddle Book + +We need to guarantee that all the chapters of PaddlePaddle Book can run correctly. Including +V1 (`paddle_trainer` training) and V2 training and Fluid training. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| | Linear Regression | Recognize Digits | Image Classification | Word2Vec | Personalized Recommendation | Sentiment Analysis | Semantic Role Labeling | Machine Translation |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| API.V2 + Docker + GPU | | | | | | | | |
| API.V2 + Docker + CPU | | | | | | | | |
| `paddle_trainer` + Docker + GPU | | | | | | | | |
| `paddle_trainer` + Docker + CPU | | | | | | | | |
| API.V2 + Ubuntu + GPU | | | | | | | | |
| API.V2 + Ubuntu + CPU | | | | | | | | |
| `paddle_trainer` + Ubuntu + GPU | | | | | | | | |
| `paddle_trainer` + Ubuntu + CPU | | | | | | | | |
diff --git a/doc/fluid/dev/src/fc.py b/doc/fluid/dev/src/fc.py new file mode 100644 index 0000000000000000000000000000000000000000..3b074821cc2276a29b2a8639e82199fcf4d72020 --- /dev/null +++ b/doc/fluid/dev/src/fc.py @@ -0,0 +1,81 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def fc(input, + size, + num_flatten_dims=1, + param_attr=None, + bias_attr=None, + act=None, + name=None): + """ + **Fully Connected Layer** + + The fully connected layer can take multiple tensors as its inputs. It + creates a variable called weights for each input tensor, which represents + a fully connected weight matrix from each input unit to each output unit. + The fully connected layer multiplies each input tensor with its coresponding + weight to produce an output Tensor. If multiple input tensors are given, + the results of multiple multiplications will be sumed up. If bias_attr is + not None, a bias variable will be created and added to the output. Finally, + if activation is not None, it will be applied to the output as well. + + This process can be formulated as follows: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) + + In the above equation: + + * :math:`N`: Number of the input. + * :math:`X_i`: The input tensor. + * :math:`W`: The weights created by this layer. + * :math:`b`: The bias parameter created by this layer (if needed). + * :math:`Act`: The activation function. + * :math:`Out`: The output tensor. + + Args: + input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of + the input tensor(s) is at least 2. + size(int): The number of output units in this layer. + num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than + two dimensions. If this happens, the multidimensional tensor will first be flattened + into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input + tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1) + dimensions will be flatten to form the first dimension of the final matrix (height of + the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to + form the second dimension of the final matrix (width of the matrix). For example, suppose + `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. + Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. + param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable + parameters/weights of this layer. + bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias + of this layer. If it is set to None, no bias will be added to the output units. + act (str, default None): Activation to be applied to the output of this layer. + name (str, default None): The name of this layer. + + Returns: + A tensor variable storing the transformation result. 
+ + Raises: + ValueError: If rank of the input tensor is less than 2. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + fc = fluid.layers.fc(input=data, size=1000, act="tanh") + """ diff --git a/doc/fluid/dev/support_new_device.md b/doc/fluid/dev/support_new_device.md new file mode 100644 index 0000000000000000000000000000000000000000..051a463cfcf97df2e2d5b6a880923ca70fefbd6e --- /dev/null +++ b/doc/fluid/dev/support_new_device.md @@ -0,0 +1,240 @@ +# Design Doc: Supporting new Device/Library + +## Background + +Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner. + +On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library. + +On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent. + +So, how to support a new Device/Library in Fluid becomes a challenge. + + +## Basic: Integrate A New Device/Library + +For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/read_source.md). + +There are mainly three parts that we have to consider while integrating a new device/library: + +- Place and DeviceContext: indicate the device id and manage hardware resources + +- Memory and Tensor: malloc/free data on certain device + +- Math Functor and OpKernel: implement computing unit on certain devices/libraries + +### Place and DeviceContext + +Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. + +#### Place +Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`. + +``` + | CPUPlace +Place --| CUDAPlace + | FPGAPlace +``` + +And `Place` is defined as follows: + +``` +typedef boost::variant Place; +``` + +#### DeviceContext + +Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/fluid/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`. 
+ + +``` + /-> CPUDeviceContext +DeviceContext ----> CUDADeviceContext + \-> FPGADeviceContext +``` + +An example of Nvidia GPU is as follows: + +- DeviceContext + + +``` +class DeviceContext { + virtual Place GetPlace() const = 0; +}; +``` + + +- CUDADeviceContext + + +``` +class CUDADeviceContext : public DeviceContext { + Place GetPlace() const override { return place_; } +private: + CUDAPlace place_; + cudaStream_t stream_; + cublasHandle_t cublas_handle_; + std::unique_ptr eigen_device_; // binds with stream_ +}; +``` + +### Memory and Tensor + + +#### memory module + +Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/memory/memory.h#L36): + +``` +template +void* Alloc(Place place, size_t size); + +template +void Free(Place place, void* ptr); + +template +size_t Used(Place place); +``` + +To implement these interfaces, we have to implement MemoryAllocator for different Devices. + + +#### Tensor + +[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h#L36) holds data with some shape in a specific Place. + +```cpp +class Tensor { + public: + /*! Return a pointer to mutable memory block. */ + template + inline T* data(); + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(DDim dims, platform::Place place); + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); + + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + private: + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /*! points to dimensions of memory block. */ + DDim dim_; +}; +``` + +`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configurate its shape, and then call `mutuable_data` to allocate the actual memory. + +```cpp +paddle::framework::Tensor t; +paddle::platform::CPUPlace place; +// set size first +t.Resize({2, 3}); +// allocate memory on CPU later +t.mutable_data(place); +``` + + + +### Math Functor and OpKernel + +Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors. + +Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/math/maxouting.h#L27) as an example: + +The interface is defined in the header file. + +``` +template +class MaxOutFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, int groups); +}; +``` + +CPU implementation is in .cc file + +``` +template +class MaxOutFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, + int groups) { + ... + } +}; +``` + +CUDA implementation is in .cu file + +``` +template +class MaxOutFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, framework::Tensor* output, + int groups) { + ... 
+ } +}; +``` + + +We first obtain the computing handle from a concrete DeviceContext and then compute on tensors. + +The implementation of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map. + +Fluid provides different register interfaces in op_registry.h + + +Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/crop_op.cc#L134) operator as an example: + +In .cc file: + +``` +REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CPU_KERNEL( + crop_grad, ops::CropGradKernel); +``` + +In .cu file: + +``` +REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop_grad, ops::CropGradKernel); +``` + + +## Advanced topics: How to switch between different Device/Library + +Generally, we will implement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not suitable on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library. + + +For more details, please refer to following docs: + +- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md) +- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md) diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..56203d6fad444f61ef1be187ad0d149b2aa99ba4 --- /dev/null +++ b/doc/fluid/dev/use_eigen_cn.md @@ -0,0 +1,146 @@ +# 在Paddle中如何使用Eigen + +神经网络本质上是一个计算图,计算需要的数据存放在`Tensor`中,而计算过程是由`Operartor`来描述的。在执行时,`Operator`调用对应`OpKernel`中的`Compute`接口,实现对`Tensor`的操作。 + + +## Eigen Tensor模块 + +Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。 + +关于Eigen Tensor模块的详细介绍请参考[Eigen文档](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md) + + +## paddle::framework::Tensor + +Paddle Tensor定义在framework目录下,其主要接口如下: + +```cpp +class Tensor { + public: + /*! Return a pointer to mutable memory block. */ + template + inline T* data(); + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(DDim dims, platform::Place place); + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); + + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + private: + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /*! points to dimensions of memory block. 
*/ + DDim dim_; +}; +``` + +`Placeholder`的作用是延迟分配内存,即我们可以先定义一个Tensor,然后使用Resize接口设置Tensor的大小,最后再调用mutable_data接口分配实际的内存。 + +```cpp +paddle::framework::Tensor t; +paddle::platform::CPUPlace place; +// set size first +t.Resize({2, 3}); +// allocate memory on CPU later +t.mutable_data(place); +``` + +### paddle::framework::Tensor使用样例 +下面以AddOp为例说明Tensor的使用过程: + +- InferShape + +在运行神经网络计算图时,我们先调用每个`Operator`的`InferShape`接口,根据输入Tensor的大小来设置输出Tensor的大小,`Resize`接口会被调用。 + +```cpp +void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), + ctx.Input("Y")->dims(), + "Two input of Add Op's dimension must be same."); + ctx.Output("Out")->Resize(ctx.Input("X")->dims()); +} +``` + + +- Run + +`Operator`的`Run`接口最终会调用对应`OpKernel`的`Compute`接口,在这时真正的分配内存,`mutable_data`接口会被调用。 + +```cpp +void Compute(const framework::ExecutionContext& context) const override { + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Y"); + auto* output = context.Output("Out"); + + output->mutable_data(context.GetPlace()); + + auto x = EigenVector::Flatten(*input0); + auto y = EigenVector::Flatten(*input1); + auto z = EigenVector::Flatten(*output); + + auto place = context.GetEigenDevice(); + + z.device(place) = x + y; +} +``` + + +### paddle::framework::Tensor到EigenTensor的转换 + +如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。 + +以EigenTensor为例,做一个介绍 + +```cpp +Tensor t; +float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); +for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); +} + +EigenTensor::Type et = EigenTensor::From(t); +``` + +From是EigenTensor模板提供的一个接口,可以实现从paddle::framework::Tensor到对EigenTensor的转换。由于Tensor的rank是模板参数,因此在转换时需要显示的指定。 + +在Eigen中,不同rank的Tensor是不同类型,Vector是rank为1的Tensor。需要额外注意的是,EigenVector::From方法是把paddle中的一维Tensor转为Eigen的一维Tensor,在这里用EigenVector来表示;而EigenVector::Flatten方法是把paddle中的一个Tensor进行reshape操作,压扁成为Eigen的一维Tensor,类型仍然为EigenVector。 + +更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc)。 + + + +## 实现计算 + +当需要完成计算时,我们需要等式左边的EigenTensor调用device接口。在这里需要注意的是,这里的EigenTensor之间的运算只是改变了原有Tensor中的数据,而不会改变原有Tensor的shape信息。 + +```cpp +auto x = EigenVector::Flatten(*input0); +auto y = EigenVector::Flatten(*input1); +auto z = EigenVector::Flatten(*output); +auto place = context.GetEigenDevice(); +z.device(place) = x + y; +``` + +在这段代码中,input0/input1/output可以是任意维度的Tensor。我们调用了EigenVector的Flatten接口,把任意维度的Tensor转为了一维的EigenVector。而在计算结束之后,input0/input1/output的原有shape信息不变。如果想改变原有Tensor的shape信息,可以调用Resize接口进行改变。 + +由于Eigen Tensor模块的文档较少,我们可以参考TensorFlow的[kernels](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels)模块下的相关`OpKernel`的计算代码。 diff --git a/doc/fluid/dev/use_eigen_en.md b/doc/fluid/dev/use_eigen_en.md new file mode 100644 index 0000000000000000000000000000000000000000..3313d097cb21e40c23aa13187b6a50562f12403a --- /dev/null +++ b/doc/fluid/dev/use_eigen_en.md @@ -0,0 +1,146 @@ +# How to use Eigen in Paddle + +Essentially, a neural network is a compute graph. T data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`. 
+
+
+## Eigen Tensor Module
+
+The Eigen Tensor module provides powerful support for element-wise computation. In addition, a piece of code written with it can run on both the CPU and the GPU.
+
+Note that Eigen Tensor is still under active development, so its test coverage is not complete and its documentation may be sparse.
+
+For details on the Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
+
+
+## paddle::framework::Tensor
+
+The Paddle Tensor is defined in the `framework` directory with the following main interface:
+
+```cpp
+class Tensor {
+ public:
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+ private:
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /*! points to dimensions of memory block. */
+  DDim dim_;
+};
+```
+
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, use `Resize` to set its shape, and then call `mutable_data` to allocate the actual memory.
+
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data<float>(place);
+```
+
+### paddle::framework::Tensor Usage
+The following `AddOp` example demonstrates how Tensor is used.
+
+- InferShape
+
+When the compute graph of a neural network is run, the `InferShape` method of every `Operator` is called first to set the size of the output tensors from the sizes of the input tensors; this is where `Resize` is invoked.
+
+```cpp
+void InferShape(const framework::InferShapeContext &ctx) const override {
+  PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
+                    ctx.Input<Tensor>("Y")->dims(),
+                    "Two input of Add Op's dimension must be same.");
+  ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("X")->dims());
+}
+```
+
+
+- Run
+
+The `Run` method of an `Operator` eventually calls the `Compute` method of the corresponding `OpKernel`; this is where memory is actually allocated, through `mutable_data`.
+
+```cpp
+void Compute(const framework::ExecutionContext& context) const override {
+  auto* input0 = context.Input<Tensor>("X");
+  auto* input1 = context.Input<Tensor>("Y");
+  auto* output = context.Output<Tensor>("Out");
+
+  output->mutable_data<T>(context.GetPlace());
+
+  auto x = EigenVector<T>::Flatten(*input0);
+  auto y = EigenVector<T>::Flatten(*input1);
+  auto z = EigenVector<T>::Flatten(*output);
+
+  auto place = context.GetEigenDevice<Place>();
+
+  z.device(place) = x + y;
+}
+```
+
+
+## Transforming paddle::framework::Tensor to EigenTensor
+
+As shown above, in the actual computation we first need to transform the input and output `Tensor`s into formats that Eigen supports. [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) provides a set of global functions that implement the transformation from `paddle::framework::Tensor` to `EigenTensor`/`EigenMatrix`/`EigenVector`/`EigenScalar`.
+
+Using EigenTensor as an example:
+
+```cpp
+Tensor t;
+float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
+for (int i = 0; i < 1 * 2 * 3; i++) {
+  p[i] = static_cast<float>(i);
+}
+
+EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
+```
+
+`From` is an interface provided by the EigenTensor template; it implements the transformation from a `paddle::framework::Tensor` object to an EigenTensor. Since the rank is a template parameter, it has to be specified explicitly at the time of the transformation.
+
+In Eigen, tensors of different ranks are different types, with `Vector` being a rank-1 tensor. Note that `EigenVector::From` converts a 1-dimensional Paddle tensor into a 1-dimensional Eigen tensor, while `EigenVector::Flatten` reshapes a Paddle tensor of any rank and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still of type EigenVector.
+
+For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc) in the `eigen_test.cc` file.
+
+
+
+## Implementing Computation
+
+To perform a computation, the EigenTensor on the left-hand side of the assignment has to call the `device` interface. Note that computation between EigenTensors only changes the data in the underlying Tensors; it does not change their shape information.
+
+```cpp
+auto x = EigenVector<T>::Flatten(*input0);
+auto y = EigenVector<T>::Flatten(*input1);
+auto z = EigenVector<T>::Flatten(*output);
+auto place = context.GetEigenDevice<Place>();
+z.device(place) = x + y;
+```
+
+In this code segment, input0/input1/output can be Tensors of arbitrary rank. `EigenVector::Flatten` turns a tensor of any rank into a 1-dimensional EigenVector. After the computation completes, input0/input1/output keep their original shape information; if you want to change a Tensor's shape, call its `Resize` interface.
+
+Because the Eigen Tensor module is under-documented, the computation code of the related `OpKernel`s in TensorFlow's [kernels](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels) module can serve as a useful reference.
diff --git a/doc/fluid/dev/versioning_en.md b/doc/fluid/dev/versioning_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..f15fd029dc92e5582e0219cd81c7fce90a9a3d0e
--- /dev/null
+++ b/doc/fluid/dev/versioning_en.md
@@ -0,0 +1,66 @@
+# Versioning (Work In Progress)
+
+
+The PaddlePaddle framework follows Semantic Versioning 2.0 (semver).
+Each release has a version number of the form MAJOR.MINOR.PATCH
+(e.g. 1.2.0). Some key points:
+
+
+ * A major version change can introduce backward-incompatible changes. Code that works with an old version does not necessarily work with the new one. In addition, data generated by the previous major version, such as the Program model and checkpointed parameters, might not work with the new version. We will attempt to build tools to help with the migration.
+
+ * A minor version change always maintains backward compatibility. It normally contains compatible improvements and bug fixes.
+
+ * A patch version change is for bug fixes.
+
+ * Violations of this policy are considered bugs and should be fixed.
+
+### What is Covered
+
+* All public, documented Python APIs, excluding those that live in the contrib namespace.
+
+### What is Not Covered
+
+* If an API's implementation has bugs, we reserve the right to fix the bugs and change the behavior.
+
+* The Python APIs in the contrib namespace.
+
+* Python functions and classes whose names start with '_'.
+
+* The offline tools.
+
+* Data generated by the framework, such as the serialized Program model file and checkpointed variables. This data is subject to the separate versioning scheme described below.
+
+* C++ Inference APIs. (To be covered)
+
+
+## Data
+
+
+Data refers to the artifacts generated by the framework. Here, we specifically mean the model Program file and the checkpointed variables.
+
+
+
+* Backward compatibility: a user generates Data with PaddlePaddle version 1.1 and expects it to be consumed by PaddlePaddle version 1.2.
+  This can happen when a new online system wants to serve an old model trained previously.
+
+
+
+* Forward compatibility: a user generates Data with PaddlePaddle version 1.2 and expects it to be consumed by PaddlePaddle version 1.1.
+  This can happen when a successful new research model needs to be served by an old online system that is not frequently upgraded.
+
+
+
+### Versioning
+
+Data is assigned an integer version number. The version is increased whenever an incompatible change is introduced.
+
+The PaddlePaddle framework supports an interval of Data versions. Within the same major version (semver), the framework cannot drop support for lower Data versions. Hence, a minor version change cannot drop support for any Data version.
+
+
+For example, suppose PaddlePaddle version 1.1 supports Program versions 3 to 5. Later, the Program version is increased from 5 to 6 because an attribute is added; as a result, PaddlePaddle version 1.1 cannot consume it. PaddlePaddle 1.2 should then support Program versions 3 to 6, and support for Program version 3 cannot be dropped until PaddlePaddle version 2.0.
+
+
+
+### Known Issues
+
+Currently, forward compatibility for new Data versions is best-effort.
diff --git a/doc/fluid/dev/write_docs_cn.md b/doc/fluid/dev/write_docs_cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..57812a2714b3f45ed8f67a8ce568ab00fd2fba34
--- /dev/null
+++ b/doc/fluid/dev/write_docs_cn.md
@@ -0,0 +1 @@
+../../v2/dev/write_docs_cn.md
\ No newline at end of file
diff --git a/doc/fluid/dev/write_docs_cn.rst b/doc/fluid/dev/write_docs_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..2c281eaaf43bbfad84c3be9ed1d1bd0dbc77fa9b
--- /dev/null
+++ b/doc/fluid/dev/write_docs_cn.rst
@@ -0,0 +1 @@
+../../v2/dev/write_docs_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/dev/write_docs_en.rst b/doc/fluid/dev/write_docs_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..cb2b9b0ff1f1d9e0e5201d160f6b7d9d451374e2
--- /dev/null
+++ b/doc/fluid/dev/write_docs_en.rst
@@ -0,0 +1 @@
+../../v2/dev/write_docs_en.rst
\ No newline at end of file
diff --git a/doc/fluid/faq/faq.rst b/doc/fluid/faq/faq.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3b4bd4f895162fa3b0ba12e785e38ad694590b25
--- /dev/null
+++ b/doc/fluid/faq/faq.rst
@@ -0,0 +1,12 @@
+###################
+编译安装与单元测试
+###################
+
+1. 
通过pip安装的PaddlePaddle在 :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so` +------------------------------------------------------------------------------------------ +出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`, +但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` +拷贝到 :code:`/usr/local/lib` 路径下,所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下, +即: :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。 + +**注意**:如果是在虚拟环境中安装PaddlePaddle, :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。 diff --git a/doc/fluid/faq/index_cn.rst b/doc/fluid/faq/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..bb2ed99217609d3a9edd179d4f98ad5b8b649860 --- /dev/null +++ b/doc/fluid/faq/index_cn.rst @@ -0,0 +1,9 @@ +FAQ +==== + +本文档对关于PaddlePaddle的一些常见问题提供了解答。如果您的问题未在此处,请您到 `PaddlePaddle社区 `_ 查找答案或直接提 `issue `_ ,我们会及时进行回复。 + +.. toctree:: + :maxdepth: 1 + + faq.rst diff --git a/doc/fluid/faq/index_en.rst b/doc/fluid/faq/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..395c1109891b5a00eab6f0b44d855658def7fdd6 --- /dev/null +++ b/doc/fluid/faq/index_en.rst @@ -0,0 +1,2 @@ +FAQ +------------ diff --git a/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md new file mode 100644 index 0000000000000000000000000000000000000000..a993a43001d93335039c6bb7a2ee4a8799a7472c --- /dev/null +++ b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md @@ -0,0 +1,1819 @@ + +# Paddle Fluid 开发者指南 + +--- + +### ==1==. 为什么需要 PaddlePaddle Fluid? + +--- + +### 两个基础问题 + + + +1. 如何描述机器学习模型和优化过程? + - 完备自洽,表达能力足以支持潜在出现的各种计算需求 +1. 如何充分利用资源高效计算? + - 支持异步设备、多卡、分布式计算 + - 降低计算/计算优化的开发成本 + - …… + + + +--- + +### 如何描述模型和优化过程? + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
一组连续执行的layersvariable和operator构成的计算图 不再有模型的概念
2013 Caffe,Theano, Torch, PaddlePaddle
2015 TensorFlow, MxNet, Caffe2, ONNX, n-graph
2016 PyTorch, TensorFlow Eager Execution, **==PaddlePaddle Fluid==**
+ +--- + + +###

目标

+ + + +- 提高对各类机器学习任务的描述能力:能够描述潜在出现的任意机器学习模型。 +- 代码结构逻辑清晰,各模块充分解耦:内外部贡献者能够专注于自己所需的功能模块,基于框架进行再次开发。 +- 从设计上,留下技术优化的空间和潜力。 +- 代码解耦后降低多设备支持、计算优化等的开发成本。 +- 在统一的设计理念下,实现自动可伸缩,自动容错的分布式计算。 + + + +--- + +## ==2.== Design Overview + +--- + +# Fluid: 系统形态 + +- [编译器式的执行流程,区分编译时和运行时](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/fluid/design/motivation/fluid_compiler.md) +
+ +

+ +

+ +--- + +#### 让我们在Fluid程序实例中,区分编译时和运行时 + +--- +### Fluid 编译时 + + + +- ==**定义前向计算**== + + ```python + x = fluid.layers.data(name='x',shape=[13], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(x=cost) + ``` + +- ==**添加反向、正则、优化**== + ```python + learning_rate = 0.01 + sgd_optimizer = fluid.optimizer.SGD(learning_rate) + sgd_optimizer.minimize(avg_cost) + ``` + + +--- + +### `Program` vs. 计算图 + + + +- 在科学计算领域,计算图是一种描述计算的经典方式。下图展示了从前向计算图(蓝色)开始,通过添加反向(红色)和优化算法相关(绿色)操作,构建出整个计算图的过程: +- +

+ +

+ + +- Fluid ==使用`Program`而不是计算图==来描述模型和优化过程。`Program`由`Block`、`Operator`和`Variable`构成,相关概念会在后文详细展开。 +- 编译时 Fluid 接受前向计算(这里可以先简单的理解为是一段有序的计算流)`Program`,为这段前向计算按照:前向 -> 反向 -> 梯度 clip -> 正则 -> 优化 的顺序,添加相关 `Operator`和`Variable`到`Program`到完整的计算。 + +
+ +--- + +### Fluid 运行时 + + + +- ==**读入数据**== + + ```python + train_reader = paddle.batch( + paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=20) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + ``` +- ==**定义执行程序的设备**== + ```python + place = fluid.CPUPlace() + feeder = fluid.DataFeeder(place=place,feed_list=[x, y]) + ``` + +- ==创建执行器(Executor),执行初始化 `Program`和训练`Program`== + + ```python + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + PASS_NUM = 100 + for pass_id in range(PASS_NUM): + for data in train_reader(): + avg_loss_value, = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) + print(avg_loss_value) + ``` + + +--- + +### 总结:框架做什么?用户做什么? +
+ + + + + + + + + + + + + + + + +
构建训练执行训练
+用户:描述前向运算
框架:添加反向运算
框架:添加优化运算
框架:添加内存优化
框架:添加并行/多设备/分布式相关的计算单元 +
+框架:创建Operator(计算)+ Variable(数据)
框架:创建`Block`
框架:内存管理/设备管理
框架:执行计算 +
+
+ +--- + +###

总结:编译时

+ + +**用户编写一段Python程序,描述模型的前向计算** +1. 创建变量描述 `VarDesc` +1. 创建operators的描述 `OpDesc` +1. 创建operators的属性 +1. 推断变量的类型和形状,进行静态检查:`inferShape` +1. 规划变量的内存复用 +1. 创建反向计算 +1. 添加优化相关的Operators +1. (可选)添加多卡/多机相关的Operator,生成在多卡/多机上运行的程序 + + + +--- + +###

总结:运行时

+ + +**执行规划好的计算** +1. 创建`Executor` +1. 为将要执行的一段计算,在层级式的`Scope`空间中创建`Scope` +1. 创建`Block`,依次执行`Block` + +

+
+ Figure. 编译时运行时概览 +

+ +
+ +--- + +## ==3==. 用户如何描述计算? +--- + +### Fluid:==像写程序一样==定义计算 + + +- 顺序执行 + ```python + x = fluid.layers.data(name='x',shape=[13], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + ``` + +- 条件分支: [swith](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/fluid/design/execution/switch.md)、[ifelse](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/fluid/design/execution/if_else_op.md) + + ```python + a = fluid.Var(10) + b = fluid.Var(0) + + switch = fluid.switch() + with switch.block(): + with switch.case(fluid.less_equal(a, 10)): + fluid.print("Case 1") + with switch.case(fluid.larger(a, 0)): + fluid.print("Case 2") + with switch.default(): + fluid.print("Case 3") + ``` + +>[A Lisp cond form may be compared to a continued if-then-else as found in many algebraic programming languages](https://www.cs.cmu.edu/Groups/AI/html/cltl/clm/node84.html). + + + +--- + +### Fluid: ==像写程序一样==定义计算 + + + +- 循环:[while](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105) + + ```python + d0 = layers.data("d0", shape=[10], dtype='float32') + data_array = layers.array_write(x=d0, i=i) + array_len = layers.fill_constant(shape=[1],dtype='int64', value=3) + + cond = layers.less_than(x=i, y=array_len) + while_op = layers.While(cond=cond) + with while_op.block(): + d = layers.array_read(array=data_array, i=i) + i = layers.increment(x=i, in_place=True) + layers.array_write(result, i=i, array=d) + layers.less_than(x=i, y=array_len, cond=cond) + ``` + +- 完整实例请点查看 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_while_op.py#L36-L44) +- beam search [->]( https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105) + + + +--- + +####

总结

+ + + +1. 用户层提供的描述语法具有完备性、自洽性,有能力支持对复杂计算过程描述 +1. 使用方式和核心概念可以类比编程语言,认知能够直接迁移 +1. 能够支持:定义问题,逐步求解 + + + +--- + +## ==3.== 核心概念 + +--- +### 编译时概念 :==变量和计算的描述== + + + +- `VarDesc` + `TensorDesc` + `OpDesc` -> `BlockDesc` -> `ProgramDesc` + - https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto + +- 什么是 Fluid Program + + - 在Fluid中,一个神经网络任务(训练/预测)被描述为一段`Program` + - `Program`包含对`Variable`(数据)和 `Operator`(对数据的操作)的描述 + - `Variable` 和 `Operator` 被组织为多个可以嵌套的`Block`,构成一段完整的`Fluid Program` + + +>编译阶段最终,经过 Transpiler 的执行规划,变换处理,生成使用`protobuf`序列化后的`ProgramDesc`。可以发送给多卡或者网络中的其它计算节点执行 + + + +--- + +### 编译时概念 :==**[Transpiler](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)**== + + +1. 接受一段`ProgramDesc`作为输入,生成一段新的`ProgramDesc` + + - *Memory optimization transpiler*:向原始`ProgramDesc` 中插入 `FreeMemoryOps`,在一次迭代优化结束前提前释放内存,使得能够维持较小的 memory footprint + + - *Distributed training transpiler*:将原始的`ProgramDesc`中转化为对应的分布式版本,生成两段新的`ProgramDesc`: + 1. trainer进程执行的`ProgramDesc` + 1. parameter server执行的`ProgramDesc` + +1. ==**WIP**==: 接受一段`ProgramDesc`,生成可直接被`gcc`, `nvcc`, `icc`等编译的代码,编译后得到可执行文件 + + + +--- +### Transplier + +

+ +

+ +--- + +### 打印 `ProgramDesc` + +

+ +

+ + + +- `default_startup_program`:创建可学习参数,对参数进行初始化 +- `default_main_program`:由用户定义的模型,包括了前向、反向、优化及所有必要的计算 + +- 打印可读的 `Program` + ```python + from paddle.v2.fluid import debuger + print debuger.pprint_program_codes(framework.default_main_program().desc) + ``` + + +--- +### 输出效果 + + + + + + + + + + + + + + +
variable in block 0variable in block 0
+
+ +--- + +### 运行时概念 + + + +- 数据相关 + - `Tensor` / `LoDTensor` / `Variable` + - `Scope` + +- 计算相关 + - `Block` + - `Kernel`、`OpWithKernel`、`OpWithoutKernel` + + + + + + + + + + + + + + + + + + + + + + + + + + + +
protobuf messagesC++ class objects
Data[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107) +[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24) +
Operation[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35) +[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64) +
BlockBlockDesc +Block +
+ +- 执行相关 :`Executor` + +
+ +--- +#### Tensor 和 LoD(Level-of-Detail) Tensor + + +- Tensor 是$n$-dimensional arry的推广,LoDTensor是在Tensor基础上附加了序列信息 +- Fluid中输入、输出,网络中的可学习参数全部统一使用LoDTensor(n-dimension array)表示 +- 一个mini-batch输入数据是一个LoDTensor + - 在Fluid中,RNN 处理变长序列无需padding,得益于 `LoDTensor`表示 + - 可以简单将 LoD 理解为:`std::vector>` + - 对非序列数据,LoD 信息为空 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TensorFlowPaddlePaddle
RNNSupport +Support +
recursive RNNSupport +Support +
padding zerosMust +No need +
blob data typeTensor +LODTensor +
+ +
+ +--- +#### LoD 信息实例 + + + +

+ +

+ +- 图(a)的LoD 信息 + ```cpp + [0, 5, 8, 10, 14] + ``` +- 图(b)的 LoD 信息 + ```cpp + [[0, 5, 8, 10, 14] /*level=1*/, [0, 2, 3, 5, 7, 8, 10, 13, 14] /*level=2*/] + ``` +
+ +--- +#### Tensor, Variable, Scope 之间的关系 + +

+ +

+ + +1. `Block` 是一个实现层的概念,不在应用层暴露给用户。目前用户无法自行创建并利用`Block`,用户能够感知的只有`Program`这个概念。 +1. 逻辑上,可以将 `Block` 类比为编程语言中的大括号:定义了一段作用域,其中运行一段代码 +1. `Executor`会为每一个`Block`创建一个`Scope`,`Block`是可嵌套的,因此`Scope`也是可嵌套的 + + + +--- +### Executor + + + + + + + + + + + + + + +
接口说明

+ +

输入
1. `ProgramDesc`
2. `Scope`
3.`block_id`

解释执行步骤
1. 创建所有 Variables
2. 逐一创建 Operator 并运行 +
+ +--- +### Operator/OpWithKernel/Kernel + + +

+ +

+ +- operator 无状态,Operator的核心是==Run==方法 +- 一个operator可以注册多个kernel +- operator 可以无 kernel:while_op 、ifelse op + +
+ +--- +#### Fluid Operator vs. PaddlePaddle layers + + + + + + + + + + + + + + + + + + +
LayerOperator

+ +

+ +

1. 内部维护状态
2. 包含forward和backward方法
1. 内部无状态
2. 只有Run方法
+ +
+ +--- + +### ==4.== 内存管理 + +--- +### 目标 + +- 为异构设备提供统一的内存分配、回收接口 +- 最小化管理内存所需的时间,最小化管理开销 +- 减少内存碎片 +- 将内存管理与计算(Operators/Kernels)完全剥离 +- 统一内存管理是内存优化的基础 + +--- + + + +### Memory 接口 + +- 内存管理模块向上层应用逻辑提供三个基础接口: + ```cpp + template + void* Alloc(Place place, size_t size); + + template + void Free(Place place, void* ptr); + + template + size_t Used(Place place); + + struct Usage : public boost::static_visitor { + size_t operator()(const platform::CPUPlace& cpu) const; + size_t operator()(const platform::CUDAPlace& gpu) const; + }; + ``` +- 模板参数 `Place` 指示内存分配发生的设备 +- 实现时,需特化支持的 `Place`, 提供以上三个接口的实现 + + + +--- +### 代码结构 + + + +内存管理模块可以理解为由以下两部分构成: + +1. SystemAllocator:实际从物理设备上分配、释放的内存的接口 +1. BuddyAllocator:内存管理算法 + + + +--- +### System Allocator + + + +- SystemAllocator 是实现物理内存分配、回收的基类 + - 不同设备上的内存分配和回收终将转化为标准接口调用 + - 为不同设备实现MemoryAllocator,继承自SystemAllocator + + ```cpp + class SystemAllocator { + public: + virtual ~SystemAllocator() {} + virtual void* Alloc(size_t& index, size_t size) = 0; + virtual void Free(void* p, size_t size, size_t index) = 0; + virtual bool UseGpu() const = 0; + }; + ``` + + +--- + +### CPU/GPU Allocator + + + +```cpp +class CPUAllocator : public SystemAllocator { + public: + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; +}; + +#ifdef PADDLE_WITH_CUDA +class GPUAllocator : public SystemAllocator { + public: + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + private: + size_t gpu_alloc_size_ = 0; + size_t fallback_alloc_size_ = 0; +}; +#endif +``` +- CPUAllocator和GPUAllocator分别继承自SystemAllocator,分别调用相应的标准库函数实现物理内存的分配和释放。 +- 一旦大块、连续的物理内存分配之后,将通过内存管理算法实现内存的按块分配、回收、重用等。 + + + +--- +### CPU Allocator + + + +- CPU 内存的分配提供两种选项: + 1. non-pinned memory:可分页内存 + 2. 
pinned memory:页锁定内存 + - 分配过大的页锁定内存有可能因为系统可使用的分页内存减少,影响系统性能,默认CPU下分配的是可分页内存 + +- 通过gflags进行设置一次性分配内存的大小以及是否使用页锁定内存。 + + ```cpp + DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); + DEFINE_double(fraction_of_cpu_memory_to_use, 1, + "Default use 100% of CPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + ``` + + + +--- +### GPU Allocator + + + +- 通过 cudaMalloc 分配GPU显存 +- GPUAllocator::Alloc 首先会计算指定GPU device上的可用显存 + - 如果可用显存小于请求分配大小,调用cudaMalloc进行分配 + - 如果可用显存不足,目前会报错退出。 +- 通过gflags控制GPU下一次性分配显存的大小: + + ```cpp + DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, + "Default use 92% of GPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + ``` + + + +--- +#### 内存管理算法: [Buddy Memory Allocation](https://en.wikipedia.org/wiki/Buddy_memory_allocation) + + + +- Memory Arena:一次性分配大块连续内存,之后会基于这块内存进行内存管理:动态分配、释放、重用内存块。 +- 伙伴内存分配: + - 将内存划分为 2 的幂次方个分区,使用 best-fit 方法来分配内存请求。 + - 当释放内存时,检查 buddy 块,查看相邻的内存块是否也已被释放。如果是,将内存块合并,以最小化内存碎片。 + - 分配的内存在物理内存的自然边界对齐,提高内存访问效率。 + - 算法的时间效率高,单使用 best-fit 方法的缘故,会产生一定的内存浪费 + + + +--- + +### Buddy Allocator + + + +- BuddyAllocator 是一个单例,每个设备(如: GPU/CPU(0)/GPU(1)) 拥有一个BuddyAllocator +- BuddyAllocator 内部拥有一个私有成员变量 SystemAllocator +- 当请求的内存超过BuddyAllocator管理的空余内存时,将会调用SystemAllocator去指定的设备上分配物理内存 + + + +--- +### 实例:CPU 下内存管理接口的实现 + + + +- 对上层应用,统一通过BuddyAllocator来实现内存的分配、释放以及用量查询 + ```cpp + template <> + void* Alloc(platform::CPUPlace place, size_t size) { + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + void* p = GetCPUBuddyAllocator()->Alloc(size); + VLOG(10) << " pointer=" << p; + return p; + } + + template <> + void Free(platform::CPUPlace place, void* p) { + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetCPUBuddyAllocator()->Free(p); + } + + template <> + size_t Used(platform::CPUPlace place) { + return GetCPUBuddyAllocator()->Used(); + } + ``` + + +--- +### ==5.== 多设备支持 + +--- +### 多设备支持(一) + + + +- step 1:添加Place类型,由用户实现添加到框架 + - 可以将Place类型理解为一个整数加上一个枚举型,包括:设备号 + 设备类型 + +

+ +

+- DeviceContext + - 不同的Place会对应一个相应的DeviceContext,用于组织管理与设备相关的信息 + - 例如,GpuDeviceContext中会管理Cuda stream + - 目前实现中一些特殊的库也会对应有自己的DeviceContext:例如: + ```cpp + class MKLDNNDeviceContext : public CPUDeviceContext {……} + ``` + - 每种设备对应的DeviceContext需要管理的内容不尽相同,视具体需求来实现 + +
+ +--- + +### 多设备支持(二) + + + +- step 2: 增加KernelType,为相应的KernelType注册Kernel对象,由用户实现注册给框架 可以按照: + 1. Place 执行设备 + 1. DataType 执行数据类型 FP32/FP64/INT32/INT64 + 1. Memory layout: 运行时 Tensor 在内存中的排布格式 NCHW、 NHWC + 1. 使用的库 + + 来区分Kernel,为同一个operator注册多个 Kernel。 + + ```cpp + struct OpKernelType { + proto::DataType data_type_; + DataLayout data_layout_; + platform::Place place_; + LibraryType library_type_; + } + ``` + + + +--- + +### 多设备支持(三) + + + +step 3: 运行时的 KernelType 推断和Kernel切换,按需要修改Kernel推断和Kernel切换规则 +- Expected Kernel:期待调用的Kernel:由(1)`Place`和计算精度决定;或(2)用户在配置中显示指定使用的计算库,如`cudnn`、`mkldnn`等。 +- Actual Kernel:运行时从`Operator`的输入(`Variable`)可以推断出实际需要的`KernelType` +- 当Expected Kernel和Actual Kernel不一致的时候,框架会插入`data_transformer`或者`data_layerout_transform`等,保证Expected Kernel可以执行,包括: + - CPUPlace -> GPUPlace :跨设备内存复制 + - NCHW -> nChw8c :Layout转换 + - FP32 -> FP16 :精度转换 _**尚未支持**_ + - …… +- 以上过程实现在OperatorWithKernel类的Run方法中 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.cc#L497) + + + +--- +## ==6.== while_op + +--- +### while_op + + + +- 循环执行一段`Program`,直到条件operator判断循环条件不满足时终止循环 +- while_op 的特殊之处: + 1. while_op 没有 kernel + 1. while_op 拥有自己的`Block`,会形成一段嵌套的`Block` + 1. ==while_op 内部创建了一个 Executor,来循环执行`Block`== + +- while_op 输入输出 : LoDTensorArray + ```cpp + namespace paddle { + namespace framework { + using LoDTensorArray = std::vector; + } + } + ``` + - 每一次循环,从原始输入中“切出”一个片段 + - LoDTensorArray 在Python端暴露,是Fluid支持的基础数据结构之一,用户可以直接创建并使用 + + + +--- +### while_op [Run](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/while_op.cc#L42) 方法概览 + + + +```cpp + +void Run(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition))); + auto &cond = scope.FindVar(Input(kCondition))->Get(); + PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); + + framework::Executor executor(dev_place); + auto *block = Attr(kStepBlock); + + auto *program = block->Program(); + auto step_scopes = + scope.FindVar(Output(kStepScopes))->GetMutable(); + + while (cond.data()[0]) { + auto ¤t_scope = scope.NewScope(); + step_scopes->push_back(¤t_scope); + executor.Run(*program, ¤t_scope, block->ID(), + false /*create_local_scope*/); + } +} + +``` + + + +--- +### while_op 的重要应用:Dynamic RNN + +--- + +### 什么是 `dynamicRNN` ? + + +
+ +1. 用户可以自定义在一个时间步之内的计算, 框架接受序列输入数据,在其上循环调用用户定义的单步计算 +1. 可学习参数在多个时间步之间共享 +1. `dynamicRNN` 由 `while_op` 实现 +1. 如果`dynamicRNN`中定义了`memory`,将会构成一个循环神经网络,否则其行为就等于在输入序列上循环调用预定义的单步计算 + +
+ +--- + +#### `dynamic RNN` 用户接口 + + +

+ +

+ +- `dynamicRNN` 中的重要元素 + 1. **step input**: `dynamicRNN` 每个时间步的输入 + 1. **step function**: 用户定义的单步计算 + 1. **memory**: 用于形成循环连接 + 1. **external/static memory**:单步计算的每一步都可以全部读取到的外部输入 + +
+ +--- + +#### dynamicRNN 中的 Memory + + + +`dynamicRNN`中`memory`的行为非常类似于 C++ 中的引用变量 + - `memory` “指向” 一个operator的输出变量,记作: A + - `memory` 可以被 LoDTensor 初始化(当LoD信息为空时,为非序列,否则为序列),默认`memory`被初始化为零 + - `memory` 在 operator A 前向计算之后,进行前向计算 + - 当 `memory` 的前向计算会 "指向" A 的输出 LoDTensor + - `memory` 的输出可以是另一个 operator 的输入,于是形成了“循环”连接 + + + +--- + +### DynamicRNN 实现细节 + + + +- `while_op` 无法独立构成dynamicRNN,必须和一组相关的 operator 及数据结构配合 + - 依赖的 operators (这里仅列出最重要的,并非全部): + - `lod_rank_table` operator + - `lod_tensor_to_array` operator + - `array_to_lod_tensor` operator + - `shrink_memory` operator + - 依赖的数据结构 + - `TensorArray` + - `LoDRankTable` + +- 在Fluid中,RNN接受变长序列输入,无需填充,以上数据结构和相关的operator配合工作,实现了对变长输入以batch计算 + + + +--- + +### `dynamicRNN` 如何实现 batch 计算 ? + + + +- 问题: + - RNN 可以看作是一个展开的前向网络,前向网络的深度是最长序列的长度 + - 如果不对变长序列进行填充,将它们填充到一样长度,每个mini-batch输入将会不等长,每个样本展开长度不一致,导致前向和反向计算实现困难 + + + +---- +##### 实例 :RNN encoder-decoder with attention + + + +- 以机器翻译的RNN encoder-decoder 模型(涉及了`dynamicRNN`的所有设计要素)为例,下图是 RNN encoder-decoder 的原始输入: +

+
Figure. RNN encoder-decoder 原始batch 输入数据 +

+ +- source word sequences 是encoder RNN的输出,是一个LoDTensor +- target word sequences 是look_uptable的输入,是一个LoDTensor +- 上图中一个矩形方块是CPU/GPU内存中一片连续的内存空间,表示一个dense vector + +
+ +--- + +### `dynamicRNN` 如何实现 batch 计算 ? + + + +1. 对一个mini batch中不等长样本进行排序,最长样本变成batch中的第一个,最短样本是batch中最后一个 + - `LoDTensor` -> `LoDRankTable` :heavy_plus_sign: `lod_rank_table operaator` + - 可以将`LoDRankTable`理解为对LoDTensor中的多个序列按照长度排序LoDRankTable 存储了排序之后的index + +2. 构建每个时间步的batch输入:随着时间步增加,每个时间步的batch输入可能会逐渐缩小 + - `TensorArray` :heavy_plus_sign: `lod_tensor_to_array` -> `LoDTensor` (without LoD) +3. 每个时间步输出写入一个输出 `LoDTensorArray` +3. `dynamicRNN`循环结束后, 按照`LoDRankTable`中记录的信息对输出`LoDTensorArray`重排序,还原会原始输入顺序 + - `TensorArray` :heavy_plus_sign: `array_to_lod_tensor` -> `LoDTensor` + + + +--- + +### 运行实例 + +

+ +

+ +--- +### 运行实例 + +

+ +

+ + + +- 执行到第5~7个batch时,batch size将会缩小 + + + +--- +### 运行实例 + +

+ +

+ + + +- 第5 ~ 7个batch时RNN的`memory`会发生什么? + - `memory` 指向某个operator的输出Tensor,在该operator前向计算之后,“取回”其计算结果 + - 5 ~ 7时,遇到了序列的结束,==下一个时间步计算不再需要在已经结束的序列上展开== + - 在`dynamicRNN`中`shrink_memory` operator 用来缩小`memory`的batch输入 + + + +--- +### 运行实例:batch 1 ~ 2 + +

+
Figure. 第1、2个batch输入dynamicRNN的batch输入 +

+ +--- +### 运行实例:batch 3 ~ 4 + +

+
Figure. 第3、4个batch输入dynamicRNN的batch输入 +

+ +--- + +### 运行实例:batch 5 ~ 7 + +

+
Figure. 第5、6、7个batch输入dynamicRNN的batch输入 +

+ +--- +### ==7.== Fluid 代码结构 + +--- +### Fluid 代码结构 + + + + + + + + + + + + + + + +
代码结构模块结构
+

+ +

+
+

+ +

+
+ +--- + +### ==8.== 文档总结 + +--- + + +- 设计概览 + - 重构概览 [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/design/refactorization.md) + - fluid [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/design/fluid.md) + - fluid_compiler [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/fluid/design/motivation/fluid_compiler.md) +- 核心概念 + - variable 描述 [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/design/var_desc.md) + - Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md) + - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) + - TensorArray [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/design/tensor_array.md) + - Program [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/design/program.md) + - Block [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/design/block.md) + - Scope [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/design/scope.md) + +--- + +- 重要功能模块 + - backward [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/design/backward.md) + - 内存优化 [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/design/memory_optimization.md) + - evaluator [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/design/executor.md) + - python API [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/design/python_api.md) + - regularization [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/design/regularization.md) + +- 开发指南 + - 支持新设硬件设备库 [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/design/support_new_device.md) + - 添加新的Operator [->](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/howto/dev/new_op_cn.md) + - 添加新的Kernel [->]( +https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/howto/dev/new_op_kernel_en.md) + + + +--- + +### ==9.== 开发指南 + +--- + +#### 建议开发环境:使用 Docker 编译和测试 + + + +Docker编译PaddlePaddle源码: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html) + +PaddlePaddle 在 Dockerhub 地址:[->]( + https://hub.docker.com/r/paddlepaddle/paddle/tags/) + +1. 获取PaddlePaddle的Docker镜像 + ```bash + docker pull paddlepaddle/paddle:latest-dev + ``` + +1. 启动 docker container + + ```bash + docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash + ``` + +1. 进入docker container后,从源码编译,请参考文档 [->]( http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/build_from_source_cn.html) + + + +--- + +### 一些说明 + + + +1. PaddlePaddle的Docker镜像为了减小体积,默认没有安装vim,可以在容器中执行`apt-get install -y vim`来安装vim。 +1. 开发推荐使用tag为`latest-dev`的镜像,其中打包了所有编译依赖。`latest`及`lastest-gpu`是production镜像,主要用于运行PaddlePaddle程序。 +2. 在Docker中运行GPU程序,推荐使用nvidia-docker,[否则需要将CUDA库和设备挂载到Docker容器内](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)。 + + + ```bash + nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash + ``` + + + + + +--- + +### [如何贡献](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html) + + + +- ==提交PullRequest前请务必阅读==: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html) +- 代码要求 + 1. 代码注释遵守 Doxygen 的样式 + 1. 确保编译器选项 WITH_STYLE_CHECK 已打开,并且编译能通过代码样式检查 + 1. 所有代码必须具有单元测试,且能够通过所有单元测试 +- 使用 `pre-commit` 钩子提交Pull Request + 1. 帮助格式化源代码(C++,Python) + 1. 在提交前自动检查一些基本事宜:如每个文件只有一个 EOL,Git 中不要添加大文件等 + 1. 
安装pre-commit,并在PaddlePaddle根目录运行: + ```bash + ➜ pip install pre-commit + ➜ pre-commit install + ``` + + +--- + +### 如何贡献 + + + +1. 开始开发之前请先建立issue。 + - 让其它同学知道某项工作已经有人在进行,以避免多人开发同一功能的情况。 +1. 提交PR必须关联相关的issue。做法请参考:[->](https://help.github.com/articles/closing-issues-using-keywords/) + - 目的:为了在提交的版本中留有记录描述这个PR是为了开发什么样的功能,为了解决什么样的问题。 + - 当PR被merge后,关联的issue会被自动关闭。 +1. PR review 中,reviewer的每条comment都必须回复。 + - 如修改完可直接回复:Done。 + - 目的:review comment 中可能会有(1)询问类型的问题;(2)可以在下一个PR修改的问题;(3)comment意见不合理等。需要明确回复,以便reviewer和其他人有历史可查,便于区分是否已经进行修改,或者准备下一个PR修改,或者意见不合理可以不用进行修改。 + + + +--- + +### ==10.== 添加新的 Operator + +--- + +### 概念简介 + + + +添加一个新的operator,会涉及实现以下C++类的派生类: + +1. `framework::OperatorBase`: Operator(简写,Op)基类。 +1. `framework::OpKernel`: Op计算函数的基类,称作Kernel。 +1. `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。 +1. `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成 + +依据是否包含kernel,可以将Op分为两种: +1. 包含Kernel的Op:继承自OperatorWithKernel,==绝大多数operator都属于这一类== +1. 不包含kernel的Op,继承自OperatorBase,只有少量Op属于这一类,例如while_op,ifelse_op + +这里主要介绍带Kernel的Op如何编写。 + + + +--- + +#### 添加新的Operator需要修改/添加哪些文件? + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
内容定义位置
+OpProtoMake定义 + +`.cc`文件,Backward Op不需要OpProtoMaker +
+Op定义 + +`.cc`文件 +
+Kernel实现 + +CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 +
+注册Op + +Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 +
+ +- 添加 Operator 之前请阅读:[Operator 命名规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/name_convention.md)及[Operator Markdown注释规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/op_markdown_format.md)。 +- 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。 +- 根据文件名自动构建op和Python端绑定,请务必遵守以上命名,否则需要进一步修改PyBind相关文件及CMakeLists.txt。 +
+ +--- + +###### 实现带Kernel的Operator step1: 定义ProtoMaker类 + + + +下面均以[clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h)为例进行介绍 + +- clip_op计算公式:$Out = \min(\max(X, min), max)$ +- 首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释(*下面代码段的中注释进行了简化,实现时需按照规范添加注释*): + + ```cpp + template + class ClipOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X","(Tensor)The input of clip op."); + AddOutput("Out", "(Tensor),The output of clip op."); + AddAttr( + "min", "(float),Minimum value."); + AddAttr( + "max", "(float),Maximum value."); + AddComment(R"DOC( + …… + )DOC"); + } + }; + ``` + + + +--- + +###### 实现带Kernel的Operator step2: 定义Operator类 + + + +下面的代码段实现了`clip_op`的定义: + +```cpp +class ClipOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ClipOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ClipOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto max = ctx->Attrs().Get("max"); + auto min = ctx->Attrs().Get("min"); + PADDLE_ENFORCE_LT(min, max, "max should be greater than min."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; +``` + + +--- + +### Operator 类中需要完成的工作 + + + +1. clip_op 继承自`OperatorWithKernel`, + + ```cpp + using framework::OperatorWithKernel::OperatorWithKernel; + ``` + 表示使用基类`OperatorWithKernel`的构造函数。 + +1. 重写`InferShape`接口。 + - `InferShape` 为const函数,不能修改Op的成员变 + - `InferShape` 的参数为 `const framework::InferShapeContext &ctx`,从中可获取到输入输出以及属性 + - `InferShape` 会被调用两次,一次是编译时(创建op),一次是运行时(调用op的`Run`方法时),需要完成以下功能: + 1. 做检查, 尽早报错:检查输入数据维度、类型等是否合法 + 2. 设置输出Tensor的形状 + +通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中。 + + + +--- + +### 补充说明 + + + +1. `InferShape`目前支持两种实现方式,二者最后都会生成一个functor注册给OpInfo结构体。 + 1. 继承framework::InferShapeBase,实现为一个functor(参考 [mul_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)) + 2. override InferShape函数(参考 [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.cc#L24)) + +1. 什么是`functor` ? + + - 类或结构体仅重载了`()`,一般是可被多个kernel复用的计算函数。 + + + + ```cpp + template + class CrossEntropyFunctor { + public: + void operator()(const platform::CPUDeviceContext& ctx, + framework::Tensor* out, + const framework::Tensor* prob, + const framework::Tensor* labels, const bool softLabel) { + …… + } + }; + ``` + + + - 在 clip_op 内也会看到将一段计算函数抽象为functor的使用法: [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h#L27)。 + + + +--- + +###### 实现带Kernel的Operator step3: 定义OpKernel类 + + + +- `ClipKernel`继承自`framework::OpKernel`,带有下面两个模板参数: + 1. `typename DeviceContext`: 表示设备类型,不同设备共享同一个Kernel时,需添加该模板参数。不共享时,需要提供针对不同设备的特化实现。 + 1. `typename T` : 表示支持的数据类型,如`float`, `double`等 + +- 在`ClipKernel`类中重写`Compute`方法 + 1. `Compute`接受输入参数:`const framework::ExecutionContext& context` + - `ExecutionContext` 是从 `Scope`中将运行时Op的输入、输出`Variable`组织在一起,使得Op在调用`Compute`方法时,能够简单地通过名字拿到需要的输入输出`Variable` + - 与`InferShapeContext`相比,`ExecutionContext` 中增加了设备类型 + 1. 
在`Compute`函数里实现`OpKernel`的具体计算逻辑 + + + +--- +#### ClipKernel 代码概览 + + + +```cpp +template +class ClipKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto max = context.Attr("max"); + auto min = context.Attr("min"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + T* out_data = out->mutable_data(context.GetPlace()); + const T* x_data = x->data(); + int64_t numel = x->numel(); + Transform trans; + trans(context.template device_context(), x_data, + x_data + numel, out_data, ClipFunctor(min, max)); + } +}; +``` + +- 为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用, Fluid 使用 Eigen 作为基础的矩阵运算库 +- Fluid对Eigen unsupported Tensor提供了一些基本的封装,可以在`Compute`接口中直接调用 + - 关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Fluiddoc/blob/develop/doc/fluid/dev/use_eigen_cn.md)。 + + + +--- +###### 实现带Kernel的Operator step4: 实现反向Op + + + +- ==**反向Op没有`ProtoMaker`**==,除此之外定义与实现方式前向Op完全一致,不再赘述 +- 这里仅对反向Op的输入输出进行说明: + 1. 反向Op的输入 + - 前向Op的输出 + - 反向传播过程中传递给当前Op的梯度 + - 需要注意,Fluid中,不区分Cost Op和中间层Op,所有Op都必须正确处理接收到的梯度 + 2. 反向Op的输出 + - 对可学习参数的求导结果 + - 对所有输入的求导结果 + + + + +--- + +###### 实现带Kernel的Operator step5: 注册Op及Kernel + + + +至此Op和Op kernel都已经实现完毕,接下来,需要在`.cc`和`cu`文件中注册op和kernel + +1. 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。 + + + + ```cpp + namespace ops = paddle::operators; + REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker, clip_grad, + ops::ClipOpGrad); + REGISTER_OP_CPU_KERNEL( + clip, ops::ClipKernel); + REGISTER_OP_CPU_KERNEL( + clip_grad, ops::ClipGradKernel); + ``` + + - 在上面的代码片段中: + + 1. `REGISTER_OP` : 注册`ops::ClipOp`类,类型名为`clip`,该类的`ProtoMaker`为`ops::ClipOpMaker`,注册`ops::ClipOpGrad`,类型名为`clip_grad` + 1. `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op,例如:优化算法相关的Op + 1. `REGISTER_OP_CPU_KERNEL` :注册`ops::ClipKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::ClipGradKernel`类 + + +1. 按照同样方法,在`.cu`文件中注册GPU Kernel + - 如果CUDA Kernel的实现基于Eigen,需在 `.cu`的开始加上宏定义 `#define EIGEN_USE_GPU` + + + +--- + +##### 编译和Python端绑定 + + + +- 运行下面命令可以仅编译新添加的Op: + + ``` + make mul_op + ``` + - 需注意,运行单元测试需要编译整个工程 + +- 如果遵循前文的文件命名规则,构建过程中,会自动为新增的op添加Python端绑定,并链接到生成的lib库中 + + + +--- + +###### 实现带Kernel的Operator step6: 添加前向单测及梯度检测 + + + +- 新增Op的单元测试统一添加至:[python/paddle/v2/fluid/tests/unittests](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/unittests)目录 +- 前向Operator单测 + + 1. Op单元测试继承自`OpTest`,各项具体的单元测试在`TestClipOp`里完成,所有单测case都以`TestXX`命名 + 1. 单元测试Operator,需要: + 1. 在`setUp`函数定义输入、输出,以及相关的属性参数 + 1. 生成随机的输入数据 + 1. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比 + 1. 
反向梯度检测流程测试框架已经实现,直接调用相应接口`check_grad`即可 + +- `clip_op` 单测代码请参考 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_clip_op.py),这里不再展开 + + + +--- +#### 编译执行单测 + + + +- `python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译 + + - 运行单元测试测时需要编译整个工程,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON` +- 编译成功后,执行下面的命令来运行单元测试: + + ```bash + make test ARGS="-R test_mul_op -V" + ``` + + 或者: + + ``` + ctest -R test_mul_op + ``` + + +--- + +### 添加Op的一些注意事项 + + + +- 为每个Op创建单独的`*_op.h`(如有)、`*_op.cc`和`*_op.cu`(如有)。不允许一个文件中包含多个Op,将会导致编译出错。 +- 注册Op时的类型名,需要和该Op的名字一样。不允许在`A_op.cc`里面,注册`REGISTER_OP(B, ...)`,会导致单元测试出错。 +- 如果Op没有实现CUDA Kernel,不要创建空的`*_op.cu`,会导致单元测试出错。 +- 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。 + + + +--- + +### ==10.== 使用相关问题 + +--- + +### 定义前向计算 + + + +- 当在python端执行时: + ```python + import paddle.v2.fluid as fluid + ``` + [`framework.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#L1040)定义了两个全局`Program`: + ```python + # program is a global instance. + _main_program_ = Program() + _startup_program_ = Program() + ``` + +- 前向定义的过程就是不断往`mian_program`中添加Op和Variable +- 如果需要执行一个新的`mian_program`时,可以调用调用: + ```python + def switch_main_program(program): + """ + Switch the main program to a new program. + This funtion returns the previous main program. + """ + …… + ``` + + +--- + +### 自定义参数的初始化 + + + +- 调用`fluid.ParamAttr(……)`接口,自定义参数的初始化 + + ```python + w_param_attrs = ParamAttr(name=None, + initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), + learning_rate=1.0, + regularizer=L1Decay(1.0), + trainable=True, + clip=GradientClipByValue(-1.0, 1.0), + ) + y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) + ``` + +- 补充问题:如何创建 `Variable` + ```python + cur_program = Program() + cur_block = cur_program.current_block() + new_var = cur_block.create_var(name="X", shape=[-1, 16, 16], dtype="float32") + ``` + + + +--- + +### 添加反向Op + + + +- 调用`fluid.backward.append_backward(X)`(`X`是一个Variable),来为一段前向`ProgramDesc`添加反Op + + ```python + data = fluid.layers.data(name="data", shape=(2,3,4)) + out = fluid.layers.fc(input=data,size=128,act=None) + loss = fluid.layers.reduce_sum(out) + fluid.backward.append_backward(loss=loss) + ``` + +- 添加优化相关的Op + ```python + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(loss) + ``` + +- 可以随时调用`print(fluid.default_main_program())`来输出当前的`main_program` + +- 当构建完成整个`Program`后,调用下面的接口执行内存优化: + ```python + fluid.memory_optimize(fluid.default_main_program()) + ``` + - _注:内存优化目前仍在持续开发中,有可能不够稳定。_ + + + +--- + +### 总结:编译时执行流程 + + + +- 用户定义前向计算 +- 添加反向Op到`default_main_program` +- 添加 gradient clipping Op 到 +- 添加 regularization Op 到`default_main_program` +- 为指定的优化算法,添加相关的状态 variable of optimizer 到`default_startup_program` + - 状态相关 variable是指如学习率, 历史 momentum, 二阶momentum等 +- 添加初始化 variable 的Op 到 `default_startup_program` +- 为整个网络最后一个op,添加设置其接受到的梯度的Op到`default_main_program` +- 进行内存优化规划 + + + +--- + +### Feed 数据 (一):通过 feed 字典 + + + +- 执行executor的run方法时,指定feed字典,feed op 会将指定的数据放到`x`和`y`两个Variable中 + ```python + y_data = np.random.randint(0, 8, [1]).astype("int32") + y_tensor = core.Tensor() + y_tensor.set(y_data, place) + + x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32") + x_tensor = core.Tensor() + x_tensor.set(x_data, place) + …… + cost = exe.run( + fluid.default_main_program(), + feed={'x': x_tensor, + 'y': y_tensor}, + fetchlist=[avg_cost]) + ``` + +- 这种方法较为底层,一般用于单测中 + + + +--- + +### Feed 数据 
(二):使用 DataFeeder接口 + + + +- 编写一个data_reader函数,data_reader是一个Python generator + + ```python + def demo_reader(): + def random_generator(): + yield np.random.uniform(0.1, 1, [4]), np.random.randint(0, 1, [1]) + return random_generator + ``` +- 在训练任务中使用 DataFeeder 接口 + ```python + cost = exe.run( + fluid.default_main_program(), + feed={'x': x_tensor, + 'y': y_tensor}, + fetchlist=[avg_cost]) + + train_reader = paddle.batch( + paddle.reader.shuffle(demo_reader(), buf_size=500), batch_size=4) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + for data in train_reader(): + cost = exe.run( + fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[cost]) + ``` + + + +--- + +### 常见问题 + + + +- 如何使用 evaluator ? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py#L168) + + ```python + accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + for pass_id in range(PASS_NUM): + accuracy.reset() + for data in train_reader(): + loss, acc = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + accuracy.metrics) + pass_acc = accuracy.eval(exe) + # acc 当前一个batch 的 accuracy + # pass_acc 当前batch 的 accuracy + pass_total_acc = accuracy.eval(exe) # 整个pass的accuracy + ``` + +- 如何在训练中测试?[->](https://github.com/dzhwinter/benchmark/blob/master/fluid/vgg16.py#L144) +- 如何保存训练好的模型?[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L143) +- 如何加载训练好的模型进行预测?[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L154) +- 如何在同一个训练任务中定义多个Program,并交替运行? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/demo/fc_gan.py) +- 如何profile?Fluid 实现了profile 工具,可以直接调用。请参考示例 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_profiler.py) + + + + +--- diff --git a/doc/fluid/getstarted/concepts/index_cn.rst b/doc/fluid/getstarted/concepts/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..2e7f70fc4cb871a80ffaffec6c06797973cd2f85 --- /dev/null +++ b/doc/fluid/getstarted/concepts/index_cn.rst @@ -0,0 +1,4 @@ +基本使用概念 +============ + +TBD diff --git a/doc/fluid/getstarted/concepts/index_en.rst b/doc/fluid/getstarted/concepts/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..78cca1e2a3443c2949ca0655190b0f05502f519a --- /dev/null +++ b/doc/fluid/getstarted/concepts/index_en.rst @@ -0,0 +1,4 @@ +Concepts +============ + +TBD diff --git a/doc/fluid/getstarted/concepts/reader/README.md b/doc/fluid/getstarted/concepts/reader/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2cd4b6225b61cf374458e40afabad7745f61ba71 --- /dev/null +++ b/doc/fluid/getstarted/concepts/reader/README.md @@ -0,0 +1,206 @@ +# Python Data Reader Design Doc + +During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following: + +- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items. +- A *reader creator*: A function that returns a reader function. +- A *reader decorator*: A function, which takes in one or more readers, and returns a reader. +- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. 
+ +and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators. + +## Data Reader Interface + +*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows: + +``` +iterable = data_reader() +``` + +The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.) + +An example implementation for single item data reader creator is as follows: + +```python +def reader_creator_random_image(width, height): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height) + return reader +``` + +An example implementation for multiple item data reader creator is as follows: +```python +def reader_creator_random_image_and_label(width, height, label): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height), label + return reader +``` + +## Batch Reader Interface + +*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple. + +Here are some valid outputs: + +```python +# a mini batch of three data items. Each data item consist three columns of data, each of which is 1. +[(1, 1, 1), +(2, 2, 2), +(3, 3, 3)] + +# a mini batch of three data items, each data item is a list (single column). +[([1,1,1],), +([2,2,2],), +([3,3,3],)] +``` + +Please note that each item inside the list must be a tuple, below is an invalid output: +```python + # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). + # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1], + # or three columns of data, each of which is 1. +[[1,1,1], +[2,2,2], +[3,3,3]] +``` + +It is easy to convert from a reader to a batch reader: + +```python +mnist_train = paddle.dataset.mnist.train() +mnist_train_batch_reader = paddle.batch(mnist_train, 128) +``` + +It is also straight forward to create a custom batch reader: + +```python +def custom_batch_reader(): + while True: + batch = [] + for i in xrange(128): + batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended. + yield batch + +mnist_random_image_batch_reader = custom_batch_reader +``` + +## Usage + +Following is how we can use the reader with PaddlePaddle: +The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows: + +```python +# two data layer is created: +image_layer = paddle.layer.data("image", ...) +label_layer = paddle.layer.data("label", ...) + +# ... +batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128) +paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) +``` + +## Data Reader Decorator + +The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax. 
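+
+For illustration only, the following sketch shows what a minimal custom decorator written against this interface could look like. The name `limit_reader` is hypothetical and is not part of the `paddle.reader` package; it simply wraps an existing reader and stops after `n` entries:
+
+```python
+def limit_reader(reader, n):
+    """Illustrative reader decorator (not part of paddle.reader):
+    wrap `reader` so that it yields at most n data entries."""
+    def new_reader():
+        # Like every reader, the returned function takes no parameters
+        # and produces an iterable of single data entries.
+        for i, item in enumerate(reader()):
+            if i >= n:
+                break
+            yield item
+    return new_reader
+```
+
+For example, `limit_reader(paddle.dataset.mnist.train(), 1000)` would behave like the original MNIST reader but stop after 1000 entries.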
+
+Since we have a strict interface for data readers (no parameters, yielding single data entries), a data reader can be used in a flexible way using data reader decorators. The following are a few examples:
+
+### Prefetch Data
+
+Since reading data may take some time and training cannot proceed without data, it is generally a good idea to prefetch the data.
+
+Use `paddle.reader.buffered` to prefetch data:
+
+```python
+buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
+```
+
+`buffered_reader` will try to buffer (prefetch) `100` data entries.
+
+### Compose Multiple Data Readers
+
+For example, suppose we want to use a source of real images (say, reusing the MNIST dataset) and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+
+We can do the following:
+
+```python
+def reader_creator_random_image(width, height):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height)
+    return reader
+
+def reader_creator_bool(t):
+    def reader():
+        while True:
+            yield t
+    return reader
+
+true_reader = reader_creator_bool(True)
+false_reader = reader_creator_bool(False)
+
+reader = paddle.reader.compose(paddle.dataset.mnist.train(), reader_creator_random_image(20, 20), true_reader, false_reader)
+# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
+# And we don't care about the second item at this time.
+paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
+```
+
+### Shuffle
+
+Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read.
+
+Example:
+```python
+reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
+```
+
+## Q & A
+
+### Why does a reader return only a single entry, and not a mini batch?
+
+Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead of a single entry, the training code will be more complicated because it needs to handle cases like a batch size of 2).
+
+We provide a function, `paddle.batch`, to turn a (single-entry) reader into a batch reader.
+
+### Why do we need a batch reader, isn't it sufficient to give the reader and batch_size as arguments during training?
+
+In most cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However, sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases, using a batch reader is very efficient and helpful.
+
+### Why use a dictionary instead of a list to provide mapping?
+
+Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`).
+
+### How to create a custom data reader creator ? 
+ +```python +def image_reader_creator(image_path, label_path, n): + def reader(): + f = open(image_path) + l = open(label_path) + images = numpy.fromfile( + f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32') + images = images / 255.0 * 2.0 - 1.0 + labels = numpy.fromfile(l, 'ubyte', count=n).astype("int") + for i in xrange(n): + yield images[i, :], labels[i] # a single entry of data is created each time + f.close() + l.close() + return reader + +# images_reader_creator creates a reader +reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024) +paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...) +``` + +### How is `paddle.train` implemented + +An example implementation of paddle.train is: + +```python +def train(batch_reader, mapping, batch_size, total_pass): + for pass_idx in range(total_pass): + for mini_batch in batch_reader(): # this loop will never end in online learning. + do_forward_backward(mini_batch, mapping) +``` diff --git a/doc/fluid/getstarted/concepts/save_model/model_format.md b/doc/fluid/getstarted/concepts/save_model/model_format.md new file mode 100644 index 0000000000000000000000000000000000000000..1f12ba0497369eacc6a2db7984781b5672f45ea1 --- /dev/null +++ b/doc/fluid/getstarted/concepts/save_model/model_format.md @@ -0,0 +1,76 @@ +# Design Doc: Model Format + +## Motivation + +A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code. + +As a result, In PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters. + +## Implementation + +The topology is saved as a plain text in a detailed self-contain protobuf file. + +The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task. + +As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is, + +The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format. 
+
+| field name | type | description |
+| --- | --- | --- |
+| version | uint32_t | Version of saved file. Always 0 now. |
+| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. |
+| tensor desc | void* | TensorDesc protobuf binary message |
+| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
+| lod_level | uint64_t | Level of LoD |
+| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. |
+| data of lod[0] | uint64_t* | [Optional] lod[0].data() |
+| ... | ... | ... |
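+
+To make the byte layout above concrete, here is a minimal, illustrative Python sketch of writing one tensor in this format. It is not the actual Paddle implementation: `desc_bytes` stands for an already-serialized `LoDTensorDesc` protobuf message, `data_bytes` for the tensor's raw memory, and the helper name `write_tensor` is made up for this example.
+
+```python
+import struct
+
+def write_tensor(f, desc_bytes, data_bytes, lod=None):
+    """Illustrative sketch of the byte layout described in the table above."""
+    f.write(struct.pack('<I', 0))                # version: uint32_t, always 0 for now
+    f.write(struct.pack('<I', len(desc_bytes)))  # tensor desc length: uint32_t
+    f.write(desc_bytes)                          # TensorDesc protobuf binary message
+    f.write(data_bytes)                          # raw tensor data; length implied by dims/data_type
+    lod = lod or []
+    f.write(struct.pack('<Q', len(lod)))         # lod_level: uint64_t
+    for level in lod:                            # optional: each level's length in bytes, then its data
+        f.write(struct.pack('<Q', 8 * len(level)))
+        f.write(struct.pack('<%dQ' % len(level), *level))
+```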
+ +## Summary + +- We introduce a model format. +- The model represented by its forward-pass computation procedure is saved in a **ProgramDesc** protobuf message. +- A bunch of specified format binary tensors describe the **parameters**. diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..3daea71d0933a2774227ff2b5e744392ca6b1765 --- /dev/null +++ b/doc/fluid/getstarted/index_cn.rst @@ -0,0 +1,20 @@ +新手入门 +============ + + +如果需要快速了解PaddlePaddle的使用,可以参考以下指南。 + +.. toctree:: + :maxdepth: 1 + + quickstart_cn.rst + + +在使用PaddlePaddle构建应用时,需要了解一些基本概念。 +这里以一个线性回归为例子,详细介绍了PaddlePaddle的使用流程,包括数据格式,模型配置与训练等。 + +.. toctree:: + :maxdepth: 1 + + concepts/use_concepts_cn.rst + developer's_guide_to_paddle_fluid.md diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..fb20bb4f245281c3acf67c417979dc63c144fef3 --- /dev/null +++ b/doc/fluid/getstarted/index_en.rst @@ -0,0 +1,19 @@ +GET STARTED +============ + +If you want to quickly know how to use PaddlePaddle, please refer to the following guide: + +.. toctree:: + :maxdepth: 1 + + quickstart_en.rst + +While using PaddlePaddle to build applications, please understand some basic concepts. + +Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc. + +.. toctree:: + :maxdepth: 1 + + concepts/index_en.rst + developer's_guide_to_paddle_fluid.md diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..6a964d4f8561f30aa10936d2399698c51583442c --- /dev/null +++ b/doc/fluid/getstarted/quickstart_cn.rst @@ -0,0 +1,45 @@ +快速开始 +======== + +快速安装 +-------- + +PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 +执行下面的命令完成快速安装,版本为cpu_avx_openblas: + + .. code-block:: bash + + pip install paddlepaddle + +如果需要安装支持GPU的版本(cuda8.0_cudnn5_avx_openblas),需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +更详细的安装和编译方法参考: :ref:`install_steps` 。 + +快速使用 +-------- + +创建一个 housing.py 并粘贴此Python代码: + + .. code-block:: python + + import paddle.dataset.uci_housing as uci_housing + import paddle.fluid as fluid + + with fluid.scope_guard(fluid.core.Scope()): + # initialize executor with cpu + exe = fluid.Executor(place=fluid.CPUPlace()) + # load inference model + [inference_program, feed_target_names,fetch_targets] = \ + fluid.io.load_inference_model(uci_housing.fluid_model(), exe) + # run inference + result = exe.run(inference_program, + feed={feed_target_names[0]: uci_housing.predict_reader()}, + fetch_list=fetch_targets) + # print predicted price is $12,273.97 + print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000) + +执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。 diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..680122f25893a5a48fac103266bda4788f891f6d --- /dev/null +++ b/doc/fluid/getstarted/quickstart_en.rst @@ -0,0 +1,49 @@ +Quick Start +============ + +Quick Install +------------- + +You can use pip to install PaddlePaddle with a single command, supports +CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed. +Simply run the following command to install, the version is cpu_avx_openblas: + + .. 
code-block:: bash + + pip install paddlepaddle + +If you need to install GPU version (cuda8.0_cudnn5_avx_openblas), run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +For more details about installation and build: :ref:`install_steps` . + +Quick Use +--------- + +Create a new file called housing.py, and paste this Python +code: + + + .. code-block:: python + + import paddle.dataset.uci_housing as uci_housing + import paddle.fluid as fluid + + with fluid.scope_guard(fluid.core.Scope()): + # initialize executor with cpu + exe = fluid.Executor(place=fluid.CPUPlace()) + # load inference model + [inference_program, feed_target_names,fetch_targets] = \ + fluid.io.load_inference_model(uci_housing.fluid_model(), exe) + # run inference + result = exe.run(inference_program, + feed={feed_target_names[0]: uci_housing.predict_reader()}, + fetch_list=fetch_targets) + # print predicted price is $12,273.97 + print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000) + +Run :code:`python housing.py` and voila! It should print out a list of predictions +for the test housing data. diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..55326940ce7c7dbaa5bf19f1950f470527ddf4f0 --- /dev/null +++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md @@ -0,0 +1,181 @@ +# Fluid 分布式版本使用指南 +本篇文章将说明如何在PaddlePaddle Fluid版本下进行分布式训练的配置和执行,以及将单机训练脚本改造成支持集群训练的版本 + +## 准备工作 +* 可用的集群 + + 包含一个或多个计算节点的集群,每一个节点都能够执行PaddlePaddle的训练任务且拥有唯一的IP地址,集群内的所有计算节点可以通过网络相互通信。 +* 安装PaddlePaddle Fluid with Distribution版本 + + 所有的计算节点上均需要按照分布式版本的PaddlePaddle, 在用于GPU等设备的机器上还需要额外安装好相应的驱动程序和CUDA的库。 + + **注意:**当前对外提供的PaddlePaddle版本并不支持分布式,需要通过源码重新编译。编译和安装方法参见[编译和安装指南](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html)。 + cmake编译命令中需要将WITH_DISTRIBUTE设置为ON,下面是一个cmake编译指令示例: +``` bash +cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON +``` + +## 更新训练脚本 +这里,我们以[Deep Learing 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)课程中的第一章 fit a line 为例,描述如何将单机训练脚本改造成支持集群训练的版本。 +### 单机训练脚本示例 +```python +import paddle.v2 as paddle +import paddle.fluid as fluid + +x = fluid.layers.data(name='x', shape=[13], dtype='float32') +y_predict = fluid.layers.fc(input=x, size=1, act=None) +y = fluid.layers.data(name='y', shape=[1], dtype='float32') + +cost = fluid.layers.square_error_cost(input=y_predict, label=y) +avg_cost = fluid.layers.mean(x=cost) + +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +sgd_optimizer.minimize(avg_cost) + +BATCH_SIZE = 20 + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) +exe = fluid.Executor(place) + +exe.run(fluid.default_startup_program()) + +PASS_NUM = 100 +for pass_id in range(PASS_NUM): + fluid.io.save_persistables(exe, "./fit_a_line.model/") + fluid.io.load_persistables(exe, "./fit_a_line.model/") + for data in train_reader(): + avg_loss_value, = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) + + if avg_loss_value[0] < 10.0: + exit(0) # if avg cost less than 10.0, we think our code is good. 
+exit(1) +``` + +我们创建了一个简单的全连接神经网络程序,并且通过Fluid的Executor执行了100次迭代,现在我们需要将该单机版本的程序更新为分布式版本的程序。 +### 介绍Parameter Server +在非分布式版本的训练脚本中,只存在Trainer一种角色,它不仅处理常规的计算任务,也处理参数相关的计算、保存和优化任务。在分布式版本的训练过程中,由于存在多个Trainer节点进行同样的数据计算任务,因此需要有一个中心化的节点来统一处理参数相关的保存和分配。在PaddlePaddle中,我们称这样的节点为[Parameter Server](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/dist_train/parameter_server.md) + +**因此,在分布式的Fluid环境中,我们有两个角色需要创建,分别是Parameter Server和Trainer。** + +### 分布式训练 +Fliud专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数,将他们分隔为两部分,通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。 +```python +optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) +``` +将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下: +```python +... #define the program, cost, and create sgd optimizer + +optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters + +t = fluid.DistributeTranspiler() # create the transpiler instance +# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers +t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + +... #create executor + +# in pserver, run this +#current_endpoint here means current pserver IP:PORT you wish to run on +pserver_prog = t.get_pserver_program(current_endpoint) +pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) +exe.run(pserver_startup) +exe.run(pserver_prog) + +# in trainer, run this +... # define data reader +exe.run(fluid.default_startup_program()) +for pass_id in range(100): + for data in train_reader(): + exe.run(t.get_trainer_program()) +``` +### 分布式训练脚本运行说明 +分布式任务的运行需要将表格中说明的多个参数进行赋值: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+| 参数名 | 值类型 | 说明 | 示例 |
+| --- | --- | --- | --- |
+| trainer_id | int | 当前训练节点的ID,训练节点ID编号为0 - n-1,n为trainers的值 | 0/1/2/3 |
+| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 |
+| trainers | int | 训练节点的总个数,>0的数字 | 4 |
+| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 |
+| training_role | str | 节点角色,TRAINER/PSERVER | PSERVER |
+ + +**注意:** ```training_role```是用来区分当前所起服务的角色的,用于训练程序中,用户可根据需要自行定义,其他参数为fluid.DistributeTranspiler的transpile函数所需要,需要在调用函数前进行定义,样例如下: + +```python +t = fluid.DistributeTranspiler() +t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver, + trainers=trainers) +if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(server_endpoint) + pserver_startup = t.get_startup_program(server_endpoint, pserver_prog) +``` + +### Demo +完整的demo代码位于Fluid的test目录下的[book](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py)中。 + +第一步,进入demo代码所在目录: +```bash +cd /paddle/python/paddle/fluid/tests/book +``` + +第二步,启动Parameter Server: +```bash +PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py +``` +执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。 + +第三步,启动Trainer: +```bash +PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IPP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py +``` +由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。 + +现在我们就启动了一个包含一个Parameter Server和两个Trainer的分布式训练任务。 diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_en.md b/doc/fluid/howto/cluster/fluid_cluster_train_en.md new file mode 100644 index 0000000000000000000000000000000000000000..b4465e8269c2e1603c02404ea33f8c4572e76442 --- /dev/null +++ b/doc/fluid/howto/cluster/fluid_cluster_train_en.md @@ -0,0 +1,153 @@ +# Fluid Distributed Training + +## Introduction + +In this article, we'll explain how to configure and run distributed training jobs with PaddlePaddle Fluid in a bare metal cluster. + +## Preparations + +### Getting the cluster ready + +Prepare the compute nodes in the cluster. Nodes in this cluster can be of any specification that runs PaddlePaddle, and with a unique IP address assigned to it. Make sure they can communicate to each other. + +### Have PaddlePaddle installed + +PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes, be sure to properly install drivers and CUDA libraries. + +PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html). + +In addition to above, the `cmake` command should be run with the option `WITH_DISTRIBUTE` set to on. An example bare minimum `cmake` command would look as follows: + +``` bash +cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON +``` + +### Update the training script + +#### Non-cluster training script + +Let's take [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)'s first chapter: "fit a line" as an example. 
+ +The non-cluster version of this demo with fluid API is as follows: + +``` python +import paddle.v2 as paddle +import paddle.fluid as fluid + +x = fluid.layers.data(name='x', shape=[13], dtype='float32') +y_predict = fluid.layers.fc(input=x, size=1, act=None) +y = fluid.layers.data(name='y', shape=[1], dtype='float32') + +cost = fluid.layers.square_error_cost(input=y_predict, label=y) +avg_cost = fluid.layers.mean(x=cost) + +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +sgd_optimizer.minimize(avg_cost) + +BATCH_SIZE = 20 + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) +exe = fluid.Executor(place) + +exe.run(fluid.default_startup_program()) + +PASS_NUM = 100 +for pass_id in range(PASS_NUM): + fluid.io.save_persistables(exe, "./fit_a_line.model/") + fluid.io.load_persistables(exe, "./fit_a_line.model/") + for data in train_reader(): + avg_loss_value, = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) + + if avg_loss_value[0] < 10.0: + exit(0) # if avg cost less than 10.0, we think our code is good. +exit(1) +``` + +We created a simple fully-connected neural network training program and handed it to the fluid executor to run for 100 passes. + +Now let's try to convert it to a distributed version to run on a cluster. + +#### Introducing parameter server + +As we can see from the non-cluster version of training script, there is only one role in the script: the trainer, that performs the computing as well as holds the parameters. In cluster training, since multi-trainers are working on the same task, they need one centralized place to hold and distribute parameters. This centralized place is called the Parameter Server in PaddlePaddle. + +![parameter server architecture](src/trainer.png) + +Parameter Server in fluid not only holds the parameters but is also assigned with a part of the program. Trainers communicate with parameter servers via send/receive OPs. For more technical details, please refer to [this document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/dist_refactor/distributed_architecture.md). + +Now we need to create programs for both: trainers and parameter servers, the question is how? + +#### Slice the program + +Fluid provides a tool called "Distributed Transpiler" that automatically converts the non-cluster program into cluster program. + +The idea behind this tool is to find the optimize OPs and gradient parameters, slice the program into 2 pieces and connect them with send/receive OP. + +Optimize OPs and gradient parameters can be found from the return values of optimizer's minimize function. + +To put them together: + +``` python +... #define the program, cost, and create sgd optimizer + +optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters + +t = fluid.DistributeTranspiler() # create the transpiler instance +# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers +t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + +... 
#create executor
+
+# in pserver, run this
+#current_endpoint here means current pserver IP:PORT you wish to run on
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
+
+# in trainer, run this
+... # define data reader
+exe.run(fluid.default_startup_program())
+for pass_id in range(100):
+    for data in train_reader():
+        exe.run(t.get_trainer_program())
+
+```
+
+### E2E demo
+
+Please find the complete demo [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
+First `cd` into the folder that contains the `python` files. In this case:
+
+```bash
+cd /paddle/python/paddle/fluid/tests/book_distribute
+```
+
+On the parameter server node, run the following in the command line:
+
+``` bash
+PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py
+```
+
+*Please note we assume that your parameter server runs at 192.168.1.2:6174.*
+
+Wait until the prompt `Server listening on 192.168.1.2:6174` appears.
+
+Then, on 2 of your trainer nodes, run this:
+
+``` bash
+PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=TRAINER python notest_dist_fit_a_line.py
+```
+
+*You need to run this command on 2 nodes because the script sets the trainer count to 2. You can change this setting on line 50.*
+
+Now you have 2 trainers and 1 parameter server up and running.
diff --git a/doc/fluid/howto/cluster/fluid_recordio.md b/doc/fluid/howto/cluster/fluid_recordio.md
new file mode 100644
index 0000000000000000000000000000000000000000..92859e8f622d0c155128821c54252113c5016989
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_recordio.md
@@ -0,0 +1,127 @@
+# How to use RecordIO in Fluid
+
+If you want to use RecordIO as your training data format, you need to convert your training data
+to RecordIO files and read them during training. PaddlePaddle Fluid provides some
+interfaces to deal with RecordIO files.
+
+## Generate RecordIO File
+
+Before starting training with RecordIO files, you need to convert your training data
+to RecordIO format with `fluid.recordio_writer.convert_reader_to_recordio_file`; the sample code
+is as follows:
+
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+                name='image', shape=[784]),
+            fluid.layers.data(
+                name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder)
+```
+
+The above code snippet would generate a RecordIO file `./mnist.recordio` on your host.
+
+**NOTE**: we recommend setting `batch_size=1` when generating the RecordIO files so that the
+batch size can be adjusted flexibly while reading them.
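+
+For reference, a self-contained version of the conversion snippet might look like the following. The import lines (`paddle`, `paddle.fluid`, `paddle.dataset.mnist`) and the helper name `build_recordio_file` are assumptions added for this sketch, since the snippet above omits them:
+
+```python
+# A minimal, self-contained sketch of the conversion step above.
+# The import paths and the helper name are assumptions, not part of the original snippet.
+import paddle
+import paddle.fluid as fluid
+import paddle.dataset.mnist as mnist
+
+def build_recordio_file(path='./mnist.recordio'):
+    reader = paddle.batch(mnist.train(), batch_size=1)  # batch_size=1, as recommended above
+    feeder = fluid.DataFeeder(
+        feed_list=[
+            fluid.layers.data(name='image', shape=[784]),
+            fluid.layers.data(name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_file(path, reader, feeder)
+
+if __name__ == '__main__':
+    build_recordio_file()
+```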
+
+## Use the RecordIO file in a Local Training Job
+
+PaddlePaddle Fluid provides an interface `fluid.layers.io.open_recordio_file` to load your RecordIO file,
+and then you can use it as a layer in your network configuration; the sample code is as follows:
+
+```python
+    data_file = fluid.layers.io.open_recordio_file(
+        filename="./mnist.recordio",
+        shapes=[(-1, 784), (-1, 1)],
+        lod_levels=[0, 0],
+        dtypes=["float32", "int32"])
+    data_file = fluid.layers.io.batch(data_file, batch_size=4)
+
+    img, label = fluid.layers.io.read_file(data_file)
+    hidden = fluid.layers.fc(input=img, size=100, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+
+    fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)
+
+    place = fluid.CPUPlace()
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    avg_loss_np = []
+
+    # train a pass
+    batch_id = 0
+    while True:
+        tmp, = exe.run(fetch_list=[avg_loss])
+
+        avg_loss_np.append(tmp)
+        print(batch_id)
+        batch_id += 1
+```
+
+## Use the RecordIO files in Distributed Training
+
+1. Generate multiple RecordIO files
+
+For a distributed training job, you may have multiple trainer nodes
+and one or more RecordIO files for each trainer node. You can use the interface
+`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data
+into multiple RecordIO files; the sample code is as follows:
+
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+                name='image', shape=[784]),
+            fluid.layers.data(
+                name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_files(
+        './mnist.recordio', 100, reader, feeder)  # filename suffix, batch_per_file, reader, feeder
+```
+
+The above code would generate multiple RecordIO files on your host like:
+
+```bash
+.
+ \_mnist-00000.recordio
+ |-mnist-00001.recordio
+ |-mnist-00002.recordio
+ |-mnist-00003.recordio
+ |-mnist-00004.recordio
+```
+
+2. Open multiple RecordIO files with `fluid.layers.io.open_files`
+
+For a distributed training job, the distributed operator system will schedule trainer processes on multiple nodes;
+each trainer process reads part of the whole training data. We usually take the following approach to make the training
+data allocated to each trainer process as uniform as possible:
+
+```python
+def gen_train_list(file_pattern, trainers, trainer_id):
+    file_list = glob.glob(file_pattern)
+    ret_list = []
+    for idx, f in enumerate(file_list):
+        if (idx + trainers) % trainers == trainer_id:
+            ret_list.append(f)
+    return ret_list
+
+trainers = int(os.getenv("PADDLE_TRAINERS"))
+trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+data_file = fluid.layers.io.open_files(
+    filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
+    thread_num=1,
+    shapes=[(-1, 784), (-1, 1)],
+    lod_levels=[0, 0],
+    dtypes=["float32", "int32"])
+img, label = fluid.layers.io.read_file(data_file)
+...
+``` diff --git a/doc/fluid/howto/cluster/nccl2_rdma_training.md b/doc/fluid/howto/cluster/nccl2_rdma_training.md new file mode 100644 index 0000000000000000000000000000000000000000..8adaf324fccb4cda7af16b9bace559c0642ae444 --- /dev/null +++ b/doc/fluid/howto/cluster/nccl2_rdma_training.md @@ -0,0 +1,110 @@ +# Distributed Training with NCCL2 and RDMA + +When doing distributed multi-GPU training, network bandwidth often becomes the +bottleneck. We introduce a way to use NCCL2 to do such training job to +achieve best performance. + +## Prepare Hardware with RDMA and Multiple GPUs + +I'm using two Linux servers each of them installed with 8 GPUs and +one 100Gb RDMA card. +Base environment is: + +* OS: CentOS 7.4 +* RDMA device: "Mellanox Technologies MT27700 Family [ConnectX-4]" +* Kernel version: `4.4.88-1.el7.elrepo.x86_64` +* Docker version: `1.12.6` +* Docker storage driver: `overlay2` +* IP addresses: 192.168.16.30,192.168.16.34 + +In general, the steps including: + +1. Install GPU drivers +1. Install RDMA drivers +1. Install "InfiniBand Support" +1. Use docker to run tests and make sure GPUs and RDMA can work inside + the container. + +I'll omit the section "Install GPU drivers" because we can find it easily +somewhere else. + +### Install RDMA drivers + +For my case, I've got two machines with device +"Mellanox Technologies MT27700 Family [ConnectX-4]" installed. The OS was +"CentOS 7.4" and I updated the kernel to version 4.4 so that docker can +work with the latest overlay2 filesystem. + +***NOTE: before you start, make sure you have a way to get a console +of the server other than ssh because we may need to re-configure the +network device.*** + +1. Go to http://www.mellanox.com/page/products_dyn?product_family=26, + download `MLNX_OFED` software in the bottom of the page, and upload it + onto the server. +1. Run `./mlnxofedinstall --add-kernel-support` in the software package. +1. Run `/etc/init.d/openibd restart` to make everything work, note that + this operation may cause the network goes down if you are using this + RDMA device as default network device and use ssh to log in the server. +1. Re-configure the network interface, for example: + `ifconfig eth2 192.168.16.30/20 up`, then add routes if needed: + `ip route add default via 192.168.16.1 dev eth2`. +1. Do the same thing on the other node. +1. Use `ping` to test if the two nodes have typical ICMP connection. +1. Use either `udaddy` or `ib_write_bw` to test the network connection is + ready and have the desired bandwidth. + +### Prepare Docker Image to Run RDMA Programs + +1. Build a docker image using cuda base image like: `nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04` and install paddlepaddle whl + package in it. +1. Start a docker container and mount GPU driver libs into it (you can + skip this step if you are using nvidia-docker). +1. Mount RDMA drivers and libs into the docker image (see below section), + also `udaddy` and `ib_write_bw` if needed. +1. Mount GPU devices and RDMA devices into the container using `--device` + or just use privileged mode `--privileged`. +1. Start the container using host network mode: `--net=host` + +### RDMA Library Files Needed + +Usually, `MLNX_OFED` install latest supported libs under +`/usr/lib64/mlnx_ofed/valgrind`. Other libs also needed to run RDMA programs +is listed below. These libs must be mounted into the docker container. 
+ +* Libs under `/usr/lib64/mlnx_ofed/valgrind` + * libibcm.so + * libibverbs.so + * libmlx4.so + * libmlx5.so + * libmlx5-rdmav2.so + * librdmacm.so +* Other libs: + * libnl-3.so.200 + * libnl-route-3.so.200 + * libnuma.so.1 + +## Start to Run the Training Job + +Setting NCCL environment variables to turn NCCL switches on and off: + + +| Env Name | Description | +| --- | --- | +| NCCL_SOCKET_IFNAME | The RDMA device, e.g. eth2 | +| NCCL_P2P_DISABLE | Set to 1 to disable P2P transfer between GPUs | +| NCCL_IB_DISABLE | Set to 1 to disable using RDMA | +| NCCL_IB_CUDA_SUPPORT | Set to 1 to enable GPU Direct if supported | +| NCCL_DEBUG | Set debug level: VERSION, WARN, INFO | + +My two servers are: `192.168.16.30,192.168.16.34`, On node 1, Run : + +```bash +PADDLE_TRAINER_ID=0 PADDLE_PORT=48372 PADDLE_WORKERS=192.168.16.30,192.168.16.34 POD_IP=192.168.16.30 stdbuf -oL python vgg16.py +``` + +On node 2, Run: + +```bash +PADDLE_TRAINER_ID=1 PADDLE_PORT=48372 PADDLE_WORKERS=192.168.16.30,192.168.16.34 POD_IP=192.168.16.34 stdbuf -oL python vgg16.py +``` diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..b57af64f44da82926c4862578f3072960ca5aa92 --- /dev/null +++ b/doc/fluid/howto/index_cn.rst @@ -0,0 +1,8 @@ +进阶使用 +------------ + +.. toctree:: + :maxdepth: 1 + + inference/index_cn.rst + optimization/index_cn.rst diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..fd21e167ce3a46da167db1e9d7013804f730e047 --- /dev/null +++ b/doc/fluid/howto/index_en.rst @@ -0,0 +1,7 @@ +HOW TO +------------ + +.. toctree:: + :maxdepth: 1 + + optimization/index_en.rst diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..91357dd8c8da19f2f33c6f285ed7eb234428b1ab --- /dev/null +++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst @@ -0,0 +1,97 @@ +安装与编译C++预测库 +=========================== + +直接下载安装 +------------- + +====================== ======================================== +版本说明 C++预测库 +====================== ======================================== +cpu_avx_mkl `fluid.tgz `_ +cpu_avx_openblas `fluid.tgz `_ +cpu_noavx_openblas `fluid.tgz `_ +cuda7.5_cudnn5_avx_mkl `fluid.tgz `_ +cuda8.0_cudnn5_avx_mkl `fluid.tgz `_ +cuda8.0_cudnn7_avx_mkl `fluid.tgz `_ +cuda9.0_cudnn7_avx_mkl `fluid.tgz `_ +====================== ======================================== + +从源码编译 +---------- +用户也可以从 PaddlePaddle 核心代码编译C++预测库,只需在编译时配制下面这些编译选项: + +================= ========= +选项 值 +================= ========= +CMAKE_BUILD_TYPE Release +FLUID_INSTALL_DIR 安装路径 +WITH_FLUID_ONLY ON(推荐) +WITH_SWIG_PY OFF(推荐 +WITH_PYTHON OFF(推荐) +WITH_GPU ON/OFF +WITH_MKL ON/OFF +================= ========= + +建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 + +下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): + + .. code-block:: bash + + pip install paddlepaddle-gpu + PADDLE_ROOT=/path/of/capi + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + mkdir build + cd build + cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_FLUID_ONLY=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_PYTHON=OFF \ + -DWITH_MKL=OFF \ + -DWITH_GPU=OFF \ + .. + make + make inference_lib_dist + +成功编译后,使用C++预测库所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件;(3)版本信息与编译选项信息) +均会存放于PADDLE_ROOT目录中。目录结构如下: + + .. 
code-block:: text + + PaddleRoot/ + ├── CMakeCache.txt + ├── paddle + │   └── fluid + │   ├── framework + │   ├── inference + │   ├── memory + │   ├── platform + │   ├── pybind + │   └── string + ├── third_party + │   ├── boost + │   │   └── boost + │   ├── eigen3 + │   │   ├── Eigen + │   │   └── unsupported + │   └── install + │   ├── gflags + │   ├── glog + │   ├── mklml + │   ├── protobuf + │   ├── snappy + │   ├── snappystream + │   └── zlib + └── version.txt + +version.txt 中记录了该预测库的版本信息,包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号,如: + + .. code-block:: text + + GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8 + WITH_MKL: ON + WITH_GPU: ON + CUDA version: 8.0 + CUDNN version: v5 diff --git a/doc/fluid/howto/inference/index_cn.rst b/doc/fluid/howto/inference/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..a903423548decd0992bf19772fb2cb143f6a12b5 --- /dev/null +++ b/doc/fluid/howto/inference/index_cn.rst @@ -0,0 +1,8 @@ +预测库 +------------ + +.. toctree:: + :maxdepth: 1 + + build_and_install_lib_cn.rst + inference_support_in_fluid_cn.md diff --git a/doc/fluid/howto/inference/inference_support_in_fluid_cn.md b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..309b17fccd5c461c9c22beb64eb4c6792b7e4a7a --- /dev/null +++ b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md @@ -0,0 +1,304 @@ +# 使用指南 + +## 目录: + +- Python Inference API +- Inference C++ API +- Inference实例 +- Inference计算优化 + +## Python Inference API **[改进中]** +- 保存Inference模型 ([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L295)) + + ```python + def save_inference_model(dirname, + feeded_var_names, + target_vars, + executor, + main_program=None, + model_filename=None, + params_filename=None): + ``` + Inference模型和参数将会保存到`dirname`目录下: + - 序列化的模型 + - `model_filename`为`None`,保存到`dirname/__model__` + - `model_filename`非`None`,保存到`dirname/model_filename` + - 参数 + - `params_filename`为`None`,单独保存到各个独立的文件,各文件以参数变量的名字命名 + - `params_filename`非`None`,保存到`dirname/params_filename` + +- 两种存储格式 + - 参数保存到各个独立的文件 + - 如,设置`model_filename`为`None`、`params_filename`为`None` + + ```bash + $ cd recognize_digits_conv.inference.model + $ ls + $ __model__ batch_norm_1.w_0 batch_norm_1.w_2 conv2d_2.w_0 conv2d_3.w_0 fc_1.w_0 batch_norm_1.b_0 batch_norm_1.w_1 conv2d_2.b_0 conv2d_3.b_0 fc_1.b_0 + ``` + - 参数保存到同一个文件 + - 如,设置`model_filename`为`None`、`params_filename`为`__params__` + + ```bash + $ cd recognize_digits_conv.inference.model + $ ls + $ __model__ __params__ + ``` +- 加载Inference模型([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L380)) + ```python + def load_inference_model(dirname, + executor, + model_filename=None, + params_filename=None): + ... 
+ return [program, feed_target_names, fetch_targets] + ``` + +## 链接Fluid Inference库 +- 示例项目([链接](https://github.com/luotao1/fluid_inference_example.git)) + + - GCC配置 + ```bash + $ g++ -o a.out -std=c++11 main.cc \ + -I${PADDLE_ROOT}/ \ + -I${PADDLE_ROOT}/third_party/install/gflags/include \ + -I${PADDLE_ROOT}/third_party/install/glog/include \ + -I${PADDLE_ROOT}/third_party/install/protobuf/include \ + -I${PADDLE_ROOT}/third_party/eigen3 \ + -L${PADDLE_ROOT}/paddle/fluid/inference -lpaddle_fluid \ + -lrt -ldl -lpthread + ``` + + - CMake配置 + ```cmake + include_directories(${PADDLE_ROOT}/) + include_directories(${PADDLE_ROOT}/third_party/install/gflags/include) + include_directories(${PADDLE_ROOT}/third_party/install/glog/include) + include_directories(${PADDLE_ROOT}/third_party/install/protobuf/include) + include_directories(${PADDLE_ROOT}/third_party/eigen3) + target_link_libraries(${TARGET_NAME} + ${PADDLE_ROOT}/paddle/fluid/inference/libpaddle_fluid.so + -lrt -ldl -lpthread) + ``` + + - 设置环境变量: + `export LD_LIBRARY_PATH=${PADDLE_ROOT}/paddle/fluid/inference:$LD_LIBRARY_PATH` + + + +## C++ Inference API + +- 推断流程([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_helper.h#L91)) + + - 1、 初始化设备 + ```cpp + #include "paddle/fluid/framework/init.h" + paddle::framework::InitDevices(false); + ``` + + - 2、 定义place,executor,scope + ```cpp + auto place = paddle::platform::CPUPlace(); + auto executor = paddle::framework::Executor(place); + auto* scope = new paddle::framework::Scope(); + ``` + + - 3、 加载模型 + ```cpp + #include "paddle/fluid/inference/io.h" + auto inference_program = paddle::inference::Load(executor, *scope, dirname); + // or + auto inference_program = paddle::inference::Load(executor, + *scope, + dirname + "/" + model_filename, + dirname + "/" + params_filename); + ``` + + - 4、 获取`feed_target_names`和`fetch_target_names` + ```cpp + const std::vector& feed_target_names = inference_program->GetFeedTargetNames(); + const std::vector& fetch_target_names = inference_program->GetFetchTargetNames(); + ``` + + - 5、 准备`feed`数据 + ```cpp + #include "paddle/fluid/framework/lod_tensor.h" + std::vector cpu_feeds; + ... + std::map feed_targets; + for (size_t i = 0; i < feed_target_names.size(); ++i) { + // Please make sure that cpu_feeds[i] is right for feed_target_names[i] + feed_targets[feed_target_names[i]] = cpu_feeds[i]; + } + ``` + + - 6、 定义`Tensor`来`fetch`结果 + ```cpp + std::vector cpu_fetchs; + std::map fetch_targets; + for (size_t i = 0; i < fetch_target_names.size(); ++i) { + fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; + } + ``` + + - 7、 执行`inference_program` + ```cpp + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + ``` + + - 8、 使用`fetch`数据 + ```cpp + for (size_t i = 0; i < cpu_fetchs.size(); ++i) { + std::cout << "lod_i: " << cpu_fetchs[i]->lod(); + std::cout << "dims_i: " << cpu_fetchs[i]->dims(); + std::cout << "result:"; + float* output_ptr = cpu_fetchs[i]->data(); + for (int j = 0; j < cpu_fetchs[i]->numel(); ++j) { + std::cout << " " << output_ptr[j]; + } + std::cout << std::endl; + } + ``` + 针对不同的数据,4. 
- 8.可执行多次。 + + - 9、 释放内存 + ```cpp + delete scope; + ``` + + +- 接口说明 + + ```cpp + void Run(const ProgramDesc& program, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + bool create_vars = true, + const std::string& feed_holder_name = "feed", + const std::string& fetch_holder_name = "fetch"); + ``` + - 使用Python API `save_inference_model`保存的`program`里面包含了`feed_op`和`fetch_op`,用户提供的`feed_targets`、`fetch_targets`必须和`inference_program`中的`feed_op`、`fetch_op`保持一致。 + - 用户提供的`feed_holder_name`和`fetch_holder_name`也必须和`inference_program`中`feed_op`、`fetch_op`保持一致,可使用`SetFeedHolderName`和`SetFetchHolderName`接口重新设置`inferece_program` + - 默认情况下,除了`persistable`属性设置为`True`的`Variable`之外,每次执行`executor.Run`会创建一个局部`Scope`,并且在这个局部`Scope`中创建和销毁所有的`Variable`,以最小化空闲时的内存占用。 + - `persistable`属性为`True`的`Variable`有: + - Operators的参数`w`、`b`等 + - `feed_op`的输入变量 + - `fetch_op`的输出变量 + + +- **不在每次执行时创建和销毁变量 + ([PR](https://github.com/PaddlePaddle/Paddle/pull/9301))** + - 执行`inference_program` + ```cpp + // Call once + executor.CreateVariables(*inference_program, scope, 0); + // Call as many times as you like + executor.Run( + *inference_program, scope, feed_targets, fetch_targets, false); + ``` + - **优点** + - 节省了频繁创建、销毁变量的时间(约占每次`Run`总时间的1% ~ 12%) + - 执行结束后可获取所有Operators的计算结果 + - **缺点** + - 空闲时也会占用大量的内存 + - 在同一个`Scope`中,相同的变量名是公用同一块内存的,容易引起意想不到的错误 + + +- **不在每次执行时创建Op([PR](https://github.com/PaddlePaddle/Paddle/pull/9630))** + - 执行`inference_program` + ```cpp + // Call once + auto ctx = executor.Prepare(*inference_program, 0); + // Call as many times as you like if you have no need to change the inference_program + executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets); + ``` + - **优点** + - 节省了频繁创建、销毁Op的时间 + - **缺点** + - 一旦修改了`inference_program`,则需要重新创建`ctx` + + +- **多线程共享Parameters([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_multi_thread_helper.h))** + - 主线程 + - 1、 初始化设备 + - 2、 定义`place`,`executor`,`scope` + - 3、 加载模型,得到`inference_program` + - 从线程 + - **复制`inference_program`得到`copy_program`,修改`copy_program`的`feed_holder_name`和`fetch_holder_name`** + ```cpp + auto copy_program = std::unique_ptr( + new paddle::framework::ProgramDesc(*inference_program)); + std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id); + std::string fetch_holder_name = "fetch_" + paddle::string::to_string(thread_id); + copy_program->SetFeedHolderName(feed_holder_name); + copy_program->SetFetchHolderName(fetch_holder_name); + ``` + - 4、 获取`copy_program`的`feed_target_names`和`fetch_target_names` + - 5、 准备feed数据,定义Tensor来fetch结果 + - 6、 执行`copy_program` + ```cpp + executor->Run(*copy_program, scope, feed_targets, fetch_targets, true, feed_holder_name, fetch_holder_name); + ``` + - 7、 使用fetch数据 + - 主线程 + - 8、 释放资源 + + +- 基本概念 + - 数据相关: + - [Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor.md),一个N维数组,数据可以是任意类型(int,float,double等) + - [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md),带LoD(Level-of-Detail)即序列信息的Tensor + - [Scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md),记录了变量Variable + - 执行相关: + - [Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md),无状态执行器,只跟设备相关 + - Place + - CPUPlace,CPU设备 + - CUDAPlace,CUDA GPU设备 + - 神经网络表示: + - [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). 
+ + 详细介绍请参考[**Paddle Fluid开发者指南**](https://github.com/lcy-seso/learning_notes/blob/master/Fluid/developer's_guid_for_Fluid/Developer's_Guide_to_Paddle_Fluid.md) + + + +## Inference实例 + + 1. fit a line: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc) + 1. image classification: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_image_classification.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_image_classification.cc) + 1. label semantic roles: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc) + 1. recognize digits: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc) + 1. recommender system: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recommender_system.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc) + 1. understand sentiment: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_understand_sentiment.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc) + 1. word2vec: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_word2vec.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_word2vec.cc) + + +## Inference计算优化 +- 使用Python推理优化工具([inference_transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/inference_transpiler.py)) + ```python + class InferenceTranspiler: + def transpile(self, program, place, scope=None): + ... + if scope is None: + scope = global_scope() + ... + ``` + - 使用`InferenceTranspiler`将会直接修改`program`。 + - 使用`InferenceTranspiler`会修改参数的值,请确保`program`的参数在`scope`内。 +- 支持的优化 + - 融合batch_norm op的计算 +- 使用示例([链接](https://github.com/Xreki/Xreki.github.io/blob/master/fluid/inference/inference_transpiler.py)) + ```python + import paddle.fluid as fluid + # NOTE: Applying the inference transpiler will change the inference_program. + t = fluid.InferenceTranspiler() + t.transpile(inference_program, place, inference_scope) + ``` + + + + +## 内存使用优化 +- 使用Python内存优化工具([memory_optimization_transipiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/memory_optimization_transpiler.py)) + ```python + fluid.memory_optimize(inference_program) + ``` diff --git a/doc/fluid/howto/optimization/benchmark/index_cn.rst b/doc/fluid/howto/optimization/benchmark/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..9404800eb86ca6d27886258b67393028c76954dc --- /dev/null +++ b/doc/fluid/howto/optimization/benchmark/index_cn.rst @@ -0,0 +1,8 @@ +基准 +------------ + +.. 
toctree:: + :maxdepth: 1 + + vgg16/README.md + README.md diff --git a/doc/fluid/howto/optimization/benchmark/index_en.rst b/doc/fluid/howto/optimization/benchmark/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..1e200b660cc7f6aeaf8b3d94fd7a14999a52bccd --- /dev/null +++ b/doc/fluid/howto/optimization/benchmark/index_en.rst @@ -0,0 +1,8 @@ +Benchmark +------------ + +.. toctree:: + :maxdepth: 1 + + vgg16/README.md + README.md diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..198a05a79e19227e90eaafe116217a164cd51a7d --- /dev/null +++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md @@ -0,0 +1,183 @@ +# CPU性能调优 + +此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优(performance tuning)。 + +Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。 + +PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分: + +* Python 代码的性能分析 +* Python 与 C++ 混合代码的性能分析 + + +## Python代码的性能分析 + +### 生成性能分析文件 + +Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下: + +```bash +python -m cProfile -o profile.out main.py +``` + +其中 `main.py` 是我们要分析的程序,`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印到标准输出。 + +### 查看性能分析文件 + +`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来: + +```bash +cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py +``` + +其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。 + +用Web浏览器访问对应网址,即可显示性能分析的结果: + +``` + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.284 0.284 29.514 29.514 main.py:1() + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run) + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() +``` + +每一列的含义是: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+| 列名 | 含义 |
+| --- | --- |
+| ncalls | 函数的调用次数 |
+| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
+| percall | tottime的每次调用平均时间 |
+| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
+| percall | cumtime的每次调用平均时间 |
+| filename:lineno(function) | 文件名,行号,函数名 |
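+
+如果不方便启动 `cprofilev` 的HTTP服务,也可以直接使用Python标准库中的 `pstats` 模块在命令行查看同一份分析结果。下面是一个最小示例(仅作演示,假设分析文件名为 `profile.out`):
+
+```python
+import pstats
+
+# 读取 cProfile 生成的分析文件,按 tottime 排序并打印最耗时的前 10 条记录
+stats = pstats.Stats('profile.out')
+stats.sort_stats('tottime').print_stats(10)
+```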
+ + +### 寻找性能瓶颈 + +通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。 + +将性能分析结果按照tottime排序,效果如下: + +```text + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1() +``` + +可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 + +```text +Called By: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> + +Function was called by... + ncalls tottime cumtime +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward) + + +Called: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> +``` + +通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。 + + + +## Python与C++混合代码的性能分析 + +### 生成性能分析文件 + +C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析 + +使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为 + +```bash +apt update +apt install libgoogle-perftools-dev +pip install yep +``` + +安装完毕后,我们可以通过 + +```bash +python -m yep -v main.py +``` + +生成性能分析文件。生成的性能分析文件为`main.py.prof`。 + +命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施: + +1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。 +2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 +3. 
运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 + +### 查看性能分析文件 + +在运行完性能分析后,会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。 + +安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下: + +```bash +go get github.com/google/pprof +``` + +进而我们可以使用如下命令开启一个HTTP服务: + +```bash +pprof -http=0.0.0.0:3213 `which python` ./main.py.prof +``` + +这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。 + +访问对应的网址,我们可以查看性能分析的结果。结果如下图所示: + +![result](./pprof_1.png) + + +### 寻找性能瓶颈 + +与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。 + +例如下图中, + +![kernel_perf](./pprof_2.png) + +在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。 + +在`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。 diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md new file mode 100644 index 0000000000000000000000000000000000000000..216694965b3c878a8a5f3ccd2a0cba8d21d9ce05 --- /dev/null +++ b/doc/fluid/howto/optimization/cpu_profiling_en.md @@ -0,0 +1,224 @@ +# Tune CPU performance + +This tutorial introduces techniques we use to profile and tune the +CPU performance of PaddlePaddle. We will use Python packages +`cProfile` and `yep`, and Google's `perftools`. + +Profiling is the process that reveals performance bottlenecks, +which could be very different from what's in the developers' mind. +Performance tuning is done to fix these bottlenecks. Performance optimization +repeats the steps of profiling and tuning alternatively. + +PaddlePaddle users program AI applications by calling the Python API, which calls +into `libpaddle.so.` written in C++. In this tutorial, we focus on +the profiling and tuning of + +1. the Python code and +1. the mixture of Python and C++ code. + +## Profiling the Python Code + +### Generate the Performance Profiling File + +We can use Python standard +package, [`cProfile`](https://docs.python.org/2/library/profile.html), +to generate Python profiling file. For example: + +```bash +python -m cProfile -o profile.out main.py +``` + +where `main.py` is the program we are going to profile, `-o` specifies +the output file. Without `-o`, `cProfile` would outputs to standard +output. + +### Look into the Profiling File + +`cProfile` generates `profile.out` after `main.py` completes. We can +use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into +the details: + +```bash +cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py +``` + +where `-a` specifies the HTTP IP, `-p` specifies the port, `-f` +specifies the profiling file, and `main.py` is the source file. + +Open the Web browser and points to the local IP and the specifies +port, we will see the output like the following: + +``` + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.284 0.284 29.514 29.514 main.py:1() + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run) + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() +``` + +where each line corresponds to Python function, and the meaning of +each column is as follows: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+| column | meaning |
+| --- | --- |
+| ncalls | the number of calls into a function |
+| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
+| percall | tottime divided by ncalls |
+| cumtime | the total execution time of the function, including the execution time of other functions being called |
+| percall | cumtime divided by ncalls |
+| filename:lineno(function) | where the function is defined |
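+
+If you only need a quick look from the command line, or want to profile a specific region of the script, the standard-library `cProfile` and `pstats` modules can also be used directly. The snippet below is a minimal, generic sketch; `workload()` is a placeholder for whatever code you want to measure and is not part of PaddlePaddle:
+
+```python
+import cProfile
+import pstats
+
+def workload():
+    # placeholder for the code under test, e.g. a few training iterations
+    return sum(i * i for i in range(1000000))
+
+profiler = cProfile.Profile()
+profiler.enable()
+workload()
+profiler.disable()
+
+# sort by tottime and print the 10 most expensive entries
+pstats.Stats(profiler).sort_stats('tottime').print_stats(10)
+```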
+ +### Identify Performance Bottlenecks + +Usually, `tottime` and the related `percall` time is what we want to +focus on. We can sort above profiling file by tottime: + +```text + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1() +``` + +We can see that the most time-consuming function is the `built-in +method run`, which is a C++ function in `libpaddle.so`. We will +explain how to profile C++ code in the next section. At this +moment, let's look into the third function `sync_with_cpp`, which is a +Python function. We can click it to understand more about it: + +``` +Called By: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> + +Function was called by... + ncalls tottime cumtime +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward) + + +Called: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> +``` + +The lists of the callers of `sync_with_cpp` might help us understand +how to improve the function definition. + +## Profiling Python and C++ Code + +### Generate the Profiling File + +To profile a mixture of Python and C++ code, we can use a Python +package, `yep`, that can work with Google's `perftools`, which is a +commonly-used profiler for C/C++ code. + +In Ubuntu systems, we can install `yep` and `perftools` by running the +following commands: + +```bash +apt update +apt install libgoogle-perftools-dev +pip install yep +``` + +Then we can run the following command + +```bash +python -m yep -v main.py +``` + +to generate the profiling file. The default filename is +`main.py.prof`. + +Please be aware of the `-v` command line option, which prints the +analysis results after generating the profiling file. By examining the + the print result, we'd know that if we stripped debug +information from `libpaddle.so` at build time. The following hints +help make sure that the analysis results are readable: + +1. Use GCC command line option `-g` when building `libpaddle.so` so to + include the debug information. The standard building system of + PaddlePaddle is CMake, so you might want to set + `CMAKE_BUILD_TYPE=RelWithDebInfo`. + +1. Use GCC command line option `-O2` or `-O3` to generate optimized + binary code. It doesn't make sense to profile `libpaddle.so` + without optimization, because it would anyway run slowly. + +1. Profiling the single-threaded binary file before the + multi-threading version, because the latter often generates tangled + profiling analysis result. 
You might want to set environment + variable `OMP_NUM_THREADS=1` to prevents OpenMP from automatically + starting multiple threads. + +### Examining the Profiling File + +The tool we used to examine the profiling file generated by +`perftools` is [`pprof`](https://github.com/google/pprof), which +provides a Web-based GUI like `cprofilev`. + +We can rely on the standard Go toolchain to retrieve the source code +of `pprof` and build it: + +```bash +go get github.com/google/pprof +``` + +Then we can use it to profile `main.py.prof` generated in the previous +section: + +```bash +pprof -http=0.0.0.0:3213 `which python` ./main.py.prof +``` + +Where `-http` specifies the IP and port of the HTTP service. +Directing our Web browser to the service, we would see something like +the following: + +![result](./pprof_1.png) + +### Identifying the Performance Bottlenecks + +Similar to how we work with `cprofilev`, we'd focus on `tottime` and +`cumtime`. + +![kernel_perf](./pprof_2.png) + +We can see that the execution time of multiplication and the computing +of the gradient of multiplication takes 2% to 4% of the total running +time, and `MomentumOp` takes about 17%. Obviously, we'd want to +optimize `MomentumOp`. + +`pprof` would mark performance critical parts of the program in +red. It's a good idea to follow the hints. diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..7fb0883dd937465d15479b29df95078edb50e069 --- /dev/null +++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md @@ -0,0 +1,89 @@ +# 堆内存分析和优化 + +计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放,随着程序的运行占用的内存越来越大,一方面会影响程序的稳定性,可能让运行速度越来越慢,或者造成oom,甚至会影响运行程序的机器的稳定性,造成宕机。 + + +目前有很多内存泄漏分析工具,比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro), [gperftools](https://gperftools.github.io/gperftools/)。 + +因为Fluid是用Python驱动C++ core来运行,valgrind直接分析非常困难,需要自己编译debug版本的、带valgrind支持的专用Python版本,而且输出的信息中大部分是Python自己的符号和调用信息,分析起来很困难,另外使用valgrind会让程序运行速度变得非常慢,所以不建议使用。 + +本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。 + +gperftool主要支持以下四个功能: + +- thread-caching malloc +- heap-checking using tcmalloc +- heap-profiling using tcmalloc +- CPU profiler + +Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。 + +对于堆内存的分析,主要用到thread-caching malloc和heap-profiling using tcmalloc。 + +## 环境 + +本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev,基于Ubuntu 16.04.4 LTS环境。 + +## 使用流程 + +- 安装google-perftools + +``` +apt-get install libunwind-dev +apt-get install google-perftools +``` + +- 安装pprof + +``` +go get -u github.com/google/pprof +``` + +- 设置运行环境 + +``` +export PPROF_PATH=/root/gopath/bin/pprof +export PPROF_BINARY_PATH=/root/gopath/bin/pprof +export LD_PRELOAD=/usr/lib/libtcmalloc.so.4 +``` + +- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。 + +``` +# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀 +# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少存储dump一次dump,默认1GB +env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py +``` + +随着程序的运行,会在perf_log这个文件夹下生成很多文件,如下: + +``` +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0001.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0002.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0003.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0004.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 
test.log.0005.heap +-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0006.heap +``` + +- 使用pprof对heap文件进行分析。分析有两种模式: + - 完整模式。会对当前heap做一个分析,显示目前分配内存一些调用路径。 + + ``` + pprof --pdf python test.log.0012.heap + ``` + 上述命令会生成一个profile00x.pdf的文件,可以直接打开,例如:[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出,在CPU版本fluid的运行过程中,分配存储最多的模块式CPUAllocator. 而别的模块相对而言分配内存较少,所以被忽略了,这对于分配内存泄漏是很不方便的,因为泄漏是一个缓慢的过程,在这种图中是无法看到的。 + + ![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png) + + - Diff模式。可以对两个时刻的heap做diff,把一些内存分配没有发生变化的模块去掉,而把增量部分显示出来。 + ``` + pprof --pdf --base test.log.0010.heap python test.log.1045.heap + ``` + 生成的结果为:[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf) + + 从图中可以看出:ProgramDesc这个结构,在两个版本之间增长了200MB+,所以这里有很大的内存泄漏的可能性,最终结果也确实证明是这里造成了泄漏。 + + ![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png) + ![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png) + diff --git a/doc/fluid/howto/optimization/index_cn.rst b/doc/fluid/howto/optimization/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..27cc96702356703b339db845dc81913bdcc9f23b --- /dev/null +++ b/doc/fluid/howto/optimization/index_cn.rst @@ -0,0 +1,9 @@ +性能优化 +------------ + +.. toctree:: + :maxdepth: 1 + + timeline.md + cpu_profiling_cn.md + benchmark/index_cn.rst diff --git a/doc/fluid/howto/optimization/index_en.rst b/doc/fluid/howto/optimization/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..4ce624fe8f108a6afc7cd08a1542332755d22e04 --- /dev/null +++ b/doc/fluid/howto/optimization/index_en.rst @@ -0,0 +1,9 @@ +Performance Optimization +--------------------------- + +.. toctree:: + :maxdepth: 1 + + timeline.md + cpu_profiling_en.md + benchmark/index_en.rst diff --git a/doc/fluid/howto/optimization/pprof_1.png b/doc/fluid/howto/optimization/pprof_1.png new file mode 100644 index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb Binary files /dev/null and b/doc/fluid/howto/optimization/pprof_1.png differ diff --git a/doc/fluid/howto/optimization/pprof_2.png b/doc/fluid/howto/optimization/pprof_2.png new file mode 100644 index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b Binary files /dev/null and b/doc/fluid/howto/optimization/pprof_2.png differ diff --git a/doc/fluid/howto/optimization/timeline.jpeg b/doc/fluid/howto/optimization/timeline.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..38ec3f80c982857531f30a8bb0fa26ea5bf05385 Binary files /dev/null and b/doc/fluid/howto/optimization/timeline.jpeg differ diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..9f9303f46770a18e79dc2874ae0ca8123113d60d --- /dev/null +++ b/doc/fluid/howto/optimization/timeline_cn.md @@ -0,0 +1,53 @@ +# timeline工具简介 + +## 本地使用 + +1. 
+在训练的主循环外加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。
+
+    **提示:**
+    请不要在timeline记录信息时运行太多次迭代,因为timeline中的记录数量和迭代次数是成正比的。
+
+    ```python
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
+            ...
+    ```
+
+1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。
+```python
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
+1. 打开chrome浏览器,访问 `chrome://tracing/` ,用`load`按钮来加载生成的`timeline`文件。
+
+    ![chrome tracing](./tracing.jpeg)
+
+1. 结果如下图所示,可以放大来查看timeline的细节信息。
+
+    ![chrome timeline](./timeline.jpeg)
+
+## 分布式使用
+一般来说,分布式的训练程序都会有两种程序:pserver和trainer。我们提供了把pserver和trainer的profile日志用timeline来显示的方式。
+
+1. trainer打开方式与[本地使用](#local)部分的第1步相同。
+
+1. pserver可以通过加两个环境变量打开profile,例如:
+```
+FLAGS_rpc_server_profile_period=10 FLAGS_rpc_server_profile_path=./tmp/pserver python train.py
+```
+
+3. 把pserver和trainer的profile文件生成一个timeline文件,例如:
+```
+python /paddle/tools/timeline.py
+    --profile_path trainer0=local_profile_10_pass0_0,trainer1=local_profile_10_pass0_1,pserver0=./pserver_0,pserver1=./pserver_1
+    --timeline_path ./dist.timeline
+```
+
+4. 在chrome中加载dist.timeline文件,方法和[本地使用](#local)第4步相同。
diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..673452efe0ef4e77081db74da04a9ba15fc0a59d
--- /dev/null
+++ b/doc/fluid/howto/optimization/timeline_en.md
@@ -0,0 +1,53 @@
+# How to use the timeline tool to profile
+
+## Local
+
+1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, because the profile record grows with the number of batches.
+
+    ```python
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
+            ...
+    ```
+
+1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`; it will generate another
+file, `/tmp/timeline`, by default. You can change the path with a command line parameter; please take a look at
+[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
+```python
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
+1. Open Chrome, visit `chrome://tracing/`, and use the `load` button to load the generated `timeline` file.
+
+    ![chrome tracing](./tracing.jpeg)
+
+1. The resulting timeline should look like the following:
+
+
+    ![chrome timeline](./timeline.jpeg)
+
+## Distributed
+This tool supports distributed training programs (pserver and trainer) too.
+
+1. Enable the trainer profiler just as in step 1 of the [local](#local) section.
+
+1. Open the pserver profiler by adding two environment variables, e.g.:
+```
+FLAGS_rpc_server_profile_period=10 FLAGS_rpc_server_profile_path=./tmp/pserver python train.py
+```
+
+1. Merge the pservers' and trainers' profile files into one timeline file, e.g.:
+```
+python /paddle/tools/timeline.py
+    --profile_path trainer0=local_profile_10_pass0_0,trainer1=local_profile_10_pass0_1,pserver0=./pserver_0,pserver1=./pserver_1
+    --timeline_path ./dist.timeline
+```
+
+1. Load `dist.timeline` in chrome://tracing, as in the local case.
diff --git a/doc/fluid/howto/optimization/tracing.jpeg b/doc/fluid/howto/optimization/tracing.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..3a49fc4f8a401a9463b0157e2f38c164ca02dcc5
Binary files /dev/null and b/doc/fluid/howto/optimization/tracing.jpeg differ
diff --git a/doc/fluid/howto/performance/error_clip.md b/doc/fluid/howto/performance/error_clip.md
new file mode 100644
index 0000000000000000000000000000000000000000..749cf7693c75696feb17f8556224ed03649baa80
--- /dev/null
+++ b/doc/fluid/howto/performance/error_clip.md
@@ -0,0 +1,92 @@
+# Error Clip
+
+## Overview
+
+Error clip is widely used in model training to prevent gradients from exploding. It applies specific rules to adjust variables' gradients and prevent them from becoming too large. With it, values of a gradient will be checked before they are taken by the next `grad_op` and shrunk if necessary.
+
+## Usage
+
+Users are allowed to assign different error clip methods or attributes to different `Variable`s. Users can specify it as a parameter of `Variable`'s constructor:
+
+```python
+var = framework.Variable(..., error_clip=myErrorClip, ...)
+```
+
+The default value of `error_clip` is `None`, which means no error clip is employed. When it's not `None`, it should be an object of a class derived from `BaseErrorClipAttr`. So far, `BaseErrorClipAttr` has only one derived class: `ErrorClipByValue`, whose constructor is:
+
+```python
+ErrorClipByValue(max, min=None)
+```
+
+`max` and `min` represent the maximal and minimal clip thresholds respectively. In the backward pass, all values of `var`'s gradient greater than `max` or less than `min` will be clipped to `max` and `min` respectively. When `min` is `None`, the minimal threshold will be assigned `-max` automatically.
+
+So we can enable error clip with the threshold `[-5.0, 5.0]` for variable `var` by:
+
+```python
+var = framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
+```
+
+## Implementation
+
+The `BaseErrorClipAttr` and its derived class `ErrorClipByValue` are defined in *clip.py*.
+
+```python
+class BaseErrorClipAttr(object):
+    def append_clip_op(self, block, grad_name):
+        raise NotImplementedError()
+
+
+class ErrorClipByValue(BaseErrorClipAttr):
+    def __init__(self, max, min=None):
+        max = float(max)
+        if min is None:
+            min = -max
+        else:
+            min = float(min)
+        self.max = max
+        self.min = min
+
+    def append_clip_op(self, block, grad_name):
+        clip_op_desc = block.desc.append_op()
+        clip_op_desc.set_type("clip")
+        clip_op_desc.set_input("X", [grad_name])
+        clip_op_desc.set_output("Out", [grad_name])
+        clip_op_desc.set_attr("min", self.min)
+        clip_op_desc.set_attr("max", self.max)
+```
+
+`BaseErrorClipAttr` has one main member function: `append_clip_op(self, block, grad_name)`.
+
+This function is used to create a `clip_op` and append it to the end of the given `block`. Because different error clip algorithms require different `clip_op`s, the function is declared as abstract in the base class, and all derived classes must implement their own versions of it.
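+
+The `clip_op` appended by `ErrorClipByValue` simply clamps the gradient elementwise into `[min, max]`. For intuition, here is a minimal NumPy sketch of that semantics; it only illustrates the behavior described above and is not the actual `clip` kernel:
+
+```python
+# Illustrative sketch of what the appended clip op computes on a gradient:
+# every element is clamped into [min, max] before the next grad_op consumes it.
+import numpy as np
+
+
+def error_clip_by_value(grad, max_val, min_val=None):
+    if min_val is None:
+        min_val = -max_val  # mirrors ErrorClipByValue's default for `min`
+    return np.clip(grad, min_val, max_val)
+
+
+grad = np.array([-7.5, -0.3, 0.0, 2.4, 9.1])
+print(error_clip_by_value(grad, max_val=5.0))  # -> [-5.  -0.3  0.   2.4  5. ]
+```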
+
+These `clip_op`s should be inserted after `grad_op`s whose output gradients need to be clipped. It is equivalent to appending some `clip_op`s to the end of the target block every time a new `grad_op` is added.
+
+```python
+for op_desc in grad_op_descs:
+    new_op_desc = target_block.desc.append_op()
+    new_op_desc.copy_from(op_desc)
+    callback(block=target_block, context=grad_to_var)
+```
+
+Here we employ a callback function to complete this kind of job. In the `_append_backward_ops_` function, each time a `grad_op` is added to the `target_block`, a callback function is invoked. The logic of `clip_op` appending can be implemented inside the callback function.
+
+The callback function for `clip_op` appending is defined in *clip.py*:
+
+```python
+def error_clip_callback(block, context):
+    # the context is a grad_to_var map
+    grad_to_var = context
+    op_desc = block.desc.op(block.desc.op_size() - 1)
+    for grad_n in filter(lambda n: grad_to_var.has_key(n),
+                         op_desc.output_arg_names()):
+        fwd_var = block.__var_recursive(grad_to_var[grad_n])
+        error_clip = getattr(fwd_var, "error_clip", None)
+        if not (error_clip is None or isinstance(error_clip,
+                                                 BaseErrorClipAttr)):
+            raise TypeError(
+                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
+            )
+        if error_clip is not None:
+            error_clip.append_clip_op(block, grad_n)
+```
+
+This function takes a `block` and a `context` (which is actually a grad\_to\_var map) as inputs. It checks each output of the last `OpDesc` in the `block`. Notice that the last `OpDesc` of the `block` must be a `grad_op`, and its outputs must be some forward variables' gradients. If an output gradient's corresponding forward variable has an `error_clip` attribute, `error_clip_callback` will call the `error_clip`'s `append_clip_op` function to append the required `clip_op` to the `block`.
diff --git a/doc/fluid/howto/performance/images/profiler.png b/doc/fluid/howto/performance/images/profiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1
Binary files /dev/null and b/doc/fluid/howto/performance/images/profiler.png differ
diff --git a/doc/fluid/howto/performance/profiler.md b/doc/fluid/howto/performance/profiler.md
new file mode 100644
index 0000000000000000000000000000000000000000..e38abebdc46f02adb10d75d8e2576712e41d926d
--- /dev/null
+++ b/doc/fluid/howto/performance/profiler.md
@@ -0,0 +1,116 @@
+## Introduction
+
+There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). Most popular deep learning frameworks use several programming languages and adapt to heterogeneous platforms. Like most of them, PaddlePaddle uses C++, CUDA and Python as its basic programming languages so that it can run on both CPU and GPU devices. The [`nvprof` tool](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse CUDA programs. We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) that describes how to profile with [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof), but it covers only the CPU and Python program. For [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), however, the operator is the basic computing unit.
The developers usually want to collect the time of each operator and locate bottlenecks. `nvprof` usually collects the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory sets, and CUDA API calls, as well as events or metrics for CUDA kernels. However, `yep` and Google's `perftools` can't collect the timeline of a CUDA program. None of these tools can collect time at the operator level. So we design this profiling tool.
+
+## Architecture
+
+The workflow of most tasks is as follows. Each operator runs many times over all the iterations, so the profiler must collect the total time of each operator across iterations. Moreover, developers sometimes want to collect more detailed time spans inside an operator, or record time spans elsewhere, which requires the profiler to support nested time spans. In order to speed up training, all deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs, so the profiler must be able to collect the timeline of each thread. In addition, the profiler itself occupies certain resources, so it must be easy for developers to enable or disable. Finally, the profiler should present a human-readable report.
+
+```python
+for i in xrange(M):  # M is the iteration number
+    for op in operator_lists:  # The `operator_lists` contains all the operators in the network.
+        op.run()
+```
+
+In summary, the profiler should have the following features:
+
+- records time spans in a loop.
+- supports nested time spans.
+- supports multiple threads/multiple GPUs.
+- can be enabled and disabled by users.
+
+But how do we record time for a mixed C++ and CUDA program? There are many C++ APIs to get the current calendar time in the host program. But for the GPU, CUDA kernels may be executed concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams), and CUDA kernels are asynchronous with respect to the host program if there is no synchronization after them. CUDA provides [events](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA events, we also design events to record the timeline, and then summarize and present statistics based on these events.
+
+The overall flow is shown in the following figure.
+
+![overall flow](./images/profiler.png)
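+
+To make these requirements concrete before diving into the C++ design below, here is a small, self-contained Python sketch of the push/pop bookkeeping the profiler needs. The names (`push_range`, `pop_range`, `report`) are illustrative only and are not part of the real implementation:
+
+```python
+# Illustrative sketch only: nested "push/pop range" events are appended to a
+# list, then summarized into per-name call counts and total times, similar in
+# spirit to the report produced by the real profiler.
+import time
+from collections import defaultdict
+
+events = []  # (kind, name, timestamp) records; one list per thread in the real design
+
+
+def push_range(name):
+    events.append(("push", name, time.time()))
+
+
+def pop_range(name):
+    events.append(("pop", name, time.time()))
+
+
+def report():
+    stats = defaultdict(lambda: [0, 0.0])  # name -> [calls, total seconds]
+    stack = []
+    for kind, name, ts in events:
+        if kind == "push":
+            stack.append((name, ts))
+        else:  # "pop" closes the innermost open range
+            open_name, start = stack.pop()
+            assert open_name == name
+            stats[name][0] += 1
+            stats[name][1] += ts - start
+    for name, (calls, total) in stats.items():
+        print("%-12s calls=%d total=%.6fs" % (name, calls, total))
+
+
+# Nested ranges: an "operator" range containing a "kernel" range, run in a loop.
+for _ in range(3):
+    push_range("mul_op")
+    push_range("mul_kernel")
+    time.sleep(0.001)
+    pop_range("mul_kernel")
+    pop_range("mul_op")
+report()
+```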
+
+### Event
+
+In the above workflow, a pair of events is needed before and after the piece of code whose time we want to collect, so each event has a flag to mark whether it is a starting event or an ending event. Besides these two kinds of events, sometimes only a marker with a text message is needed, for example, a marker to indicate where profiling starts or ends. There are three kinds of events:
+
+```c++
+enum EventKind {
+  kMark,
+  kPushRange,
+  kPopRange};
+```
+- kMark: only a marker without a time range.
+- kPushRange: marks the starting event of a time range.
+- kPopRange: marks the ending event of a time range.
+
+For CPU code, the events only need to record the current time. For CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, event lists are used to record each piece.
+
+```c++
+class Event {
+ public:
+  // The DeviceContext is used to get the current CUDA stream.
+  Event(EventKind kind, std::string name, uint32_t thread_id,
+        const platform::DeviceContext* dev_ctx = nullptr);
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;
+
+ private:
+  EventKind kind_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;
+#endif
+};
+
+struct EventList {
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+```
+
+As mentioned above, there is no need to record the timeline when the profiler is disabled, so there is a global state to enable or disable the profiler.
+
+```c++
+enum ProfilerState {
+  kDisabled,
+  kCPU,
+  kCUDA
+};
+ProfilerState g_state;
+```
+- kDisabled: the disabled state.
+- kCPU: CPU profiling state.
+- kCUDA: GPU profiling state.
+
+A pair of starting and ending events is pushed to the event lists in the constructor and destructor of `RecordEvent`, so the timeline is recorded for the code within the lifetime of a `RecordEvent` object.
+
+```c++
+struct RecordEvent {
+  explicit RecordEvent(const std::string name,
+                       platform::DeviceContext* dev_ctx = nullptr) {
+    if (g_state == ProfilerState::kDisabled) return;
+    // push the starting event to the event lists.
+  }
+  ~RecordEvent() {
+    if (g_state == ProfilerState::kDisabled) return;
+    // push the ending event to the event lists.
+  }
+};
+```
+
+### Report sample
+
+```
+Event                    Calls   Total     Min.      Max.      Ave.      Ratio.
+thread101::deserial      1410    392.302   0.032768  14.1058   0.278228  0.00117247
+thread100::GetRPC        11      2951.13   7.60675   1426.75   268.284   0.00882
+thread100::serial        14      75.3212   0.07584   36.2135   5.38009   0.000225112
+thread100::SendRPC       14      13.9494   0.003072  3.97517   0.996389  4.16905e-05
+thread99::GetRPC         15      3012.62   2.79062   1426.61   200.841   0.00900378
+...
+thread0::matmul_grad     1480    3674.28   0.375808  181.608   2.48262   0.0109813
+thread0::matmul          1480    3365.82   0.196608  172.256   2.2742    0.0100594
+thread0::mul_grad        3840    3167.39   0.411648  3.33824   0.82484   0.00946633
+thread0::fetch_barrier   5       3082.82   354.385   1617.88   616.564   0.00921359
+thread0::dropout         2480    3014.05   0.201728  6.76454   1.21534   0.00900807
+```
+
+Note: the profiler can merge the times of the same operator when it runs multiple times in the same thread.
\ No newline at end of file diff --git a/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle b/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle new file mode 100644 index 0000000000000000000000000000000000000000..cb5bc420ceafe8ba4c87694d44ee4e5e4ad06779 Binary files /dev/null and b/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle differ diff --git a/doc/fluid/howto/third_party/images/multigpu_allreduce.png b/doc/fluid/howto/third_party/images/multigpu_allreduce.png new file mode 100644 index 0000000000000000000000000000000000000000..87a1b3e8f6dd4a713ec9df9f0037d1da04e9178a Binary files /dev/null and b/doc/fluid/howto/third_party/images/multigpu_allreduce.png differ diff --git a/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle b/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle new file mode 100644 index 0000000000000000000000000000000000000000..6c35ab1b21fb76ceae82d3693ed0d085b5bc0855 Binary files /dev/null and b/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle differ diff --git a/doc/fluid/howto/third_party/images/multigpu_before_convert.png b/doc/fluid/howto/third_party/images/multigpu_before_convert.png new file mode 100644 index 0000000000000000000000000000000000000000..9c8f7711165d80a2fa3911280fdee91855a401b1 Binary files /dev/null and b/doc/fluid/howto/third_party/images/multigpu_before_convert.png differ diff --git a/doc/fluid/howto/third_party/mkldnn_fluid.md b/doc/fluid/howto/third_party/mkldnn_fluid.md new file mode 100644 index 0000000000000000000000000000000000000000..bef126f3f0577b69f646dfe5d10539b372c6a8a5 --- /dev/null +++ b/doc/fluid/howto/third_party/mkldnn_fluid.md @@ -0,0 +1,149 @@ +# Design Doc: Add MKLDNN Kernel in Fluid Operator + +## Principles + +First of all, we should follow some basical principles like: +1. [How to write a new operator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md). We are trying to add a new kind of kernel into operators, so basically we should follow this doc. +2. [Supporting new Device/Library](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md). Since MKLDNN is a new library to fluid, we should add `MKLDNNDeviceContext` and maybe `mkldnn_helper.h`, just like [cudnn_helper.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/cudnn_helper.h). +3. [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). Another important point is that we should ensure the data synchronization between different kernel types, which is this [topic](https://github.com/PaddlePaddle/Paddle/issues/6549). So basically we should override `GetExpectedKernelType` and `trans` functions to support switching kernels. +4. [The Keys of Operator Kernel Type](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). Kernel Type is a pivotal conception which can record the `Place`, `Library`, `DataType` and `Layout`. + +## Sulution + +In general, there are four parts we should follow to run a MKL-DNN primitive. +- Create a primitive descriptor that describe this operator +- Create a primitive itself by primitive descriptor and the engine +- Create all memory buffers that primitive needed +- Launch a stream to execute the primitive created +More details can refer to [here](http://01org.github.io/mkl-dnn). + +It's better to avoid reinitialization of primitives and memory handles in the first three stages in every iteration. 
\ +So we plan to create a map to record all the `primitive` and `memory`, which should not take too much memories as discussed [here](https://github.com/PaddlePaddle/Paddle/issues/6822). + +It's assumed that following three conditions should be satisfied. +1. there is a unique key for each operator instance. May be the actual name of `Output Tensor`. +2. the `Input Tensor` inside `Compute` function is the one after converted. +3. we can get the phase(eg. `is_test`) inside `Compute` function, otherwise we need to expose this attribue to user. + +### Compute +The algorithm of `Compute` would be described as follow, let's take conv like an example. + +```c++ + + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + PADDLE_ENFORCE(platform::is_mkldnn_library(ctx.GetLibrary()), "It must use MKLDNN Library."); + + auto& dev_ctx = ctx.template device_context(); + + // find primitive by unique key from mkldnn context + // the op_key should be a unique name of this op instance + auto& p = dev_ctx.findPrimitive(op_key + "_fwd"); + + // assuming the input tensor inside this compute function is the one after converted + // this point should be guarantee by another mechanism + auto& i = dev_ctx.findMemory(op_key + "_input"); + + if (p == nullptr || i == nullptr || inputSizeChanged(p, i)) { + auto fwd_primitive_desc = createPrimitiveDesc(ctx); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + shared_ptr in(new mkldnn::memory(fwd_primitive_desc->src_primitive_desc(), input->data())); + shared_ptr wgt(new mkldnn::memory(fwd_primitive_desc->weights_primitive_desc(), filter->data())); + shared_ptr out(new mkldnn::memory(fwd_primitive_desc->dst_primitive_desc(), output->mutable_data(ctx.GetPlace()))); + shared_ptr fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out)); + + dev_ctx.addMemory(op_key+"_input", in); + dev_ctx.addMemory(op_key+"_output", out); + dev_ctx.addMemory(op_key+"_filer", wgt); + dev_ctx.addPrimitive(op_key+"_fwd", fwd_primitive); + dev_ctx.addPrimitiveDesc(op_key+"_fwd_PD", fwd_primitive_desc); + } + + p = dev_ctx.findPrimitive(op_key + "_fwd"); + + PADDLE_ENFORCE(p, "Should have forward Primitive"); + PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_input"), "Should have input memory"); + PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_output"), "Should have output memory"); + PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_filter"), "Should have filter memory"); + PADDLE_ENFORCE(dev_ctx.findPrimitiveDesc(op_unique_key+"_fwd_PD"), "Should have forward PrimitiveDesc"); + dev_ctx.submit(p); + dev_ctx.execute(); // the convert primitive should have already contained. + +``` + +The `createPrimitiveDesc` returns the primitive descripotor of this operator, would be like this: +```c++ + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + algorithm algo = static_cast(ctx.Attr("convolution_algorithm_option")); + prop_kind pk = ctx.Attr("is_test") ? 
prop_kind::forward_inference : prop_kind::forward_training; + + auto fwd_desc = mkldnn::conv_fwd::desc(/* all the setting above*/); + shared_ptr fwd_primitive_desc(new mkldnn::conv_fwd::primitive_desc(fwd_desc, ctx.getEngine())); + + return fwd_primitive_desc; + } +``` + +### MKLDNNDeviceContext +`MKLDNNDeviceContext`, which is very straightforward, should contain some base information like: `stream`, `engine` and the map needed. + + +### mkldnn_helper +Some functions would be put in `paddle/platform/mkldnn_helper.h`. +- create MKLDNN memories +- create MKLDNN primitives +- error check function +- etc + + +### Kernel Switch +We should `reorder` the different Layout from other device or to other device. `GetExpectedKernelType` and `trans` functions can help us to implement it. + +`GetExpectedKernelType` should get the context, and this operator can return the best `KernelType`. +`trans` would be like this: + +```c++ +void trans(inputs, ctx) override { + if (NoNeedTrans()) { + return; + } + // find reorder primitive by op_key from context + auto& dev_ctx = ctx.template device_context(); + auto& p = dev_ctx.findPrimitive(op_key + "_reorder_input"); + auto& i = dev_ctx.findMemory(op_key + "_src_input"); + + if (p == nullptr || i == nullptr || changeSized(i, input)) { + auto prim = createPrimitiveDesc(ctx); + auto src = createMemory(memoryDesc(input->dims(), actual_layout), input->data); + auto newbuffer = paddle::memory::Alloc(ctx.GetPlace(), input->size_in_bytes()); + auto dst = createMemory(p->expected_desc(), newbuffer->data); + auto reorder_primitive(new mkldnn::reorder(src, dst)); + + dev_ctx.addMemory(op_key+"_src_input", src); + dev_ctx.addMemory(op_key+"_input", dst); + dev_ctx.addPrimitive(op_key+"_reorder_input", reorder_primitive); + } + + p = dev_ctx.findPrimitive(op_key + "_reorder_input"); + PADDLE_ENFORCE(p, "Should have Reorder Primitive"); + dev_ctx.submit(p); + if (! this->isMKLDNNKernel()) { + // execute immediately only if this is not mkldnn kernel function. + // otherwise, it can be executed with the operator primitive in Compute + dev_ctx.stream(); + } + // after submit, the input tensor in ExecutionContext should be changed as the converted one + // there should be another mechanism to ensure this +} +``` + +### Unit Test +All the functions should be tested corresponding. +TBD diff --git a/doc/fluid/howto/third_party/paddle_nccl.md b/doc/fluid/howto/third_party/paddle_nccl.md new file mode 100644 index 0000000000000000000000000000000000000000..c7dac70998a6cfec3a6d2fc72b698ff9722e6805 --- /dev/null +++ b/doc/fluid/howto/third_party/paddle_nccl.md @@ -0,0 +1,65 @@ +# Design Doc: NCCL support in Paddle Fluid + +## Abstract + +This Design Doc refers to the NCCL feature in paddle. We propose an approach to support NCCL library both on a single machine and multiple machines. We wrapper the NCCL primitives `Broadcast`, `Allreduce`, `Reduce` as operators to utilize Multi-GPU powers in one script. + + +## Motivation + +[NCCL](https://developer.nvidia.com/nccl) is a NVIDIA library support Multi-GPU communicating and optimized for NVIDIA GPUs, it provides routines such as all-gather, all-reduce, broadcast, reduce, reduce-scatter, that can achieve high bandwidth over PCIe and NVLink high-speed interconnect. With NCCL library, we can easily accelerate the training in parallel. + +- Pros +1. easily plug-in with [NCCL2](https://developer.nvidia.com/nccl) library. +1. high performance in NVIDIA GPUs. +1. MPI like primitives, which have low learning cost for users. + +- Cons +1. 
Designed only for NVIDIA GPUs, not a general multi-device solution.
+1. Although NCCL1 is open-sourced under the BSD license, NCCL2 is no longer open source.
+
+At the beginning of training, the framework needs to distribute the same parameters to every GPU, and merge the gradients whenever the user requires.
+
+As a result, during training, we need operations for peer-to-peer copies between different GPUs, for aggregating gradients/parameters from GPUs, and for broadcasting parameters to GPUs. Every GPU only needs to run the operator with the correct place information.
+
+Besides, we need interfaces to synchronize model updates among the different GPU cards.
+
+## Implementation
+
+As mentioned above, we wrap the NCCL routines as several kinds of operators. Note that NCCL needs to create a communicator among the GPUs at the beginning, so a NCCLInit operator is created.
+
+### Transpiler
+
+To be compatible with the [parameter server design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md), the transpiler compiles the user-defined operation graph into sub-graphs to be executed on different devices.
+
+1. The user-defined model will be a single-device program.
+
+2. Broadcast/Reduce operators between GPUs will be inserted into the program; in the multi-node case, `Send` and `Recv` operators may be inserted as well.
+
+   *Broadcast, AllReduce in a single machine. And Broadcast, AllReduce, [Send, Recv](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md#graph-converter) in multiple machines*
+
+   ![multigpu_before_convert](images/multigpu_before_convert.png)
+
+After compiling, the graph is as shown below.
+
+![multigpu_allreduce](images/multigpu_allreduce.png)
+
+Operators are added to the sub-graphs. Every GPU is assigned a role of `rank0`, `rank1`, etc.
+
+- **Broadcast**. The Broadcast operator distributes an initialized parameter from the GPU that owns it, e.g. the `rank0` GPU, to all the other GPUs.
+- **AllReduce**. The AllReduce operator synchronizes parameters/gradients between GPUs. AllReduce is implemented with the ring-based communication method, avoiding the bottleneck of a single GPU.
+
+Note that the AllReduce operator forces the GPUs to synchronize at that point. Whether the whole training process runs in asynchronous or synchronous mode depends on where the AllReduce points are in the graph.
+
+As shown in the picture, each GPU computes the gradient of `W`, and the following `AllReduce` operator accumulates `dW` over the full batch of data; then each GPU runs the optimization process individually and applies the gradient to its own `W`.
+
+- **AllReduce**
+  Note that our AllReduce operator is a ring-based AllReduce implementation. If we used the NCCL2 AllReduce primitive directly, every GPU would optimize over the full batch of data, wasting (n-1) GPUs' compute resources. In addition, NCCL2's built-in AllReduce only utilizes the communication resources during synchronization, and updating the gradient becomes a subsequent phase. In fact, we can amortize the gradient-update time cost into the communication phase. The process, sketched below, is:
+1. Every parameter has its root card. That card is responsible for aggregating the gradients from the GPUs.
+2. The whole model's parameters are hashed to different root cards to ensure load balance between GPUs.
+3. Logically neighboring cards start sending the parameter to the next one. After one round, the parameter's root card will have aggregated the full gradients.
+4. Then the root card optimizes the parameter.
+5. This root card sends its optimized result to its neighbor, and the neighbor passes it on to the next one.
+6. Finish the synchronization round.
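+
+The following self-contained Python sketch illustrates this schedule; it is only an illustration of the steps above (with toy NumPy gradients and a toy SGD update), not Fluid's actual NCCL operators:
+
+```python
+# Sketch of the ring schedule in steps 1-6: each parameter has a root card,
+# the gradient sum travels (n-1) hops around the ring to the root, the root
+# applies the update, and the result travels (n-1) more hops back around,
+# so every card ends with the same updated parameter.
+import numpy as np
+
+
+def root_card(param_name, num_gpus):
+    # Step 2: parameters are hashed to root cards to balance the load.
+    return hash(param_name) % num_gpus
+
+
+def ring_update(grads, root, param, lr=0.1):
+    n = len(grads)
+    # Steps 1 and 3: the running gradient sum visits every card once,
+    # moving to the next neighbor at each hop and ending at the root.
+    acc = np.zeros_like(param)
+    for hop in range(n):
+        acc += grads[(root + 1 + hop) % n]
+    # Steps 4-6: the root applies a toy SGD update, then the updated value
+    # is passed neighbor to neighbor until every card holds a copy.
+    updated = param - lr * acc
+    return [updated.copy() for _ in range(n)]
+
+
+grads = [np.full(3, float(i)) for i in range(4)]  # toy per-GPU gradients of `W`
+print(ring_update(grads, root=root_card("W", 4), param=np.zeros(3)))
+```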
+ +The total time cost will be 2 * (n-1) * per-parameter-send-time, we reach the goal of amortize the upgrade time into communicating phase. diff --git a/doc/fluid/images/1.png b/doc/fluid/images/1.png new file mode 100644 index 0000000000000000000000000000000000000000..67daf566f91aab570e60971c4ea8e2be876e214d Binary files /dev/null and b/doc/fluid/images/1.png differ diff --git a/doc/fluid/images/2.png b/doc/fluid/images/2.png new file mode 100644 index 0000000000000000000000000000000000000000..43367777f41449a666e7a3b571f09ac5d5dfb1ae Binary files /dev/null and b/doc/fluid/images/2.png differ diff --git a/doc/fluid/images/2_level_rnn.dot b/doc/fluid/images/2_level_rnn.dot new file mode 100644 index 0000000000000000000000000000000000000000..5d77865061ca7bbbfcf254dd938f09aef5553505 --- /dev/null +++ b/doc/fluid/images/2_level_rnn.dot @@ -0,0 +1,56 @@ +digraph G { + + rnn [label="1st level RNN" shape=box] + + subgraph cluster0 { + label = "time step 0" + + sent0 [label="sentence"] + sent1 [label="sentence"] + + rnn1 [label="2nd level RNN" shape=box] + + sent0 -> rnn1 + sent1 -> rnn1 + } + + subgraph cluster1 { + label = "time step 1" + + sent2 [label="sentence"] + sent3 [label="sentence"] + + rnn2 [label="2nd level RNN" shape=box] + + sent2 -> rnn2 + sent3 -> rnn2 + } + + subgraph cluster2 { + label = "time step 2" + + sent4 [label="sentence"] + sent5 [label="sentence"] + + rnn3 [label="2nd level RNN" shape=box] + + sent4 -> rnn3 + sent5 -> rnn3 + } + + + para0 [label="paragraph info 0"] + para1 [label="paragraph info 1"] + para2 [label="paragraph info 2"] + + rnn1 -> para0 + rnn2 -> para1 + rnn3 -> para2 + + para0 -> rnn + para1 -> rnn + para2 -> rnn + + chapter [label="chapter info"] + rnn -> chapter +} diff --git a/doc/fluid/images/2_level_rnn.png b/doc/fluid/images/2_level_rnn.png new file mode 100644 index 0000000000000000000000000000000000000000..0537a75beb175c0c284717421f7aa908da2a5038 Binary files /dev/null and b/doc/fluid/images/2_level_rnn.png differ diff --git a/doc/fluid/images/3.png b/doc/fluid/images/3.png new file mode 100644 index 0000000000000000000000000000000000000000..481021ef306e2596818aab7fe17a570754f63635 Binary files /dev/null and b/doc/fluid/images/3.png differ diff --git a/doc/fluid/images/4.png b/doc/fluid/images/4.png new file mode 100644 index 0000000000000000000000000000000000000000..4279f41e06de459f18b9a622539511d555e9a0af Binary files /dev/null and b/doc/fluid/images/4.png differ diff --git a/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455 Binary files /dev/null and b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg differ diff --git a/doc/fluid/images/LoDTensor.png b/doc/fluid/images/LoDTensor.png new file mode 100644 index 0000000000000000000000000000000000000000..75369f5378309e0f304b83f6bb69bdb195eac079 Binary files /dev/null and b/doc/fluid/images/LoDTensor.png differ diff --git a/doc/fluid/images/asgd.gif b/doc/fluid/images/asgd.gif new file mode 100644 index 0000000000000000000000000000000000000000..4a0da7bf6df9326a2aab1638b77c5455c18b8c4e Binary files /dev/null and b/doc/fluid/images/asgd.gif differ diff --git a/doc/fluid/images/batch_norm_fork.dot b/doc/fluid/images/batch_norm_fork.dot new file mode 100644 index 0000000000000000000000000000000000000000..4bc47713cba2cb23f1b34fffe6426ef10ac3a9df --- /dev/null +++ b/doc/fluid/images/batch_norm_fork.dot @@ -0,0 +1,25 
@@ +digraph ImageBatchNormForkGragh { + subgraph cluster_before { + Prev [label="...", shape=plaintext]; + Rnn [label="rnn_op", shape=box]; + BatchNorm [label="batch_norm_op", shape=box]; + Fc [label="fc_op", shape=box]; + After [label="...", shape=plaintext]; + Prev -> Rnn -> BatchNorm -> Fc -> After; + label="original"; + } + + subgraph cluster_after { + Prev2 [label="...", shape=plaintext]; + Rnn2 [label="rnn_op", shape=box]; + BatchNorm2_1 [label="train_batch_norm_op", shape=box]; + BatchNorm2_2 [label="infer_batch_norm_op", shape=box]; + Fc2_1 [label="fc_op", shape=box]; + Fc2_2 [label="fc_op", shape=box]; + After2_1 [label="...", shape=plaintext]; + After2_2 [label="...", shape=plaintext]; + Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1; + Rnn2 -> BatchNorm2_2 ->Fc2_2 ->After2_2 + label="forked"; + } +} diff --git a/doc/fluid/images/batch_norm_fork.png b/doc/fluid/images/batch_norm_fork.png new file mode 100644 index 0000000000000000000000000000000000000000..aded62bce5bc268b7a3ef4dc96c89fe21d6ea955 Binary files /dev/null and b/doc/fluid/images/batch_norm_fork.png differ diff --git a/doc/fluid/images/batch_norm_op_kernel.png b/doc/fluid/images/batch_norm_op_kernel.png new file mode 100644 index 0000000000000000000000000000000000000000..a99ce81ff3bf42880ebbd6a1297de3bf038e09b2 Binary files /dev/null and b/doc/fluid/images/batch_norm_op_kernel.png differ diff --git a/doc/fluid/images/beam_search.png b/doc/fluid/images/beam_search.png new file mode 100644 index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae Binary files /dev/null and b/doc/fluid/images/beam_search.png differ diff --git a/doc/fluid/images/ci_build_whl.png b/doc/fluid/images/ci_build_whl.png new file mode 100644 index 0000000000000000000000000000000000000000..232762b82a9ae3e979a1f38a7beb715c87438f40 Binary files /dev/null and b/doc/fluid/images/ci_build_whl.png differ diff --git a/doc/fluid/images/compile_run_time.png b/doc/fluid/images/compile_run_time.png new file mode 100644 index 0000000000000000000000000000000000000000..0bc9b2fd0e81b4851e6d96171ccb9a05d0f42a48 Binary files /dev/null and b/doc/fluid/images/compile_run_time.png differ diff --git a/doc/fluid/images/compiler.graffle b/doc/fluid/images/compiler.graffle new file mode 100644 index 0000000000000000000000000000000000000000..8cc678fea3c820103e7ce81f7a5d625d6c1d92de Binary files /dev/null and b/doc/fluid/images/compiler.graffle differ diff --git a/doc/fluid/images/compiler.png b/doc/fluid/images/compiler.png new file mode 100644 index 0000000000000000000000000000000000000000..65d34f841afce9756def07dd8ecb9ca44e658bfe Binary files /dev/null and b/doc/fluid/images/compiler.png differ diff --git a/doc/fluid/images/control_flow_graph.png b/doc/fluid/images/control_flow_graph.png new file mode 100644 index 0000000000000000000000000000000000000000..3579998e58d07abc50bd3332128d4733a391cb3b Binary files /dev/null and b/doc/fluid/images/control_flow_graph.png differ diff --git a/doc/fluid/images/dataflow_equations.png b/doc/fluid/images/dataflow_equations.png new file mode 100644 index 0000000000000000000000000000000000000000..c10f7f69f4007952e5b0394edaa04efa1cfbb658 Binary files /dev/null and b/doc/fluid/images/dataflow_equations.png differ diff --git a/doc/fluid/images/dcgan.png b/doc/fluid/images/dcgan.png new file mode 100644 index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28 Binary files /dev/null and b/doc/fluid/images/dcgan.png differ diff --git a/doc/fluid/images/deep_learning.png 
b/doc/fluid/images/deep_learning.png new file mode 100644 index 0000000000000000000000000000000000000000..026becc4d94e01e407dacb2a5314a0e5723334ff Binary files /dev/null and b/doc/fluid/images/deep_learning.png differ diff --git a/doc/fluid/images/dist-graph.graffle b/doc/fluid/images/dist-graph.graffle new file mode 100644 index 0000000000000000000000000000000000000000..941399c6ced8d5f65b6c595522b770c88259df4b Binary files /dev/null and b/doc/fluid/images/dist-graph.graffle differ diff --git a/doc/fluid/images/dist-graph.png b/doc/fluid/images/dist-graph.png new file mode 100644 index 0000000000000000000000000000000000000000..3546b09f1c2ee3e4f60f519d5e47f823f08051a7 Binary files /dev/null and b/doc/fluid/images/dist-graph.png differ diff --git a/doc/fluid/images/distributed_architecture.graffle b/doc/fluid/images/distributed_architecture.graffle new file mode 100644 index 0000000000000000000000000000000000000000..d1b60141342232e06227c2d430ebc60ec349a907 Binary files /dev/null and b/doc/fluid/images/distributed_architecture.graffle differ diff --git a/doc/fluid/images/distributed_architecture.png b/doc/fluid/images/distributed_architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356 Binary files /dev/null and b/doc/fluid/images/distributed_architecture.png differ diff --git a/doc/fluid/images/ds2_network.png b/doc/fluid/images/ds2_network.png new file mode 100644 index 0000000000000000000000000000000000000000..1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11 Binary files /dev/null and b/doc/fluid/images/ds2_network.png differ diff --git a/doc/fluid/images/executor.png b/doc/fluid/images/executor.png new file mode 100644 index 0000000000000000000000000000000000000000..b29c0d779e3d46b779b5baeabe3176adaeb00a6d Binary files /dev/null and b/doc/fluid/images/executor.png differ diff --git a/doc/fluid/images/feed_forward.png b/doc/fluid/images/feed_forward.png new file mode 100644 index 0000000000000000000000000000000000000000..d312371a04c26aa6cd196e0bd1f51becb425180b Binary files /dev/null and b/doc/fluid/images/feed_forward.png differ diff --git a/doc/fluid/images/feed_forward_regularized.png b/doc/fluid/images/feed_forward_regularized.png new file mode 100644 index 0000000000000000000000000000000000000000..677e99bfd9f8e72ed9fe4b27127af2ced202f447 Binary files /dev/null and b/doc/fluid/images/feed_forward_regularized.png differ diff --git a/doc/fluid/images/fluid-compiler.graffle b/doc/fluid/images/fluid-compiler.graffle new file mode 100644 index 0000000000000000000000000000000000000000..c933df2cb855462c52b2d25f7f9a99b95652961d Binary files /dev/null and b/doc/fluid/images/fluid-compiler.graffle differ diff --git a/doc/fluid/images/fluid-compiler.png b/doc/fluid/images/fluid-compiler.png new file mode 100644 index 0000000000000000000000000000000000000000..1b0ffed2039c91a3a00bbb719da08c91c3acf7bb Binary files /dev/null and b/doc/fluid/images/fluid-compiler.png differ diff --git a/doc/fluid/images/fluid_examples.png b/doc/fluid/images/fluid_examples.png new file mode 100644 index 0000000000000000000000000000000000000000..aa99472c0f914cde128fd7b3bd8dc29ac24f94b6 Binary files /dev/null and b/doc/fluid/images/fluid_examples.png differ diff --git a/doc/fluid/images/fluid_module_1.png b/doc/fluid/images/fluid_module_1.png new file mode 100644 index 0000000000000000000000000000000000000000..554782ba54e43efc3d6babbb94e3cac3530ac649 Binary files /dev/null and b/doc/fluid/images/fluid_module_1.png differ diff --git 
a/doc/fluid/images/fluid_module_2.png b/doc/fluid/images/fluid_module_2.png new file mode 100644 index 0000000000000000000000000000000000000000..4219efccbb1e87839adf6b5720fe46808b7d2fcf Binary files /dev/null and b/doc/fluid/images/fluid_module_2.png differ diff --git a/doc/fluid/images/graph_construction_example.bash b/doc/fluid/images/graph_construction_example.bash new file mode 100755 index 0000000000000000000000000000000000000000..35e6997abd17588e17a82d448918fc1b3bd7220e --- /dev/null +++ b/doc/fluid/images/graph_construction_example.bash @@ -0,0 +1,11 @@ +cat ./graph_construction_example.dot | \ + sed 's/color=red/color=red, style=invis/g' | \ + sed 's/color=green/color=green, style=invis/g' | \ + dot -Tpng > graph_construction_example_forward_only.png + +cat ./graph_construction_example.dot | \ + sed 's/color=green/color=green, style=invis/g' | \ + dot -Tpng > graph_construction_example_forward_backward.png + +cat ./graph_construction_example.dot | \ + dot -Tpng > graph_construction_example_all.png diff --git a/doc/fluid/images/graph_construction_example.dot b/doc/fluid/images/graph_construction_example.dot new file mode 100644 index 0000000000000000000000000000000000000000..e115f9844bae6ad24f638c8ed4749cea8aff06a9 --- /dev/null +++ b/doc/fluid/images/graph_construction_example.dot @@ -0,0 +1,68 @@ +digraph ImageClassificationGraph { + ///////// The forward part ///////// + FeedX [label="Feed", color=blue, shape=box]; + FeedY [label="Feed", color=blue, shape=box]; + InitW [label="Init", color=blue, shape=diamond]; + Initb [label="Init", color=blue, shape=diamond]; + FC [label="FC", color=blue, shape=box]; + MSE [label="MSE", color=blue, shape=box]; + + x [label="x", color=blue, shape=oval]; + l [label="l", color=blue, shape=oval]; + y [label="y", color=blue, shape=oval]; + W [label="W", color=blue, shape=doublecircle]; + b [label="b", color=blue, shape=doublecircle]; + cost [label="cost", color=blue, shape=oval]; + + FeedX -> x -> FC -> y -> MSE -> cost [color=blue]; + FeedY -> l [color=blue]; + InitW -> W [color=blue]; + Initb -> b [color=blue]; + W -> FC [color=blue]; + b -> FC [color=blue]; + l -> MSE [color=blue]; + + ////////// The backward part ///////// + MSE_Grad [label="MSE_grad", color=red, shape=box]; + FC_Grad [label="FC_grad", color=red, shape=box]; + + d_cost [label="d cost", color=red, shape=oval]; + d_y [label="d y", color=red, shape=oval]; + d_b [label="d b", color=red, shape=oval]; + d_W [label="d W", color=red, shape=oval]; + + cost -> MSE_Grad [color=red]; + d_cost -> MSE_Grad [color=red]; + l -> MSE_Grad [color=red]; + y -> MSE_Grad -> d_y [color=red]; + + x -> FC_Grad [color=red]; + y -> FC_Grad [color=red]; + d_y -> FC_Grad [color=red]; + W -> FC_Grad -> d_W [color=red]; + b -> FC_Grad -> d_b [color=red]; + + ////////// The optimizaiton part ////////// + + OPT_W [label="SGD", color=green, shape=box]; + OPT_b [label="SGD", color=green, shape=box]; + + W -> OPT_W [color=green]; + b -> OPT_b [color=green]; + d_W -> OPT_W -> W [color=green]; + d_b -> OPT_b -> b [color=green]; + + ////////// Groupings ////////// + + subgraph clusterMSE { + style=invis; + MSE; + MSE_Grad; + } + + subgraph clusterFC { + style=invis; + FC; + FC_Grad; + } +} diff --git a/doc/fluid/images/graph_construction_example_all.png b/doc/fluid/images/graph_construction_example_all.png new file mode 100644 index 0000000000000000000000000000000000000000..261611a5721f9aa97874f7e6d897fe48cf667db2 Binary files /dev/null and b/doc/fluid/images/graph_construction_example_all.png differ diff --git 
a/doc/fluid/images/graph_construction_example_forward_backward.png b/doc/fluid/images/graph_construction_example_forward_backward.png new file mode 100644 index 0000000000000000000000000000000000000000..4c69687f4a6a181138f3df72ce5e8aa48487b5be Binary files /dev/null and b/doc/fluid/images/graph_construction_example_forward_backward.png differ diff --git a/doc/fluid/images/graph_construction_example_forward_only.png b/doc/fluid/images/graph_construction_example_forward_only.png new file mode 100644 index 0000000000000000000000000000000000000000..e668c16e0cac73acb4e5dc2b1827557ae77126b4 Binary files /dev/null and b/doc/fluid/images/graph_construction_example_forward_only.png differ diff --git a/doc/fluid/images/l1_regularization.png b/doc/fluid/images/l1_regularization.png new file mode 100644 index 0000000000000000000000000000000000000000..e1b9c7a44f94dc027598a98da93ddb8133190972 Binary files /dev/null and b/doc/fluid/images/l1_regularization.png differ diff --git a/doc/fluid/images/l2_regularization.png b/doc/fluid/images/l2_regularization.png new file mode 100644 index 0000000000000000000000000000000000000000..d5c2fcbc2ccae75ad083162e5a2dceb0210be298 Binary files /dev/null and b/doc/fluid/images/l2_regularization.png differ diff --git a/doc/fluid/images/layer.png b/doc/fluid/images/layer.png new file mode 100644 index 0000000000000000000000000000000000000000..e46db4c9c6f5b65ff274b498b716b11de343a8b0 Binary files /dev/null and b/doc/fluid/images/layer.png differ diff --git a/doc/fluid/images/local-graph.graffle b/doc/fluid/images/local-graph.graffle new file mode 100644 index 0000000000000000000000000000000000000000..19e509bd9af3c1e9a3f5e0f16ddd281457a339c5 Binary files /dev/null and b/doc/fluid/images/local-graph.graffle differ diff --git a/doc/fluid/images/local-graph.png b/doc/fluid/images/local-graph.png new file mode 100644 index 0000000000000000000000000000000000000000..ada51200f793a9bb18911e7d63cfdb3244b967d7 Binary files /dev/null and b/doc/fluid/images/local-graph.png differ diff --git a/doc/fluid/images/local_architecture.graffle b/doc/fluid/images/local_architecture.graffle new file mode 100644 index 0000000000000000000000000000000000000000..49fcc663ebe3824aa234e3a67aadf285cb417877 Binary files /dev/null and b/doc/fluid/images/local_architecture.graffle differ diff --git a/doc/fluid/images/local_architecture.png b/doc/fluid/images/local_architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122 Binary files /dev/null and b/doc/fluid/images/local_architecture.png differ diff --git a/doc/fluid/images/lookup_table.png b/doc/fluid/images/lookup_table.png new file mode 100644 index 0000000000000000000000000000000000000000..72dfe3547f731d0d090338afb206b0549dff472e Binary files /dev/null and b/doc/fluid/images/lookup_table.png differ diff --git a/doc/fluid/images/lookup_table_training.png b/doc/fluid/images/lookup_table_training.png new file mode 100644 index 0000000000000000000000000000000000000000..cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e Binary files /dev/null and b/doc/fluid/images/lookup_table_training.png differ diff --git a/doc/fluid/images/loss_equation.png b/doc/fluid/images/loss_equation.png new file mode 100644 index 0000000000000000000000000000000000000000..14212ec8d36c803de96bde8a9a4b5591bd20434e Binary files /dev/null and b/doc/fluid/images/loss_equation.png differ diff --git a/doc/fluid/images/multi-threads.graffle b/doc/fluid/images/multi-threads.graffle new file mode 100644 index 
0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6 Binary files /dev/null and b/doc/fluid/images/multi-threads.graffle differ diff --git a/doc/fluid/images/multi-threads@3x.png b/doc/fluid/images/multi-threads@3x.png new file mode 100644 index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9 Binary files /dev/null and b/doc/fluid/images/multi-threads@3x.png differ diff --git a/doc/fluid/images/multigpu_allreduce.graffle b/doc/fluid/images/multigpu_allreduce.graffle new file mode 100644 index 0000000000000000000000000000000000000000..cb5bc420ceafe8ba4c87694d44ee4e5e4ad06779 Binary files /dev/null and b/doc/fluid/images/multigpu_allreduce.graffle differ diff --git a/doc/fluid/images/multigpu_allreduce.png b/doc/fluid/images/multigpu_allreduce.png new file mode 100644 index 0000000000000000000000000000000000000000..87a1b3e8f6dd4a713ec9df9f0037d1da04e9178a Binary files /dev/null and b/doc/fluid/images/multigpu_allreduce.png differ diff --git a/doc/fluid/images/multigpu_before_convert.graffle b/doc/fluid/images/multigpu_before_convert.graffle new file mode 100644 index 0000000000000000000000000000000000000000..6c35ab1b21fb76ceae82d3693ed0d085b5bc0855 Binary files /dev/null and b/doc/fluid/images/multigpu_before_convert.graffle differ diff --git a/doc/fluid/images/multigpu_before_convert.png b/doc/fluid/images/multigpu_before_convert.png new file mode 100644 index 0000000000000000000000000000000000000000..9c8f7711165d80a2fa3911280fdee91855a401b1 Binary files /dev/null and b/doc/fluid/images/multigpu_before_convert.png differ diff --git a/doc/fluid/images/multiple_reader.png b/doc/fluid/images/multiple_reader.png new file mode 100644 index 0000000000000000000000000000000000000000..b22126b31db4982c13fc3a0827805e6aaf955046 Binary files /dev/null and b/doc/fluid/images/multiple_reader.png differ diff --git a/doc/fluid/images/op.dot b/doc/fluid/images/op.dot new file mode 100644 index 0000000000000000000000000000000000000000..c8ad839cb88788e9b5906402257cc7bbc3ddcb54 --- /dev/null +++ b/doc/fluid/images/op.dot @@ -0,0 +1,4 @@ +digraph sample { + graph [rankdir=TD]; node [shape=record]; + op [label="{Operator| InferShape()=0\lRun()=0\l | map<string, string[]> inputs_\lmap<string, string[]> outputs_ \l AttributeMap attrs_\l}"]; +} \ No newline at end of file diff --git a/doc/fluid/images/op_op_with_kern_class_diagram.dot b/doc/fluid/images/op_op_with_kern_class_diagram.dot new file mode 100644 index 0000000000000000000000000000000000000000..8f24e9ea83acf879c7008f2d97113c0a4cc111c3 --- /dev/null +++ b/doc/fluid/images/op_op_with_kern_class_diagram.dot @@ -0,0 +1,38 @@ +digraph sample { + graph [rankdir=TD]; node [shape=record]; + op [label="{Operator| InferShape()=0\lRun()=0\l | map<string, string[]> inputs_\lmap<string, string[]> outputs_ \l AttributeMap attrs_\l}"]; + op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map<OpKernelKey,OpKernel>kernels_ }"] + op_kernel [label="{OpKernel | Compute()=0}"] + op_kernel_key [label="{OpKernelKey| Place place\n...}"] + + op -> op_with_kern [dir=back, arrowtail=onormal] + op_with_kern -> op_kernel [arrowhead=vee, label="contains many"] + + { + rank=same; + op_with_kern + op_kernel + } + + op_kernel -> op_kernel_key [style=invis] + + { + rank=same; + op_kernel + op_kernel_key + } + + op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"] + + mul_op [label="MulOp"] + op_with_kern -> mul_op [dir=back, arrowtail=onormal] + mul_kernel [label="template <typename Place>\lclass 
MulOpKernel\l"] + op_kernel -> mul_kernel [dir=back, arrowtail=onormal] + mul_op -> mul_kernel [arrowhead=vee, label="register many"] + + { + rank=same; + mul_op; + mul_kernel; + } +} \ No newline at end of file diff --git a/doc/fluid/images/op_with_kernel.dot b/doc/fluid/images/op_with_kernel.dot new file mode 100644 index 0000000000000000000000000000000000000000..4f5af4f7b5f5a69693a058c99eb658900136077a --- /dev/null +++ b/doc/fluid/images/op_with_kernel.dot @@ -0,0 +1,26 @@ +digraph sample { + graph [rankdir=TD]; node [shape=record]; + op [label="{Operator}"]; + op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map<OpKernelKey,OpKernel>kernels_ }"] + op_kernel [label="{OpKernel | Compute()=0}"] + op_kernel_key [label="{OpKernelKey| Place place\n...}"] + + op -> op_with_kern [dir=back, arrowtail=onormal] + op_with_kern -> op_kernel [arrowhead=vee, label="contains many"] + + { + rank=same; + op_with_kern + op_kernel + } + + op_kernel -> op_kernel_key [style=invis] + + { + rank=same; + op_kernel + op_kernel_key + } + + op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"] +} \ No newline at end of file diff --git a/doc/fluid/images/operator1.png b/doc/fluid/images/operator1.png new file mode 100644 index 0000000000000000000000000000000000000000..3975b06f615b7a88dfc11e71b6451fdf4ce42d60 Binary files /dev/null and b/doc/fluid/images/operator1.png differ diff --git a/doc/fluid/images/operator2.png b/doc/fluid/images/operator2.png new file mode 100644 index 0000000000000000000000000000000000000000..b7bb1fae2050d3a70797517bc20dbbdef3dfcb7c Binary files /dev/null and b/doc/fluid/images/operator2.png differ diff --git a/doc/fluid/images/paddle-compile.graffle b/doc/fluid/images/paddle-compile.graffle new file mode 100644 index 0000000000000000000000000000000000000000..a6348cc3dbcaca923c6e794681b2edb85cb9f8f6 Binary files /dev/null and b/doc/fluid/images/paddle-compile.graffle differ diff --git a/doc/fluid/images/paddle-compile.png b/doc/fluid/images/paddle-compile.png new file mode 100644 index 0000000000000000000000000000000000000000..e0f13d551ac41afaec627a57dea79356464bf0bf Binary files /dev/null and b/doc/fluid/images/paddle-compile.png differ diff --git a/doc/fluid/images/place.png b/doc/fluid/images/place.png new file mode 100644 index 0000000000000000000000000000000000000000..14e77511d639af155e5a3725cde05323e0cc94f2 Binary files /dev/null and b/doc/fluid/images/place.png differ diff --git a/doc/fluid/images/pprof_1.png b/doc/fluid/images/pprof_1.png new file mode 100644 index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb Binary files /dev/null and b/doc/fluid/images/pprof_1.png differ diff --git a/doc/fluid/images/pprof_2.png b/doc/fluid/images/pprof_2.png new file mode 100644 index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b Binary files /dev/null and b/doc/fluid/images/pprof_2.png differ diff --git a/doc/fluid/images/print_fluid_program.png b/doc/fluid/images/print_fluid_program.png new file mode 100644 index 0000000000000000000000000000000000000000..e8e459e1b3d5c8706b3caa05dc371db8d46df4a5 Binary files /dev/null and b/doc/fluid/images/print_fluid_program.png differ diff --git a/doc/fluid/images/profiler.png b/doc/fluid/images/profiler.png new file mode 100644 index 0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1 Binary files /dev/null and b/doc/fluid/images/profiler.png differ diff --git a/doc/fluid/images/program_desc1.png 
b/doc/fluid/images/program_desc1.png new file mode 100644 index 0000000000000000000000000000000000000000..0656336914ece957f2e5bb4d70ad337a63e31d88 Binary files /dev/null and b/doc/fluid/images/program_desc1.png differ diff --git a/doc/fluid/images/program_desc2.png b/doc/fluid/images/program_desc2.png new file mode 100644 index 0000000000000000000000000000000000000000..db5bfa1231345add8661b4f8ef0fc9d861f40d24 Binary files /dev/null and b/doc/fluid/images/program_desc2.png differ diff --git a/doc/fluid/images/raw_input.png b/doc/fluid/images/raw_input.png new file mode 100644 index 0000000000000000000000000000000000000000..0725f92d2b169c2b59ec7c68b402859c2a2dd1d8 Binary files /dev/null and b/doc/fluid/images/raw_input.png differ diff --git a/doc/fluid/images/readers.png b/doc/fluid/images/readers.png new file mode 100644 index 0000000000000000000000000000000000000000..fd59168ce16c9e2a0ef45303c28c997cfd7740be Binary files /dev/null and b/doc/fluid/images/readers.png differ diff --git a/doc/fluid/images/remote_executor.graffle b/doc/fluid/images/remote_executor.graffle new file mode 100644 index 0000000000000000000000000000000000000000..41b2067311694b56d211a4f32d1b76884eeffd2d Binary files /dev/null and b/doc/fluid/images/remote_executor.graffle differ diff --git a/doc/fluid/images/remote_executor.png b/doc/fluid/images/remote_executor.png new file mode 100644 index 0000000000000000000000000000000000000000..744e2fb2e0f1bbe058e991ba7b2a09000965ee79 Binary files /dev/null and b/doc/fluid/images/remote_executor.png differ diff --git a/doc/fluid/images/rnn.dot b/doc/fluid/images/rnn.dot new file mode 100644 index 0000000000000000000000000000000000000000..c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5 --- /dev/null +++ b/doc/fluid/images/rnn.dot @@ -0,0 +1,87 @@ +digraph G { + label = "simple RNN implementation" + + ranksep=2; + + //graph [nodesep=1, ranksep=1]; + + node[nodesep=1] + + subgraph cluster0 { + label = "global scope" + rankdir = TB + W + boot_memory + input + output + } + + subgraph cluster1 { + label = "step-scope 0" + rankdir = TB + memory0[label="memory"] + prememory0[label="pre-memory"] + step_input0[label="step input"] + step_output0[label="step output"] + } + + subgraph cluster2 { + label = "step-scope 1" + rankdir = TB + memory1[label="memory"] + prememory1[label="pre-memory"] + step_input1[label="step input"] + step_output1[label="step output"] + } + + subgraph cluster3 { + label = "step-scope 2" + rankdir = TB + memory2[label="memory"] + prememory2[label="pre-memory"] + step_input2[label="step input"] + step_output2[label="step output"] + } + + stepnet [shape=box] + stepnet0 [shape=box, style=dashed] + stepnet1 [shape=box, style=dashed] + stepnet2 [shape=box, style=dashed] + + + edge[color=blue] + boot_memory -> prememory0 [label="init" color="blue"] + memory0 -> prememory1 [label="copy/reference" color="blue"] + memory1 -> prememory2 [label="copy/reference" color="blue"] + + edge[color=black] + W -> stepnet0[constraint=false, style=dashed] + W -> stepnet1[constraint=false, style=dashed] + W -> stepnet2[constraint=false, style=dashed] + + memory0 -> stepnet0[style=dashed] + prememory0 -> stepnet0 -> step_output0[style=dashed] + + memory1 -> stepnet1[style=dashed] + prememory1 -> stepnet1 -> step_output1[style=dashed] + + memory2 -> stepnet2[style=dashed] + prememory2 -> stepnet2 -> step_output2[style=dashed] + + input -> step_input0 + input -> step_input1 + input -> step_input2 + + step_input0 -> stepnet0 [style=dashed] + step_input1 -> stepnet1[style=dashed] + step_input2 -> 
stepnet2[style=dashed] + + step_output0 -> output + step_output1 -> output + step_output2 -> output + + stepnet0 -> stepnet[style=dashed] + stepnet1 -> stepnet[style=dashed] + stepnet2 -> stepnet[style=dashed] + +} diff --git a/doc/fluid/images/rnn.jpg b/doc/fluid/images/rnn.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9867e404cf959df0dce6ded5222b466c788fb840 Binary files /dev/null and b/doc/fluid/images/rnn.jpg differ diff --git a/doc/fluid/images/rnn.png b/doc/fluid/images/rnn.png new file mode 100644 index 0000000000000000000000000000000000000000..e139e373fe8396782044cfd936fdde624f8c66fe Binary files /dev/null and b/doc/fluid/images/rnn.png differ diff --git a/doc/fluid/images/rnn_2level_data.dot b/doc/fluid/images/rnn_2level_data.dot new file mode 100644 index 0000000000000000000000000000000000000000..1d85ae2617a915ad0ad8288d848b607cc37ad297 --- /dev/null +++ b/doc/fluid/images/rnn_2level_data.dot @@ -0,0 +1,75 @@ +digraph G { + chapter [label="chapter"] + + subgraph cluster0 { + label = "paragraph 0" + + top_rnn0[label="top rnn step 0" shape=box] + + p0 [label="paragraph 0"] + p1 [label="paragraph 1"] + } + + subgraph cluster1{ + label = "paragraph 1" + + top_rnn1[label="top rnn step 1" shape=box] + + p2 [label="paragraph 0"] + p3 [label="paragraph 1"] + } + + subgraph cluster_p0 { + label = "sentence 0" + + low_rnn0 [label="low rnn step 0" shape=box] + s00 [label="sentence 0"] + s01 [label="sentence 1"] + + low_rnn0 -> s00 + low_rnn0 -> s01 + } + + subgraph cluster_p1 { + label = "sentence 1" + low_rnn1 [label="low rnn step 1" shape=box] + s10 [label="sentence 0"] + s11 [label="sentence 1"] + low_rnn1 -> s10 + low_rnn1 -> s11 + } + + subgraph cluster_p2 { + label = "sentence 1" + low_rnn2 [label="low rnn step 0" shape=box] + s20 [label="sentence 0"] + s21 [label="sentence 1"] + low_rnn2 -> s20 + low_rnn2 -> s21 + } + + subgraph cluster_p3 { + label = "sentence 1" + low_rnn3 [label="low rnn step 1" shape=box] + s30 [label="sentence 0"] + s31 [label="sentence 1"] + low_rnn3 -> s30 + low_rnn3 -> s31 + } + + + chapter -> top_rnn0 + chapter -> top_rnn1 + + top_rnn0 -> p0 + top_rnn0 -> p1 + top_rnn1 -> p2 + top_rnn1 -> p3 + + + p0 -> low_rnn0 + p1 -> low_rnn1 + p2 -> low_rnn2 + p3 -> low_rnn3 + +} diff --git a/doc/fluid/images/rnn_2level_data.png b/doc/fluid/images/rnn_2level_data.png new file mode 100644 index 0000000000000000000000000000000000000000..4be81b2430717a6a506342a09fc26899568574c6 Binary files /dev/null and b/doc/fluid/images/rnn_2level_data.png differ diff --git a/doc/fluid/images/scope_variable_tensor.png b/doc/fluid/images/scope_variable_tensor.png new file mode 100644 index 0000000000000000000000000000000000000000..59b0de6fb36f9f6b469227c05760a7612bb30b4d Binary files /dev/null and b/doc/fluid/images/scope_variable_tensor.png differ diff --git a/doc/fluid/images/single-thread@3x.png b/doc/fluid/images/single-thread@3x.png new file mode 100644 index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a Binary files /dev/null and b/doc/fluid/images/single-thread@3x.png differ diff --git a/doc/fluid/images/sorted_input.png b/doc/fluid/images/sorted_input.png new file mode 100644 index 0000000000000000000000000000000000000000..ff601128368ee179e3fd33e5e295a9ddd3dcbaeb Binary files /dev/null and b/doc/fluid/images/sorted_input.png differ diff --git a/doc/fluid/images/sparse_update.graffle b/doc/fluid/images/sparse_update.graffle new file mode 100644 index 
0000000000000000000000000000000000000000..08d689a58f83698d8c1158ee3990ed8abf3a7a9a Binary files /dev/null and b/doc/fluid/images/sparse_update.graffle differ diff --git a/doc/fluid/images/sparse_update.png b/doc/fluid/images/sparse_update.png new file mode 100644 index 0000000000000000000000000000000000000000..8c872e6ac479f7d1b818a4a207956c43155d0ad7 Binary files /dev/null and b/doc/fluid/images/sparse_update.png differ diff --git a/doc/fluid/images/test.dot b/doc/fluid/images/test.dot new file mode 100644 index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a --- /dev/null +++ b/doc/fluid/images/test.dot @@ -0,0 +1,35 @@ + +digraph Test { + z -> generator -> G_img; + G_img -> discriminator -> D_f -> d_loss_f; + label0 -> d_loss_f -> d_loss; + + img -> discriminator -> D_t -> d_loss_t; + label1 -> d_loss_t -> d_loss; + + d_loss -> d_loss_t[color=red, style=dashed]; + d_loss -> d_loss_f[color=red, style=dashed]; + d_loss_t -> D_t[color=red, style=dashed]; + d_loss_f -> D_f[color=red, style=dashed]; + D_t -> discriminator[color=red, style=dashed]; + D_f -> discriminator[color=red, style=dashed]; + + D_f -> g_loss; + label2 -> g_loss; + + g_loss -> D_f[color=green, style=dashed]; + D_f -> discriminator[color=green, style=dashed]; + discriminator -> G_img[color=green, style=dashed]; + G_img -> generator[color=green, style=dashed]; + + discriminator [color=red, shape=box]; + generator [color=green, shape=box]; + z [shape=diamond]; + img [shape=diamond]; + label0 [shape=diamond]; + label1 [shape=diamond]; + label2 [shape=diamond]; + + d_loss [color=red]; + g_loss [color=green]; +} diff --git a/doc/fluid/images/test.dot.png b/doc/fluid/images/test.dot.png new file mode 100644 index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55 Binary files /dev/null and b/doc/fluid/images/test.dot.png differ diff --git a/doc/fluid/images/theta_star.gif b/doc/fluid/images/theta_star.gif new file mode 100644 index 0000000000000000000000000000000000000000..dd24d33e124396be3fc410c9b12f33148f64efe2 Binary files /dev/null and b/doc/fluid/images/theta_star.gif differ diff --git a/doc/fluid/images/timeline.jpeg b/doc/fluid/images/timeline.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..38ec3f80c982857531f30a8bb0fa26ea5bf05385 Binary files /dev/null and b/doc/fluid/images/timeline.jpeg differ diff --git a/doc/fluid/images/tracing.jpeg b/doc/fluid/images/tracing.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..3a49fc4f8a401a9463b0157e2f38c164ca02dcc5 Binary files /dev/null and b/doc/fluid/images/tracing.jpeg differ diff --git a/doc/fluid/images/transpiler.png b/doc/fluid/images/transpiler.png new file mode 100644 index 0000000000000000000000000000000000000000..422973c0dc7aa2b544d2fc86a97ace706388cb9e Binary files /dev/null and b/doc/fluid/images/transpiler.png differ diff --git a/doc/fluid/images/user_interface.png b/doc/fluid/images/user_interface.png new file mode 100644 index 0000000000000000000000000000000000000000..ffc94e3d8945ec6291460afd90e8fcc600828390 Binary files /dev/null and b/doc/fluid/images/user_interface.png differ diff --git a/source/index.rst b/doc/fluid/index_cn.rst similarity index 59% rename from source/index.rst rename to doc/fluid/index_cn.rst index 21c0c38f08eb01af3f3bb4b1837d088f6e8198b1..0baa349574e3cdcbe669f027bbbf1af591070b8d 100644 --- a/source/index.rst +++ b/doc/fluid/index_cn.rst @@ -4,27 +4,13 @@ contain the root `toctree` directive. 
############## -欢迎来到 Fluid +欢迎使用 Fluid ############## -.. todo:: - 内容简介,导引 - - .. toctree:: :maxdepth: 1 - quick_start/index.rst + + beginners_guide/index.rst user_guides/index.rst advanced_usage/index.rst - api_guides/index.rst - api_reference/index.rst - faq.rst - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - diff --git a/doc/fluid/index_en.rst b/doc/fluid/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..2bc76b58982cf50e637d15cca0c5d78166aa73a9 --- /dev/null +++ b/doc/fluid/index_en.rst @@ -0,0 +1,12 @@ + PaddlePaddle Fluid +========================== + +.. toctree:: + :maxdepth: 1 + + getstarted/index_en.rst + build_and_install/index_en.rst + design/index_en.rst + howto/index_en.rst + dev/index_en.rst + faq/index_en.rst diff --git a/doc/fluid/overview.md b/doc/fluid/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..a7c253a54aafc7dd576fc442ba4d1db1cd13beeb --- /dev/null +++ b/doc/fluid/overview.md @@ -0,0 +1,25 @@ +# 概览 + +使用文档部分将帮助您更好的了解和学习 PaddlePaddle,本单元将主要为您展示 **教程** 和 **API** 两个板块。 + + +## 教程 + +如果您想了解深度学习知识与 Fluid 使用方法,可以在教程部分查找相关内容。教程模块主要包含: + +- [新手入门](beginners_guide/index.html):包含安装说明和多个简单的模型案例供您快速上手 + +- [使用指南](user_guides/index.html):包含 Fluid 使用说明和已开源的[模型库](user_guidex/models/index.html)助您更好地应用Fluid + +- [进阶使用](advanced_usage/index.html):包含移动端部署、模型调优、书写Operator等高阶使用说明,使 Fluid 更贴合您的需求。 + + +## API + +如果您是PaddlePaddle的老用户,想查找与您项目相关的API,可以直接阅读: + +- [API Guide](api/api_guides/index.html):介绍 Fluid 主要 API 的功能以及说明文档的接口 + + +- [API](api/index.html):Fluid 已有 API 的设计思想与使用说明 + diff --git a/doc/fluid/read_source.md b/doc/fluid/read_source.md new file mode 100644 index 0000000000000000000000000000000000000000..bb6d4563f5617fb98af055bca2f6f0479bdb4393 --- /dev/null +++ b/doc/fluid/read_source.md @@ -0,0 +1,67 @@ +# PaddlePaddle Fluid Source Code Overview + +Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book + +Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework + +Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators + +Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory + +Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform + +# Compile Time + +The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto). + +```python +x = fluid.layers.data(name='x', shape=[13], dtype='float32') +y = fluid.layers.data(name='y', shape=[1], dtype='float32') + +y_predict = fluid.layers.fc(input=x, size=1, act=None) +cost = fluid.layers.square_error_cost(input=y_predict, label=y) +avg_cost = fluid.layers.mean(x=cost) + +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +sgd_optimizer.minimize(avg_cost) +``` + +- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#) +- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers) + - Every Layer has one or more operators and variables/parameters + - All the operators are defined at [`paddle/fluid/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators). 
Other worth-looking files: + - Base class: [`paddle/fluid/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h) + - Operator Registration: [`paddle/fluid/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_registry.h) + - Operator Lookup: [`paddle/fluid/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_info.h) +- Optimizer: `fluid.optimizer.SGD`. It does the following + - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)] + - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)] + +# Run Time + +The following **evaluates** the NN. Instantiates all the variables, operators. + +```python +place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) +exe = fluid.Executor(place) + +# Allocate memory. Initialize Parameter. +exe.run(fluid.default_startup_program()) + +# Allocate memory. Do computation. +exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) +``` + +- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h) + - The device handle are at [paddle/fluid/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h) +- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.cc)] + - Feeds the data: `feed=feeder.feed(data)` + - Evaluates all the operators + - Fetches the result: `fetch_list=[avg_cost]` +- Other worth looking files: + - Scope: [paddle/fluid/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/scope.h). Where all the variables live + - Variable: [paddle/fluid/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h). Where all the data (most likely tensors) live + - Tensor: [paddle/fluid/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h). Where we allocate memory through [`paddle/fluid/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory) diff --git a/doc/fluid/user_guides/design_idea/fluid_design_idea.md b/doc/fluid/user_guides/design_idea/fluid_design_idea.md new file mode 100644 index 0000000000000000000000000000000000000000..4b469ca3ac4b7ec5f18baf72dbfea43db2362a30 --- /dev/null +++ b/doc/fluid/user_guides/design_idea/fluid_design_idea.md @@ -0,0 +1,363 @@ +# Fluid设计思想 + +## 简介 + +本篇文档主要介绍Fluid底层的设计思想,帮助用户更好的理解框架运作过程。 + +阅读本文档,您将了解: + +- Fluid 内部的执行流程 +- Program 如何描述模型 +- Executor 如何执行运算 + + +## 1. Fluid内部执行流程 + +Fluid使用一种编译器式的执行流程,分为编译时和运行时两个部分,具体包括:编译器定义 Program ,创建Executor 运行 Program 。 + +本地训练任务执行流程图如下所示: +

+ +

+ + 1. 编译时,用户编写一段python程序,通过调用 Fluid 提供的算子,向一段 Program 中添加变量(Tensor)以及对变量的操作(Operators 或者 Layers)。用户只需要描述核心的前向计算,不需要关心反向计算、分布式下以及异构设备下如何计算。 + + 2. 原始的 Program 在平台内部转换为中间描述语言: `ProgramDesc`。 + + 3. 编译期最重要的一个功能模块是 `Transpiler`。`Transpiler` 接受一段 `ProgramDesc` ,输出一段变化后的 `ProgramDesc` ,作为后端 `Executor` 最终需要执行的 Fluid Program + + 4. 后端 Executor 接受 Transpiler 输出的这段 Program ,依次执行其中的 Operator(可以类比为程序语言中的指令),在执行过程中会为 Operator 创建所需的输入输出并进行管理。 + + + + +## 2. Program设计思想 + +用户完成网络定义后,一段 Fluid 程序中通常存在 2 段 Program: + + 1. fluid.default_startup_program:定义了创建模型参数,输入输出,以及模型中可学习参数的初始化等各种操作 + + default_startup_program 可以由框架自动生成,使用时无需显示地创建 + + 如果调用修改了参数的默认初始化方式,框架会自动的将相关的修改加入default_startup_program + + 2. fluid.default_main_program :定义了神经网络模型,前向反向计算,以及优化算法对网络中可学习参数的更新 + + 使用Fluid的核心就是构建起 default_main_program + + + +### Programs and Blocks +Fluid 的 Program 的基本结构是一些嵌套 blocks,形式上类似一段 C++ 或 Java 程序。 + +blocks中包含: + +- 本地变量的定义 +- 一系列的operator + +block的概念与通用程序一致,例如在下列这段C++代码中包含三个block: + +``` cpp +int main(){ //block 0 + int i = 0; + if (i<10){ //block 1 + for (int j=0;j<10;j++){ //block 2 + } + } + return 0; +} +``` + +类似的,在下列 Fluid 的 Program 包含3段block: + +```python +import paddle.fluid as fluid # block 0 + +limit = fluid.layers.fill_constant_batch_size_like( + input=label, dtype='int64', shape=[1], value=5.0) +cond = fluid.layers.less_than(x=label, y=limit) + +ie = fluid.layers.IfElse(cond) +with ie.true_block(): # block 1 + true_image = ie.input(image) + hidden = fluid.layers.fc(input=true_image, size=100, act='tanh') + prob = fluid.layers.fc(input=hidden, size=10, act='softmax') + ie.output(prob) + +with ie.false_block(): # block 2 + false_image = ie.input(image) + hidden = fluid.layers.fc( + input=false_image, size=200, act='tanh') + prob = fluid.layers.fc(input=hidden, size=10, act='softmax') + ie.output(prob) + +prob = ie() +``` +### BlockDesc and ProgramDesc + +用户描述的block与program信息在Fluid中以[protobuf](https://en.wikipedia.org/wiki/Protocol_Buffers) 格式保存,所有的`protobub`信息被定义在`framework.proto`中,在Fluid中被称为BlockDesc和ProgramDesc。ProgramDesc和BlockDesc的概念类似于一个[抽象语法树](https://en.wikipedia.org/wiki/Abstract_syntax_tree)。 + +`BlockDesc`中包含本地变量的定义`vars`,和一系列的operator`ops`: + +```cpp + message BlockDesc { + required int32 parent = 1; + repeated VarDesc vars = 2; + repeated OpDesc ops = 3; +} +``` +parent ID表示父块,因此block中的操作符可以引用本地定义的变量,也可以引用祖先块中定义的变量。 + +Program 中的每层 block 都被压平并存储在数组中。blocks ID是这个数组中块的索引。 + +```cpp +message ProgramDesc { + repeated BlockDesc blocks = 1; +} +``` + +### 使用Blocks的Operator + +[Programs and Blocks](#Programs and Blocks)的例子中,IfElseOp这个Operator包含了两个block——true分支和false分支。 + +下述OpDesc的定义过程描述了一个operator可以包含哪些属性: + +```cpp +message OpDesc { + AttrDesc attrs = 1; + ... +} +``` +属性可以是block的类型,实际上就是上面描述的block ID: +```cpp +message AttrDesc { + required string name = 1; + + enum AttrType { + INT = 1, + STRING = 2, + ... + BLOCK = ... + } + required AttrType type = 2; + + optional int32 block = 10; // when type == BLOCK + ... +} +``` + +## 3. Executor设计思想 + +Executor 在运行时将接受一个`ProgramDesc`、一个`block_id`和一个`Scope`。`ProgramDesc`是`block`的列表,每一项包含`block`中所有参数和`operator`的`protobuf`定义;`block_id`指定入口块;`Scope`是所有变量实例的容器。 + +完成的编译执行的具体过程如下图所示: + +

+ +

+ +1. Executor 为每一个block创建一个Scope,Block是可嵌套的,因此Scope也是可嵌套的 +2. 创建所有Scope中的变量 +3. 按顺序创建并执行所有operator + + + + +Executor的C++实现代码如下: + +```cpp +class Executor{ + public: + void Run(const ProgramDesc& pdesc, + Scope* scope, + int block_id) { + auto& block = pdesc.Block(block_id); + + //创建所有变量 + for (auto& var : block.AllVars()) + scope->Var(Var->Name()); + } + + //创建OP并按顺序执行 + for (auto& op_desc : block.AllOps()){ + auto op = CreateOp(*op_desc); + op->Run(*local_scope, place_); + } + } + }; +``` + +**创建Executor** + +Fluid中使用fluid.Executor(place)创建Executor,place属性由用户定义,代表程序将在哪里执行。 + +下例代码表示创建一个Executor,其运行场所在CPU内: + +```python +cpu=core.CPUPlace() +exe = fluid.Executor(cpu) +``` + +**运行Executor** + +Fluid使用Executor.run来运行程序。定义中通过Feed映射获取数据,通过fetch\_list获取结果: + +```python +... +x = numpy.random.random(size=(10, 1)).astype('float32') +outs = exe.run( + feed={'X': x}, + fetch_list=[loss.name]) +``` + + +## 代码实例 +本节通过[Fluid编程指南](../../beginners_guide/programming_guide/programming_guide.html)中简单的线性回归例子,为您介绍上述内容如何在代码中实现。 + +**定义Program** + +您可以随意定义自己的数据和网络结构,定义的结果都将作为一段 Program 被 Fluid 接收,Program 的基本结构是一些 blocks,本节的 Program 仅包含一个 block 0: + +```python +#加载函数库 +import paddle.fluid as fluid #block 0 +import numpy + +#定义数据 +train_data=numpy.array([[1.0],[2.0],[3.0],[4.0]]).astype('float32') +y_true = numpy.array([[2.0],[4.0],[6.0],[8.0]]).astype('float32') +#定义网络 +x = fluid.layers.data(name="x",shape=[1],dtype='float32') +y = fluid.layers.data(name="y",shape=[1],dtype='float32') +y_predict = fluid.layers.fc(input=x,size=1,act=None) +#定义损失函数 +cost = fluid.layers.square_error_cost(input=y_predict,label=y) +avg_cost = fluid.layers.mean(cost) +#定义优化方法 +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) +sgd_optimizer.minimize(avg_cost) +``` + +完成上述定义,也就是完成了 fluid.default_main_program 的构建过程,fluid.default_main_program 中承载着神经网络模型,前向反向计算,以及优化算法对网络中可学习参数的更新。 + +此时可以输出这段 Program 观察定义好的网络形态: +```python +print(fluid.default_main_program().to_string(True)) +``` +完整ProgramDesc可以在本地查看,本次仅节选前三个变量的结果如下: +``` +blocks { + idx: 0 + parent_idx: -1 + vars { + name: "mean_1.tmp_0" + type { + type: LOD_TENSOR + lod_tensor { + tensor { + data_type: FP32 + dims: 1 + } + } + } + persistable: false + } + vars { + name: "square_error_cost_1.tmp_1" + type { + type: LOD_TENSOR + lod_tensor { + tensor { + data_type: FP32 + dims: -1 + dims: 1 + } + lod_level: 0 + } + } + persistable: false + } + vars { + name: "square_error_cost_1.tmp_0" + type { + type: LOD_TENSOR + lod_tensor { + tensor { + data_type: FP32 + dims: -1 + dims: 1 + } + lod_level: 0 + } + } + persistable: false + ... 
+``` +从输出结果中可以看到,整个定义过程在框架内部转化为了一段ProgramDesc,以block idx为索引。本次线性回归模型中仅有1个block,ProgramDesc中也仅有block 0一段BlockDesc。 + +BlockDesc中包含定义的 vars 和一系列的 ops,以输入x为例,python代码中定义 x 是一个数据类型为"float 32"的1维数据: +```python +x = fluid.layers.data(name="x",shape=[1],dtype='float32') +``` +在BlockDesc中,变量x被描述为: +``` +vars { + name: "x" + type { + type: LOD_TENSOR + lod_tensor { + tensor { + data_type: FP32 + dims: -1 + dims: 1 + } + lod_level: 0 + } + } + persistable: false +``` +在Fluid中所有的数据类型都为LoD-Tensor,对于不存在序列信息的数据(如此处的变量X),其lod_level=0。 + +dims表示数据的维度,这里表示 x 的维度为[-1,1],其中-1是batch的维度,无法确定具体数值时,Fluid 自动用 -1 占位。 + +参数`persistable`表示该变量在整个训练过程中是否为持久化变量。 + +**创建Executor** + +Fluid使用Executor来执行网络训练,Executor运行细节请参考[Executor设计思想](#Executor设计思想)的介绍。作为使用者,实际并不需要了解内部机制。 + +创建Executor只需调用 fluid.Executor(place) 即可,在此之前请您依据训练场所定义place变量: +```python + #在CPU内执行训练 + cpu = fluid.core.CPUPlace() + #创建Executor + exe = fluid.Executor(cpu) +``` +**运行Executor** + +Fluid使用Executor.run来运行一段Program。 + +正式进行网络训练前,需先执行参数初始化。其中 defalut_startup_program 中定义了创建模型参数,输入输出,以及模型中可学习参数的初始化等各种操作。 +```python + #参数初始化 + exe.run(fluid.default_startup_program()) +``` +由于传入数据与传出数据存在多列,因此 fluid 通过 feed 映射定义数据的传输数据,通过 fetch_list 取出期望结果: +```python +#开始训练 + outs = exe.run( + feed={'x':train_data,'y':y_true}, + fetch_list=[y_predict.name,avg_cost.name]) +``` +上述代码段中定义了train_data传入x变量,y_true传入y变量,输出y的预测值和最后一轮cost值。 + +输出结果为: +``` +[array([[1.5248038], + [3.0496075], + [4.5744114], + [6.099215 ]], dtype=float32), array([1.6935859], dtype=float32)] +``` + +至此您已经了解了Fluid 内部的执行流程的核心概念,更多框架使用细节请参考[使用指南](../../user_guides/index.html)相关内容,[模型库](../../user_guides/models/index.html +)中也为您提供了丰富的模型示例以供参考。 diff --git a/doc/fluid/user_guides/design_idea/image/executor_design.png b/doc/fluid/user_guides/design_idea/image/executor_design.png new file mode 100644 index 0000000000000000000000000000000000000000..0b3b7ad72fd487e4193beb41db7b7df748a2844c Binary files /dev/null and b/doc/fluid/user_guides/design_idea/image/executor_design.png differ diff --git a/doc/fluid/user_guides/design_idea/image/fluid_process.png b/doc/fluid/user_guides/design_idea/image/fluid_process.png new file mode 100644 index 0000000000000000000000000000000000000000..12ab2b9a780e6b94fe6d7b6728f5501a62078424 Binary files /dev/null and b/doc/fluid/user_guides/design_idea/image/fluid_process.png differ diff --git a/doc/fluid/user_guides/howto/basic_concept/fluid_basic_concept.rst b/doc/fluid/user_guides/howto/basic_concept/fluid_basic_concept.rst new file mode 100644 index 0000000000000000000000000000000000000000..55c3c761f932713ffa2b462b35f9f46a8edae536 --- /dev/null +++ b/doc/fluid/user_guides/howto/basic_concept/fluid_basic_concept.rst @@ -0,0 +1,392 @@ +================================ +PaddleFluid设计思想和基本使用概念 +================================ + + + +Paddle Fluid 是用来让用户像 PyTorch 和 Tensorflow Eager Execution 一样执行程序。 +在这些系统中,不再有模型这个概念,应用也不再包含一个用于描述 Operator 图或者一系列层的符号描述, +而是像通用程序那样描述训练或者预测的过程。 + + +深度学习平台的演化 +================ + +时至今日,深度学习已成为事实上最流行的机器学习技术。学术界多年研究加上工业界的长期实践提出了若干有效的基本建模单元: +全连接,卷积,循环神经网络等;设计各类训练技巧:初始化方法,跨层连接,各类 norm 技术等; +发明了各种新的优化算法:Adadelta,Adam 等; +各类固定的网络结构:highway, residual, attention 等纷纷涌现,不胜枚举。 +学术界工业界多年的付出共同促成了深度学习方法今日的影响力。 + +学术研究和生产实践中积累了大量的知识,能够很好的解释神经网络中基本模块各自独的学习能力和特性。 +基本模块和训练技术的组合能够搭建出千变万化的神经网络模型。 +基本模块和训练技术是有限的,但他们的组合却是千变万化,这是深度学习方法的魅力所在,也是难度所在。 + +正是这样高度的模块化特性,研究者和工程师们都在努力避免重复造轮子以提高研究和生产的效率, +又进一步催生了深度学习平台技术的发展,深度学习框架已演变成为 AI 基础设施中重要的一部分。 +从 Theano,到 DistBelief,到 TensorFlow;从 Caffe 到 Caffe2; +从 Torch 到 PyTorch;从 PaddlePaddle 到 PaddleFluid, 
+深度学习平台技术也经历了两代的演化,并向着第三代平台技术迈进。 + +站在历史发展的今天,当我们准备切换尝试使用一个新的深度学习平台作为支持自己学习和研究的工具时, +平台技术都发生了哪些演化,能够为我们的带来什么便利呢? + +先让我们来看看深度学习框架解决的三大问题: + +- 如何描述计算以支持未来潜在会出现的新模型? +- 如何高效利用异构设备最大化算力? +- 如何利用网络中的计算机进行分布式计算来处理千万亿级别的数据? + +以上三个问题中的第一个和使用者研究者最为密切相关。 +这篇文章我们通过分析 PaddleFluid的设计理念, +来了解一个深度学习框架如何抽象深度学习模型,来看看我们的使用经验如何在不同深度学习平台之间过度和迁移。 + +如何描述计算 +============= + +让我们首先来看看 PaddleFluid 如何描述机器学习模型 + + +PaddleFluid之 :code:`Program` + +如何描述计算很大程度决定了一个神经网络框架计算功能的完备性。 +深度学习模型和方法历经二十多年的发展:“依次执行一组计算的前向, +再以和前向计算相反的顺序执行反向计算,中间无分支无交互”, +这样的模型结构已经无法满足研究者和千千万万框架使用者的想象力。 + +从 `PaddleFluid 的设计目标 `_ 来看, +在如何描述机器学习模型这一核心问题上,PaddleFluid 的目标是: +创造一种新的计算描述方式,不但能够描述至今为止人们已知的主流神经网络模型,并且能够支持未来会出现的任意模型。 + +PaddleFluid 是如何做到支持未来出现的新模型这一目标呢?PaddleFluid 的设计选择是: +对用户来说,用一段 :code:`Program` (在 PaddleFluid 内部会被转化为一种叫作 :code:`ProgramDesc` 的描述语言), +而不是用计算图来描述机器学习模型。 :code:`Program` 用符合用户使用直觉的方式, +提供一种新的描述语言能够描述任意复杂的机器学习模型。 + +对所有计算机专业同学学习编程语言的第一课一定是建立对“程序语言的三种执行结构:顺序执行,条件选择和循环执行”的认识。 +计算机世界的所有可计算逻辑都是由这三种执行结构表示,用这三种结构描述的逻辑是可计算的。那么同样道理, +对一个神经网络框架来说,如果可以和程序语言一样提供对这三种执行结构的支持,那么将可以描述任意复杂的, +可被计算机计算的机器学习模型。PaddleFluid通过提供对这三种执行结构的支持,来做到对任意复杂模型的描述。 + +具体来说: + +1. Fluid 的核心设计理念都可以类比到程序语言,如果已经有写程序的经验,那么使用 Fluid 构建神经网络模型的体验,将非常接近写程序; + +2. 在 PaddleFluid 中,用户不会显示地感知“计算图”这样的概念,一个机器学习模型被描述为一个 Fluid :code:`Program` (Fluid 内部称之为 :code:`ProgramDesc` ); + +- 一个 Fluid :code:`Program` 由一组嵌套的 :code:`Block` 构成。 :code:`Block` 的概念可以类比到 C++ 或是 Java 中的一对大括号,或是 Python 语言中的一个缩进快; +- :code:`Block` 中的计算由顺序执行、条件选择或者循环执行三种方式组合,构成复杂的计算逻辑。 + +3. Fluid :code:`Program` 中包含对计算和计算对象的描述。计算的描述称之为 Operator;计算作用的对象(或者说 Operator 的输入和输出)被统一为 Tensor。 + +在描述计算和计算的作用对象这一问题上,各个深度学习框架的选择是相同的,如果有一个平台的使用经验,那么将非常容易在各个平台之间进行迁移。 + +核心使用概念 +============= + +下面,我们将更详细地了解核心使用概念在PaddlePaddle的使用方法。 + +数据表示和计算的对象:Tensor +-------------------------- + +Tensor 是向量矩阵概念的扩展,是神经网络模型计算操作的基本对象。这在是今天所有主流深度学习平台的共同选择。 + +可以简单地将 Tensor 理解为一个 N 维向量,它可以有任意多的维度。一个 Tensor 具有两个基本特征: + +1. 数据类型:每个 Tensor 的所有元素具有同样的、已知的数据类型; + +2. 大小(或者说形状):即维度的个数(rank,阶)以及各维度的长度。 + +Tensor 某些维度的长度在定义模型阶段可能是未知的,在实际算法执行时才能确定。例如一个 mini-batch 中包含的样本数目(batch size),或者是一个 mini-batch 中序列的最大长度。 + +PaddleFluid中的Tensor +"""""""""""""""""""""" + +PaddleFluid 中也使用 Tensor 作为神经网络中输入输出数据的统一表示。Tensor 的概念在今天主流的深度学习平台中都是完全相同,可以在各个深度学习框架之间直接无缝迁移。 + +在 Fluid 中也同样存在三种特殊的 Tensor: + +1. 模型中的可学习参数 + +模型中的可学习参数生存期和整个训练任务一样长,会接受优化算法的更新。在 PaddleFluid 中同样以 :code:`Variable` 表示; +用户在绝大多数情况下都不需要自己来创建网络中的可学习参数,Fluid 为几乎常见的神经网络基本计算模块都提供了封装。 +以最简单的全连接模型为例,下面的代码片段会直接为全连接层创建连接权值 WW 和偏置( :code:`bias` )两个可学习参数, +无需显示地调用 variable 相关接口创建可学习参数。 + + +:: + + import paddle.fluid as fluid + + y = fluid.layers.fc(input=x, size=128, bias_attr=True) + +2. 输入输出Tensor + +整个神经网络的输入数据也是一个特殊的 Tensor,在这个 Tensor 中, +一些维度的大小在定义模型时无法确定(通常包括:batch size; +如果 mini-batch 之间,数据可变,也会包括序列的最大长度,图片的宽度和高度等),在定义模型时需要占位; +PaddleFluid 中使用 :code:`fluid.layers.data` 来接入输入数据, :code:`fluid.layer.data` 需要提供输入 Tensor 的 形状信息, +当遇到无法确定的维度 时, 相应维度指定为 None ,如下面的代码片段所示: + +:: + + import paddle.fluid as fluid + + x = fluid.layers.data(name="x", shape=[2, None, 3], dtype="int64") + +3. 
常量 Tensor 在 PaddleFluid 中需要通过组合 Tensor 和 :code:`fluid.layers.assign` 来实现。 + + +计算原语:Operation/Operator +---------------------------- + +Tensor 是今天所有主流深度学习框架的统一数据表示(输入、输出、中间计算结果、模型的可学习参数都是 Tensor)。 +另一方面,对数据的操作,在主流深度学习框架中也高度统一为:Operator/Operation。 +在中文中,通常我们会习惯将其称之为算子。 + +注:在 PaddleFluid 中使用 Operator 称呼对 Tensor 的操作。 + +Operation/Operator 接受多个 Tensor 作为输入,输出若干个 Tensor,表示了从输入到输出的变化。 + +PaddleFluid中的Operator +"""""""""""""""""""""""" + +PaddleFluid 支持的所有算子,可以在 `API 帮助文档 `_ 中查看。 + +为了便于用户使用,在 Python 端,Fluid 中的 Operator 被进一步封装入 :code:`paddle.fluid.layers` , +:code:`paddle.fluid.networks` 等模块。这是因为:一些常见的对Tensor的操作可能是有更多基础操作构成, +例如:l2 norm 内部由 reduce、elementwise_add,scale 等多个 Operator 组合计算逻辑完成, +为了提高使用的便利性,框架内部对基础 Operator 进行了一些封装,包括创建 Operator 依赖可学习参数, +可学习参数的初始化细节等,减少用户重复开发的成本。 + +对所有深度学习框架都面临同样的封装,在绝大多数情况下,用户很少会直接与框架底层的 Operator 直接打交道,而是使用框架提供的 layers,networks 等模块,降低开发的代码量。不论是什么样的概念,他们在各框架之间的本质和作用都是相同的:对 Tensor 的变换。 + +总结 +>>>>>> + +不论叫作 Operation、Operator 还是 layers,他们在各深度学习平台中的含义和作用都是相同的:对 Tensor 的变换。是一个深度学习平台提供的基础计算能力。可以在每个平台各自的 API 帮助文档中查到。 + +在各个深度学习平台都已加入 ONNX 项目的今天,每个深度学习平台提供给大家的基本算子都已趋同,与此同时,每个平台也各有其特点,会提供一些独特的算子,方便某一类任务的开发。 + +构建模型并执行 +-------------- + +整个训练任务运行方法如下: + +Fluid中的Program和Executor +""""""""""""""""""""""""""" + +1. Fluid 使用 :code:`Program` 描述神经网络模型,对用户来说,并没有计算图的概念。 +用户定义的所有 Tensor 以及对 Tensor 的操作:Operator 都会被加入一段 :code:`Program` 中; + +一段 Program 由嵌套的 :code:`Block` 构成,但用户无需显示地创建 :code:`Block` 或是显示地注意到 :code:`Block` 的存在; +在 Fluid 程序中, :code:`Block` 是在调用 :code:`while_op` , :code:`if_op` , :code:`parallel_do` 等特殊 :code:`Operator` 时,由这些 :code:`Operator` 来创建; +对用户使用来说,只需要知道自己正在向一段 Fluid Program 中添加变量( :code:`Tensor` )和操作( :code:`Operator` )即可。 + +2. Fluid 利用 :code:`Executor` 来执行一段 Fluid :code:`Program` 。 + +为进一步理解 Fluid 中 :code:`Executor` 的作用,需要先解释一下 Fluid 程序的执行流程。 下图展示单机上,Fluid 程序的执行流程: + +.. figure:: fluid_local_train.jpeg + + :scale: 50% + :align: center + + Figure.1 + + Fluid本地训练任务执行流程图 + +1. Fluid 设计思想和灵感非常类似于程序设计语言,和高级编译语言 C++/Java 编写程序的过程非常类似,Fluid 程序执行分为两个重要阶段:编译时和运行时; + +2. 编译期,用户通过调用 Fluid 提供的算子,向一段 :code:`Program` 中添加变量(Tensor)以及对变量的操作(Operators 或者 Layers)。用户只需要描述核心的前向计算,不需要关心反向计算,分布式下,异构设备下如何计算; + +3. 原始的 :code:`Program` 在平台内部转换为中间描述语言: :code:`ProgramDesc` ; + +4. 编译期最重要的一个功能模块是 Transpiler。Transpiler 接受一段 :code:`ProgramDesc` ,输出一段变化后的 :code:`ProgramDesc` ,作为后端 Executor 最终需要执行的 :code:`Fluid Program` ; + +最为常用的 Transipler 包括: + +1. 内存优化 Transipler:通过对变量读写依赖关系分析,插入内存回收 Operator 以维持运行过程中较小的内存开销; + +2. 分布式环境下的 Transpiler:接受用户定义的 local Program ,生成 Parameter Client 和 Parameter Server 执行的两段 :code:`Program` 。 + +3. 后端 Executor 接受 Transpiler 输出的这段 :code:`Program` ,依次执行其中的 Operator(可以类比为程序语言中的指令),在执行过程中会为 Operator 创建所需的输入输出并进行管理。 + +从上面的过程中可以看到,Fluid 程序的执行过程分为:编译器的定义 :code:`Program` ,和创建 :code:`Executor` 运行 :code:`Program` 。 + :code:`Executor` 执行一段 :code:`Program` 的过程是不可交互和不可中断的。 + +在 Fluid 中,可以创建多余一段 :code:`Program` 。默认情况,一个 PaddleFluid 程序中存在 2 段 Program: + +1. :code:`fluid.framework.default_startup_program` :其中定义了创建模型参数,输入输出,以及模型中可学习参数的初始化等各种操作; + +- :code:`default_startup_program` 可以由框架自动生成,使用时无需显示地创建; +- 如果调用修改了参数的默认初始化方式,框架会自动的将相关的修改加入 :code:`default_startup_program` 。 + +2. :code:`fluid.framework.default_main_program` :定义了神经网络模型,前向反向计算,以及优化算法对网络中可学习参数的更新; + +- 使用 Fluid 的核心就是构建起 :code:`default_main_program` 。 + +3. PaddleFluid 中的 :code:`Scope` 类似于 TensorFlow 中的 collection 这一概念,但在 Fluid 中 :code:`Scope` 是框架后端概念,用户无法直接操作。因此,在使用框架时无需关心。 + +总结 +""""" + +Fluid 中通过 Executor 来执行一段用户定义的 Fluid :code:`Program` 。 +1. Executor 连接了 Fluid 的前端和后端; + +2. 
Executor 接受用户定义的原始模型(一段 :code:`Program` ),通过调用系统中不同功能更的 :code:`Transpiler` 完成对原始 :code:`Program` 的变化,进行优化。 + +完整实例:如何完成一个机器学习模型的训练 +=================================== + + + +这一节,我们以 MNIST 手写数字识别问题 —— 机器学习任务的“Hello World”问题和数据,为例,通过一个可以运行的完整实例,来学习上文介绍的概念如何在PaddleFluid 平台使用。 + +步骤1:定义数据 +---------------- + +PaddleFluid 中以 :code:`fluid.layers.data` 来接收输入数据。 + +:: + + import numpy as np + + import paddle.fluid as fluid + import paddle.v2 as paddle + + # define the input layers for the network. + x = fluid.layers.data(name="img", shape=[1, 28, 28], dtype="float32") + y_ = fluid.layers.data(name="label", shape=[1], dtype="int64") + +Fluid 中 Tensor 的第 0 维度固定为 batch size。在上面代码段中,图像输入 :code:`x` 的形状为:[1, 28, 28]。这三个维度的含义分别是:channel 数目,图像的高度和宽度。 + +实际上 Fluid 框架内部,一幅图像输入是一个 4-D Tensor,所有 Tensor 的第 0 维固定为 batch size。框架内部会自动为batch size进行填充占位。无需对batch size指定填充占位。 + +如果除去 batch size(第 0 维度)外,如果 Tensor 某一维度的大小只能在运行时确定,可以在该位置上直接指定 :code:`None` 进行占位。 + +步骤2:定义模型 +-------------- + +通过调用 Fluid 提供的算子定义含有一个隐层的神经网络。Fluid 模型的分为模型结构和优化方法两部分。这一点与 TensorFlow 程序十分相似似,使用概念可以直接对应进行迁移。 + +:: + + # define the network topology. + y = fluid.layers.fc(input=x, size=10, act="softmax") + loss = fluid.layers.cross_entropy(input=y, label=y_) + avg_loss = fluid.layers.mean(loss) + + # define the optimization algorithm. + optimizer = fluid.optimizer.Adam(learning_rate=1e-3) + optimizer.minimize(avg_loss) + +Fluid 使用 Program 而不是计算图描述模型,一般情况下,用户无需关心 Program 的细节,当调用以上 layers 时,会向一个全局的 Program: :code:`fluid.framework.default_main_program` 中插入变量(Tensor)和对变量的操作(上述代码段中的 layers 和 optimzier)。 + +步骤3:参数初始化 +---------------- + +如上文介绍,Fluid 程序中的 Executor 是连接 Fluid 前端和后端的接口。 + +默认一个Fluid模型存在至少两段 Program。用于初始化网络中的可学习参数的那一段 :code:`Program` 叫作 :code:`fluid.default_startup_program()` 。 + +只有执行器 executor 可以执行 Fluid Program,因此,在初始化网络中的可学习参数之前,需要首先创建一个 Fluid executor。 + +:: + + # define the executor. + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + +在以上代码段中, :code:`place` 用于告诉 executor 一段 Fluid Program 在何种设备上执行, +常见的有 :code:`fluid.CPUPlace()` 和 :code:`fluid.CUDAPlace()` 。 + +步骤4:数据输入 + 执行模型训练 +---------------------------- + +我们在步骤 2 中定义的神经网络模型最终被插入一段叫做 :code:`fluid.framework.default_main_program` 的 Fluid Program 中。 + +网络可学习参数初始化之后,可以通过让执行器 Executor 执行这段 :code:`fluid.framework.default_main_program` 来进行训练。 + +:: + + train_reader = paddle.batch( + paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=5000), + batch_size=BATCH_SIZE) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y_]) + + for pass_id in range(100): + for batch_id, data in enumerate(train_reader()): + loss = exe.run( + fluid.framework.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_loss]) + print("Cur Cost : %f" % (np.array(loss[0])[0])) + +从上面的代码片段中可以看到,Fluid 程序的训练过程和 TensorFlow 程序的训练过程非常接近, +都放在一个 :code:`for` 循环中,循环读取一个 mini-batch 数据, +调用执行器执行 Fluid :code:`default_main_program` :接收 mini-batch 输入,在其上进行前向,反向和参数更新计算。 + +`注:上面程序使用了 Fluid 内置的 MNIST 数据,和我们提供给 TensorFlow 示例程序的 MNIST 数据完全一样。` + +步骤5:观察模型效果 +----------------- + +以上步骤已经构成了完整的 Tensorflow 模型训练程序,每个 batch 观察一次 loss,可以直观看到模型的迭代效果: + +.. figure:: fluid_mnist.png + + :scale: 40% + :align: center + + Figure.2 + + Fluid MNIST手写数字识别任务代价下降曲线 + +附:完整代码 +------------ + +:: + + import numpy as np + + import paddle.fluid as fluid + import paddle.v2 as paddle + + + def main(): + BATCH_SIZE = 128 + + # define the input layers for the network. 
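+        # note: the batch size (0th) dimension is filled in by the framework automatically, so it is not listed in shape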
+ x = fluid.layers.data(name="img", shape=[1, 28, 28], dtype="float32") + y_ = fluid.layers.data(name="label", shape=[1], dtype="int64") + + # define the network topology. + y = fluid.layers.fc(input=x, size=10, act="softmax") + loss = fluid.layers.cross_entropy(input=y, label=y_) + avg_loss = fluid.layers.mean(loss) + + optimizer = fluid.optimizer.Adam(learning_rate=5e-3) + optimizer.minimize(avg_loss) + + # define the executor. + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + train_reader = paddle.batch( + paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=5000), + batch_size=BATCH_SIZE) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y_]) + + for pass_id in range(100): + for batch_id, data in enumerate(train_reader()): + loss = exe.run( + fluid.framework.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_loss]) + print("Cur Cost : %f" % (np.array(loss[0])[0])) + + if __name__ == "__main__": + main() diff --git a/doc/fluid/user_guides/howto/basic_concept/fluid_local_train.jpeg b/doc/fluid/user_guides/howto/basic_concept/fluid_local_train.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..0a495901fafb85987e34acc3c454fb87e8160fca Binary files /dev/null and b/doc/fluid/user_guides/howto/basic_concept/fluid_local_train.jpeg differ diff --git a/doc/fluid/user_guides/howto/basic_concept/fluid_mnist.png b/doc/fluid/user_guides/howto/basic_concept/fluid_mnist.png new file mode 100644 index 0000000000000000000000000000000000000000..e5ad0ba058c863cf68ef0789e58fcf67b3115fdb Binary files /dev/null and b/doc/fluid/user_guides/howto/basic_concept/fluid_mnist.png differ diff --git a/doc/fluid/user_guides/howto/configure_simple_model/index.rst b/doc/fluid/user_guides/howto/configure_simple_model/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..5946a2ccb7e43004eae39ec4b3c6112c66c1fd04 --- /dev/null +++ b/doc/fluid/user_guides/howto/configure_simple_model/index.rst @@ -0,0 +1,88 @@ +.. 
_user_guide_configure_simple_model: + +############## +配置简单的网络 +############## + +在解决实际问题时,可以先从逻辑层面对问题进行建模,明确模型所需要的 **输入数据类型**、**计算逻辑**、**求解目标** 以及 **优化算法**。PaddlePaddle提供了丰富的算子来实现模型逻辑。下面以一个简单回归任务举例说明如何使用PaddlePaddle构建模型。该例子完整代码参见 `fit_a_line `_。 + +问题描述及定义 +############## + +问题描述: 给定一组数据 :math:``,求解出函数 :math:`f`,使得 :math:`y=f(x)`,其中 :math:`x\subset X` 表示一条样本的特征,为 :math:`13` 维的实数向量;:math:`y \subset Y` 为一实数表示该样本对应的值。 + +我们可以尝试用回归模型来对问题建模,回归问题的损失函数有很多,这里选择常用的均方误差。为简化问题,这里假定 :math:`f` 为简单的线性变换函数,同时选用随机梯度下降算法来求解模型。 + ++----------------+----------------------------------------------+ +| 输入数据类型 | 样本特征: 13 维 实数 | ++ +----------------------------------------------+ +| | 样本标签: 1 维 实数 | ++----------------+----------------------------------------------+ +| 计算逻辑 | 使用线性模型,产生 1维实数作为模型的预测输出 | ++----------------+----------------------------------------------+ +| 求解目标 | 最小化模型预测输出与样本标签间的均方误差 | ++----------------+----------------------------------------------+ +| 优化算法 | 随机梯度下降 | ++----------------+----------------------------------------------+ + +使用PaddlePadle建模 +################### + +从逻辑层面明确了输入数据格式、模型结构、损失函数以及优化算法后,需要使用PaddlePaddle提供的API及算子来实现模型逻辑。一个典型的模型主要包含4个部分,分别是:输入数据格式定义,模型前向计算逻辑,损失函数以及优化算法。 + +数据层 +------ + +PaddlePaddle提供了 :code:`fluid.layers.data()` 算子来描述输入数据的格式。 + +:code:`fluid.layers.data()` 算子的输出是一个Variable。这个Variable的实际类型是Tensor。Tensor具有强大的表征能力,可以表示多维数据。为了精确描述数据结构,通常需要指定数据shape以及数值类型type。其中shape为一个整数向量,type可以是一个字符串类型。目前支持的数据类型参考 :ref:`user_guide_paddle_support_data_types` 。 模型训练一般会使用batch的方式读取数据,而batch的size在训练过程中可能不固定。data算子会依据实际数据来推断batch size,所以这里提供shape时不用关心batch size,只需关心一条样本的shape即可,更高级用法请参考 :ref:`user_guide_customize_batch_size_rank`。从上知,:math:`x` 为 :math:`13` 维的实数向量,:math:`y` 为实数,可使用下面代码定义数据层: + +.. code-block:: python + + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + +该模型使用的数据比较简单,事实上data算子还可以描述变长的、嵌套的序列数据。也可以使用 :code:`open_files` 打开文件进行训练。更详细的文档可参照 :ref:`user_guide_prepare_data`。 + +前向计算逻辑 +------------ + +实现一个模型最重要的部分是实现计算逻辑,PaddlePaddle提供了丰富的算子。这些算子的封装粒度不同,通常对应一种或一组变换逻辑。算子输出即为对输入数据执行变换后的结果。用户可以灵活使用算子来完成复杂的模型逻辑。比如图像相关任务中会使用较多的卷积算子、序列任务中会使用LSTM/GRU等算子。复杂模型通常会组合多种算子,以完成复杂的变换。PaddlePaddle提供了非常自然的方式来组合算子,一般地可以使用下面的方式: + +.. code-block:: python + + op_1_out = fluid.layers.op_1(input=op_1_in, ...) + op_2_out = fluid.layers.op_2(input=op_1_out, ...) + ... + +其中op_1和op_2表示算子类型,可以是fc来执行线性变换(全连接),也可以是conv来执行卷积变换等。通过算子的输入输出的连接来定义算子的计算顺序以及数据流方向。上面的例子中,op_1的输出是op_2的输入,那么在执行计算时,会先计算op_1,然后计算op_2。更复杂的模型可能需要使用控制流算子,依据输入数据来动态执行,针对这种情况,PaddlePaddle提供了IfElseOp和WhileOp等。算子的文档可参考 :code:`fluid.layers`。具体到这个任务, 我们使用一个fc算子: + +.. code-block:: python + + y_predict = fluid.layers.fc(input=x, size=1, act=None) + +损失函数 +-------- + +损失函数对应求解目标,我们可以通过最小化损失来求解模型。大多数模型使用的损失函数,输出是一个实数值。但是PaddlePaddle提供的损失算子一般是针对一条样本计算。当输入一个batch的数据时,损失算子的输出有多个值,每个值对应一条样本的损失,所以通常会在损失算子后面使用mean等算子,来对损失做归约。模型在一次前向迭代后会得到一个损失值,PaddlePaddle会自动执行链式求导法则计算模型里面每个参数和变量对应的梯度值。这里使用均方误差损失: + +.. code-block:: python + + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + +优化方法 +-------- + +确定损失函数后,可以通过前向计算得到损失值,然后通过链式求导法则得到参数的梯度值。获取梯度值后需要更新参数,最简单的算法是随机梯度下降法::math:`w=w - \eta \cdot g`。但是普通的随机梯度下降算法存在一些问题: 比如收敛不稳定等。为了改善模型的训练速度以及效果,学术界先后提出了很多优化算法,包括: :code:`Momentum`、:code:`RMSProp`、:code:`Adam` 等。这些优化算法采用不同的策略来更新模型参数,一般可以针对具体任务和具体模型来选择优化算法。不管使用何种优化算法,学习率一般是一个需要指定的比较重要的超参数,需要通过实验仔细调整。这里采用随机梯度下降算法: + +.. 
code-block:: python + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + +更多优化算子可以参考 :code:`fluid.optimizer()` 。 + +下一步做什么? +############## + +使用PaddlePaddle实现模型时需要关注 **数据层**、**前向计算逻辑**、**损失函数** 和 **优化方法**。不同的任务需要的数据格式不同,涉及的计算逻辑不同,损失函数不同,优化方法也不同。PaddlePaddle提供了丰富的模型示例,可以以这些示例为参考来构建自己的模型结构。用户可以访问 `模型库 `_ 查看官方提供的示例。 diff --git a/doc/fluid/user_guides/howto/evaluation_and_debugging/debug/index.rst b/doc/fluid/user_guides/howto/evaluation_and_debugging/debug/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..0878e17b4069be6b08bc85a35e77ba6421633218 --- /dev/null +++ b/doc/fluid/user_guides/howto/evaluation_and_debugging/debug/index.rst @@ -0,0 +1,10 @@ +############ +Debug 工具 +############ + +PaddlePaddle 提供了如下方式方便 Debug 训练 情况 + +.. toctree:: + :maxdepth: 2 + + visualdl.md diff --git a/doc/fluid/user_guides/howto/evaluation_and_debugging/debug/visualdl.md b/doc/fluid/user_guides/howto/evaluation_and_debugging/debug/visualdl.md new file mode 100644 index 0000000000000000000000000000000000000000..747f32c31dffc873fa5ed0459046f93cd86834c0 --- /dev/null +++ b/doc/fluid/user_guides/howto/evaluation_and_debugging/debug/visualdl.md @@ -0,0 +1,220 @@ +# Visual DL工具 +

+ +

+ +## 介绍 +VisualDL是一个面向深度学习任务设计的可视化工具,包含了scalar、参数分布、模型结构、图像可视化等功能,项目正处于高速迭代中,新的组件会不断加入。 + +目前大多数DNN平台均使用Python作为配置语言,VisualDL原生支持python的使用, +通过在模型的Python配置中添加几行,便可以为训练过程提供丰富的可视化支持。 + +除了Python SDK之外,VisualDL底层采用C++编写,其暴露的C++ SDK也可以集成到其他平台中, +实现原生的性能和定制效果。 + +## 组件 +VisualDL 目前支持4种组件: + +- graph +- scalar +- image +- histogram + +### Graph +兼容 ONNX(Open Neural Network Exchange)[https://github.com/onnx/onnx], 通过与 python SDK的结合,VisualDL可以兼容包括 PaddlePaddle, pytorch, mxnet在内的大部分主流DNN平台。 + +

+ +

+ +### Scalar +可以用于展示训练测试的误差趋势 + +

+ +

+ +### Image +可以用于可视化任何tensor,或模型生成的图片 + +

+ +

+ +### Histogram + +用于可视化任何tensor中元素分布的变化趋势 + +

+ +

+ +## 快速尝试 +请使用下面的命令,来快速测试 VisualDL。 + +``` +# 安装,建議是在虚拟环境或anaconda下。 +pip install --upgrade visualdl + +# 运行一个例子,vdl_create_scratch_log 将创建测试日志 +vdl_create_scratch_log +visualDL --logdir=scratch_log --port=8080 + +# 访问 http://127.0.0.1:8080 +``` + +如果以上步骤出现问题,很可能是因为python或pip不同版本或不同位置所致,以下安装方法能解决。 + +## 使用 virtualenv 安装 + +[Virtualenv](https://virtualenv.pypa.io/en/stable/) 能创建独立Python环境,也能确保Python和pip的相对位置正确。 + +在macOS上,安装pip和virtualenv如下: +``` +sudo easy_install pip +pip install --upgrade virtualenv +``` + +在Linux上,安装pip和virtualenv如下: +``` +sudo apt-get install python3-pip python3-dev python-virtualenv +``` + +然后创建一个虚拟环境: +``` +virtualenv ~/vdl # for Python2.7 +virtualenv -p python3 ~/vdl for Python 3.x +``` + +```~/vdl``` 是你的Virtualenv目录, 你也可以选择任一目录。 + +激活虚拟环境如下: +``` +source ~/vdl/bin/activate +``` + +现在再安装 VisualDL 和运行范例: + +``` +pip install --upgrade visualdl + +# 运行一个例子,vdl_create_scratch_log 将创建测试日志 +vdl_create_scratch_log +visualDL --logdir=scratch_log --port=8080 + +# 访问 http://127.0.0.1:8080 +``` + +如果出现`TypeError: __init__() got an unexpected keyword argument 'file'`, 是因为protobuf不是3.5以上,运行`pip install --upgrade protobuf`就能解决。 + +如果在虚拟环境下仍然遇到安装问题,请尝试以下方法。 + + +## 使用 Anaconda 安装 + +Anaconda是一个用于科学计算的Python发行版,提供了包管理与环境管理的功能,可以很方便地解决多版本python并存、切换以及各种第三方包安装问题。 + +请根据[Anaconda下载网站](https://www.anaconda.com/download) 的指示去下载和安装Anaconda. +下载Python 3.6版本的command-Line installer. + +创建conda环境名字为```vdl```或任何名字: +``` +conda create -n vdl pip python=2.7 # or python=3.3, etc. +``` + +激活conda环境如下: +``` +source activate vdl +``` + +现在再安装 VisualDL 和运行范例: + +``` +pip install --upgrade visualdl + +# 运行一个例子,vdl_create_scratch_log 将创建测试日志 +vdl_create_scratch_log +visualDL --logdir=scratch_log --port=8080 + +# 访问 http://127.0.0.1:8080 +``` + +如果仍然遇到安装问题,请尝试以下用源代码安装方法。 + +### 使用代码安装 +``` +#建議是在虚拟环境或anaconda下。 +git clone https://github.com/PaddlePaddle/VisualDL.git +cd VisualDL + +python setup.py bdist_wheel +pip install --upgrade dist/visualdl-*.whl +``` + +如果打包和安装遇到其他问题,不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/develop/how_to_dev_frontend_cn.md) + + +## SDK +VisualDL 同时提供了python SDK 和 C++ SDK 来实现不同方式的使用。 + +### Python SDK +VisualDL 现在支持 Python 2和 Python 3。 + +以最简单的Scalar组件为例,尝试创建一个scalar组件并插入多个时间步的数据: + +```python +import random +from visualdl import LogWriter + +logdir = "./tmp" +logger = LogWriter(logdir, sync_cycle=10000) + +# mark the components with 'train' label. +with logger.mode("train"): + # create a scalar component called 'scalars/scalar0' + scalar0 = logger.scalar("scalars/scalar0") + +# add some records during DL model running. 
+for step in range(100): + scalar0.add_record(step, random.random()) +``` + +### C++ SDK +上面 Python SDK 中代码完全一致的C++ SDK用法如下 +```c++ +#include +#include +#include "visualdl/sdk.h" + +namespace vs = visualdl; +namespace cp = visualdl::components; + +int main() { + const std::string dir = "./tmp"; + vs::LogWriter logger(dir, 10000); + + logger.SetMode("train"); + auto tablet = logger.AddTablet("scalars/scalar0"); + + cp::Scalar scalar0(tablet); + + for (int step = 0; step < 1000; step++) { + float v = (float)std::rand() / RAND_MAX; + scalar0.AddRecord(step, v); + } + + return 0; +} +``` +## 启动Board +当训练过程中已经产生了日志数据,就可以启动board进行实时预览可视化信息 + +``` +visualDL --logdir +``` + +board 还支持一下参数来实现远程的访问: + +- `--host` 设定IP +- `--port` 设定端口 +- `--model_pb` 指定 ONNX 格式的模型文件 diff --git a/doc/fluid/user_guides/howto/evaluation_and_debugging/evaluation/index.rst b/doc/fluid/user_guides/howto/evaluation_and_debugging/evaluation/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..6f6698cadcba4d9645fdc4a8a74d899598b96d99 --- /dev/null +++ b/doc/fluid/user_guides/howto/evaluation_and_debugging/evaluation/index.rst @@ -0,0 +1,10 @@ +############ +模型评估和调试 +############ + +PaddlePaddle Fluid提供了常用的模型评估指标,并提供了VisualDL工具可视化模型效果。 + +.. toctree:: + :maxdepth: 2 + + metrics diff --git a/doc/fluid/user_guides/howto/evaluation_and_debugging/evaluation/metrics.rst b/doc/fluid/user_guides/howto/evaluation_and_debugging/evaluation/metrics.rst new file mode 100644 index 0000000000000000000000000000000000000000..f37968a50350a90e698cb1a63bd501635753e7fb --- /dev/null +++ b/doc/fluid/user_guides/howto/evaluation_and_debugging/evaluation/metrics.rst @@ -0,0 +1,62 @@ +############ +模型评估 +############ + +模型评估是用指标反映模型在预期目标下精度,根据模型任务决定观察指标,作为在训练中调整超参数,评估模型效果的重要依据。 +metric函数的输入为当前模型的预测preds和labels,输出是自定义的。metric函数和loss函数非常相似,但是metric并不是模型训练网络组成部分。 + +用户可以通过训练网络得到当前的预测preds和labels,在Python端定制metric函数;也可以通过定制c++ Operator的方式,在GPU上加速metric计算。 + +paddle.fluid.metrics模块包含该功能 + + +常用指标 +############ + +metric函数根据模型任务不同,指标构建方法因任务而异。 + +回归类型任务labels是实数,因此loss和metric函数构建相同,可参考MSE的方法。 +分类任务常用指标为分类指标,本文提到的一般是二分类指标,多分类和多标签需要查看对应的API文档。例如排序指标auc,多分类可以作为0,1分类任务,auc指标仍然适用。 +Fluid中包含了常用分类指标,例如Precision, Recall, Accuracy等,更多请阅读API文档。以 :ref:`Precision` 为例,具体方法为 + +.. code-block:: python + + >>> import paddle.fluid as fluid + >>> labels = fluid.layers.data(name="data", shape=[1], dtype="int32") + >>> data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32") + >>> pred = fluid.layers.fc(input=data, size=1000, act="tanh") + >>> acc = fluid.metrics.Precision() + >>> for pass in range(PASSES): + >>> acc.reset() + >>> for data in train_reader(): + >>> loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + >>> acc.update(preds=preds, labels=labels) + >>> numpy_acc = acc.eval() + + +其他任务例如MultiTask Learning,Metric Learning,Learning To Rank各种指标构造方法请参考API文档。 + +自定义指标 +############ +Fluid支持自定义指标,灵活支持各类计算任务。下文通过一个简单的计数器metric函数,实现对模型的评估。 +其中preds是模型预测值,labels是给定的标签。 + +.. 
code-block:: python + + >>> class MyMetric(MetricBase): + >>> def __init__(self, name=None): + >>> super(MyMetric, self).__init__(name) + >>> self.counter = 0 # simple counter + + >>> def reset(self): + >>> self.counter = 0 + + >>> def update(self, preds, labels): + >>> if not _is_numpy_(preds): + >>> raise ValueError("The 'preds' must be a numpy ndarray.") + >>> if not _is_numpy_(labels): + >>> raise ValueError("The 'labels' must be a numpy ndarray.") + >>> self.counter += sum(preds == labels) + + >>> def eval(self): + >>> return self.counter diff --git a/doc/fluid/user_guides/howto/evaluation_and_debugging/index.rst b/doc/fluid/user_guides/howto/evaluation_and_debugging/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..e2395bc09ce5a9aad899601e17742a73cb621082 --- /dev/null +++ b/doc/fluid/user_guides/howto/evaluation_and_debugging/index.rst @@ -0,0 +1,9 @@ +############### +模型评估/调试 +############### + +.. toctree:: + :maxdepth: 2 + + evaluation/metrics.rst + debug/visualdl.md diff --git a/doc/fluid/user_guides/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/user_guides/howto/inference/build_and_install_lib_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..7b6faef0bcada84801cc9610738d434f75c2b2fc --- /dev/null +++ b/doc/fluid/user_guides/howto/inference/build_and_install_lib_cn.rst @@ -0,0 +1,99 @@ +.. _install_or_build_cpp_inference_lib: + +安装与编译C++预测库 +=========================== + +直接下载安装 +------------- + +====================== ======================================== +版本说明 C++预测库 +====================== ======================================== +cpu_avx_mkl `fluid_inference.tgz `_ +cpu_avx_openblas `fluid_inference.tgz `_ +cpu_noavx_openblas `fluid_inference.tgz `_ +cuda7.5_cudnn5_avx_mkl `fluid_inference.tgz `_ +cuda8.0_cudnn5_avx_mkl `fluid_inference.tgz `_ +cuda8.0_cudnn7_avx_mkl `fluid_inference.tgz `_ +cuda9.0_cudnn7_avx_mkl `fluid_inference.tgz `_ +====================== ======================================== + +从源码编译 +---------- +用户也可以从 PaddlePaddle 核心代码编译C++预测库,只需在编译时配制下面这些编译选项: + +================= ========= +选项 值 +================= ========= +CMAKE_BUILD_TYPE Release +FLUID_INFERENCE_INSTALL_DIR 安装路径 +WITH_FLUID_ONLY ON(推荐) +WITH_SWIG_PY OFF(推荐 +WITH_PYTHON OFF(推荐) +WITH_GPU ON/OFF +WITH_MKL ON/OFF +ON_INFER ON(预测优化) +================= ========= + +建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 + +下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): + + .. code-block:: bash + + PADDLE_ROOT=/path/of/capi + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + mkdir build + cd build + cmake -DFLUID_INFERENCE_INSTALL_DIR=$PADDLE_ROOT \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_FLUID_ONLY=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_PYTHON=OFF \ + -DWITH_MKL=OFF \ + -DWITH_GPU=OFF \ + -DON_INFER=ON \ + .. + make + make inference_lib_dist + +成功编译后,使用C++预测库所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件;(3)版本信息与编译选项信息) +均会存放于PADDLE_ROOT目录中。目录结构如下: + + .. code-block:: text + + PaddleRoot/ + ├── CMakeCache.txt + ├── paddle + │   ├── include + │   │   └── paddle_inference_api.h + │   └── lib + │   ├── libpaddle_fluid.a + │   └── libpaddle_fluid.so + ├── third_party + │   ├── boost + │   │   └── boost + │   ├── eigen3 + │   │   ├── Eigen + │   │   └── unsupported + │   └── install + │   ├── gflags + │   ├── glog + │   ├── mklml + │   ├── protobuf + │   ├── snappy + │   ├── snappystream + │   └── zlib + └── version.txt + +version.txt 中记录了该预测库的版本信息,包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号,如: + + .. 
code-block:: text + + GIT COMMIT ID: 23da8defc8314b0c711130c1d9536e2cf2fb8414 + WITH_MKL: ON + WITH_MKLDNN: OFF + WITH_GPU: ON + CUDA version: 8.0 + CUDNN version: v5 diff --git a/doc/fluid/user_guides/howto/inference/image/image1.png b/doc/fluid/user_guides/howto/inference/image/image1.png new file mode 100644 index 0000000000000000000000000000000000000000..04e91da704b07fb68e2d7825e80d384bbfd5ba09 Binary files /dev/null and b/doc/fluid/user_guides/howto/inference/image/image1.png differ diff --git a/doc/fluid/user_guides/howto/inference/image/image2.png b/doc/fluid/user_guides/howto/inference/image/image2.png new file mode 100644 index 0000000000000000000000000000000000000000..2d4ca01ebbffaaad14a6a5eade02baaec4e732f2 Binary files /dev/null and b/doc/fluid/user_guides/howto/inference/image/image2.png differ diff --git a/doc/fluid/user_guides/howto/inference/image/image3.png b/doc/fluid/user_guides/howto/inference/image/image3.png new file mode 100644 index 0000000000000000000000000000000000000000..7eb8c16146175f9d28e0a216ac18788f601e600c Binary files /dev/null and b/doc/fluid/user_guides/howto/inference/image/image3.png differ diff --git a/doc/fluid/user_guides/howto/inference/image/image4.png b/doc/fluid/user_guides/howto/inference/image/image4.png new file mode 100644 index 0000000000000000000000000000000000000000..34a0c21880e29abb1cfbacba0ff8a1c2dde2e757 Binary files /dev/null and b/doc/fluid/user_guides/howto/inference/image/image4.png differ diff --git a/doc/fluid/user_guides/howto/inference/image/image5.png b/doc/fluid/user_guides/howto/inference/image/image5.png new file mode 100644 index 0000000000000000000000000000000000000000..4aa8529185854877e1b3c6bc6236fd8f9902a884 Binary files /dev/null and b/doc/fluid/user_guides/howto/inference/image/image5.png differ diff --git a/doc/fluid/user_guides/howto/inference/image/image6.png b/doc/fluid/user_guides/howto/inference/image/image6.png new file mode 100644 index 0000000000000000000000000000000000000000..499b1dc265d0101515183d0ff78ba6004ad82b07 Binary files /dev/null and b/doc/fluid/user_guides/howto/inference/image/image6.png differ diff --git a/doc/fluid/user_guides/howto/inference/image/image7.png b/doc/fluid/user_guides/howto/inference/image/image7.png new file mode 100644 index 0000000000000000000000000000000000000000..a9f40af362a6e9e507ca549ae846bb6fee28387a Binary files /dev/null and b/doc/fluid/user_guides/howto/inference/image/image7.png differ diff --git a/doc/fluid/user_guides/howto/inference/image/image8.png b/doc/fluid/user_guides/howto/inference/image/image8.png new file mode 100644 index 0000000000000000000000000000000000000000..6db078a7ae6efd8544ee37b4d57f21d0e111fe0c Binary files /dev/null and b/doc/fluid/user_guides/howto/inference/image/image8.png differ diff --git a/doc/fluid/user_guides/howto/inference/image/image9.png b/doc/fluid/user_guides/howto/inference/image/image9.png new file mode 100644 index 0000000000000000000000000000000000000000..f0dea70856a87854e20a5629093ca5ef7b9d0e51 Binary files /dev/null and b/doc/fluid/user_guides/howto/inference/image/image9.png differ diff --git a/doc/fluid/user_guides/howto/inference/image/model_graph_original.png b/doc/fluid/user_guides/howto/inference/image/model_graph_original.png new file mode 100644 index 0000000000000000000000000000000000000000..c1ce03d1cd77f7a8d07ccbca3964642f2faefe00 Binary files /dev/null and b/doc/fluid/user_guides/howto/inference/image/model_graph_original.png differ diff --git a/doc/fluid/user_guides/howto/inference/image/model_graph_trt.png 
b/doc/fluid/user_guides/howto/inference/image/model_graph_trt.png new file mode 100644 index 0000000000000000000000000000000000000000..6db0d35f0a9bdd7ec9376eb71f69b0ab16924181 Binary files /dev/null and b/doc/fluid/user_guides/howto/inference/image/model_graph_trt.png differ diff --git a/doc/fluid/user_guides/howto/inference/index.rst b/doc/fluid/user_guides/howto/inference/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..1e572b2d4df708b07906bc09096228230b8ff710 --- /dev/null +++ b/doc/fluid/user_guides/howto/inference/index.rst @@ -0,0 +1,13 @@ +############ +预测部署 +############ + +PaddlePaddle Fluid 提供了 C++ API 来支持模型的部署上线 + +.. toctree:: + :maxdepth: 2 + + build_and_install_lib_cn.rst + native_infer.md + paddle_tensorrt_infer.md + windows_cpp_inference.md diff --git a/doc/fluid/user_guides/howto/inference/native_infer.md b/doc/fluid/user_guides/howto/inference/native_infer.md new file mode 100644 index 0000000000000000000000000000000000000000..9a82c2982e9b80dfc2e940eb37f04b8922929d75 --- /dev/null +++ b/doc/fluid/user_guides/howto/inference/native_infer.md @@ -0,0 +1,136 @@ +# Paddle 预测 API + +为了更简单方便的预测部署,Fluid 提供了一套高层 API 用来隐藏底层不同的优化实现。 + +预测库包含: + +- 头文件 `paddle_inference_api.h` 定义了所有的接口 +- 库文件`libpaddle_fluid.so` 或 `libpaddle_fluid.a` + +下面是详细介绍 + +## PaddleTensor + +PaddleTensor 定义了预测最基本的输入输出的数据格式,常用字段: + +- `name` 用于指定输入数据对应的 模型中variable 的名字 +- `shape` 表示一个 Tensor 的 shape +- `data` 数据以连续内存的方式存储在`PaddleBuf` 中,`PaddleBuf` 可以接收外面的数据或者独立`malloc`内存,详细可以参考头文件中相关定义。 +- `dtype` 表示 Tensor 的数据类型 + +## 利用Config 创建不同引擎 + +高层 API 底层有多种优化实现,我们称之为 engine;不同 engine 的切换通过传递不同的 Config 实现重载 + +- `NativeConfig` 原生 engine,由 paddle 原生的 forward operator + 组成,可以天然支持所有paddle 训练出的模型, + +- `MixedRTConfig` TensorRT mixed engine 用于 GPU + 加速,用子图的方式支持了 [TensorRT] ,支持所有paddle + 模型,并自动切割部分计算子图到 TensorRT 上加速(WIP) + + +## 预测部署过程 + +总体上分为以下步骤 + +1. 用合适的配置创建 `PaddlePredictor` +2. 创建输入用的 `PaddleTensor`,传入到 `PaddlePredictor` 中 +3. 获取输出的 `PaddleTensor` ,将结果取出 + +下面完整演示一个简单的模型,部分细节代码隐去 + +```c++ +#include "paddle_inference_api.h" + +// 创建一个 config,并修改相关设置 +paddle::NativeConfig config; +config.model_dir = "xxx"; +config.use_gpu = false; +// 创建一个原生的 PaddlePredictor +auto predictor = + paddle::CreatePaddlePredictor(config); +// 创建输入 tensor +int64_t data[4] = {1, 2, 3, 4}; +paddle::PaddleTensor tensor; +tensor.shape = std::vector({4, 1}); +tensor.data.Reset(data, sizeof(data)); +tensor.dtype = paddle::PaddleDType::INT64; +// 创建输出 tensor,输出 tensor 的内存可以复用 +std::vector outputs; +// 执行预测 +CHECK(predictor->Run(slots, &outputs)); +// 获取 outputs ... +``` + +编译时,联编 `libpaddle_fluid.a/.so` 便可。 + + + +## 高阶使用 + +### 输入输出的内存管理 +`PaddleTensor` 的 `data` 字段是一个 `PaddleBuf`,用于管理一段内存用于数据的拷贝。 + +`PaddleBuf` 在内存管理方面有两种模式: + +1. 自动分配和管理内存 + + ```c++ + int some_size = 1024; + PaddleTensor tensor; + tensor.data.Resize(some_size); + ``` + +2. 外部内存传入 + ```c++ + int some_size = 1024; + // 用户外部分配内存并保证 PaddleTensor 使用过程中,内存一直可用 + void* memory = new char[some_size]; + + tensor.data.Reset(memory, some_size); + // ... 
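+    // (注:预测执行期间必须保证该外部内存一直可用)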
+ + // 用户最后需要自行删除内存以避免内存泄漏 + + delete[] memory; + ``` + +两种模式中,第一种比较方便;第二种则可以严格控制内存的管理,便于与 `tcmalloc` 等库的集成。 + +### 基于 contrib::AnalysisConfig 提升性能 (预发布) +*AnalyisConfig 目前正在预发布阶段,用 `namespace contrib` 进行了保护,后续可能会有调整* + +类似 `NativeConfig` , `AnalysisConfig` 可以创建一个经过一系列优化的高性能预测引擎。 其中包含了计算图的分析和优化,以及对一些重要 Op 的融合改写等,**对使用了 While, LSTM, GRU 等模型性能有大幅提升** 。 + +`AnalysisConfig` 的使用方法也和 `NativeConfig` 类似,但 *目前仅支持 CPU,正在增加对GPU 的支持* + +```c++ +AnalysisConfig config; +config.model_dir = xxx; +config.use_gpu = false; // 目前还不支持 GPU 的优化 +config.specify_input_name = true; // 需要指定输入的 name +``` + +这里需要注意的是,输入的 PaddleTensor 需要指定,比如之前的例子需要修改为 + +```c++ +auto predictor = + paddle::CreatePaddlePredictor(config); // 注意这里需要 AnalysisConfig +// 创建输入 tensor +int64_t data[4] = {1, 2, 3, 4}; +paddle::PaddleTensor tensor; +tensor.shape = std::vector({4, 1}); +tensor.data.Reset(data, sizeof(data)); +tensor.dtype = paddle::PaddleDType::INT64; +tensor.name = "input0"; // 注意这里的 name 需要设定 +``` + +### 性能建议 +1. 在 CPU型号允许的情况下,尽量使用带 AVX 和 MKL 的版本 +2. 复用输入和输出的 `PaddleTensor` 以避免频繁分配内存拉低性能 +3. CPU预测,可以尝试把 `NativeConfig` 改成成 `AnalysisConfig` 来进行优化 + +## 详细代码参考 + +[inference demos](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api/demo_ci) diff --git a/doc/fluid/user_guides/howto/inference/paddle_tensorrt_infer.md b/doc/fluid/user_guides/howto/inference/paddle_tensorrt_infer.md new file mode 100644 index 0000000000000000000000000000000000000000..226299c2ced5629cf7ba23958dffc0a7c0a5bfa4 --- /dev/null +++ b/doc/fluid/user_guides/howto/inference/paddle_tensorrt_infer.md @@ -0,0 +1,132 @@ +# 使用Paddle TensorRT预测 + +NVIDIA TensorRT 是一个高性能的深度学习预测库,可为深度学习推理应用程序提供低延迟和高吞吐量。Paddle 1.0 采用了子图的形式对TensorRT进行了初步集成,即我们可以使用该模块来提升Paddle模型的预测性能。该模块依旧在持续开发中,目前已支持的模型有:AlexNet, MobileNet, ResNet50, VGG19, ResNext, MobileNet-SSD等。在这篇文档中,我们将会对Paddle-TensorRT库的获取、使用和原理进行介绍。 + + +## 编译带`TensorRT`的预测库 + +**使用Docker编译预测库** + +1. 下载Paddle + + ``` + git clone https://github.com/PaddlePaddle/Paddle.git + ``` + +2. 获取docker镜像 + + ``` + nvidia-docker run --name paddle_trt -v $PWD/Paddle:/Paddle -it hub.baidubce.com/paddlepaddle/paddle:latest-dev /bin/bash + ``` + +3. 编译Paddle TensorRT + + ``` + # 在docker容器中执行以下操作 + cd /Paddle + mkdir build + cd build + cmake .. \ + -DWITH_FLUID_ONLY=ON \ + -DWITH_CONTRIB=OFF \ + -DWITH_MKL=OFF \ + -DWITH_MKLDNN=OFF \ + -DWITH_TESTING=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_PYTHON=OFF + + # 编译 + make -j + # 生成预测库 + make inference_lib_dist -j + ``` + +## Paddle TensorRT使用 + +[`paddle_inference_api.h`]('https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/paddle_inference_api.h') 定义了使用TensorRT的所有接口。 + +总体上分为以下步骤: +1. 创建合适的配置MixedRTConfig. +2. 根据配合创建 `PaddlePredictor`. +3. 创建输入的tensor. +4. 获取输出的tensor,输出结果. + +以下的代码展示了完整的过程: + +```c++ +#include "paddle_inference_api.h" + +using paddle::contrib::MixedRTConfig; +namespace paddle { + +void RunTensorRT(int batch_size, std::string model_dirname) { + // 1. 创建MixedRTConfig + MixedRTConfig config; + config.model_dir = model_dirname; + config.use_gpu = true; // 此处必须为true + config.fraction_of_gpu_memory = 0.2; + config.device = 0; // gpu id + // TensorRT 根据max batch size大小给op选择合适的实现, + // 因此max batch size大小和运行时batch的值最好相同。 + config.max_batch_size = batch_size; + + // 2. 根据config 创建predictor + auto predictor = CreatePaddlePredictor(config); + + // 3. 
创建输入 tensor
+  int height = 224;
+  int width = 224;
+  float data[batch_size * 3 * height * width] = {0};
+
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({batch_size, 3, height, width});
+  tensor.data = PaddleBuf(static_cast<void *>(data),
+                          sizeof(float) * (batch_size * 3 * height * width));
+  tensor.dtype = PaddleDType::FLOAT32;
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+
+  // 4. 创建输出 tensor
+  std::vector<PaddleTensor> outputs;
+  // 5. 预测
+  predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
+
+  const size_t num_elements = outputs.front().data.length() / sizeof(float);
+  auto *out_data = static_cast<float *>(outputs.front().data.data());
+  for (size_t i = 0; i < num_elements; i++) {
+    std::cout << "output: " << out_data[i] << std::endl;
+  }
+}
+} // namespace paddle
+
+int main() {
+  // 模型下载地址 http://paddle-inference-dist.cdn.bcebos.com/tensorrt_test/mobilenet.tar.gz
+  paddle::RunTensorRT(1, "./mobilenet");
+  return 0;
+}
+```
+编译过程可以参照[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api/demo_ci)。
+
+## 子图运行原理
+PaddlePaddle采用子图的形式对TensorRT进行集成。当模型加载后,神经网络可以表示为由变量和运算节点组成的计算图。Paddle TensorRT的功能是扫描整个计算图,发现其中可以用TensorRT优化的子图,并用TensorRT节点替换它们。在模型的推断期间,如果遇到TensorRT节点,Paddle会调用TensorRT库对该节点进行计算,其他节点则调用Paddle的原生实现。TensorRT在推断期间能够进行Op的横向和纵向融合,过滤掉冗余的Op,并为特定平台下的特定Op选择合适的kernel等,从而加快模型的预测速度。
+
+下图使用一个简单的模型展示了这个过程:
+
+**原始网络**
+

+（图:原始网络的计算图,对应 image/model_graph_original.png）
+
+**转换的网络**
+
+（图:转换后的计算图,对应 image/model_graph_trt.png）

+我们可以在原始模型网络中看到,绿色节点表示可以被TensorRT支持的节点,红色节点表示网络中的变量,黄色节点表示只能由Paddle原生实现执行的节点。原始网络中的这些绿色节点被提取出来汇集成子图,并由一个TensorRT节点代替,即转换后网络中的 `block-25` 节点。在网络运行过程中,如果遇到该节点,Paddle将调用TensorRT库来对其执行。
+
diff --git a/doc/fluid/user_guides/howto/inference/windows_cpp_inference.md b/doc/fluid/user_guides/howto/inference/windows_cpp_inference.md
new file mode 100644
index 0000000000000000000000000000000000000000..edb628e6f05572bc99c0688a7bf9884ebf18d783
--- /dev/null
+++ b/doc/fluid/user_guides/howto/inference/windows_cpp_inference.md
@@ -0,0 +1,126 @@
+Windows环境模型预测使用说明
+===========================
+
+环境部署
+--------
+
+### 硬件环境
+
+测试环境硬件配置:
+
+| 硬件 | 配置 |
+|----------|:-------------:|
+| CPU | I7-8700K |
+| 内存 | 16G |
+| 硬盘 | 1T hdd + 256G ssd |
+| 显卡 | GTX1080 8G |
+
+测试环境操作系统使用 win10 1803 版本。下载地址:
+
+### 环境配置步骤
+
+**一定要严格按照安装步骤顺序,否则会安装失败!**
+
+**安装vs2015**
+
+安装vs2015时,在安装选项中选择自定义,并把与 C、C++、VC++ 相关的功能都安装上。下载地址:
+
+**安装CUDA8**
+
+需要去NVIDIA官网[https://www.geforce.cn/drivers](https://www.geforce.cn/drivers)
+下载显卡对应的驱动,推荐 391 版本。

+（图:显卡驱动下载页面截图）
+安装时需要勾选自定义,并勾选全部安装内容。
+
+验证安装时,进入cmd,输入 `nvcc -V` 查看。

+（图:cmd 中执行 nvcc -V 的输出截图）
+ +如果有显卡安装驱动,也可以选择直接安装CUDA8.0,[https://developer.nvidia.com/cuda-80-ga2-download-archive](https://developer.nvidia.com/cuda-80-ga2-download-archive) + +**安装CUDNN** + +安装CUDNN只需要将文件中CUDNN +7下的文件复制到对应的CUDA安装目录下。文件名,cudnn-8.0-windows10-x64-v7.zip。这里提供了cudnn +7 +64位的版本。需要其他版本可在[https://developer.nvidia.com/cudnn](https://developer.nvidia.com/cudnn) +下载。 + +预测demo使用 +------------ + +解压Paddle,Release,fluid\_install\_dir压缩包。 + +进入Paddle/paddle/fluid/inference/api/demo\_ci目录,新建build目录并进入,然后使用cmake生成vs2015的solution文件。 +指令为: +```cmake +cmake .. -G \"Visual Studio 14 2015 Win64\" -DWITH\_GPU=ON +-DWITH\_MKL=OFF -DWITH\_STATIC\_LIB=ON -DCMAKE\_BUILD\_TYPE=Release +-DDEMO\_NAME=simple\_on\_word2vec +-DPADDLE\_LIB=D:\\to\_the\_paddle\_fluid.lib +-DCUDA\_LIB=D:\\CUDA\\v8.0\\lib\\x64 +``` + +注: + +-DDEMO\_NAME 是要编译的文件 + +-DPADDLE\_LIB 是fluid\_install\_dir路径,例如 +-DPADDLE\_LIB=D:\\fluid\_install\_dir + +-DCUDA\_LIB 是CUDA安装目录对应的文件夹 + +Cmake可以在官网进行下载,并添加到环境变量中。[[https://cmake.org/download/]{.underline}](https://cmake.org/download/) + +执行完毕后,build目录如图所示,打开 箭头指向的solution文件: + +

+（图:build 目录及要打开的 solution 文件截图）
+ +修改编译属性为/MT: + +

+（图:将编译属性修改为 /MT 的设置截图）
+ +编译生成选项改成Release。 + +

+（图:将编译生成选项切换为 Release 的截图）
+
+将提供的依赖包中 Release 目录下的 openblas 和模型文件,拷贝到编译生成的 Release 目录下。
+

+（图:拷贝 openblas 与模型文件后的 Release 目录截图）
+ +通过cmd进到Release目录执行: + + 1. 开启GLOG + + set GLOG\_v=3 + + 2. 进行预测 + + simple\_on\_word2vec.exe \--dirname=.\\word2vec.inference.model + +

+（图:预测程序运行结果截图）
+ +**FAQ:** + +路径中尽量不要包含空格,例如发现CUDA\_LIB路径是Program +Files(x86)可能会出错。可以将CUDA拷贝到一个新位置(这里直接拷贝就行) diff --git a/doc/fluid/user_guides/howto/prepare_data/feeding_data.rst b/doc/fluid/user_guides/howto/prepare_data/feeding_data.rst new file mode 100644 index 0000000000000000000000000000000000000000..c3bf033bb8316eeb4901c0cdc61e0556c8816dac --- /dev/null +++ b/doc/fluid/user_guides/howto/prepare_data/feeding_data.rst @@ -0,0 +1,169 @@ +.. _user_guide_use_numpy_array_as_train_data: + +########################### +使用Numpy Array作为训练数据 +########################### + +PaddlePaddle Fluid支持使用 :code:`fluid.layers.data()` 配置数据层; +再使用 Numpy Array 或者直接使用Python创建C++的 +:code:`fluid.LoDTensor` , 通过 :code:`Executor.run(feed=...)` 传给 +:code:`fluid.Executor` 或 :code:`fluid.ParallelExecutor` 。 + +数据层配置 +########## + +通过 :code:`fluid.layers.data()` 可以配置神经网络中需要的数据层。具体方法为: + +.. code-block:: python + + import paddle.fluid as fluid + + image = fluid.layers.data(name="image", shape=[3, 224, 224]) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + # use image/label as layer input + prediction = fluid.layers.fc(input=image, size=1000, act="softmax") + loss = fluid.layers.cross_entropy(input=prediction, label=label) + ... + +上段代码中,:code:`image` 和 :code:`label` 是通过 :code:`fluid.layers.data` +创建的两个输入数据层。其中 :code:`image` 是 :code:`[3, 224, 224]` 维度的浮点数据; +:code:`label` 是 :code:`[1]` 维度的整数数据。这里需要注意的是: + +1. Fluid中默认使用 :code:`-1` 表示 batch size 维度,默认情况下会在 :code:`shape` + 的第一个维度添加 :code:`-1` 。 所以 上段代码中, 我们可以接受将一个 + :code:`[32, 3, 224, 224]` 的numpy array传给 :code:`image` 。 如果想自定义batch size + 维度的位置的话,请设置 :code:`fluid.layers.data(append_batch_size=False)` 。 + 请参考进阶使用中的 :ref:`user_guide_customize_batch_size_rank` 。 + + +2. Fluid中用来做类别标签的数据类型是 :code:`int64`,并且标签从0开始。可用数据类型请参考 :ref:`user_guide_paddle_support_data_types`。 + +.. _user_guide_feed_data_to_executor: + +传递训练数据给执行器 +#################### + +:code:`Executor.run` 和 :code:`ParallelExecutor.run` 都接受一个 :code:`feed` 参数。 +这个参数是一个Python的字典。它的键是数据层的名字,例如上文代码中的 :code:`image`。 +它的值是对应的numpy array。 + +例如: + +.. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(feed={ + "image": numpy.random.random(size=(32, 3, 224, 224)).astype('float32'), + "label": numpy.random.random(size=(32, 1)).astype('int64') + }) + +进阶使用 +######## + +如何传入序列数据 +---------------- + +序列数据是PaddlePaddle Fluid支持的特殊数据类型,可以使用 :code:`LoDTensor` 作为 +输入数据类型。它需要用户: 1. 传入一个mini-batch需要被训练的所有数据; +2.每个序列的长度信息。 +用户可以使用 :code:`fluid.create_lod_tensor` 来创建 :code:`LoDTensor`。 + +传入序列信息的时候,需要设置序列嵌套深度,:code:`lod_level`。 +例如训练数据是词汇组成的句子,:code:`lod_level=1`;训练数据是 词汇先组成了句子, +句子再组成了段落,那么 :code:`lod_level=2`。 + +例如: + +.. code-block:: python + + sentence = fluid.layers.data(name="sentence", dtype="int64", shape=[1], lod_level=1) + + ... + + exe.run(feed={ + "sentence": create_lod_tensor( + data=numpy.array([1, 3, 4, 5, 3, 6, 8], dtype='int64').reshape(-1, 1), + lod=[4, 1, 2], + place=fluid.CPUPlace() + ) + }) + +训练数据 :code:`sentence` 包含三个样本,他们的长度分别是 :code:`4, 1, 2`。 +他们分别是 :code:`data[0:4]`, :code:`data[4:5]` 和 :code:`data[5:7]`。 + +如何分别设置ParallelExecutor中每个设备的训练数据 +------------------------------------------------ + +用户将数据传递给使用 :code:`ParallelExecutor.run(feed=...)` 时, +可以显示指定每一个训练设备(例如GPU)上的数据。 +用户需要将一个列表传递给 :code:`feed` 参数,列表中的每一个元素都是一个字典。 +这个字典的键是数据层的名字,值是数据层的值。 + +例如: + +.. 
code-block:: python + + parallel_executor = fluid.ParallelExecutor() + parallel_executor.run( + feed=[ + { + "image": numpy.random.random(size=(32, 3, 224, 224)).astype('float32'), + "label": numpy.random.random(size=(32, 1)).astype('int64') + }, + { + "image": numpy.random.random(size=(16, 3, 224, 224)).astype('float32'), + "label": numpy.random.random(size=(16, 1)).astype('int64') + }, + ] + ) + +上述代码中,GPU0会训练 32 个样本,而 GPU1训练 16 个样本。 + + +.. _user_guide_customize_batch_size_rank: + +自定义BatchSize维度 +------------------- + +PaddlePaddle Fluid默认batch size是数据的第一维度,以 :code:`-1` 表示。但是在高级 +使用中,batch_size 可以固定,也可以是其他维度或者多个维度来表示。这都需要设置 +:code:`fluid.layers.data(append_batch_size=False)` 来完成。 + +1. 固定batch size维度 + + .. code-block:: python + + image = fluid.layers.data(name="image", shape=[32, 784], append_batch_size=False) + + 这里,:code:`image` 永远是一个 :code:`[32, 784]` 大小的矩阵。 + +2. 使用其他维度表示batch size + + .. code-block:: python + + sentence = fluid.layers.data(name="sentence", + shape=[80, -1, 1], + append_batch_size=False, + dtype="int64") + + 这里 :code:`sentence` 的中间维度是batch size。这种数据排布会用在定长的循环神经 + 网络中。 + + +.. _user_guide_paddle_support_data_types: + +Fluid目前支持的数据类型 +----------------------- + +PaddlePaddle Fluid目前支持的数据类型包括: + + * float16: 部分操作支持 + * float32: 主要实数类型 + * float64: 次要实数类型,支持大部分操作 + * int32: 次要标签类型 + * int64: 主要标签类型 + * uint64: 次要标签类型 + * bool: 控制流数据类型 + * int16: 次要标签类型 + * uint8: 输入数据类型,可用于图像像素 \ No newline at end of file diff --git a/doc/fluid/user_guides/howto/prepare_data/index.rst b/doc/fluid/user_guides/howto/prepare_data/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..930a4dbcb2b4f3c6c699e354280d5710c4fd9a31 --- /dev/null +++ b/doc/fluid/user_guides/howto/prepare_data/index.rst @@ -0,0 +1,68 @@ +.. _user_guide_prepare_data: + +######## +准备数据 +######## + +PaddlePaddle Fluid支持两种传入数据的方式: + +1. Python Reader同步方式:用户需要使用 :code:`fluid.layers.data` +配置数据输入层,并在 :code:`fluid.Executor` 或 :code:`fluid.ParallelExecutor` +中,使用 :code:`executor.run(feed=...)` 传入训练数据。 + +2. py_reader接口异步方式:用户需要先使用 :code:`fluid.layers.py_reader` 配置数据输入层,然后使用 +:code:`py_reader` 的 :code:`decorate_paddle_reader` 或 :code:`decorate_tensor_provider` +方法配置数据源,再通过 :code:`fluid.layers.read_file` 读取数据。 + + +这两种准备数据方法的比较如下: + +======== ================================= ===================================== +对比项 Python Reader同步方式 py_reader接口异步方式 +======== ================================= ===================================== +API接口 :code:`executor.run(feed=...)` :code:`fluid.layers.py_reader` +数据格式 Numpy Array Numpy Array或LoDTensor +数据增强 Python端使用其他库完成 Python端使用其他库完成 +速度 慢 快 +推荐用途 调试模型 工业训练 +======== ================================= ===================================== + +Python Reader同步方式 +##################### + +Fluid提供Python Reader方式传入数据。 +Python Reader是纯的Python端接口,数据传入与模型训练/预测过程是同步的。用户可通过Numpy Array传入 +数据,具体请参考: + +.. toctree:: + :maxdepth: 2 + + feeding_data.rst + +Python Reader支持组batch、shuffle等高级功能,具体请参考: + +.. toctree:: + :maxdepth: 2 + + reader.md + +py_reader接口异步方式 +##################### + +Fluid提供PyReader异步数据传入方式,数据传入与模型训练/预测过程是异步的,效率较高。具体请参考: + +.. toctree:: + :maxdepth: 2 + + use_py_reader.rst + + +LoD-Tensor简介 +##################### + +LoD-Tensor是Fluid中特有的概念,它在Tensor基础上附加了序列信息,支持处理变长数据。具体请参考: + +.. 
toctree:: + :maxdepth:2 + + lod_tensor.md \ No newline at end of file diff --git a/doc/fluid/user_guides/howto/prepare_data/lod_tensor.md b/doc/fluid/user_guides/howto/prepare_data/lod_tensor.md new file mode 100644 index 0000000000000000000000000000000000000000..01b28d2a372cef4a6e33dc3cbe48b249bc20e39f --- /dev/null +++ b/doc/fluid/user_guides/howto/prepare_data/lod_tensor.md @@ -0,0 +1,239 @@ +# LoD-Tensor使用说明 + +LoD(Level-of-Detail) Tensor是Fluid中特有的概念,它在Tensor基础上附加了序列信息。Fluid中可传输的数据包括:输入、输出、网络中的可学习参数,全部统一使用LoD-Tensor表示。 + +阅读本文档将帮助您了解 Fluid 中的 LoD-Tensor 设计思想,以便您更灵活的使用这一数据类型。 + +## 变长序列的挑战 + +大多数的深度学习框架使用Tensor表示一个mini-batch。 + +例如一个mini-batch中有10张图片,每幅图片大小为32x32,则这个mini-batch是一个10x32x32的 Tensor。 + +或者在处理NLP任务中,一个mini-batch包含N个句子,每个字都用一个D维的one-hot向量表示,假设所有句子都用相同的长度L,那这个mini-batch可以被表示为NxLxD的Tensor。 + +上述两个例子中序列元素都具有相同大小,但是在许多情况下,训练数据是变长序列。基于这一场景,大部分框架采取的方法是确定一个固定长度,对小于这一长度的序列数据以0填充。 + +在Fluid中,由于LoD-Tensor的存在,我们不要求每个mini-batch中的序列数据必须保持长度一致,因此您不需要执行填充操作,也可以满足处理NLP等具有序列要求的任务需求。 + +Fluid引入了一个索引数据结构(LoD)来将张量分割成序列。 + + +## LoD 索引 + +为了更好的理解LoD的概念,本节提供了几个例子供您参考: + +**句子组成的 mini-batch** + +假设一个mini-batch中有3个句子,每个句子中分别包含3个、1个和2个单词。我们可以用(3+1+2)xD维Tensor 加上一些索引信息来表示这个mini-batch: + +``` +3 1 2 +| | | | | | +``` +上述表示中,每一个`|` 代表一个D维的词向量,数字3,1,2构成了 1-level LoD。 + +**递归序列** +让我们来看另一个2-level LoD-Tensor的例子:假设存在一个mini-batch中包含3个句子、1个句子和2个句子的文章,每个句子都由不同数量的单词组成,则这个mini-batch的样式可以看作: +``` +3 1 2 +3 2 4 1 2 3 +||| || |||| | || ||| +``` + +表示的LoD信息为: +``` +[[3,1,2]/*level=0*/,[3,2,4,1,2,3]/*level=1*/] +``` + +**视频的mini-batch** + +在视觉任务中,时常需要处理视频和图像这些元素是高维的对象,假设现存的一个nimi-batch包含3个视频,分别有3个,1个和2个帧,每个帧都具有相同大小:640x480,则这个mini-batch可以被表示为: +``` +3 1 2 +口口口 口 口口 +``` + +最底层tensor大小为(3+1+2)x640x480,每一个`口` 表示一个640x480的图像 + +**图像的mini-batch** + +在传统的情况下,比如有N个固定大小的图像的mini-batch,LoD-Tensor表示为: + +``` +1 1 1 1 1 +口口口口 ... 口 +``` +在这种情况下,我们不会因为索引值都为1而忽略信息,仅仅把LoD-Tensor看作是一个普通的张量: +``` +口口口口 ... 
口 +``` + +**模型参数** + +模型参数只是一个普通的张量,在Fluid中它们被表示为一个0-level LoD-Tensor。 + + +## LoDTensor的偏移表示 + +为了快速访问基本序列,Fluid提供了一种偏移表示的方法——保存序列的开始和结束元素,而不是保存长度。 + +在上述例子中,您可以计算基本元素的长度: +``` +3 2 4 1 2 3 +``` +将其转换为偏移表示: +``` +0 3 5 9 10 12 15 + = = = = = = + 3 2+3 4+5 1+9 2+10 3+12 +``` +所以我们知道第一个句子是从单词0到单词3,第二个句子是从单词3到单词5。 + +类似的,LoD的顶层长度 +``` +3 1 2 +``` +可以被转化成偏移形式: +``` +0 3 4 6 + = = = + 3 3+1 4+2 +``` + +因此该LoD-Tensor的偏移表示为: +``` +0 3 4 6 + 3 5 9 10 12 15 +``` + +## LoD-Tensor +一个LoD-Tensor可以被看作是一个树的结构,树叶是基本的序列元素,树枝作为基本元素的标识。 + +在 Fluid 中 LoD-Tensor 的序列信息有两种表述形式:原始长度和偏移量。在 Paddle 内部采用偏移量的形式表述 LoD-Tensor,以获得更快的序列访问速度;在 python API中采用原始长度的形式表述 LoD-Tensor 方便用户理解和计算,并将原始长度称为:`recursive_sequence_lengths` 。 + +以上文提到的一个2-level LoD-Tensor为例: +``` +3 1 2 +3 2 4 1 2 3 +||| || |||| | || ||| +``` + +- 以偏移量表示此 LoD-Tensor:[ [0,3,4,6] , [0,3,5,9,10,12,15] ], +- 以原始长度表达此 Lod-Tensor:recursive_sequence_lengths=[ [3-0 , 4-3 , 6-4] , [3-0 , 5-3 , 9-5 , 10-9 , 12-10 , 15-12] ]。 + +以文字序列为例: [3,1,2] 可以表示这个mini-batch中有3篇文章,每篇文章分别有3、2、1个句子,[3,2,4,1,2,3] 表示每个句子中分别含有3、2、4、1、2、3个字。 + +recursive_seq_lens 是一个双层嵌套列表,也就是列表的列表,最外层列表的size表示嵌套的层数,也就是lod-level的大小;内部的每个列表,对应表示每个lod-level下,每个元素的大小。 +```python +#查看lod-tensor嵌套层数 +print len(recursive_seq_lengths) +# output:2 + +#查看最基础元素个数 +print sum(recursive_seq_lengths[-1]) +# output:15 (3+2+4+1+2+3=15) + +``` + +## 代码示例 + +本节代码将根据指定的级别y-lod,扩充输入变量x。本例综合了LoD-Tensor的多个重要概念,跟随代码实现,您将: + +- 直观理解Fluid中 `fluid.layers.sequence_expand` 的实现过程 +- 掌握如何在Fluid中创建LoD-Tensor +- 学习如何打印LoDTensor内容 + + +**创建LoD-Tensor** + +Fluid中可以通过`fluid.create_lod_tensor()`创建一个LoD-Tensor,使用说明请参考[API reference](http://paddlepaddle.org/documentation/api/zh/develop/fluid.html#create-lod-tensor)。需要注意的是,这个API只能支持int64的数据,如果您希望处理float32的数据,推荐您使用下述方式创建lod_tensor: + +使用fluid.LoDTensor()创建一个LoD-Tensor,并为其指定数据、运算场所和LoD值: +```python +import paddle.fluid as fluid +import numpy as np + +def create_lod_tensor(data, lod, place): + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod(lod) + return res +``` +**定义计算过程** + +layers.sequence_expand通过获取 y 的 lod 值对 x 的数据进行扩充,关于`fluid.layers.sequence_expand` 的功能说明,请先阅读[API reference](http://www.paddlepaddle.org/documentation/api/zh/0.15.0/layers.html#sequence-expand)。 + +序列扩充代码实现: +```python +x = fluid.layers.data(name='x', shape=[1], dtype='float32', lod_level=0) +y = fluid.layers.data(name='y', shape=[1], dtype='float32', lod_level=1) +out = fluid.layers.sequence_expand(x=x, y=y, ref_level=0) +``` +*说明*:输出LoD-Tensor的维度仅与传入的真实数据维度有关,在定义网络结构阶段为x、y设置的shape值,仅作为占位,并不影响结果。 + +**创建Executor** +```python +place = fluid.CPUPlace() +exe = fluid.Executor(place) +exe.run(fluid.default_startup_program()) +``` + + +**准备数据** + +这里我们使用[偏移量](#LoDTensor的偏移表示)的方法表示Tensor的LoD索引: +假使x_d 为一个LoDTensor: +``` +x.lod = [[0,1,4]] +x.data = [[1],[2],[3],[4]] +x.dims = [4,1] +``` +y_d 也为一个LoDTensor: +``` +y.lod = [[0, 1, 4], + [0, 2, 3, 5, 6]] +``` +其中,输出值只与 y 的LoD值有关,y_d 的 data 值在这里并不参与计算,维度上与LoD[-1]一致即可。 + +预期输出结果为: +``` +#预期输出lod的原始长度 +out.lod = [ [1, 3, 3, 3]] +#预期输出结果 +out.data = [ [1],[2],[3],[4],[2],[3],[4],[2],[3],[4]] +``` +实现代码如下: +```python +x_d = create_lod_tensor(np.array([[1], [2],[3],[4]]), [[0,1,4]], place) +y_d = create_lod_tensor(np.array([[1],[1],[1],[1],[1],[1]]), [[0,1,4], [0,2,3,5,6]], place) +``` +**执行运算** + +在Fluid中,LoD>1的Tensor与其他类型数据一样,使用feed定义数据传入顺序。此外,由于输出results是带有LoD信息的Tensor,需在exe.run( )中添加`return_numpy=False`参数,获得LoD-Tensor的输出结果。 +```python +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) +results = exe.run(fluid.default_main_program(), + feed={'x':x_d, 'y': y_d }, 
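+                       # 注:输出 results 为带有 LoD 信息的 LoDTensor,因此需要设置 return_numpy=False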
+ fetch_list=[out],return_numpy=False) +``` +**查看LodTensor结果** + +由于LoDTensor的特殊属性,无法直接print查看内容,常用操作时将LoD-Tensor作为网络的输出fetch出来,然后执行 numpy.array(lod_tensor), 就能转成numpy array: + +```python +np.array(results[0]) +``` +输出结果为: +``` +array([[1],[2],[3],[4],[2],[3],[4],[2],[3],[4]]) +``` +可以看到与[准备数据](#准备数据)一节中的预期结果一致。 + +## 总结 + +至此,相信您已经基本掌握了LoD-Tensor的概念,尝试修改上述代码中的 x_d 与 y_d,观察输出结果,有助于您更好的理解这一灵活的结构。 + +更多LoDTensor的模型应用,可以参考新手入门中的[词向量](../../../beginners_guide/basics/word2vec/index.html)、[个性化推荐](../../../beginners_guide/basics/recommender_system/index.html)、[情感分析](../../../beginners_guide/basics/understand_sentiment/index.html)等指导教程。 + +更高阶的应用案例,请参考[模型库](../../../user_guides/models/index.html)中的相关内容。 diff --git a/doc/fluid/user_guides/howto/prepare_data/reader.md b/doc/fluid/user_guides/howto/prepare_data/reader.md new file mode 100644 index 0000000000000000000000000000000000000000..aa50e4d26166536eaf8044d527debd8ad46060f6 --- /dev/null +++ b/doc/fluid/user_guides/howto/prepare_data/reader.md @@ -0,0 +1,210 @@ +```eval_rst +.. _user_guide_reader: +``` + +# Python Reader + +During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following: + +- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items. +- A *reader creator*: A function that returns a reader function. +- A *reader decorator*: A function, which takes in one or more readers, and returns a reader. +- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. + +and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators. + +## Data Reader Interface + +*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows: + +``` +iterable = data_reader() +``` + +The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.) + +An example implementation for single item data reader creator is as follows: + +```python +def reader_creator_random_image(width, height): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height) + return reader +``` + +An example implementation for multiple item data reader creator is as follows: +```python +def reader_creator_random_image_and_label(width, height, label): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height), label + return reader +``` + +## Batch Reader Interface + +*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple. + +Here are some valid outputs: + +```python +# a mini batch of three data items. Each data item consist three columns of data, each of which is 1. +[(1, 1, 1), +(2, 2, 2), +(3, 3, 3)] + +# a mini batch of three data items, each data item is a list (single column). 
+[([1,1,1],), +([2,2,2],), +([3,3,3],)] +``` + +Please note that each item inside the list must be a tuple, below is an invalid output: +```python + # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). + # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1], + # or three columns of data, each of which is 1. +[[1,1,1], +[2,2,2], +[3,3,3]] +``` + +It is easy to convert from a reader to a batch reader: + +```python +mnist_train = paddle.dataset.mnist.train() +mnist_train_batch_reader = paddle.batch(mnist_train, 128) +``` + +It is also straight forward to create a custom batch reader: + +```python +def custom_batch_reader(): + while True: + batch = [] + for i in xrange(128): + batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended. + yield batch + +mnist_random_image_batch_reader = custom_batch_reader +``` + +## Usage + +Following is how we can use the reader with PaddlePaddle: +The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows: + +```python +# two data layer is created: +image_layer = paddle.layer.data("image", ...) +label_layer = paddle.layer.data("label", ...) + +# ... +batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128) +paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) +``` + +## Data Reader Decorator + +The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax. + +Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples: + +### Prefetch Data + +Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data. + +Use `paddle.reader.buffered` to prefetch data: + +```python +buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100) +``` + +`buffered_reader` will try to buffer (prefetch) `100` data entries. + +### Compose Multiple Data Readers + +For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). + +We can do the following : + +```python +def reader_creator_random_image(width, height): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height) + return reader + +def reader_creator_bool(t): + def reader: + while True: + yield t + return reader + +true_reader = reader_creator_bool(True) +false_reader = reader_creator_bool(False) + +reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader) +# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry. +# And we don't care about the second item at this time. +paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) +``` + +### Shuffle + +Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read. 
+ +Example: +```python +reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512) +``` + +## Q & A + +### Why does a reader return only a single entry, and not a mini batch? + +Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead if a single entry, the training code will be more complicated because it need to handle cases like a batch size 2). + +We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader. + +### Why do we need a batch reader, isn't is sufficient to give the reader and batch_size as arguments during training ? + +In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful. + +### Why use a dictionary instead of a list to provide mapping? + +Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`). + +### How to create a custom data reader creator ? + +```python +def image_reader_creator(image_path, label_path, n): + def reader(): + f = open(image_path) + l = open(label_path) + images = numpy.fromfile( + f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32') + images = images / 255.0 * 2.0 - 1.0 + labels = numpy.fromfile(l, 'ubyte', count=n).astype("int") + for i in xrange(n): + yield images[i, :], labels[i] # a single entry of data is created each time + f.close() + l.close() + return reader + +# images_reader_creator creates a reader +reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024) +paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...) +``` + +### How is `paddle.train` implemented + +An example implementation of paddle.train is: + +```python +def train(batch_reader, mapping, batch_size, total_pass): + for pass_idx in range(total_pass): + for mini_batch in batch_reader(): # this loop will never end in online learning. + do_forward_backward(mini_batch, mapping) +``` diff --git a/doc/fluid/user_guides/howto/prepare_data/use_py_reader.rst b/doc/fluid/user_guides/howto/prepare_data/use_py_reader.rst new file mode 100644 index 0000000000000000000000000000000000000000..6e1374f99c2e42898a8486da2e70526e66646c9e --- /dev/null +++ b/doc/fluid/user_guides/howto/prepare_data/use_py_reader.rst @@ -0,0 +1,139 @@ +.. _user_guide_use_py_reader: + +############################ +使用PyReader读取训练和测试数据 +############################ + +Paddle Fluid支持PyReader,实现Python端往C++端导入数据的功能。与 :ref:`user_guide_use_numpy_array_as_train_data` 不同,在使用PyReader时,Python端导入数据的过程和C++端 :code:`Executor::Run()` 读取数据的过程是异步进行的,且能与 :code:`double_buffer_reader` 配合以进一步提高数据读取性能。 + +创建PyReader对象 +################################ + +用户创建PyReader对象的方式为: + +.. 
code-block:: python + + import paddle.fluid as fluid + + py_reader = fluid.layers.py_reader(capacity=64, + shapes=[(-1,3,224,224), (-1,1)], + dtypes=['float32', 'int64'], + name='py_reader', + use_double_buffer=True) + +其中,capacity为PyReader对象的缓存区大小;shapes为batch各参量(如图像分类任务中的image和label)的尺寸;dtypes为batch各参量的数据类型;name为PyReader对象的名称;use_double_buffer默认为True,表示使用 :code:`double_buffer_reader` 。 + +若要创建多个不同的PyReader对象(如训练阶段和测试阶段往往需创建两个不同的PyReader对象),必须给不同的PyReader对象指定不同的name。比如,在同一任务中创建训练阶段和测试阶段的PyReader对象的方式为: + +.. code-block:: python + + import paddle.fluid as fluid + + train_py_reader = fluid.layers.py_reader(capacity=64, + shapes=[(-1,3,224,224), (-1,1)], + dtypes=['float32', 'int64'], + name='train', + use_double_buffer=True) + + test_py_reader = fluid.layers.py_reader(capacity=64, + shapes=[(-1,3,224,224), (-1,1)], + dtypes=['float32', 'int64'], + name='test', + use_double_buffer=True) + +注意, :code:`Program.clone()` 方法不能实现PyReader对象的复制,因此必须用以上方式创建训练阶段和测试阶段的不同 +PyReader对象。 + +由于 :code:`Program.clone()` 无法实现PyReader对象的复制,因此用户需通过 :code:`fluid.unique_name.guard()` +的方式实现训练阶段和测试阶段模型参数的共享,具体方式为: + +.. code-block:: python + + import paddle.fluid as fluid + import paddle.dataset.mnist as mnist + import paddle.v2 + + import numpy + + def network(is_train): + reader = fluid.layers.py_reader( + capacity=10, + shapes=((-1, 784), (-1, 1)), + dtypes=('float32', 'int64'), + name="train_reader" if is_train else "test_reader", + use_double_buffer=True) + img, label = fluid.layers.read_file(reader) + ... + # Here, we omitted the definition of loss of the model + return loss , reader + + train_prog = fluid.Program() + train_startup = fluid.Program() + + with fluid.program_guard(train_prog, train_startup): + with fluid.unique_name.guard(): + train_loss, train_reader = network(True) + adam = fluid.optimizer.Adam(learning_rate=0.01) + adam.minimize(train_loss) + + test_prog = fluid.Program() + test_startup = fluid.Program() + with fluid.program_guard(test_prog, test_startup): + with fluid.unique_name.guard(): + test_loss, test_reader = network(False) + +设置PyReader对象的数据源 +################################ +PyReader对象提供 :code:`decorate_tensor_provider` 和 :code:`decorate_paddle_reader` 方法,它们均接收一个Python生成器 :code:`generator` 对象作为数据源,两个方法的区别在于: + +1. :code:`decorate_tensor_provider` 方法:要求 :code:`generator` 每次产生一个 :code:`list` 或 :code:`tuple` 对象, :code:`list` 或 :code:`tuple` 对象中的每个元素为 :code:`LoDTensor` 类型或Numpy数组类型,且 :code:`LoDTensor` 或Numpy数组的 :code:`shape` 必须与创建PyReader对象时指定的 :code:`shapes` 参数完全一致。 + +2. :code:`decorate_paddle_reader` 方法:要求 :code:`generator` 每次产生一个 :code:`list` 或 :code:`tuple` 对象, :code:`list` 或 :code:`tuple` 对象中的每个元素为Numpy数组类型,但Numpy数组的 :code:`shape` 不必与创建PyReader对象时指定的 :code:`shapes` 参数完全一致, :code:`decorate_paddle_reader` 方法内部会对其进行 :code:`reshape` 操作。 + +使用PyReader进行模型训练和测试 +################################ + +具体方式为(接上述代码): + +.. 
code-block:: python + + place = fluid.CUDAPlace(0) + startup_exe = fluid.Executor(place) + startup_exe.run(train_startup) + startup_exe.run(test_startup) + + trainer = fluid.ParallelExecutor( + use_cuda=True, loss_name=train_loss.name, main_program=train_prog) + + tester = fluid.ParallelExecutor( + use_cuda=True, share_vars_from=trainer, main_program=test_prog) + + train_reader.decorate_paddle_reader( + paddle.v2.reader.shuffle(paddle.batch(mnist.train(), 512), buf_size=8192)) + + test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512)) + + for epoch_id in xrange(10): + train_reader.start() + try: + while True: + print 'train_loss', numpy.array( + trainer.run(fetch_list=[train_loss.name])) + except fluid.core.EOFException: + print 'End of epoch', epoch_id + train_reader.reset() + + test_reader.start() + try: + while True: + print 'test loss', numpy.array( + tester.run(fetch_list=[test_loss.name])) + except fluid.core.EOFException: + print 'End of testing' + test_reader.reset() + +具体步骤为: + +1. 在每个epoch开始前,调用 :code:`start()` 方法启动PyReader对象; + +2. 在每个epoch结束时, :code:`read_file` 抛出 :code:`fluid.core.EOFException` 异常,在捕获异常后调用 :code:`reset()` 方法重置PyReader对象的状态,以便启动下一轮的epoch。 diff --git a/doc/fluid/user_guides/howto/training/checkpoint_doc_cn.md b/doc/fluid/user_guides/howto/training/checkpoint_doc_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..c4afd536c67b24a17e4437ecedf779ddcddcbc98 --- /dev/null +++ b/doc/fluid/user_guides/howto/training/checkpoint_doc_cn.md @@ -0,0 +1,60 @@ +# Checkpoint功能使用指南 + +## 背景 +单机/多机在训练过程中会由于软件/硬件的问题出现异常,导致训练中断,进而导致训练无结果或结果不可用,浪费大量时间和机器性能。 + +## 目的 +Checkpoint功能能够在训练中途对训练数据中间数据进行保存,出现异常恢复训练的时候能够加载中途保存的数据继续训练, 实现单机/多机的容错训练的功能。 + +## 说明 +### 目前已实现的参数保存: +1. 基于Trainer 0 实现训练过程中的参数保存 +2. 基于PServer 实现了```Distribute Lookup Table```相关参数保存 +### Fluid Checkpoint 保存数据目录结构: + +``` +checkpoint_dir (用户定义的checkpoint目录) +├── checkpoint_0 (第一次保存) +│ ├── __lockup_table__ (Distribute Lookup Table 目录) +│ │ ├── table_pserver_0 (Pserver 0 号保存的lookup table 数据) +│ │ └── table_pserver_1 +│ ├── __model__ (model 目录) +│ │ └── var.w_1 +│ └── trainer_0 (trainer 自有数据保存) +│ ├── epoch_id +│ └── step_id +└── checkpoint_1 (第二次保存) +``` + +## 使用方法 +### 声明Fluid.CheckpointConfig +用户对checkpoint功能的配置,主要是配置对象```Fluid```中的```CheckpointConfig```. + +```CheckpointConfig``` 包括4个参数: + +| 参数 | 类型 | 说明 | +| - | :-: | - | +| checkpoint_dir | int| checkpoint存储目录 | +| max_num_checkpoints | int | 最大保存的checkpoint副本数 | +| epoch_interval | int | 每隔epoch_interval轮epoch | +| step_interval | int | 每隔step_interval轮step | + +### 在Fluid.Trainer对象的声明中加入Fluid.CheckpointConfig的声明 +Trainer的__init__方法的参数中包含了对```CheckpointConfig```, 需要传入在声明Trainer前声明的```CheckpointConfig```对象。 +如: +```python +config = CheckpointConfig( + checkpoint_dir = "/tmp/ckpt", max_num_checkpoints = 2, + epoch_interval = 2, step_interval = 10) +trainer = Trainer(..., checkpoint_config=config) +``` +定义和声明完成后, 训练在运行过程中就会在指定的step和epoch处进行保存,出现异常时,就会自动从最新的checkpoint目录进行参数恢复啦! + +## 相关API +[Trainer API 说明](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/trainer.py) + +## 注意 +1. 保证每个训练的```checkpoint_dir``` 与其他训练独立。 +2. 最大副本数量```max_num_checkpoints```需要根据磁盘容量以及模型的大小进行调整, 保证磁盘的可用性。 +3. ```epoch_interval``` 和 ```step_interval``` 不宜过小, 频繁的进行checkpoint会拖慢训练速度。 +4. 
**分布式训练**的过程中:每个Trainer都会在```checkpoint_dir```目录中保存当前Trainer的参数(只有Trainer 0会保存模型的参数),需要**分布式文件系统(HDFS等)**将同```checkpoint_dir```目录的数据进行合并才能得到完整的数据,恢复训练的时候需要用完整的数据进行恢复。 diff --git a/doc/fluid/user_guides/howto/training/checkpoint_doc_en.md b/doc/fluid/user_guides/howto/training/checkpoint_doc_en.md new file mode 100644 index 0000000000000000000000000000000000000000..14d37246ca0cab8715e244fda9624d0d59f8ec5f --- /dev/null +++ b/doc/fluid/user_guides/howto/training/checkpoint_doc_en.md @@ -0,0 +1,62 @@ +# Checkpoint User Guide + +## Background +In many cases, Stand-alone training and Distributed training can be aborted by the software problem or hardware problem. More seriously, we waste so much time and the performance of the machine but get nothing, which makes us frustrating and we have to restart it again. + +## Purpose +The feature of ```Checkpoint``` can save Intermediate model variables, lookup table variable, and other needs data in checkpoint directory. When the exception occurs, we can load these variables from the checkpoint directory immediately. +## Introduce +### Complete Features Currently: +1. The Trainer 0 will save model variables in training. +2. Each of the Trainer will save its own arguments needed. +3. Each of the Parameter Server will save ```Distribute Lookup Table``` variables in training. +### Fluid Checkpoint directory structure: + +``` +checkpoint_dir (the checkpoint directory user define) +├── checkpoint_0 (the first save directory) +│ ├── __lockup_table__ (Distribute Lookup Table directory) +│ │ ├── table_pserver_0 (Lookup table's data about Pserver 0) +│ │ └── table_pserver_1 +│ ├── __model__ (model directory) +│ │ └── var.w_1 +│ └── trainer_0 (each trainer will save its own data) +│ ├── epoch_id +│ └── step_id +└── checkpoint_1 (the second save directory) +``` + +## usage +### Fluid.CheckpointConfig construct +When the user wants to use ```Checkpoint``` feature, the main thing user have to do is declare ```CheckpointConfig``` and construct it. + +```CheckpointConfig``` has 4 member variables need to be initialized: + +| Member Variable | Type | Comment | +| - | :-: | - | +| checkpoint_dir | int| checkpoint directory | +| max_num_checkpoints | int | Maximum number of checkpoint copies | +| epoch_interval | int | epoch interval times | +| step_interval | int | step interval times | + +### Add Fluid.CheckpointConfig's declaration in Fluid.Trainer +Because the initialization of Trainer needs an instance of ```CheckpointConfig```., we should declare ```CheckpointConfig``` in ```Fluid``` first. + +For example: +```python +config = CheckpointConfig( + checkpoint_dir = "/tmp/ckpt", max_num_checkpoints = 2, + epoch_interval = 2, step_interval = 10) +trainer = Trainer(..., checkpoint_config=config) +``` + +After all the things done, the train will save checkpoint at the specified epoch and step, when the train is aborted, the user can restart it, the train will restore from the latest copy. + +## Related API +[Related Trainer API](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/trainer.py) + +## Attention +1. Make the ```checkpoint_dir``` only be used by one train job. +2. The number of ```max_num_checkpoints``` need to be adjusted by the disk size and model size. +3. Too frequently to slow down the train speed, so too ```small epoch_interval``` and ```step_interval``` are not suitable. +4. **In distributed train**, each Trainer will save arguments in its ```checkpoint_dir``` (Only Trainer 0 will save model variables). 
We need **distributed file system (HDFS, etc)** to merge all the ```checkpoint_dir``` to get the whole data. diff --git a/doc/fluid/user_guides/howto/training/cluster_howto.rst b/doc/fluid/user_guides/howto/training/cluster_howto.rst new file mode 100644 index 0000000000000000000000000000000000000000..b6da64b560157c2f63393005281b3241e3721e0e --- /dev/null +++ b/doc/fluid/user_guides/howto/training/cluster_howto.rst @@ -0,0 +1,229 @@ +.. _cluster_howto + +Fluid分布式训练使用手册 +==================== + +分布式训练基本思想 +--------------- + +分布式深度学习训练通常分为两种并行化方法:数据并行,模型并行,参考下图: + +.. image:: src/parallelism.png + +在模型并行方式下,模型的层和参数将被分布在多个节点上,模型在一个mini-batch的前向和反向训练中,将经过多次跨\ +节点之间的通信。每个节点只保存整个模型的一部分;在数据并行方式下,每个节点保存有完整的模型的层和参数,每个节点\ +独自完成前向和反向计算,然后完成梯度的聚合并同步的更新所有节点上的参数。Fluid目前版本仅提供数据并行方式,另外\ +诸如模型并行的特例实现(超大稀疏模型训练)功能将在后续的文档中予以说明。 + +在数据并行模式的训练中,Fluid使用了两种通信模式,用于应对不同训练任务对分布式训练的要求,分别为RPC通信和Collective +通信。其中RPC通信方式使用 `gRPC `_ ,Collective通信方式使用 +`NCCL2 `_ 。 + +.. csv-table:: 下面是一个RPC通信和Collective通信的横向对比: + :header: "Feature", "Collective", "RPC" + + "Ring-Based通信", "Yes", "No" + "异步训练", "Yes", "Yes" + "分布式模型", "No", "Yes" + "容错训练", "No", "Yes" + "性能", "Faster", "Fast" + +- RPC通信方式的结构: + + .. image:: src/dist_train_pserver.png + + 使用RPC通信方式的数据并行分布式训练,会启动多个pserver进程和多个trainer进程,每个pserver进程\ + 会保存一部分模型参数,并负责接收从trainer发送的梯度并更新这些模型参数;每个trainer进程会保存一份\ + 完整的模型,并使用一部分数据进行训练,然后向pserver发送梯度,最后从pserver拉取更新后的参数。 + + pserver进程可以在和trainer完全不同的计算节点上,也可以和trainer公用节点。一个分布式任务所需要的\ + pserver进程个数通常需要根据实际情况调整,以达到最佳的性能,然而通常来说pserver的进程不会比trainer\ + 更多。 + + 在使用GPU训练时,pserver可以选择使用GPU或只使用CPU,如果pserver也使用GPU,则会增加一次从CPU拷贝\ + 接收到的梯度数据到GPU的开销,在某些情况下会导致整体训练性能降低。 + +- NCCL2通信方式的结构: + + .. image:: src/dist_train_nccl2.png + + 使用NCCL2(Collective通信方式)进行分布式训练,是不需要启动pserver进程的,每个trainer进程都保存\ + 一份完整的模型参数,在完成计算梯度之后通过trainer之间的相互通信,Reduce梯度数据到所有节点的所有设备\ + 然后每个节点在各自完成参数更新。 + +使用parameter server方式的训练 +------------------------------ + +使用 :code:`transpiler` API可以把单机可以执行的程序快速转变成可以分布式执行的程序。在不同的服务器节点 +上,通过传给 :code:`transpiler` 对应的参数,以获取当前节点需要执行的 :code:`Program` 。 + + +.. csv-table:: 需要配置参数包括 + :header: "参数", "说明" + + "role", "\ **必选**\ 区分作为pserver启动还是trainer启动,不传给transpile,也可以用其他的变量名或环境变量" + "trainer_id", "\ **必选**\ 如果是trainer进程,用于指定当前trainer在任务中的唯一id,从0开始,在一个任务中需保证不重复" + "pservers", "\ **必选**\ 当前任务所有pserver的ip:port列表字符串,形式比如:127.0.0.1:6170,127.0.0.1:6171" + "trainers", "\ **必选**\ trainer节点的个数" + "sync_mode", "\ **可选**\ True为同步模式,False为异步模式" + "startup_program", "\ **可选**\ 如果startup_program不是默认的fluid.default_startup_program(),需要传入此参数" + "current_endpoint", "\ **可选**\ 只有NCCL2模式需要传这个参数" + +一个例子,假设有两个节点,分别是 :code:`192.168.1.1` 和 :code:`192.168.1.2` ,使用端口6170,启动4个trainer, +则代码可以写成: + +.. 
code-block:: python + + role = "PSERVER" + trainer_id = 0 # get actual trainer id from cluster + pserver_endpoints = "192.168.1.1:6170,192.168.1.2:6170" + current_endpoint = "192.168.1.1:6170" # get actual current endpoint + trainers = 4 + t = fluid.DistributeTranspiler() + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + if role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif role == "TRAINER": + train_loop(t.get_trainer_program()) + + +选择同步或异步训练 +++++++++++++++++++ + +Fluid分布式任务可以支持同步训练或异步训练,在同步训练方式下,所有的trainer节点,会在每个mini-batch +同步地合并所有节点的梯度数据并发送给parameter server完成更新,在异步训练方式下,每个trainer没有相互\ +同步等待的过程,可以独立的parameter server的参数。通常情况下,使用异步训练方式,可以在trainer节点\ +更多的时候比同步训练方式有更高的总体吞吐量。 + +在调用 :code:`transpile` 函数时,默认会生成同步训练的分布式程序,通过指定 :code:`sync_mode=False` +参数即可生成异步训练的程序: + +.. code-block:: python + + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers, sync_mode=False) + + + +选择是否使用分布式embedding表进行训练 ++++++++++++++++++++++++++++++++++ + +embedding被广泛应用在各种网络结构中,尤其是文本处理相关的模型。在某些场景,例如推荐系统或者搜索引擎中, +embedding的feature id可能会非常多,当feature id达到一定数量时,embedding参数会变得很大,一方面可能 +单机内存无法存放导致无法训练,另一方面普通的训练模式每一轮迭代都需要同步完整的参数,参数太大会让通信变得 +非常慢,进而影响训练速度。 + +Fluid支持千亿量级超大规模稀疏特征embedding的训练,embedding参数只会保存在parameter server上,通过 +参数prefetch和梯度稀疏更新的方法,大大减少通信量,提高通信速度。 + +该功能只对分布式训练有效,单机无法使用。 +需要配合稀疏更新一起使用。 + +使用方法,在配置embedding的时候,加上参数 :code:`is_distributed=True` 以及 :code:`is_sparse=True` 即可。 +参数 :code:`dict_size` 定义数据中总的id的数量,id可以是int64范围内的任意值,只要总id个数小于等于dict_size就可以支持。 +所以配置之前需要预估一下数据中总的feature id的数量。 + +.. code-block:: python + + emb = fluid.layers.embedding( + is_distributed=True, + input=input, + size=[dict_size, embedding_width], + is_sparse=True) + + +选择参数分布方法 +++++++++++++++++ + +参数 :code:`split_method` 可以指定参数在parameter server上的分布方式。 + +Fluid默认使用 `RoundRobin `_ +方式将参数分布在多个parameter server上。此方式在默认未关闭参数切分的情况下,参数会较平均的分布在所有的 +parameter server上。如果需要使用其他,可以传入其他的方法,目前可选的方法有: :code:`RoundRobin` 和 +:code:`HashName` 。也可以使用自定义的分布方式,只需要参考 +`这里 `_ +编写自定义的分布函数。 + + +关闭切分参数 +++++++++++++ + +参数 :code:`slice_var_up` 指定是否将较大(大于8192个元素)的参数切分到多个parameter server以均衡计算负载,默认为开启。 + +当模型中的可训练参数体积比较均匀或者使用自定义的参数分布方法是参数均匀分布在多个parameter server上, +可以选择关闭切分参数,这样可以降低切分和重组带来的计算和拷贝开销: + +.. code-block:: python + + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers, slice_var_up=False) + + +开启内存优化 +++++++++++++ + +在parameter server分布式训练模式下,要开启内存优化 :code:`memory_optimize` 和单机相比,需要注意按照下面的规则配置: + +* 在pserver端,\ **不要**\ 执行 :code:`memory_optimize` +* 在trainer端,先执行 :code:`fluid.memory_optimize` 再执行 :code:`t.transpile()` +* 在trainer端,调用 :code:`memory_optimize` 需要增加 :code:`skip_grads=True` 确保发送的梯度不会被重命名: :code:`fluid.memory_optimize(input_program, skip_grads=True)` + +示例: + +.. 
code-block:: python + + if role == "TRAINER": + fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) + t = fluid.DistributeTranspiler() + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + if role == "PSERVER": + # start pserver here + elif role == "TRAINER": + # start trainer here + + +使用NCCL2通信方式的训练 +-------------------- + +NCCL2模式的分布式训练,由于没有parameter server角色,是trainer之间互相通信,使用时注意: + +* 配置 :code:`fluid.DistributeTranspilerConfig` 中 :code:`mode="nccl2"` 。 +* 调用 :code:`transpile` 时,:code:`trainers` 传入所有trainer节点的endpoint,并且传入参数 :code:`current_endpoint` 。 +* 初始化 :code:`ParallelExecutor` 时传入 :code:`num_trainers` 和 :code:`trainer_id` 。 + +一个例子: + +.. code-block:: python + + trainer_id = 0 # get actual trainer id here + trainers = "192.168.1.1:6170,192.168.1.2:6170" + current_endpoint = "192.168.1.1:6170" + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + t = fluid.DistributeTranspiler(config=config) + t.transpile(trainer_id, trainers=trainers, current_endpoint=current_endpoint) + exe = fluid.ParallelExecutor(use_cuda, + loss_name=loss_name, num_trainers=len(trainers.split(",")), trainer_id=trainer_id) + ... + +.. csv-table:: NCCL2模式必要参数说明 + :header: "参数", "说明" + + "trainer_id", "任务中每个trainer节点的唯一ID,从0开始,不能有重复" + "trainers", "任务中所有trainer节点的endpoint,用于在NCCL2初始化时,广播NCCL ID" + "current_endpoint", "当前节点的endpoint" + +目前使用NCCL2进行分布式训练仅支持同步训练方式。使用NCCL2方式的分布式训练,更适合模型体积较大,并需要使用\ +同步训练和GPU训练,如果硬件设备支持RDMA和GPU Direct,可以达到很高的分布式训练性能。 + +注意如果系统中有多个网络设备,需要手动指定NCCL2使用的设备, +假设需要使用 :code:`eth2` 为通信设备,需要设定如下环境变量: + +.. code-block:: bash + + export NCCL_SOCKET_IFNAME=eth2 + +另外NCCL2提供了其他的开关环境变量,比如指定是否开启GPU Direct,是否使用RDMA等,详情可以参考 +`ncclknobs `_ 。 diff --git a/doc/fluid/user_guides/howto/training/cluster_quick_start.rst b/doc/fluid/user_guides/howto/training/cluster_quick_start.rst new file mode 100644 index 0000000000000000000000000000000000000000..1a8d13f2d34af68f6a98aaef9afabed0660156f0 --- /dev/null +++ b/doc/fluid/user_guides/howto/training/cluster_quick_start.rst @@ -0,0 +1,194 @@ +.. _cluster_quick_start: + +分布式训练快速开始 +================== + +准备工作 +-------- + +在本篇文章中,我们将会在介绍如何快速在一个集群中启动一个 PaddlePaddle +的分布式训练任务,在开始之前,请按如下步骤做些准备工作: + +1. 准备一个网络连通的训练集群,在本文中我们使用4个训练节点使用 ``*.paddlepaddle.com`` + 来表示节点的主机名称,您可以根据实际情况修改它。 + +2. 在开始之前确保已经阅读过 :ref:`install_steps` + 并且可以在集群的所有节点上可以正常运行 PaddlePaddle。 + +样例代码 +------- + +下面使用一个非常简单的线性回归模型作为样例来解释如何启动一个包含2个 pserver server 节点以及 +2个 trainer 节点的分布式训练任务,您可以将本段代码保存为 ``dist_train.py`` + +.. 
code:: python + + import os + import paddle + import paddle.fluid as fluid + + # train reader + BATCH_SIZE = 20 + EPOCH_NUM = 30 + BATCH_SIZE = 8 + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=BATCH_SIZE) + + def train(): + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + + loss = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_loss = fluid.layers.mean(loss) + opt = fluid.optimizer.SGD(learning_rate=0.001) + opt.minimize(avg_loss) + + place = fluid.CPUPlace() + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + + # fetch distributed training environment setting + training_role = os.getenv("PADDLE_TRAINING_ROLE", None) + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) + trainers = int(os.getenv("PADDLE_TRAINERS")) + current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port + + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id = trainer_id, + pservers = pserver_endpoints, + trainers = trainers) + + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + exe.run(startup_prog) + exe.run(pserver_prog) + elif training_role == "TRAINER": + trainer_prog = t.get_trainer_program() + exe.run(fluid.default_startup_program()) + + for epoch in range(EPOCH_NUM): + for batch_id, batch_data in enumerate(train_reader()): + avg_loss_value, = exe.run(trainer_prog, + feed=feeder.feed(batch_data), + fetch_list=[avg_loss]) + if (batch_id + 1) % 10 == 0: + print("Epoch: {0}, Batch: {1}, loss: {2}".format( + epoch, batch_id, avg_loss_value[0])) + # destory the resource of current trainer node in pserver server node + exe.close() + else: + raise AssertionError("PADDLE_TRAINING_ROLE should be one of [TRAINER, PSERVER]") + + train() + +环境变量说明 +----------- + +在启动分布式训练任务时,使用不同的环境变量来表示不同的节点角色,具体如下: + +.. list-table:: + :header-rows: 1 + + * - 环境变量 + - 数据类型 + - 样例 + - 描述 + * - :code:`PADDLE_TRAINING_ROLE` + - str + - :code:`PSERVER,TRAINER` + - 当前训练节点角色 + * - :code:`PADDLE_PSERVER_IPS` + - str + - :code:`ps0.paddlepaddle.com,ps1.paddlepaddle.com` + - 分布式训练任务中所有 pserver 节点的 IP 地址或 hostname, 使用","分隔 + * - :code:`PADDLE_PSERVER_PORT` + - int + - 6174 + - pserver 进程监听的端口 + * - :code:`PADDLE_TRAINERS` + - int + - 2 + - 分布式训练任务中 trainer 节点的数量 + * - :code:`PADDLE_CURRENT_IP` + - str + - :code:`ps0.paddlepaddle.com` + - 当前 pserver 节点的 IP 地址或 hostname + * - :code:`PADDLE_TRAINER_ID` + - str + - 0 + - 当前 trainer 节点的 ID (唯一), 取值范围为 [0, PADDLE_TRAINERS) + +注: 环境变量只是获取运行时信息的一种方式,实际任务中可以采用命令行参数等方式获取运行时信息。 + +分布式训练相关 API +------------------ + +DistributeTranspiler +~~~~~~~~~~~~~~~~~~~~~~ + +基于 pserver-trainer 架构的的分布式训练任务分为两种角色: Parameter Server(pserver) 以及 trainer, +在 Fluid 中,用户只需配置单机训练所需要的网络配置, ``DistributeTranspiler`` 模块会自动地根据 +当前训练节点的角色将用户配置的单机网路配置改写成 pserver 和 trainer 需要运行的网络配置: + +.. 
code:: python + + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id = trainer_id, + pservers = pserver_endpoints, + trainers = trainers) + if PADDLE_TRAINING_ROLE == "TRAINER": + # fetch the pserver program and execute it + trainer_prog = t.get_trainer_program() + ... + + elif PADDLE_TRAINER_ROLE == "PSERVER": + # fetch the trainer program and execute it + pserver_prog = t.get_pserver_program(current_endpoint) + ... + +exe.close() +~~~~~~~~~~~~~~ + +pserver 节点中会保存所有 trainer 节点的状态信息,在 trainer结束训练时需要调用 ``exe.close()`` +通知所有 PServer 节点释放当前 Trainer 节点的资源: + +.. code:: python + + exe = fluid.Executor(fluid.CPUPlace()) + # training process ... + exe.close() # notify PServer to destory the resource + +启动分布式训练任务 +-------------------- + +.. list-table:: + :header-rows: 1 + + * - 启动节点 + - 启动命令 + - 说明 + * - ps0.paddlepaddle.com + - :code:`PADDLE_TRAINING_ROLE=PSERVER PADDLE_CURRENT_IP=ps0.paddlepaddle.com PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_PSERVER_PORT=6174 python fluid_dist.py` + - 启动 pserver 节点 + * - ps1.paddlepaddle.com + - :code:`PADDLE_TRAINING_ROLE=PSERVER PADDLE_CURRENT_IP=ps1.paddlepaddle.com PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_PSERVER_PORT=6174 python fluid_dist.py` + - 启动 pserver 节点 + * - trainer0.paddlepaddle.com + - :code:`PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_TRAINER_ID=0 PADDLE_PSERVER_PORT=6174 python fluid_dist.py` + - 启动第0号 trainer 节点 + * - trainer1.paddlepaddle.com + - :code:`PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_IPS=ps0.paddlepaddle.com,ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_TRAINER_ID=1 PADDLE_PSERVER_PORT=6174 python fluid_dist.py` + - 启动第1号 trainer 节点 diff --git a/doc/fluid/user_guides/howto/training/index.rst b/doc/fluid/user_guides/howto/training/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..68475101e26b3f695c8003995cc1c6a95426ff27 --- /dev/null +++ b/doc/fluid/user_guides/howto/training/index.rst @@ -0,0 +1,12 @@ +############ +训练神经网络 +############ + +PaddlePaddle Fluid支持单机训练,和多节点训练。每种训练模式下,都支持多种训练方法。 + +.. toctree:: + :maxdepth: 2 + + single_node + multi_node + save_load_variables diff --git a/doc/fluid/user_guides/howto/training/multi_node.rst b/doc/fluid/user_guides/howto/training/multi_node.rst new file mode 100644 index 0000000000000000000000000000000000000000..93df4abd4d1bf51ea9bc275de384c7545df825e8 --- /dev/null +++ b/doc/fluid/user_guides/howto/training/multi_node.rst @@ -0,0 +1,10 @@ +######## +多机训练 +######## + +.. toctree:: + :maxdepth: 2 + + cluster_quick_start.rst + cluster_howto.rst + train_on_baidu_cloud_cn.rst diff --git a/doc/fluid/user_guides/howto/training/save_load_variables.rst b/doc/fluid/user_guides/howto/training/save_load_variables.rst new file mode 100644 index 0000000000000000000000000000000000000000..a96776f4a17a1d6da170bdff9d81771c38912bb5 --- /dev/null +++ b/doc/fluid/user_guides/howto/training/save_load_variables.rst @@ -0,0 +1,172 @@ +.. _user_guide_save_load_vars: + +################## +保存与载入模型变量 +################## + +模型变量分类 +############ + +在PaddlePaddle Fluid中,所有的模型变量都用 :code:`fluid.Variable()` 作为基类进行表示。 +在该基类之下,模型变量主要可以分为以下几种类别: + +1. 
模型参数 + 模型参数是深度学习模型中被训练和学习的变量,在训练过程中,训练框架根据反向传播算法计算出每一个模型参数当前的梯度, + 并用优化器根据梯度对参数进行更新。模型的训练过程本质上可以看做是模型参数不断迭代更新的过程。 + 在PaddlePaddle Fluid中,模型参数用 :code:`fluid.framework.Parameter` 来表示, + 这是一个 :code:`fluid.Variable()` 的派生类,除了 :code:`fluid.Variable()` 具有的各项性质以外, + :code:`fluid.framework.Parameter` 还可以配置自身的初始化方法、更新率等属性。 + +2. 长期变量 + 长期变量指的是在整个训练过程中持续存在、不会因为一个迭代的结束而被销毁的变量,例如动态调节的全局学习率等。 + 在PaddlePaddle Fluid中,长期变量通过将 :code:`fluid.Variable()` 的 :code:`persistable` + 属性设置为 :code:`True` 来表示。所有的模型参数都是长期变量,但并非所有的长期变量都是模型参数。 + +3. 临时变量 + 不属于上面两个类别的所有模型变量都是临时变量,这种类型的变量只在一个训练迭代中存在,在每一个迭代结束后, + 所有的临时变量都会被销毁,然后在下一个迭代开始之前,又会先构造出新的临时变量供本轮迭代使用。 + 一般情况下模型中的大部分变量都属于这一类别,例如输入的训练数据、一个普通的layer的输出等等。 + + + +如何保存模型变量 +################ + +根据用途的不同,我们需要保存的模型变量也是不同的。例如,如果我们只是想保存模型用来进行以后的预测, +那么只保存模型参数就够用了。但如果我们需要保存一个checkpoint以备将来恢复训练, +那么我们应该将各种长期变量都保存下来,甚至还需要记录一下当前的epoch和step的id。 +因为一些模型变量虽然不是参数,但对于模型的训练依然必不可少。 + +因此,根据需求的不同,我们提供了两套API来分别进行模型的参数和checkpoint的保存。 + +保存模型用于对新样本的预测 +========================== + +如果我们保存模型的目的是用于对新样本的预测,那么只保存模型参数就足够了。我们可以使用 +:code:`fluid.io.save_params()` 接口来进行模型参数的保存。 + +例如: + +.. code-block:: python + + import paddle.fluid as fluid + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.save_params(executor=exe, dirname=param_path, main_program=None) + +上面的例子中,通过调用 :code:`fluid.io.save_params` 函数,PaddlePaddle Fluid会对默认 +:code:`fluid.Program` 也就是 :code:`prog` 中的所有模型变量进行扫描, +筛选出其中所有的模型参数,并将这些模型参数保存到指定的 :code:`param_path` 之中。 + + +保存checkpoint用于将来恢复训练 +============================== + +在训练过程中,我们可能希望在一些节点上将当前的训练状态保存下来, +以便在将来需要的时候恢复训练环境继续进行训练。这一般被称作“checkpoint”。 +想要保存checkpoint,可以使用 :code:`fluid.io.save_checkpiont()` 接口。 + +例如: + +.. code-block:: python + + import paddle.fluid as fluid + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + trainer_args = {"epoch_id": 200, + "step_id": 20} # just an example + fluid.io.save_checkpoint(executor=exe, + checkpoint_dir=path, + trainer_id=0, + trainer_args=trainer_args, + main_program=prog, + max_num_checkpoints=3) + +上面的例子中,通过调用 :code:`fluid.io.save_checkpoint` 函数,PaddlePaddle Fluid会对默认 +:code:`fluid.Program` 也就是 :code:`prog` 中的所有模型变量进行扫描, +根据一系列内置的规则自动筛选出其中所有需要保存的变量,并将他们保存到指定的 :code:`path` 目录下。 + +:code:`fluid.io.save_checkpoint` 的各个参数中, :code:`trainer_id` 在单机情况下设置为0即可; :code:`trainer_args` +为一个Python dict,用于给定当前的epoch_id和step_id; +:code:`max_num_checkpoints` 用于表示的最大checkpoint数量, +如果目录中已经存在的checkpoint数量超过这个值,那最早的checkpoint将被删除。 + +如何载入模型变量 +################ + +与模型变量的保存相对应,我们提供了两套API来分别载入模型的参数和载入模型的checkpoint。 + +载入模型用于对新样本的预测 +========================== + +对于通过 :code:`fluid.io.save_params` 保存的模型,可以使用 :code:`fluid.io.load_params` +来进行载入。 + +例如: + +.. 
code-block:: python + + import paddle.fluid as fluid + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.load_params(executor=exe, dirname=param_path, + main_program=prog) + +上面的例子中,通过调用 :code:`fluid.io.load_params` 函数,PaddlePaddle Fluid会对 +:code:`prog` 中的所有模型变量进行扫描,筛选出其中所有的模型参数, +并尝试从 :code:`param_path` 之中读取加载它们。 + +需要格外注意的是,这里的 :code:`prog` 必须和调用 :code:`fluid.io.save_params` +时所用的 :code:`prog` 中的前向部分完全一致,且不能包含任何参数更新的操作。如果两者存在不一致, +那么可能会导致一些变量未被正确加载;如果错误地包含了参数更新操作,那可能会导致正常预测过程中参数被更改。 +这两个 :code:`fluid.Program` 之间的关系类似于训练 :code:`fluid.Program` +和测试 :code:`fluid.Program` 之间的关系,详见: :ref:`user_guide_test_while_training`。 + +另外,需特别注意运行 :code:`fluid.default_startup_program()` 必须在调用 :code:`fluid.io.load_params` +之前。如果在之后运行,可能会覆盖已加载的模型参数导致错误。 + + +载入checkpoint用于恢复训练 +========================== + +对于通过 :code:`fluid.io.save_checkpoint` 保存的模型,可以使用 :code:`fluid.io.load_checkpoint` +来进行载入。 + +例如: + +.. code-block:: python + + import paddle.fluid as fluid + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path, + serial=9, main_program=prog) + +上面的例子中,通过调用 :code:`fluid.io.save_checkpoint` 函数,PaddlePaddle Fluid会对 +:code:`prog` 中的所有模型变量进行扫描,根据内置规则自动筛选出需要加载的变量, +并尝试从 :code:`path` 之中加载它们。 + +参数 :code:`serial` 用来标记具体要加载的checkpoint的版本号。在保存checkpoint的时候, +一个checkpoint会被保存在一个子目录中,并在目录名上体现出自己的版本号。 +一般越大的版本号表示这个checkpoint越新。 + +这里的 :code:`prog` 必须和调用 :code:`fluid.io.save_checkpoint` 时所用的 :code:`prog` +完全一致,否则会导致变量加载错误或者未加载。另外,与 :code:`fluid.io.save_params` 类似, +运行 :code:`fluid.default_startup_program()` 也必须在 :code:`fluid.io.load_checkpoint` +之前进行。 + +多机checkpoint保存 +################## + +.. toctree:: + :maxdepth: 2 + + checkpoint_doc_cn.md \ No newline at end of file diff --git a/doc/fluid/user_guides/howto/training/single_node.rst b/doc/fluid/user_guides/howto/training/single_node.rst new file mode 100644 index 0000000000000000000000000000000000000000..23eac0f831f2d6d052b7fc35b536d4ab633df851 --- /dev/null +++ b/doc/fluid/user_guides/howto/training/single_node.rst @@ -0,0 +1,119 @@ +######## +单机训练 +######## + +准备工作 +######## + +要进行PaddlePaddle Fluid单机训练,需要先 :ref:`user_guide_prepare_data` 和 +:ref:`user_guide_configure_simple_model` 。当\ +:ref:`user_guide_configure_simple_model` 完毕后,可以得到两个\ +:code:`fluid.Program`, :code:`startup_program` 和 :code:`main_program`。 +默认情况下,可以使用 :code:`fluid.default_startup_program()` 与\ :code:`fluid.default_main_program()` 获得全局的 :code:`fluid.Program`。 + +例如: + +.. code-block:: python + + import paddle.fluid as fluid + + image = fluid.layers.data(name="image", shape=[784]) + label = fluid.layers.data(name="label", shape=[1]) + hidden = fluid.layers.fc(input=image, size=100, act='relu') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=prediction, + label=label + ) + ) + + sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd.minimize(loss) + + # Here the fluid.default_startup_program() and fluid.default_main_program() + # has been constructed. + +在上述模型配置执行完毕后, :code:`fluid.default_startup_program()` 与\ +:code:`fluid.default_main_program()` 配置完毕了。 + +初始化参数 +########## + +参数随机初始化 +============== + +用户配置完模型后,参数初始化操作会被写入到\ +:code:`fluid.default_startup_program()` 中。使用 :code:`fluid.Executor()` 运行 +这一程序,即可在全局 :code:`fluid.global_scope()` 中随机初始化参数。例如: + +.. 
code-block:: python + + exe = fluid.Executor(fluid.CUDAPlace(0)) + exe.run(program=fluid.default_startup_program()) + +值得注意的是: 如果使用多GPU训练,参数需要先在GPU0上初始化,再经由\ +:code:`fluid.ParallelExecutor` 分发到多张显卡上。 + + +载入预定义参数 +============== + +在神经网络训练过程中,经常会需要载入预定义模型,进而继续进行训练。\ +如何载入预定义参数,请参考 :ref:`user_guide_save_load_vars`。 + + +单卡训练 +######## + +执行单卡训练可以使用 :code:`fluid.Executor()` 中的 :code:`run()` 方法,运行训练\ +:code:`fluid.Program` 即可。在运行的时候,用户可以通过 :code:`run(feed=...)`\ +参数传入数据;用户可以通过 :code:`run(fetch=...)` 获取持久的数据。例如:\ + +.. code-block:: python + + ... + loss = fluid.layers.mean(...) + + exe = fluid.Executor(...) + # the result is an numpy array + result = exe.run(feed={"image": ..., "label": ...}, fetch_list=[loss]) + +这里有几点注意事项: + +1. feed的数据格式,请参考文章 :ref:`user_guide_feed_data_to_executor`。 +2. :code:`Executor.run` 的返回值是 :code:`fetch_list=[...]` 的variable值。被fetch\ + 的Variable必须是persistable的。 :code:`fetch_list` 可以传入Variable的列表,\ + 也可以传入Variable的名字列表。:code:`Executor.run` 返回Fetch结果列表。 +3. 如果需要取回的数据包含序列信息,可以设置 + :code:`exe.run(return_numpy=False, ...)` 直接返回 :code:`fluid.LoDTensor` + 。用户可以直接访问 :code:`fluid.LoDTensor` 中的信息。 + +多卡训练 +######## + +执行多卡训练可以使用 :code:`fluid.ParallelExecutor` 运行训练 +:code:`fluid.Program`。例如: + +.. code-block:: python + + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name, + main_program=fluid.default_main_program()) + train_exe.run(fetch_list=[loss.name], feed={...}) + +这里有几点注意事项: + +1. :code:`ParallelExecutor` 的构造函数需要指明要执行的 :code:`fluid.Program` , + 并在执行过程中不能修改。默认值是 :code:`fluid.default_main_program()` 。 +2. :code:`ParallelExecutor` 需要明确指定是否使用 CUDA 显卡进行训练。在显卡训练\ + 模式下会占用全部显卡。用户可以配置 `CUDA_VISIBLE_DEVICES `_ 来修改占用\ + 的显卡。 + +进阶使用 +######## + +.. toctree:: + :maxdepth: 2 + + test_while_training + save_load_variables diff --git a/doc/fluid/user_guides/howto/training/src/create_gpu_machine.png b/doc/fluid/user_guides/howto/training/src/create_gpu_machine.png new file mode 100644 index 0000000000000000000000000000000000000000..8b98ce5bdf0c1f9921eac1f4f55d31bec028d650 Binary files /dev/null and b/doc/fluid/user_guides/howto/training/src/create_gpu_machine.png differ diff --git a/doc/fluid/user_guides/howto/training/src/create_image.png b/doc/fluid/user_guides/howto/training/src/create_image.png new file mode 100644 index 0000000000000000000000000000000000000000..b9a26de49a6ec33707199d2dfde8a741c4222581 Binary files /dev/null and b/doc/fluid/user_guides/howto/training/src/create_image.png differ diff --git a/doc/fluid/user_guides/howto/training/src/create_more_nodes.png b/doc/fluid/user_guides/howto/training/src/create_more_nodes.png new file mode 100644 index 0000000000000000000000000000000000000000..656cf6f49bd7e239bfbbd305dc87a5a73a6100d1 Binary files /dev/null and b/doc/fluid/user_guides/howto/training/src/create_more_nodes.png differ diff --git a/doc/fluid/user_guides/howto/training/src/dist_train_demo.py b/doc/fluid/user_guides/howto/training/src/dist_train_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..29cd49dd16c4ac1f4af3e832b441a6d4b1202673 --- /dev/null +++ b/doc/fluid/user_guides/howto/training/src/dist_train_demo.py @@ -0,0 +1,107 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle.fluid.core as core +import math +import os +import sys + +import numpy + +import paddle +import paddle.fluid as fluid + +BATCH_SIZE = 64 +PASS_NUM = 1 + +def loss_net(hidden, label): + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + acc = fluid.layers.accuracy(input=prediction, label=label) + return prediction, avg_loss, acc + +def conv_net(img, label): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + return loss_net(conv_pool_2, label) + + +def train(use_cuda, role, endpoints, current_endpoint, trainer_id, trainers): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + prediction, avg_loss, acc = conv_net(img, label) + + test_program = fluid.default_main_program().clone(for_test=True) + + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer.minimize(avg_loss) + + t = fluid.DistributeTranspiler() + t.transpile(trainer_id, pservers=endpoints, trainers=trainers) + if role == "pserver": + prog = t.get_pserver_program(current_endpoint) + startup = t.get_startup_program(current_endpoint, pserver_program=prog) + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup) + exe.run(prog) + elif role == "trainer": + prog = t.get_trainer_program() + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + exe.run(fluid.default_startup_program()) + for pass_id in range(PASS_NUM): + for batch_id, data in enumerate(train_reader()): + acc_np, avg_loss_np = exe.run(prog, + feed=feeder.feed(data), + fetch_list=[acc, avg_loss]) + if (batch_id + 1) % 10 == 0: + print( + 'PassID {0:1}, BatchID {1:04}, Loss {2:2.2}, Acc {3:2.2}'. 
+ format(pass_id, batch_id + 1, + float(avg_loss_np.mean()), float(acc_np.mean()))) + +if __name__ == '__main__': + if len(sys.argv) != 6: + print("Usage: python %s role endpoints current_endpoint trainer_id trainers" % sys.argv[0]) + exit(0) + role, endpoints, current_endpoint, trainer_id, trainers = \ + sys.argv[1:] + train(True, role, endpoints, current_endpoint, int(trainer_id), int(trainers)) + diff --git a/doc/fluid/user_guides/howto/training/src/dist_train_nccl2.graffle b/doc/fluid/user_guides/howto/training/src/dist_train_nccl2.graffle new file mode 100644 index 0000000000000000000000000000000000000000..16f6b8835c4ffb82babca56b62ba44494fd6a947 Binary files /dev/null and b/doc/fluid/user_guides/howto/training/src/dist_train_nccl2.graffle differ diff --git a/doc/fluid/user_guides/howto/training/src/dist_train_nccl2.png b/doc/fluid/user_guides/howto/training/src/dist_train_nccl2.png new file mode 100644 index 0000000000000000000000000000000000000000..587a1a48affdde6809d7f8bf77e1055db7cd8c14 Binary files /dev/null and b/doc/fluid/user_guides/howto/training/src/dist_train_nccl2.png differ diff --git a/doc/fluid/user_guides/howto/training/src/dist_train_pserver.graffle b/doc/fluid/user_guides/howto/training/src/dist_train_pserver.graffle new file mode 100644 index 0000000000000000000000000000000000000000..046c4903231e8ca441884674c08b381766c0bbae Binary files /dev/null and b/doc/fluid/user_guides/howto/training/src/dist_train_pserver.graffle differ diff --git a/doc/fluid/user_guides/howto/training/src/dist_train_pserver.png b/doc/fluid/user_guides/howto/training/src/dist_train_pserver.png new file mode 100644 index 0000000000000000000000000000000000000000..cd2f92ad1a14ac12efc2c257c8aa3d1ae403b2b1 Binary files /dev/null and b/doc/fluid/user_guides/howto/training/src/dist_train_pserver.png differ diff --git a/doc/fluid/user_guides/howto/training/src/parallelism.png b/doc/fluid/user_guides/howto/training/src/parallelism.png new file mode 100644 index 0000000000000000000000000000000000000000..c787907397acb78d5e8ce31481e44dac2f774a4e Binary files /dev/null and b/doc/fluid/user_guides/howto/training/src/parallelism.png differ diff --git a/doc/fluid/user_guides/howto/training/src/release.png b/doc/fluid/user_guides/howto/training/src/release.png new file mode 100644 index 0000000000000000000000000000000000000000..75dfd7f0dce96ab57d4a471728e471b0d2b6e5f4 Binary files /dev/null and b/doc/fluid/user_guides/howto/training/src/release.png differ diff --git a/doc/fluid/user_guides/howto/training/test_while_training.rst b/doc/fluid/user_guides/howto/training/test_while_training.rst new file mode 100644 index 0000000000000000000000000000000000000000..37d5c0d78179ccead7a81dffb4ae2f0d835a5949 --- /dev/null +++ b/doc/fluid/user_guides/howto/training/test_while_training.rst @@ -0,0 +1,120 @@ +.. _user_guide_test_while_training: + +################## +训练过程中评测模型 +################## + +模型的测试评价与训练的 :code:`fluid.Program` 不同。在测试评价中: + +1. 评价测试不进行反向传播,不优化更新参数。 +2. 评价测试执行的操作可以不同。 + + * 例如 BatchNorm 操作,在训练和测试时执行不同的算法。 + + * 评价模型与训练相比可以是完全不同的模型。 + +生成测试 :code:`fluid.Program` +################################# + +通过克隆训练 :code:`fluid.Program` 生成测试 :code:`fluid.Program` +======================================================================= + +:code:`Program.clone()` 方法可以复制出新的 :code:`fluid.Program` 。 通过设置 +:code:`Program.clone(for_test=True)` 复制含有用于测试的操作Program。简单的使用方法如下: + +.. 
code-block:: python + + import paddle.fluid as fluid + + img = fluid.layers.data(name="image", shape=[784]) + prediction = fluid.layers.fc( + input=fluid.layers.fc(input=img, size=100, act='relu'), + size=10, + act='softmax' + ) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + loss = fluid.layers.mean(fluid.layers.cross_entropy(input=prediction, label=label)) + acc = fluid.layers.accuracy(input=prediction, label=label) + + test_program = fluid.default_main_program().clone(for_test=True) + + adam = fluid.optimizer.Adam(learning_rate=0.001) + adam.minimize(loss) + +在使用 :code:`Optimizer` 之前,将 :code:`fluid.default_main_program()` 复制\ +成一个 :code:`test_program` 。之后使用测试数据运行 :code:`test_program`,\ +就可以做到运行测试程序,而不影响训练结果。 + +分别配置训练 :code:`fluid.Program` 和测试 :code:`fluid.Program` +===================================================================== + +如果训练程序和测试程序相差较大,用户也可以通过完全定义两个不同的 +:code:`fluid.Program`,分别进行训练和测试。在PaddlePaddle Fluid中,\ +所有的参数都有名字。如果两个不同的操作,甚至两个不同的网络使用了同样名字的参数,\ +那么它们的值和内存空间都是共享的。 + +PaddlePaddle Fluid中使用 :code:`fluid.unique_name` 包来随机初始化用户未定义的\ +参数名称。通过 :code:`fluid.unique_name.guard` 可以确保多次调用某函数\ +参数初始化的名称一致。 + +例如: + +.. code-block:: python + + import paddle.fluid as fluid + + def network(is_test): + file_obj = fluid.layers.open_files(filenames=["test.recordio"] if is_test else ["train.recordio"], ...) + img, label = fluid.layers.read_file(file_obj) + hidden = fluid.layers.fc(input=img, size=100, act="relu") + hidden = fluid.layers.batch_norm(input=hidden, is_test=is_test) + ... + return loss + + with fluid.unique_name.guard(): + train_loss = network(is_test=False) + sgd = fluid.optimizer.SGD(0.001) + sgd.minimize(train_loss) + + test_program = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(test_program, fluid.Program()): + test_loss = network(is_test=True) + + # fluid.default_main_program() is the train program + # test_program is the test program + +执行测试 :code:`fluid.Program` +################################# + +使用 :code:`Executor` 执行测试 :code:`fluid.Program` +======================================================= + +用户可以使用 :code:`Executor.run(program=...)` 来执行测试 +:code:`fluid.Program`。 + +例如: + +.. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + test_acc = exe.run(program=test_program, feed=test_data_batch, fetch_list=[acc]) + print('Test accuracy is ', test_acc) + +使用 :code:`ParallelExecutor` 执行测试 :code:`fluid.Program` +=============================================================== + +用户可以使用训练用的 :code:`ParallelExecutor` 与测试 :code:`fluid.Program` +一起新建一个测试的 :code:`ParallelExecutor` ;再使用测试 +:code:`ParallelExecutor.run` 来执行测试。 + +例如: + +.. code-block:: python + + train_exec = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) + + test_exec = fluid.ParallelExecutor(use_cuda=True, share_vars_from=train_exec, + main_program=test_program) + test_acc = test_exec.run(fetch_list=[acc], ...) + diff --git a/doc/fluid/user_guides/howto/training/train_on_baidu_cloud_cn.rst b/doc/fluid/user_guides/howto/training/train_on_baidu_cloud_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..76bb9f49220600ebfec78b388bc0a75c0901e746 --- /dev/null +++ b/doc/fluid/user_guides/howto/training/train_on_baidu_cloud_cn.rst @@ -0,0 +1,228 @@ +.. 
_train_on_baidu_cloud_cn + +在百度云上启动Fluid分布式训练 +========================= + +PaddlePaddle Fluid分布式训练,可以不依赖集群系统(比如MPI,Kubernetes)启动分布式训练。 +本章节将会以 `百度云 `_ 为实例,说明如何在云端环境,甚至云端GPU环境启动 +大规模分布式任务。 + +创建集群模板 +---------- + +登录到百度云控制台,选择BCC服务,点击“创建实例”。选择地域,注意,只有一些地域有GPU服务器可选, +选择合适的地域之后,再选择对应型号,然后创建一个空的服务器,如下图: + +.. image:: src/create_gpu_machine.png + +* 在操作系统选项中,可以根据需要选择对应的版本,注意根据实际情况选择CUDA版本,这里我们选择CUDA-9.2。 +* 示例中选择机器付费方式为后付费,表示随着机器的释放,收费也会对应停止,对运行一次性任务会比较划算。 + +在机器创建成功之后,执行下面的命令安装paddlepaddle GPU版本和相关依赖。 + +.. code-block:: bash + + apt-get update && apt-get install -y python python-pip python-opencv + # 注:百度云cuda-9.2镜像默认没有安装cudnn和nccl2,需要手动安装,如果自行安装,需要从官网下载 + wget -q "http://paddle-train-on-cloud.cdn.bcebos.com/libcudnn7_7.2.1.38-1+cuda9.2_amd64.deb" + wget -q "http://paddle-train-on-cloud.cdn.bcebos.com/nccl_2.2.13-1+cuda9.0_x86_64.txz" + dpkg -i libcudnn7_7.2.1.38-1+cuda9.2_amd64.deb + ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so + unxz nccl_2.2.13-1+cuda9.0_x86_64.txz + tar xf nccl_2.2.13-1+cuda9.0_x86_64.tar + cp -r nccl_2.2.13-1+cuda9.0_x86_64/lib/* /usr/lib + # 注:可以选择是否使用下面的pip镜像加速下载 + pip install -i https://pypi.tuna.tsinghua.edu.cn/simple matplotlib==2.2.3 + pip install -i https://pypi.tuna.tsinghua.edu.cn/simple paddlepaddle-gpu==0.15.0.post97 + + +完成安装后,使用下面的测试程序,测试当前机器是否可以正确运行GPU训练程序,如果遇到报错,请根据报错提示修复 +运行环境问题。为了方便启动GPU集群,测试程序执行成功之后,选择当前服务器,然后选择“创建自定义镜像”,后续 +创建GPU集群时即可选择配置好的镜像。 + +.. image:: src/create_image.png + +* 测试程序: + +.. code-block:: python + + from __future__ import print_function + + import paddle.fluid.core as core + import math + import os + import sys + + import numpy + + import paddle + import paddle.fluid as fluid + + BATCH_SIZE = 64 + PASS_NUM = 1 + + def loss_net(hidden, label): + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + acc = fluid.layers.accuracy(input=prediction, label=label) + return prediction, avg_loss, acc + + def conv_net(img, label): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + return loss_net(conv_pool_2, label) + + + def train(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + prediction, avg_loss, acc = conv_net(img, label) + + test_program = fluid.default_main_program().clone(for_test=True) + + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer.minimize(avg_loss) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + + + exe.run(fluid.default_startup_program()) + + + for pass_id in range(PASS_NUM): + for batch_id, data in enumerate(train_reader()): + acc_np, avg_loss_np = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[acc, avg_loss]) + if (batch_id + 1) % 10 == 0: + print( + 
'PassID {0:1}, BatchID {1:04}, Loss {2:2.2}, Acc {3:2.2}'. + format(pass_id, batch_id + 1, + float(avg_loss_np.mean()), float(acc_np.mean()))) + + if __name__ == '__main__': + train(True) + + +创建集群 +------ + +完成创建镜像之后,可以使用这个配置好的镜像创建一个GPU集群,根据您的实际需求创建足够数量的GPU服务器, +作为示例,这里启动2台GPU服务器,包括上一步创建的服务器,所以这里再启动一台新的服务器。 + +点击“创建实例”,在相同地域选择同样配置的GPU服务器,注意选择刚才创建的镜像作为操作系统。 + +.. image:: src/create_more_nodes.png + +编写集群任务启动脚本 +---------------- + +为了方便在更多的GPU服务器上启动分布式训练任务,我们将使用 +`fabric `_ +作为集群任务启动管理工具,您可以选择其他熟悉的集群框架,比如MPI, Kubernetes,本示例演示的方法 +仅针对简单集群环境,而且服务器之间可以互相ssh登录。 + +安装fabric,需要执行: + +.. code-block:: bash + + pip install fabric + +假设我们创建了2台GPU服务器,ip分别是 :code:`172.16.0.5,172.16.0.6` ,然后在第一台服务器上, +先创建训练程序文件 :code:`dist_train_demo.py` ,从 +`这里 `_ +下载代码。然后编写 :code:`fabfile.py` 脚本,用于控制在不同服务器上启动训练任务的parameter server和trainer: + +.. code-block:: python + + from fabric import Group, task + + endpoints = "172.16.0.5:6173,172.16.0.6:6173" + port = "6173" + pservers = 2 + trainers = 2 + + hosts = [] + eps = [] + for ep in endpoints.split(","): + eps.append(ep) + hosts.append(ep.split(":")[0]) + + def start_server(c): + current_endpoint = "%s:%s" % (c.host, port) + trainer_id = hosts.index(c.host) + cmd = "python /root/work/dist_train_demo.py pserver %s %s %d %d &> /root/work/server.log.%s &" % ( + endpoints, current_endpoint, trainer_id, trainers, c.host) + c.run(cmd) + + def start_trainer(c): + current_endpoint = "%s:%s" % (c.host, port) + trainer_id = hosts.index(c.host) + cmd = "python /root/work/dist_train_demo.py trainer %s %s %d %d &> /root/work/trainer.log.%s &" % ( + endpoints, current_endpoint, trainer_id, trainers, c.host) + c.run(cmd) + + @task + def start(c): + c.connect_kwargs.password = "work@paddle123" + c.run("mkdir -p /root/work") + c.put("dist_train_demo.py", "/root/work") + start_server(c) + start_trainer(c) + + @task + def tail_log(c): + c.connect_kwargs.password = "work@paddle123" + c.run("tail /root/work/trainer.log.%s" % c.host) + +保存上述代码到 :code:`fabfile.py` 之后,执行 + +.. code-block:: bash + + fab -H 172.16.0.5,172.16.0.6 start + +就可以开始一个分布式训练任务。这个任务会在两台GPU服务器分别启动2个pserver进程和2个trainer进程开始训练。 + +获取分布式训练结果 +--------------- + +示例任务会在 :code:`/root/work` 下记录日志,分别为 +:code:`pserver.log.[IP]` 和 :code:`trainer.log.[IP]` 的形式,可以手动在 +服务器上查看这些日志文件观察结果,也可以使用fabric获取所有节点的日志信息,比如: + +.. code-block:: bash + + fab -H 172.16.0.5,172.16.0.6 tail-log + +关闭集群 +------ + +任务执行完成后,不要忘记释放掉GPU集群资源,勾选选择需要释放的服务器,选择“释放”,则会关闭机器并释放资源。 +如果需要执行新的任务,可以直接使用之前保存的镜像,启动新的集群,并参照前面的步骤开始训练。 + +.. image:: src/release.png \ No newline at end of file diff --git a/doc/fluid/user_guides/index.rst b/doc/fluid/user_guides/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..e7b3b718159ac4661cc74de9d9ea10eec949b5f2 --- /dev/null +++ b/doc/fluid/user_guides/index.rst @@ -0,0 +1,47 @@ +######## +使用指南 +######## + +============== + 概览 +============== +.. 
todo:: + +如果您已经掌握了新手入门阶段的内容,期望可以针对实际问题建模、搭建自己网络,本模块提供了一些 Fluid 的使用细节供您参考: + + + - `Fluid 设计思想 <../user_guides/design_idea/fluid_design_idea.html>`_:介绍 Fluid 底层的设计思想,帮助用户更好的理解框架运作过程 + + - `准备数据 <../user_guides/howto/prepare_data/index.html>`_ :介绍使用 Fluid 训练网络时,数据的支持类型及传输方法 + + - `配置简单的网络 <../user_guides/howto/configure_simple_model/index.html>`_: 介绍如何针对问题建模,并利用 Fluid 中相关算子搭建网络 + + - `训练神经网络 <../user_guides/howto/training/index.html>`_:介绍如何使用 Fluid 进行单机训练、多机训练、以及保存和载入模型变量 + + - `模型评估与调试 <../user_guides/howto/evaluation_and_debugging/index.html>`_:介绍在 Fluid 下进行模型评估和调试的方法,包括: + + - `模型评估 <../user_guides/howto/evaluation_and_debugging/evaluation/metrics.html>`_:介绍常用模型评估指标的构造方法 + - `Visual DL 工具 <../user_guides/howto/evaluation_and_debugging/debug/visualdl.html>`_:介绍如何利用 Visual DL 工具可视化训练过程 + + - `预测部署 <../user_guides/howto/inference/index.html>`_:介绍如何应用训练好的模型进行预测 + + +基于 Fluid 复现的多领域经典模型: + + - `Fluid 模型库 <../user_guides/models/index.html>`_ + + +============== + 目录 +============== + +.. toctree:: + :maxdepth: 2 + + howto/prepare_data/index + howto/configure_simple_model/index + howto/training/index + howto/evaluation_and_debugging/index + howto/inference/index + models/index.rst + design_idea/fluid_design_idea.md diff --git a/doc/fluid/user_guides/models/index.rst b/doc/fluid/user_guides/models/index.rst new file mode 120000 index 0000000000000000000000000000000000000000..5ac5e2d94aca9f6a9abc84e0ec1137fda576d435 --- /dev/null +++ b/doc/fluid/user_guides/models/index.rst @@ -0,0 +1 @@ +../../../../external/models/fluid/README.cn.rst \ No newline at end of file diff --git a/doc/mobile/CMakeLists.txt b/doc/mobile/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b34ba8d0768427802b11614c6962f3c3f6ef4e3 --- /dev/null +++ b/doc/mobile/CMakeLists.txt @@ -0,0 +1,52 @@ +if(NOT DEFINED SPHINX_THEME) + set(SPHINX_THEME default) +endif() + +if(NOT DEFINED SPHINX_THEME_DIR) + set(SPHINX_THEME_DIR) +endif() + +# configured documentation tools and intermediate build results +set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build") + +# Sphinx cache with pickled ReST documents +set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") + +# HTML output director +set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") + +set(IMPORT_PADDLE_STRING "") +set(IMPORT_PADDLEV2_STRING "") + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in" + "${BINARY_BUILD_DIR_EN}/conf.py" + @ONLY) + +sphinx_add_target(paddle_mobile_docs + html + ${BINARY_BUILD_DIR_EN} + ${SPHINX_CACHE_DIR_EN} + ${CMAKE_CURRENT_SOURCE_DIR} + ${SPHINX_HTML_DIR_EN}) + +# configured documentation tools and intermediate build results +set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") + +# Sphinx cache with pickled ReST documents +set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees") + +# HTML output director +set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html") + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in" + "${BINARY_BUILD_DIR_CN}/conf.py" + @ONLY) + +sphinx_add_target(paddle_mobile_docs_cn + html + ${BINARY_BUILD_DIR_CN} + ${SPHINX_CACHE_DIR_CN} + ${CMAKE_CURRENT_SOURCE_DIR} + ${SPHINX_HTML_DIR_CN}) diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..0607748b751e9f2d606236d9e98868335379b05c --- /dev/null +++ b/doc/mobile/cross_compiling_for_android_cn.md @@ -0,0 +1,187 @@ +# 
Android平台编译指南 + +用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库: + +- [基于Docker容器的编译方式](#基于docker容器的编译方式) +- [基于Linux交叉编译环境的编译方式](#基于linux交叉编译环境的编译方式) + +## 基于Docker容器的编译方式 +Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行,因此,使用基于Docker容器的编译方式,用户可在自己熟悉的开发平台上编译Android平台上适用的PaddlePaddle库。 + +### 构建PaddlePaddle的Android开发镜像 +我们把PaddlePaddle的交叉编译环境打包成一个镜像,称为开发镜像,里面涵盖了交叉编译Android版PaddlePaddle库需要的所有编译工具。 + +```bash +$ git clone https://github.com/PaddlePaddle/Paddle.git +$ cd Paddle +$ docker build -t username/paddle-android:dev . -f Dockerfile.android +``` + +用户也可以使用PaddlePaddle提供的官方开发镜像: + +```bash +$ docker pull paddlepaddle/paddle:latest-dev-android +``` + +对于国内用户,我们提供了加速访问的镜像源: + +```bash +$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android +``` + +### 编译PaddlePaddle C-API库 +构建好开发镜像后,即可使用开发镜像来编译Android版PaddlePaddle C-API库。 +Android的Docker开发镜像向用户提供两个可配置的参数: + + ++ + + + + + + + + + + + + + + + + + + + + + + +
| Argument | Optional Values | Default |
|----------|-----------------|---------|
| ANDROID_ABI | armeabi-v7a, arm64-v8a | armeabi-v7a |
| ANDROID_API | >= 16 | 21 |
+ +- 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库 + +```bash +$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android +``` + +- 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库 + +```bash +$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android +``` + +执行上述`docker run`命令时,容器执行[paddle/scripts/paddle_build.sh build_android](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。 + +## 基于Linux交叉编译环境的编译方式 +本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。 + +### 准备交叉编译环境 + +从源码交叉编译PaddlePaddle,用户需要提前准备好交叉编译环境。Android平台上使用的C/C++交叉编译工具链为[Android NDK](https://developer.android.com/ndk/downloads/index.html?hl=zh-cn),用户可自行前往下载预编译好的版本,也可通过以下命令获取: + +```bash +wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip +unzip -q android-ndk-r14b-linux-x86_64.zip +``` + +Android NDK中包含了所有Android API级别、所有架构(arm/arm64/x86/mips)需要用到的编译工具和系统库。用户可根据自己的编译目标架构、所需支持的最低Android API级别,构建[独立工具链](https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn)。 + +- 构建`armeabi-v7a`、 `Android API 21`的独立工具链: + +```bash +your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain +``` + +此命令将在`your/path/to/arm_standalone_toolchain`目录生成一套独立编译工具链,面向架构为32位ARM架构,支持的最小的Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。 + +- 构建`arm64-v8a`、 `Android API 21`的独立工具链: + +```bash +your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain +``` + +此命令将在`your/path/to/arm64_standalone_toolchain`目录生成一套独立编译工具链,面向架构为64位ARM64架构,支持的最小Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。 + +### 配置交叉编译参数 + +CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake),以提供一些默认的编译器和编译参数相关配置。注意,从CMake 3.7版本开始,CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时,将会将用户传进来的配置参数传递CMake系统,交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。 + +交叉编译Android版本的PaddlePaddle库时,有一些必须配置的参数: +- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译PaddlePaddle所需的所有第三方库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`、`WITH_MKL=OFF`、`WITH_GOLANG=OFF`)。 +- `WITH_C_API`,必须设置为`ON`。在Android平台上只支持使用C-API来预测。 +- `WITH_SWIG_PY`,必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。 + +Android平台可选配置参数: + +- 
`ANDROID_STANDALONE_TOOLCHAIN`,独立工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动推导和设置需要使用的交叉编译器、sysroot、以及Android API级别;否则,用户需要在cmake时手动设置这些值。无默认值。 +- `ANDROID_TOOLCHAIN`,目标工具链。可设置`gcc/clang`,默认值为`clang`。 + - CMake 3.7以上,将会始终使用`clang`工具链;CMake 3.7以下,可设置`ANDROID_TOOLCHAIN=gcc`以使用`gcc`工具链。 + - Android官方提供的`clang`编译器要求系统支持`GLIBC 2.15`以上。 +- `ANDROID_ABI`,目标架构ABI。目前支持`armeabi-v7a`和`arm64-v8a`,默认值为`armeabi-v7a`。 +- `ANDROID_NATIVE_API_LEVEL`,工具链的Android API级别。若没有显式设置,PaddlePaddle将根据`ANDROID_STANDALONE_TOOLCHAIN`的值自动推导得到。 +- `ANROID_ARM_MODE`,是否使用ARM模式。 + - `ANDROID_ABI=armeabi-v7a`时,可设置`ON/OFF`,默认值为`ON`; + - `ANDROID_ABI=arm64-v8a`时,不需要设置。 +- `ANDROID_ARM_NEON`,是否使用NEON指令。 + - `ANDROID_ABI=armeabi-v7a`时,可设置`ON/OFF`,默认值为`ON`; + - `ANDROID_ABI=arm64-v8a`时,不需要设置。 + +其他配置参数: + +- `USE_EIGEN_FOR_BLAS`,是否使用Eigen库进行矩阵计算。可设置`ON/OFF`,默认值为`OFF`。 +- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC/CXX`的值;若环境变量`CC/CXX`没有设置,则设置成`cc/c++`编译器。 + +常用的cmake配置如下: + +```bash +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \ + -DANDROID_ABI=armeabi-v7a \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DUSE_EIGEN_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +``` +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \ + -DANDROID_ABI=arm64-v8a \ + -DUSE_EIGEN_FOR_BLAS=OFF \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +用户还可根据自己的需求设置其他编译参数。 + +- 设置`CMAKE_BUILD_TYPE`为`MinSizeRel`,最小化生成的库的大小。 +- 设置`CMAKE_BUILD_TYPE`为`Release`,获得最快的执行速度, +- 用户亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。 + +**性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议: + +- 设置`CMAKE_BUILD_TYPE`为`Release` +- 使用`clang`编译工具链 +- `armeabi-v7a`时,设置`USE_EIGEN_BLAS=ON`,使用Eigen进行矩阵计算;`arm64-v8a`时,设置`USE_EIGEN_FOR_BLAS=OFF`,使用OpenBLAS进行矩阵计算 + +### 编译和安装 + +CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。 + +```bash +make +make install +``` + +注意:如果你曾经在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 + +执行完安装命令后,`your/path/to/install`目录中会包含`include`、`lib`和`third_party`目录,其中`include`中包含C-API的头文件,`lib`中包含若干个不同Android ABI的PaddlePaddle库,`third_party`中包含所依赖的所有第三方库。自此,PaddlePaddle的已经安装完成,用户可将`your/path/to/install`目录下的生成文件用于深度学习相关Android App中,调用方法见C-API文档。 diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md new file mode 100644 index 0000000000000000000000000000000000000000..572063e8012efee2d2e142eb57e459e0e8c6382c --- /dev/null +++ b/doc/mobile/cross_compiling_for_android_en.md @@ -0,0 +1,189 @@ +# Build PaddlePaddle for Android + +There are two approaches to build PaddlePaddle for Android: + +- [Cross-Compiling Using Docker](#cross-compiling-using-docker) +- [Cross-Compiling on Linux](#cross-compiling-on-linux) + +## Cross-Compiling Using Docker + +Docker-based cross-compiling is the recommended approach because Docker runs on all major operating systems, including Linux, Mac OS X, and Windows. + +### Build the Docker Image + +The following steps pack all the tools that we need to build PaddlePaddle into a Docker image. + +```bash +$ git clone https://github.com/PaddlePaddle/Paddle.git +$ cd Paddle +$ docker build -t paddle:dev-android . -f Dockerfile.android +``` + +Users can directly use the published Docker image. 
+ +```bash +$ docker pull paddlepaddle/paddle:latest-dev-android +``` + +For users in China, we provide a faster mirror. + +```bash +$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android +``` + +### Build the Inference Library + +We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below: + +```bash +$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android ./paddle/scripts/paddle_build.sh build_android +``` + +The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`: + + ++ + + + + + + + + + + + + + + + + + + + + + + +
| Argument | Optional Values | Default |
|----------|-----------------|---------|
| ANDROID_ABI | armeabi-v7a, arm64-v8a | armeabi-v7a |
| ANDROID_API | >= 16 | 21 |
+ +The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API. + +The build command, [`paddle/scripts/paddle_build.sh build_android`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`. For information about other configuration arguments, please continue reading. + +The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`. + +## Cross-Compiling on Linux + +The Linux-base approach to cross-compile is to run steps in `Dockerfile.android` manually on a Linux x64 computer. + +### Setup the Environment + +To build for Android's, we need [Android NDK]( +https://developer.android.com/ndk/downloads/index.html): + +```bash +wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip +unzip -q android-ndk-r14b-linux-x86_64.zip +``` + +Android NDK includes everything we need to build the [*standalone toolchain*](https://developer.android.com/ndk/guides/standalone_toolchain.html), which in then used to build PaddlePaddle for Android. (We plan to remove the intermediate stage of building the standalone toolchain in the near future.) + +- To build the standalone toolchain for `armeabi-v7a` and Android API level 21: + +```bash +your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain +``` + + The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`. + +- To build the standalone toolchain for `arm64-v8a` and Android API level 21: + +```bash +your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain +``` + + The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`. + +### Cross-Compiling Arguments + +CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which support Android cross-compiling. PaddlePaddle detects the CMake version, for those newer than 3.7, it uses [the official version](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling). + +Some other CMake arguments you need to know: + +- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF` and `WITH_GOLANG=OFF`. +- `WITH_C_API` must be `ON`, to build the C-based inference library for Android. +- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API. + +Some Android-specific arguments: + +- `ANDROID_STANDALONE_TOOLCHAIN`: the absolute path of the Android standalone toolchain, or the path relative to the CMake build directory. PaddlePaddle's CMake extensions would derive the cross-compiler, sysroot and Android API level from this argument. 
+- `ANDROID_TOOLCHAIN`: could be `gcc` or `clang`. The default value is `clang`. + - For CMake >= 3.7, it should anyway be `clang`. For older versions, it could be `gcc`. + - Android's official `clang` requires `glibc` >= 2.15. +- `ANDROID_ABI`: could be `armeabi-v7a` or `arm64-v8a`. The default value is `armeabi-v7a`. +- `ANDROID_NATIVE_API_LEVEL`: could be derived from the value of `ANDROID_STANDALONE_TOOLCHAIN`. +- `ANROID_ARM_MODE`: + - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`; + - no need to specify when `ANDROID_ABI=arm64-v8a`. +- `ANDROID_ARM_NEON`: indicates if to use NEON instructions. + - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`; + - no need to specify when `ANDROID_ABI=arm64-v8a`. + +Other useful arguments: + +- `USE_EIGEN_FOR_BLAS`: indicates if using Eigen. Could be `ON` or `OFF`, defaults to `OFF`. +- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC/C++`, or `cc/c++`. + +Some frequent configurations for your reference: + +```bash +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \ + -DANDROID_ABI=armeabi-v7a \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DUSE_EIGEN_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +``` +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \ + -DANDROID_ABI=arm64-v8a \ + -DUSE_EIGEN_FOR_BLAS=OFF \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + + +There are some other arguments you might want to configure. + +- `CMAKE_BUILD_TYPE=MinSizeRel` minimizes the size of library. +- `CMAKE_BUILD_TYPE-Release` optimizes the runtime performance. + +Our own tip for performance optimization to use clang and Eigen or OpenBLAS: + +- `CMAKE_BUILD_TYPE=Release` +- `ANDROID_TOOLCHAIN=clang` +- `USE_EIGEN_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`. + +### Build and Install + +After running `cmake`, we can run `make; make install` to build and install. + +Before building, you might want to remove the `third_party` and `build` directories including pre-built libraries for other architectures. + +After building,in the directory `CMAKE_INSTALL_PREFIX`, you will find three sub-directories: + +- `include`: the header file of the inference library, +- `lib`: the inference library built for various Android ABIs, +- `third_party`: dependent third-party libraries built for Android. 
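If you want to confirm that the archives in `lib` were really built for the ABI you asked for, the standalone toolchain's `readelf` can tell you. The paths and the archive name `libpaddle_capi_whole.a` below are assumptions for illustration only; substitute the names from your own install tree.

```bash
# Hypothetical paths -- adjust to your CMAKE_INSTALL_PREFIX and toolchain location.
INSTALL_DIR=your/path/to/install
TOOLCHAIN=your/path/to/arm_standalone_toolchain

# An armeabi-v7a build should report "Machine: ARM";
# an arm64-v8a build (checked with the arm64 toolchain) reports "AArch64".
"$TOOLCHAIN/bin/arm-linux-androideabi-readelf" -h \
    "$INSTALL_DIR/lib/libpaddle_capi_whole.a" | grep -m 1 Machine
```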
diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..d5196d9a4c93c7692d2a624ec7d0650e32806338 --- /dev/null +++ b/doc/mobile/cross_compiling_for_ios_cn.md @@ -0,0 +1,117 @@ +# iOS平台编译指南 +交叉编译iOS平台上适用的PaddlePaddle库,需要在MacOS系统上进行。本文的将介绍在MacOS上,从源码交叉编译iOS平台上适用的PaddlePaddle库。 + +## 准备交叉编译环境 +Apple官方为iOS开发提供了完整的交叉编译工具和集成开发环境,用户从App Store下载安装Xcode即可。也可自行前往官网下载,[Xcode](https://developer.apple.com/cn/xcode/)。安装完成之后,可在命令行执行`xcodebuild -version`,判断是否安装成功。 + +```bash +$ xcodebuild -version +Xcode 9.0 +Build version 9A235 +``` + +## 配置交叉编译参数 + +PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake),以提供一些默认的编译器和编译参数配置。 + +交叉编译iOS版本的PaddlePaddle库时,有一些必须配置的参数: + +- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`iOS`。在设置`CMAKE_SYSTEM_NAME=iOS`后,PaddlePaddle的CMake系统会自动编译所有的第三方依赖库,并且强制设置一些PaddlePaddle参数的值(`WITH_C_API=ON`、`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。 +- `WITH_C_API`,是否编译C-API预测库,必须设置为ON。在iOS平台上只支持使用C-API来预测。 +- `WITH_SWIG_PY`,必须设置为`OFF`。在iOS平台上不支持通过swig调用来训练或者预测。 + +iOS平台可选配置参数: + +- `IOS_PLATFORM`,可设置为`OS`(默认值)或`SIMULATOR`。 + - `OS`,构建目标为`arm`架构的iPhone或者iPad等物理设备。 + - `SIMULATOR`,构建目标为`x86`架构的模拟器平台。 +- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示,默认编译所有架构: + + + + + + + + + + + + + + + + + + + + + + +
| IOS_PLATFORM | IOS_ARCH |
|--------------|----------|
| OS | armv7, armv7s, arm64 |
| SIMULATOR | i386, x86_64 |
+ +- `IOS_DEPLOYMENT_TARGET`,最小的iOS部署版本,默认值为`7.0`。 +- `IOS_ENABLE_BITCODE`,是否使能[Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3),可设置`ON/OFF`,默认值为`ON`。 +- `IOS_USE_VECLIB_FOR_BLAS`,是否使用[vecLib](https://developer.apple.com/documentation/accelerate/veclib)框架进行BLAS矩阵计算,可设置`ON/OFF`,默认值为`OFF`。 +- `IOS_DEVELOPMENT_ROOT`,`Developer`目录,可显式指定为`/path/to/platform/Developer`。若未显式指定,PaddlePaddle将会根据`IOS_PLATFORM`自动选择`Xcode`对应`platform`的`Developer`目录。 +- `IOS_SDK_ROOT`,所使用`SDK`的根目录,可显式指定为`/path/to/platform/Developer/SDKs/SDK`。若未显式指定,PaddlePaddle将会自动选择`IOS_DEVELOPMENT_ROOT`目录下最新的`SDK`版本。 + +其他配置参数: + +- `USE_EIGEN_FOR_BLAS`,是否使用Eigen库进行矩阵计算,在`IOS_USE_VECLIB_FOR_BLAS=OFF`时有效。可设置`ON/OFF`,默认值为`OFF`。 +- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。默认值为环境变量`CC/CXX`的值;若环境变量`CC/CXX`未设置,则使用`cc/c++`编译器。 + +常用的cmake配置如下: + +```bash +cmake -DCMAKE_SYSTEM_NAME=iOS \ + -DIOS_PLATFORM=OS \ + -DIOS_ARCH="armv7;arm64" \ + -DIOS_ENABLE_BITCODE=ON \ + -DIOS_USE_VECLIB_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_TESTING=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +```bash +cmake -DCMAKE_SYSTEM_NAME=iOS \ + -DIOS_PLATFORM=SIMULATOR \ + -DIOS_ARCH="x86_64" \ + -DIOS_USE_VECLIB_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_TESTING=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +用户还可根据自己的需求设置其他编译参数。比如希望最小化生成库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望得到最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。 + +**性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议: + +- 设置`CMAKE_BUILD_TYPE`为`Release` +- 设置`IOS_USE_VECLIB_FOR_BLAS=ON`,调用`vecLib`框架提供的BLAS函数进行矩阵计算。 + +## 编译和安装 + +CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。 + +``` +$ make +$ make install +``` + +注意:如果你曾在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 + +执行完安装命令后,`your/path/to/install`目录中会包含以下内容: + +- `include`目录,其中包含所有C-API的头文件 +- `lib`目录,其中包含PaddlePaddle的C-API静态库 +- `third_party`目录,其中包含所依赖的所有第三方库 + +注意,如果PaddlePaddle库需要同时支持真机和模拟器,则需要分别编译真机和模拟器版本,然后使用`lipo`工具合并fat库。 + +自此,PaddlePaddle库已经安装完成,用户可将合成的fat库用于深度学习相关的iOS App中,调用方法见C-API文档。 diff --git a/doc/mobile/cross_compiling_for_ios_en.md b/doc/mobile/cross_compiling_for_ios_en.md new file mode 100644 index 0000000000000000000000000000000000000000..19bfe86c511c7e43b462f94c8cabba420b3007f1 --- /dev/null +++ b/doc/mobile/cross_compiling_for_ios_en.md @@ -0,0 +1,120 @@ +# Build PaddlePaddle for iOS + +This tutorial will walk you through cross compiling the PaddlePaddle library for iOS from the source in MacOS. + +## Preparation + +Apple provides Xcode for cross-compiling and IDE for iOS development. Download from App store or [here](https://developer.apple.com/cn/xcode/). To verify your installation, run command as follows + +```bash +$ xcodebuild -version +Xcode 9.0 +Build version 9A235 +``` + +## Cross-compiling configurations + +PaddlePaddle provides cross-compiling toolchain configuration documentation [cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake), which has some default settings for frequently used compilers. + +There are some mandatory environment variables need to be set before cross compiling PaddlePaddle for iOS: + +- `CMAKE_SYSTEM_NAME`, CMake compiling target platform name, has to be `iOS`. 
PaddlePaddle CMake will compile all the third party dependencies and enforce some parameters (`WITH_C_API=ON`, `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`,`WITH_RDMA=OFF`) when this variable is set with value `iOS`. + +- `WITH_C_API`, Whether to compile inference C-API library, has to be `ON`, since C-API is the only supported interface for inferencing in iOS. +- `WITH_SWIG_PY`, has to be `OFF`. It's not supported to inference or train via swig in iOS. + +Optional environment variables for iOS are: + +- `IOS_PLATFORM`, either `OS` (default) or `SIMULATOR`. + - `OS`, build targets ARM-based physical devices like iPhone or iPad. + - `SIMULATOR`, build targets x86 architecture simulators. +- `IOS_ARCH`, target architecture. By default, all architecture types will be compiled. If you need to specify the architecture to compile for, please find valid values for different `IOS_PLATFORM` settings from the table below: + + + + + + + + + + + + + + + + + + + + + + +
| IOS_PLATFORM | IOS_ARCH |
|--------------|----------|
| OS | armv7, armv7s, arm64 |
| SIMULATOR | i386, x86_64 |
+ +- `IOS_DEPLOYMENT_TARGET`, minimum iOS version to deployment, `7.0` by default. +- `IOS_ENABLE_BITCODE`, whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3), values can be `ON/OFF`, `ON` by default. +- `IOS_USE_VECLIB_FOR_BLAS`, whether to use [vecLib](https://developer.apple.com/documentation/accelerate/veclib) framework for BLAS computing. values can be `ON/OFF`, `OFF` by default. +- `IOS_DEVELOPMENT_ROOT`, the path to `Developer` directory, can be explicitly set with your `/path/to/platform/Developer`. If left blank, PaddlePaddle will automatically pick the Xcode corresponding `platform`'s `Developer` directory based on your `IOS_PLATFORM` value. +- `IOS_SDK_ROOT`, the path to `SDK` root, can be explicitly set with your `/path/to/platform/Developer/SDKs/SDK`. if left black, PaddlePaddle will pick the latest SDK in the directory of `IOS_DEVELOPMENT_ROOT`. + +other settings: + +- `USE_EIGEN_FOR_BLAS`, whether to use Eigen for matrix computing. effective when `IOS_USE_VECLIB_FOR_BLAS=OFF`. Values can be `ON/OFF`, `OFF` by default. +- `HOST_C/CXX_COMPILER`, host C/C++ compiler. Uses value from environment variable `CC/CXX` by default or `cc/c++` if `CC/CXX` doesn't exist. + +some typical cmake configurations: + +```bash +cmake -DCMAKE_SYSTEM_NAME=iOS \ + -DIOS_PLATFORM=OS \ + -DIOS_ARCH="armv7;arm64" \ + -DIOS_ENABLE_BITCODE=ON \ + -DIOS_USE_VECLIB_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_TESTING=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +```bash +cmake -DCMAKE_SYSTEM_NAME=iOS \ + -DIOS_PLATFORM=SIMULATOR \ + -DIOS_ARCH="x86_64" \ + -DIOS_USE_VECLIB_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_TESTING=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +You can set other compiling parameters for your own need. I.E. if you are trying to minimize the library size, set `CMAKE_BUILD_TYPE` with `MinSizeRel`; or if the performance is your concern, set `CMAKE_BUILD_TYPE` with `Release`. You can even manipulate the PaddlePaddle compiling procedure by manually set `CMAKE_C/CXX_FLAGS` values. + +**TIPS for a better performance**: + +- set `CMAKE_BUILD_TYPE` with `Release` +- set `IOS_USE_VECLIB_FOR_BLAS` with `ON` + +## Build and install + +After CMake, run following commands, PaddlePaddle will download the compile 3rd party dependencies, compile and install PaddlePaddle inference library. + +``` +$ make +$ make install +``` + +Please Note: if you compiled PaddlePaddle in the source directory for other platforms, do remove `third_party` and `build` directory within the source with `rm -rf` to ensure that all the 3rd party libraries dependencies and PaddlePaddle is newly compiled with current CMake configuration. + +`your/path/to/install` directory will have following directories after `make install`: + +- `include`, contains all the C-API header files. +- `lib`, contains PaddlePaddle C-API static library. +- `third_party` contains all the 3rd party libraries. + +Please note: if PaddlePaddle library need to support both physical devices and simulators, you will need to compile correspondingly, then merge fat library with `lipo`. + +Now you will have PaddlePaddle library compiled and installed, the fat library can be used in deep learning related iOS APPs. Please refer to C-API documentation for usage guides. 
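As a sketch of the `lipo` step mentioned above: assuming you produced one install tree with `IOS_PLATFORM=OS` and another with `IOS_PLATFORM=SIMULATOR`, the two archives could be merged as follows. The directory names and the archive name `libpaddle_capi_whole.a` are illustrative assumptions; use the names from your own builds.

```bash
# Hypothetical install trees from two separate cmake/make runs.
DEVICE_LIB=ios_os_install/lib/libpaddle_capi_whole.a
SIMULATOR_LIB=ios_simulator_install/lib/libpaddle_capi_whole.a

# Merge the device and simulator archives into a single fat library.
lipo -create "$DEVICE_LIB" "$SIMULATOR_LIB" -output libpaddle_capi_whole_fat.a

# Verify that both architectures are present in the result.
lipo -info libpaddle_capi_whole_fat.a
```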
diff --git a/doc/mobile/cross_compiling_for_raspberry_cn.md b/doc/mobile/cross_compiling_for_raspberry_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..f8ef9dc8031613831437745995268f3abc392f5b --- /dev/null +++ b/doc/mobile/cross_compiling_for_raspberry_cn.md @@ -0,0 +1,62 @@ +# Raspberry Pi平台编译指南 + +通常有两个方法来构建基于 Rasspberry Pi 的版本: + +1. 通过ssh等方式登录到Raspberry Pi系统上来构建。所需的开发工具和第三方库可以参考 [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile)。 + +1. 另一个方法是交叉编译。这篇文档介绍在 Linux/x64 上交叉编译Raspberry Pi平台上适用的PaddlePaddle的方法和步骤。 + +## 安装交叉编译器 + +克隆下面 Github repo + +```bash +git clone https://github.com/raspberrypi/tools.git +``` + +即可在 `./tools/tree/master/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64` 目录里找到交叉编译器 arm-linux-gnueabihf-gcc 4.8.3。运行该编译工具链需要一台 Linux x64 机器上以及 2.14版本以上的 glibc。 + +## 配置交叉编译参数 + +CMake[支持交叉编译](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。PaddlePaddle for Raspberry Pi的配置信息在[cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake)。 + +交叉编译Raspberry Pi版本PaddlePaddle库时,有一些必须配置的参数: + +- `CMAKE_SYSTEM_NAME`:CMake编译的目标平台,必须配置为`RPi`。在设置`CMAKE_SYSTEM_NAME=RPi`后,PaddlePaddle的CMake系统才认为在是在交叉编译Raspberry Pi系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及目标机版OpenBLAS库。 + +- `RPI_TOOLCHAIN`:编译工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动设置需要使用的交叉编译器;否则,用户需要在cmake时手动设置这些值。无默认值。 + +- `RPI_ARM_NEON`:是否使用NEON指令。目前必须设置成`ON`,默认值为`ON`。 + +- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。 + +一个常用的CMake配置如下: + +``` +cmake -DCMAKE_SYSTEM_NAME=RPi \ + -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \ + -DRPI_ARM_NEON=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_GPU=OFF \ + -DWITH_C_API=ON \ + -DWITH_PYTHON=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +其中`WITH_C_API=ON`表示需要构建推理库。 + +用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。 + +## 编译和安装 + +CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle。 + +```bash +make +make install +``` + +注意:如果你曾经在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 + +执行完安装命令后,`your/path/to/install`目录中会包含`include`和`lib`目录,其中`include`中包含C-API的头文件,`lib`中包含一个Raspberry Pi版本的库。 diff --git a/doc/mobile/cross_compiling_for_raspberry_en.md b/doc/mobile/cross_compiling_for_raspberry_en.md new file mode 100644 index 0000000000000000000000000000000000000000..3c1a5950ff9553bb725d5a96e3fdf2e5e9f6f95c --- /dev/null +++ b/doc/mobile/cross_compiling_for_raspberry_en.md @@ -0,0 +1,62 @@ +# Build PaddlePaddle for Raspberry Pi + +You may use any of the following two approaches to build the inference library of PaddlePaddle for Raspberry Pi: + +1. Build using SSH: Log in to a Raspberry Pi using SSH and build the library. The required development tools and third-party dependencies are listed in here: [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile). + +1. Cross-compile: We talk about how to cross-compile PaddlePaddle for Raspberry Pi on a Linux/x64 machine, in more detail in this article. + +## The Cross-Compiling Toolchain + +Step 1. Clone the Github repo by running the following command. + +```bash +git clone https://github.com/raspberrypi/tools.git +``` + +Step 2. 
Use the pre-built cross-compiler found in `./tools/tree/master/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`. To run it on a Linux computer, glibc version >= 2.14 is needed. + +## CMake Arguments + +CMake supports [cross-compiling](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). All CMake configuration arguments required for the cross-compilation for Raspberry Pi can be found in [`cmake/cross_compiling/raspberry_pi.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake). + +Some important arguments that need to be set: + +- `CMAKE_SYSTEM_NAME`: The target platform. Must be `RPi`. + +- `RPI_TOOLCHAIN`: The absolute path of the cross-compiling toolchain. + +- `RPI_ARM_NEON`: Use ARM NEON Intrinsics. This is a required argument and set default to `ON`. + +- `HOST_C/CXX_COMPILER`: The C/C++ compiler for the host. It is used to build building tools running on the host, for example, protoc. + +A commonly-used CMake configuration is as follows: + +``` +cmake -DCMAKE_SYSTEM_NAME=RPi \ + -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \ + -DRPI_ARM_NEON=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_GPU=OFF \ + -DWITH_C_API=ON \ + -DWITH_PYTHON=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +To build the inference library, please set the argument WITH\_C\_API to ON: `WITH_C_API=ON`. + +You can add more arguments. For example, to minimize the size of the generated inference library, you may use `CMAKE_BUILD_TYPE=MinSizeRel`. For performance optimization, you may use `CMAKE_BUILD_TYPE=Release`. + +## Build and Install + +The following commands build the inference library of PaddlePaddle for Raspberry Pi and third-party dependencies. + +```bash +make +make install +``` + + The intermediate files will be stored in `build`. Third-party libraries will be located in `build/third_party`. If you have already built it for other platforms like Android or iOS, you may want to clear these directories by running the command: `rm -rf build`. + +The infernece library will be in `your/path/to/install/lib`, with related header files in `your/path/to/install/include`. diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..56d1515005f6e40b084c6b2184c6a0b3e3a00496 --- /dev/null +++ b/doc/mobile/index_cn.rst @@ -0,0 +1,9 @@ +移动端 +====== + +.. toctree:: + :maxdepth: 1 + + cross_compiling_for_android_cn.md + cross_compiling_for_ios_cn.md + cross_compiling_for_raspberry_cn.md diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..e0acdff0284e3bc84b2cc4a34a142ee01754f940 --- /dev/null +++ b/doc/mobile/index_en.rst @@ -0,0 +1,9 @@ +Mobile +====== + +.. toctree:: + :maxdepth: 1 + + cross_compiling_for_android_en.md + cross_compiling_for_ios_en.md + cross_compiling_for_raspberry_en.md diff --git a/doc/survey/cluster_bootstrapping_tools.md b/doc/survey/cluster_bootstrapping_tools.md new file mode 100644 index 0000000000000000000000000000000000000000..1cd9962700bb49866f1ed6987abc28b27888a23f --- /dev/null +++ b/doc/survey/cluster_bootstrapping_tools.md @@ -0,0 +1,71 @@ +# Cluster bootstrapping tool survey +## Abstract +In order to bring up a cluster from bare metal machine to a fully functional kubernetes cluster for Paddlepaddle to run, we need to utilize some tools. 
Here we are going to compare [Sextant](https://github.com/k8sp/sextant) and [Tectonic installer](https://github.com/coreos/tectonic-installer) + +## Basic assumptions +Here are some basic assumptions before we move on to details +1. You are an administrator of a bare metal machine cluster, which means: + * you have full control to each of the machines. + * you have full control to the network which machines are connected to. +2. Machines can be booted from network with PEX or iPXE +3. You understand the [general procedure to bring up a cluster](#appendix-general-procedure-to-bring-up-a-cluster) + +if your cluster is able to mark above items with checkmarks, then keep reading. + +## Comparing Sextant and Tectonic installer +### Sextant +Sextant is an end2end solution to bring up a bare metal cluster to a fully functional k8s cluster, it integrates DHCP, name service, PEX, cloud-config-service, docker registry services altogether. + +#### Pros +1. End2End: basically all admin need to do is to config the cluster.yaml and power on the cluster. +2. Offline cluster configuration: Sextant has 2 phases during working with it, config time and deploy time. when admin is configuring, it requires admin's machine has internet connectivity, which will download some images, etc. But in deploy time, it's completely OK to go offline since all dependencies are ready during config time. +3. docker registry integrated. +4. GPU machine took care of. + +### Cons +1. k8s API server is not deployed with high availability in considering by default. +2. No grouping support. +3. No API interface, a one-off service. + + +### Tectonic installer +First of all, Tectonic is not free, it requires coreos.com account as a step of installation, and free user can only create less than 10 nodes. + +Tectonic is a suite of software which wraps around k8s and providing more utility regarding dev ops, ie, +Tectonic installer as it's named, it installs Tectonic to a bare metal cluster which means it's not totally an equivalent of Sextant. At the "booting a cluster" part, it mostly utilizes [Matchbox](https://github.com/coreos/matchbox), which is a general cluster bootstrapper. + +Matchbox's Approach is similar to Sexstant. + +### Pros +1. supports grouping machines. +2. supports running provisioning service in rtk. (not a big deal though). +3. supports http/gRPC API interface. +4. supports multi-template. + +### Cons +1. Not an e2e solution to bring up a cluster, need a lot of extra work and other software. +2. [Not fully supporting](https://github.com/coreos/matchbox/issues/550) centOS deployment yet. + +## Conclusion +Sextant is a better solution overall for paddle cloud deploying to a bare metal cluster. It would be great if Sextant can also 1) deploy k8s api server with high availability by default; 2) not designed as a one-off service. + + + +## Appendix: General procedure to bring up a cluster +It's physically impossible for a cluster admin to manually install OS and applications into cluster nodes one by one, here is what an admin would do in cloud industry: +1. setup a bootstrap machine with static IP in the cluster, which has following services: + * DHCP: assigns ip address for rest of the nodes. + * name service: to map node name to a IP + * PXE related services: the booting related info will be delivered to newly booted machines as their IP is assigned via DHCP service, PXE service will provide further booting and installing info and image with TFTP and http protocol. 
+  * cluster config service: provides cluster nodes with their OS configuration via HTTP.
+  * optional Docker registry: a built-in Docker registry makes the whole cluster independent of internet connectivity and speeds up software distribution.
+2. A new node powers on; it will
+  * broadcast a request for an IP address.
+  * the DHCP server assigns the IP address and delivers the PXE booting info to the node.
+  * the node requests the config files named in the DHCP boot info from the TFTP service; in most cases the config file points to an HTTP service for the boot image.
+  * since PXE is configured with an initrd, the node uses the cloud-config service and performs further installation, such as CoreOS or k8s.
+  * then the node restarts.
+
+For a deeper understanding, the following two links from Matchbox are good reading:
+* [Machine lifecycle](https://github.com/coreos/matchbox/blob/master/Documentation/machine-lifecycle.md)
+* [PXE booting](https://github.com/coreos/matchbox/blob/master/Documentation/network-booting.md)
diff --git a/doc/survey/dynamic_graph.md b/doc/survey/dynamic_graph.md
new file mode 100644
index 0000000000000000000000000000000000000000..7f62eeadff43af1f0a3c81e284a6508bf063b21e
--- /dev/null
+++ b/doc/survey/dynamic_graph.md
@@ -0,0 +1,379 @@
+# Automatic Differentiation with the Tape
+
+## Automatic Differentiation
+
+A key challenge in deep learning is to automatically derive the backward pass given the forward pass as a program. This problem had long been studied in the field of [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf), or autodiff, before the rise of deep learning.
+
+## Program Transformation v.s. Backtracking
+
+Given the forward pass program, there are two strategies to derive the backward pass:
+
+1. by transforming the forward pass program without executing it, or
+1. by backtracking the execution process of the forward pass program.
+
+This article is about the latter strategy.
+
+## The Tape and Dynamic Networks
+
+We refer to the trace of the execution of the forward pass program as a *tape* [[1]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf). When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass each time. This is time-consuming, but it naturally handles the case where the forward program includes control flow such as if-else and for/while. With such control flow, the execution trace may change from iteration to iteration. Such changes are known as *dynamic networks* in the field of deep learning.
+
+## Typical Systems
+
+Deep learning systems that utilize the idea of dynamic networks have gained popularity in recent years. This article surveys the following typical systems:
+
+- [DyNet](https://dynet.readthedocs.io/en/latest/)
+- [PyTorch](https://pytorch.org/)
+- Chainer
+- Autograd from HIPS
+
+Before diving into these systems, let us consider an example forward pass program:
+
+```python
+x = Variable(randn(20, 1))
+label = Variable(randint(1))
+W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20))
+h = matmul(W_1, x)
+pred = matmul(W_2, h)
+loss = softmax(pred, label)
+loss.backward()
+```
+
+## The Representation of Tapes
+
+### DyNet: the Tape as a List
+
+DyNet uses a linear data structure, a list, to represent the tape. During the execution of the above example, the tape is a list of operators: `matmul`, `matmul`, and `softmax`.
The list also includes the information needed for the backward pass, such as pointers to the inputs and outputs. The tape is then played in reverse order at `loss.backward()`.
+
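+The following is a minimal, self-contained sketch of this tape-as-a-list idea (an illustration of the mechanism only, not DyNet's actual implementation; the `softmax`/loss step of the example above is omitted for brevity):
+
+```python
+import numpy as np
+
+class Var:
+    def __init__(self, value):
+        self.value = value
+        self.grad = None
+
+TAPE = []  # list of (backward_fn, inputs, output) records, in execution order
+
+def matmul(a, b):
+    out = Var(a.value @ b.value)
+    def backward(g):
+        # d(a@b)/da = g @ b^T,  d(a@b)/db = a^T @ g
+        return [g @ b.value.T, a.value.T @ g]
+    TAPE.append((backward, [a, b], out))  # record the op on the tape
+    return out
+
+def backward(loss):
+    loss.grad = np.ones_like(loss.value)
+    for bwd, inputs, out in reversed(TAPE):  # play the tape in reverse order
+        if out.grad is None:
+            continue
+        for var, g in zip(inputs, bwd(out.grad)):
+            var.grad = g if var.grad is None else var.grad + g
+
+x = Var(np.random.randn(20, 1))
+W_1, W_2 = Var(np.random.randn(20, 20)), Var(np.random.randn(10, 20))
+h = matmul(W_1, x)      # the tape now holds one record
+pred = matmul(W_2, h)   # and now two: [matmul, matmul]
+backward(pred)          # stand-in for loss.backward()
+print(W_1.grad.shape)   # (20, 20)
+```
+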
+
+digraph g {
+    graph [
+        rankdir = "LR"
+    ];
+    node [
+        fontsize = "16"
+        shape = "ellipse"
+    ];
+    edge [];
+    "node0" [
+        label = "<f0> type: matmul | <f1> input: W_1, x | <f2> output: h"
+        shape = "record"
+    ];
+    "node1" [
+        label = "<f0> type: matmul | <f1> input: W_2, h | <f2> output: pred"
+        shape = "record"
+    ];
+    "node2" [
+        label = "<f0> type: softmax | <f1> input: pred, label | <f2> output: loss"
+        shape = "record"
+    ];
+    "node0":f0 -> "node1":f0 [];
+    "node1":f0 -> "node2":f0 [];
+}
+
+ +![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20}) + +### PyTorch: the Tape as a Graph + +The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order. Please be aware that a `Function` might have more than one `prev_func`s. + +
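+Below is a minimal sketch of this recording scheme (an illustration only, not PyTorch's real internals): each `Variable` remembers its `creator`, each function remembers its `prev_func`s, and the backward order comes from a topological sort over the `prev_func` edges.
+
+```python
+import numpy as np
+
+class Variable:
+    def __init__(self, value, creator=None):
+        self.value = value
+        self.creator = creator      # function that produced this Variable; None for leaves
+
+class MatMul:
+    def __call__(self, a, b):
+        # remember which functions produced my inputs (leaves have no creator)
+        self.prev_funcs = [v.creator for v in (a, b) if v.creator is not None]
+        return Variable(a.value @ b.value, creator=self)
+
+def topo_sort(func, seen=None, order=None):
+    # Depth-first walk over prev_func edges; appending after children yields a topological order.
+    if seen is None:
+        seen, order = set(), []
+    if func is None or func in seen:
+        return order
+    seen.add(func)
+    for prev in func.prev_funcs:
+        topo_sort(prev, seen, order)
+    order.append(func)
+    return order
+
+x = Variable(np.random.randn(20, 1))
+W_1, W_2 = Variable(np.random.randn(20, 20)), Variable(np.random.randn(10, 20))
+h = MatMul()(W_1, x)       # h.creator is the first MatMul
+pred = MatMul()(W_2, h)    # pred.creator.prev_funcs == [h.creator]
+# A backward pass would run each function's gradient op in reversed(topo_sort(pred.creator)).
+print(len(topo_sort(pred.creator)))  # 2
+```
+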
+
+digraph g {
+    graph [
+        rankdir = "LR"
+    ];
+
+    subgraph function {
+        node [
+            fontsize = "16"
+            style = filled
+            shape = "record"
+        ];
+        "matmul0" [ label = "<f0> type: matmul | prev_func: None" ];
+        "matmul1" [ label = "<f0> type: matmul | prev_func: matmul" ];
+        "softmax" [ label = "<f0> type: softmax | prev_func: matmul" ];
+    }
+
+    subgraph variable {
+        node [
+            fontsize = "16"
+            shape = "Mrecord"
+            style = filled
+            fillcolor = white
+        ];
+        "x" [ label = "<f0> x | <f1> creator: None" ];
+        "label" [ label = "<f0> label | <f1> creator: None" ];
+        "W_1" [ label = "<f0> W_1 | <f1> creator: None" ];
+        "W_2" [ label = "<f0> W_2 | <f1> creator: None" ];
+        "h" [ label = "<f0> h | <f1> creator: None" ];
+        "pred" [ label = "<f0> pred | <f1> creator: matmul" ];
+        "loss" [ label = "<f0> loss | <f1> creator: softmax" ];
+    }
+
+    subgraph data_flow {
+        "x":f0 -> "matmul0":f0;
+        "W_1":f0 -> "matmul0":f0;
+        "matmul0":f0 -> "h":f0;
+
+        "h":f0 -> "matmul1":f0;
+        "W_2":f0 -> "matmul1":f0;
+        "matmul1":f0 -> "pred":f0;
+
+        "pred":f0 -> "softmax":f0;
+        "label":f0 -> "softmax":f0;
+        "softmax":f0 -> "loss":f0;
+    }
+
+    subgraph prev_func {
+        edge [color="red", arrowsize="0.6", penwidth="1", constraint=false];
+        "matmul1":f1 -> "matmul0":f0;
+        "softmax":f1 -> "matmul1":f0;
+        label = "prev_func";
+    }
+}
+
+ +![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20}) + +Chainer and Autograd use the similar techniques to record the forward pass. For details, please refer to the appendix. + +## Comparison: List v.s. Graph + +The list of DyNet could be considered the result of the topological sort of the graph of PyTorch. Or, the graph is the raw representation of the tape, which gives us the chance to *prune* part of the graph that is irrelevant with the backward pass before the topological sort [[2]](https://openreview.net/pdf?id=BJJsrmfCZ). Consider the following example, PyTorch only does backward on `SmallNet` while DyNet does both `SmallNet` and `BigNet`: + +```python +result = BigNet(data) +loss = SmallNet(data) +loss.backward() +``` + +## Lazy v.s. Immediate Evaluation + +Another difference between DyNet and PyTorch is that DyNet lazily evaluates the forward pass, whereas PyTorch executes it immediately. Consider the following example: + +```python +for epoch in range(num_epochs): + for in_words, out_label in training_data: + dy.renew_cg() + W = dy.parameter(W_p) + b = dy.parameter(b_p) + score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b) + loss_sym = dy.pickneglogsoftmax(score_sym, out_label) + loss_val = loss_sym.value() + loss_sym.backward() +``` + +The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This defered execution is useful because it allows some graph-like optimization possible, e.g. kernel fusion. + +PyTorch chooses immediate evaluation. 
It never materializes a separate "forward graph"/"tape" (there is no need to call something like `dy.renew_cg()` to reset a list); it records only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
+
+
+## Fluid: Learning the Lessons
+
+Please refer to `paddle/contrib/dynamic/`.
+
+## Appendix
+
+### Overview
+
+| Framework | Has Tape | Core in C++ | First Release Date |
+|-----------|----------|-------------|--------------------|
+| Autograd  | No       | No          | Mar 5, 2015        |
+| Chainer   | No       | No          | Jun 5, 2015        |
+| PyTorch   | No       | Yes         | Aug 31, 2016       |
+| DyNet     | Yes      | Yes         | Oct 12, 2016       |
+
+### Source Code
+#### Autograd
+[Backward code](https://github.com/HIPS/autograd/blob/442205dfefe407beffb33550846434baa90c4de7/autograd/core.py#L8-L40). In the forward pass, a graph of `VJPNode`s is constructed.
+```python
+# User API
+def make_grad(fun, x):
+    start_node = VJPNode.new_root()
+    end_value, end_node = trace(start_node, fun, x)
+    def gradfun(g):
+        return backward_pass(g, end_node)
+    return gradfun, end_value
+
+# trace the forward pass by creating VJPNodes
+def trace(start_node, fun, x):
+    with trace_stack.new_trace() as t:
+        start_box = new_box(x, t, start_node)
+        end_box = fun(start_box)
+        return end_box._value, end_box._node
+
+def backward_pass(g, end_node):
+    outgrads = {end_node: (g, False)}
+    for node in toposort(end_node):
+        outgrad = outgrads.pop(node)
+        ingrads = node.vjp(outgrad[0])
+        for parent, ingrad in zip(node.parents, ingrads):
+            outgrads[parent] = add_outgrads(outgrads.get(parent), ingrad)
+    return outgrad[0]
+
+# Every VJPNode corresponds to an op_grad
+class VJPNode(Node):
+    __slots__ = ['parents', 'vjp']
+    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
+        self.parents = parents
+        vjpmaker = primitive_vjps[fun]
+        self.vjp = vjpmaker(parent_argnums, value, args, kwargs)
+```
+#### Chainer
+Example Code
+```python
+# (1) Function Set definition, creates FunctionNode
+model = FunctionSet(
+    l1=F.Linear(784, 100),
+    l2=F.Linear(100, 100),
+    l3=F.Linear(100, 10)).to_gpu()
+
+# (2) Optimizer Setup
+opt = optimizers.SGD()
+opt.setup(model)
+
+# (3) Forward computation
+def forward(x, t):
+    h1 = F.relu(model.l1(x))
+    h2 = F.relu(model.l2(h1))
+    y = model.l3(h2)
+    return F.softmax_cross_entropy(y, t)
+
+# (4) Training loop
+for epoch in xrange(n_epoch):
+    for i in xrange(0, N, b_size):
+        x = Variable(to_gpu(...))
+        t = Variable(to_gpu(...))
+        opt.zero_grads()
+        loss = forward(x, t)
+        loss.backward()
+        opt.update()
+```
+In `forward(x, t)`, a graph of [`VariableNode`](https://github.com/chainer/chainer/blob/master/chainer/variable.py#L110) and [`FunctionNode`](https://github.com/chainer/chainer/blob/a69103a4aa59d5b318f39b01dbcb858d465b89cf/chainer/function_node.py#L19) is constructed. Every output's `VariableNode.creator` points to the `FunctionNode`.
+```python
+class FunctionNode(object):
+    ...
+    def apply(self, inputs):
+        outputs = self.forward(inputs)
+        requires_grad = any([x.requires_grad for x in inputs])
+        ret = tuple([variable.Variable(y, requires_grad=requires_grad)
+                     for y in outputs])
+        # Topological ordering
+        self.rank = max([x.rank for x in inputs]) if inputs else 0
+        # Add backward edges
+        for y in ret:
+            y.creator_node = self
+        self.inputs = tuple([x.node for x in inputs])
+        self.outputs = tuple([y.node for y in ret])
+
+        return ret
+```
+`loss.backward()` calculates the accumulated gradients of all variables. The backward of each `FunctionNode` is called in topological order.
+```python
+class VariableNode(object):
+    ...
+    def backward(self, retain_grad, loss_scale):
+        if self.creator_node is None:
+            return
+
+        cand_funcs = []
+        seen_set = set()
+        grads = {}
+
+        # Initialize the error with 1, if this is a loss variable
+        if self.data.size == 1 and self._grad_var is None:
+            self.grad = numpy.ones_like(self.data)
+        grads[self._node] = self._grad_var
+
+        def add_cand(cand):
+            if cand not in seen_set:
+                # Negate since heapq is a min-heap
+                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
+                seen_set.add(cand)
+
+        add_cand(self.creator_node)
+
+        while cand_funcs:
+            _, _, func = heapq.heappop(cand_funcs)
+            gxs = func.backward_accumulate(func.inputs, func.outputs, func.outputs.grad)
+
+            for x, gx in zip(func.inputs, gxs):
+                if x in grads:
+                    grads[x] += gx
+                else:
+                    grads[x] = gx
+
+                if x.creator_node is not None:
+                    add_cand(x.creator_node)
+```
+
+#### PyTorch
+Example Code
+```python
+x = Variable(torch.ones(5, 5))
+y = Variable(torch.ones(5, 5) * 4)
+z = x ** 2 + x * 2 + x * y + y
+z.backward(torch.ones(5, 5))
+```
+The trace is done by `Variable.creator` and `Function.previous_functions`.
+```python
+class Variable(object):
+    def __init__(self, tensor, creator=None, requires_grad=True):
+        if creator is None:
+            creator = Leaf(self, requires_grad)
+        self.data = tensor
+        self.creator = creator
+        self._grad = None
+
+    def backward(self, gradient=None):
+        if gradient is None:
+            if self.data.numel() != 1:
+                raise RuntimeError('backward should be called only on a scalar (i.e. 1-element tensor) or with gradient w.r.t. the variable')
+            gradient = self.data.new(1).fill_(1)
+        self._execution_engine.run_backward(self, gradient)
+
+class Function(object):
+    # ...
+    def _do_forward(self, *input):
+        unpacked_input = tuple(arg.data for arg in input)
+        raw_output = self.forward(*unpacked_input)
+
+        # mark output.creator = self for the backward trace
+        output = tuple(Variable(tensor, self) for tensor in raw_output)
+
+        self.previous_functions = [(arg.creator, id(arg)) for arg in input]
+        self.output_ids = {id(var): i for i, var in enumerate(output)}
+        return output
+
+    def _do_backward(self, grad_output):
+        return self.backward(grad_output)
+```
+The [backward](https://github.com/pytorch/pytorch/blob/v0.1.1/torch/autograd/engine.py) is similar to Autograd's.
+
+#### DyNet
+Example Code
+```python
+model = dy.model()
+W_p = model.add_parameters((20, 100))
+b_p = model.add_parameters(20)
+E = model.add_lookup_parameters((20000, 50))
+for epoch in range(num_epochs):
+    for in_words, out_label in training_data:
+        dy.renew_cg()  # init tape
+        W = dy.parameter(W_p)
+        b = dy.parameter(b_p)
+        score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b)
+        loss_sym = dy.pickneglogsoftmax(score_sym, out_label)
+        loss_val = loss_sym.value()
+        loss_sym.backward()
+```
+[forward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L84-L158), [backward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L166-L284). The trace is done by creating a tape of expressions in every iteration. Backward is done by traversing the tape in reverse order.
+```c++
+void SimpleExecutionEngine::backward(VariableIndex from_where, bool full) {
+  ...
+  for (int i = num_nodes - 1; i >= 0; --i) {
+    // each node corresponds to an op
+    node->backward(xs, node_fx, node_dEdfx, ai, node_dEdxai);
+  }
+  ...
+}
+```
diff --git a/doc/survey/op_fusion_design.md b/doc/survey/op_fusion_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..d6e48f4f58269b67450cb012f6dcc59e1083abba
--- /dev/null
+++ b/doc/survey/op_fusion_design.md
@@ -0,0 +1,20 @@
+# Operator fusion
+Fusing multiple operators together is an important method for optimizing program execution, particularly on GPUs and other specialized accelerators. An obvious benefit is avoiding the overhead of writing intermediate results back to global memory.
+
+There are generally two ways to fuse operators: fusing directly connected operators and fusing operators that are not directly connected. The first method is mainly used by the [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by DyNet and TensorFlow Fold to do auto-batching. The principle of operator fusion is to combine multiple operations into one according to certain rules; for example, `Y = X * W` and `Z = Y + B` can be fused into `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused into `[Y1;Y2] = [X1;X2] * W`. To get a short-term benefit, we decided to specify these rules manually.
+
+## Challenge
+The challenges of fusing operators are:
+  - how to make the rules.
+  - how to implement these rules efficiently.
+
+### How to make the rules?
+
+The problem of determining the best single location for a fusion operator is an NP-hard combinatorial problem. After analyzing the operators of typical DL models, we found two groups of operators that can be fused explicitly: one is simple, adjacent operations, for example, `tmp = x + y` followed by `z = Relu(tmp)`; the other is operators that perform the same function, for example, a series of `SGD` or `Momentum` updates. They usually appear in a model in large numbers, so we should first think about how to fuse each group separately.
+
+### How to implement these rules efficiently?
+#### How to fuse the adjacent operations efficiently?
+Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient; the cons are that it is not easy to extend and it can only express some simple operations. Taking our current needs into account, the template function is the more appropriate choice.
+
+#### How to fuse the operators that have the same function efficiently?
+Take the SGD operator as an example: a model may have hundreds of parameters and, correspondingly, the same number of SGD operators. The expression (`w = w - lr * w_g`) of those operators is identical, so during training the executor executes this expression hundreds of times on the CPU or other specialized accelerators. If we fuse them and make the addresses of all `w` and all `w_g` contiguous respectively, we only need to execute it once. For some accelerators the kernel launch time is not negligible, so hundreds of kernel launches and executions may cost much more than launching and executing only once. There are usually many operators similar to `SGD` in a DL model, such as `AllReduce` and `FC`.
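+
+As a NumPy sketch of this second kind of fusion (an illustration of the idea only, not Fluid's actual implementation; the parameter count and shapes below are made up), packing many identical `SGD` updates into one contiguous buffer turns hundreds of small updates into a single vectorized one:
+
+```python
+import numpy as np
+
+lr = 0.01
+# Hypothetical model state: many small parameters, each with its own gradient.
+params = [np.random.randn(64, 64) for _ in range(200)]
+grads = [np.random.randn(64, 64) for _ in range(200)]
+
+# Unfused: one SGD expression (i.e. one kernel launch) per parameter.
+for w, w_g in zip(params, grads):
+    w -= lr * w_g
+
+# Fused: pack all parameters/gradients into contiguous buffers once, then one
+# vectorized update replaces the 200 separate ones.
+packed_w = np.concatenate([w.ravel() for w in params])
+packed_g = np.concatenate([g.ravel() for g in grads])
+packed_w -= lr * packed_g
+
+# Views into the packed buffer let each parameter keep its original shape.
+offsets = np.cumsum([0] + [w.size for w in params])
+fused_params = [packed_w[s:e].reshape(p.shape)
+                for p, s, e in zip(params, offsets[:-1], offsets[1:])]
+```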
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in new file mode 100644 index 0000000000000000000000000000000000000000..890f70615538af23cd05b9ffd685e870a5644cdb --- /dev/null +++ b/doc/templates/conf.py.cn.in @@ -0,0 +1,151 @@ +# -*- coding: utf-8 -*- +# +# documentation build configuration file, created by +# sphinx-quickstart on Thu Jul 23 19:40:08 2015. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. +import sys +import os, subprocess +sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python')) +import shlex +from recommonmark import parser, transform +@IMPORT_PADDLE_STRING@ +@IMPORT_PADDLEV2_STRING@ + +MarkdownParser = parser.CommonMarkParser +AutoStructify = transform.AutoStructify + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +templates_path = ["@PADDLE_SOURCE_DIR@/doc/templates"] + +# -- General configuration ------------------------------------------------ + +# General information about the project. +project = u'PaddlePaddle' +author = u'%s developers' % project +copyright = u'2016, %s' % author +github_doc_root = '' + +# add markdown parser +MarkdownParser.github_doc_root = github_doc_root +source_parsers = { + '.md': MarkdownParser, + '.Rmd': MarkdownParser, +} +os.environ['PADDLE_BUILD_DOC'] = '1' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.mathjax', + 'sphinx.ext.napoleon', + 'sphinx.ext.graphviz' +] +mathjax_path="https://cdn.bootcss.com/mathjax/2.7.0/MathJax.js" +table_styling_embed_css = True + +autodoc_member_order = 'bysource' + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = ['.rst', '.md', '.Rmd'] + +# The encoding of source files. +source_encoding = 'utf-8' + +# The master toctree document. +master_doc = 'index_cn' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = 'zh_CN' + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build', '**/*_en*', '*_en*', 'api/*'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. 
They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +#html_static_path = [] + +# Output file base name for HTML help builder. +htmlhelp_basename = project + 'doc' + +# -- Options for LaTeX output --------------------------------------------- +latex_elements = { +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, '%s.tex' % project, project, + author, 'manual'), +] + +# Use the .. admonition:: directive for Notes sections. +# False to use the .. rubric:: directive instead. +napoleon_use_admonition_for_notes = True + +def setup(app): + # Add hook for building doxygen xml when needed + # no c++ API for now + app.add_config_value('recommonmark_config', { + 'url_resolver': lambda url: github_doc_root + url, + }, True) + app.add_transform(AutoStructify) diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in new file mode 100644 index 0000000000000000000000000000000000000000..5b09464cb991f96127edec40f7dbbc97a8d82582 --- /dev/null +++ b/doc/templates/conf.py.en.in @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- +# +# documentation build configuration file, created by +# sphinx-quickstart on Thu Jul 23 19:40:08 2015. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. +import sys +import os, subprocess +sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python')) +import shlex +from recommonmark import parser, transform +@IMPORT_PADDLE_STRING@ +@IMPORT_PADDLEV2_STRING@ + + +MarkdownParser = parser.CommonMarkParser +AutoStructify = transform.AutoStructify + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +templates_path = ["@PADDLE_SOURCE_DIR@/doc/templates"] + +# -- General configuration ------------------------------------------------ + +# General information about the project. +project = u'PaddlePaddle' +author = u'%s developers' % project +copyright = u'2016, %s' % author +github_doc_root = '' + +# add markdown parser +MarkdownParser.github_doc_root = github_doc_root +source_parsers = { + '.md': MarkdownParser, + '.Rmd': MarkdownParser, +} +os.environ['PADDLE_BUILD_DOC'] = '1' + +# Add any Sphinx extension module names here, as strings. 
They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.mathjax', + 'sphinx.ext.napoleon', +] + + +autodoc_member_order = 'bysource' + + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = ['.rst', '.md', '.Rmd'] + +# The encoding of source files. +source_encoding = 'utf-8' + +# The master toctree document. +master_doc = 'index_en' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build', '**/*_cn*', '*_cn*', 'api/*'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +#html_static_path = [] + +# Output file base name for HTML help builder. +htmlhelp_basename = project + 'doc' + +# -- Options for LaTeX output --------------------------------------------- +latex_elements = { +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, '%s.tex' % project, project, + author, 'manual'), +] + +# Use the .. admonition:: directive for Notes sections. +# False to use the .. rubric:: directive instead. 
+napoleon_use_admonition_for_notes = True + +def setup(app): + # Add hook for building doxygen xml when needed + # no c++ API for now + app.add_config_value('recommonmark_config', { + 'url_resolver': lambda url: github_doc_root + url, + 'enable_eval_rst': True, + }, True) + app.add_transform(AutoStructify) diff --git a/doc/templates/layout.html b/doc/templates/layout.html new file mode 100644 index 0000000000000000000000000000000000000000..5091eb32eaeff77bd40f5d348e887b99b6eff4ea --- /dev/null +++ b/doc/templates/layout.html @@ -0,0 +1,23 @@ +{# layout.html #} +{# Import the theme's layout. #} +{% extends "!layout.html" %} + +{# SIDE NAV, TOGGLES ON MOBILE #} +{% block menu %} + +{% endblock %} + +{%- block extrahead %} + +{% endblock %} diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d230a1b9217eea6740419822f350096e361a4435 --- /dev/null +++ b/doc/v2/CMakeLists.txt @@ -0,0 +1,54 @@ +if(NOT DEFINED SPHINX_THEME) + set(SPHINX_THEME default) +endif() + +if(NOT DEFINED SPHINX_THEME_DIR) + set(SPHINX_THEME_DIR) +endif() + +# configured documentation tools and intermediate build results +set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build") + +# Sphinx cache with pickled ReST documents +set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") + +# HTML output director +set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") + +set(IMPORT_PADDLE_STRING "") +set(IMPORT_PADDLEV2_STRING "") + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in" + "${BINARY_BUILD_DIR_EN}/conf.py" + @ONLY) + +sphinx_add_target(paddle_v2_docs + html + ${BINARY_BUILD_DIR_EN} + ${SPHINX_CACHE_DIR_EN} + ${CMAKE_CURRENT_SOURCE_DIR} + ${SPHINX_HTML_DIR_EN}) + +# configured documentation tools and intermediate build results +set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") + +# Sphinx cache with pickled ReST documents +set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees") + +# HTML output directory +set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html") + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in" + "${BINARY_BUILD_DIR_CN}/conf.py" + @ONLY) + +sphinx_add_target(paddle_v2_docs_cn + html + ${BINARY_BUILD_DIR_CN} + ${SPHINX_CACHE_DIR_CN} + ${CMAKE_CURRENT_SOURCE_DIR} + ${SPHINX_HTML_DIR_CN}) + +add_subdirectory(api) diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c74522cb089b17c8419e9058f76631b0fe0df93 --- /dev/null +++ b/doc/v2/api/CMakeLists.txt @@ -0,0 +1,25 @@ +# configured documentation tools and intermediate build results +set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build") + +# Sphinx cache with pickled ReST documents +set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") + +# HTML output director +set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") + +set(IMPORT_PADDLE_STRING "import paddle") +set(IMPORT_PADDLEV2_STRING "import paddle.v2") + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in" + "${BINARY_BUILD_DIR_EN}/conf.py" + @ONLY) + +sphinx_add_target(paddle_v2_apis + html + ${BINARY_BUILD_DIR_EN} + ${SPHINX_CACHE_DIR_EN} + ${CMAKE_CURRENT_SOURCE_DIR} + ${SPHINX_HTML_DIR_EN}) + +add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python) diff --git a/doc/v2/api/config/activation.rst b/doc/v2/api/config/activation.rst new file 
mode 100644 index 0000000000000000000000000000000000000000..5317e66b64bbd85c61f19700a9d2c1d239dee573 --- /dev/null +++ b/doc/v2/api/config/activation.rst @@ -0,0 +1,108 @@ +=========== +Activation +=========== + +Abs +=== + +.. automodule:: paddle.v2.activation + :members: Abs + :noindex: + +Exp +=== + +.. automodule:: paddle.v2.activation + :members: Exp + :noindex: + +Identity +======== + +.. automodule:: paddle.v2.activation + :members: Identity + :noindex: + +Linear +====== + +.. automodule:: paddle.v2.activation + :members: Linear + :noindex: + +Log +=== + +.. automodule:: paddle.v2.activation + :members: Log + :noindex: + +Square +====== + +.. automodule:: paddle.v2.activation + :members: Square + :noindex: + +Sigmoid +======= + +.. automodule:: paddle.v2.activation + :members: Sigmoid + :noindex: + +Softmax +======= + +.. automodule:: paddle.v2.activation + :members: Softmax + :noindex: + +SequenceSoftmax +=============== + +.. automodule:: paddle.v2.activation + :members: SequenceSoftmax + :noindex: + +Relu +==== + +.. automodule:: paddle.v2.activation + :members: Relu + :noindex: + +BRelu +===== + +.. automodule:: paddle.v2.activation + :members: BRelu + :noindex: + +SoftRelu +======== + +.. automodule:: paddle.v2.activation + :members: SoftRelu + :noindex: + +Tanh +==== + +.. automodule:: paddle.v2.activation + :members: Tanh + :noindex: + +STanh +===== + +.. automodule:: paddle.v2.activation + :members: STanh + :noindex: + +SoftSign +======== + +.. automodule:: paddle.v2.activation + :members: SoftSign + :noindex: diff --git a/doc/v2/api/config/attr.rst b/doc/v2/api/config/attr.rst new file mode 100644 index 0000000000000000000000000000000000000000..a93f41b86779200d8bac651614f4d61f4895875f --- /dev/null +++ b/doc/v2/api/config/attr.rst @@ -0,0 +1,6 @@ +Parameter Attribute +=================== + +.. automodule:: paddle.v2.attr + :members: + :noindex: diff --git a/doc/v2/api/config/evaluators.rst b/doc/v2/api/config/evaluators.rst new file mode 100644 index 0000000000000000000000000000000000000000..458d892e825a7a9bbe7843ad5c508bd5a31f5f0f --- /dev/null +++ b/doc/v2/api/config/evaluators.rst @@ -0,0 +1,110 @@ +.. _api_v2: + +========== +Evaluators +========== + +Classification +============== + +classification_error +-------------------- +.. automodule:: paddle.v2.evaluator + :members: classification_error + :noindex: + +auc +--- +.. automodule:: paddle.v2.evaluator + :members: auc + :noindex: + +ctc_error +--------- +.. automodule:: paddle.v2.evaluator + :members: ctc_error + :noindex: + +chunk +----- +.. automodule:: paddle.v2.evaluator + :members: chunk + :noindex: + +precision_recall +---------------- +.. automodule:: paddle.v2.evaluator + :members: precision_recall + :noindex: + +Rank +==== + +pnpair +------ +.. automodule:: paddle.v2.evaluator + :members: pnpair + :noindex: + +Utils +===== + +sum +--- +.. automodule:: paddle.v2.evaluator + :members: sum + :noindex: + +column_sum +---------- +.. automodule:: paddle.v2.evaluator + :members: column_sum + :noindex: + +Print +===== + +classification_error_printer +---------------------------- +.. automodule:: paddle.v2.evaluator + :members: classification_error_printer + :noindex: + +gradient_printer +---------------- +.. automodule:: paddle.v2.evaluator + :members: gradient_printer + :noindex: + +maxid_printer +------------- +.. automodule:: paddle.v2.evaluator + :members: maxid_printer + :noindex: + +maxframe_printer +---------------- +.. 
automodule:: paddle.v2.evaluator + :members: maxframe_printer + :noindex: + +seqtext_printer +--------------- +.. automodule:: paddle.v2.evaluator + :members: seqtext_printer + :noindex: + +value_printer +------------- +.. automodule:: paddle.v2.evaluator + :members: value_printer + :noindex: + +Detection +========== + +detection_map +------------- +.. automodule:: paddle.v2.evaluator + :members: detection_map + :noindex: diff --git a/doc/v2/api/config/layer.rst b/doc/v2/api/config/layer.rst new file mode 100644 index 0000000000000000000000000000000000000000..5a0cfadfce84df41defdf518b7c3a6222d5b30a1 --- /dev/null +++ b/doc/v2/api/config/layer.rst @@ -0,0 +1,552 @@ +.. _api_v2.layer: + +====== +Layers +====== + +Data layer +=========== + +.. _api_v2.layer_data: + +data +---- +.. autofunction:: paddle.v2.layer.data + :noindex: + +Fully Connected Layers +====================== + +.. _api_v2.layer_fc: + +fc +-- +.. autofunction:: paddle.v2.layer.fc + :noindex: + +selective_fc +------------ +.. autofunction:: paddle.v2.layer.selective_fc + :noindex: + +Conv Layers +=========== + +conv_operator +------------- +.. autofunction:: paddle.v2.layer.conv_operator + :noindex: + +conv_projection +--------------- +.. autofunction:: paddle.v2.layer.conv_projection + :noindex: + +conv_shift +---------- +.. autofunction:: paddle.v2.layer.conv_shift + :noindex: + +img_conv +-------- +.. autofunction:: paddle.v2.layer.img_conv + :noindex: + +.. _api_v2.layer_context_projection: + +context_projection +------------------ +.. autofunction:: paddle.v2.layer.context_projection + :noindex: + +row_conv +-------- +.. autofunction:: paddle.v2.layer.row_conv + :noindex: + +Image Pooling Layer +=================== + +img_pool +-------- +.. autofunction:: paddle.v2.layer.img_pool + :noindex: + +spp +--- +.. autofunction:: paddle.v2.layer.spp + :noindex: + +maxout +------ +.. autofunction:: paddle.v2.layer.maxout + :noindex: + +roi_pool +-------- +.. autofunction:: paddle.v2.layer.roi_pool + :noindex: + +pad +---- +.. autofunction:: paddle.v2.layer.pad + :noindex: + +Norm Layer +========== + +img_cmrnorm +----------- +.. autofunction:: paddle.v2.layer.img_cmrnorm + :noindex: + +batch_norm +---------- +.. autofunction:: paddle.v2.layer.batch_norm + :noindex: + +sum_to_one_norm +--------------- +.. autofunction:: paddle.v2.layer.sum_to_one_norm + :noindex: + +cross_channel_norm +------------------ +.. autofunction:: paddle.v2.layer.cross_channel_norm + :noindex: + +row_l2_norm +----------- +.. autofunction:: paddle.v2.layer.row_l2_norm + :noindex: + +Recurrent Layers +================ + +recurrent +--------- +.. autofunction:: paddle.v2.layer.recurrent + :noindex: + +lstmemory +--------- +.. autofunction:: paddle.v2.layer.lstmemory + :noindex: + +grumemory +--------- +.. autofunction:: paddle.v2.layer.grumemory + :noindex: + +gated_unit +----------- +.. autofunction:: paddle.v2.layer.gated_unit + :noindex: + +Recurrent Layer Group +===================== + +memory +------ +.. autofunction:: paddle.v2.layer.memory + :noindex: + +recurrent_group +--------------- +.. autofunction:: paddle.v2.layer.recurrent_group + :noindex: + +lstm_step +--------- +.. autofunction:: paddle.v2.layer.lstm_step + :noindex: + +gru_step +-------- +.. autofunction:: paddle.v2.layer.gru_step + :noindex: + +beam_search +------------ +.. autofunction:: paddle.v2.layer.beam_search + :noindex: + +get_output +---------- +.. autofunction:: paddle.v2.layer.get_output + :noindex: + +Mixed Layer +=========== + +.. _api_v2.layer_mixed: + +mixed +----- +.. 
autofunction:: paddle.v2.layer.mixed + :noindex: + +.. _api_v2.layer_embedding: + +embedding +--------- +.. autofunction:: paddle.v2.layer.embedding + :noindex: + +scaling_projection +------------------ +.. autofunction:: paddle.v2.layer.scaling_projection + :noindex: + +dotmul_projection +----------------- +.. autofunction:: paddle.v2.layer.dotmul_projection + :noindex: + +dotmul_operator +--------------- +.. autofunction:: paddle.v2.layer.dotmul_operator + :noindex: + +full_matrix_projection +---------------------- +.. autofunction:: paddle.v2.layer.full_matrix_projection + :noindex: + +identity_projection +------------------- +.. autofunction:: paddle.v2.layer.identity_projection + :noindex: + +slice_projection +------------------- +.. autofunction:: paddle.v2.layer.slice_projection + :noindex: + +table_projection +---------------- +.. autofunction:: paddle.v2.layer.table_projection + :noindex: + +trans_full_matrix_projection +---------------------------- +.. autofunction:: paddle.v2.layer.trans_full_matrix_projection + :noindex: + +Aggregate Layers +================ + +AggregateLevel +-------------- +.. autoclass:: paddle.v2.layer.AggregateLevel + :noindex: + +.. _api_v2.layer_pooling: + +pooling +------- +.. autofunction:: paddle.v2.layer.pooling + :noindex: + +.. _api_v2.layer_last_seq: + +last_seq +-------- +.. autofunction:: paddle.v2.layer.last_seq + :noindex: + +.. _api_v2.layer_first_seq: + +first_seq +--------- +.. autofunction:: paddle.v2.layer.first_seq + :noindex: + +sub_seq +--------- +.. autofunction:: paddle.v2.layer.sub_seq + :noindex: + +concat +------ +.. autofunction:: paddle.v2.layer.concat + :noindex: + +seq_concat +---------- +.. autofunction:: paddle.v2.layer.seq_concat + :noindex: + +seq_slice +--------- +.. autofunction:: paddle.v2.layer.seq_slice + :noindex: + +sub_nested_seq +-------------- +.. autofunction:: paddle.v2.layer.sub_nested_seq + :noindex: + +Reshaping Layers +================ + +block_expand +------------ +.. autofunction:: paddle.v2.layer.block_expand + :noindex: + +.. _api_v2.layer_expand: + +ExpandLevel +----------- +.. autoclass:: paddle.v2.layer.ExpandLevel + :noindex: + +expand +------ +.. autofunction:: paddle.v2.layer.expand + :noindex: + +repeat +------ +.. autofunction:: paddle.v2.layer.repeat + :noindex: + +rotate +------ +.. autofunction:: paddle.v2.layer.rotate + :noindex: + +seq_reshape +----------- +.. autofunction:: paddle.v2.layer.seq_reshape + :noindex: + +Math Layers +=========== + +addto +----- +.. autofunction:: paddle.v2.layer.addto + :noindex: + +linear_comb +----------- +.. autofunction:: paddle.v2.layer.linear_comb + :noindex: + +interpolation +------------- +.. autofunction:: paddle.v2.layer.interpolation + :noindex: + +bilinear_interp +--------------- +.. autofunction:: paddle.v2.layer.bilinear_interp + :noindex: + +dropout +-------- +.. autofunction:: paddle.v2.layer.dropout + :noindex: + +dot_prod +--------- +.. autofunction:: paddle.v2.layer.dot_prod + :noindex: + +out_prod +-------- +.. autofunction:: paddle.v2.layer.out_prod + :noindex: + +power +----- +.. autofunction:: paddle.v2.layer.power + :noindex: + +scaling +------- +.. autofunction:: paddle.v2.layer.scaling + :noindex: + +clip +---- +.. autofunction:: paddle.v2.layer.clip + :noindex: + +resize +------ +.. autofunction:: paddle.v2.layer.resize + :noindex: + +slope_intercept +--------------- +.. autofunction:: paddle.v2.layer.slope_intercept + :noindex: + +tensor +------ +.. autofunction:: paddle.v2.layer.tensor + :noindex: + +.. 
_api_v2.layer_cos_sim: + +cos_sim +------- +.. autofunction:: paddle.v2.layer.cos_sim + :noindex: + +l2_distance +----------- +.. autofunction:: paddle.v2.layer.l2_distance + :noindex: + +trans +----- +.. autofunction:: paddle.v2.layer.trans + :noindex: + +scale_shift +----------- +.. autofunction:: paddle.v2.layer.scale_shift + :noindex: + +factorization_machine +--------------------- +.. autofunction:: paddle.v2.layer.factorization_machine + :noindex: + +Sampling Layers +=============== + +maxid +----- +.. autofunction:: paddle.v2.layer.max_id + :noindex: + +sampling_id +----------- +.. autofunction:: paddle.v2.layer.sampling_id + :noindex: + +multiplex +--------- +.. autofunction:: paddle.v2.layer.multiplex + :noindex: + +.. _api_v2.layer_costs: + +Cost Layers +=========== + +cross_entropy_cost +------------------ +.. autofunction:: paddle.v2.layer.cross_entropy_cost + :noindex: + +cross_entropy_with_selfnorm_cost +-------------------------------- +.. autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost + :noindex: + +multi_binary_label_cross_entropy_cost +------------------------------------- +.. autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost + :noindex: + +classification_cost +------------------- +.. autofunction:: paddle.v2.layer.classification_cost + :noindex: + +huber_regression_cost +------------------------- +.. autofunction:: paddle.v2.layer.huber_regression_cost + :noindex: + +huber_classification_cost +------------------------- +.. autofunction:: paddle.v2.layer.huber_classification_cost + :noindex: + +lambda_cost +----------- +.. autofunction:: paddle.v2.layer.lambda_cost + :noindex: + +square_error_cost +----------------- +.. autofunction:: paddle.v2.layer.square_error_cost + :noindex: + +rank_cost +--------- +.. autofunction:: paddle.v2.layer.rank_cost + :noindex: + +sum_cost +--------- +.. autofunction:: paddle.v2.layer.sum_cost + :noindex: + +crf +--- +.. autofunction:: paddle.v2.layer.crf + :noindex: + +crf_decoding +------------ +.. autofunction:: paddle.v2.layer.crf_decoding + :noindex: + +ctc +--- +.. autofunction:: paddle.v2.layer.ctc + :noindex: + +warp_ctc +-------- +.. autofunction:: paddle.v2.layer.warp_ctc + :noindex: + +nce +--- +.. autofunction:: paddle.v2.layer.nce + :noindex: + +hsigmoid +--------- +.. autofunction:: paddle.v2.layer.hsigmoid + :noindex: + +smooth_l1_cost +-------------- +.. autofunction:: paddle.v2.layer.smooth_l1_cost + :noindex: + +multibox_loss +-------------- +.. autofunction:: paddle.v2.layer.multibox_loss + :noindex: + +detection_output +---------------- +.. autofunction:: paddle.v2.layer.detection_output + :noindex: + +Check Layer +============ + +eos +--- +.. autofunction:: paddle.v2.layer.eos + :noindex: + +Activation +========== + +prelu +-------- +.. autofunction:: paddle.v2.layer.prelu + :noindex: diff --git a/doc/v2/api/config/networks.rst b/doc/v2/api/config/networks.rst new file mode 100644 index 0000000000000000000000000000000000000000..048379cf01f4aec5e73e2fe3ddfa728f3c17a5d1 --- /dev/null +++ b/doc/v2/api/config/networks.rst @@ -0,0 +1,132 @@ +======== +Networks +======== + +The v2.networks module contains pieces of neural network that combine multiple layers. + +NLP +=== + +sequence_conv_pool +------------------ +.. automodule:: paddle.v2.networks + :members: sequence_conv_pool + :noindex: + +.. _api_trainer_config_helpers_network_text_conv_pool: + +text_conv_pool +-------------- +.. 
automodule:: paddle.v2.networks + :members: text_conv_pool + :noindex: + +Images +====== + +img_conv_bn_pool +---------------- +.. automodule:: paddle.v2.networks + :members: img_conv_bn_pool + :noindex: + +img_conv_group +-------------- +.. automodule:: paddle.v2.networks + :members: img_conv_group + :noindex: + +.. _api_trainer_config_helpers_network_simple_img_conv_pool: + +simple_img_conv_pool +-------------------- +.. automodule:: paddle.v2.networks + :members: simple_img_conv_pool + :noindex: + +small_vgg +--------- +.. automodule:: paddle.v2.networks + :members: small_vgg + :noindex: + +vgg_16_network +--------------- +.. automodule:: paddle.v2.networks + :members: vgg_16_network + :noindex: + +Recurrent +========= + +LSTM +---- + +lstmemory_unit +`````````````` +.. automodule:: paddle.v2.networks + :members: lstmemory_unit + :noindex: + +lstmemory_group +``````````````` +.. automodule:: paddle.v2.networks + :members: lstmemory_group + :noindex: + +simple_lstm +``````````` +.. automodule:: paddle.v2.networks + :members: simple_lstm + :noindex: + +bidirectional_lstm +`````````````````` +.. automodule:: paddle.v2.networks + :members: bidirectional_lstm + :noindex: + +GRU +--- + +gru_unit +```````` +.. automodule:: paddle.v2.networks + :members: gru_unit + :noindex: + +gru_group +````````` +.. automodule:: paddle.v2.networks + :members: gru_group + :noindex: + +simple_gru +`````````` +.. automodule:: paddle.v2.networks + :members: simple_gru + :noindex: + +simple_gru2 +``````````` +.. automodule:: paddle.v2.networks + :members: simple_gru2 + :noindex: + +bidirectional_gru +`````````````````` +.. automodule:: paddle.v2.networks + :members: bidirectional_gru + :noindex: + +simple_attention +---------------- +.. automodule:: paddle.v2.networks + :members: simple_attention + :noindex: + +dot_product_attention +--------------------- +.. automodule:: paddle.v2.networks + :members: dot_product_attention + :noindex: diff --git a/doc/v2/api/config/optimizer.rst b/doc/v2/api/config/optimizer.rst new file mode 100644 index 0000000000000000000000000000000000000000..b32373fdef52a7aa9d64b12cda3f76cb2abf351b --- /dev/null +++ b/doc/v2/api/config/optimizer.rst @@ -0,0 +1,45 @@ +========== +Optimizer +========== + +Momentum +======== +.. automodule:: paddle.v2.optimizer + :members: Momentum + :noindex: + +Adam +==== +.. automodule:: paddle.v2.optimizer + :members: Adam + :noindex: + +Adamax +====== +.. automodule:: paddle.v2.optimizer + :members: Adamax + :noindex: + +AdaGrad +======= +.. automodule:: paddle.v2.optimizer + :members: AdaGrad + :noindex: + +DecayedAdaGrad +============== +.. automodule:: paddle.v2.optimizer + :members: DecayedAdaGrad + :noindex: + +AdaDelta +======== +.. automodule:: paddle.v2.optimizer + :members: AdaDelta + :noindex: + +RMSProp +======= +.. automodule:: paddle.v2.optimizer + :members: RMSProp + :noindex: diff --git a/doc/v2/api/config/pooling.rst b/doc/v2/api/config/pooling.rst new file mode 100644 index 0000000000000000000000000000000000000000..d26b365c9284632210a1532853e39feedc70758b --- /dev/null +++ b/doc/v2/api/config/pooling.rst @@ -0,0 +1,46 @@ +======= +Pooling +======= + +BasePool +======== +.. automodule:: paddle.v2.pooling + :members: BasePool + :noindex: + +Avg +=== +.. automodule:: paddle.v2.pooling + :members: Avg + :noindex: + +Max +=== +.. automodule:: paddle.v2.pooling + :members: Max + :noindex: + +Sum +=== +.. automodule:: paddle.v2.pooling + :members: Sum + :noindex: + +SquareRootN +=========== +.. 
automodule:: paddle.v2.pooling + :members: SquareRootN + :noindex: + +CudnnAvg +======== +.. automodule:: paddle.v2.pooling + :members: CudnnAvg + :noindex: + +CudnnMax +======== +.. automodule:: paddle.v2.pooling + :members: CudnnMax + :noindex: + diff --git a/doc/v2/api/data.rst b/doc/v2/api/data.rst new file mode 100644 index 0000000000000000000000000000000000000000..b56c7332cc284649c7e04328e51a7faa78593a39 --- /dev/null +++ b/doc/v2/api/data.rst @@ -0,0 +1,10 @@ +================================== +Data Reader Interface and DataSets +================================== + +.. toctree:: + :maxdepth: 1 + + data/data_reader.rst + data/image.rst + data/dataset.rst diff --git a/doc/v2/api/data/data_reader.rst b/doc/v2/api/data/data_reader.rst new file mode 100644 index 0000000000000000000000000000000000000000..1a35d0bbc8f9d751f49c7e1fc26feb1bcb3ae7f0 --- /dev/null +++ b/doc/v2/api/data/data_reader.rst @@ -0,0 +1,72 @@ +===================== +Data Reader Interface +===================== + + +DataTypes +========= + +.. autofunction:: paddle.v2.data_type.dense_array + :noindex: + +.. autofunction:: paddle.v2.data_type.integer_value + :noindex: + +.. autofunction:: paddle.v2.data_type.integer_value_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_binary_vector + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_float_vector + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_non_value_slot + :noindex: + +.. autofunction:: paddle.v2.data_type.sparse_value_slot + :noindex: + +.. autoclass:: paddle.v2.data_type.InputType + :members: + :noindex: + +DataFeeder +========== + +.. automodule:: paddle.v2.data_feeder + :members: + :noindex: + +Reader +====== + +.. automodule:: paddle.reader + :members: + :noindex: + +.. automodule:: paddle.reader.creator + :members: + :noindex: + +minibatch +========= + +.. automodule:: paddle.v2.minibatch + :members: + :noindex: diff --git a/doc/v2/api/data/dataset.rst b/doc/v2/api/data/dataset.rst new file mode 100644 index 0000000000000000000000000000000000000000..e7c8be4452bf55e0967d750c2e624e8e316e9330 --- /dev/null +++ b/doc/v2/api/data/dataset.rst @@ -0,0 +1,82 @@ +Dataset +======= + +.. automodule:: paddle.dataset + :members: + :noindex: + +mnist ++++++ + +.. automodule:: paddle.dataset.mnist + :members: + :noindex: + +cifar ++++++ + +.. automodule:: paddle.dataset.cifar + :members: + :noindex: + +conll05 ++++++++ + +.. automodule:: paddle.dataset.conll05 + :members: get_dict,get_embedding,test + :noindex: + +imdb +++++ + +.. automodule:: paddle.dataset.imdb + :members: + :noindex: + +imikolov +++++++++ + +.. automodule:: paddle.dataset.imikolov + :members: + :noindex: + +movielens ++++++++++ + +.. automodule:: paddle.dataset.movielens + :members: + :noindex: + +.. autoclass:: paddle.dataset.movielens.MovieInfo + :noindex: + +.. autoclass:: paddle.dataset.movielens.UserInfo + :noindex: + +sentiment ++++++++++ + +.. automodule:: paddle.dataset.sentiment + :members: + :noindex: + +uci_housing ++++++++++++ + +.. automodule:: paddle.dataset.uci_housing + :members: + :noindex: + +wmt14 ++++++ + +.. 
automodule:: paddle.dataset.wmt14 + :members: + :noindex: + +wmt16 ++++++ + +.. automodule:: paddle.dataset.wmt16 + :members: + :noindex: diff --git a/doc/v2/api/data/image.rst b/doc/v2/api/data/image.rst new file mode 100644 index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49 --- /dev/null +++ b/doc/v2/api/data/image.rst @@ -0,0 +1,5 @@ +Image Interface +=============== + +.. automodule:: paddle.v2.image + :members: diff --git a/doc/v2/api/index_en.rst b/doc/v2/api/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..5813509dce46677444f0234db8e0eaa4f113e3a0 --- /dev/null +++ b/doc/v2/api/index_en.rst @@ -0,0 +1,9 @@ +API +=== + +.. toctree:: + :maxdepth: 1 + + model_configs.rst + data.rst + run_logic.rst diff --git a/doc/v2/api/model_configs.rst b/doc/v2/api/model_configs.rst new file mode 100644 index 0000000000000000000000000000000000000000..992b559cbd87244612521d4c96f84f997d6c4196 --- /dev/null +++ b/doc/v2/api/model_configs.rst @@ -0,0 +1,13 @@ +Model Configuration +=================== + +.. toctree:: + :maxdepth: 1 + + config/activation.rst + config/layer.rst + config/evaluators.rst + config/optimizer.rst + config/pooling.rst + config/networks.rst + config/attr.rst diff --git a/doc/v2/api/overview.rst b/doc/v2/api/overview.rst new file mode 100644 index 0000000000000000000000000000000000000000..a6f21428de1e4906e4af9433bc1c994f2b2c8b8e --- /dev/null +++ b/doc/v2/api/overview.rst @@ -0,0 +1,12 @@ +V2 API Overview +================ + +The PaddlePaddle V2 API is designed to provide a modern user interface for PaddlePaddle V1(the original layer-based platform of PaddlePaddle), +it proposes some high-level concepts such as `Layers `_ , `Optimizer `_ , `Evaluator `_ and `Data Reader `_ to make the model configuration more familiar to users. + +A model is composed of the computation described by a group of `Layers`, with `Evaluator` to define the error, `Optimizer` to update the parameters and `Data Reader` to feed in the data. + +We also provide the `interface for Training and Inference `_ to help control the training and inference phrase, +it has several easy to use methods to better expose the internal running details, different `events `_ are available to users by writing some callbacks. + +All in all, the V2 API gives a higher abstraction and make PaddlePaddle programs require fiew lines of code. diff --git a/doc/v2/api/run_logic.rst b/doc/v2/api/run_logic.rst new file mode 100644 index 0000000000000000000000000000000000000000..5c97651f6536d89d2b5926d4b2907a547aa86b55 --- /dev/null +++ b/doc/v2/api/run_logic.rst @@ -0,0 +1,31 @@ +====================== +Training and Inference +====================== + +Parameters +========== + +.. automodule:: paddle.v2.parameters + :members: Parameters + :noindex: + +Trainer +======= + +.. automodule:: paddle.v2.trainer + :members: SGD + :noindex: + +Event +===== + +.. automodule:: paddle.v2.event + :members: + :noindex: + +Inference +========= + +.. autofunction:: paddle.v2.infer + :noindex: + \ No newline at end of file diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..d0dacb104f148c2aeb323365cbd6f014ae00ed5a --- /dev/null +++ b/doc/v2/build_and_install/build_from_source_cn.rst @@ -0,0 +1,225 @@ +从源码编译 +====================== + +.. _requirements: + +需要的软硬件 +---------------- + +为了编译PaddlePaddle,我们需要 + +1. 一台电脑,可以装的是 Linux, Windows 或者 MacOS 操作系统 +2. 
Docker + +不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker 镜像里。 + +.. _build_step: + +编译方法 +---------------- + +PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像 +可以在 `这里 `__ 找到,您也可以 +在 `这里 `__ 找到 paddle_manylinux_devel +镜像的编译以及使用方法。或者参考下述可选步骤,从源码中构建用于编译PaddlePaddle的Docker镜像。 + +如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 :ref:`编译依赖 <_compile_deps>` 之后才能开始编译的步骤。 + +编译PaddlePaddle,需要执行: + +.. code-block:: bash + + # 1. 获取源码 + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # 2. 可选步骤:源码中构建用于编译PaddlePaddle的Docker镜像 + docker build -t paddle:dev . + # 3. 执行下面的命令编译CPU-Only的二进制 + docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build + # 4. 或者也可以使用为上述可选步骤构建的镜像(必须先执行第2步) + docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build + +注: + +- 上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。 + +- 如果您使用的是 manylinux 的镜像进行编译, 那么您需要通过环境变量 :code:`PYTHON_ABI` 来指定一个 `Python ABI `__. +PaddlePaddle目前支持的 Python ABI 有 :code:`cp27-cp27m` 和 :code:`cp27-cp27mu`. + +编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: + +.. code-block:: bash + + pip install build/python/dist/*.whl + +如果机器中已经安装过PaddlePaddle,有两种方法: + +.. code-block:: bash + + 1. 先卸载之前的版本,再重新安装 + pip uninstall paddlepaddle + pip install build/python/dist/*.whl + + 2. 直接升级到更新的版本 + pip install build/python/dist/*.whl -U + +.. _run_test: + +执行单元测试 +---------------- + +如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法: + +设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。 +开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。 + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test + +如果期望执行其中一个单元测试,(比如 :code:`test_sum_op` ): + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash + ./paddle/scripts/paddle_build.sh build + cd build + ctest -R test_sum_op -V + +.. _faq_docker: + +常见问题 +---------------- + +- 什么是 Docker? + + 如果您没有听说 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。 + +- Docker 还是虚拟机? + + 有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 + +- 为什么用 Docker? + + 把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。 + + 另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。 + +- 我可以选择不用Docker吗? + + 当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。 + +- 学习 Docker 有多难? + + 理解 Docker 并不难,大概花十分钟看一下 `如何使用Docker `_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 + +- 我可以用 IDE 吗? + + 当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。 + + 很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行 + + .. code-block:: emacs + + (global-set-key "\C-cc" 'compile) + (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") + + 就可以按 `Ctrl-C` 和 `c` 键来启动编译了。 + +- 可以并行编译吗? 
+ + 是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 `_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 + +- Docker 需要 sudo + + 如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术,比如 rkt。 + +- 在 Windows/MacOS 上编译很慢 + + Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 `_ 。 + +- 磁盘不够 + + 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `如何删除Docker Container `_ 来清理这些内容。 + + +.. _compile_deps: + +附录:编译依赖 +---------------- + +PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。 + +.. csv-table:: PaddlePaddle编译依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "CMake", ">=3.2", "" + "GCC", "4.8.2", "推荐使用CentOS的devtools2" + "Python", "2.7.x", "依赖libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" + "SWIG", ">=2.0", "" + "Go", ">=1.8", "可选" + + +.. _build_options: + +附录:编译选项 +---------------- + +PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。 +用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 +`官方文档 `_ 。 + +在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如: + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. csv-table:: 编译选项说明 + :header: "选项", "说明", "默认值" + :widths: 1, 7, 2 + + "WITH_GPU", "是否支持GPU", "ON" + "WITH_C_API", "是否仅编译CAPI", "OFF" + "WITH_DOUBLE", "是否使用双精度浮点数", "OFF" + "WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON" + "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" + "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" + "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" + "WITH_TESTING", "是否开启单元测试", "OFF" + "WITH_DOC", "是否编译中英文文档", "OFF" + "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" + "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF" + "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON" + +BLAS ++++++ + +PaddlePaddle支持 `MKL `_ 和 +`OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, +还会下载MKL-DNN数学库,详细参考 `mkldnn设计文档 `_ 。 + +如果关闭MKL,则会使用OpenBLAS作为BLAS库。 + +CUDA/cuDNN ++++++++++++ + +PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。 +使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。 + +PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。 +我们推荐使用最新版本的cuDNN。 + +编译选项的设置 +++++++++++++++ + +PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(** :code:`rm -rf` )**后,再指定。** diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..664b68da8b7dd3e005ebf3ec34de77729e5ab355 --- /dev/null +++ b/doc/v2/build_and_install/build_from_source_en.rst @@ -0,0 +1,237 @@ +Build from Sources +========================== + +.. _requirements: + +Requirements +---------------- + +To build PaddlePaddle, you need + +1. A computer -- Linux, Windows, MacOS. +2. Docker. + +Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. +We run all the tools by running this image. + +.. _build_step: + +How To Build +---------------- + +You need to use Docker to build PaddlePaddle +to avoid installing dependencies by yourself. 
We have several pre-built +Docker images `here `_ , +you can also find how to build and use paddle_manylinux_devel Docker image from +`here `__ +Or you can build your own image from source as the optional step below: + +If you don't wish to use docker,you need to install several compile dependencies manually as :ref:`Compile Dependencies <_compile_deps>` shows to start compilation. + +.. code-block:: bash + + # 1. clone the source code + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # 2. Optional: build development docker image from source + docker build -t paddle:dev . + # 3. Run the following command to build a CPU-Only binaries + docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build + # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2) + docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build + +NOTE: + +- The above command try to mount the current working directory (root directory of source code) +into :code:`/paddle` directory inside docker container. + +- You need to pass in the required environment variable :code:`PYTHON_ABI` to specify a `Python ABI `__. +Currently PaddlePaddle supported Python ABIs include :code:`cp27-cp27m` and :code:`cp27-cp27mu` . + +When the compile finishes, you can get the output whl package under +build/python/dist, then you can choose to install the whl on local +machine or copy it to the target machine. + +.. code-block:: bash + + pip install build/python/dist/*.whl + +If the machine has installed PaddlePaddle before, there are two methods: + +.. code-block:: bash + + 1. uninstall and reinstall + pip uninstall paddlepaddle + pip install build/python/dist/*.whl + + 2. upgrade directly + pip install build/python/dist/*.whl -U + +.. _run_test: + +Run Tests +---------------- + +If you wish to run the tests, you may follow the below steps: + +When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build. +Set :code:`WITH_GPU=ON` Can also run tests on GPU. + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test + +If you wish to run only one unit test, like :code:`test_sum_op`: + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash + ./paddle/scripts/paddle_build.sh build + cd build + ctest -R test_sum_op -V + +.. _faq_docker: + +Frequently Asked Questions +--------------------------- + +- What is Docker? + + If you haven't heard of it, consider it something like Python's virtualenv. + +- Docker or virtual machine? + + Some people compare Docker with VMs, but Docker doesn't virtualize any hardware nor running a guest OS, which means there is no compromise on the performance. + +- Why Docker? + + Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help. + + Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want. + +- Can I choose not to use Docker? 
+ + Sure, you don't have to install build tools into a Docker image; instead, you can install them on your local computer. This document exists because Docker would make the development way easier. + +- How difficult is it to learn Docker? + + It takes you ten minutes to read `an introductory article `_ and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools. Not even to mention the time saved when other people trying to reproduce the issue you have. + +- Can I use my favorite IDE? + + Yes, of course. The source code resides on your local computer, and you can edit it using whatever editor you like. + + Many PaddlePaddle developers are using Emacs. They add the following few lines into their `~/.emacs` configure file: + + .. code-block:: emacs + + (global-set-key "\C-cc" 'compile) + (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") + + so they could type `Ctrl-C` and `c` to build PaddlePaddle from source. + +- Does Docker do parallel building? + + Our building Docker image runs a `Bash script `_ , which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores. + +- Docker requires sudo + + An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly. If you use a shared computer for development, please ask the administrator to install and configure Docker. We will do our best to support rkt, another container technology that doesn't require sudo. + +- Docker on Windows/MacOS builds slowly + + On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so to make the building efficient. Please refer to `this issue `_ for details. + +- Not enough disk space + + Examples in this article use option `--rm` with the `docker run` command. This option ensures that stopped containers do not exist on hard disks. We can use `docker ps -a` to list all containers, including stopped. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to `this article `_ . + +.. _compile_deps: + +Appendix: Compile Dependencies +------------------------------- + +PaddlePaddle need the following dependencies when compiling, other dependencies +will be downloaded automatically. + +.. csv-table:: PaddlePaddle Compile Dependencies + :header: "Dependency", "Version", "Description" + :widths: 10, 15, 30 + + "CMake", ">=3.2", "" + "GCC", "4.8.2", "Recommend devtools2 for CentOS" + "Python", "2.7.x", "Need libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" + "SWIG", ">=2.0", "" + "Go", ">=1.8", "Optional" + + +.. _build_options: + +Appendix: Build Options +------------------------- + +Build options include whether build binaries for CPU or GPU, which BLAS +library to use etc. You may pass these settings when running cmake. +For detailed cmake tutorial please refer to `here `__ 。 + + +You can add :code:`-D` argument to pass such options, like: + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. 
csv-table:: Bool Type Options + :header: "Option", "Description", "Default" + :widths: 1, 7, 2 + + "WITH_GPU", "Build with GPU support", "ON" + "WITH_C_API", "Build only CAPI", "OFF" + "WITH_DOUBLE", "Build with double precision", "OFF" + "WITH_DSO", "Dynamically load CUDA libraries", "ON" + "WITH_AVX", "Build with AVX support", "ON" + "WITH_PYTHON", "Build with integrated Python interpreter", "ON" + "WITH_STYLE_CHECK", "Check code style when building", "ON" + "WITH_TESTING", "Build unit tests", "OFF" + "WITH_DOC", "Build documentations", "OFF" + "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto" + "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "OFF" + "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON" + + +BLAS ++++++ + +PaddlePaddle supports `MKL `_ and +`OpenBlAS `_ as BLAS library。By default it uses MKL. +If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded +and used, for more `details `_ . + +If you choose not to use MKL, then OpenBlAS will be used. + +CUDA/cuDNN ++++++++++++ + +PaddlePaddle will automatically find CUDA and cuDNN when compiling and running. +parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture +automatically in order to speed up the build. + +PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to +keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN +you built. + +Pass Compile Options +++++++++++++++++++++++ + +You can pass compile options to use intended BLAS/CUDA/Cudnn libraries. +When running cmake command, it will search system paths like +:code:`/usr/lib:/usr/local/lib` and then search paths that you +passed to cmake, i.e. + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.** diff --git a/doc/v2/build_and_install/docker_install_cn.rst b/doc/v2/build_and_install/docker_install_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..106c86bace075764c84bc2a7f7cb09d466fa8794 --- /dev/null +++ b/doc/v2/build_and_install/docker_install_cn.rst @@ -0,0 +1,146 @@ +使用Docker安装运行 +================================ + +使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。 +您可以在 `Docker官网 `_ 获得基本的Docker安装和使用方法。 + +如果您在使用Windows,可以参考 +`这篇 `_ +教程,完成在Windows上安装和使用Docker。 + +在了解Docker的基本使用方法之后,即可开始下面的步骤: + +.. _docker_pull: + +获取PaddlePaddle的Docker镜像 +------------------------------ + +执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl: + + .. code-block:: bash + + docker pull paddlepaddle/paddle + +对于国内用户,我们提供了加速访问的镜像源: + + .. code-block:: bash + + docker pull docker.paddlepaddlehub.com/paddle + +下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像: + + .. code-block:: bash + + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddlehub.com/paddle:latest-gpu + +选择下载使用不同的BLAS库的Docker镜像: + + .. code-block:: bash + + # 默认是使用MKL的镜像 + docker pull paddlepaddle/paddle + # 使用OpenBLAS的镜像 + docker pull paddlepaddle/paddle:latest-openblas + +下载指定版本的Docker镜像,可以从 `DockerHub网站 `_ 获取可选的tag,并执行下面的命令: + + .. code-block:: bash + + docker pull paddlepaddle/paddle:[tag] + # 比如: + docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu + +.. 
_docker_run: + +在Docker中执行PaddlePaddle训练程序 +---------------------------------- + +假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考 +`PaddlePaddleBook `_ +编写),就可以使用下面的命令开始执行训练: + + .. code-block:: bash + + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py + +上述命令中, :code:`-it` 参数说明容器已交互式运行; :code:`-v $PWD:/work` +指定将当前路径(Linux中$PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 :code:`/work` +目录; :code:`paddlepaddle/paddle` 指定需要使用的容器; 最后 :code:`/work/train.py` +为容器内执行的命令,即运行训练程序。 + +当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码: + + .. code-block:: bash + + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py + +**注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** :code:`apt-get install -y vim` **安装后,在容器中编辑代码。** + +.. _docker_run_book: + +使用Docker启动PaddlePaddle Book教程 +----------------------------------- + +使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 +PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 +如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。 +大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。 + +我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行: + + .. code-block:: bash + + docker run -p 8888:8888 paddlepaddle/book + +国内用户可以使用下面的镜像源来加速访问: + + .. code-block:: bash + + docker run -p 8888:8888 docker.paddlepaddlehub.com/book + +然后在浏览器中输入以下网址: + + .. code-block:: text + + http://localhost:8888/ + +就这么简单,享受您的旅程! + +.. _docker_run_gpu: + +使用Docker执行GPU训练 +------------------------------ + +为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用 +`nvidia-docker `_ 来运行镜像。 +请不要忘记提前在物理机上安装GPU最新驱动。 + + .. code-block:: bash + + nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash + +**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:** + + .. code-block:: bash + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu + +**关于AVX:** + +AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认 +是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独 +`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。 + +以下指令能检查Linux电脑是否支持AVX: + + .. code-block:: bash + + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi + +如果输出是No,就需要选择使用no-AVX的镜像 diff --git a/doc/v2/build_and_install/docker_install_en.rst b/doc/v2/build_and_install/docker_install_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..25aecb8d0da9feb00006da6259b529b7011d91cb --- /dev/null +++ b/doc/v2/build_and_install/docker_install_en.rst @@ -0,0 +1,153 @@ +Run in Docker Containers +================================= + +Run PaddlePaddle in Docker container so that you don't need to care about +runtime dependencies, also you can run under Windows system. You can get +tutorials at `here `_ . + +If you are using Windows, please refer to +`this `_ +tutorial to start running docker under windows. + +After you've read above tutorials you may proceed the following steps. + +.. _docker_pull: + +Pull PaddlePaddle Docker Image +------------------------------ + +Run the following command to download the latest Docker images, the version is cpu_avx_mkl: + + .. code-block:: bash + + docker pull paddlepaddle/paddle + +For users in China, we provide a faster mirror: + + .. code-block:: bash + + docker pull docker.paddlepaddlehub.com/paddle + +Download GPU version (cuda8.0_cudnn5_avx_mkl) images: + + .. 
code-block:: bash + + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddlehub.com/paddle:latest-gpu + +Choose between images with different BLAS versions: + + .. code-block:: bash + + # image using MKL by default + docker pull paddlepaddle/paddle + # image using OpenBLAS + docker pull paddlepaddle/paddle:latest-openblas + + +If you want to use legacy versions, choose a tag from +`DockerHub `_ +and run: + + .. code-block:: bash + + docker pull paddlepaddle/paddle:[tag] + # i.e. + docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu + +.. _docker_run: + +Launch your training program in Docker +-------------------------------------- + +Assume that you have already written a PaddlePaddle program +named :code:`train.py` under directory :code:`/home/work` (refer to +`PaddlePaddleBook `_ +for more samples), then run the following command: + + .. code-block:: bash + + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py + +In the above command, :code:`-it` means running the container interactively; +:code:`-v $PWD:/work` means mounting the current directory ($PWD will expand +to the current absolute path in Linux) under :code:`/work` in the container; +:code:`paddlepaddle/paddle` specifies the image to use; finally +:code:`/work/train.py` is the command to run inside docker. + +Also, you can go into the container shell, run or debug your code +interactively: + + .. code-block:: bash + + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py + +**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.** + +.. _docker_run_book: + +PaddlePaddle Book +------------------ + +You can create a container serving PaddlePaddle Book using Jupyter Notebook in +one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook +for users and developers. If you want to +dig deeper into deep learning, PaddlePaddle Book is definitely your best choice. + +We provide a packaged book image, simply issue the command: + + .. code-block:: bash + + docker run -p 8888:8888 paddlepaddle/book + +For users in China, we provide a faster mirror: + + .. code-block:: bash + + docker run -p 8888:8888 docker.paddlepaddlehub.com/book + +Then, copy and paste the following address into your local browser: + + .. code-block:: text + + http://localhost:8888/ + +That's all. Enjoy your journey! + +.. _docker_run_gpu: + +Train with Docker with GPU +------------------------------ + +We recommend using +`nvidia-docker `_ +to run GPU training jobs. Please ensure you have the latest +GPU driver installed before moving on. + + .. code-block:: bash + + nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash + +**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.** + + .. code-block:: bash + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu + +**About AVX:** + +AVX is a CPU instruction set that can accelerate PaddlePaddle's calculations. +The latest PaddlePaddle Docker image turns AVX on by default, so, if your +computer doesn't support AVX, you'll probably need to +`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`. 
+ +The following command will tell you whether your computer supports AVX. + + .. code-block:: bash + + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi diff --git a/doc/v2/build_and_install/index_cn.rst b/doc/v2/build_and_install/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..1a9305ac4b6578c14a962f223c647a71e3b8a72b --- /dev/null +++ b/doc/v2/build_and_install/index_cn.rst @@ -0,0 +1,56 @@ +安装与编译 +========== + +.. _install_steps: + +PaddlePaddle针对不同的用户群体提供了多种安装方式。 + +专注深度学习模型开发 +-------------------- + +PaddlePaddle提供了多种python wheel包,可通过pip一键安装: + +.. toctree:: + :maxdepth: 1 + + pip_install_cn.rst + +这是最便捷的安装方式,请根据机器配置和系统选择对应的安装包。 + +关注底层框架 +------------- + +PaddlePaddle提供了基于Docker的安装方式,请参照以下教程: + +.. toctree:: + :maxdepth: 1 + + docker_install_cn.rst + +我们推荐在Docker中运行PaddlePaddle,该方式具有以下优势: + +- 无需单独安装第三方依赖 +- 方便分享运行时环境,易于问题的复现 + +对于有定制化二进制文件需求的用户,我们同样提供了从源码编译安装PaddlePaddle的方法: + +.. toctree:: + :maxdepth: 1 + + build_from_source_cn.rst + +.. warning:: + + 需要提醒的是,这种安装方式会涉及到一些第三方库的下载、编译及安装,整个安装过程耗时较长。 + + +常见问题汇总 +-------------- + +如果在安装过程中遇到了问题,请先尝试在下面的页面寻找答案: + +:ref:`常见问题解答 ` + +如果问题没有得到解决,欢迎向PaddlePaddle社区反馈问题: + +`创建issue `_ diff --git a/doc/v2/build_and_install/index_en.rst b/doc/v2/build_and_install/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..7990bacbd6966e88e8763e9c5709e410f7e9fed4 --- /dev/null +++ b/doc/v2/build_and_install/index_en.rst @@ -0,0 +1,56 @@ +install and Compile +====================== + +.. _install_steps: + +PaddlePaddle provides various methods of installation for many different users + +Focus on Deep Learning Model Development +---------------------------------------- + +PaddlePaddle provides lots of packages of python wheel , that pip can install: + +.. toctree:: + :maxdepth: 1 + + pip_install_en.rst + +This is the most convenient way of installation. Please choose the right installation package with machine configure and system. + +Follow the Bottom Frame +------------------------ + +PaddlePaddle also supports installation using Docker. Please refer to the tutorial below: + +.. toctree:: + :maxdepth: 1 + + docker_install_en.rst + +We recommend running PaddlePaddle in Docker. This method has the following advantages: + +- Does not require installation of third-party dependencies. +- Easy to share runtime environment. + +Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below: + +.. toctree:: + :maxdepth: 1 + + build_from_source_en.rst + +.. warning:: + + One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus this process of installation is more time consuming. 
+ + +FAQ +----------- + +For any problems during installation, please refer to the page below for answers: + +:ref:`常见问题解答 ` + +If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community: + +`创建issue `_ diff --git a/doc/v2/build_and_install/paddleci.png b/doc/v2/build_and_install/paddleci.png new file mode 100644 index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825 Binary files /dev/null and b/doc/v2/build_and_install/paddleci.png differ diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..095da19cd41d29bfa72ab23abd24bec45f925a86 --- /dev/null +++ b/doc/v2/build_and_install/pip_install_cn.rst @@ -0,0 +1,105 @@ +使用pip安装 +================================ + +PaddlePaddle可以使用常用的Python包管理工具 +`pip `_ +完成安装,并可以在大多数主流的Linux操作系统以及MacOS上执行。 + +.. _pip_install: + +使用pip安装 +------------------------------ + +执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。 + + .. code-block:: bash + + pip install paddlepaddle + +当前的默认版本为0.12.0,cpu_avx_openblas,您可以通过指定版本号来安装其它版本,例如: + + .. code-block:: bash + + pip install paddlepaddle==0.11.0 + + +如果需要安装支持GPU的版本(cuda8.0_cudnn5_avx_openblas),需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +当前的默认版本也是0.12.0,PaddlePaddle针对不同需求提供了更多版本的安装包,部分列表如下: + +================================= ======================================== +版本号 版本说明 +================================= ======================================== +paddlepaddle-gpu==0.12.0 使用CUDA 8.0和cuDNN 5编译的0.12.0版本 +paddlepaddle-gpu==0.11.0.post87 使用CUDA 8.0和cuDNN 7编译的0.11.0版本 +paddlepaddle-gpu==0.11.0.post8 使用CUDA 8.0和cuDNN 5编译的0.11.0版本 +paddlepaddle-gpu==0.11.0 使用CUDA 7.5和cuDNN 5编译的0.11.0版本 +================================= ======================================== + +您可以在 `Release History `_ 中找到paddlepaddle-gpu的各个发行版本。 + +如果需要获取并安装最新的(开发分支)PaddlePaddle,可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装, +您可以从下面的表格中找到需要的版本: + +如果在点击下面链接时出现如下登陆界面,点击“Log in as guest”即可开始下载: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. csv-table:: 各个版本最新的whl包 + :header: "版本说明", "cp27-cp27mu", "cp27-cp27m" + :widths: 1, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + +.. _pip_dependency: + +运行环境依赖 +------------------------------ + +PaddlePaddle安装包由于不仅仅包含.py程序,而且包含了C++编写的部分,所以我们确保发布的二进制包可以支持主流的Linux操作系统,比如CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上。 + +PaddlePaddle发布的安装包会尽量对齐 `manylinux1 `_ 标准,通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上,而且CentOS 5即将停止维护,所以我们默认使用CentOS 6作为标准编译环境。 + +.. 
csv-table:: PaddlePaddle环境依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "操作系统", "Linux, MacOS", "CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上" + "Python", "2.7.x", "暂时不支持Python3" + "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号" + "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号" + "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号" + +.. _pip_faq: + +安装常见问题和解决方法 +------------------------------ + +- paddlepaddle*.whl is not a supported wheel on this platform. + + 出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准,需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip: + + .. code-block:: bash + + pip install --upgrade pip + + 如果仍然存在问题,可以执行: + + .. code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + + 获取当前系统支持的安装包格式,并检查和需安装的包是否匹配。pypi安装包可以在 `这个 `_ 链接中找到。 + + 如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。 diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..8406e4aa1fbb953c3b615b10d1bcb2c45974dde0 --- /dev/null +++ b/doc/v2/build_and_install/pip_install_en.rst @@ -0,0 +1,123 @@ +Install using pip +================================ + +You can use current widely used Python package management +tool `pip `_ +to install PaddlePaddle. This method can be used in +most of current Linux systems or MacOS. + +.. _pip_install: + +Install using pip +------------------------------ + +Run the following command to install PaddlePaddle on the current +machine, it will also download requirements. + + .. code-block:: bash + + pip install paddlepaddle + +the default version is 0.12.0, cpu_avx_openblas, you can specify the versions to satisfy your demands, like: + + .. code-block:: bash + + pip install paddlepaddle==0.11.0 + +If you need to install a GPU-enabled version (cuda8.0_cudnn5_avx_openblas), you need to run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +The default version is also 0.12.0, PaddlePaddle provides several versions of packages for different needs, as shown in the table: + +================================= ======================================== +版本号 版本说明 +================================= ======================================== +paddlepaddle-gpu==0.12.0 0.12.0 built with CUDA 8.0 and cuDNN 5 +paddlepaddle-gpu==0.11.0.post87 0.11.0 built with CUDA 8.0 and cuDNN 7 +paddlepaddle-gpu==0.11.0.post8 0.11.0 built with CUDA 8.0 and cuDNN 5 +paddlepaddle-gpu==0.11.0 0.11.0 built with CUDA 7.5 and cuDNN 5 +================================= ======================================== + +You can find all versions released of paddlepaddle-gpu in `Release History `_ . + +If you wish to install the latest develop branch PaddlePaddle, +you can download the latest whl package from our CI system. Access +the below links, log in as guest, then click at the "Artifact" +tab, you'll find the download link of whl packages. + +If the links below shows up the login form, just click "Log in as guest" to start the download: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. 
csv-table:: whl package of each version + :header: "version", "cp27-cp27mu", "cp27-cp27m" + :widths: 1, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + +.. _pip_dependency: + +Runtime Dependency +------------------------------ + +PaddlePaddle installation packages (whl) does not only contain .py files, +but also binaries built from C++ code. We ensure that PaddlePaddle can +run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04 +and MacOS 10.12. + +PaddlePaddle whl packages are trying to satisfy +`manylinux1 `_ +standard, which uses CentOS 5 as default build environment. But CUDA libraries +seems only run on CentOS 6 at least, also, CentOS 5 is about to end its lifetime, +so we use CentOS 6 as default build environment. + +.. csv-table:: PaddlePaddle Runtime Deps + :header: "Dependency", "version", "description" + :widths: 10, 15, 30 + + "OS", "Linux, MacOS", "CentOS 6 or later,Ubuntu 14.04 or later,MacOS 10.12 or later" + "Python", "2.7.x", "Currently Python3 is not supported" + "libc.so", "GLIBC_2.7", "glibc at least include GLIBC_2.7 symbols" + "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols" + "libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols" + +.. _pip_faq: + +FAQ +------------------------------ + +- paddlepaddle*.whl is not a supported wheel on this platform. + + The main cause of this issue is that your current platform is + not supported. Please check that you are using Python 2.7 series. + Besides, pypi only supports manylinux1 standard, you'll need to + upgrade your pip to >9.0.0. Then run the below command: + + .. code-block:: bash + + pip install --upgrade pip + + If the problem still exists, run the following command: + + .. code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + + Then you'll get supported package suffixes, then check if it matches + the file name of the whl package. You can find default whl package at + `here `_ + + If your system supports linux_x86_64 but the whl package is manylinux1_x86_64, + you'll need to update pip to the latest version; If your system supports + manylinux1_x86_64 but the whl package is linux_x86_64 you can rename the + file to manylinux1_x86_64 suffix and then install. 
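+
+  The check above can also be scripted. The snippet below is a minimal, illustrative sketch only; it assumes an older pip (before 20.0) in which :code:`pip.pep425tags` is importable, exactly as in the command shown earlier. The file name used is just an example:
+
+  .. code-block:: python
+
+      # Compare a wheel file name's tags against the tags this interpreter supports.
+      import pip
+
+      def wheel_tags(filename):
+          # e.g. "paddlepaddle-0.12.0-cp27-cp27mu-manylinux1_x86_64.whl"
+          pys, abi, plat = filename[:-len(".whl")].split("-")[-3:]
+          return set((py, abi, p) for py in pys.split(".") for p in plat.split("."))
+
+      supported = set(pip.pep425tags.get_supported())
+      name = "paddlepaddle-0.12.0-cp27-cp27mu-manylinux1_x86_64.whl"  # example file name
+      print("installable" if wheel_tags(name) & supported else "not supported on this platform")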
diff --git a/doc/v2/design/cluster_train/README.md b/doc/v2/design/cluster_train/README.md new file mode 100644 index 0000000000000000000000000000000000000000..177a5f5d54bd924fab34795219ce1f7b270c8e25 --- /dev/null +++ b/doc/v2/design/cluster_train/README.md @@ -0,0 +1,182 @@ +# Design Doc: Distributed Training + +## Objective + +In [these slides](https://www.slideshare.net/cxwangyi/paddlepaddle-a-complete-solution-for-businesses), we explained that we'd like PaddlePaddle running on general-purpose clusters like those managed by Kubernetes, so as to address demands for AI from both Internet and non-Internet industries. + +This poses technical challenges to PaddlePaddle: + +1. Support fault-recovery. +1. Support both offline and online training. +1. [Serverless computing](https://en.wikipedia.org/wiki/Serverless_computing) of distributed training. + + +## Training Job + +A training job will be created once a user asks Paddle cloud to train a model. The training job is made up of different processes that collaboratively consume data and produce a trained model. There are three kinds of processes: + +1. the *master server process*, which dispatches tasks to +1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via +1. one or more *parameter server processes*, where each holds a shard of the global model and receives the uploaded gradients from every *trainer process*, so they can run the optimization functions to update their parameters. + +Their relation is illustrated in the following graph: + + + +By coordinating these processes, PaddlePaddle supports both Synchronous Stochastic Gradient Descent (sync SGD) and Asynchronous Stochastic Gradient Descent (async SGD) for training user-defined neural network topologies. + +When training with sync SGD, parameter servers wait for all trainers to finish their gradient updates and then send the updated parameters to the trainers; training cannot proceed until every trainer has received the updated parameters. This creates a synchronization point between trainers. When training with async SGD, each trainer uploads gradients and downloads new parameters individually, without synchronizing with other trainers. Using async SGD is faster in terms of time per pass, but the gradients are noisier since trainers are likely to be working with a stale model. + +### Master Server Process + +The master server process will: + +- Partition a dataset into [tasks](#task) and dispatch tasks to trainers. +- Keep track of training progress on the dataset with the [task queue](#task-queue). A training job iterates over the dataset one full pass at a time before moving on to the next pass. + + +#### Task + +A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size. + +#### Task Queue + +The master server has three task queues to track training progress. As illustrated in the graph below, Job A and Job B both have one master server. Each master server process has three task queues. + + + +- The todo queue holds tasks to be dispatched. When a job starts, the master server fills in the todo queue with all tasks. +- The pending queue holds tasks that are currently being trained by trainers. +- The done queue holds tasks that have already been trained. + +The life cycle of a single task is illustrated below: + + + +1. When a new pass of training starts, all tasks will be placed in the todo queue. +1. 
When a trainer requests a new task, the master server will dispatch a task from the todo queue to it, put the task in the pending queue, and wait for completion. +1. The trainer will work on its task, tell the master server once the task is completed, and ask for a new task. The master server will dispatch a new task to that trainer. +1. If a task fails for any reason in a trainer, or takes longer than a specific period of time, the master server will move the task back to the todo queue. The timeout count for that task will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded. +1. The master server will move completed tasks to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to the todo queue and resetting the timeout counter of all tasks to zero. + +### Trainer Process + +The trainer process will: + +- Request tasks from the master. +- Work on the tasks. +- Upload gradients to parameter servers, and update the local model by downloading new parameters from parameter servers. + +### Parameter Server Process + +Parameter server processes hold the parameters collaboratively. The parameters are partitioned on different parameter servers. + +The parameter server will: + +- Receive gradients from the trainers, update its parameters, and give the trainers the latest parameters. +- Periodically save its parameters to a distributed file system by overriding the previous save. + +### Optimization Algorithms + +The communication pattern between the trainers and the parameter servers depends on the category of optimization algorithm: + +- Synchronous Stochastic Gradient Descent (sync-SGD) + + The parameter server will wait for all trainers to finish the n-th mini-batch calculation and send their gradients before broadcasting new parameters to every trainer. Every trainer will wait for the new parameters before starting the (n+1)-th mini-batch. + +- Asynchronous Stochastic Gradient Descent (async-SGD) + + There will be no synchronization between different trainers, and the parameter server updates its parameters as soon as it receives new gradients: + + - Each trainer uploads its accumulated gradient every n mini-batches. + - Every m mini-batches, the trainer downloads new parameters from the parameter server. + - n and m do not have to be equal. + +## Fault Tolerance + +The training job will pause if the master server process is dead, or if any of the parameter server processes is dead. They will be restarted by [Kubernetes](https://kubernetes.io/) and recover in a few minutes. Please refer to [fault recovery](#fault-recovery). + +The training job will continue to make progress if there is at least one training process running. The strategy depends on the type of optimization algorithm: + +- sync-SGD + + TODO + +- async-SGD + + Since async-SGD does not require synchronization between mini-batches, the system will by definition make progress if at least one trainer is running. + +## Fault Recovery + +PaddlePaddle uses [etcd](https://github.com/coreos/etcd) to keep track of the states of processes. Because etcd is a distributed, reliable key-value store, the restarted process can recover its states from etcd. The model parameters are periodically saved into a distributed file system, so a restarted parameter server can recover its parameters from the saved file. 
+ +Now we will introduce how each process recovers from a failure, the graph below shows how etcd is used: + + + +### Master Server Process + +When the master is started by the Kubernetes, it executes the following steps at startup: + +1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations. +1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them. +1. Write its ip address to */master/addr* so that trainers can discover it. +1. Listens to trainers' request of task, dispatch one upon request, and updates task queue using an etcd transaction to ensure lock is held during the update. + +When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes. + +### Trainer Process + +When the trainer is started by the Kubernetes, it executes the following steps at startup: + +1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count */ps_desired*. +1. Finds and watches */master/addr* to get master's address. +1. Requests for tasks from the master to start training. + +When a trainer fails, Kuberentes would try to restart it. The recovered trainer would fetch tasks from master and go on training. + +### Parameter Server Process + +When the parameter server is started by Kubernetes, it executes the following steps at startup: + +1. Read desired total number of parameter servers from etcd `/ps_desired` +1. Search through etcd keys `/ps/` (`/ps/0`, `/ps/1`, ...) to find the first non-existant key whose index is smaller than the total number of parameter servers. Set the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name. + + The desired number of parameter servers is 3: + + + + The third parameter server joined: + + + +1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index). +1. Now the parameter server is ready for the trainers' requests. + +If the parameter server's etcd lease expires, the parameter server will kill itself. + + +## Parameter Server Checkpointing +See [here](./checkpointing.md) + +## Store and dispatching trainning data +See [here](./data_dispatch.md) + + +## Dynamic Scaling + +### Trainer Scaling + +TODO + +### Parameter Server Scaling + +Not planned for v1. + +## Training Dataset Format + +TODO + +## User Interface + +TODO diff --git a/doc/v2/design/cluster_train/checkpointing.md b/doc/v2/design/cluster_train/checkpointing.md new file mode 100644 index 0000000000000000000000000000000000000000..c87ef2c7d2636208866d05456d5d44316d0bb200 --- /dev/null +++ b/doc/v2/design/cluster_train/checkpointing.md @@ -0,0 +1,44 @@ +## 模型参数检查点(Checkpointing) +模型数据检查点的实现,可以有效的避免parameter server的单点或多点同时故障。模型参数检查点通过定期向磁盘上保存一份存储在parameter server内存中的模型数据的完整镜像,来保证训练过程可以从中间状态重新启动。在一个不可中断并缺少备份的训练任务中,可以通过阶段性的保存每个parameter server的数据快照(snapshot)到 ***分布式存储服务*** 达到容灾的目的,比如每隔10分钟最新的快照,并删除更早的快照。在出现单点故障时,只需要恢复这台节点,或者将这台节点迁移到另一个节点并启动即可恢复训练任务。 + + + +### 快照保存的设计如下: + +说明: + +* parameter server在集群中启动后,自动挂载分布式存储目录,并把快照保存到这个目录下。 +* ***注:每个parameter server的检查点各自独立保存,暂时不考虑多个parameter server同步的保存一个特定时间点的全局检查点,因为这样做也没法保证消除随机性。*** + +检查点保存程序流程: + +1. 如果满足条件"每隔10分钟"时,parameter server会获取parameters内存的`read_lock`,启动一个新的线程开始保存检查点。如果已经正在执行保存检查点的线程,则忽略。由于对parameters的更新需要获取parameters内存的`write_lock`,所以在写入快照的过程中,parameter server会暂停参数更新并等待。 +2. 
parameter server生成一个UUID,向指定的目录中一个新的文件(文件名为此UUID)写入快照数据。在快照写入完成后,计算这个文件的MD5 sum。然后在etcd的`/checkpoints/[pserver_id]`中写入json内容:`{"uuid": [UUID], "md5", "MD5 sum", "timestamp": xxxx}`。 +3. 删除磁盘目录中不是当前uuid的快照文件。 +4. 释放对paramters内存的锁定,停止保存检查点的线程。 + +这里需要用户额外注意,在您的实际环境中,训练任务的运行可能会占满trainer和parameter server之间的网络带宽,如果parameter server此时还需要通过网络访问分布式存储以保存快照,可能会造成网络拥塞,而出现阶段性的运行停滞。 + +### 从快照恢复 + +在parameter server第一次启动或任意时间parameter server故障后被Kubernetes重新启动,则需要回滚到上一个检查点: + + 1. 从etcd中读取节点:`/checkpoints/[pserver_id]`获取最新的检查点的文件uuid + 1. 从磁盘文件中加载uuid文件名的检查点快照文件,并加载其中的参数 + 1. 如果上面两步出现错误,则使用启动参数定义的初始化方法初始化参数 + 1. 开始提供服务 + +## TODO List +### 推测执行/加速执行(TODO) +在异构集群中,如果存在某些trainer执行速度过慢会影响整体集群的速度(如图中Trainer 1),此时master将负责启动一个新的Trainer(Accelerate Trainer 2),使用同样的训练数据block。哪个trainer先完成block的训练,则把另一个慢速的kill掉。 + +### 动态扩容/缩容 +目前只考虑动态扩容trainer数量,可以减小系统复杂性。 + +## 术语 +* model: 指深度学习训练之后得到的所有参数,使用这个神经网络可以完成对新数据的预测 +* parameters: 神经网络中的参数,包括权重w和偏置b。一个神经网络的模型由大量的参数组成 +* shard: 分片,通常指将一个整体拆分成多份的其中的一份。 +* model shard: 将一个神经网络参数拆分成多份,每个shard分别存储在其中一台parameter server之上 +* parameter block: 多个parameter block构成一个model shard +* 单点故障: 任意时刻只可能同时有一台服务器故障。由于集群中同时存在两台机器故障的概率极低((平均故障率*平均故障修复时间)^2)只对特殊在线系统考虑两台以上同时故障的容灾。 diff --git a/doc/v2/design/cluster_train/data_dispatch.md b/doc/v2/design/cluster_train/data_dispatch.md new file mode 100644 index 0000000000000000000000000000000000000000..1f5d22ff5e6abcb576d16cbe7391da1967a1ab8e --- /dev/null +++ b/doc/v2/design/cluster_train/data_dispatch.md @@ -0,0 +1,160 @@ +## 训练数据的存储和分发 + +### 概念解释 + +### 流程介绍 +生产环境中的训练数据集通常体积很大,并被存储在诸如Hadoop HDFS,Ceph,AWS S3之类的分布式存储之上。这些分布式存储服务通常会把数据切割成多个分片分布式的存储在多个节点之上。这样就可以在云端执行多种数据类计算任务,包括: + +* 数据预处理任务 +* Paddle训练任务 +* 在线模型预测服务 +
+ +
+ +在上图中显示了在一个实际生产环境中的应用(人脸识别)的数据流图。生产环境的日志数据会通过实时流的方式(Kafka)和离线数据的方式(HDFS)存储,并在集群中运行多个分布式数据处理任务,比如流式数据处理(online data process),离线批处理(offline data process)完成数据的预处理,提供给paddle作为训练数据。用户也可以上传labeled data到分布式存储补充训练数据。在paddle之上运行的深度学习训练输出的模型会提供给在线人脸识别的应用使用。 + +### 训练数据存储 +我们选择[CephFS](http://docs.ceph.com/docs/master/cephfs/)作为存储系统。 + +- 无论是从[PFSClient](../file_manager/README.md)的角度,还是从[Pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/)中运行任务的角度,统一用`/pfs/$DATACENTER/home/$USER`来访问用户自己的数据。 +- `/pfs/$DATACENTER/common`下存放公共数据集合 + - 做只读挂载 + +
+ +
+ +### 文件预处理 + + +在开始训练之前, 数据集需要预先被转换成PaddlePaddle分布式训练使用的存储格[RecordIO](https://github.com/PaddlePaddle/Paddle/issues/1947)。我们提供两个转换方式: + +1. 用户在本地转换好再上传 +1. 用户上传数据后,在机群上运行转换程序 + +转换生成的文件名会是以下格式: + +```text +name_prefix-aaaaa-of-bbbbb +``` + +"aaaaa"和"bbbbb"都是五位的数字,每一个文件是数据集的一个shard,"aaaaa"代表shard的index,"bbbbb"代表这个shard的最大index。 + +比如ImageNet这个数据集可能被分成1000个shard,它们的文件名是: +```text +imagenet-00000-of-00999 +imagenet-00001-of-00999 +... +imagenet-00999-of-00999 +``` + +#### 转换库 + +无论是在本地或是云端转换,我们都提供Python的转换库,接口是: +```python +def convert(output_path, reader, num_shards, name_prefix) +``` + +- `output_path`: directory in which output files will be saved. +- `reader`: a [data reader](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#data-reader-interface), from which the convert program will read data instances. +- `num_shards`: the number of shards that the dataset will be partitioned into. +- `name_prefix`: the name prefix of generated files. + +`reader`每次输出一个data instance,这个instance可以是单个值,或者用tuple表示的多个值: + +```python +yield 1 # 单个值 +yield numpy.random.uniform(-1, 1, size=28*28) # 单个值 +yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值 +``` + +每个值的类型可以是整形、浮点型数据、字符串,或者由它们组成的list,以及numpy.ndarray。如果是其它类型,会被Pickle序列化成字符串。 + +### 示例程序 + +#### 使用转换库 + +以下`reader_creator`生成的`reader`每次输出一个data instance,每个data instance包涵两个值:numpy.ndarray类型的值和整型的值: +```python +def reader_creator(): + def reader(): + for i in range(1000): + yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值 + return reader +``` + +把`reader_creator`生成的`reader`传入`convert`函数即可完成转换: +```python +convert("./", reader_creator(), 100, random_images) +``` + +以上命令会在当前目录下生成100个文件: +```text +random_images-00000-of-00099 +random_images-00001-of-00099 +... +random_images-00099-of-00099 +``` + +#### 进行训练 + + +PaddlePaddle提供专用的[data reader creator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#python-data-reader-design-doc),生成给定`RecordIO`文件对应的data reader。**无论在本地还是在云端,reader的使用方式都是一致的**: + +```python +# ... +reader = paddle.reader.creator.RecordIO("/pfs/datacenter_name/home/user_name/random_images-*-of-*") +batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128) +trainer.train(batch_reader, ...) 
+``` + +以上代码的reader输出的data instance与生成数据集时,reader输出的data instance是一模一样的。 + +### 上传训练文件 + +使用下面命令,可以把本地的数据上传到存储集群中。 + +```bash +paddle pfs cp filename /pfs/$DATACENTER/home/$USER/folder/ +``` + +比如,把之前示例中转换完毕的random_images数据集上传到云端的`/home/`可以用以下指令: + +```bash +paddle pfs cp random_images-*-of-* /pfs/$DATACENTER/home/$USER/folder/ +``` + +需要`$DATACENTER`的配置写到配置文件中,例如 + +``` +# config file +[datacenter_1] +username=user +usercert=user.pem +userkey=user-key.pem +endpoint=datacenter1.paddlepaddle.org + +[datacenter_2] +username=user +usercert=user.pem +userkey=user-key.pem +endpoint=datacenter2.paddlepaddle.org +``` +## TODO +### 文件访问的权限 +控制用户权限 + +- 用户可以把自己的数据分享给别人 + +### 文件访问方式 +不用mount的方式来访问数据,而是直接用API的接口远程访问 + +例如: + +``` +f = open('/pfs/datacenter_name/home/user_name/test1.dat') +``` + + +### 支持用户自定义的数据预处理job diff --git a/doc/v2/design/cluster_train/large_model_dist_train.md b/doc/v2/design/cluster_train/large_model_dist_train.md new file mode 100644 index 0000000000000000000000000000000000000000..edb0245ea083e791b7f32ac57a330698299fceda --- /dev/null +++ b/doc/v2/design/cluster_train/large_model_dist_train.md @@ -0,0 +1,101 @@ +# Alalysis of large model distributed training in Paddle + +***NOTE: This is only some note for how we implemeted this scheme in V1, not a new design.*** + +## What is it + +We often encounter cases that the embedding layer parameters(sparse) are so large that we can not store it in the trainer's memory when training. So we need to put them to several servers, and fetch them row by row instead of fetch all of the parameters. + +## How to use + +Specify command-line argument like `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1 --use_old_updater=1` when starting the paddle trainer. And also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting pserver processes. + +Accrodingly, configure your embedding layers like: + +```python +SPARSE_REMOTE=True + +w1 = data_layer(name="w1", size=dict_size) +emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE)) +w2 = data_layer(name="w2", size=dict_size) +emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE)) +... +``` + +## Implementation details + +```c++ +enum MatType { + MAT_NORMAL, + MAT_NORMAL_SHARED, + MAT_VALUE_SHARED, + MAT_SPARSE_ROW_IDS, + MAT_SPARSE_ROW_AUTO_GROW, + MAT_CACHE_ROW, + MAT_SPARSE_ROW, + MAT_SPARSE_ROW_PREFETCH, + MAT_SPARSE_ROW_PREFETCH_FULL_SIZE, +}; +``` + +`MAT_SPARSE_ROW_PREFETCH` is what we use when configured to fetch only row of matrix when training. + +In `trainer_internal.cpp:L93 trainOneBatch`: + +```c++ + if (config_->getOptConfig().use_sparse_remote_updater()) { + REGISTER_TIMER("prefetch"); + gradientMachine_->prefetch(inArgs); + parameterUpdater_->getParametersRemote(); + } +``` + +When doing actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from pserver. + +In `legacy/trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`: + +```c++ +if (fullSize) { + ... 
+} else { +getParams = [&] { + parameterClient_->getParameterSparse( + /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType); +}; +applyL1 = [](Parameter& para, real decayRate) { + para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate); +}; +} +``` + +Calling `parameterClient_->getParameterSparse` will make a remote call to the pserver's `getParameterSparse`: + +```c++ +void ParameterServer2::getParameterSparse(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + (void)inputBuffers; + auto& buffer = *readWriteBuffer_; + size_t numReals = 0; + for (const auto& block : request.blocks()) { + numReals += getParameterConfig(block).dims(1); + } + buffer.resize(numReals); + + VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals; + + ReadLockGuard guard(parameterMutex_); + size_t offset = 0; + for (const auto& block : request.blocks()) { + size_t width = getParameterConfig(block).dims(1); + Buffer buf = {buffer.data() + offset, width}; + int type = request.send_back_parameter_type(); + sendBackParameterSparse(block, type, response, &buf, width, outputBuffers); + offset += width; + } +} +``` + +`getParameterConfig(block).dims(1)` returns the width of the current "parameter block" (a shard of the parameter object), +and then the `getParameterSparse` remote call returns only one row of data to the client. diff --git a/doc/v2/design/cluster_train/master_server.md b/doc/v2/design/cluster_train/master_server.md new file mode 100644 index 0000000000000000000000000000000000000000..4bf3c506f101361875043f8bfd97972b8c981a22 --- /dev/null +++ b/doc/v2/design/cluster_train/master_server.md @@ -0,0 +1,91 @@ +# Design Doc: Master Server + +For an overview of the master server's role, please refer to the [distributed training design doc](./README.md). In this design doc we will discuss the master server in more detail. The master will be implemented in [Go](https://golang.org/). + +## Dataset + + + +A dataset is a list of files in *RecordIO* format. A RecordIO file consists of chunks, and each chunk consists of some records. + +## Task Queue + +As mentioned in the [distributed training design doc](./README.md), a *task* is a data shard that the master server assigns to the trainer process to train on. A task consists of one or multiple *chunks* from one or multiple files. The master server maintains *task queues* to track the training progress. + +### Task Queue Creation + +1. Each trainer will make an RPC call (using Go's [rpc](https://golang.org/pkg/net/rpc/) package) to the master server, telling it the RecordIO files representing the dataset specified by the user. Since every trainer will tell the master server the same dataset, only the first RPC call will be honored. + + The RPC interface is: + ```go + func (m *RPCServer) ReportDataset(Paths []string, dummy *int) error { + } + ``` +1. The master server will scan through each RecordIO file to generate the *chunk index* and learn how many chunks each file has. A chunk can be referenced by the file path and the index of the chunk within the file. The chunk index is an in-memory data structure that enables fast access to each chunk, and the index of the chunk within the file is an integer starting from 0, representing the n-th chunk within the file. + + The definition of the chunk is: + ```go + type Chunk struct { + Idx int // index of the chunk within the file + Path string + Index recordio.Index // chunk index + } + ``` +1. 
+
+   The definition of the task is:
+   ```go
+   type Task struct {
+     Index  int
+     Chunks []Chunk
+   }
+   ```
+
+   The elements in the task queues are of type `TaskEntry`, containing a timeout counter (described in [task retry logic](#task-retry-logic)), and a task:
+   ```go
+   type TaskEntry struct {
+     NumTimeout int
+     Task       Task
+   }
+   ```
+
+   The definition of the task queues is:
+   ```go
+   type TaskQueues struct {
+     Todo    []TaskEntry
+     Pending map[int]TaskEntry // map from task index to task entry
+     Done    []TaskEntry
+   }
+   ```
+
+### Task Queue Persistence
+
+The task queues need to be persisted on [etcd](https://github.com/coreos/etcd) for fault recovery. Since the task queues only change once a task is completed or timed out, which is not very frequent, we can afford to synchronize with etcd every time the task queues change.
+
+We will serialize the task queues data structure with [gob encoding](https://golang.org/pkg/encoding/gob/), compress with gzip, and save into etcd synchronously under key `/task_queues`.
+
+### Task Dispatch
+
+The trainer will make an RPC call to the master to get a new task when:
+
+- the trainer first starts, or
+- the trainer finishes a task.
+
+The RPC interface is:
+```go
+func (m *RPCServer) GetTask(finished *Task, result *Task) error {
+}
+```
+Argument `finished` will be `nil` when the trainer has just started.
+
+During the RPC call the master will do the following:
+
+- Make a copy of the task queues, and update the copy reflecting the finished tasks and the new pending tasks.
+- Synchronize the copy of the task queues with etcd using a transaction conditioned on holding the master lock.
+- Replace the task queues with the copy and report to the trainer with the new tasks if succeeded, or discard the copy and report the error to the trainer if failed.
+
+### Task Retry Logic
+
+When a task is dispatched to the trainer, the master will schedule a function for execution after the timeout duration (based on the moving average of task completion time). If the task entry is still in the pending queue, its timeout counter will increase by one, and the task will be moved to the todo queue. If the timeout counter is above the threshold, the master will log the error and discard the task.
+
+Please note that since a timed-out task could be completed after it has been dispatched for retry, it is possible for a task to be processed multiple times. We do not try to prevent this from happening since it's fine to train on the same task multiple times due to the stochastic nature of the stochastic gradient descent algorithm.
diff --git a/doc/v2/design/cluster_train/pserver_client.md b/doc/v2/design/cluster_train/pserver_client.md
new file mode 100644
index 0000000000000000000000000000000000000000..474b8c572cd92fc87e9f7f3f2b19d12cccd158de
--- /dev/null
+++ b/doc/v2/design/cluster_train/pserver_client.md
@@ -0,0 +1,171 @@
+# Design Doc: The Client Library of Parameter Server
+
+For an overview of the trainer's role, please refer to [distributed training design doc](README.md). In this design doc, we will discuss the parameter server's client library, which will manage communication with parameter servers. The library will be implemented in [Go](https://golang.org/) and made available as a static or dynamic library with a C header file.
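+As a rough illustration of that packaging choice (a made-up sketch, not the actual client code), the snippet below shows how a Go function can be exported behind a C header via cgo when built with `go build -buildmode=c-shared`; the symbol `paddle_hello` is invented for this example, and the real exported API is the one listed in the C Interface section below.
+
+```go
+package main
+
+import "C"
+
+// paddle_hello is a placeholder symbol used only to illustrate the
+// cgo export mechanism; it is not part of the client library design.
+//export paddle_hello
+func paddle_hello(x C.int) C.int {
+	return x + 1
+}
+
+// A main function is required (but never called) in c-shared build mode.
+func main() {}
+```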
+ +## Parameter Partition + +Each parameter will be partitioned into parameter blocks to make the parameters evenly distributed on parameter servers. The partition is done automatically by the client library. The *sparse parameter* require a little different treatment: + +### Sparse Parameter + +The sparse parameter is a parameter that is updated sparsely. The name is somewhat misleading, it does not have a sparse representation, it has the same representation as a dense vector. + +Because a sparse parameter is updated sparsely, the trainer will have to partition the sparse parameter. Because the parameter server will merge all sparse parameter shard into the same file when saving the parameter. It needs special naming convention: + +If a sparse parameter is partitioned into n shards, they should be named as: + +```text +name:sparse-0 +name:sparse-1 +... +name:sparse-n-1 +``` + +The library is unaware of the partition, and treat each parameter independently. Only when saving parameters, the parameter servers will merge the sparse parameters according to the naming convention. + +## Model Optimization Using Gradients + +There are two ways to perform model optimization using gradients: + +- On Client + + The client does multiple steps of forward and backward update. In each step, the gradients are calculated and a new model is generated. After some steps, the client will calculate the difference between the newest model and the old model at step 0. The difference will be updated to parameter servers. Parameter servers will just update parameters using the difference without any optimization using gradients (such as Adam and L1 regularization). + +- On Parameter Server + + The client will send accumulated gradients to parameter servers, the parameter server will do the optimization using gradients. + +## L1 and L2 Regularization + +PaddlePaddle allows L1 or L2 regularizations to be specified per parameter, so when the trainer initializes the parameter it needs include a parameter configuration when L1 or L2 regularization is necessary. + +## Parameter Initialization + +The parameters on parameter servers need to be initialized. To provide maximum flexibility, the trainer will initialize the parameters. Only one trainer will do the initialization, the other trainers will wait for the completion of initialization and get the parameters from the parameter servers. + +### Trainer Selection + +To select the trainer for initialization, every trainer will try to get a distributed lock, whoever owns the lock will do the initialization. As illustrated below: + + + +### Trainer Selection Process + +The trainer select process is encapsulated in the C API function: +```c +int paddle_begin_init_params(paddle_pserver_client* client, const char* config_proto); +``` +The selected trainer's call to `paddle_begin_init_params` will return with 1, and the other trainers' call to `paddle_begin_init_params` will return 0. `paddle_get_params` will be blocked until initialization is completed. 
As illustrated below: + + + +## C Interface + +```c +typedef enum { + PADDLE_ELEMENT_TYPE_INT32 = 0, + PADDLE_ELEMENT_TYPE_UINT32 = 1, + PADDLE_ELEMENT_TYPE_INT64 = 2, + PADDLE_ELEMENT_TYPE_UINT64 = 3, + PADDLE_ELEMENT_TYPE_FLOAT32 = 4, + PADDLE_ELEMENT_TYPE_FLOAT64 = 5, +} paddle_element_type; + +typedef struct { + char* name; + paddle_element_type element_type; + unsigned char* content; + int content_len; +} paddle_parameter, paddle_gradient; + +typedef int paddle_pserver_client; + +/** + * @brief creates a pserver client that talks to etcd for coordination. + */ +paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr); + +/** + * @brief creates a pserver client given pserver addresses. + * + * @param pserver_addrs comma-separated pserver addresses. + * @param selected if current pserver client is selected to initialize all parameter servers. + */ +paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected); +void paddle_pserver_client_release(paddle_pserver_client c); + +/** + * @brief paddle_begin_init_params begins to initialize parameters on + * parameter servers. + * + * paddle_begin_init_params will be called from multiple trainers, + * only one trainer will be selected to initialize the parameters on + * parameter servers. Other trainers need to get the initialized + * parameters from parameter servers using @paddle_get_params. + * + * @return 1 if the trainer is selected to initialize parameter + * servers, otherwise 0. + */ +int paddle_begin_init_params(paddle_pserver_client client); + +/** + * @brief paddle_init_param initializes the parameter on parameter + * servers. + * + * @param param the parameter to initialize. + * @param param_config_proto the configuration for the parameter. + * @param config_len the length of param_config_proto + * @return 0 if successful, otherwise -1. On failure, the trainer + * needs to restart the entire initialization process (starting from + * @paddle_begin_init_param). Or simply exit the program and wait for + * the cluster management system to restart the trainer. + */ +int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len); + +/** + * @brief paddle_finish_init_params tells parameter servers client has + * sent all parameters to parameter servers as initialization. + * + * @return 0 if successful, otherwise -1. On failure, the trainer + * needs to restart the entire initialization process (starting from + * @paddle_begin_init_param). Or simply exit the program and wait for + * the cluster management system to restart the trainer. + */ +int paddle_finish_init_params(paddle_pserver_client client); + +/** + * @brief paddle_send_grads sends gradients to parameter servers for + * updating parameters. + * + * @param grads the array of gradients to send. + * @param len the length of the gradient array. + * @param learning_rate the learning rate for the gradients. + * @return 0 if successful, otherwise -1. + */ +int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len); + +/** + * @brief paddle_get_params gets parameters from parameter servers. + * + * paddle_get_params will block until parameters are initialized on + * the parameter servers. + * + * @param dst the destination array of parameter pointers to save to. + * The parameter pointer must be pre-popullated with required parameter name, + * and the content of parameter must be pre-allocated of the size of required + * parameter on pserver. 
+ * @param len the length of the names array and the paddle_parameter
+ * array.
+ * @return 0 if successful, otherwise -1.
+ */
+int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len);
+
+/**
+ * @brief paddle_save_model indicates parameter servers to save the parameters
+ * to the given path.
+ *
+ * @param path the path to save parameters.
+ * @return 0 if successful, otherwise -1.
+ */
+int paddle_save_model(paddle_pserver_client client, const char* path);
+```
diff --git a/doc/v2/design/cluster_train/remote_parameter_updater.md b/doc/v2/design/cluster_train/remote_parameter_updater.md
new file mode 100644
index 0000000000000000000000000000000000000000..6e8e5938455b869e0f3367794c41250340b37f77
--- /dev/null
+++ b/doc/v2/design/cluster_train/remote_parameter_updater.md
@@ -0,0 +1,21 @@
+# Design Doc: Remote Parameter Updater for Cluster Train
+
+For an overview of distributed training, please refer to [distributed training design doc](README.md). In this design doc, we will discuss the parameter updater that will use the parameter server cclient ([The Client Library of Parameter Server Design Doc](pserver_client.md)) to manage and update parameters.
+
+## Parameter Updater
+
+The parameter updater is used by the trainer to manage and update parameters. There are mainly two kinds of parameter updaters: local and remote. Since this design is for cluster training, we will only discuss the remote parameter updater here.
+
+### Remote Parameter Updater
+
+The remote parameter updater manages parameters through the remote parameter server, using the client that communicates with the pserver ([The Client Library of Parameter Server Design Doc](pserver_client.md)).
+
+In the PaddlePaddle Python V2 API, the trainer is implemented in Python, and it will hold an instance of the parameter updater and call its functions directly. In this design, we will also expose the API of RemoteParameterUpdater to Python with SWIG.
+
+#### Sparse Remote Parameter Updater
+
+Since we will only implement dense parameter management now, the mechanism for sparse parameters will be discussed in the next stage.
+
+### Interface Design
+
+TBD
diff --git a/doc/v2/design/cluster_train/save_model.md b/doc/v2/design/cluster_train/save_model.md
new file mode 100644
index 0000000000000000000000000000000000000000..b755185c81ad617b9c85c47de0f5f65d2201c658
--- /dev/null
+++ b/doc/v2/design/cluster_train/save_model.md
@@ -0,0 +1,111 @@
+# Design Doc: Save Model
+
+## Overview
+
+The model is the output of the training process. There are two
+ways in which the user can obtain a model:
+
+- Save model triggered by user code: user code asks PaddlePaddle to
+  save a model.
+- Convert model from the checkpoint: model being converted from
+  pservers' periodic checkpoint. In this way, the user can cancel a
+  job at any time, and still have a relatively fresh model (we
+  checkpoint around every 5 minutes).
+
+### Trainer Saving Model vs. Pservers Saving Model
+
+Both trainers and pservers have access to the model. So the model can
+be saved from a trainer or pservers. We need to decide where the model
+is saved from.
+
+#### Dense Update vs. Sparse Update
+
+There are two types of model update methods: dense update and sparse
+update (when the model parameter is configured to be sparse).
+
+- Dense update
+
+  Every trainer has its own full copy of the model. Every model
+  update will update the entire model.
+
+- Sparse update
+
+  The training input is sparse, and the trainer does not have the
+  entire model.
+  It will only download the sub-model related to the input. When
+  updating the model, only the sub-model related to the training
+  input is updated.
+
+
+#### Pservers Saving Model
+
+The benefit of letting pservers save the model is that they have the entire
+model all the time. However, since pservers are on different nodes, it
+requires a merging process to merge model shards into the same
+model. This requires the pservers to write models to a distributed
+filesystem, making the checkpoint shards visible to the merge program.
+
+#### Trainer Saving Model
+
+The benefit of letting one trainer save the model is that it does not
+require a distributed filesystem. And it reuses the same save-model
+logic as when training locally - except that when doing sparse update, the
+trainer needs to download the entire model during the saving process.
+
+#### Conclusion
+
+Given that trainer saving the model does not require a distributed filesystem,
+and is an intuitive extension of saving the model when training
+locally, we decide to let the trainer save the model when doing
+distributed training.
+
+
+### Convert Model from Checkpoint
+
+TODO
+
+
+## Timeline
+
+We will first implement the trainer saving the model. Converting the latest
+snapshot to a model will be a TODO for the future.
+
+
+## Trainer Save Model
+
+### Trainer Election
+
+One trainer will be elected as the one to save the model. When using
+etcd, the trainer ID is a randomly generated UUID; the trainer will
+contact the master server requesting to save the model, and find out
+whether it is elected. When the master server is not used, unique
+trainer IDs will be given by the administrator, and the trainer whose ID
+is "0" is elected to save the model.
+
+### Model Save Path
+
+Each trainer will be given the directory to save the model. The
+elected trainer will save the model to
+`given-directory/trainerID`. Since the trainer ID is unique, this
+would prevent concurrent saves to the same file when multiple trainers
+are elected to save the model after a split-brain problem happens.
+
+### What Happens When Model Is Saving
+
+It takes some time to save the model, so we need to define what will happen
+while the model is being saved.
+
+When doing dense update, the trainer uses the local model. Pservers
+do not need to pause model updates.
+
+When doing sparse update, the trainer needs to download the entire
+model while saving. To get the most accurate model, the model update
+needs to be paused before the download starts and resumed after the
+download finishes. Otherwise, the trainer gets a model that is
+"polluted": some part of the model is old, some part of the model is
+new.
+
+It's unclear whether the "polluted" model will be inferior due to the
+stochastic nature of deep learning, and pausing the model update will
+add more complexity to the system. Since supporting sparse update is a
+TODO item, we defer the evaluation of whether to pause the model update
+while saving the model to the future.
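+To make the trainer election above concrete, here is a small, hypothetical Go sketch of a master-side RPC that grants the save-model request to exactly one trainer per round; the type and method names are invented for illustration and are not part of this design.
+
+```go
+package master
+
+import "sync"
+
+// SaveModelArbiter is a hypothetical helper: the first trainer that asks
+// in a round is elected to save the model, and every other trainer is told no.
+type SaveModelArbiter struct {
+	mu      sync.Mutex
+	granted string // trainer ID elected for the current round
+}
+
+// RequestSaveModel reports whether the calling trainer is elected.
+func (a *SaveModelArbiter) RequestSaveModel(trainerID string, elected *bool) error {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	if a.granted == "" {
+		a.granted = trainerID // first requester wins this round
+	}
+	*elected = a.granted == trainerID
+	return nil
+}
+```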
diff --git a/doc/v2/design/cluster_train/src/checkpointing.png b/doc/v2/design/cluster_train/src/checkpointing.png new file mode 100644 index 0000000000000000000000000000000000000000..c221e8474f90f37e31416cbb19c9452207a0d14c Binary files /dev/null and b/doc/v2/design/cluster_train/src/checkpointing.png differ diff --git a/doc/v2/design/cluster_train/src/data_dispatch.png b/doc/v2/design/cluster_train/src/data_dispatch.png new file mode 100644 index 0000000000000000000000000000000000000000..5bdcc24d6a6d193cb014f8c38b362451fded5e54 Binary files /dev/null and b/doc/v2/design/cluster_train/src/data_dispatch.png differ diff --git a/doc/v2/design/cluster_train/src/dataset.graffle b/doc/v2/design/cluster_train/src/dataset.graffle new file mode 100644 index 0000000000000000000000000000000000000000..c10a423ed16a23229a9ee33d11bfc82bb59646c8 Binary files /dev/null and b/doc/v2/design/cluster_train/src/dataset.graffle differ diff --git a/doc/v2/design/cluster_train/src/dataset.png b/doc/v2/design/cluster_train/src/dataset.png new file mode 100644 index 0000000000000000000000000000000000000000..2fb7f1cce3b6dd21489392557826e95a9f207c34 Binary files /dev/null and b/doc/v2/design/cluster_train/src/dataset.png differ diff --git a/doc/v2/design/cluster_train/src/file_storage.graffle b/doc/v2/design/cluster_train/src/file_storage.graffle new file mode 100644 index 0000000000000000000000000000000000000000..50a17e70fa255495337c529a3bf12a5c0024a5be Binary files /dev/null and b/doc/v2/design/cluster_train/src/file_storage.graffle differ diff --git a/doc/v2/design/cluster_train/src/file_storage.png b/doc/v2/design/cluster_train/src/file_storage.png new file mode 100644 index 0000000000000000000000000000000000000000..fccb4e3e7e738224c7f1584326bd5f351ce799aa Binary files /dev/null and b/doc/v2/design/cluster_train/src/file_storage.png differ diff --git a/doc/v2/design/cluster_train/src/init_lock.graffle b/doc/v2/design/cluster_train/src/init_lock.graffle new file mode 100644 index 0000000000000000000000000000000000000000..fa9149f21b1311eed48ef72ec55e556559d0fc94 Binary files /dev/null and b/doc/v2/design/cluster_train/src/init_lock.graffle differ diff --git a/doc/v2/design/cluster_train/src/init_lock.png b/doc/v2/design/cluster_train/src/init_lock.png new file mode 100644 index 0000000000000000000000000000000000000000..92404ee6d6c0f9a7727952bae3c869ba338ecd7f Binary files /dev/null and b/doc/v2/design/cluster_train/src/init_lock.png differ diff --git a/doc/v2/design/cluster_train/src/paddle-cloud-in-data-center.png b/doc/v2/design/cluster_train/src/paddle-cloud-in-data-center.png new file mode 100644 index 0000000000000000000000000000000000000000..da5d1a77562480ad1d886f5f21dbd84001d3d508 Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-cloud-in-data-center.png differ diff --git a/doc/v2/design/cluster_train/src/paddle-etcd.graffle b/doc/v2/design/cluster_train/src/paddle-etcd.graffle new file mode 100644 index 0000000000000000000000000000000000000000..f973dc9b9dbf72e9bc31e2d32822916cd281f8d9 Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-etcd.graffle differ diff --git a/doc/v2/design/cluster_train/src/paddle-etcd.png b/doc/v2/design/cluster_train/src/paddle-etcd.png new file mode 100644 index 0000000000000000000000000000000000000000..57981ceb4b94f0f7d6dfa63f3d28c0402bf9cc31 Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-etcd.png differ diff --git a/doc/v2/design/cluster_train/src/paddle-model-sharding.graffle 
b/doc/v2/design/cluster_train/src/paddle-model-sharding.graffle new file mode 100644 index 0000000000000000000000000000000000000000..fba30f0ca2b47f0d202a432821d95e55aac37ec8 Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-model-sharding.graffle differ diff --git a/doc/v2/design/cluster_train/src/paddle-model-sharding.png b/doc/v2/design/cluster_train/src/paddle-model-sharding.png new file mode 100644 index 0000000000000000000000000000000000000000..8c3f6724ef46c6527e63a4cd8cb0b50fe0167124 Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-model-sharding.png differ diff --git a/doc/v2/design/cluster_train/src/paddle-ps-0.png b/doc/v2/design/cluster_train/src/paddle-ps-0.png new file mode 100644 index 0000000000000000000000000000000000000000..47ef32806f182cab003da77f1556823b3f6d1721 Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-ps-0.png differ diff --git a/doc/v2/design/cluster_train/src/paddle-ps-1.png b/doc/v2/design/cluster_train/src/paddle-ps-1.png new file mode 100644 index 0000000000000000000000000000000000000000..f3125db73096c52bac6e7c60e1675552857c0774 Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-ps-1.png differ diff --git a/doc/v2/design/cluster_train/src/paddle-ps.graffle b/doc/v2/design/cluster_train/src/paddle-ps.graffle new file mode 100644 index 0000000000000000000000000000000000000000..0e536ffdd91cd696008b4c01bad3cb53edebdc16 Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-ps.graffle differ diff --git a/doc/v2/design/cluster_train/src/paddle-task-queues.graffle b/doc/v2/design/cluster_train/src/paddle-task-queues.graffle new file mode 100644 index 0000000000000000000000000000000000000000..4263ed8bfd2ef0e55058828bf23f2fac3595e5fd Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-task-queues.graffle differ diff --git a/doc/v2/design/cluster_train/src/paddle-task-queues.png b/doc/v2/design/cluster_train/src/paddle-task-queues.png new file mode 100644 index 0000000000000000000000000000000000000000..5f980266795776752cebd0c346b85c4a75a47780 Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-task-queues.png differ diff --git a/doc/v2/design/cluster_train/src/paddle-task-states.graffle b/doc/v2/design/cluster_train/src/paddle-task-states.graffle new file mode 100644 index 0000000000000000000000000000000000000000..cf1a0b9246d9386a949d2dbb8c32fe84f72eea83 Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-task-states.graffle differ diff --git a/doc/v2/design/cluster_train/src/paddle-task-states.png b/doc/v2/design/cluster_train/src/paddle-task-states.png new file mode 100644 index 0000000000000000000000000000000000000000..4ae43cb66c071aee9eb90d875e2373b29af9c3e0 Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-task-states.png differ diff --git a/doc/v2/design/cluster_train/src/pserver_init.graffle b/doc/v2/design/cluster_train/src/pserver_init.graffle new file mode 100644 index 0000000000000000000000000000000000000000..5f3f1f52be8aa7f9049a8fcd6b7c93c8560c1676 Binary files /dev/null and b/doc/v2/design/cluster_train/src/pserver_init.graffle differ diff --git a/doc/v2/design/cluster_train/src/pserver_init.png b/doc/v2/design/cluster_train/src/pserver_init.png new file mode 100644 index 0000000000000000000000000000000000000000..dfe491ff98dd7db1c336093c80964a260df2cd90 Binary files /dev/null and b/doc/v2/design/cluster_train/src/pserver_init.png differ diff --git a/doc/v2/design/cluster_train/src/submit-job.graffle 
b/doc/v2/design/cluster_train/src/submit-job.graffle new file mode 100644 index 0000000000000000000000000000000000000000..677cdfb6d9a32168bf71729eb841fa1ca0dd31d6 Binary files /dev/null and b/doc/v2/design/cluster_train/src/submit-job.graffle differ diff --git a/doc/v2/design/cluster_train/src/submit-job.png b/doc/v2/design/cluster_train/src/submit-job.png new file mode 100644 index 0000000000000000000000000000000000000000..3046a460a7ba708079e88a560debaa215a694680 Binary files /dev/null and b/doc/v2/design/cluster_train/src/submit-job.png differ diff --git a/doc/v2/design/cluster_train/src/trainer.graffle b/doc/v2/design/cluster_train/src/trainer.graffle new file mode 100644 index 0000000000000000000000000000000000000000..43415ed8cf61a5acfa34f8e56b9577f338dbf254 Binary files /dev/null and b/doc/v2/design/cluster_train/src/trainer.graffle differ diff --git a/doc/v2/design/cluster_train/src/trainer.png b/doc/v2/design/cluster_train/src/trainer.png new file mode 100644 index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0 Binary files /dev/null and b/doc/v2/design/cluster_train/src/trainer.png differ diff --git a/doc/v2/design/cluster_train/submit-job.md b/doc/v2/design/cluster_train/submit-job.md new file mode 100644 index 0000000000000000000000000000000000000000..8377d5489dc64bd2fdc5bb4f7bc737e7b489000d --- /dev/null +++ b/doc/v2/design/cluster_train/submit-job.md @@ -0,0 +1,127 @@ +# Submit a Distributed Training Job + +The user can submit a distributed training job with Python code, rather than with a command-line interface. + +## Runtime Environment On Kubernetes + +For a distributed training job, there is two Docker image called *runtime Docker image* and *base Docker image*. The runtime Docker image is the Docker image that gets scheduled by Kubernetes to run during training. The base Docker image is for building the runtime Docker image. + +### Base Docker Image + +Usually, the base Docker image is PaddlePaddle product Docker image including paddle binary files and python package. And of course, users can specify any image name hosted on any docker registry which users have the access right. + +### Runtime Docker Image + +The trainer package which user upload and some Python dependencies are packaged into a runtime Docker image based on base Docker image. + +- Handle Python Dependencies + + You need to provide requirements.txt file in your `trainer-package` folder. Example: + + ```txt + pillow + protobuf==3.1.0 + ``` + More [details](https://pip.readthedocs.io/en/1.1/requirements.html) about requirements, an example project looks like: + ```bash + paddle_example + |-quick_start + |-trainer.py + |-dataset.py + |-requirements.txt + ``` + +## Submit Distributed Training Job With Python Code + + +- `paddle.job.dist_train()` will call the Job Server API `/v1/packages` to upload the trainer package and save them on CephFS, and then call `/v1/trainer/job` to submit the PaddlePaddle distributed job. +- `/v1/trainer/job` will start a building job for preparing the runtime Docker image. When the building job is finished, Job Server will submit the PaddlePaddle distributed job to Kubernetes. +- *NOTE*: For the first version, we will not prepare the runtime Docker image, instead, the package is uploaded to Paddle Cloud, and Paddle Cloud will mount the package in a temporary folder into the base Docker image. We will not support custom Python dependencies in the first version as well. 
+ +You can call `paddle.job.dist_train` and provide distributed training configuration as the parameters: +```python +paddle.job.dist_train( + trainer=dist_trainer(), + paddle_job=PaddleJob( + job_name = "paddle-cloud", + entry_point = "python %s"%__file__, + trainer_package = "/example/word2vec", + image = "yancey1989/paddle-job", + trainers = 10, + pservers = 3, + trainer_cpu = 1, + trainer_gpu = 1, + trainer_mem = "10G", + pserver_cpu = 1, + pserver_mem = "2G" + )) +``` + +The parameter `trainer` of `paddle.job.dist_train` is a function and you can implement it as follows: +```python +def dist_trainer(): + def trainer_creator(): + trainer = paddle.v2.trainer.SGD(...) + trainer.train(...) + return trainer_creator +``` + +The pseudo code of `paddle.job.dist_train` is as follows: +```python +def dist_train(trainer, paddle_job): + # if the code is running on cloud, set PADDLE_ON_CLOUD=YES + if os.getenv("RUNNING_ON_CLOUD", "NO") == "NO": + #submit the paddle job + paddle_job.submit() + else: + #start the training + trainer() +``` +### PaddleJob Parameters +parameter | type | explanation + --- | --- | --- +job_name | str | the unique name for the training job +entry_point | str | entry point for startup trainer process +trainer_package | str | trainer package file path which user have the access right +image|str|the [base image](#base-docker-image) for building the [runtime image](#runtime-docker-image) +pservers|int| Parameter Server process count +trainers|int| Trainer process count +pserver_cpu|int| CPU count for each Parameter Server process +pserver_mem|str| memory allocated for each Parameter Server process, a plain integer using one of these suffixes: E, P, T, G, M, K +trainer_cpu|int| CPU count for each Trainer process +trainer_mem|str| memory allocated for each Trainer process, a plain integer using one of these suffixes: E, P, T, G, M, K +trainer_gpu|int| GPU count for each Trainer process, if you only want CPU, do not set this parameter + +### Deploy Parameter Server, Trainer and Master Process + - Deploy PaddlePaddle Parameter Server processes, it's a Kubernetes ReplicaSet. + - Deploy PaddlePaddle Trainer processes, it's a Kubernetes Job. + - Deploy PaddlePaddle Master processes, it's a Kubernetes ReplicaSet. + +## Job Server + +- RESTful API + + Job server provides RESTful HTTP API for receiving the trainer package and displaying + PaddlePaddle job related informations. + - `POST /v1/package` receive the trainer package and save them on CephFS + - `POST /v1/trainer/job` submit a trainer job + - `GET /v1/jobs/` list all jobs + - `GET /v1/jobs/` the status of a job + - `DELETE /v1/jobs/` delete a job + - `GET /v1/version` job server version + +- Build Runtime Docker Image on Kubernetes + + `paddle.job.dist_train` will upload the trainer package to Job Server, save them on the distributed filesystem, and then start up a job for building the runtime Docker image that gets scheduled by Kubernetes to run during training. + + There are some benefits for building runtime Docker image on JobServer: + - On Paddle Cloud, users will run the trainer code in a Jupyter Notebook which is a Kubernetes Pod, if we want to execute `docker build` in the Pod, we should mount the host's `docker.sock` to the Pod, user's code will connect the host's Docker Engine directly, it's not safe. + - Users only need to upload the training package files, does not need to install docker engine, docker registry as dependencies. 
+ - If we want to change another image type, such as RKT, users do not need to care about it. + +- Deploy Parameter Server, Trainer and Master Processes + + `POST /v1/trainer/job` receives the distributed training parameters, and deploy the job as follows: + - Deploy PaddlePaddle Parameter Server processes, it's a Kubernetes ReplicaSet. + - Deploy PaddlePaddle Trainer processes, it's a Kubernetes Job. + - Deploy PaddlePaddle Master processes, it's a Kubernetes ReplicaSet. diff --git a/doc/v2/design/interface/00.why_plain_c.md b/doc/v2/design/interface/00.why_plain_c.md new file mode 100644 index 0000000000000000000000000000000000000000..826ff3141bc2512b525cb44ac0f18b376ce57e92 --- /dev/null +++ b/doc/v2/design/interface/00.why_plain_c.md @@ -0,0 +1,118 @@ +# Paddle多语言接口实现 +## 背景 + +Paddle需要一个多语言接口,这个接口需要做到: + +* 有标准的,良好的文档 + * 例如Python可以使用[Sphinx](http://www.sphinx-doc.org/en/stable/)生成API文档,golang可以使用[GoDoc](https://godoc.org/golang.org/x/tools/cmd/godoc)生成文档。这都需要这个接口按照约定俗成的规则来注释完备。 +* 不同语言的接口适应不同语言的特性 + * 例如Java与Python的错误处理是直接扔出来Exception,而对于golang错误处理应该使用返回值。 + +## 基本要求 + +Paddle的多语言接口实现包括一下几个方面: + +* 我们使用动态库来分发Paddle。在这个动态库中不嵌入任何其他语言的解释器,也不使用其他动态库。 +* 这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号。 +* 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler)。 +* 不使用SWIG这种代码生成器,而是手写多语言绑定。 + + +## 原因 + +### 使用动态库来分发Paddle + +* Paddle的链接方式比较复杂 + * 如果用户要把Paddle的静态库(libpaddle.a)链接到自己的程序里,得使用 `--whole-archive` (for GCC) 或者 `--force_load` (for Clang) 参数,来确保把 libpaddle.a 里所有的符号都写入自己的程序的二进制文件里。这是因为 Paddle 的源码里使用了[object factory design pattern](http://stackoverflow.com/a/1310326/724872)。 +* 编译型语言,例如C/C++使用静态库和动态库难度差不多。但是解释性语言,例如[Python](http://stackoverflow.com/questions/19560594/how-to-import-static-library-in-python)或者[Java](http://stackoverflow.com/questions/24493337/linking-static-library-with-jni),只能调用Paddle的动态库,否则得把Paddle静态库链接到解释器里。 + * 解释性语言实际运行的二进制是解释器本身,如果调用静态库只能将静态库与解释器链接。例如对于Java来说,便是将静态库加入JVM中。这对于通常的Java的开发者来说,是不常见的做法。 + +### 动态库中不嵌入任何其他语言的解释器 + +* 目前Paddle的进程模型是C++内部驱动Python解释器进行模型配置解析和数据读取 +* 我们最终的动态库中不嵌入Python或者其他任何语言的解释器。模型配置解析,数据读取均交由其他语言完成 + +现阶段Paddle有一个问题是,Paddle内嵌的Python解释器和外部使用的Python如果版本不同,会直接报错退出。 + +### Paddle动态库中,不引用其他动态库 + +* 即这个动态库是不依赖于其他任何文件的,可以在任何机器上执行的。 + +### 这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号 + +* 由于C++编译器没有[名字修饰](https://en.wikipedia.org/wiki/Name_mangling#C.2B.2B)的规范,不同版本的编译器之间,对于同一段C++代码生成的符号可能不一致。而多语言接口需要直接读取生成的二进制(动态库),需要有稳定的导出符号。 +* C语言是有导出符号的标准的,并且在常见的平台上,都是ABI调用标准的。 +* 大多数语言都支持使用C语言API +* 使用C99而不使用C89,是因为C99支持[Fixed-width integer types](https://en.wikipedia.org/wiki/C_data_types#Fixed-width_integer_types)和[Boolean type](https://en.wikipedia.org/wiki/C_data_types#Boolean_type)。 +* 使用C99而不使用C11的原因是,[C11](https://en.wikipedia.org/wiki/C11_(C_standard_revision))并没有Paddle特别需要的特性,且C99相对于C11使用更加广泛。 + +### 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler) + +* Paddle内部的类为C++书写,直接导出到C的接口比较困难。 +* 在C-API中使用`void*`来表示Paddle内部类。再在每一个API中自己检查类型。 + +在C的头文件 `paddle_matrix.h` 中: + +```C +typedef void* paddle_matrix; +typedef int paddle_error; + +extern "C" +paddle_error paddle_matrix_get_shape(paddle_matrix matrix, + uint64_t* width, + uint64_t* height); +``` +而在CPP里面实现这个C的接口,文件 `paddle_matrix.cpp` + +```cpp +#include "paddle/legacy/math/matrix.h" +extern "C" +paddle_error paddle_matrix_shape(paddle_matrix matrix, + uint64_t *width, + uint64_t *height) { + auto m = (paddle::capi::CMatrix*)(matrix); + *width = m->width(); + *height = m->height(); +} +``` + +其中`paddle/capi/CMatrix.hpp`文件内容为: + +```cpp +namespace paddle { +namespace math { + +class CMatrix { + std::shared_ptr mat; +}; + +} // 
namespace math +} // namespace paddle +``` + +### 不使用SWIG这种代码生成器,而是手写多语言绑定 + +* [SWIG](http://www.swig.org/)是一个多语言接口的代码生成器。他的目标是使用C/C++写代码,SWIG直接读取C/C++的头文件,生成各种语言的绑定代码。 + * 对于多语言接口,SWIG需要写一个interface文件。这个文件具有独特的语法,学习成本高。且增加一个第三方语言,就需要对这个第三方语言增加一些定义。有的时候,interface文件的写法非常[tricky](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/api/Paddle.swig#L36)。社区贡献代码学习成本高。 + * SWIG暴露的接口保留了C++的接口样式,很难保证多语言代码风格的一致性。(函数命名,错误处理) + * 因为SWIG在第三方语言中暴露的函数名,类名和C++中完全一致。C++的命名风格并不能适应其他第三方语言。如果使用SWIG我们需要将在interface文件里,将大量的`SomeCppClass`重命名成`some_python_class`,或者`SomeGoTypes`。 + * 对于不同语言,错误处理的方式也不尽相同。例如对于Java或者Python,最常见的错误处理方式是Exception,而对于Golang,错误处理方式是返回值。而SWIG只能简单的暴露C++接口,无法做到对于各种语言错误处理方式的适配。 + * 对于大多数语言,直接使用C语言的.h并不困难。例如Python的[cffi](https://cffi.readthedocs.io/en/latest/overview.html#simple-example-abi-level-in-line)或者[Cython](http://cython.org/), golang的[cgo](https://golang.org/cmd/cgo/)。 + * SWIG支持的语言或者解释器有局限。例如对于Python,使用SWIG只支持CPython解释器,而不支持PyPy解释器。 + + +## 原因列表 + +| 结论 | 对比 | 原因 | +|---| --- | --- | +| 使用动态库 | 不使用静态库 | 解释型语言只能调用动态库,Paddle静态库链接复杂 | +| 不嵌入其他语言解释器 | 不嵌入Python解释器 | Paddle C++目前嵌入Python解释器,会导致不同版本Python在一个进程里的bug | +| 不引用其他动态库 | | Paddle一个动态库可以在任何Linux系统上运行 | +| 使用C99做接口 | 不使用C++做接口 | C有标准的ABI,C99是目前C最广泛的使用标准,且C99支持bool类型和定长整数(uint64_t等)类型 | +| 使用void*作为类句柄 | 不显示的写每个类具体包含什么| 实现简单,并且让接口脱离实现细节 | +| 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置,社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 | + + +## 实现 + +参考[Inference implementation](01.inference_implementation.md) diff --git a/doc/v2/design/interface/01.inference_implementation.md b/doc/v2/design/interface/01.inference_implementation.md new file mode 100644 index 0000000000000000000000000000000000000000..9820284523246a062581f322616d196f575c9d29 --- /dev/null +++ b/doc/v2/design/interface/01.inference_implementation.md @@ -0,0 +1,131 @@ +# C-API 模型推断实现文档 + +本文档描述Paddle C-API的实现细节。Paddle C-API是多语言API的基础部分。Paddle需要暴露的API很多。先实现模型推断的API,通过模型推断API的实现作为一个样例,来进行讨论。至于为什么需要C-API,请参考[Why Plain C](./00.why_plain_c.md)。 + +## Table of Contents + * [C-API 模型推断实现文档](#c-api-模型推断实现文档) + * [暴露接口原则](#暴露接口原则) + * [目录结构](#目录结构) + * [实现方式](#实现方式) + * [capi.h](#capih) + * [具体某种类型的头文件](#具体某种类型的头文件) + * [capi_private.h](#capi_privateh) + * [具体某种类型的实现文件](#具体某种类型的实现文件) + * [libpaddle_capi_shared.{so, dylib}](#libpaddle_capi_sharedso-dylib) + * [libpaddle_capi_whole.a](#libpaddle_capi_wholea) + * [examples](#examples) + * [编译选项](#编译选项) + + +## 暴露接口原则 + +1. 所有的接口均为C接口。即使用`extern "C"` +2. 除构造某种类型的函数(`paddle_matrix_create`等),其他函数均返回`paddle_error`。且调用时不能抛出异常或出现运行时错误。 +3. 所有类型名为`paddle_类型名`,所有与类型相关的函数,函数名为`paddle_类型名_函数名` +4. 如果某一个Paddle Core概念(GradientMachine/Matrix)需要被暴露到其他语言,那么 + * 为了暴露的接口尽量简单。只暴露概念的接口,而不暴露概念的实现。即暴露`GradientMachine`或者`Matrix`但不暴露`RecurrentGradientMachine`和`CpuSparseMatrix`。 + * 暴露这个概念必要函数。`必要`是指,即完成某一个任务的最少函数。 +5. 不在`capi`接口层做过多封装。 + * 如果某一个Paddle概念必须要暴露,但是又过于琐碎。不在`capi`这一层进行封装,而是直接修改Paddle Core。让Paddle核心中,这一概念不再琐碎。 + + +## 目录结构 + +```text +Paddle + `-- paddle + `-- capi + `-- examples # The example project for C-API. + `-- tests # unittests for C-API + `-- capi.h # C-API header file. + `-- capi_private.h # The shared header file between implementation sources. + `-- matrix.{h, cpp} + `-- gradient_machine.{h, cpp} + `-- ... +``` + + +Paddle的C-API目录结构如上图表所示。这个目录中除了`capi_private.h`之外的所有头文件,均会被安装到include/paddle路径下。C-API生成的二进制文件会被安装到`lib`目录下。即,安装后的目录结构为 + +```text +`-- include + `-- paddle + `-- capi.h + `-- matrix.h + `-- gradient_machine.h + `-- ... 
+`-- lib + `-- libpaddle_capi_shared.{so, dylib} # In mac, dynamic libary's file name extention is `dylib` + `-- libpaddle_capi_whole.a # static library for all symbols of Paddle. +``` + +## 实现方式 + +下面分别介绍某一类文件的实现方式。 + +### capi.h + +`capi.h`是用户使用C-API时所唯一需要引入的头文件。在`capi.h`中,引入了类型的头文件,`matrix.h`, `gradient_machine.h`。在引入其他类型的头文件时,使用相对路径的引用方式。即`#include "matrix.h"` + +### 具体某种类型的头文件 + +具体某种类型的头文件,即例如`matrix.h`,`gradient_machine.h`等。在这些头文件中,包含了某种类型的类型定义和暴露的全部函数。 + +这个头文件不假设其他文件的引用顺序,即使用户直接引用某种类型的头文件,也不应该报错(虽然不鼓励这样)。如果某一个类型需要引用另一个类型,例如`gradient_machine`需要引用`matrix`,则直接引入另一种类型的头文件,即`#include "matrix.h"`。 + +### capi_private.h + +`capi_prviate.h`是各个实现中共享的头文件,他主要包含了实际暴露的类型结构。在用户使用C-API时,Paddle的类型全部退化成`void *`,即`typedef paddle_matrix void*`。但,对于每种C-API暴露的类型,均是在`capi_private.h`中实现的结构体。 + +```cpp +struct CMatrix { + int type = MatrixType; + std::shared_ptr mat; +}; +``` + +通常,这个结构体包含两个项目。 + +* `type`是一个类型的标志。对于每种类型,type字段均不尽相同。这样,即使C-API接受的类型全是`void *`,我们也可以确定每一个参数的类型。 + + ```cpp + void some_c_api_function(void* some_instance) { + int* type = (int *) some_instance; + switch (*type) { + case MatrixType: + CMatrix* mat = (CMatrix *) some_instance; + ... + ... + } + } + ``` +* 这个结构体中的另一个项目是,Paddle Core中这一类型接口的智能指针(shared_ptr)。 + * 使用智能指针的原因是: 用户可以安全的释放某个C-API的实例,而不必在意Paddle Core是否还在使用这个实例。 + * 例如,用户通过C-API获得了神经网络的参数实例。当用户使用完这个参数后,直接删除这个参数即可。即便Paddle Core中的模型还在使用这个参数,这个参数也不会一并删除。 + +### 具体某种类型的实现文件 + +具体某种类型的实现文件,即`matrix.cpp`, `gradient_machine.cpp`等文件。在这些文件中,使用C++ 11实现了C-API的接口,并且使用`extern "C"`导出这些接口。在实现过程中,对输入参数的安全性进行了必要的判断,并将C-API接口的参数转发给`Paddle Core`。 + +### libpaddle\_capi_shared.{so, dylib} + +`libpaddle_capi_shared`是C-API导出的动态库。这个动态库的连接参数与Paddle的其他二进制(例如`paddle_trainer`)类似。用户可以直接使用这个动态库来引入Paddle C-API。具体使用方法为`-lpaddle_capi_shared`。 + +### libpaddle\_capi_whole.a + +`libpaddle_capi_whole`是C-API导出的静态库。这个静态库包含了Paddle的全部符号。他是将`libpaddle_gserver.a`, `libpaddle_math.a`, `libpaddle_capi.a`等全部静态库中的目标文件全部打包后产生的文件。具体使用方法为`--whole-archive -lpaddle_capi_whole --no-whole-archive`。 + + +### examples + +在样例中,使用`C99`开发了模型预测的样例代码。具体请参考[example/README.md](../../../paddle/capi/examples/README.md)。 + +## 编译选项 + +C-API的编译选项默认关闭,打开这个编译选项,需要在cmake的时候,设置 + +```bash +cmake ${YOUR_SOURCE_ROOT} -DWITH_C_API=ON -DWITH_PYTHON=OFF -DWITH_SWIG_PY=OFF +``` + +编译C-API的时候推荐Paddle不嵌入Python解释器,也不生成`SWIG`接口,具体原因参考[Why Plain C](./00.why_plain_c.md)。 diff --git a/doc/v2/design/interface/index_cn.rst b/doc/v2/design/interface/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..2509a5c5f4182d8ce3a16a3b7bd92c0d7bf5b056 --- /dev/null +++ b/doc/v2/design/interface/index_cn.rst @@ -0,0 +1,7 @@ +多语言接口 +------------ + +.. toctree:: + :maxdepth: 1 + + 00.why_plain_c.md diff --git a/doc/v2/design/interface/index_en.rst b/doc/v2/design/interface/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..356e58c39c5ef6ee5ee50ab999b85f88628bfb85 --- /dev/null +++ b/doc/v2/design/interface/index_en.rst @@ -0,0 +1,7 @@ +Multilingual Interface +----------------------- + +.. 
toctree:: + :maxdepth: 1 + + 00.why_plain_c.md diff --git a/doc/v2/design/mkl/image/engine.png b/doc/v2/design/mkl/image/engine.png new file mode 100644 index 0000000000000000000000000000000000000000..1f5f65c2cc765a514a3ba9e7b7f468e1dc4b0c3b Binary files /dev/null and b/doc/v2/design/mkl/image/engine.png differ diff --git a/doc/v2/design/mkl/image/gradients.png b/doc/v2/design/mkl/image/gradients.png new file mode 100644 index 0000000000000000000000000000000000000000..f031bcf8e4cec14e63075b8b9d2c7bbd9f1b1a3c Binary files /dev/null and b/doc/v2/design/mkl/image/gradients.png differ diff --git a/doc/v2/design/mkl/image/layers.png b/doc/v2/design/mkl/image/layers.png new file mode 100644 index 0000000000000000000000000000000000000000..306f79b7a844610915eb8944128f57d2b7a3065a Binary files /dev/null and b/doc/v2/design/mkl/image/layers.png differ diff --git a/doc/v2/design/mkl/image/matrix.png b/doc/v2/design/mkl/image/matrix.png new file mode 100644 index 0000000000000000000000000000000000000000..c33ce9cf0335e47cc8c1253304d0fe179186e6f2 Binary files /dev/null and b/doc/v2/design/mkl/image/matrix.png differ diff --git a/doc/v2/design/mkl/image/overview.png b/doc/v2/design/mkl/image/overview.png new file mode 100644 index 0000000000000000000000000000000000000000..8fb7bbb9dd654bf363d701d0c8cd4a557043d188 Binary files /dev/null and b/doc/v2/design/mkl/image/overview.png differ diff --git a/doc/v2/design/mkl/mkl_packed.md b/doc/v2/design/mkl/mkl_packed.md new file mode 100644 index 0000000000000000000000000000000000000000..0123315ad4368e68b377f66119949bfd6c1c7860 --- /dev/null +++ b/doc/v2/design/mkl/mkl_packed.md @@ -0,0 +1,108 @@ +# Intel® MKL Packed on PaddlePaddle: Design Doc + + +## Contents + +- [Overview](#overview) +- [Key Points](#key-points) + - [Background](#background) + - [Solution](#solution) +- [Actions](#actions) + - [CMake](#cmake) + - [Layers](#layers) + - [Unit Tests](#unit-tests) + - [Python API](#python-api) + - [Benchmarking](#benchmarking) + + +## Overview +我们计划将 Intel® MKL 中引入的 GEMM Packed APIs\[[1](#references)\] 集成到 PaddlePaddle 中,充分发挥英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 +现阶段的优化主要针对 Recurrent Neural Network(以下简称RNN)相关层(包括`RecurrentLayer`, `GatedRecurrentLayer`和`LstmLayer`), 以及 PaddlePaddle V1 API。 + +## Key Points + +### Background +目前PaddlePaddle采用了 Intel® MKL库的[cblas_?gemm](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm)函数,这个函数本身会在计算前将原数据转换为更适合英特尔平台的内部格式。 + +1. 转换耗时 \ +这一数据格式的转换操作(Packing),在问题本身的计算量比较小的时候,显得相对来说较为耗时。例如在DeepSpeech2 \[[2](#references)\] 的Vanilla RNN部分中,矩阵大小是`batch_size * 2048`。 +2. 
转换冗余 \ +由于在现有的某些情况下(例如RNN),多次调用 cblas_?gemm 会使用相同的原数据,因此,每次调用时对原数据的重复Packing便成为了冗余。 + +为了最大程度减少多次调用 cblas_?gemm 在Packing上的耗时,Intel® MKL 引入了以下四个API: + * [cblas_?gemm_alloc](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-alloc) + * [cblas_?gemm_pack](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-pack) + * [cblas_?gemm_compute](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-compute) + * [cblas_?gemm_free](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-free) + +通过使用这些API,我们可以先完成对原数据的Packing操作,再把已转换为Packed格式的数据传递给那些复用同一数据的gemm_compute函数,从而避免了Packing冗余。 + +### Solution +在RNN的情况下,同一次前向、后向(forward/backward)过程中所有时间步(time step)共享同一个权重(weight)。当只做推断(inference)时,各次前向之间也都使用了相同的权重,没有必要在每次前向中每个时间步的计算时对权重进行重复的Packing操作。 + +我们通过使用新引入的GEMM Packed APIs,在层初始化的时候,先完成对权重的Packing操作,然后在前向,后向时复用已经转换过的权重,并在每次权重更新后,对新的权重进行转换用于下次迭代。 + +* 优化前,对于序列长度(sequence length)为`T`的网络模型(model), `N`次迭代执行的转换次数为: + - `inference`: `N * T` + - `training`: `2 * N * T` +* 优化后,对于同样设置的网络模型,其转换次数减少至: + - `inference`: `1` + - `training`: `2 * N` + +## Actions + +添加的相关文件和目录结构如下: + +```txt +PaddlePaddle/Paddle +├── ... +└── paddle/ + ├── ... + └── gserver/ + ├── ... + ├── layers/ + │ ├── ... + │ ├── MKLPackedRecurrentLayer.* + | ├── MKLPackedGatedRecurrentLayer.* + | ├── MKLPackedLstmLayer.* + | └── MKLPackedGemm.h + └── tests/ + ├── ... + └── test_MKLPacked.cpp +``` + +### CMake +在对应的`CMakeLists.txt`中根据`WITH_MKL`是否打开,来决定是否开启MKL Packed相关功能。 + +### Layers +所有的`MKLPacked*Layer`都继承于PaddlePaddle的基类`Layer`, 并添加头文件 `MKLPackedGemm.h`,该文件对相关GEMM Packed APIs做了封装。 + +### Unit Tests +我们会添加`test_MKLPacked.cpp`用于MKL Packed优化后layer的测试。 +对于每一个新加的RNN layer,我们会对比如下2个方面: +1. 对比优化后layer自身,sequence mode(`rnn_use_batch=false`)与batch mode(`rnn_use_batch=true`)的结果。 +2. 对比优化后layer与相对应的PaddlePaddle原有layer, 在batch mode下的结果。 + +### Python API +计划在`paddle/utils.Flags`中添加`use_mkl_packed`的flag,用于选择是否使用相关功能,并且当编译时`WITH_MKL=ON`的情况下,默认设置为`true`。 + +同时,在`python/paddle/trainer/config_parser.py`中对应的layer处,添加`use_mkl_packed`这个选择,方便用户在Python端选择是否启用这个功能。 + +具体实现方式比如: + +```python +use_mkl_packed = bool(int(g_command_config_args.get("use_mkl_packed", 0))) +if use_mkl_packed: + self.layer_type = mkl_packed_* +``` + +所有相关的`layer_type`会以*mkl_packed_*开头,这些会在`MKLPacked*Layer`注册layer的时候保证,以示区分。 + + +### Benchmarking +会添加相应的脚本用于测试和对比在使用MKL Packed recurrent layers 前后的网络性能。 + +## References +1. [Introducing the new Packed APIs for GEMM](https://software.intel.com/en-us/articles/introducing-the-new-packed-apis-for-gemm) +2. [DeepSpeech2 on PaddlePaddle](https://github.com/PaddlePaddle/DeepSpeech#deepspeech2-on-paddlepaddle) + diff --git a/doc/v2/design/mkl/mkldnn.md b/doc/v2/design/mkl/mkldnn.md new file mode 100644 index 0000000000000000000000000000000000000000..4876de0045979be20fa45bdc84d2594516f71c03 --- /dev/null +++ b/doc/v2/design/mkl/mkldnn.md @@ -0,0 +1,237 @@ +# Intel® MKL-DNN on PaddlePaddle: Design Doc + +我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn) +(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle, +充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 + +
+
+Figure 1. PaddlePaddle on IA +
+ +近期目标 + +- 完成常用Layer的MKL-DNN实现。 +- 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。 + +目前的优化,主要针对PaddlePaddle在重构之前的代码框架以及V1的API。 +具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。 + +## Contents + +- [Overview](#overview) +- [Actions](#actions) + - [CMake](#cmake) + - [Matrix](#matrix) + - [Layers](#layers) + - [Activations](#activations) + - [Parameters](#parameters) + - [Gradients](#gradients) + - [Unit Tests](#unit-tests) + - [Python API](#python-api) + - [Benchmarking](#benchmarking) + - [Others](#others) +- [Design Concerns](#design-concerns) + +## Overview + +我们会把MKL-DNN会作为第三方库集成进PaddlePaddle,与其他第三方库一样,会在编译PaddlePaddle的时候下载并编译MKL-DNN。 + +同时,为了进一步提升PaddlePaddle在基本数学运算的计算速度,我们也将MKLML即(MKL small library\[[1](#references)\]) +作为另一个第三方库集成进PaddlePaddle,它只会包括生成好的动态库和头文件。 + +MKL,MKLML以及MKL-DNN三者关系如下表: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Name | Open Source | License | Descriptions |
+| --- | --- | --- | --- |
+| MKL | No | Proprietary | Accelerate math processing routines |
+| MKLML | No | Proprietary | Small package of MKL, especially for Machine Learning |
+| MKL-DNN | Yes | Apache 2.0 | Accelerate primitives processing routines especially for Deep Neural Networks |
+ +MKLML可以与MKL-DNN共同使用,以此达到最好的性能。 + +
+
+Figure 2. PaddlePaddle with MKL Engines +
+ +## Actions + +添加的相关文件和目录结构如下: + +```txt +PaddlePaddle/Paddle +├── ... +├── cmake/ +│ ├── external/ +│ │ ├── ... +│ │ ├── mkldnn.cmake +│ │ └── mklml.cmake +└── paddle/ + ├── ... + ├── math/ + │ ├── ... + │ └── MKLDNNMatrix.* + └── gserver/ + ├── ... + ├── layers/ + │ ├── ... + │ └── MKLDNN*Layer.* + ├── activations/ + │ ├── ... + │ └── MKLDNNActivations.* + └── tests/ + ├── ... + ├── MKLDNNTester.* + └── test_MKLDNN.cpp +``` + +### CMake +在`CMakeLists.txt`中提供一个与MKL有关的总开关:`WITH_MKL`,它负责决定编译时是否使用MKLML和MKL-DNN + +- `WITH_MKLML` 控制是否使用MKLML库。 +当打开`WITH_MKL`时,会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 +编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。 +MKLML的库目前都是动态库,主要包括`libiomp5.so`和`libmklml_intel.so`。 +- `WITH_MKLDNN` 控制是否使用MKL-DNN。 +当开启`WITH_MKL`时,会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。 +编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。 +MKL-DNN的库目前只有动态库`libmkldnn.so`。 + +### Matrix +目前在PaddlePaddle中数据都是以`NCHW`的格式存储,但是在MKL-DNN中的排列方式不止这一种。 +所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。 + +
+
+Figure 3. MKLDNNMatrix +
+ +### Layers +所有MKL-DNN的Layers都会继承于`MKLDNNLayer`,该类继承于PaddlePaddle的基类`Layer`。 +在`MKLDNNLayer`中会提供一些必要的接口和函数,并且会写好`forward`和`backward`的基本逻辑, +子类只需要使用定义好的接口,实现具体的函数功能即可。 + +
+
+Figure 4. MKLDNNLayer +
+ +每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix: + +- 内部存储(internel memory):`inVal_`,`inGrad_`,`outVal_`和`outGrad_`,分别代表输入数据,输入梯度,输出数据和输出梯度。 +- 外部存储(external memory):都是以ext开头,比如`extInVal_`和`extInGrad_`,它们主要是用于, +当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时,转换内存的工作。 +需要注意的是,PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`, +所以`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存, +如果不需要外部存储用于转换,那么对应的内部存储也会与它们共享内存。 +- 转换函数(resetXXX): 包括`resetInValue`,`resetInGrad`,`resetOutValue`和`resetOutGrad`, +表示对输入数据,输入梯度,输出数据和输出梯度的转换。 +这些函数会根据输入参数重新设置内部和外部存储,当然这两者也可以相等,即表示不需要转换。 + +注意:每个`MKLDNNlayer`的子类只需要使用内部存储就可以了,所有外部的转换工作都会在reset系列函数中都准备好。 + +### Activations +在重构前的PaddlePaddle中,激活函数是独立于`Layer`的概念,并且输入输出都是共用一块内存, +所以添加了对应的`MKLDNNActivation`来实现,方式类似于`MKLDNNLayer`。 + +### Parameters +对于有参数的层,我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。 +如果存在数据排列格式不一样的情况时,我们会在网络训练之前把格式转换为MKL-DNN希望的格式, +在训练结束的时候再保存为PaddlePaddle的格式,但是整个训练过程中不需要任何转换。 +这样既使得最终保存的参数格式与PaddlePaddle一致,又可以避免不必要的转换。 + +### Gradients +由于MKL-DNN的操作都是直接覆盖的形式,也就是说输出的结果不会在原来的数据上累加, +这样带来的好处就是不需要一直清空memory,节省了不必要的操作。 +但是注意的是,当网络出现分支且在`backward`的时候,需要累加不同Layer传过来的梯度。 +所以在`MKLDNNlayer`中实现了一个merge的方法,此时每个小分支的`Input Gradient` +会先临时保存在`MKLDNNMatrix`中,由分支处的Layer负责求和,并把结果放到当前层的`output_.grad`中。 +所以整体上,在实现每个子类的时候就不需要关心分支的事情了。 + +
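+下面是一个纯示意性的C++片段(函数名为假设,与Paddle实际实现无关),只用来说明"先把各分支的梯度逐元素求和,再整体写入当前层`output_.grad`"这一合并语义:
+
+```cpp
+#include <cstddef>
+#include <vector>
+
+// 示意:MKL-DNN原语是覆盖式输出,不会自动累加,
+// 所以分支处需要先显式求和,再把结果写回。
+// 假设 branchGrads 非空且各分支梯度长度一致。
+std::vector<float> mergeBranchGrads(
+    const std::vector<std::vector<float>>& branchGrads) {
+  std::vector<float> sum(branchGrads.front().size(), 0.0f);
+  for (const auto& grad : branchGrads) {
+    for (std::size_t i = 0; i < sum.size(); ++i) {
+      sum[i] += grad[i];
+    }
+  }
+  return sum;  // 调用方再把结果写入当前层的 output_.grad
+}
+```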
+
+Figure 5. Merge Gradients +
+ +### Unit Tests +我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。 +测试分为每个Layer(或Activation)的单元测试和简单网络的整体测试。 +每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果,小于某个比较小的阈值认为通过。 + +### Python API +目前只考虑**v1 API**。 + +计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择,方便用户选择使用MKL-DNN的layers。 + +具体实现方式比如: + +```python +use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0))) +if use_mkldnn + self.layer_type = mkldnn_* +``` + +所有MKL-DNN的`layer_type`会以*mkldnn_*开头,这些会在`MKLDNN*Layer`注册layer的时候保证,以示区分。 + +同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 + +### Benchmarking +会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image),用于测试和对比在使用MKL-DNN前后的CNN网络性能。 +测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) + +### Others +1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为4096,具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。 +2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。 + +## Design Concerns + +为了更好的符合PaddlePaddle的代码风格\[[3](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[4](#references)\]。 + +我们总结出一些特别需要注意的点: + +1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数, +我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。 +2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 +3. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。 +包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。 +4. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value`与`extOutVal_`共享内存, +同时数据格式就是`NCHW`,这样下一个cpu device就能拿到正确的数据。 +在有普通的CPU layer时, `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。 + +## References +1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。 +主要包括了深度学习相关的数学原语与操作,一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。 +2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。 +目前在PaddlePaddle中,仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。 +3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。 +但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 +4. 
MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`,所以不存在这个问题)。 +所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 diff --git a/doc/v2/dev/contribute_to_paddle_cn.md b/doc/v2/dev/contribute_to_paddle_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..3244eedf918b93f9351258f1218dfb2d507c1a9c --- /dev/null +++ b/doc/v2/dev/contribute_to_paddle_cn.md @@ -0,0 +1,243 @@ +# 如何贡献代码 + +我们真诚地感谢您的贡献,欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。 + +## 代码要求 +- 代码注释请遵守 [Doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。 +- 确保编译器选项 `WITH_STYLE_CHECK` 已打开,并且编译能通过代码样式检查。 +- 所有代码必须具有单元测试。 +- 通过所有单元测试。 +- 请遵守[提交代码的一些约定](#提交代码的一些约定)。 + +以下教程将指导您提交代码。 +## [Fork](https://help.github.com/articles/fork-a-repo/) + +跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页,然后单击 `Fork` 按钮,生成自己目录下的仓库,比如 。 + +## 克隆(Clone) + +将远程仓库 clone 到本地: + +```bash +➜ git clone https://github.com/USERNAME/Paddle +➜ cd Paddle +``` + + +## 创建本地分支 + +Paddle 目前使用[Git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发,测试,发行和维护,具体请参考 [Paddle 分支规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范)。 + +所有的 feature 和 bug fix 的开发工作都应该在一个新的分支上完成,一般从 `develop` 分支上创建新分支。 + +使用 `git checkout -b` 创建并切换到新分支。 + +```bash +➜ git checkout -b my-cool-stuff +``` + +值得注意的是,在 checkout 之前,需要保持当前分支目录 clean,否则会把 untracked 的文件也带到新分支上,这可以通过 `git status` 查看。 + +## 使用 `pre-commit` 钩子 + +Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码(C++,Python),在提交(commit)前自动检查一些基本事宜(如每个文件只有一个 EOL,Git 中不要添加大文件等)。 + +`pre-commit`测试是 Travis-CI 中单元测试的一部分,不满足钩子的 PR 不能被提交到 Paddle,首先安装并在当前目录运行它: + +```bash +➜ pip install pre-commit +➜ pre-commit install +``` + +Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式,请确保 `clang-format` 版本在 3.8 以上。 + +注:通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同的,Paddle 开发人员使用的是`pip install pre-commit`。 + +## 开始开发 + +在本例中,我删除了 README.md 中的一行,并创建了一个新文件。 + +通过 `git status` 查看当前状态,这会提示当前目录的一些变化,同时也可以通过 `git diff` 查看文件具体被修改的内容。 + +```bash +➜ git status +On branch test +Changes not staged for commit: + (use "git add ..." to update what will be committed) + (use "git checkout -- ..." to discard changes in working directory) + + modified: README.md + +Untracked files: + (use "git add ..." to include in what will be committed) + + test + +no changes added to commit (use "git add" and/or "git commit -a") +``` + +## 构建和测试 + +编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:latest-dev`来代替。 + +如要build这个开发镜像,在源码目录树的根目录中运行: + +```bash +➜ docker build -t paddle:latest-dev . +``` + +随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以: + +```bash +➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev +``` + +这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`,并且输出一个 `./build/paddle.deb`文件之外,还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*(`paddle:prod`): + +```bash +➜ docker build -t paddle:prod -f build/Dockerfile . 
+``` + +如果要运行所有的单元测试,可以用如下命令: + +```bash +➜ docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest" +``` + +关于构建和测试的更多信息,请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。 + +## 提交(commit) + +接下来我们取消对 README.md 文件的改变,然后提交新添加的 test 文件。 + +```bash +➜ git checkout -- README.md +➜ git status +On branch test +Untracked files: + (use "git add ..." to include in what will be committed) + + test + +nothing added to commit but untracked files present (use "git add" to track) +➜ git add test +``` + +Git 每次提交代码,都需要写提交说明,这可以让其他人知道这次提交做了哪些改变,这可以通过`git commit` 完成。 + +```bash +➜ git commit +CRLF end-lines remover...............................(no files to check)Skipped +yapf.................................................(no files to check)Skipped +Check for added large files..............................................Passed +Check for merge conflicts................................................Passed +Check for broken symlinks................................................Passed +Detect Private Key...................................(no files to check)Skipped +Fix End of Files.....................................(no files to check)Skipped +clang-formater.......................................(no files to check)Skipped +[my-cool-stuff c703c041] add test file + 1 file changed, 0 insertions(+), 0 deletions(-) + create mode 100644 233 +``` + +## 保持本地仓库最新 + +在准备发起 Pull Request 之前,需要同步原仓库()最新的代码。 + +首先通过 `git remote` 查看当前远程仓库的名字。 + +```bash +➜ git remote +origin +➜ git remote -v +origin https://github.com/USERNAME/Paddle (fetch) +origin https://github.com/USERNAME/Paddle (push) +``` + +这里 origin 是我们 clone 的远程仓库的名字,也就是自己用户名下的 Paddle,接下来我们创建一个原始 Paddle 仓库的远程主机,命名为 upstream。 + +```bash +➜ git remote add upstream https://github.com/PaddlePaddle/Paddle +➜ git remote +origin +upstream +``` + +获取 upstream 的最新代码并更新当前分支。 + +```bash +➜ git fetch upstream +➜ git pull upstream develop +``` + +## Push 到远程仓库 + +将本地的修改推送到 GitHub 上,也就是 https://github.com/USERNAME/Paddle。 + +```bash +# 推送到远程仓库 origin 的 my-cool-stuff 分支上 +➜ git push origin my-cool-stuff +``` + +## 建立 Issue 并完成 Pull Request + +建立一个 Issue 描述问题,并记录它的编号。 + +切换到所建分支,然后点击 `New pull request`。 + +screen shot 2017-04-26 at 9 09 28 pm + +选择目标分支: + +screen shot 2017-04-26 at 9 11 52 pm + +在 PR 的描述说明中,填写 `resolve #Issue编号` 可以在这个 PR 被 merge 后,自动关闭对应的 Issue,具体请见 。 + +接下来等待 review,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。 + +## 删除远程分支 + +在 PR 被 merge 进主仓库后,我们可以在 PR 的页面删除远程仓库的分支。 + +screen shot 2017-04-26 at 9 18 24 pm + +也可以使用 `git push origin :分支名` 删除远程分支,如: + +```bash +➜ git push origin :my-cool-stuff +``` + +## 删除本地分支 + +最后,删除本地分支。 + +```bash +# 切换到 develop 分支 +➜ git checkout develop + +# 删除 my-cool-stuff 分支 +➜ git branch -D my-cool-stuff +``` + +至此,我们就完成了一次代码贡献的过程。 + +## 提交代码的一些约定 + +为了使评审人在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定: + +1. 请保证Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,评审人一般不做评审。 +2. 提交PUll Request前: + - 请注意commit的数量: + - 原因:如果仅仅修改一个文件但提交了十几个commit,每个commit只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改,且不排除commit之间的修改存在相互覆盖的情况。 + - 建议:每次提交时,保持尽量少的commit,可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit,可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。 + - 请注意每个commit的名称:应能反映当前commit的内容,不能太随意。 +3. 
如果解决了某个Issue的问题,请在该PUll Request的**第一个**评论框中加上:`fix #issue_number`,这样当该PUll Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。 + +此外,在回复评审人意见时,请您遵守以下约定: + +1. 评审人的每个意见都必须回复(这是开源社区的基本礼貌,别人帮了忙,应该说谢谢): + - 对评审意见同意且按其修改完的,给个简单的`Done`即可; + - 对评审意见不同意的,请给出您自己的反驳理由。 +2. 如果评审意见比较多: + - 请给出总体的修改情况。 + - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复,而非直接回复的方式。原因是每个回复都会发送一封邮件,会造成邮件灾难。 diff --git a/doc/v2/dev/contribute_to_paddle_en.md b/doc/v2/dev/contribute_to_paddle_en.md new file mode 100644 index 0000000000000000000000000000000000000000..b878f37a5b8e807e5aa346e0074a741f2f8b6cc5 --- /dev/null +++ b/doc/v2/dev/contribute_to_paddle_en.md @@ -0,0 +1,162 @@ +# Contribute Code + +You are welcome to contribute to project PaddlePaddle. To contribute to PaddlePaddle, you have to agree with the +[PaddlePaddle Contributor License Agreement](https://gist.github.com/wangkuiyi/0c22c7b1bd3bb7eb27d76f85c3a3e329). + +We sincerely appreciate your contribution. This document explains our workflow and work style. + +## Workflow + +PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/). The following steps guide usual contributions. + +1. Fork + + Our development community has been growing fastly; it doesn't make sense for everyone to write into the official repo. So, please file Pull Requests from your fork. To make a fork, just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/). + +1. Clone + + To make a copy of your fork to your local computers, please run + + ```bash + git clone https://github.com/your-github-account/paddle + cd paddle + ``` + +1. Create the local feature branch + + For daily works like adding a new feature or fixing a bug, please open your feature branch before coding: + + ```bash + git checkout -b my-cool-stuff + ``` + +1. Commit + + Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands: + + ```bash + pip install pre-commit + pre-commit install + ``` + + Our pre-commit configuration requires clang-format 3.8 for auto-formating C/C++ code and yapf for Python. + + Once installed, `pre-commit` checks the style of code and documentation in every commit. We will see something like the following when you run `git commit`: + + ``` + ➜ git commit + CRLF end-lines remover...............................(no files to check)Skipped + yapf.................................................(no files to check)Skipped + Check for added large files..............................................Passed + Check for merge conflicts................................................Passed + Check for broken symlinks................................................Passed + Detect Private Key...................................(no files to check)Skipped + Fix End of Files.....................................(no files to check)Skipped + clang-formater.......................................(no files to check)Skipped + [my-cool-stuff c703c041] add test file + 1 file changed, 0 insertions(+), 0 deletions(-) + create mode 100644 233 + ``` + + NOTE: The `yapf` installed by `pip install pre-commit` and `conda install -c conda-forge pre-commit` is slightly different. Paddle developers use `pip install pre-commit`. + +1. 
Build and test + + Users can build PaddlePaddle natively on Linux and Mac OS X. But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md). + +1. Keep pulling + + An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others work early, and it's easier to resolve smaller conflicts. + + ```bash + git remote add upstream https://github.com/PaddlePaddle/Paddle + git pull upstream develop + ``` + +1. Push and file a pull request + + You can "push" your local work into your forked repo: + + ```bash + git push origin my-cool-stuff + ``` + + The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one. + + To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/). + + If your change is for fixing an issue, please write ["Fixes "](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request. Github would close the issue when the owners merge your pull request. + + Please remember to specify some reviewers for your pull request. If you don't know who are the right ones, please follow Github's recommendation. + + +1. Delete local and remote branches + + To keep your local workspace and your fork clean, you might want to remove merged branches: + + ```bash + git push origin :my-cool-stuff + git checkout develop + git pull upstream develop + git branch -d my-cool-stuff + ``` + +### Code Review + +- Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email. Please do this after your pull request passes the CI. + +- Please answer reviewers' every comment. If you are to follow the comment, please write "Done"; please give a reason otherwise. + +- If you don't want your reviewers to get overwhelmed by email notifications, you might reply their comments by [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/). + +- Reduce the unnecessary commits. Some developers commit often. It is recommended to append a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`. + + +## Coding Standard + +### Code Style + +Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html). + +Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/). + +Our build process helps to check the code style. In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default. This flag is on + +Please install pre-commit, which automatically reformat the changes to C/C++ and Python code whenever we run `git commit`. 
To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43). + +### Unit Tests + +Please remember to add related unit tests. + +- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md). + +- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/). + + +### Writing Logs + +We use [glog](https://github.com/google/glog) for logging in our C/C++ code. + +For general information, please use `LOG`. For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose). The reason is at [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ). + +`VLOG` requires a *verbose level* parameter. For example: + +```c++ +VLOG(3) << "Operator FC is taking " << num_inputs << "inputs." +``` + +When we run a PaddlePaddle application or test, we can specify a verbose threshold. For example: + +```bash +GLOG_vmodule=buddy_allocator=2 \ +GLOG_v=10 \ +python \ +../python/paddle/v2/framework/tests/test_recurrent_op.py +``` + +This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3. This suggests that we output overall messages in lower verbose levels, so they display with higher probability. When coding C++, please follow the verbose level convention as follows: + +- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework) +- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) +- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform) +- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math) diff --git a/doc/v2/dev/index_cn.rst b/doc/v2/dev/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..aee3c68de05de26df3cd79170fa7f4ecad4bf386 --- /dev/null +++ b/doc/v2/dev/index_cn.rst @@ -0,0 +1,24 @@ +开发标准 +======== +PaddlePaddle遵守如下三个部分的代码和文档规范。 + +PaddlePaddle使用git做版本管理,docker作为构建和测试环境。代码中包含了Cuda, C++, Python, Shell等多种编程语言。语言规范遵守Google C++ Style, Pep-8, 代码库中包含自动化检查工具做风格检查。代码注释需要遵守Doxygen规范,不满足风格要求的代码会编译失败。关于如何使用git, 构建测试及代码开发, 我们提供了如下指南。 + +.. toctree:: + :maxdepth: 1 + + contribute_to_paddle_cn.md + +PaddlePaddle面向国内外用户,包含了中文和英文两部分的文档。设计文档和issue问题描述都推荐使用英文。对于设计文档,重在问题描述,背景阐述,然后才是解决方案。文档由Sphinx生成,因此代码注释也需要符合Sphinx文档标准。推荐本地使用paddlepaddle.org工具编译生成和预览文档,请参阅如下文档。 + +.. toctree:: + :maxdepth: 1 + + write_docs_cn.rst + +PaddlePaddle V2 使用新增Layer方式定义新的操作。组合基础API可以实现多种复杂Layer, 满足绝大多数应用。如需要定制Layer,请参阅如下文档,欢迎提交patch。 + +.. 
toctree:: + :maxdepth: 1 + + new_layer_cn.rst diff --git a/doc/v2/dev/index_en.rst b/doc/v2/dev/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..cbff313fc5b9468b58159cf2b04e8464f9bebc78 --- /dev/null +++ b/doc/v2/dev/index_en.rst @@ -0,0 +1,28 @@ +Development +------------ + + +PaddlePaddle adheres to the following three sections of code and document specifications. + + +PaddlePaddle uses git for version control and Docker is used for building and testing environment. The code includes Cuda, C++, Python, Shell and other programming languages,which comply with Google C++ Style, Pep-8, and the code base includes style checking by an automatic inspection tool. Code comments need to follow the Doxygen specification. The code that does not meet the style requirements will fail to compile. We provide the following guidelines for the use of Git, build tests and code development. + +.. toctree:: + :maxdepth: 1 + + contribute_to_paddle_en.md + + +PaddlePaddle is well documented in English and Chinese. We recommend using the English version of the documents and problem description. The design documents focus on problem descriptions, backgrounds, and are followed by solutions. As documents are generated by Sphinx, code comments should comply with the Sphinx documentation standard. We recommend to use the paddlepaddle.org tool to compile and generate and preview documents locally. Please refer to: + +.. toctree:: + :maxdepth: 1 + + write_docs_en.rst + +PaddlePaddle V2 defines new operations by adding new Layers. You can implement various complex layers by combining basic APIs to satisfy most applications. If you want to customize layer, please refer to the following, and welcome to propose patch. + +.. toctree:: + :maxdepth: 1 + + new_layer_en.rst diff --git a/doc/v2/dev/new_layer_cn.rst b/doc/v2/dev/new_layer_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..e5a14346123d342de0b67757cbbce654bd4180dc --- /dev/null +++ b/doc/v2/dev/new_layer_cn.rst @@ -0,0 +1,389 @@ +================== +如何实现新的网络层 +================== + +这份教程展示了如何在PaddlePaddle中实现一个自定义的网络层。在这里我们使用全连接层作为例子来展示实现新网络层所需要的四个步骤。 + +1. 推导该层前向和后向传递的方程。 +2. 实现该层的C++类。 +3. 增加梯度检测的单元测试,以保证梯度的正确计算。 +4. 封装该层的Python接口。 + +推导方程 +================ + +首先我们需要推导该网络层的*前向传播*和*后向传播*的方程。前向传播给定输入,计算输出。后向传播给定输出的梯度,计算输入和参数的梯度。 + +下图是一个全连接层的示意图。在全连接层中,每个输出节点都连接到所有的输入节点上。 + +.. image:: src/FullyConnected.jpg + :align: center + :scale: 60 % + +一个网络层的前向传播部分把输入转化为相应的输出。 +全连接层以一个维度为 :math:`D_i` 的稠密向量作为输入,使用一个尺度为 :math:`D_i \times D_o` 的变换矩阵 :math:`W` 把 :math:`x` 映射到一个维度为 :math:`D_o` 的向量,并在乘积结果上再加上维度为 :math:`D_o` 的偏置向量 :math:`b` 。 + +.. math:: + + y = f(W^T x + b) + +其中 :math:`f(.)` 是一个非线性的*激活方程*,例如sigmoid, tanh,以及Relu。 + +变换矩阵 :math:`W` 和偏置向量 :math:`b` 是该网络层的*参数*。一个网络层的参数是在*反向传播*时被训练的。反向传播根据输出的梯度,分别计算每个参数的梯度,以及输入的梯度。优化器则用链式法则来对每个参数计算损失函数的梯度。 + +假设损失函数是 :math:`c(y)` ,那么 + +.. math:: + + \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x} + +假设 :math:`z = W^T x + b` ,那么 + +.. math:: + + \frac{\partial y}{\partial z} = \frac{\partial f(z)}{\partial z} + +PaddlePaddle的base layer类可以自动计算上面的导数。 + +因此,对全连接层来说,我们需要计算: + +.. 
math:: + + \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1 + +其中 :math:`\mathbf 1` 是一个全1的向量, :math:`W_{ij}` 是矩阵 :math:`W` 第i行第j列的数值, :math:`z_j` 是向量 :math:`z` 的第j个值, :math:`x_i` 是向量 :math:`x` 的第i个值。 + +最后我们使用链式法则计算 :math:`\frac{\partial z}{\partial x}` 以及 :math:`\frac{\partial z}{\partial W}` 。计算的细节将在下面的小节给出。 + +实现C++类 +=================== + +一个网络层的C++类需要实现初始化,前向和后向。全连接层的实现位于:code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h`及:code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp`。这里我们展示一份简化过的代码。 + +这个类需要继承 :code:`paddle::Layer` 这个基类,并且需要重写基类中的以下几个虚函数: + +- 类的构造函数和析构函数。 +- :code:`init` 函数。用于初始化参数和设置。 +- :code:`forward` 。实现网络层的前向传播。 +- :code:`backward` 。实现网络层的后向传播。 +- :code:`prefetch` 。用来从参数服务器预取参数矩阵相应的行。如果网络层不需要远程稀疏更新,则不需要重写该函数。(大多数网络层不需要支持远程稀疏更新) + + +头文件如下: + +.. code-block:: c++ + + namespace paddle { + /** + * 全连接层的每个输出都连接到上一层的所有的神经元上。 + * 它的输入与经过学习的参数做内积并加上偏置(可选)。 + * + * 配置文件接口是fc_layer。 + */ + + class FullyConnectedLayer : public Layer { + protected: + WeightList weights_; + std::unique_ptr biases_; + + public: + explicit FullyConnectedLayer(const LayerConfig& config) + : Layer(config) {} + ~FullyConnectedLayer() {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + Weight& getWeight(int idx) { return *weights_[idx]; } + + void prefetch(); + void forward(PassType passType); + void backward(const UpdateCallback& callback = nullptr); + }; + } // namespace paddle + +头文件中把参数定义为类的成员变量。我们使用 :code:`Weight` 类作为参数的抽象,它支持多线程更新。该类的实现细节在“实现细节”中详细介绍。 + +- :code:`weights_` 是存有一系列变换矩阵的权重。在当前的实现方式下,网络层可以有多个输入。因此,它可能有不止一个权重。每个权重对应一个输入。 +- :code:`biases_` 是存有偏置向量的权重。 + +全连接层没有网络层配置的超参数。如果一个网络层需要配置的话,通常的做法是将配置存于 :code:`LayerConfig& config` 中,并在类构建函数中把它放入一个类成员变量里。 + +下面的代码片段实现了 :code:`init` 函数。 + +- 首先,所有的 :code:`init` 函数必须先调用基类中的函数 :code:`Layer::init(layerMap, parameterMap);` 。该语句会为每个层初始化其所需要的变量和连接。 +- 之后初始化所有的权重矩阵 :math:`W` 。当前的实现方式下,网络层可以有多个输入。因此,它可能有不止一个权重。 +- 最后,初始化偏置向量。 + + +.. code-block:: c++ + + bool FullyConnectedLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* 初始化父类 */ + Layer::init(layerMap, parameterMap); + + /* 初始化权重表 */ + CHECK(inputLayers_.size() == parameters_.size()); + for (size_t i = 0; i < inputLayers_.size(); i++) { + // 获得参数尺寸 + size_t height = inputLayers_[i]->getSize(); + size_t width = getSize(); + + // 新建一个权重 + if (parameters_[i]->isSparse()) { + CHECK_LE(parameters_[i]->getSize(), width * height); + } else { + CHECK_EQ(parameters_[i]->getSize(), width * height); + } + Weight* w = new Weight(height, width, parameters_[i]); + + // 将新建的权重加入权重表 + weights_.emplace_back(w); + } + + /* 初始化biases_ */ + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + + return true; + } + +实现前向传播的部分有下面几个步骤。 + +- 每个层在其 :code:`forward` 函数的开头必须调用 :code:`Layer::forward(passType);` 。 +- 之后使用 :code:`reserveOutput(batchSize, size);` 为输出分配内存。由于我们支持训练数据有不同的批次大小,所以这一步是必要的。 :code:`reserveOutput` 会相应地改变输出的尺寸。为了保证效率,如果需要扩大矩阵,我们会重新分配内存;如果需要缩减矩阵,我们会继续使用现有的内存块。 +- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b`。:code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵,每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作,请参考 :code:`paddle/legacy/math/Matrix.h`和:code:`paddle/legacy/math/BaseMatrix.h` 。 +- 最终,使用 :code:`forwardActivation();` 进行激活操作。这会自动进行网络配置中声明的激活操作。 + + +.. 
code-block:: c++ + + void FullyConnectedLayer::forward(PassType passType) { + Layer::forward(passType); + + /* 若有必要,为output_申请内存 */ + int batchSize = getInput(0).getBatchSize(); + int size = getSize(); + + { + // 设置输出的尺寸 + reserveOutput(batchSize, size); + } + + MatrixPtr outV = getOutputValue(); + + // 对每个输入乘上变换矩阵 + for (size_t i = 0; i != inputLayers_.size(); ++i) { + auto input = getInput(i); + CHECK(input.value) << "The input of 'fc' layer must be matrix"; + i == 0 ? outV->mul(input.value, weights_[i]->getW(), 1, 0) + : outV->mul(input.value, weights_[i]->getW(), 1, 1); + } + + /* 加上偏置向量 */ + if (biases_.get() != NULL) { + outV->addBias(*(biases_->getW()), 1); + } + + /* 激活 */ { + forwardActivation(); + } + } + +实现后向传播的部分有下面几个步骤。 + +- :code:`backwardActivation()` 计算激活函数的梯度。通过 :code:`getOutputGrad()` 来获得输出的梯度,调用该函数后,梯度会就地(不使用额外空间)乘上输出的梯度。 +- 计算偏置的梯度。注意,我们使用 :code:`biases_->getWGrad()` 来得到某个特定参数的梯度矩阵。在一个参数的梯度被更新后,**必须**要调用 :code:`getParameterPtr()->incUpdate(callback);` 。这用于在多线程和多机上更新参数。 +- 最后,计算转换矩阵和输入的梯度,并对相应的参数调用 :code:`incUpdate` 。PaddlePaddle可以通过该机制判断是否已经收集齐所有的梯度,从而可以做一些与计算重叠的工作(例如,网络通信)。 + + +.. code-block:: c++ + + void FullyConnectedLayer::backward(const UpdateCallback& callback) { + /* 对激活求导 */ { + backwardActivation(); + } + + if (biases_ && biases_->getWGrad()) { + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + + biases_->getParameterPtr()->incUpdate(callback); + } + + bool syncFlag = hl_get_sync_flag(); + + for (size_t i = 0; i != inputLayers_.size(); ++i) { + /* 计算当前层权重的梯度 */ + if (weights_[i]->getWGrad()) { + MatrixPtr input_T = getInputValue(i)->getTranspose(); + MatrixPtr oGrad = getOutputGrad(); + { + weights_[i]->getWGrad()->mul(input_T, oGrad, 1, 1); + } + } + + + /* 计算输入层的偏差 */ + MatrixPtr preGrad = getInputGrad(i); + if (NULL != preGrad) { + MatrixPtr weights_T = weights_[i]->getW()->getTranspose(); + preGrad->mul(getOutputGrad(), weights_T, 1, 1); + } + + { + weights_[i]->getParameterPtr()->incUpdate(callback); + } + } + } + + :code:`prefetch` 函数指出了在训练时需要从参数服务器取出的行。仅在远程稀疏训练时有效。使用远程稀疏方式训练时,完整的参数矩阵被分布在不同的参数服务器上。当网络层用一个批次做训练时,该批次的输入中仅有一个子集是非零的。因此,该层仅需要这些非零样本位置所对应的变换矩阵的那些行。 :code:`prefetch` 表明了这些行的标号。 + +大多数层不需要远程稀疏训练函数。这种情况下不需要重写该函数。 + +.. code-block:: c++ + + void FullyConnectedLayer::prefetch() { + for (size_t i = 0; i != inputLayers_.size(); ++i) { + auto* sparseParam = + dynamic_cast(weights_[i]->getW().get()); + if (sparseParam) { + MatrixPtr input = getInputValue(i); + sparseParam->addRows(input); + } + } + } + +最后,使用 :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` 来注册该层。 :code:`fc` 是该层的标识符, :code:`FullyConnectedLayer` 是该层的类名。 + +.. 
code-block:: c++ + + namespace paddle { + REGISTER_LAYER(fc, FullyConnectedLayer); + } + +若 :code:`cpp` 被放在 :code:`paddle/legacy/gserver/layers` 目录下,其会自动被加入编译列表。 + + +写梯度检查单元测试 +=============================== + +写梯度检查单元测试是一个验证新实现的层是否正确的相对简单的办法。梯度检查单元测试通过有限差分法来验证一个层的梯度。首先对输入做一个小的扰动 :math:`\Delta x` ,然后观察到输出的变化为 :math:`\Delta y` ,那么,梯度就可以通过这个方程计算得到 :math:`\frac{\Delta y}{\Delta x }` 。之后,再用这个梯度去和 :code:`backward` 函数得到的梯度去对比,以保证梯度计算的正确性。需要注意的是梯度检查仅仅验证了梯度的计算,并不保证 :code:`forward` 和 :code:`backward` 函数的实现是正确的。你需要一些更复杂的单元测试来保证你实现的网络层是正确的。 + +所有网络层的梯度检查单测都位于 :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步: + ++ 生成网络层配置。网络层配置包含以下几项: + - 偏置参数的大小。(例子中是4096) + - 层的类型。(例子中是fc) + - 层的大小。(例子中是4096) + - 激活的类型。(例子中是softmax) + - dropout的比例。(例子中是0.1) ++ 配置网络层的输入。在这个例子里,我们仅有一个输入。 + - 输入的类型( :code:`INPUT_DATA` ),可以是以下几种: + - :code:`INPUT_DATA` :稠密向量。 + - :code:`INPUT_LABEL` :整数。 + - :code:`INPUT_DATA_TARGET` :稠密向量,但不用于计算梯度。 + - :code:`INPUT_SEQUENCE_DATA` :含有序列信息的稠密向量。 + - :code:`INPUT_HASSUB_SEQUENCE_DATA` :含有序列信息和子序列信息的稠密向量。 + - :code:`INPUT_SEQUENCE_LABEL` :含有序列信息的整数。 + - :code:`INPUT_SPARSE_NON_VALUE_DATA` :0-1稀疏数据。 + - :code:`INPUT_SPARSE_FLOAT_VALUE_DATA` :浮点稀疏数据。 + - 输入的名字。(例子中是 :code:`layer_0` ) + - 输入的大小。(例子中是8192) + - 非零数字的个数,仅对稀疏数据有效。 + - 稀疏数据的格式,仅对稀疏数据有效。 ++ 对每个输入,都需要调用一次 :code:`config.layerConfig.add_inputs();` 。 ++ 调用 :code:`testLayerGrad` 来做梯度检查。它包含以下参数。 + - 层和输入的配置。(例子中是 :code:`config` ) + - 网络层的类型。(例子中是 :code:`fc` ) + - 梯度检查的输入数据的批次大小。(例子中是100) + - 输入是否是转置的。大多数层需要设置为 :code:`false` 。(例子中是 :code:`false` ) + - 是否使用权重。有些层或者激活需要做归一化以保证它们的输出的和是一个常数。例如,softmax激活的输出的和总是1。在这种情况下,我们不能通过常规的梯度检查的方式来计算梯度。因此我们采用输出的加权和(非常数)来计算梯度。(例子中是 :code:`true` ,因为全连接层的激活可以是softmax) + +.. code-block:: c++ + + void testFcLayer(string format, size_t nnz) { + // Create layer configuration. + TestConfig config; + config.biasSize = 4096; + config.layerConfig.set_type("fc"); + config.layerConfig.set_size(4096); + config.layerConfig.set_active_type("softmax"); + config.layerConfig.set_drop_rate(0.1); + // Setup inputs. + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)}); + config.layerConfig.add_inputs(); + LOG(INFO) << config.inputDefs[0].sparse.sparse << " " + << config.inputDefs[0].sparse.format; + for (auto useGpu : {false, true}) { + testLayerGrad(config, "fc", 100, /* trans */ false, useGpu, + /* weight */ true); + } + } + +如果你要为了测试而增加新的文件,例如 :code:`paddle/legacy/gserver/tests/testFCGrad.cpp` ,你需要把该文件加入 :code:`paddle/legacy/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时,所有的单测都会被执行一次。注意,有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 `ON` 。 + +.. code-block:: bash + + add_unittest_without_exec(test_FCGrad + test_FCGrad.cpp + LayerGradUtil.cpp + TestUtil.cpp) + + add_test(NAME test_FCGrad + COMMAND test_FCGrad) + + +实现python封装 +======================== + +python封装的实现使得我们可以在配置文件中使用新实现的网络层。所有的python封装都在 :code:`python/paddle/trainer/config_parser.py` 中。全连接层python封装的例子中包含下面几步: + +- 所有的Python封装都使用 :code:`@config_layer('fc')` 这样的装饰器。网络层的标识符为 :code:`fc` 。 +- 实现构造函数 :code:`__init__` 。 + - 它首先调用基构造函数 :code:`super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)` 。 :code:`FCLayer` 是Python封装的类名。 :code:`fc` 是网络层的标识符。为了封装能够正确工作,这些名字必须要写对。 + - 之后,计算变换矩阵的大小和格式(是否稀疏)。 + +.. 
code-block:: python + + @config_layer('fc') + class FCLayer(LayerBase): + def __init__( + self, + name, + size, + inputs, + bias=True, + **xargs): + super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs) + for input_index in xrange(len(self.inputs)): + input_layer = self.get_input_layer(input_index) + psize = self.config.size * input_layer.size + dims = [input_layer.size, self.config.size] + format = self.inputs[input_index].format + sparse = format == "csr" or format == "csc" + if sparse: + psize = self.inputs[input_index].nnz + self.create_input_parameter(input_index, psize, dims, sparse, format) + self.create_bias_parameter(bias, self.config.size) + +在网络配置中,网络层的细节可以通过下面这些代码片段来指定。这个类的参数包括: + +- :code:`name` 是网络层实例的名字标识符。 +- :code:`type` 是网络层的类型,通过网络层的标识符来指定。 +- :code:`size` 是网络层输出的大小。 +- :code:`bias` 表明这个层的一个实例是否需要偏置。 +- :code:`inputs` 说明这个层的输入,输入是由一个list中的网络层实例的名字组成的。 + +.. code-block:: python + + Layer( + name = "fc1", + type = "fc", + size = 64, + bias = True, + inputs = [Input("pool3")] + ) + +我们建议你为你的Python封装实现一个“助手”,使得搭模型时更方便。具体可以参考 :code:`python/paddle/trainer_config_helpers/layers.py` 。 diff --git a/doc/v2/dev/new_layer_en.rst b/doc/v2/dev/new_layer_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..ad723738801908a5f48343574c204bdbfc97ee08 --- /dev/null +++ b/doc/v2/dev/new_layer_en.rst @@ -0,0 +1,390 @@ +================ +Write New Layers +================ + +This tutorial will guide you to write customized layers in PaddlePaddle. We will utilize fully connected layer as an example to guide you through the following steps for writing a new layer. + +- Derive equations for the forward and backward part of the layer. +- Implement C++ class for the layer. +- Write gradient check unit test to make sure the gradients are correctly computed. +- Implement Python wrapper for the layer. + +Derive Equations +================ + +First we need to derive equations of the *forward* and *backward* part of the layer. The forward part computes the output given an input. The backward part computes the gradients of the input and the parameters given the the gradients of the output. + +The illustration of a fully connected layer is shown in the following figure. In a fully connected layer, all output nodes are connected to all the input nodes. + +.. image:: src/FullyConnected.jpg + :align: center + :scale: 60 % + +The *forward part* of a layer transforms an input into the corresponding output. +Fully connected layer takes a dense input vector with dimension :math:`D_i`. It uses a transformation matrix :math:`W` with size :math:`D_i \times D_o` to project :math:`x` into a :math:`D_o` dimensional vector, and add a bias vector :math:`b` with dimension :math:`D_o` to the vector. + +.. math:: + + y = f(W^T x + b) + +where :math:`f(.)` is an nonlinear *activation* function, such as sigmoid, tanh, and Relu. + +The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. + +Suppose our loss function is :math:`c(y)`, then + +.. math:: + + \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x} + +Suppose :math:`z = W^T x + b`, then + +.. 
math:: + + \frac{\partial y}{\partial z} = \frac{\partial f(z)}{\partial z} + +This derivative can be automatically computed by our base layer class. + +Then, for fully connected layer, we need to compute: + +.. math:: + + \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1 + +where :math:`\mathbf 1` is an all one vector, :math:`W_{ij}` is the number at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`. + +Finally we can use chain rule to calculate :math:`\frac{\partial z}{\partial x}`, and :math:`\frac{\partial z}{\partial W}`. The details of the computation will be given in the next section. + +Implement C++ Class +=================== + +The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp`. We list simplified version of the code below. + +It needs to derive the base class :code:`paddle::Layer`, and it needs to override the following functions: + +- constructor and destructor. +- :code:`init` function. It is used to initialize the parameters and settings. +- :code:`forward`. It implements the forward part of the layer. +- :code:`backward`. It implements the backward part of the layer. +- :code:`prefetch`. It is utilized to determine the rows corresponding parameter matrix to prefetch from parameter server. You do not need to override this function if your layer does not need remote sparse update. (most layers do not need to support remote sparse update) + + +The header file is listed below: + +.. code-block:: c++ + + namespace paddle { + /** + * A layer has full connections to all neurons in the previous layer. + * It computes an inner product with a set of learned weights, and + * (optionally) adds biases. + * + * The config file api is fc_layer. + */ + + class FullyConnectedLayer : public Layer { + protected: + WeightList weights_; + std::unique_ptr biases_; + + public: + explicit FullyConnectedLayer(const LayerConfig& config) + : Layer(config) {} + ~FullyConnectedLayer() {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + Weight& getWeight(int idx) { return *weights_[idx]; } + + void prefetch(); + void forward(PassType passType); + void backward(const UpdateCallback& callback = nullptr); + }; + } // namespace paddle + +It defines the parameters as class variables. We use :code:`Weight` class as abstraction of parameters. It supports multi-thread update. The details of this class will be described in details in the implementations. + +- :code:`weights_` is a list of weights for the transformation matrices. The current implementation can have more than one inputs. Thus, it has a list of weights. One weight corresponds to an input. +- :code:`biases_` is a weight for the bias vector. + +The fully connected layer does not have layer configuration hyper-parameters. If there are some layer hyper-parameters, a common practice is to store it in :code:`LayerConfig& config`, and put it into a class variable in the constructor. + +The following code snippet implements the :code:`init` function. + +- First, every :code:`init` function must call the :code:`init` function of the base class :code:`Layer::init(layerMap, parameterMap);`. 
This statement will initialize the required variables and connections for each layer. +- The it initializes all the weights matrices :math:`W`. The current implementation can have more than one inputs. Thus, it has a list of weights. +- Finally, it initializes the bias. + + +.. code-block:: c++ + + bool FullyConnectedLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + /* initialize the weightList */ + CHECK(inputLayers_.size() == parameters_.size()); + for (size_t i = 0; i < inputLayers_.size(); i++) { + // Option the parameters + size_t height = inputLayers_[i]->getSize(); + size_t width = getSize(); + + // create a new weight + if (parameters_[i]->isSparse()) { + CHECK_LE(parameters_[i]->getSize(), width * height); + } else { + CHECK_EQ(parameters_[i]->getSize(), width * height); + } + Weight* w = new Weight(height, width, parameters_[i]); + + // append the new weight to the list + weights_.emplace_back(w); + } + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + + return true; + } + +The implementation of the forward part has the following steps. + +- Every layer must call :code:`Layer::forward(passType);` at the beginning of its :code:`forward` function. +- Then it allocates memory for the output using :code:`reserveOutput(batchSize, size);`. This step is necessary because we support the batches to have different batch sizes. :code:`reserveOutput` will change the size of the output accordingly. For the sake of efficiency, we will allocate new memory if we want to expand the matrix, but we will reuse the existing memory block if we want to shrink the matrix. +- Then it computes :math:`\sum_i W_i x + b` using Matrix operations. :code:`getInput(i).value` retrieve the matrix of the i-th input. Each input is a :math:`batchSize \times dim` matrix, where each row represents an single input in a batch. For a complete lists of supported matrix operations, please refer to :code:`paddle/legacy/math/Matrix.h` and :code:`paddle/legacy/math/BaseMatrix.h`. +- Finally it applies the activation function using :code:`forwardActivation();`. It will automatically applies the corresponding activation function specifies in the network configuration. + + +.. code-block:: c++ + + void FullyConnectedLayer::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + int batchSize = getInput(0).getBatchSize(); + int size = getSize(); + + { + // Settup the size of the output. + reserveOutput(batchSize, size); + } + + MatrixPtr outV = getOutputValue(); + + // Apply the the transformation matrix to each input. + for (size_t i = 0; i != inputLayers_.size(); ++i) { + auto input = getInput(i); + CHECK(input.value) << "The input of 'fc' layer must be matrix"; + i == 0 ? outV->mul(input.value, weights_[i]->getW(), 1, 0) + : outV->mul(input.value, weights_[i]->getW(), 1, 1); + } + + /* add the bias-vector */ + if (biases_.get() != NULL) { + outV->addBias(*(biases_->getW()), 1); + } + + /* activation */ { + forwardActivation(); + } + } + +The implementation of the backward part has the following steps. + +- :code:`backwardActivation()` computes the gradients of the activation. The gradients will be multiplies in place to the gradients of the output, which can be retrieved using :code:`getOutputGrad()`. +- Compute the gradients of bias. 
Notice that we an use :code:`biases_->getWGrad()` to get the gradient matrix of the corresponding parameter. After the gradient of one parameter is updated, it **MUST** call :code:`getParameterPtr()->incUpdate(callback);`. This is utilize for parameter update over multiple threads or multiple machines. +- Then it computes the gradients of the transformation matrices and inputs, and it calls :code:`incUpdate` for the corresponding parameter. This gives the framework the chance to know whether it has gathered all the gradient to one parameter so that it can do some overlapping work (e.g., network communication) + + +.. code-block:: c++ + + void FullyConnectedLayer::backward(const UpdateCallback& callback) { + /* Do derivation for activations.*/ { + backwardActivation(); + } + + if (biases_ && biases_->getWGrad()) { + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + + biases_->getParameterPtr()->incUpdate(callback); + } + + bool syncFlag = hl_get_sync_flag(); + + for (size_t i = 0; i != inputLayers_.size(); ++i) { + /* Calculate the W-gradient for the current layer */ + if (weights_[i]->getWGrad()) { + MatrixPtr input_T = getInputValue(i)->getTranspose(); + MatrixPtr oGrad = getOutputGrad(); + { + weights_[i]->getWGrad()->mul(input_T, oGrad, 1, 1); + } + } + + + /* Calculate the input layers error */ + MatrixPtr preGrad = getInputGrad(i); + if (NULL != preGrad) { + MatrixPtr weights_T = weights_[i]->getW()->getTranspose(); + preGrad->mul(getOutputGrad(), weights_T, 1, 1); + } + + { + weights_[i]->getParameterPtr()->incUpdate(callback); + } + } + } + +The :code:`prefetch` function specifies the rows that need to be fetched from parameter server during training. It is only useful for remote sparse training. In remote sparse training, the full parameter matrix is stored distributedly at the parameter server. When the layer uses a batch for training, only a subset of locations of the input is non-zero in this batch. Thus, this layer only needs the rows of the transformation matrix corresponding to the locations of these non-zero entries. The :code:`prefetch` function specifies the ids of these rows. + +Most of the layers do not need remote sparse training function. You do not need to override this function in this case. + +.. code-block:: c++ + + void FullyConnectedLayer::prefetch() { + for (size_t i = 0; i != inputLayers_.size(); ++i) { + auto* sparseParam = + dynamic_cast(weights_[i]->getW().get()); + if (sparseParam) { + MatrixPtr input = getInputValue(i); + sparseParam->addRows(input); + } + } + } + +Finally, you can use :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` to register the layer. :code:`fc` is the identifier of the layer, and :code:`FullyConnectedLayer` is the class name of the layer. + +.. code-block:: c++ + + namespace paddle { + REGISTER_LAYER(fc, FullyConnectedLayer); + } + +If the :code:`cpp` file is put into :code:`paddle/legacy/gserver/layers`, it will be automatically added to the compilation list. + + +Write Gradient Check Unit Test +=============================== + +An easy way to verify the correctness of new layer's implementation is to write a gradient check unit test. Gradient check unit test utilizes finite difference method to verify the gradient of a layer. It modifies the input with a small perturbation :math:`\Delta x` and observes the changes of output :math:`\Delta y`, the gradient can be computed as :math:`\frac{\Delta y}{\Delta x }`. 
This gradient can be compared with the gradient computed by the :code:`backward` function of the layer to ensure the correctness of the gradient computation. Notice that the gradient check only tests the correctness of the gradient computation; it does not necessarily guarantee the correctness of the implementation of the :code:`forward` and :code:`backward` functions. You need to write more sophisticated unit tests to make sure your layer is implemented correctly.
+
+All the gradient check unit tests are located in :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp`. You are recommended to put your test into a new test file if you are planning to write a new layer. The gradient check unit test of the fully connected layer is listed below. It has the following steps.
+
++ Create layer configuration. A layer configuration can include the following attributes:
+  - size of the bias parameter. (4096 in our example)
+  - type of the layer. (fc in our example)
+  - size of the layer. (4096 in our example)
+  - activation type. (softmax in our example)
+  - dropout rate. (0.1 in our example)
++ Configure the input of the layer. In our example, we have only one input.
+  - type of the input (:code:`INPUT_DATA` in our example). It can be one of the following types:
+    - :code:`INPUT_DATA`: dense vector.
+    - :code:`INPUT_LABEL`: integer.
+    - :code:`INPUT_DATA_TARGET`: dense vector, but it is not used to compute gradients.
+    - :code:`INPUT_SEQUENCE_DATA`: dense vector with sequence information.
+    - :code:`INPUT_HASSUB_SEQUENCE_DATA`: dense vector with both sequence and sub-sequence information.
+    - :code:`INPUT_SEQUENCE_LABEL`: integer with sequence information.
+    - :code:`INPUT_SPARSE_NON_VALUE_DATA`: 0-1 sparse data.
+    - :code:`INPUT_SPARSE_FLOAT_VALUE_DATA`: float sparse data.
+  - name of the input. (:code:`layer_0` in our example)
+  - size of the input. (8192 in our example)
+  - number of non-zeros, only useful for sparse inputs.
+  - format of sparse data, only useful for sparse inputs.
++ Each input needs to call :code:`config.layerConfig.add_inputs();` once.
++ Call :code:`testLayerGrad` to perform gradient checks. It has the following arguments.
+  - layer and input configurations. (:code:`config` in our example)
+  - type of the layer. (:code:`fc` in our example)
+  - batch size of the gradient check. (100 in our example)
+  - whether the input is transposed. Most layers need to set it to :code:`false`. (:code:`false` in our example)
+  - whether to use weights. Some layers or activations perform normalization so that the sum of their output is a constant. For example, the sum of output of a softmax activation is one. In this case, we cannot correctly compute the gradients using regular gradient check techniques. A weighted sum of the output, which is not a constant, is utilized to compute the gradients. (:code:`true` in our example, because the activation of a fully connected layer can be softmax)
+
+.. code-block:: c++
+
+   void testFcLayer(string format, size_t nnz) {
+     // Create layer configuration.
+     TestConfig config;
+     config.biasSize = 4096;
+     config.layerConfig.set_type("fc");
+     config.layerConfig.set_size(4096);
+     config.layerConfig.set_active_type("softmax");
+     config.layerConfig.set_drop_rate(0.1);
+     // Setup inputs.
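+     // Each entry is {input type, input name, input size, number of non-zeros, sparse format},
+     // matching the input attributes described in the list above.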
+     config.inputDefs.push_back(
+         {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+     config.layerConfig.add_inputs();
+     LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
+               << config.inputDefs[0].sparse.format;
+     for (auto useGpu : {false, true}) {
+       testLayerGrad(config, "fc", 100, /* trans */ false, useGpu,
+                     /* weight */ true);
+     }
+   }
+
+If you are creating a new file for the test, such as :code:`paddle/legacy/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/legacy/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake.
+
+.. code-block:: bash
+
+   add_unittest_without_exec(test_FCGrad
+       test_FCGrad.cpp
+       LayerGradUtil.cpp
+       TestUtil.cpp)
+
+   add_test(NAME test_FCGrad
+       COMMAND test_FCGrad)
+
+
+Implement Python Wrapper
+========================
+
+Implementing the Python wrapper allows us to use the added layer in configuration files. All the Python wrappers are in file :code:`python/paddle/legacy/trainer/config_parser.py`. An example of the Python wrapper for the fully connected layer is listed below. It has the following steps:
+
+- Use :code:`@config_layer('fc')` as the decorator of the Python wrapper class. :code:`fc` is the identifier of the layer.
+- Implement the :code:`__init__` constructor function.
+  - It first calls the base constructor :code:`super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)`. :code:`FCLayer` is the Python wrapper class name, and :code:`fc` is the layer identifier name. They must be correct in order for the wrapper to work.
+  - Then it computes the size and format (whether sparse) of each transformation matrix.
+
+.. code-block:: python
+
+   @config_layer('fc')
+   class FCLayer(LayerBase):
+       def __init__(
+               self,
+               name,
+               size,
+               inputs,
+               bias=True,
+               **xargs):
+           super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
+           for input_index in xrange(len(self.inputs)):
+               input_layer = self.get_input_layer(input_index)
+               psize = self.config.size * input_layer.size
+               dims = [input_layer.size, self.config.size]
+               format = self.inputs[input_index].format
+               sparse = format == "csr" or format == "csc"
+               if sparse:
+                   psize = self.inputs[input_index].nnz
+               self.create_input_parameter(input_index, psize, dims, sparse, format)
+           self.create_bias_parameter(bias, self.config.size)
+
+In the network configuration, the layer can be specified using the following code snippet. The arguments of this class are:
+
+- :code:`name` is the name identifier of the layer instance.
+- :code:`type` is the type of the layer, specified using the layer identifier.
+- :code:`size` is the output size of the layer.
+- :code:`bias` specifies whether this layer instance has bias.
+- :code:`inputs` specifies a list of layer instance names as inputs.
+
+.. code-block:: python
+
+   Layer(
+       name = "fc1",
+       type = "fc",
+       size = 64,
+       bias = True,
+       inputs = [Input("pool3")]
+   )
+
+You are also recommended to implement a helper for the Python wrapper, which makes it easier to write models. You can refer to :code:`python/paddle/trainer_config_helpers/layers.py` for examples.
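+
+As a quick illustration of what such a helper looks like, below is a minimal sketch that only wraps the :code:`Layer` and :code:`Input` configuration primitives used in the snippet above. The helper name :code:`simple_fc` and its defaults are made up for this example; it is not the real :code:`fc_layer` helper, which additionally handles activations, parameter attributes, and naming defaults.
+
+.. code-block:: python
+
+   # A hypothetical convenience wrapper around the Layer()/Input() config
+   # primitives shown above; it is NOT the actual fc_layer helper.
+   def simple_fc(name, input_name, size, bias=True):
+       Layer(
+           name=name,
+           type="fc",    # must match the identifier registered by @config_layer('fc')
+           size=size,
+           bias=bias,
+           inputs=[Input(input_name)])
+       return name       # return the layer name so it can feed the next layer
+
+   # Usage inside a network configuration:
+   # simple_fc("fc1", "pool3", 64)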
diff --git a/doc/v2/dev/src/FullyConnected.jpg b/doc/v2/dev/src/FullyConnected.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b2241f401434e527f95ee4e0e541a3f2ff78fd1e Binary files /dev/null and b/doc/v2/dev/src/FullyConnected.jpg differ diff --git a/doc/v2/dev/src/doc_en.png b/doc/v2/dev/src/doc_en.png new file mode 100644 index 0000000000000000000000000000000000000000..ed6b9178fba91a3bdf45ae797a9924f84146fbc8 Binary files /dev/null and b/doc/v2/dev/src/doc_en.png differ diff --git a/doc/v2/dev/write_docs_cn.md b/doc/v2/dev/write_docs_cn.md new file mode 100755 index 0000000000000000000000000000000000000000..0281d40e4bc96baa4652447ee07b13bf704d932b --- /dev/null +++ b/doc/v2/dev/write_docs_cn.md @@ -0,0 +1,203 @@ +# 如何贡献文档 + +PaddlePaddle非常欢迎您贡献文档。如果您撰写/翻译的文档满足我们的要求,您的文档将会呈现在paddlapaddle.org网站和Github上供PaddlePaddle的用户阅读。 + +Paddle的文档主要分为以下几个模块: + +- 新手入门:包括安装说明、深度学习基础知识、学习资料等,旨在帮助用户快速安装和入门; + +- 使用指南:包括数据准备、网络配置、训练、Debug、预测部署和模型库文档,旨在为用户提供PaddlePaddle基本用法讲解; + +- 进阶使用:包括服务器端和移动端部署、如何贡献代码/文档、如何性能调优等,旨在满足开发者的需求; + +我们的文档支持[reStructured Text](http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html)和[Markdown](https://guides.github.com/features/mastering-markdown/) (GitHub风格)格式的内容贡献。 + +撰写文档完成后,您可以使用预览工具查看文档在官网显示的效果,以验证您的文档是否能够在官网正确显示。 + + +## 如何使用预览工具 + +如果您正在修改代码文档(即API),并在Docker容器中使用PaddlePaddle,请在您相应的docker容器中执行下列步骤。因为API的文档生成器依赖于PaddlePaddle。 + +如果您只改进了文本/媒体内容(不需要安装或构建PaddlePaddle),或者正在主机上构建PaddlePaddle,请继续在主机上执行下列步骤。 + +### 1. Clone你希望更新或测试的相关仓库: + +首先下载完整的文档存储仓库,其中`--recurse-submodules`会同步更新FluidDoc中的submodule(所有的submodule均在`FluidDoc/external`中),以保证所有文档可以正常显示: + +``` +git clone --recurse-submodules https://github.com/PaddlePaddle/FluidDoc +``` + +其他可拉取的存储库有: + + +``` +git clone https://github.com/PaddlePaddle/book.git +git clone https://github.com/PaddlePaddle/models.git +git clone https://github.com/PaddlePaddle/Mobile.git + +``` + +您可以将这些本地副本放在电脑的任意目录下,稍后我们会在启动 PaddlePaddle.org时指定这些仓库的位置。 + +### 2. 在新目录下拉取 PaddlePaddle.org 并安装其依赖项 + +在此之前,请确认您的操作系统安装了python的依赖项 + +以ubuntu系统为例,运行: + +``` +sudo apt-get update && apt-get install -y python-dev build-essential +``` + +然后: + +``` +git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git +cd PaddlePaddle.org/portal +# To install in a virtual environment. +# virtualenv venv; source venv/bin/activate +pip install -r requirements.txt +``` + +**可选项**:如果你希望实现中英网站转换,以改善PaddlePaddle.org,请安装[GNU gettext](https://www.gnu.org/software/gettext/) + +### 3. 在本地运行 PaddlePaddle.org + +添加您希望加载和构建内容的目录列表(选项包括:--paddle,--book,--models,--mobile) + +运行: + +``` +./runserver --paddle +``` + +**注意:** ``为第一步中paddle副本在您本机的存储地址。 + +如果您需要处理依赖于`book`、`models`或`mobile`存储库内容的文档,您可以添加一个或多个可选项: + +``` +./runserver --paddle \ + --book /external/book \ + --models /external/models \ + --mobile /external/mobile +``` +然后:打开浏览器并导航到http://localhost:8000。 + +>*网站可能需要几秒钟才能成功加载,因为构建需要一定的时间* + +>*如果您是在docker环境下运行的这些步骤,请检查ip确保可以将端口8000映射到您的主机* + +## 贡献新文档或更新API + +所有内容都应该以[Markdown](https://guides.github.com/features/mastering-markdown/) (GitHub风格)的形式编写(尽管在文档中有一些使用.rst格式的遗留内容)。 + + +在完成安装步骤后,您还需要完成下列操作: + + - 在你开始写作之前,我们建议你回顾一下这些关于贡献内容的指南 + + --- + + **贡献新文档** + + + - 创建一个新的` .md` 文件或者在您当前操作的仓库中修改已存在的文章 + - 将新增的文档名,添加到对应的index文件中 + + --- + + **贡献或修改Python API** + + + 在编译代码的docker容器内,或主机的对应位置: + + - 运行脚本 `paddle/scripts/paddle_build.sh`(在 Paddle repo 下) + + ```bash + # 编译paddle的python库 + cd Paddle + ./paddle/scripts/paddle_docker_build.sh gen_doc_lib full + cd .. 
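+   # 编译生成的 paddle python 库位于 Paddle/build/python/ 目录,后面设置 PYTHONPATH 时会用到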
+ ``` + + - 运行预览工具 + + ``` + # 在编译paddle的对应docker镜像中运行预览工具 + + docker run -it -v /Users/xxxx/workspace/paddlepaddle_workplace:/workplace -p 8000:8000 [images_id] /bin/bash + ``` + + > 其中`/Users/xxxx/workspace/paddlepaddle_workplace`请替换成您本机的paddle工作环境,`/workplace`请替换成您相应的 docker 下的工作环境,这一映射会保证我们同时完成编译python库、修改FluidDoc和使用预览工具。 + + > [images_id]为docker中您使用的paddlepaddle的镜像id。 + + - 设置环境变量 + + ``` + # 在docker环境中 + # 设置环境变量`PYTHONPATH`使预览工具可以找到 paddle 的 python 库 + export PYTHONPATH=/workplace/Paddle/build/python/ + ``` + + - 清理旧文件 + + ``` + # 清除历史生成的文件,如果是第一次使用预览工具可以跳过这一步 + rm -rf /workplace/FluidDoc/doc/fluid/menu.json /workplace/FluidDoc/doc/fluid/api/menu.json /tmp/docs/ /tmp/api/ + ``` + + - 启动预览工具 + + ``` + cd /workplace/PaddlePaddle.org/portal + pip install -r requirements.txt + ./runserver --paddle /workplace/FluidDoc/ + ``` + +--- + + **预览修改** + + + + 打开浏览器并导航到http://localhost:8000。 + + 在要更新的页面上,单击右上角的Refresh Content + + 进入使用文档单元后,API部分并不包含内容,希望预览API文档需要点击API目录,几分钟后您将看到生成的 API reference。 + + +## 提交修改 + +如果您希望修改代码,请在`Paddle`仓库下参考[如何贡献代码](../development/contribute_to_paddle.html)执行操作。 + +如果您仅修改文档: + + - 修改的内容在`doc`文件夹内,您只需要在`FluidDoc`仓库下提交`PR` + + - 修改的内容在`external`文件夹内: + + 1.在您修改的仓库下提交PR。这是因为:`FluidDoc`仓库只是一个包装器,将其他仓库的链接(git术语的“submodule”)集合在了一起。 + + 2.当您的修改被认可后,更新FluidDoc中对应的`submodule`到源仓库最新的commit-id。 + + > 例如,您更新了book仓库中的develop分支下的文档: + + + > - 进入`FluidDoc/external/book`目录 + > - 更新 commit-id 到最新的提交:`git pull origin develop` + > - 在`FluidDoc`中提交你的修改 + + 3.在`FluidDoc`仓库下为您的修改提交PR + +提交修改与PR的步骤可以参考[如何贡献代码](../development/contribute_to_paddle.html) + +## 帮助改进预览工具 + +我们非常欢迎您对平台和支持内容的各个方面做出贡献,以便更好地呈现这些内容。您可以Fork或Clone这个存储库,或者提出问题并提供反馈,以及在issues上提交bug信息。详细内容请参考[开发指南](https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/DEVELOPING.md)。 + +## 版权和许可 +PaddlePaddle.org在Apache-2.0的许可下提供。 diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..70406b0f440aac51a045494a837aab5d7bd57e87 --- /dev/null +++ b/doc/v2/dev/write_docs_cn.rst @@ -0,0 +1,136 @@ +############# +如何贡献文档 +############# + +PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成的,PaddlePaddle.org工具可以帮助我们实现这一编译过程,并提供更好的预览效果。 + +如何构建文档 +============ + +PaddlePaddle的文档构建有两种方式,分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具,两种方式都有各自的优点,前者方便预览,后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。 + +我们建议使用PaddlePaddle.org工具来构建文档。 + +使用PaddlePaddle.org工具 +------------------------ +这个是目前推荐的使用方法。除了可以自动编译文档,还可以直接在网页中预览文档,需要注意的是,采用后续说明的其它方式虽然也可以预览文档,但是文档的样式与官网文档是不一致的,使用PaddlePaddle.org工具进行编译才能产生与官网文档样式一致的预览效果。 + +PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后即可用以下命令启动工具 + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + + # Please specify the working directory through -v + docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest + +注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令 +之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档 +编译后的文件将被存储在工作目录 /.ppo_workspace/content。 + +如果不想使用Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。 + +.. 
code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories and PaddlePaddle.org + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git + + # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd + export CONTENT_DIR= + export ENV='' + cd PaddlePaddle.org/portal/ + pip install -r requirements.txt + python manage.py runserver + +工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。 +之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。 +编译后的文件将被存储在工作目录 /.ppo_workspace/content。 + +想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 `_ 。 + +不使用PaddlePaddle.org工具 +-------------------------- + +使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。该方法与 `从源码编译PaddlePaddle `_ 相似,通过从源码中构建可用于编译PaddlePaddle文档的Docker镜像并运行,在进入Docker容器后使用源码中的脚本构建PaddlePaddle文档,具体步骤如下: + +.. code-block:: bash + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + + # 从源码中构建可用于编译PaddlePaddle文档的Docker镜像 + docker build -t paddle:dev . + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash + + # 进入Docker容器后使用build.sh脚本构建PaddlePaddle文档 + bash -x /paddle/paddle/scripts/docker/build.sh + +注:上述命令把当前目录(源码根目录)映射为 container 里的 :code:`/paddle` 目录。 + +编译完成后,会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 、 ``api/en/html`` 共三个子目录,分别进入这些目录下,执行以下命令: + +.. code-block:: bash + + python -m SimpleHTTPServer 8088 + +在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。 + +如果不想使用Docker,也可以使用以下命令直接构建PaddlePaddle文档,即 + +.. code-block:: bash + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + mkdir -p build + cd build + cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON + + # 如果只需要构建使用文档,则执行以下命令 + make -j $processors paddle_docs + + # 如果只需要构建API,则执行以下命令 + make -j $processors paddle_apis + +其中$processors代表启动和CPU核一样多的进程来并行编译,可以根据本机的CPU核数设置相应的值。 + +编译完成后,同样会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录,选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录,分别进入这些子目录下,执行以下命令: + +.. code-block:: bash + + python -m SimpleHTTPServer 8088 + +在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。下图为生成的 ``v2`` 英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。 + +.. image:: https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/v2/dev/src/doc_en.png + :align: center + :scale: 60 % + +如何书写文档 +============ + +PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程进行书写。 + +如何更新www.paddlepaddle.org +============================ + +更新的文档以PR的形式提交到github中,提交方式参见 `如何贡献文档 `_ 。 +目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 +`英文文档 `_ 。 + + +.. _cmake: https://cmake.org/ +.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/v2/dev/write_docs_en.rst b/doc/v2/dev/write_docs_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..6105455e202e4704aa25f0fd9916b9b61a569702 --- /dev/null +++ b/doc/v2/dev/write_docs_en.rst @@ -0,0 +1,139 @@ +######################## +Contribute Documentation +######################## + +PaddlePaddle's documentation includes both Chinese and English versions. 
The documentation is built using the ``cmake`` command to drive the ``sphinx`` compiler. The PaddlePaddle.org tool helps us to implement this compilation process and provides better preview results. + +How to build Documentation +=========================== + +PaddlePaddle's documentation is built in two ways: using the PaddlePaddle.org tool and without using it. Both methods have their own advantages. The former facilitates previewing, while the latter facilitates debugging by the developer. We could choose to build the documentation with Docker or without it in each of the above ways. + +We recommend using PaddlePaddle.org tool to build documentation. + +Using PaddlePaddle.org tool +----------------------------- +This is the recommended method to build documentation, because it can automatically compile the documentation and preview the documentation directly in a web page. Note that, although you can preview the documentation in other ways, its style may not be consistent with the official website. Compiling with the PaddlePaddle.org tool produces a preview that will be consistent with the official website documentation style. + +The PaddlePaddle.org tool can be used with Docker and Docker needs to be installed first. Please refer to `Docker's official website `_ on how to install Docker. After installing Docker, you may use the following commands to activate the tool + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories. You may only clone the contents you need + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + + # Please specify the working directory through -v + docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest + +Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run commands +Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation. +The compiled documentations will be stored in /.ppo_workspace/content + + +If you don't wish to use Docker, you can also activate the tool through Django. Use the following the commands to set up + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories and PaddlePaddle.org + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git + + # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd + export CONTENT_DIR= + export ENV='' + cd PaddlePaddle.org/portal/ + pip install -r requirements.txt + python manage.py runserver + +Specify the PaddlePaddle working directory for the environment variable CONTENT_DIR so that the tool could find where the working directory is. + +Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation +The compiled documentations will be stored in /.ppo_workspace/content + +Please `click here `_ for more information about the PaddlePaddle.org tool. 
+
+
+Manually Building the Documentation
+-------------------------------------
+
+To build PaddlePaddle's documentation with Docker, you need to install Docker first. Please refer to `Docker's official website `_ on how to install Docker. This method is quite similar to `Build From Sources `_ : a Docker image that can build the PaddlePaddle documentation is constructed from the source code; you then enter the Docker container and use the script ``build.sh`` in the source directory to build the documentation. The specific steps are as follows:
+
+.. code-block:: bash
+
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    cd Paddle
+
+    # Construct a docker image from source code
+    docker build -t paddle:dev .
+    docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+    # Use build.sh to build PaddlePaddle documentation
+    bash -x /paddle/paddle/scripts/docker/build.sh
+
+Note: The above commands map the current directory (the source root directory) to the :code:`/paddle` directory in the container.
+
+After compiling, two directories are generated, ``doc/v2`` and ``doc/fluid``, each containing the three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html``. Enter each of these subdirectories and execute the following command:
+
+.. code-block:: bash
+
+    python -m SimpleHTTPServer 8088
+
+Use a web browser and navigate to http://localhost:8088 to see the compiled Chinese/English documentation pages and the English API pages for both the ``v2`` and ``fluid`` versions.
+
+If you do not wish to use Docker, you can also use the following commands to build the PaddlePaddle documentation directly:
+
+.. code-block:: bash
+
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    cd Paddle
+    mkdir -p build
+    cd build
+    cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+
+    # If you only need to build the documentation, use the following command
+    make -j $processors paddle_docs
+
+    # If you only need to build the APIs, use the following command
+    make -j $processors paddle_apis
+
+Here $processors is the number of parallel compilation jobs; set it according to the number of CPU cores of your machine.
+
+After compiling, the same two directories ``doc/v2`` and ``doc/fluid`` are generated. If you chose to build the documentation, the subdirectories ``cn/html/`` and ``en/html`` are generated in both directories; if you chose to build the APIs, the subdirectory ``api/en/html`` is generated. Enter each of these subdirectories and execute the following command:
+
+.. code-block:: bash
+
+    python -m SimpleHTTPServer 8088
+
+Use a web browser and navigate to http://localhost:8088 to see the compiled Chinese/English documentation pages and the English API pages for both the ``v2`` and ``fluid`` versions. The following figure shows the home page of the built ``v2`` English documentation as an example. Note that because the example uses sphinx's original theme, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
+
+.. image:: src/doc_en.png
+   :align: center
+   :scale: 60 %
+
+How to write Documentation
+===========================
+
+PaddlePaddle uses `sphinx`_ to compile the documentation. Please check the sphinx official website for more detail.
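+For readers new to sphinx, the fragment below shows what a typical reStructuredText source page looks like. It is a hypothetical, minimal example written for illustration only; the title, anchor name and referenced identifiers are made up and do not come from the PaddlePaddle source tree.
+
+.. code-block:: rst
+
+    Example Page Title
+    ==================
+
+    A short paragraph introducing the topic. Inline code uses a role such as
+    :code:`paddle.layer.fc`, while longer samples go into a
+    ``.. code-block:: python`` directive.
+
+    .. _example_anchor:
+
+    A Subsection
+    ------------
+
+    Other pages can cross-reference this section with :ref:`example_anchor`.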
+ +How to update www.paddlepaddle.org +=================================== + +Please create PRs and submit them to github, please check `Contribute Code `_ 。 +PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs `_ and +`English Docs `_ 。 + +.. _cmake: https://cmake.org/ +.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/v2/faq/build_and_install/index_cn.rst b/doc/v2/faq/build_and_install/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..0d644777287aea0a572adb6fa40f498f9c147af7 --- /dev/null +++ b/doc/v2/faq/build_and_install/index_cn.rst @@ -0,0 +1,224 @@ +.. _install_faq: + +################### +编译安装与单元测试 +################### + +.. contents:: + +1. 运行Docker GPU镜像出现 "CUDA driver version is insufficient" +---------------------------------------------------------------- + +用户在使用PaddlePaddle GPU的Docker镜像的时候,常常出现 `Cuda Error: CUDA driver version is insufficient for CUDA runtime version`, 原因在于没有把机器上CUDA相关的驱动和库映射到容器内部。 +具体的解决方法是: + +.. code-block:: bash + + $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu + +更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 `_ 。 + + +2. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致 +---------------------------------------------------------------- + +这是目前CMake寻找Python的逻辑存在缺陷,如果系统安装了多个Python版本,CMake找到的Python库和Python解释器版本可能有不一致现象,导致编译PaddlePaddle失败。正确的解决方法是, +用户强制指定特定的Python版本,具体操作如下: + + .. code-block:: bash + + cmake .. -DPYTHON_EXECUTABLE= -DPYTHON_LIBRARY= -DPYTHON_INCLUDE_DIR= + +用户需要指定本机上Python的路径:````, ````, ```` + +3. CMake源码编译,Paddle版本号为0.0.0 +-------------------------------------- + +如果运行 :code:`paddle version`, 出现 :code:`PaddlePaddle 0.0.0`;或者运行 :code:`cmake ..`,出现 + +.. code-block:: bash + + CMake Warning at cmake/version.cmake:20 (message): + Cannot add paddle version from git tag + +那么用户需要拉取所有的远程分支到本机,命令为 :code:`git fetch upstream`,然后重新cmake即可。 + +4. paddlepaddle\*.whl is not a supported wheel on this platform. +------------------------------------------------------------------------ + +出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。最新的paddlepaddle python安装包支持Linux x86_64和MacOS 10.12操作系统,并安装了python 2.7和pip 9.0.1。 + +更新 :code:`pip` 包的方法是\: + +.. code-block:: bash + + pip install --upgrade pip + +如果还不行,可以执行 :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` 获取当前系统支持的python包的后缀, +并对比是否和正在安装的后缀一致。 + +如果系统支持的是 :code:`linux_x86_64` 而安装包是 :code:`manylinux1_x86_64` ,需要升级pip版本到最新; +如果系统支持 :code:`manylinux1_x86_64` 而安装包(本地)是 :code:`linux_x86_64` ,可以重命名这个whl包为 :code:`manylinux1_x86_64` 再安装。 + +5. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2 +------------------------------------------------------------------------------------------ +先查看一下是否曾经安装过paddle v1版本,有的话需要先卸载: + +pip uninstall py_paddle paddle + +然后安装paddle的python环境, 在build目录下执行 + +pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl + +6. 遇到“非法指令”或者是“illegal instruction” +-------------------------------------------- + +PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二进制发行版可能会导致这种错误,请选择正确的版本。 + +7. python相关的单元测试都过不了 +-------------------------------- + +如果出现以下python相关的单元测试都过不了的情况: + +.. 
code-block:: bash + + 24 - test_PyDataProvider (Failed) + 26 - test_RecurrentGradientMachine (Failed) + 27 - test_NetworkCompare (Failed) + 28 - test_PyDataProvider2 (Failed) + 32 - test_Prediction (Failed) + 33 - test_Compare (Failed) + 34 - test_Trainer (Failed) + 35 - test_TrainerOnePass (Failed) + 36 - test_CompareTwoNets (Failed) + 37 - test_CompareTwoOpts (Failed) + 38 - test_CompareSparse (Failed) + 39 - test_recurrent_machine_generation (Failed) + 40 - test_PyDataProviderWrapper (Failed) + 41 - test_config_parser (Failed) + 42 - test_swig_api (Failed) + 43 - layers_test (Failed) + +并且查询PaddlePaddle单元测试的日志,提示: + +.. code-block:: bash + + paddle package is already in your PYTHONPATH. But unittest need a clean environment. + Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'. + +解决办法是: + +* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包,使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面,单元测试会引用site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。同时,即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。 + +8. 下载MKLML库失败 +------------------ + +.. code-block:: bash + + make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] 错误 4 + make[1]: *** [CMakeFiles/extern_mklml.dir/all] 错误 2 + make[1]: *** 正在等待未完成的任务.... + +原因:网速或SSL链接原因,导致MKLML库下载不成功。 + +解决办法是:手动下载并安装,具体步骤如下。 + +.. code-block:: bash + + // 1. 进入对应的目录 + cd build/third_party/mklml/src/extern_mklml + + // 2. 查看包的大小, 正常情况下是75M,如果小于75M,即下载失败: + du -sh mklml_lnx_2018.0.1.20171007.tgz + + // 3. 手动下载且解压缩,并手动生成download成功标签: + wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz + tar zxf mklml_lnx_2018.0.1.20171007.tgz + touch ../extern_mklml-stamp/extern_mklml-download + + // 4. 接着编译即可 + +9. 在Mac上无法安装numpy等Python包,权限错误 +------------------ + +Mac上对自带的Python和包有严格的权限保护,最好不要在自带的Python上安装。建议用virtualenv建立一个新的Python环境来操作。 + +virtualenv的基本原理是将机器上的Python运行所需的运行环境完整地拷贝一份。我们可以在一台机器上制造多份拷贝,并在这多个拷贝之间自由切换,这样就相当于在一台机器上拥有了多个相互隔离、互不干扰的Python环境。 + +下面简单介绍下如何用virtualenv为Paddle生成一个专用的Python环境: + +安装virtualenv: +:::::::::::::::: + +virtualenv本身也是Python的一个包,可以用pip进行安装: + +.. code-block:: bash + + sudo -H pip install virtualenv + +由于virtualenv需要安装给系统自带的Python,因此需要使用sudo权限。 + +创建一个新的Python运行环境: +::::::::::::::::::: + +.. code-block:: bash + + virtualenv --no-site-packages paddle + +--no-site-packages 参数表示不拷贝已有的任何第三方包,创造一个完全干净的新Python环境。后面的paddle是我们为这个新创建的环境取的名字。 + +执行完这一步后,当前目录下应该会出现一个名为paddle(或者你取的其他名字)的目录。这个目录里保存了运行一个Python环境所需要的各种文件。 + +启动运行环境: +:::::::::::::::: + +.. code-block:: bash + + source paddle/bin/activate + +执行后会发现命令提示符前面增加了(paddle)字样,说明已经成功启动了名为‘paddle’的Python环境。执行which python,可以发现使用的已经是刚刚创建的paddle目录下的Python。 + +在这个环境中,我们可以自由地进行Paddle的安装、使用和开发工作,无需担心对系统自带Python的影响。 + +退出运行环境: +::::::::::::::: + +直接执行: + +.. code-block:: bash + + deactivate + +可以看到命令提示符前面的(paddle)字样消失。 + +自动启动某一Python环境: +:::::::::::::::: + +如果我们经常使用Paddle,我们每次打开终端后都需要执行一下source paddle/bin/activate来启动环境,比较繁琐。为了简便,可以修改终端的配置文件,来让终端每次启动后自动启动特定的Python环境。 + +执行: + +.. code-block:: bash + + vi ~/.bash_profile + +打开终端配置文件,并在文件的最后添加一行: + +.. code-block:: bash + + source paddle/bin/activate + +保存并关闭文件。 + +这样,每次打开终端时就会自动启动名为‘paddle’的Python环境了。 + +10. 
通过pip安装的PaddlePaddle在 :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so` +------------------------------------------------------------------------------------------ +出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`, +但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` +拷贝到 :code:`/usr/local/lib` 路径下,所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下, +即: :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。 + +**注意**:如果是在虚拟环境中安装PaddlePaddle, :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。 \ No newline at end of file diff --git a/doc/v2/faq/build_and_install/index_en.rst b/doc/v2/faq/build_and_install/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..7488ed8137d57785f36b9f1e1ed1269f864960bc --- /dev/null +++ b/doc/v2/faq/build_and_install/index_en.rst @@ -0,0 +1,143 @@ +.. _install_faq: + +############################### +Compile, Install, and Unit Test +############################### + +.. contents:: + +1. Insufficient CUDA driver version +---------------------------------------------------------------- + +Many users usually face issues like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The cause is that you may not map the local CUDA driver to a container directory. +You can solve the issue by running the following commands: + +.. code-block:: bash + + $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu + +For more infomation about Docker's installation and usage, please refer to `PaddlePaddle Docker documentation `_ . + + +2. Version mismatch between PythonLibs and PythonInterpreter +---------------------------------------------------------------- + +It is a common bug when CMake looks up Python. If you install multiple versions of Python, Cmake may find the version mismatch between PythonLibs and PythonInterpreter . You are forced to specify a Python version, as follows. + + .. code-block:: bash + + cmake .. -DPYTHON_EXECUTABLE= -DPYTHON_LIBRARY= -DPYTHON_INCLUDE_DIR= + +You should specify ````, ````, ```` to your local paths. + +3. PaddlePaddle version is 0.0.0 +------------------------------------------------ +This issue would happen when you run the code `paddle version` or `cmake ..` + +.. code-block:: bash + + CMake Warning at cmake/version.cmake:20 (message): + Cannot add paddle version from git tag + +You should pull all remote branches to your local machine with the command :code:`git fetch upstream` and then run :code:`cmake` + +4. paddlepaddle\*.whl is not a supported wheel on this platform. +------------------------------------------------------------------------ + +The primary cause for this issue is that it can not find the correct PaddlePaddle installation package that matches your current system.The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12 os including Python2.7 and Pip 9.0.1. + +You can upgrade Pip with the following command\: + +.. 
code-block:: bash + + pip install --upgrade pip + +If it does not work for you, you can run the command :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to get the suffix of Python package which your system may support and then compare it with the suffix of your installation. + +If the system supports :code:`linux_x86_64` and the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest + +if the system supports :code:`manylinux_x86_64` and the local installation package is :code:`linux1_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again. + + +5. ImportError: No module named v2 +---------------------------------- +Please uninstall Paddle V1 if you have installed it before. + +.. code-block:: bash + + pip uninstall py_paddle paddle + +Then install Python for PaddlePaddle , enter the build directory and run the following commands + +pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl + +6. Illegal instruction +----------------------- +This issue may be caused by the wrong usage of PaddlePaddle binary version which uses avx SIMD instructions to increase the performance of cpu. Please choose the correct version. + +7. Python unittest fails +-------------------------------- + +If the following python unittest testcases fail: + +.. code-block:: bash + + 24 - test_PyDataProvider (Failed) + 26 - test_RecurrentGradientMachine (Failed) + 27 - test_NetworkCompare (Failed) + 28 - test_PyDataProvider2 (Failed) + 32 - test_Prediction (Failed) + 33 - test_Compare (Failed) + 34 - test_Trainer (Failed) + 35 - test_TrainerOnePass (Failed) + 36 - test_CompareTwoNets (Failed) + 37 - test_CompareTwoOpts (Failed) + 38 - test_CompareSparse (Failed) + 39 - test_recurrent_machine_generation (Failed) + 40 - test_PyDataProviderWrapper (Failed) + 41 - test_config_parser (Failed) + 42 - test_swig_api (Failed) + 43 - layers_test (Failed) + +Please check the PaddlePaddle unittest logs which may suggest the following: + +.. code-block:: bash + + paddle package is already in your PYTHONPATH. But unittest need a clean environment. + Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'. + +The solution is: + +* Remove old PaddlePaddle to make a clean environment for the unit tests. If PaddlePaddle package is already in Python's site-packages, unit tests would refer Python package in site-packages instead of Python package in the :code:`/python` directory of the source directory. Setting :code:`PYTHONPATH` to :code:`/python` is also useless because Python's search path would give the priority to the installed Python package. + + +8. Failed to download the MKLML library +---------------------------------------------- + +.. code-block:: bash + + make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4 + make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2 + make[1]: *** waiting for the unfinished jobs.... + +Cause: The network speed or SSL link causes the MKLML library to download unsuccessfully. + +The solution is: manually download and install, the specific steps are as follows. + +.. code-block:: bash + + // 1. enter the directory + cd build/third_party/mklml/src/extern_mklml + + // 2. check the size of the package, normally 75M, if less than 75M, the download fails + du -sh mklml_lnx_2018.0.1.20171007.tgz + + // 3. 
manually download and unzip and make the download success tag: + wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz + tar zxf mklml_lnx_2018.0.1.20171007.tgz + touch ../extern_mklml-stamp/extern_mklml-download + + // 4. then compile + diff --git a/doc/v2/faq/cluster/index_cn.rst b/doc/v2/faq/cluster/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..e59c1e1a54a0c876d1e6e89f88030de59fb9fc1a --- /dev/null +++ b/doc/v2/faq/cluster/index_cn.rst @@ -0,0 +1,17 @@ +############### +集群训练与预测 +############### + +.. contents:: + +1. 集群多节点训练,日志中保存均为网络通信类错误 +------------------------------------------------ + +集群多节点训练,日志报错为网络通信类错误,比如 :code:`Connection reset by peer` 等。 +此类报错通常是由于某一个节点的错误导致这个节点的训练进程退出,从而引发其他节点无法连接导致,可以参考下面的步骤排查: + +* 从 :code:`train.log` , :code:`server.log` 找到最早报错的地方,查看是否是其他错误引发的报错(比如FPE,内存不足,磁盘空间不足等)。 + +* 如果发现最早的报错就是网络通信的问题,很有可能是非独占方式执行导致的端口冲突,可以联系OP,看当前MPI集群是否支持resource=full参数提交,如果支持增加此参数提交,并更换job 端口。 + +* 如果当前MPI集群并不支持任务独占模式,可以联系OP是否可以更换集群或升级当前集群。 diff --git a/doc/v2/faq/cluster/index_en.rst b/doc/v2/faq/cluster/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..fa942a09625bef78b28456beeb735272b686e061 --- /dev/null +++ b/doc/v2/faq/cluster/index_en.rst @@ -0,0 +1,16 @@ +############################### +Cluster Training and Prediction +############################### + +.. contents:: + +1. Network connection errors in the log during multi-node cluster training +------------------------------------------------ +There are maybe some errors in the log belonging to network connection problem during multi-node cluster training, for example, :code:`Connection reset by peer`. +This kind of error is usually caused by the abnormal exit of a training process in some node, and the other nodes cannot connect with this node any longer. Steps to troubleshoot the problem are as follows: + +* Find the first error in the :code:`train.log`, :code:`server.log`, check whether other fault casued the problem, such as FPE, lacking of memory or disk. + +* If the first error in server.log says "Address already used", this may be caused by the port conflict of the non-exclusive execution. Connect the sys-admin to check if the current MPI cluster supports jobs submitted with parameter :code:`resource=full`. If the current MPI cluster does not support this parameter, change the server port and try agian. + +* If the current MPI cluster does not support exclusive pattern which allows a process to occupy the whole node, ask the administrator to replace or update the this cluster. diff --git a/doc/v2/faq/index_cn.rst b/doc/v2/faq/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..4537c7a481e2efbcfed5fa7be2c81c36e13cd108 --- /dev/null +++ b/doc/v2/faq/index_cn.rst @@ -0,0 +1,13 @@ +FAQ +==== + +本文档对关于PaddlePaddle的一些常见问题提供了解答。如果您的问题未在此处,请您到 `PaddlePaddle社区 `_ 查找答案或直接提 `issue `_ ,我们会及时进行回复。 + +.. toctree:: + :maxdepth: 1 + + build_and_install/index_cn.rst + model/index_cn.rst + parameter/index_cn.rst + local/index_cn.rst + cluster/index_cn.rst diff --git a/doc/v2/faq/index_en.rst b/doc/v2/faq/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..3fa220792b252617848a1c76bc2be49928e35f64 --- /dev/null +++ b/doc/v2/faq/index_en.rst @@ -0,0 +1,13 @@ +FAQ +==== + +This document provides answers to some of the frequently asked questions about PaddlePaddle. 
If you have a question that is not covered here, please go to `PaddlePaddle Community `_ , to find an answer or submit new `issue `_ , we will reply in time. + +.. toctree:: + :maxdepth: 1 + + build_and_install/index_en.rst + model/index_en.rst + parameter/index_en.rst + local/index_en.rst + cluster/index_en.rst diff --git a/doc/v2/faq/local/index_cn.rst b/doc/v2/faq/local/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..c6d3c5bfac5a276e253c248ffd415c7789b20b29 --- /dev/null +++ b/doc/v2/faq/local/index_cn.rst @@ -0,0 +1,259 @@ +############### +本地训练与预测 +############### + +.. contents:: + +1. 如何减少内存占用 +------------------- + +神经网络的训练本身是一个非常消耗内存和显存的工作,经常会消耗数10GB的内存和数GB的显存。 +PaddlePaddle的内存占用主要分为如下几个方面\: + +* DataProvider缓冲池内存(只针对内存) +* 神经元激活内存(针对内存和显存) +* 参数内存 (针对内存和显存) +* 其他内存杂项 + +其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等,暂不考虑在内。 + +减少DataProvider缓冲池内存 +++++++++++++++++++++++++++ + +PyDataProvider使用的是异步加载,同时在内存里直接随即选取数据来做Shuffle。即 + +.. graphviz:: + + digraph { + rankdir=LR; + 数据文件 -> 内存池 -> PaddlePaddle训练 + } + +所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这 +个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的, +那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为 + +.. literalinclude:: src/reduce_min_pool_size.py + +这样做可以极大的减少内存占用,并且可能会加速训练过程,详细文档参考 :ref:`api_pydataprovider2` 。 + +神经元激活内存 +++++++++++++++ + +神经网络在训练的时候,会对每一个激活暂存一些数据,如神经元激活值等。 +在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系, +一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含 +的时间步信息成正比。 + +所以做法可以有两种: + +* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。 +* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200, + 但是突然有一个10000长的序列,就很容易导致内存超限,特别是在LSTM等RNN中。 + +参数内存 +++++++++ + +PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。 +例如使用 :code:`adadelta` 算法,则需要使用等于权重参数规模大约5倍的内存。举例,如果参数保存下来的模型目录 +文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。 + +可以考虑使用一些优化算法,例如 :code:`momentum`。 + +2. 如何加速训练速度 +------------------- + +加速PaddlePaddle训练可以考虑从以下几个方面\: + +* 减少数据载入的耗时 +* 加速训练速度 +* 利用分布式训练驾驭更多的计算资源 + +减少数据载入的耗时 +++++++++++++++++++ + +使用\ :code:`pydataprovider`\ 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。 +:code:`DataProvider` 缓存池的减小,和之前减小通过减小缓存池来减小内存占用的原理一致。 + +.. literalinclude:: src/reduce_min_pool_size.py + +同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。 + + +加速训练速度 +++++++++++++ + +PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任一一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True` + +这里使用简单的 :code:`word2vec` 训练语言模型距离,具体使用方法为\: + +使用一个词前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\: + +.. literalinclude:: src/word2vec_dataprovider.py + +这个任务的配置为\: + +.. literalinclude:: src/word2vec_config.py + + +利用更多的计算资源 +++++++++++++++++++ + +利用更多的计算资源可以分为以下几个方式来进行\: + +* 单机CPU训练 + + * 使用多线程训练。设置命令行参数 :code:`trainer_count`。 + +* 单机GPU训练 + + * 使用显卡训练。设置命令行参数 :code:`use_gpu`。 + * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。 + +* 多机训练 + + * 请参考 :ref:`cluster_train` 。 + +3. 如何指定GPU设备 +------------------ + +例如机器上有4块GPU,编号从0开始,指定使用2、3号GPU: + +* 方式1:通过 `CUDA_VISIBLE_DEVICES `_ 环境变量来指定特定的GPU。 + +.. code-block:: bash + + env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2 + +* 方式2:通过命令行参数 ``--gpu_id`` 指定。 + +.. 
code-block:: bash + + paddle train --use_gpu=true --trainer_count=2 --gpu_id=2 + + +4. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办? +------------------------------------------------------------------------ + +Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异常(即训练过程中出现NaN或者Inf),立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。 +主要原因包括两个方面: + +* 训练过程中参数或者训练过程中的梯度尺度过大,导致参数累加,乘除等时候,导致了浮点数溢出。 +* 模型一直不收敛,发散到了一个数值特别大的地方。 +* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。 + +这里有两种有效的解决方法: + +1. 设置 :code:`gradient_clipping_threshold` 参数,示例代码如下: + +.. code-block:: python + + optimizer = paddle.optimizer.RMSProp( + learning_rate=1e-3, + gradient_clipping_threshold=10.0, + regularization=paddle.optimizer.L2Regularization(rate=8e-4)) + +具体可以参考 `nmt_without_attention `_ 示例。 + +2. 设置 :code:`error_clipping_threshold` 参数,示例代码如下: + +.. code-block:: python + + decoder_inputs = paddle.layer.fc( + act=paddle.activation.Linear(), + size=decoder_size * 3, + bias_attr=False, + input=[context, current_word], + layer_attr=paddle.attr.ExtraLayerAttribute( + error_clipping_threshold=100.0)) + +完整代码可以参考示例 `machine translation `_ 。 + +两种方法的区别: + +1. 两者都是对梯度的截断,但截断时机不同,前者在 :code:`optimzier` 更新网络参数时应用;后者在激活函数反向计算时被调用; +2. 截断对象不同:前者截断可学习参数的梯度,后者截断回传给前层的梯度; + +除此之外,还可以通过减小学习率或者对数据进行归一化处理来解决这类问题。 + +5. 如何调用 infer 接口输出多个layer的预测结果 +----------------------------------------------- + +* 将需要输出的层作为 :code:`paddle.inference.Inference()` 接口的 :code:`output_layer` 参数输入,代码如下: + +.. code-block:: python + + inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters) + +* 指定要输出的字段进行输出。以输出 :code:`value` 字段为例,代码如下: + +.. code-block:: python + + out = inferer.infer(input=data_batch, field=["value"]) + +需要注意的是: + +* 如果指定了2个layer作为输出层,实际上需要的输出结果是两个矩阵; +* 假设第一个layer的输出A是一个 N1 * M1 的矩阵,第二个 Layer 的输出B是一个 N2 * M2 的矩阵; +* paddle.v2 默认会将A和B 横向拼接,当N1 和 N2 大小不一样时,会报如下的错误: + +.. code-block:: python + + ValueError: all the input array dimensions except for the concatenation axis must match exactly + +多个层的输出矩阵的高度不一致导致拼接失败,这种情况常常发生在: + +* 同时输出序列层和非序列层; +* 多个输出层处理多个不同长度的序列; + +此时可以在调用infer接口时通过设置 :code:`flatten_result=False` , 跳过“拼接”步骤,来解决上面的问题。这时,infer接口的返回值是一个python list: + +* list 中元素的个数等于网络中输出层的个数; +* list 中每个元素是一个layer的输出结果矩阵,类型是numpy的ndarray; +* 每一个layer输出矩阵的高度,在非序列输入时:等于样本数;序列输入时等于:输入序列中元素的总数;宽度等于配置中layer的size; + +6. 如何在训练过程中获得某一个layer的output +----------------------------------------------- + +可以在event_handler中,通过 :code:`event.gm.getLayerOutputs("layer_name")` 获得在模型配置中某一层的name :code:`layer_name` 在当前 +mini-batch forward的output的值。获得的值类型均为 :code:`numpy.ndarray` ,可以通过这个输出来完成自定义的评估指标计算等功能。例如下面代码: + +.. code-block:: python + + def score_diff(right_score, left_score): + return np.average(np.abs(right_score - left_score)) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 25 == 0: + diff = score_diff( + event.gm.getLayerOutputs("right_score")["right_score"][ + "value"], + event.gm.getLayerOutputs("left_score")["left_score"][ + "value"]) + logger.info(("Pass %d Batch %d : Cost %.6f, " + "average absolute diff scores: %.6f") % + (event.pass_id, event.batch_id, event.cost, diff)) + +注意:此方法不能获取 :code:`paddle.layer.recurrent_group` 里step的内容,但可以获取 :code:`paddle.layer.recurrent_group` 的输出。 + +7. 如何在训练过程中获得参数的权重和梯度 +----------------------------------------------- + +在某些情况下,获得当前mini-batch的权重(或称作weights, parameters)有助于在训练时观察具体数值,方便排查以及快速定位问题。 +可以通过在 :code:`event_handler` 中打印其值(注意,需要使用 :code:`paddle.event.EndForwardBackward` 保证使用GPU训练时也可以获得), +示例代码如下: + +.. code-block:: python + + ... 
+ parameters = paddle.parameters.create(cost) + ... + def event_handler(event): + if isinstance(event, paddle.event.EndForwardBackward): + if event.batch_id % 25 == 0: + for p in parameters.keys(): + logger.info("Param %s, Grad %s", + parameters.get(p), parameters.get_grad(p)) + +注意:“在训练过程中获得某一个layer的output”和“在训练过程中获得参数的权重和梯度”都会造成训练中的数据从C++拷贝到numpy,会对训练性能造成影响。不要在注重性能的训练场景下使用。 \ No newline at end of file diff --git a/doc/v2/faq/local/index_en.rst b/doc/v2/faq/local/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..fa95b1753dbe293811d7a8601497ad521fa3ecda --- /dev/null +++ b/doc/v2/faq/local/index_en.rst @@ -0,0 +1,248 @@ +############################# +Parameter Setting +############################# + +.. contents:: + +1. Reduce Memory Consumption +------------------- + +The training procedure of neural networks demands dozens of gigabytes of host memory or serval gigabytes of device memory, which is a rather memory consuming work. The memory consumed by PaddlePaddle framework mainly includes: +\: + +* Cache memory for DataProvider (only on host memory), +* Memory for neurons' activation information (on both host memory and device memory), +* Memory for parameters (on both host memory and device memory), +* Other memory demands. + +Other memory demands is mainly used to support the running demand of PaddlePaddle framework itself, such as string allocation,temporary variables, which are not considered currently. + +Reduce DataProvider Cache Memory +++++++++++++++++++++++++++ + +PyDataProvider works under asynchronous mechanism, it loads together with the data fetch and shuffle procedure in host memory: + +.. graphviz:: + + digraph { + rankdir=LR; + Data Files -> Host Memory Pool -> PaddlePaddle Training + } + +Thus the reduction of the DataProvider cache memory can reduce memory occupancy, meanwhile speed up the data loading procedure before training. However, the size of the memory pool can actually affect the granularity of shuffle,which means a shuffle operation is needed before each data file reading process to ensure the randomness of data when try to reduce the size of the memory pool. + +.. literalinclude:: src/reduce_min_pool_size.py + +In this way, the memory consumption can be significantly reduced and hence the training procedure can be accelerated. More details are demonstrated in :ref:`api_pydataprovider2`. + +The Neurons Activation Memory +++++++++++++++ + +Each neuron activation operating in a neural network training process contains certain amount of temporary data such as the activation data (like the output value of a neuron). These data will be used to update parameters in back propagation period. The scale of memory consumed by these data is mainly related with two parameters, which are batch size and the length of each Sequence. Therefore, the neurons activation memory consuming is actually in proportion to the information contains in each mini-batch training. + +Two practical ways: + +* Reduce batch size. Set a smaller value in network configuration settings(batch_size=1000) can be helpful. But setting batch size to a smaller value may affect the training result due to it is a super parameter of the neural network itself. +* Shorten the sequence length or cut off those excessively long sequences. For example, if the length of sequences in a dataset are mostly varies between 100 and 200, but there is sequence lengthen out to 10,000, then it’s quite potentially leads to OOM (out of memory), especially in RNN models such as LSTM. 
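+Both suggestions amount to very small changes in practice. The sketch below is hypothetical: part (a) shows the ``settings(batch_size=...)`` call mentioned above with a smaller value, and part (b) is a data provider in the placeholder style of ``reduce_min_pool_size.py`` above (``DICT_DIM``, ``MAX_LEN`` and ``get_sample_from_line`` are placeholders, and input lines are assumed to be whitespace-tokenized).
+
+.. code-block:: python
+
+    # (a) Trainer configuration: a smaller batch_size directly lowers the
+    #     activation memory consumed by each mini-batch.
+    settings(
+        batch_size=256,        # e.g. reduced from 1000
+        learning_rate=1e-3)
+
+    # (b) Data provider: skip excessively long sequences so that a single
+    #     outlier cannot blow up the activation memory. DICT_DIM, MAX_LEN and
+    #     get_sample_from_line are placeholders in the style of the snippets above.
+    DICT_DIM = 3000
+    MAX_LEN = 500
+
+    @provider(min_pool_size=0, input_types=[integer_sequence(DICT_DIM)])
+    def process(settings, filename):
+        with open(filename, 'r') as f:
+            for line in f:
+                if len(line.split()) > MAX_LEN:   # drop over-long sequences
+                    continue
+                yield get_sample_from_line(line)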
+ +The Parameters Memory +++++++++ + +The PaddlePaddle framework supports almost all popular optimizers. Different optimizers have different memory requirement. For example, the :code:`adadelta` consumes approximately 5 times memory + +space than the weights parameter’s scale, which means the :code:`adadelta` needs at least :code:`500M` memory if the model file contains all + +parameters needs :code:`100M`. + +Some optimization algorithms such as :code:`momentum` are worth giving a shot. + +2. Tricks To Speed Up Training +------------------- + +The training procedure of PaddlePaddle may be speed up when considering following aspects:\: + +* Reduce the time consumption of data loading +* Speed up training epochs +* Introduce more computing resources with the utilization of distribute training frameworks + +Reduce The Time Consumption of Data Loading +++++++++++++++++++ + + +The \ :code:`pydataprovider`\ holds big potential to speed up the data loading procedure if the cache pool and enable memory cache when use it. The principle of the reduction of :code:`DataProvider` cache pool is basically the same with the method which reduct the memory occupation with the set of a smaller cache pool. + +.. literalinclude:: src/reduce_min_pool_size.py + +Beside, the interface :code:`@provider` provides a parameter :code:`cache` to control cache. If set it to :code:`CacheType.CACHE_PASS_IN_MEM`, the data after the first :code:`pass` ( a pass means all data have be fed into the network for training) will be cached in memory and no new data will be read from the :code:`python` side in following :code:`pass` , instead from the cached data in memory. This strategy can also drop the time consuming in data loading process. + + +Accelerating Training Epochs +++++++++++++ + +Sparse training is supported in PaddlePaddle. The features needs to be trained is any of :code:`sparse_binary_vector`, :code:`sparse_vector` and :code:`integer_value` . Meanwhile, the Layer interacts with the training data need to turn the Parameter to sparse updating mode by setting :code:`sparse_update=True`. +Take :code:`word2vec` as an example, to train a language distance, one needs to predict the middle word with two words prior to it and next to it. The DataProvider of this task is: + +.. literalinclude:: src/word2vec_dataprovider.py + +The configuration of this task is: + +.. literalinclude:: src/word2vec_config.py + +Introduce More Computing Resources +++++++++++++++++++ + +More computing resources can be introduced with following manners: +* Single CPU platform training + + * Use multi-threading by set :code:`trainer_count`。 + +* Single GPU platform training + + * Set :code:`use_gpu` to train on single GPU. + * Set :code:`use_gpu` and :code:`trainer_count` to enable multiple GPU training support. + +* Cluster Training + + * Refer to :ref:`cluster_train` 。 + +3. Assign GPU Devices +------------------ + +Assume a computing platform consists of 4 GPUs which serial number from 0 to 3: + +* Method1: specify a GPU as computing device by set: + `CUDA_VISIBLE_DEVICES `_ + +.. code-block:: bash + + env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2 + +* Method2: Assign by —gpu_id: + +.. code-block:: bash + + paddle train --use_gpu=true --trainer_count=2 --gpu_id=2 + + +4. How to Fix Training Termination Caused By :code:`Floating point exception` During Training. 
+------------------------------------------------------------------------ + +Paddle binary catches floating exceptions during runtime, it will be terminated when NaN or Inf occurs. Floating exceptions are mostly caused by float overflow, divide by zero. There are three main reasons may raise such exception: + +* Parameters or gradients during training are oversize, which leads to float overflow during calculation. +* The model failed to converge and diverges to a big value. +* Parameters may converge to a singular value due to bad training data. If the scale of input data is too big and contains millions of parameter values, float overflow error may arise when operating matrix multiplication. + +Two ways to solve this problem: + +1. Set :code:`gradient_clipping_threshold` as: + +.. code-block:: python + + optimizer = paddle.optimizer.RMSProp( + learning_rate=1e-3, + gradient_clipping_threshold=10.0, + regularization=paddle.optimizer.L2Regularization(rate=8e-4)) + +Details can refer to example `nmt_without_attention `_ 示例。 + +2. Set :code:`error_clipping_threshold` as: + +.. code-block:: python + + decoder_inputs = paddle.layer.fc( + act=paddle.activation.Linear(), + size=decoder_size * 3, + bias_attr=False, + input=[context, current_word], + layer_attr=paddle.attr.ExtraLayerAttribute( + error_clipping_threshold=100.0)) + +Details can refer to example `machine translation `_ 。 + +The main difference between these two methods are: + +1. They both block the gradient, but happen in different occasions,the former one happens when then :code:`optimzier` updates the network parameters while the latter happens when the back propagation computing of activation functions. +2. The block target are different, the former blocks the trainable parameters’ gradient while the later blocks the gradient to be propagated to prior layers. + +Moreover, Such problems may be fixed with smaller learning rates or data normalization. + +5. Fetch Multi Layers’ Prediction Result With Infer Interface +----------------------------------------------- + +* Join the layer to be used as :code:`output_layer` layer to the input parameters of :code:`paddle.inference.Inference()` interface with: + +.. code-block:: python + + inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters) + +* Assign certain fields to output. Take :code:`value` as example, it can be down with following code: + +.. code-block:: python + + out = inferer.infer(input=data_batch, field=["value"]) + +It is important to note that: + +* If 2 layers are assigned as output layer, then the output results consists of 2 matrixes. +* Assume the output of first layer A is a matrix sizes N1 * M1, the output of second layer B is a matrix sizes N2 * M2; +* By default, paddle.v2 will transverse join A and B, when N1 not equal to N2, it will raise following error: + +.. code-block:: python + + ValueError: all the input array dimensions except for the concatenation axis must match exactly + +The transverse of different matrixes of multi layers mainly happens when: + +* Output sequence layer and non sequence layer; +* Multiple output layers process multiple sequence with different length; + +Such issue can be avoided by calling infer interface and set :code:`flatten_result=False`. 
Thus, the infer interface returns a python list, in which + +* The number of elements equals to the number of output layers in the network; +* Each element in list is a result matrix of a layer, which type is numpy.ndarray; +* The height of each matrix outputted by each layer equals to the number of samples under non sequential mode or equals to the number of elements in the input sequence under sequential mode. Their width are both equal to the layer size in configuration. + +6. Fetch the Output of A Certain Layer During Training +----------------------------------------------- + +In event_handler, the interface :code:`event.gm.getLayerOutputs("layer_name")` gives the forward output value organized in :code:`numpy.ndarray` corresponding to :code:`layer_name` in the mini-batch. +The output can be used in custom measurements in following way: + +.. code-block:: python + + def score_diff(right_score, left_score): + return np.average(np.abs(right_score - left_score)) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 25 == 0: + diff = score_diff( + event.gm.getLayerOutputs("right_score")["right_score"][ + "value"], + event.gm.getLayerOutputs("left_score")["left_score"][ + "value"]) + logger.info(("Pass %d Batch %d : Cost %.6f, " + "average absolute diff scores: %.6f") % + (event.pass_id, event.batch_id, event.cost, diff)) + +Note: this function can not get content of :code:`paddle.layer.recurrent_group` step, but output of :code:`paddle.layer.recurrent_group` can be fetched. + +7. Fetch Parameters’ Weight and Gradient During Training +----------------------------------------------- + +Under certain situations, knowing the weights of currently training mini-batch can provide more inceptions of many problems. Their value can be acquired by printing values in :code:`event_handler` (note that to gain such parameters when training on GPU, you should set :code:`paddle.event.EndForwardBackward`). Detailed code is as following: + +.. code-block:: python + + ... + parameters = paddle.parameters.create(cost) + ... + def event_handler(event): + if isinstance(event, paddle.event.EndForwardBackward): + if event.batch_id % 25 == 0: + for p in parameters.keys(): + logger.info("Param %s, Grad %s", + parameters.get(p), parameters.get_grad(p)) + +Note that “acquire the output of a certain layer during training” or “acquire the weights and gradients of parameters during training ” both needs to copy training data from C++ environment to numpy, which have certain degree of influence on training performance. Don’t use these two functions when the training procedure cares about the performance. diff --git a/doc/v2/faq/local/src/reduce_min_pool_size.py b/doc/v2/faq/local/src/reduce_min_pool_size.py new file mode 100644 index 0000000000000000000000000000000000000000..cba96652f764d26c724ea22697e04572709bf6a4 --- /dev/null +++ b/doc/v2/faq/local/src/reduce_min_pool_size.py @@ -0,0 +1,21 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +@provider(min_pool_size=0, ...) +def process(settings, filename): + os.system('shuf %s > %s.shuf' % (filename, filename)) # shuffle before. + with open('%s.shuf' % filename, 'r') as f: + for line in f: + yield get_sample_from_line(line) diff --git a/doc/v2/faq/local/src/word2vec_config.py b/doc/v2/faq/local/src/word2vec_config.py new file mode 100644 index 0000000000000000000000000000000000000000..a5b84e8ed4de5123097026a5c7992b06fd321750 --- /dev/null +++ b/doc/v2/faq/local/src/word2vec_config.py @@ -0,0 +1,26 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +... # the settings and define data provider is omitted. +DICT_DIM = 3000 # dictionary dimension. +word_ids = data_layer('word_ids', size=DICT_DIM) + +emb = embedding_layer( + input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True)) +emb_sum = pooling_layer(input=emb, pooling_type=SumPooling()) +predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax()) +outputs( + classification_cost( + input=predict, label=data_layer( + 'label', size=DICT_DIM))) diff --git a/doc/v2/faq/local/src/word2vec_dataprovider.py b/doc/v2/faq/local/src/word2vec_dataprovider.py new file mode 100644 index 0000000000000000000000000000000000000000..9fe67b6d6cbbbdc8a98d497f352cf114a882636f --- /dev/null +++ b/doc/v2/faq/local/src/word2vec_dataprovider.py @@ -0,0 +1,24 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DICT_DIM = 3000 + + +@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)]) +def process(settings, filename): + with open(filename) as f: + # yield word ids to predict inner word id + # such as [28, 29, 10, 4], 4 + # It means the sentance is 28, 29, 4, 10, 4. + yield read_next_from_file(f) diff --git a/doc/v2/faq/model/index_cn.rst b/doc/v2/faq/model/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..6947948bc79f4dba63954c459afb940e3242c405 --- /dev/null +++ b/doc/v2/faq/model/index_cn.rst @@ -0,0 +1,80 @@ +######### +模型配置 +######### + +.. contents:: + +1. 出现 :code:`Duplicated layer name` 错误怎么办 +-------------------------------------------------- + +出现该错误的原因一般是用户对不同layer的参数 :code:`name` 设置了相同的取值。遇到该错误时,先找出参数 :code:`name` 取值相同的layer,然后将这些layer的参数 :code:`name` 设置为不同的值。 + +2. 
:code:`paddle.layer.memory` 的参数 :code:`name` 如何使用 +------------------------------------------------------------- + +* :code:`paddle.layer.memory` 用于获取特定layer上一时间步的输出,该layer是通过参数 :code:`name` 指定,即,:code:`paddle.layer.memory` 会关联参数 :code:`name` 取值相同的layer,并将该layer上一时间步的输出作为自身当前时间步的输出。 + +* PaddlePaddle的所有layer都有唯一的name,用户通过参数 :code:`name` 设定,当用户没有显式设定时,PaddlePaddle会自动设定。而 :code:`paddle.layer.memory` 不是真正的layer,其name由参数 :code:`memory_name` 设定,当用户没有显式设定时,PaddlePaddle会自动设定。:code:`paddle.layer.memory` 的参数 :code:`name` 用于指定其要关联的layer,需要用户显式设定。 + +3. 两种使用 drop_out 的方法有何区别 +------------------------------------ + +* 在PaddlePaddle中使用dropout有两种方式 + + * 在相应layer的 :code:`layer_atter` 设置 :code:`drop_rate`,以 :code:`paddle.layer.fc` 为例,代码如下: + + .. code-block:: python + + fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5)) + + * 使用 :code:`paddle.layer.dropout`,以 :code:`paddle.layer.fc` 为例,代码如下: + + .. code-block:: python + + fc = paddle.layer.fc(input=input) + drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5) + +* :code:`paddle.layer.dropout` 实际上使用了 :code:`paddle.layer.add_to`,并在该layer里采用第一种方式设置 :code:`drop_rate` 来使用dropout的。这种方式对内存消耗较大。 + +* PaddlePaddle在激活函数里实现dropout,而不是在layer里实现。 + +* :code:`paddle.layer.lstmemory`、:code:`paddle.layer.grumemory`、:code:`paddle.layer.recurrent` 不是通过一般的方式来实现对输出的激活,所以不能采用第一种方式在这几个layer里设置 :code:`drop_rate` 来使用dropout。若要对这几个layer使用dropout,可采用第二种方式,即使用 :code:`paddle.layer.dropout`。 + +4. 不同的 recurrent layer 的区别 +---------------------------------- +以LSTM为例,在PaddlePaddle中包含以下 recurrent layer: + +* :code:`paddle.layer.lstmemory` +* :code:`paddle.networks.simple_lstm` +* :code:`paddle.networks.lstmemory_group` +* :code:`paddle.networks.bidirectional_lstm` + +按照具体实现方式可以归纳为2类: + +1. 由 recurrent_group 实现的 recurrent layer: + + * 用户在使用这一类recurrent layer时,可以访问由recurrent unit在一个时间步内计算得到的中间值(例如:hidden states, memory cells等); + * 上述的 :code:`paddle.networks.lstmemory_group` 是这一类的 recurrent layer ; + +2. 将recurrent layer作为一个整体来实现: + + * 用户在使用这一类recurrent layer,只能访问它们的输出值; + * 上述的 :code:`paddle.networks.lstmemory_group` 、 :code:`paddle.networks.simple_lstm` 和 :code:`paddle.networks.bidirectional_lstm` 属于这一类的实现; + +将recurrent layer作为一个整体来实现, 能够针对CPU和GPU的计算做更多优化, 所以相比于recurrent group的实现方式, 第二类 recurrent layer 计算效率更高。 在实际应用中,如果用户不需要访问LSTM的中间变量,而只需要获得recurrent layer计算的输出,我们建议使用第二类实现。 + +此外,关于LSTM, PaddlePaddle中还包含 :code:`paddle.networks.lstmemory_unit` 这一计算单元: + + * 不同于上述介绍的recurrent layer , :code:`paddle.networks.lstmemory_unit` 定义了LSTM单元在一个时间步内的计算过程,它并不是一个完整的recurrent layer,也不能接收序列数据作为输入; + * :code:`paddle.networks.lstmemory_unit` 只能在recurrent_group中作为step function使用; + +5. PaddlePaddle的softmax能否指定计算的维度 +----------------------------------------- + +PaddlePaddle的softmax不能指定计算维度,只能按行计算。 +在图像任务中,对于NCHW,如果需要在C维度计算softmax,可以先使用 :code:`paddle.layer.switch_order` 改变维度顺序,即将NCHW转换成NHWC,再做一定的reshape,最后计算softmax。 + +6. PaddlePaddle是否支持维数可变的数据输入 +------------------------------------------ + +PaddlePaddle提供的 :code:`paddle.data_type.dense_array` 支持维数可变的数据输入。在使用时,将对应数据层的维数设置成一个大于输入数据维数的值用于占位即可。 diff --git a/doc/v2/faq/model/index_en.rst b/doc/v2/faq/model/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..67a33e08e192e5627ac3b0abd76e979f21ed2079 --- /dev/null +++ b/doc/v2/faq/model/index_en.rst @@ -0,0 +1,81 @@ +################### +Model Configuration +################### + +.. contents:: + +1. 
How to deal with error :code:`Duplicated layer name` +---------------------------------------------------------- + +The general reason for this error is that users may have set the same value for the attribute :code:`name` in different layers. Try to find out the :code:`name` attribute with the same value in diffrent layers and set them differently. + +2. How to use :code:`paddle.layer.memory`'s attribute :code:`name` +---------------------------------------------------------------------- + +* :code:`paddle.layer.memory` is used to get the output of a layer's last timestep and the layer is specified by the attribute :code:`name` . Thus, :code:`paddle.layer.memory` will associate with the layer that has the same value of attribute :code:`name` , and uses the output of the layer's last timestep as the input of its current timestep. + +* All the PaddlePaddle's layers have a unique name, which is set by the attribute :code:`name` . PaddlePaddle will automatically set it for the user when it is not explicitly set. :code:`paddle.layer.memory` is not a real layer, its name is set by the attribute :code:`memory_name` and PaddlePaddle will also automatically set it when the user does not explicitly set. The :code:`paddle.layer.memory` attribute :code:`name` is used to specify the layer it is associated with, and needs to be explicitly set by the user. + + +3. What is the difference between the two ways of using dropout +----------------------------------------------------------------- + +* There are two ways to use dropout in PaddlePaddle + + * Set the :code:`drop_rate` parameter in the layer's :code:`layer_atter` attribute. Take :code:`paddle.layer.fc` as an example: + + .. code-block:: python + + fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5)) + + * Use :code:`paddle.layer.dropout` layer. Take :code:`paddle.layer.fc` as an example: + + .. code-block:: python + + fc = paddle.layer.fc(input=input) + drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5) + +* :code:`paddle.layer.dropout` actually uses the :code:`paddle.layer.add_to` layer and sets :code:`drop_rate` as the previous method. This method is very memory intensive. + +* PaddlePaddle implements dropout in the activation function rather than in the layer. + +* :code:`paddle.layer.lstmemory`, :code:`paddle.layer.grumemory`, :code:`paddle.layer.recurrent` implement activation of output in an unusual way, so we cannot use dropout by setting :code:`drop_rate` . To use dropout for these layers, we could use the second method, which is to use :code:`paddle.layer.dropout`. + +4. The differences between different recurrent layers +-------------------------------------------------------- +Take LSTM as an example. There are several kinds of recurrent layers in PaddlePaddle: + +* :code:`paddle.layer.lstmemory` +* :code:`paddle.networks.simple_lstm` +* :code:`paddle.networks.lstmemory_group` +* :code:`paddle.networks.bidirectional_lstm` + +According to implementations, recurrent layer can be classified into 2 types: + +1. Recurrent layer implemented by recurrent_group: + + * Using this type of recurrent layers, users can access the intermediate value calculated by the recurrent unit within a timestep (eg: hidden states, memory cells, etc.) + * :code:`paddle.networks.lstmemory_group` belongs to this type of recurrent layers. + +2. Recurrent layer implemented as a complete operation: + + * Users can only access output values when using this type of recurrent layers. 
+ * :code:`paddle.networks.lstmemory_group` , :code:`paddle.networks.simple_lstm` and :code:`paddle.networks.bidirectional_lstm` belong to this type of recurrent layer; + +By implementing recurrent layer as a complete operation, CPU and GPU calculations can be optimized. Therefore, the second type of recurrent layer is more efficient than the first one. In practical applications, we propose to use the second type of recurrent layers if there is no need to access the intermediate variable of LSTM. + +In addition, PaddlePaddle also contains a kind of LSTM calculation unit: :code:`paddle.networks.lstmemory_unit`: + + * Unlike the recurrent layer described above, :code:`paddle.networks.lstmemory_unit` defines the computational process of an LSTM unit in a timestep. It is not a complete recurrent layer, nor can it receive sequence data as input. + * :code:`paddle.networks.lstmemory_unit` can only be used as a step function in recurrent_group. + +5. Can Softmax's calculation dimension be specified? +-------------------------------------------------------------------- + +We can't specify calculation dimension for PaddlePaddle's softmax. It can only be calculated by rows. +In image tasks, for NCHW, if you need to calculate softmax in C dimension, you could use :code:`paddle.layer.switch_order` to change the dimension order, that is, convert NCHW to NHWC, then do the reshape operation and calculate softmax. + +6. Does PaddlePaddle support variable-dimensional data inputs +---------------------------------------------------------------- + +PaddlePaddle provides :code:`paddle.data_type.dense_array` to support variable-dimensional data input. Simply set the dimension of the data layer to a value larger than the dimension of the input data for occupancy. diff --git a/doc/v2/faq/parameter/index_cn.rst b/doc/v2/faq/parameter/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..987e8cf088be4ee8daa7c28fdc855506cbfd31c7 --- /dev/null +++ b/doc/v2/faq/parameter/index_cn.rst @@ -0,0 +1,201 @@ +######### +参数设置 +######### + +.. contents:: + +1. 如何选择SGD算法的学习率 +-------------------------- + +在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。 + +通常做法是从一个比较大的learning_rate开始试,如果不收敛,那减少学习率10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以估计出如果模型采用不变的输出最小的cost0是多少。 + +如果训练过程的的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。 + +2. 如何设置学习率退火(learning rate annealing) +------------------------------------------------ + +在相应的优化算法里设置learning_rate_schedule及相关参数,以使用Adam算法为例,代码如下: + +.. 
code-block:: python + + optimizer = paddle.optimizer.Adam( + learning_rate=1e-3, + learning_rate_decay_a=0.5, + learning_rate_decay_b=0.75, + learning_rate_schedule="poly",) + +PaddlePaddle目前支持8种learning_rate_schedule,这8种learning_rate_schedule及其对应学习率计算方式如下: + +* "constant" + + lr = learning_rate + +* "poly" + + lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b) + + 其中,num_samples_processed为已训练样本数,下同。 + +* "caffe_poly" + + lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b) + +* "exp" + + lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b) + +* "discexp" + + lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b)) + +* "linear" + + lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b) + +* "manual" + + 这是一种按已训练样本数分段取值的学习率退火方法。使用该learning_rate_schedule时,用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例,代码如下: + + .. code-block:: python + + optimizer = paddle.optimizer.Adam( + learning_rate=1e-3, + learning_rate_schedule="manual", + learning_rate_args="1000:1.0,2000:0.9,3000:0.8",) + + 在该示例中,当已训练样本数小于等于1000时,学习率为 :code:`1e-3 * 1.0`;当已训练样本数大于1000小于等于2000时,学习率为 :code:`1e-3 * 0.9`;当已训练样本数大于2000时,学习率为 :code:`1e-3 * 0.8`。 + +* "pass_manual" + + 这是一种按已训练pass数分段取值的学习率退火方法。使用该learning_rate_schedule时,用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例,代码如下: + + .. code-block:: python + + optimizer = paddle.optimizer.Adam( + learning_rate=1e-3, + learning_rate_schedule="pass_manual", + learning_rate_args="1:1.0,2:0.9,3:0.8",) + + 在该示例中,当已训练pass数小于等于1时,学习率为 :code:`1e-3 * 1.0`;当已训练pass数大于1小于等于2时,学习率为 :code:`1e-3 * 0.9`;当已训练pass数大于2时,学习率为 :code:`1e-3 * 0.8`。 + +3. 如何初始化参数 +----------------- + +默认情况下,PaddlePaddle使用均值0,标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\: + +* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)` +* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)` + +比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。 + +.. code-block:: python + + hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), + bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0)) + +上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[1.0, -1.0]` 的均匀分布。 + +4. 如何共享参数 +--------------- + +PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。 + +简单的全连接网络,参数共享的配置示例为\: + +.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py + +这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。 + +5. 如何加载预训练参数 +------------------------ + +* 对加载预训练参数的层,设置其参数属性 :code:`is_static=True`,使该层的参数在训练过程中保持不变。以embedding层为例,代码如下: + +.. code-block:: python + + emb_para = paddle.attr.Param(name='emb', is_static=True) + paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para) + + +* 从模型文件将预训练参数载入 :code:`numpy.array`,在创建parameters后,使用 :code:`parameters.set()` 加载预训练参数。PaddlePaddle保存的模型参数文件前16字节为头信息,用户将参数载入 :code:`numpy.array` 时须从第17字节开始。以embedding层为例,代码如下: + +.. 
code-block:: python + + def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header. + return np.fromfile(f, dtype=np.float32).reshape(h, w) + + parameters = paddle.parameters.create(my_cost) + parameters.set('emb', load_parameter(emb_param_file, 30000, 256)) + +6. 存储的参数格式是什么,如何和明文进行相互转化 +-------------------------------------------------- + +PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数两部分组成。头信息中,1~4字节表示PaddlePaddle版本信息,请直接填充0;5~8字节表示每个参数占用的字节数,当保存的网络参数为float类型时为4,double类型时为8;9~16字节表示保存的参数总个数。 + +将PaddlePaddle保存的模型参数还原回明文时,可以使用相应数据类型的 :code:`numpy.array` 加载具体网络参数,此时可以跳过PaddlePaddle模型参数文件的头信息。若在PaddlePaddle编译时,未指定按照double精度编译,默认情况下按照float精度计算,保存的参数也是float类型。这时在使用 :code:`numpy.array` 时,一般设置 :code:`dtype=float32` 。示例如下: + +.. code-block:: python + + def read_parameter(fname, width): + s = open(fname).read() + # skip header + vec = np.fromstring(s[16:], dtype=np.float32) + # width is the size of the corresponding layer + np.savetxt(fname + ".csv", vec.reshape(width, -1), + fmt="%.6f", delimiter=",") + + +将明文参数转化为PaddlePaddle可加载的模型参数时,首先构造头信息,再写入网络参数。下面的代码将随机生成的矩阵转化为可以被PaddlePaddle加载的模型参数。 + +.. code-block:: python + + def gen_rand_param(param_file, width, height, need_trans): + np.random.seed() + header = struct.pack("iil", 0, 4, height * width) + param = np.float32(np.random.rand(height, width)) + with open(param_file, "w") as fparam: + fparam.write(header + param.tostring()) + +7. A protocol message was rejected because it was too big +------------------------------------------------------------ + +如果在训练NLP相关模型时,出现以下错误: + +.. code-block:: bash + + [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h. + F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr) + +可能的原因是:传给dataprovider的某一个args过大,一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似: + +.. code-block:: python + + src_dict = dict() + for line_count, line in enumerate(open(src_dict_path, "r")): + src_dict[line.strip()] = line_count + + define_py_data_sources2( + train_list, + test_list, + module="dataprovider", + obj="process", + args={"src_dict": src_dict}) + +解决方案是:将字典的地址作为args传给dataprovider,然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为: + +.. code-block:: python + + define_py_data_sources2( + train_list, + test_list, + module="dataprovider", + obj="process", + args={"src_dict_path": src_dict_path}) + +完整源码可参考 `sequence_recurrent `_ 示例。 + + diff --git a/doc/v2/faq/parameter/index_en.rst b/doc/v2/faq/parameter/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..9edb8dd620f972d019db9c0063cefce616de0ebd --- /dev/null +++ b/doc/v2/faq/parameter/index_en.rst @@ -0,0 +1,198 @@ +################## +Parameter Settings +################## + +.. contents:: + +1. How to Choose the Learning Rate of SGD Algorithm +-------------------------- + +An important issue when training with :code:`sgd/async_sgd` is to choose the correct value for :code:`learning_rate`. If it is too large, the training may not converge. If too small, the convergence may be slow, resulting in a long training time. + +Usually, we start with a relatively large learning rate. If the training does not converge, then we need to reduce the learning rate continuously by a factor of 10 until the training converges. 
+We examine the convergence of the training by estimating the minimum cost that a constant-output model can achieve.
+
+If the cost during training stays significantly higher than this constant-output cost, we can conclude that the training is not converging. For example, in a three-class problem that uses multi-class cross-entropy as the cost, if the proportions of labels 0, 1 and 2 in the data are :code:`0.2, 0.5, 0.3`, the minimum cost a constant output can reach is :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03`. If the cost is still greater than this number after training one pass (or even earlier), the training is probably not converging and the learning rate should be reduced.
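+
+As a sanity check, the baseline value above can be reproduced with a few lines of plain Python. The snippet below is only an illustrative sketch and does not use any PaddlePaddle API; it computes the entropy of the label distribution, which is the lowest multi-class cross-entropy a constant predictor can reach:
+
+.. code-block:: python
+
+    import math
+
+    # Class frequencies of labels 0, 1 and 2 in the training data.
+    class_ratios = [0.2, 0.5, 0.3]
+
+    # A constant predictor can at best output the class frequencies themselves,
+    # so its best possible cross-entropy equals the entropy of the labels.
+    baseline_cost = -sum(p * math.log(p) for p in class_ratios)
+    print("baseline cost: %.2f" % baseline_cost)  # prints 1.03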
+
+2. How to Implement Learning Rate Annealing
+------------------------------------------------
+
+We use the Adam algorithm as an example. Set :code:`learning_rate_schedule` and the related parameters in the corresponding optimization algorithm as follows:
+
+.. code-block:: python
+
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=1e-3,
+        learning_rate_decay_a=0.5,
+        learning_rate_decay_b=0.75,
+        learning_rate_schedule="poly",)
+
+PaddlePaddle currently supports 8 learning rate schedules. The 8 learning rate schedules and their corresponding learning rates are calculated as follows:
+
+* "constant"
+
+  lr = learning_rate
+
+* "poly"
+
+  lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
+
+  Here :code:`num_samples_processed` is the number of samples processed so far; the same applies below.
+
+* "caffe_poly"
+
+  lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
+
+* "exp"
+
+  lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
+
+* "discexp"
+
+  lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
+
+* "linear"
+
+  lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
+
+* "manual"
+
+  This is a learning rate annealing method whose decay factor is a piecewise function of the number of trained samples. When using this schedule, the decay factor is set through the parameter :code:`learning_rate_args`, and the current learning rate is the product of :code:`learning_rate` and the current decay factor. Take the Adam algorithm as an example:
+
+  .. code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="manual",
+          learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
+
+  In this example, when the number of trained samples is less than or equal to 1000, the learning rate is :code:`1e-3 * 1.0`; when it is greater than 1000 and less than or equal to 2000, the learning rate is :code:`1e-3 * 0.9`; when it is greater than 2000, the learning rate is :code:`1e-3 * 0.8`.
+
+* "pass_manual"
+
+  This is a learning rate annealing method whose decay factor is a piecewise function of the number of trained passes. When using this schedule, the decay factor is set through the parameter :code:`learning_rate_args`, and the current learning rate is the product of :code:`learning_rate` and the current decay factor. Take the Adam algorithm as an example:
+
+  .. code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="pass_manual",
+          learning_rate_args="1:1.0,2:0.9,3:0.8",)
+
+  In this example, when the number of trained passes is less than or equal to 1, the learning rate is :code:`1e-3 * 1.0`; when it is greater than 1 and less than or equal to 2, the learning rate is :code:`1e-3 * 0.9`; when it is greater than 2, the learning rate is :code:`1e-3 * 0.8`.
+
+3. How to Initialize Parameters
+-----------------------------------
+
+By default, PaddlePaddle initializes parameters with a mean of 0 and a standard deviation of :math:`\frac{1}{\sqrt{d}}`, where :math:`d` is the width of the parameter matrix. This initialization usually does not produce bad results. If users want to customize the initialization, PaddlePaddle currently provides two ways to initialize parameters:
+
+* Gaussian distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
+* Uniform distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
+
+For example, to set the parameter initialization and the bias initialization of a fully connected layer, you can use the following code:
+
+.. code-block:: python
+
+    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
+                      bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
+
+The above code initializes the bias to 1.0 and initializes the parameters with a uniform distribution over :code:`[-1.0, 1.0]`.
+
+4. How to Share Parameters
+------------------------------
+
+PaddlePaddle uses the parameter :code:`name` as its ID; parameters with the same name are shared. The name of a parameter can be set with :code:`ParamAttr(name="YOUR_PARAM_NAME")`. More conveniently, the parameters to be shared can simply use the same :code:`ParamAttr` object.
+
+A simple fully connected network configured with parameter sharing looks as follows:
+
+.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+
+Here :code:`hidden_a` and :code:`hidden_b` use the same parameter and bias, and the two inputs of the softmax layer also use the same parameter :code:`softmax_param`.
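+
+As an inline illustration of the same idea, the following minimal sketch (with :code:`ipt` standing for an input layer as in the example above, and a hypothetical layer size of 128) makes two fully connected layers share one weight matrix by passing them the same :code:`ParamAttr` object:
+
+.. code-block:: python
+
+    # Both layers refer to the parameter named "shared_w", so a single
+    # weight matrix is created and trained for both of them.
+    shared_w = ParamAttr(name="shared_w")
+
+    hidden_a = fc_layer(input=ipt, size=128, param_attr=shared_w)
+    hidden_b = fc_layer(input=ipt, size=128, param_attr=shared_w)
+
+Note that layers sharing a parameter must require the same parameter shape, which is why both layers above use the same input and the same size.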
+
+5. How to Load Pre-trained Parameters
+----------------------------------------
+
+* For a layer that loads pre-trained parameters, set its parameter attribute :code:`is_static=True` so that the parameters of that layer stay unchanged during training. Take the embedding layer as an example, the code is as follows:
+
+.. code-block:: python
+
+    emb_para = paddle.attr.Param(name='emb', is_static=True)
+    paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para)
+
+* Load the pre-trained parameters from the model file into a :code:`numpy.array`, and after creating the parameters, load them with :code:`parameters.set()`. The first 16 bytes of a model parameter file saved by PaddlePaddle are header information, so the values must be read into the :code:`numpy.array` starting from the 17th byte. Take the embedding layer as an example, the code is as follows:
+
+.. code-block:: python
+
+    def load_parameter(file_name, h, w):
+        with open(file_name, 'rb') as f:
+            f.read(16)  # skip header.
+            return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+    parameters = paddle.parameters.create(my_cost)
+    parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
+
+6. Format of the Stored Parameter and How to Convert the File to Plain Text
+------------------------------------------------------------------------------
+
+The model parameter file saved by PaddlePaddle consists of 16 bytes of header information followed by the network parameters. In the header, bytes 1-4 hold PaddlePaddle's version information and should simply be filled with 0; bytes 5-8 hold the number of bytes occupied by each parameter value, which is 4 when the saved parameters are of float type and 8 when they are of double type; bytes 9-16 hold the total number of saved parameter values.
+
+To restore the model parameters saved by PaddlePaddle back to plain text, load the network parameters into a :code:`numpy.array` of the corresponding data type, skipping the header of the parameter file. If PaddlePaddle was not explicitly compiled with double precision, it computes in float precision by default and the saved parameters are also of float type, so :code:`dtype=float32` is normally used with :code:`numpy.array`. An example is as follows:
+
+.. code-block:: python
+
+    def read_parameter(fname, width):
+        s = open(fname).read()
+        # skip header
+        vec = np.fromstring(s[16:], dtype=np.float32)
+        # width is the size of the corresponding layer
+        np.savetxt(fname + ".csv", vec.reshape(width, -1),
+                   fmt="%.6f", delimiter=",")
+
+To convert plain-text parameters into model parameters loadable by PaddlePaddle, first construct the header information and then write the network parameters. The following code converts a randomly generated matrix into model parameters that can be loaded by PaddlePaddle:
+
+.. code-block:: python
+
+    def gen_rand_param(param_file, width, height, need_trans):
+        np.random.seed()
+        header = struct.pack("iil", 0, 4, height * width)
+        param = np.float32(np.random.rand(height, width))
+        with open(param_file, "w") as fparam:
+            fparam.write(header + param.tostring())
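+
+To make the header layout above concrete, the following sketch (an illustrative example, not part of the PaddlePaddle API) unpacks the 16-byte header before reading the values. The :code:`"iil"` format string matches the one used in :code:`gen_rand_param` above and assumes a 64-bit Linux platform where a C :code:`long` is 8 bytes:
+
+.. code-block:: python
+
+    import struct
+    import numpy as np
+
+    def inspect_parameter(fname):
+        with open(fname, 'rb') as f:
+            # 4-byte version, 4-byte size of each value, 8-byte number of values.
+            version, value_size, value_num = struct.unpack("iil", f.read(16))
+            print("version=%d, value_size=%d, value_num=%d"
+                  % (version, value_size, value_num))
+            # The remaining bytes are the parameter values themselves.
+            return np.fromfile(f, dtype=np.float32)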
+
+7. A Protocol Message Rejected Because of Its Large Size
+------------------------------------------------------------
+
+If you are training NLP-related models and the following error occurs:
+
+.. code-block:: bash
+
+    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
+    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
+
+the possible reason is that one of the args passed to the dataprovider is too large, which is usually caused by passing a large dictionary directly. A wrong `define_py_data_sources2` call looks like this:
+
+.. code-block:: python
+
+    src_dict = dict()
+    for line_count, line in enumerate(open(src_dict_path, "r")):
+        src_dict[line.strip()] = line_count
+
+    define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict": src_dict})
+
+The solution is to pass the path of the dictionary as args to the dataprovider, and then load the dictionary from that path inside the dataprovider. That is, change `define_py_data_sources2` to:
+
+.. code-block:: python
+
+    define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict_path": src_dict_path})
+
+The full source code can be found in the `sequence_recurrent `_ example.
diff --git a/doc/v2/getstarted/concepts/src/infer.py b/doc/v2/getstarted/concepts/src/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..afe256f234a1c7d29c33f3b65b8302646df0c45c
--- /dev/null
+++ b/doc/v2/getstarted/concepts/src/infer.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+import numpy as np
+
+paddle.init(use_gpu=False)
+x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
+y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+# load the model generated by training
+with open('params_pass_90.tar', 'r') as f:
+    parameters = paddle.parameters.Parameters.from_tar(f)
+
+# Input multiple sets of data; the inference results are returned in an array.
+i = [[[1, 2]], [[3, 4]], [[5, 6]]]
+print paddle.infer(output_layer=y_predict, parameters=parameters, input=i)
+# Will print:
+# [[ -3.24491572]
+#  [ -6.94668722]
+#  [-10.64845848]]
diff --git a/doc/v2/getstarted/concepts/src/train.py b/doc/v2/getstarted/concepts/src/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..a85d5d8a3acee61d11488e5b842831a79072680a
--- /dev/null
+++ b/doc/v2/getstarted/concepts/src/train.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import paddle.v2 as paddle +import numpy as np + +# init paddle +paddle.init(use_gpu=False) + +# network config +x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2)) +y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) +y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) +cost = paddle.layer.square_error_cost(input=y_predict, label=y) + +# create parameters +parameters = paddle.parameters.create(cost) +# create optimizer +optimizer = paddle.optimizer.Momentum(momentum=0) +# create trainer +trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + + +# event_handler to print training info +def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 1 == 0: + print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id, + event.cost) + # product model every 10 pass + if isinstance(event, paddle.event.EndPass): + if event.pass_id % 10 == 0: + with open('params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + + +# define training dataset reader +def train_reader(): + train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]]) + train_y = np.array([[-2], [-3], [-7], [-7]]) + + def reader(): + for i in xrange(train_y.shape[0]): + yield train_x[i], train_y[i] + + return reader + + +# define feeding map +feeding = {'x': 0, 'y': 1} + +# training +trainer.train( + reader=paddle.batch( + train_reader(), batch_size=1), + feeding=feeding, + event_handler=event_handler, + num_passes=100) diff --git a/doc/v2/getstarted/concepts/use_concepts_cn.rst b/doc/v2/getstarted/concepts/use_concepts_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..608f49f5a969b3291eb43bf2acf582af74e566a1 --- /dev/null +++ b/doc/v2/getstarted/concepts/use_concepts_cn.rst @@ -0,0 +1,155 @@ +############ +基本使用概念 +############ + +PaddlePaddle是源于百度的一个深度学习平台。PaddlePaddle为深度学习研究人员提供了丰富的API,可以轻松地完成神经网络配置,模型训练等任务。 +这里将介绍PaddlePaddle的基本使用概念,并且展示了如何利用PaddlePaddle来解决一个经典的线性回归问题。 +在使用该文档之前,请参考 `安装文档 <../../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。 + + +配置网络 +============ + +加载PaddlePaddle +---------------------- + +在进行网络配置之前,首先需要加载相应的Python库,并进行初始化操作。 + +.. code-block:: bash + + import paddle.v2 as paddle + import numpy as np + paddle.init(use_gpu=False) + + +搭建神经网络 +----------------------- + +搭建神经网络就像使用积木搭建宝塔一样。在PaddlePaddle中,layer是我们的积木,而神经网络是我们要搭建的宝塔。我们使用不同的layer进行组合,来搭建神经网络。 +宝塔的底端需要坚实的基座来支撑,同样,神经网络也需要一些特定的layer作为输入接口,来完成网络的训练。 + +例如,我们可以定义如下layer来描述神经网络的输入: + +.. code-block:: bash + + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2)) + y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) + +其中x表示输入数据是一个维度为2的稠密向量,y表示输入数据是一个维度为1的稠密向量。 + +PaddlePaddle支持不同类型的输入数据,主要包括四种类型,和三种序列模式。 + +四种数据类型: + +* dense_vector:稠密的浮点数向量。 +* sparse_binary_vector:稀疏的01向量,即大部分值为0,但有值的地方必须为1。 +* sparse_float_vector:稀疏的向量,即大部分值为0,但有值的部分可以是任何浮点数。 +* integer:整数标签。 + +三种序列模式: + +* SequenceType.NO_SEQUENCE:不是一条序列 +* SequenceType.SEQUENCE:是一条时间序列 +* SequenceType.SUB_SEQUENCE: 是一条时间序列,且序列的每一个元素还是一个时间序列。 + +不同的数据类型和序列模式返回的格式不同,列表如下: + ++----------------------+---------------------+-----------------------------------+------------------------------------------------+ +| | NO_SEQUENCE | SEQUENCE | SUB_SEQUENCE | ++======================+=====================+===================================+================================================+ +| dense_vector | [f, f, ...] | [[f, ...], [f, ...], ...] 
| [[[f, ...], ...], [[f, ...], ...],...] | ++----------------------+---------------------+-----------------------------------+------------------------------------------------+ +| sparse_binary_vector | [i, i, ...] | [[i, ...], [i, ...], ...] | [[[i, ...], ...], [[i, ...], ...],...] | ++----------------------+---------------------+-----------------------------------+------------------------------------------------+ +| sparse_float_vector | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] | ++----------------------+---------------------+-----------------------------------+------------------------------------------------+ +| integer_value | i | [i, i, ...] | [[i, ...], [i, ...], ...] | ++----------------------+---------------------+-----------------------------------+------------------------------------------------+ + +其中,f代表一个浮点数,i代表一个整数。 + +注意:对sparse_binary_vector和sparse_float_vector,PaddlePaddle存的是有值位置的索引。例如, + +- 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ,类型是sparse_binary_vector,返回的是 ``[1, 2]`` 。 +- 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ,类型是sparse_float_vector,返回的是 ``[(1, 0.5), (2, 0.7)]`` 。 + + +在定义输入layer之后,我们可以使用其他layer进行组合。在组合时,需要指定layer的输入来源。 + +例如,我们可以定义如下的layer组合: + +.. code-block:: bash + + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + cost = paddle.layer.square_error_cost(input=y_predict, label=y) + +其中,x与y为之前描述的输入层;而y_predict是接收x作为输入,接上一个全连接层;cost接收y_predict与y作为输入,接上平方误差层。 + +最后一层cost中记录了神经网络的所有拓扑结构,通过组合不同的layer,我们即可完成神经网络的搭建。 + + +训练模型 +============ + +在完成神经网络的搭建之后,我们首先需要根据神经网络结构来创建所需要优化的parameters,并创建optimizer。 +之后,我们可以创建trainer来对网络进行训练。 + +.. code-block:: bash + + parameters = paddle.parameters.create(cost) + optimizer = paddle.optimizer.Momentum(momentum=0) + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + +其中,trainer接收三个参数,包括神经网络拓扑结构、神经网络参数以及迭代方程。 + +在搭建神经网络的过程中,我们仅仅对神经网络的输入进行了描述。而trainer需要读取训练数据进行训练,PaddlePaddle中通过reader来加载数据。 + +.. code-block:: bash + + # define training dataset reader + def train_reader(): + train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]]) + train_y = np.array([[-2], [-3], [-7], [-7]]) + def reader(): + for i in xrange(train_y.shape[0]): + yield train_x[i], train_y[i] + return reader + +最终我们可以调用trainer的train方法启动训练: + +.. code-block:: bash + + # define feeding map + feeding = {'x': 0, 'y': 1} + + # event_handler to print training info + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 1 == 0: + print "Pass %d, Batch %d, Cost %f" % ( + event.pass_id, event.batch_id, event.cost) + # training + trainer.train( + reader=paddle.batch(train_reader(), batch_size=1), + feeding=feeding, + event_handler=event_handler, + num_passes=100) + +关于PaddlePaddle的更多使用方法请参考 `进阶指南 <../../howto/index_cn.html>`_。 + +线性回归完整示例 +============== + +下面给出在三维空间中使用线性回归拟合一条直线的例子: + +.. literalinclude:: src/train.py + :linenos: + +使用以上训练好的模型进行预测,取其中一个模型params_pass_90.tar,输入需要预测的向量组,然后打印输出: + +.. 
literalinclude:: src/infer.py + :linenos: + +有关线性回归的实际应用,可以参考PaddlePaddle book的 `第一章节 `_。 diff --git a/doc/v2/getstarted/concepts/use_concepts_en.rst b/doc/v2/getstarted/concepts/use_concepts_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..406b0cbb913894dc333d8e4561c207793c33e475 --- /dev/null +++ b/doc/v2/getstarted/concepts/use_concepts_en.rst @@ -0,0 +1,3 @@ +Basic Concept +============= +TBD diff --git a/doc/v2/getstarted/index_cn.rst b/doc/v2/getstarted/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..75af7354be93a6eeabfa9ccf86903505402a7ca6 --- /dev/null +++ b/doc/v2/getstarted/index_cn.rst @@ -0,0 +1,19 @@ +新手入门 +============ + + +如果需要快速了解PaddlePaddle的使用,可以参考以下指南。 + +.. toctree:: + :maxdepth: 1 + + quickstart_cn.rst + + +在使用PaddlePaddle构建应用时,需要了解一些基本概念。 +这里以一个线性回归为例子,详细介绍了PaddlePaddle的使用流程,包括数据格式,模型配置与训练等。 + +.. toctree:: + :maxdepth: 1 + + concepts/use_concepts_cn.rst diff --git a/doc/v2/getstarted/index_en.rst b/doc/v2/getstarted/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..94b306895c9ddf6140cf600131930a6675a583eb --- /dev/null +++ b/doc/v2/getstarted/index_en.rst @@ -0,0 +1,19 @@ +GET STARTED +============ + +If you want to quickly know how to use PaddlePaddle, please refer to the following guide: + +.. toctree:: + :maxdepth: 1 + + quickstart_en.rst + + +While using PaddlePaddle to build applications, please understand some basic concepts. + +Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc. + +.. toctree:: + :maxdepth: 1 + + concepts/use_concepts_en.rst diff --git a/doc/v2/getstarted/quickstart_cn.rst b/doc/v2/getstarted/quickstart_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..d511cead262dabafd095f68adb5ffc596a7fe596 --- /dev/null +++ b/doc/v2/getstarted/quickstart_cn.rst @@ -0,0 +1,47 @@ +快速开始 +======== + +快速安装 +-------- + +PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 +执行下面的命令完成快速安装,版本为cpu_avx_openblas: + + .. code-block:: bash + + pip install paddlepaddle + +如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +更详细的安装和编译方法参考::ref:`install_steps` 。 + +快速使用 +-------- + +创建一个 housing.py 并粘贴此Python代码: + + .. code-block:: python + + import paddle.v2 as paddle + + # Initialize PaddlePaddle. + paddle.init(use_gpu=False, trainer_count=1) + + # Configure the neural network. + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + + # Infer using provided test data. + probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + + for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) + +执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。 diff --git a/doc/v2/getstarted/quickstart_en.rst b/doc/v2/getstarted/quickstart_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..70f7fe0646068aa79cd72955c6848ac0250c2300 --- /dev/null +++ b/doc/v2/getstarted/quickstart_en.rst @@ -0,0 +1,51 @@ +Quick Start +============ + +Quick Install +------------- + +You can use pip to install PaddlePaddle with a single command, supports +CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed. 
+Simply run the following command to install, the version is cpu_avx_openblas: + + .. code-block:: bash + + pip install paddlepaddle + +If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +For more details about installation and build: :ref:`install_steps` . + +Quick Use +--------- + +Create a new file called housing.py, and paste this Python +code: + + + .. code-block:: python + + import paddle.v2 as paddle + + # Initialize PaddlePaddle. + paddle.init(use_gpu=False, trainer_count=1) + + # Configure the neural network. + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + + # Infer using provided test data. + probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + + for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) + +Run :code:`python housing.py` and voila! It should print out a list of predictions +for the test housing data. diff --git a/doc/v2/howto/capi/compile_paddle_lib_cn.md b/doc/v2/howto/capi/compile_paddle_lib_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..8878ee9d85064ba27708ed92790aa9b83ba316e5 --- /dev/null +++ b/doc/v2/howto/capi/compile_paddle_lib_cn.md @@ -0,0 +1,181 @@ +## 安装、编译与链接C-API预测库 + +### 直接下载安装 + +从CI系统中下载最新的C-API开发包进行安装,用户可以从下面的表格中找到需要的版本: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
版本说明C-API
cpu_avx_mklpaddle.tgz
cpu_avx_openblaspaddle.tgz
cpu_noavx_openblaspaddle.tgz
cuda7.5_cudnn5_avx_mklpaddle.tgz
cuda8.0_cudnn5_avx_mklpaddle.tgz
cuda8.0_cudnn7_avx_mklpaddle.tgz
cuda9.0_cudnn7_avx_mklpaddle.tgz
+ +### 从源码编译 + +用户也可以从 PaddlePaddle 核心代码编译C-API链接库,只需在编译时配制下面这些编译选项: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
选项
WITH_C_APION
WITH_PYTHONOFF(推荐)
WITH_SWIG_PYOFF(推荐)
WITH_GOLANGOFF(推荐)
WITH_GPUON/OFF
WITH_MKLON/OFF
+ +建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 + +下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): + +```shell +PADDLE_ROOT=/path/of/capi +git clone https://github.com/PaddlePaddle/Paddle.git +cd Paddle +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_GOLANG=OFF \ + -DWITH_PYTHON=OFF \ + -DWITH_MKL=OFF \ + -DWITH_GPU=OFF \ + .. +``` + +执行上述代码生成Makefile文件后,执行:`make && make install`。成功编译后,使用C-API所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件)均会存放于`PADDLE_ROOT`目录中。 + +编译成功后在 `PADDLE_ROOT` 下会看到如下目录结构(包括了编译出的PaddlePaddle头文件和链接库,以及第三方依赖链接库和头文件(如果需要,由链接方式决定)): + +```text +├── include +│   └── paddle +│   ├── arguments.h +│   ├── capi.h +│   ├── capi_private.h +│   ├── config.h +│   ├── error.h +│   ├── gradient_machine.h +│   ├── main.h +│   ├── matrix.h +│   ├── paddle_capi.map +│   └── vector.h +├── lib +│   ├── libpaddle_capi_engine.a +│   ├── libpaddle_capi_layers.a +│   ├── libpaddle_capi_shared.so +│   └── libpaddle_capi_whole.a +└── third_party + ├── gflags + │   ├── include + │   │   └── gflags + │   │   ├── gflags_completions.h + │   │   ├── gflags_declare.h + │   │   ... + │   └── lib + │   └── libgflags.a + ├── glog + │   ├── include + │   │   └── glog + │   │   ├── config.h + │   │   ... + │   └── lib + │   └── libglog.a + ├── openblas + │   ├── include + │   │   ├── cblas.h + │   │   ... + │   └── lib + │   ... + ├── protobuf + │   ├── include + │   │   └── google + │   │   └── protobuf + │   │   ... + │   └── lib + │   └── libprotobuf-lite.a + └── zlib + ├── include + │   ... + └── lib + ... + +``` + +### 链接说明 + +目前提供三种链接方式: + +1. 链接`libpaddle_capi_shared.so` 动态库(这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**),需注意: + 1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。 + 1. 如果是用编译时指定CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。 + 1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。 + +2. 链接静态库 `libpaddle_capi_whole.a`,需注意: + 1. 需要指定`-Wl,--whole-archive`链接选项。 + 1. 需要显式地链接 `gflags`、`glog`、`libz`、`protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。 + 1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显示地链接`libopenblas.a`。 + 1. 如果在编译 C-API 是使用MKL数学库,需要显示地链接MKL的动态库。 + +3. 链接静态库 `libpaddle_capi_layers.a`和`libpaddle_capi_engine.a`,需注意: + 1. 这种链接方式主要用于移动端预测。 + 1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。 + 1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。 + 1. 第三方依赖库需要按照与方式2同样方法显示地进行链接。 diff --git a/doc/v2/howto/capi/compile_paddle_lib_en.md b/doc/v2/howto/capi/compile_paddle_lib_en.md new file mode 100644 index 0000000000000000000000000000000000000000..70a6edef27e75af6b38d7d4824c928eba0d29b9a --- /dev/null +++ b/doc/v2/howto/capi/compile_paddle_lib_en.md @@ -0,0 +1,180 @@ +## Install and Build + +### Download & Install + + Download the latest C-API development package from CI system and install. You can find the required version in the table below: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Version TipsC-API
cpu_avx_mklpaddle.tgz
cpu_avx_openblaspaddle.tgz
cpu_noavx_openblaspaddle.tgz
cuda7.5_cudnn5_avx_mklpaddle.tgz
cuda8.0_cudnn5_avx_mklpaddle.tgz
cuda8.0_cudnn7_avx_mklpaddle.tgz
cuda9.0_cudnn7_avx_mklpaddle.tgz
+ +### From source + + Users can also compile the C-API library from PaddlePaddle source code by compiling with the following compilation options: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
OptionsValue
WITH_C_APION
WITH_PYTHONOFF(recommended)
WITH_SWIG_PYOFF(recommended)
WITH_GOLANGOFF(recommended)
WITH_GPUON/OFF
WITH_MKLON/OFF
+ +It is best to set up with recommended values to avoid linking with unnecessary libraries. Set other compilation options as you need. + +Pull the latest following code snippet from github, and configure compilation options(replace PADDLE_ROOT with the installation path of the PaddlePaddle C-API inference library): + +```shell +PADDLE_ROOT=/path/of/capi +git clone https://github.com/PaddlePaddle/Paddle.git +cd Paddle +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_GOLANG=OFF \ + -DWITH_PYTHON=OFF \ + -DWITH_MKL=OFF \ + -DWITH_GPU=OFF \ + .. +``` + +After running the above code to generate Makefile , run: `make && make install`. After successful compilation, the dependencies required by C-API(includes: (1)PaddlePaddle inference library and header files; (2) Third-party libraries and header files) will be stored in the `PADDLE_ROOT` directory. + +If the compilation is successful, see the following directory structure under `PADDLE_ROOT`(includes PaddlePaddle header files and libraries, and third-party libraries and header files(determined by the link methods if necessary)): + +```text +├── include +│   └── paddle +│   ├── arguments.h +│   ├── capi.h +│   ├── capi_private.h +│   ├── config.h +│   ├── error.h +│   ├── gradient_machine.h +│   ├── main.h +│   ├── matrix.h +│   ├── paddle_capi.map +│   └── vector.h +├── lib +│   ├── libpaddle_capi_engine.a +│   ├── libpaddle_capi_layers.a +│   ├── libpaddle_capi_shared.so +│   └── libpaddle_capi_whole.a +└── third_party + ├── gflags + │   ├── include + │   │   └── gflags + │   │   ├── gflags_completions.h + │   │   ├── gflags_declare.h + │   │   ... + │   └── lib + │   └── libgflags.a + ├── glog + │   ├── include + │   │   └── glog + │   │   ├── config.h + │   │   ... + │   └── lib + │   └── libglog.a + ├── openblas + │   ├── include + │   │   ├── cblas.h + │   │   ... + │   └── lib + │   ... + ├── protobuf + │   ├── include + │   │   └── google + │   │   └── protobuf + │   │   ... + │   └── lib + │   └── libprotobuf-lite.a + └── zlib + ├── include + │   ... + └── lib + ... + +``` + +### Linking Description: + +There are three kinds of linking methods: + +1. Linking with dynamic library `libpaddle_capi_shared.so`(This way is much more convenient and easier, **Without special requirements, it is recommended**), refer to the following: + 1. Compiling with CPU version and using `OpenBLAS`; only need to link one library named `libpaddle_capi_shared.so` to develop prediction program through C-API. + 1. Compiling with CPU version and using `MKL` lib, you need to link MKL library directly to develop prediction program through PaddlePaddle C-API, due to `MKL` has its own dynamic library. + 1. Compiling with GPU version, CUDA library will be loaded dynamically on prediction program run-time, and also set CUDA library to  `LD_LIBRARY_PATH` environment variable. + +2. Linking with static library `libpaddle_capi_whole.a`,refer to the following: + 1. Specify `-Wl,--whole-archive` linking options. + 1. Explicitly link third-party libraries such as `gflags`、`glog`、`libz`、`protobuf` .etc, you can find them under `PADDLE_ROOT/third_party` directory. + 1. Use OpenBLAS library if compiling C-API,must explicitly link `libopenblas.a`. + 1. Use MKL when compiling C-API, must explicitly link MKL dynamic library. + +3. Linking with static library `libpaddle_capi_layers.a` and `libpaddle_capi_engine.a`,refer to the following: + 1. 
This linking methods is mainly used for mobile prediction. + 1. Split `libpaddle_capi_whole.a` into two static linking library at least to reduce the size of linking libraries. + 1. Specify `-Wl,--whole-archive -lpaddle_capi_layers`  and `-Wl,--no-whole-archive -lpaddle_capi_engine` for linking. + 1. The third-party dependencies need explicitly link same as method 2 above. diff --git a/doc/v2/howto/capi/images/csr.png b/doc/v2/howto/capi/images/csr.png new file mode 100644 index 0000000000000000000000000000000000000000..3dc10b8de4f6d3f517624956b1694b689405a031 Binary files /dev/null and b/doc/v2/howto/capi/images/csr.png differ diff --git a/doc/v2/howto/capi/images/sequence_data.png b/doc/v2/howto/capi/images/sequence_data.png new file mode 100644 index 0000000000000000000000000000000000000000..6e47a46b8955dfe977e85898fe3c9f33ed28de7e Binary files /dev/null and b/doc/v2/howto/capi/images/sequence_data.png differ diff --git a/doc/v2/howto/capi/images/workflow_of_CAPI.png b/doc/v2/howto/capi/images/workflow_of_CAPI.png new file mode 100644 index 0000000000000000000000000000000000000000..a4399ade048b3fe10d2d9c714bc34333ca068edb Binary files /dev/null and b/doc/v2/howto/capi/images/workflow_of_CAPI.png differ diff --git a/doc/v2/howto/capi/index_cn.rst b/doc/v2/howto/capi/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..7f100717983f5e950b801e6b05ee48bfff273c62 --- /dev/null +++ b/doc/v2/howto/capi/index_cn.rst @@ -0,0 +1,26 @@ +C-API预测库 +================== + +当我们训练完一个神经网络模型之后,下一步就是用模型来做预测。预测就是准备输入数据,经过模型处理之后,得到预测结果的过程。 + +相比于模型训练,预测有如下特点: + +#. 预测不需要训练过程中反向传播和参数更新的部分。 +#. 预测不需要标签(label)。 +#. 预测很多时候需要和用户系统整合在一起。 + +因为上述特点,模型预测SDK需要单独设计,并具备以下特点: + +#. 预测SDK不包含反向传播和参数更新部分,以减小SDK的体积。 +#. 预测SDK需要提供一个简洁的用户接口,方便使用。 +#. 因为输入数据可能有多种结构,对输入数据的格式做清晰简洁的封装。 +#. 为了和用户系统兼容,SDK的接口需要是满足C标准的接口。 + +PaddlePaddle提供了C-API,用于解决上述问题。关于C-API的使用,我们提供了如下指南: + +.. toctree:: + :maxdepth: 1 + + compile_paddle_lib_cn.md + organization_of_the_inputs_cn.md + workflow_of_capi_cn.md diff --git a/doc/v2/howto/capi/index_en.rst b/doc/v2/howto/capi/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..4ec39c9d5223442cf6872edaf7befeb5053b538e --- /dev/null +++ b/doc/v2/howto/capi/index_en.rst @@ -0,0 +1,26 @@ +C-API Inference Library +======================== + +After we train a neural network, we use it to do inference. Inference is the process of preparing input data and propagating it through the model to produce the result. + +Compared with model training, prediction has the following features: + +#. Inference does not require backpropagation and parameter updates, as required during training. +#. Labels are not needed in prediction. +#. Most of the time, predictions need to be integrated with the user system. + +Therefore, the model prediction SDK needs to be designed separately and has the following features: + +#. The predictive SDK does not include backpropagation and parameter updates to reduce the size of the SDK. +#. The predictive SDK needs a simple user interface for ease of use. +#. Since the input data may have a variety of structures, the format of the input data is clearly and compactly packaged. +#. In order to be compatible with user's system, the SDK's interface must conform to the C-standard interface. + +PaddlePaddle provides C-API to solve the above problem. Following are the guidelines to use the C-API: + +.. 
toctree:: + :maxdepth: 1 + + compile_paddle_lib_en.md + organization_of_the_inputs_en.md + workflow_of_capi_en.md diff --git a/doc/v2/howto/capi/organization_of_the_inputs_cn.md b/doc/v2/howto/capi/organization_of_the_inputs_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..343526c213110cb9c6abaf9a12b3d634ad3fabe9 --- /dev/null +++ b/doc/v2/howto/capi/organization_of_the_inputs_cn.md @@ -0,0 +1,289 @@ +## 输入/输出数据组织 + +这篇文档介绍在使用 PaddlePaddle C-API 时如何组织输入数据,以及如何解析神经网络前向计算的输出结果。 + +### 输入/输出数据类型 +在C-API中,按照基本数据类型在PaddlePaddle内部的定义和实现,输入数据可分为: + +1. 一维整型数组 +1. 二维浮点型矩阵 + + - 稠密矩阵 + - 稀疏矩阵 + +说明: + +1. 一维数组**仅支持整型值**; + - 常用于自然语言处理任务,例如:表示词语在词典中的序号; + - 分类任务中类别标签; +1. 逻辑上高于二维的数据(例如含有多个通道的图片,视频等)在程序实现中都会转化为二维矩阵,转化方法在相应的领域都有通用解决方案,需要使用者自己了解并完成转化; +1. 二维矩阵可以表示行向量和列向量,任何时候如果需要浮点型数组(向量),都应使用C-API中的矩阵来表示,而不是C-API中的一维数组。 +1. 不论是一维整型数组还是二维浮点数矩阵,**为它们附加上序列信息将变成序列输入。PaddlePaddle 会通过判数据是否附带有序列信息来判断一个向量/矩阵是否是一个序列**。当非序列输入时,无需关心和处理序列信息。关于什么是“序列信息”,下文会详细进行介绍。 + +### 基本使用概念 + +- 在PaddlePaddle内部,神经网络中一个计算层的输入/输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输出,每一个输入/输出都会对应有自己的`Argument`。 +- `Argument` 并不真正“存储”数据,而是将输入/输出信息有机地组织在一起。 +- 在`Argument`内部由`IVector`(对应着上文提到的一维整型数组)和`Matrix`(对应着上文提到的二维浮点型矩阵)来实际存储数据;由 `Sequence Start Positions` (下文详细解释) 来描述输入/输出的序列信息。 + +- **注**: + 1. 这篇文档之后部分将会统一使用`argument`来特指PaddlePaddle中神经网络计算层一个输入/输出数据。 + 1. 使用`paddle_ivector`来特指PaddlePaddle中的一维整型数组。 + 1. 使用`paddle_matrix`来特指PaddlePaddle中的二维浮点型矩阵。 + +### 组织输入数据 +- 一维整型数组 + + 概念上可以将`paddle_ivector`理解为一个一维的整型数组,通常用于表示离散的类别标签,或是在自然语言处理任务中表示词语在字典中的序号。下面的代码片段创建了含有三个元素`1`、`2`、`3`的`paddle_ivector`。 + ```c + int ids[] = {1, 2, 3}; + paddle_ivector ids_array = + paddle_ivector_create(ids, sizeof(ids) / sizeof(int), false, false); + CHECK(paddle_arguments_set_ids(in_args, 0, ids_array)); + ``` + +- **稠密矩阵** + - 一个`m×n`的稠密矩阵是一个由`m`行`n`列元素排列成的矩形阵列,矩阵里的元素是浮点数。对神经网络来说,矩阵的高度`m`是一次预测接受的样本数目,宽度$n$是神经网络定义时,`paddle.layer.data`的`size`。 + - 下面的代码片段创建了一个高度为1,宽度为`layer_size`的稠密矩阵,矩阵中每个元素的值随机生成。 + + ```c + paddle_matrix mat = paddle_matrix_create( + /* height = batch size */ 1, + /* width = dimensionality of the data layer */ layer_size, + /* whether to use GPU */ false); + + paddle_real* array; + // Get the pointer pointing to the start address of the first row of the + // created matrix. + CHECK(paddle_matrix_get_row(mat, 0, &array)); + + // Fill the matrix with a randomly generated test sample. + srand(time(0)); + for (int i = 0; i < layer_size; ++i) { + array[i] = rand() / ((float)RAND_MAX); + } + + // Assign the matrix to the argument. + CHECK(paddle_arguments_set_value(in_args, 0, mat)); + ``` + +- **稀疏矩阵** + + PaddlePaddle C-API 中 稀疏矩阵使用[CSR(Compressed Sparse Row Format)](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format))格式存储。下图是CSR存储稀疏矩阵的示意图。 +

+
图1. 稀疏矩阵存储示意图 +

+ + CSR存储格式通过:(1)非零元素的值(上图中的`values`);(2)行偏移(上图中的`row offsets`):每一行元素在`values`中的起始偏移,`row offsets`中元素个数总是等于行数 + 1;(3)非零元素的列号(上图中的`column indices`)来确定稀疏矩阵的内容。 + + 在PaddlePaddle C-API中,通过调用以下接口创建稀疏矩阵: + + ```c + PD_API paddle_matrix paddle_matrix_create_sparse( + uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu); + ``` + + 1. 创建稀疏矩阵时需要显示地指定矩阵的(1)高度(`height`,在神经网络中等于一次预测处理的样本数)(2)宽度(`width`,`paddle.layer.data`的`size`)以及(3)非零元个数(`nnz`)。 + 1. 当上述接口第4个参数`isBinary`指定为`true`时,**只需要设置行偏移(`row_offset`)和列号(`colum indices`),不需要提供元素值(`values`)**,这时行偏移和列号指定的元素默认其值为1。 + + 下面的代码片段创建了一个CPU上的二值稀疏矩阵: + + ```c + paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, true, false); + int colIndices[] = {9, 93, 109}; // layer_size here is greater than 109. + int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)}; + + CHECK(paddle_matrix_sparse_copy_from(mat, + rowOffset, + sizeof(rowOffset) / sizeof(int), + colIndices, + (colIndices) / sizeof(int), + NULL /*values array is NULL.*/, + 0 /*size of the value arrary is 0.*/)); + CHECK(paddle_arguments_set_value(in_args, 0, mat)); + ``` + 下面的代码片段在创建了一个CPU上的带元素值的稀疏矩阵: + ```c + paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, false, false); + int colIndices[] = {9, 93, 109}; // layer_size here is greater than 109. + int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)}; + float values[] = {0.5, 0.5, 0.5}; + + CHECK(paddle_matrix_sparse_copy_from(mat, + rowOffset, + sizeof(rowOffset) / sizeof(int), + colIndices, + sizeof(colIndices) / sizeof(int), + values, + sizeof(values) / sizeof(float))); + ``` + 注意事项: + 1. 移动端预测**不支持**稀疏矩阵及相关的接口。 + +### 组织序列信息 + +多个排成一列的元素(可以是整型、浮点数、浮点数向量等)构成一个序列,元素之间的顺序是序列所携带的重要信息。不同序列可能会含有不同数目个元素。在 PaddlePaddle 中,序列输入/输出数据是在上文介绍的**数据输入(一维整型数组,二维浮点数矩阵)基础上,附加上序列信息**。下面详细解释什么是“序列信息”。 + +我们将神经网络一次计算接受的所有输入样本称之为一个`batch`(可以含有一条或多条样本),每一个序列在整个`batch`中的偏移,就是PaddlePaddle中所指的**序列信息**,称之为“sequence start positions”。PaddlePaddle 支持两种序列类型: + +1. 单层序列 + - 序列中的每一个元素是非序列,是进行计算的基本单位,不可再进行拆分。 + - 例如:自然语言中的句子是一个序列,序列中的元素是词语; +1. 双层序列 + - 序列中的每一个元素又是一个序列。 + - 例如:自然语言中的段落是一个双层序列;段落是由句子构成的序列;句子是由词语构成的序列。 + - 双层序列在处理长序列的任务或是构建层级模型时会发挥作用。 + +这篇文档之后部分会统一使用`sequence_start_positions`来特指:PaddlePaddle中神经网络计算层输入/输出所携带的序列信息。 + +对双层序列来讲,不仅要提供每一个外层序列在整个`batch`中的偏移,每一个外层序列又含有若干个内层序列,需要同时提供每一个内层序列在整个`batch`中的偏移。也就是说:**双层序列需要设置分别为外层序列和内层序列分别设置`sequence_start_positions`信息**。 + +**注:** +1. 不论序列中的元素在内存中占用多少实际存储空间,`sequence_start_positions`表示的偏移是以“序列中的一个元素”作为统计的基本单位,而不是相对`batch`起始存储地址以数据的存储大小为单位的偏移。 +1. 非序列输入不携带`sequence_start_positions`,非序列输入无需构造`sequence_start_positions`。 +1. **不论是单层序列还是双层序列的序列信息,都使用`paddle_ivector`(也就是PaddlePaddle中的一维整型数组)来存储。** + +图2 是PaddlePaddle中单层序列和双层序列存储示意图。 +

+
图2. 序列输入示意图 +

+ +- 单层序列 + + 图2 (a) 展示了一个含有4个序列的`batch`输入: + 1. 4个序列的长度分别为:5、3、2、4; + 1. 这时的`sequence_start_positions`为:`[0, 5, 8, 10, 14]`; + 1. 本地训练. 不论数据域是`paddle_ivector`类型还是`paddle_matrix`类型,都可以通过调用下面的接口为原有的数据输入附加上序列信息,使之变为一个单层序列输入,代码片段如下: + + ```c + int seq_pos_array[] = {0, 5, 8, 10, 14}; + paddle_ivector seq_pos = paddle_ivector_create( + seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false); + // Suppose the network only has one input data layer. + CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos)); + ``` + +- 双层序列 + + 图2 (b) 展示了一个含有4个序列的`batch`输入; + 1. 4个序列的长度分别为:5、3、2、4;这四个序列又分别含有3、2、1、2个子序列; + 1. 这时的需要同时提供: + - 外层序列在`batch`中的起始偏移`:[0, 5, 8, 10, 14]`; + - 内层序列在`batch`中的起始偏移:`[0, 2, 3, 5, 7, 8, 10, 13, 14]`; + 1. 不论数据域是`paddle_ivector`类型还是`paddle_matrix`类型,这时需要调用创建序列信息和为`argument`设置序列信息的接口**两次**,分别为数据输入添加外层序列和内层序列的序列信息,使之变为一个双层序列输入,代码片段如下: + ```c + // set the sequence start positions for the outter sequences. + int outter_seq_pos_array[] = {0, 5, 8, 10, 14}; + paddle_ivector seq_pos = + paddle_ivector_create(outter_seq_pos_array, + sizeof(outter_pos_array) / sizeof(int), + false, + false); + // The third parameter of this API indicates the sequence level. + // 0 for the outter sequence. 1 for the inner sequence. + // If the input is a sequence not the nested sequence, the third parameter is + // fixed to be 0. + CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos)); + + // set the sequence start positions for the outter sequences. + int inner_seq_pos_array[] = {0, 2, 3, 5, 7, 8, 10, 13, 14}; + paddle_ivector seq_pos = paddle_ivector_create( + inner_pos_array, sizeof(inner_pos_array) / sizeof(int), false, false); + // The third parameter of this API indicates the sequence level. + // 0 for the outter sequence. 1 for the inner sequence. + CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 1, seq_pos)); + ``` + +注意事项: +1. 当一个`batch`中含有多个序列,**不支持序列长度为`0`的序列(也就是空输入)** 作为输入。不同计算层对空输入的处理策略有可能不同,潜在会引起未定义行为,或者引起行时错误,请在输入时进行合法性检查。 + +### Python 端数据类型说明 + +下表列出了Python端训练接口暴露的数据类型(`paddle.layer.data`函数`type`字段的取值)对应于调用C-API需要创建的数据类型: + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Python 端数据类型C-API 输入数据类型
paddle.data_type.integer_value整型数组,无需附加序列信息
paddle.data_type.dense_vector浮点型稠密矩阵,无需附加序列信息
paddle.data_type.sparse_binary_vector浮点型稀疏矩阵,无需提供非零元的值,默认为1,无需附加序列信息
paddle.data_type.sparse_vector浮点型稀疏矩阵,需提供非零元的值,无需附加序列信息
paddle.data_type.integer_value_sequence整型数组,需附加序列信息
paddle.data_type.dense_vector_sequence浮点型稠密矩阵,需附加序列信息
paddle.data_type.sparse_binary_vector_sequence浮点型稀疏矩阵,无需提供非零元的值,默认为1,需附加序列信息
paddle.data_type.sparse_vector_sequence浮点型稀疏矩阵,需提供非零元的值,需附加序列信息
paddle.data_type.integer_value_sub_sequence整型数组,需附加双层序列信息
paddle.data_type.dense_vector_sub_sequence浮点型稠密矩阵,需附加双层序列信息
paddle.data_type.sparse_binary_vector_sub_sequence浮点型稀疏矩阵,无需提供非零元的值,默认为1,需附加双层序列信息
paddle.data_type.sparse_vector_sub_sequence浮点型稀疏矩阵,需提供非零元的值,需附加双层序列信息
+ +
+ + +### 输出数据 + +PaddlePaddle中一个计算层的输出数据组织方式和输入数据组织方式完全相同。一个输出数据同样被组织为一个`argument`,`argument`通过`paddle_matrix`或`paddle_ivector`存数数据,如果输出是一个序列,那么会携带有`sequence_start_positions`信息。调用C-API相关接口,读取需要的结果即可。 + +### 总结 + +- 在PaddlePaddle内部,神经网络中一个计算层的输入/输出被组织为`argument`。 +- `argument`并不真正“存储”数据,而是将输入/输出信息有机地组织在一起。 +- 在`argument`内部由`paddle_ivector`(一维整型数组)和`paddle_matrix`(二维浮点型矩阵)来实际存储数据。 +如果是一个序列输入/输出由 `sequence start positions` 来记录输入/输出的序列信息。 + +于是,在组织神经网络输入时,需要思考完成以下工作: + +1. 为每一个输入/输出创建`argument`。 + - C-API 中操作`argument`的接口请查看[argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h)。 +1. 为每一个`argument`创建`paddle_matrix`或者`paddle_ivector`来存储数据。 + - C-API 中操作`paddle_ivector`的接口请查看 [vector.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/vector.h)。 + - C-API 中操作`paddle_matrix`的接口请查看[matrix.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/matrix.h)。 +1. 如果输入是序列数据,需要创建并填写`sequence_start_positions`信息。 + - 通过调用 [`paddle_arguments_set_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L137) 来为一个`argument`添加序列信息。 + - 通过调用 [`paddle_arguments_get_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L150) 来读取一个`argument`添加序列信息。 + - 接口说明请查看 [argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h) 文件。 diff --git a/doc/v2/howto/capi/organization_of_the_inputs_en.md b/doc/v2/howto/capi/organization_of_the_inputs_en.md new file mode 100644 index 0000000000000000000000000000000000000000..250d3b2f749aed018e63527e817899c843dff996 --- /dev/null +++ b/doc/v2/howto/capi/organization_of_the_inputs_en.md @@ -0,0 +1,3 @@ +## Input/Output Data Organization + +TBD diff --git a/doc/v2/howto/capi/workflow_of_capi_cn.md b/doc/v2/howto/capi/workflow_of_capi_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..db1568a2afbea3cca0d4e1fe053ba9536a60ab3d --- /dev/null +++ b/doc/v2/howto/capi/workflow_of_capi_cn.md @@ -0,0 +1,124 @@ +## C-API使用流程 + +这篇文档介绍 PaddlePaddle C-API 整体使用流程。 + +### 使用流程 + +使用 C-API 的工作流程如图1所示,分为(1)准备预测模型和(2)预测程序开发两大部分。 + +

+
图1. C-API使用流程示意图 +

+ +- 准备预测模型 + + 1. 只将神经网络结构进行序列化。 + - 只对神经网络结构进行序列化,加载模型需同时指定:网络结构的序列化结果和模型参数存储目录。 + 1. 将网络结构定义和训练结束存储下来的模型参数文件(多个)合并入一个文件。 + - 神经网络模型结构和训练好的模型将被序列化合并入一个文件。 + - 预测时只需加载一个文件便于发布。 + - **注意**:以上两种方式只需选择其一即可。 +- 调用 C-API 开发预测序 + + 1. 初始化PaddlePaddle运行环境。 + 1. 加载预测模型。 + 1. 创建神经网络输入,组织输入数据。 + 1. 进行前向计算,获得计算结果。 + 1. 清理和结束。 + +### 准备预测模型 + +准备预测模型部分,我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression),网络接受一幅图片作为输入,将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense) 中的相关脚本。 + +调用C-API开发预测程序需要一个训练好的模型,运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本,在终端执行`python mnist_v2.py`,会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。 + +下面,我们将训练结束后存储下来的模型转换成预测模型。 + +1. 序列化神经网络模型配置 + + PaddlePaddle 使用 protobuf 来传输网络配置文件中定义的网络结构和相关参数,使用 C-API 进行预测时,需要将网络结构使用 protobuf 进行序列化,写入文件中。 + + 调用[`paddle.utils.dump_v2_config`](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/utils/dump_v2_config.py)中的`dump_v2_config`函数能够将使用 PaddlePaddle V2 API 定义的神经网络结构 dump 到指定文件中,示例代码如下: + + ```python + from paddle.utils.dump_v2_config import dump_v2_config + from mnist_v2 import network + + predict = network(is_infer=True) + dump_v2_config(predict, "trainer_config.bin", True) + ``` + + 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例,[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程,可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化,结果会写入当前运行目录下的`trainer_config.bin`文件中。 + + 使用这种方式,需要**在运行时将神经网络的多个可学习参数放在同一个目录中**,C-API可以通过分别指定序列化后的网络结构文件和参数目录来加载训练好的模型。 + +2. 合并模型文件(可选) + + 一些情况为了便于发布,希望能够将序列化后的神经网络结构和训练好的模型参数打包进一个文件。对于这样的需求,可以使用`paddle.utils.merge_model`中的`merge_v2_model`接口对神经网络结构和训练好的参数进行序列化,将序列化结果写入一个文件内。 + + 代码示例如下: + + ```python + from paddle.utils.merge_model import merge_v2_model + from mnist_v2 import network + + net = network(is_infer=True) + param_file = "models/params_pass_4.tar" + output_file = "output.paddle.model" + merge_v2_model(net, param_file, output_file) + ``` + + 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。 + +#### 注意事项 +1. 为使用C-API,在调用`dump_v2_config`序列化神经网络结构时,参数`binary`必须指定为`True`。 +1. **预测使用的网络结构往往不同于训练**,通常需要去掉网络中的:(1)类别标签层;(2)损失函数层;(3)`evaluator`等,只留下核心计算层,请注意是否需要修改网络结构。 +1. 预测时,可以获取网络中定义的任意多个(大于等于一个)层前向计算的结果,需要哪些层的计算结果作为输出,就将这些层加入一个Python list中,作为调用`dump_v2_config`的第一个参数。 + +### 编写预测代码 + +预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。 + +#### step 1. 初始化PaddlePaddle运行环境 +第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/main.h#L27) 初始化PaddlePaddle运行环境,该接口接受两个参数:参数的个数和参数列表。 + +#### step2. 
加载模型 + +这里介绍C-API使用中的一个重要概念:Gradient Machine。 + +概念上,在 PaddlePaddle 内部,一个GradientMachine类的对象管理着一组计算层(PaddlePaddle Layers)来完成前向和反向计算,并处理与之相关的所有细节。在调用C-API预测时,只需进行前向计算而无需调用反向计算。这篇文档之后部分会使用`gradient machine`来特指调用PaddlePaddle C-API创建的GradientMachine类的对象。每一个 `gradient machine` 都会管理维护一份训练好的模型,下面是C-API提供的,两种常用的模型加载方式: + +1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L61)接口,从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型; +1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/multi_thread/main.c)。 + +- 注意事项 + + 1. 使用PaddlePaddle V2 API训练,模型中所有可学习参数会被存为一个压缩文件,需要手动进行解压,将它们放在同一目录中,C-API不会直接加载 V2 API 存储的压缩文件。 + 1. 如果使用`merge model`方式将神经网络结构和训练好的参数序列化到一个文件,请参考此[示例](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)。 + 1. 通过灵活使用以上两个接口,加载模型可其它多种方式,例如也可在程序运行过程中再加载另外一个模型。 + +#### step 3. 创建神经网络输入,组织输入数据 + +基本使用概念: +- 在PaddlePaddle内部,神经网络中一个计算层的输入输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输出,每一个输入/输出都会对应有自己的`Argument`。 +- `Argument` 并不真正“存储”数据,而是将输入/输出数据有机地组织在一起。 +- 在`Argument`内部由:1. `Matrix`(二维矩阵,存储浮点类型输入/输出);2. `IVector`(一维数组,**仅用于存储整型值**,多用于自然语言处理任务)来实际存储数据。 + +C-API支持的所有输入数据类型和他们的组织方式,请参考“输入/输出数据组织”一节。 + +这篇文档的之后部分会使用`argument`来特指PaddlePaddle C-API中神经网络的一个输入/输出,使用`paddle_matrix`**特指**`argument`中用于存储数据的`Matrix`类的对象。 + +在组织神经网络输入,获取输出时,需要思考完成以下工作: + +1. 为每一个输入/输出创建`argument`; +1. 为每一个`argument`创建`paddle_matrix`来存储数据; + +与输入不同的是,不需在使用C-API时为输出`argument`的`paddle_matrix`对象分配空间。前向计算之后PaddlePaddle内部已经分配/管理了每个计算层输出的存储空间。 + +#### step 4. 前向计算 + +完成上述准备之后,通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。 + +#### step 5. 
清理 + +结束预测之后,对使用的中间变量和资源进行清理和释放。 diff --git a/doc/v2/howto/capi/workflow_of_capi_en.md b/doc/v2/howto/capi/workflow_of_capi_en.md new file mode 100644 index 0000000000000000000000000000000000000000..1692ecd56520675f02ad25ef73761330ebd0e740 --- /dev/null +++ b/doc/v2/howto/capi/workflow_of_capi_en.md @@ -0,0 +1,3 @@ +## C-API Workflow + +TBD diff --git a/doc/v2/howto/cluster/cmd_argument_cn.md b/doc/v2/howto/cluster/cmd_argument_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..c0ba093cbf2eac5c3b60a0b071b31776a11998f3 --- /dev/null +++ b/doc/v2/howto/cluster/cmd_argument_cn.md @@ -0,0 +1,167 @@ +# 启动参数说明 + +下面以`doc/howto/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。 + +## 启动参数服务器 + +执行以下的命令启动一个参数服务器并等待和计算节点的数据交互 + +```bash +$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 +``` + +如果希望可以在后台运行pserver程序,并保存输出到一个日志文件,可以运行: + +```bash +$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log +``` + +参数说明 + +- port:**必选,默认7164**,pserver监听的起始端口,根据ports_num决定总端口个数,从起始端口监听多个端口用于通信 +- ports_num:**必选,默认1**,监听的端口个数 +- ports_num_for_sparse:**必选,默认0**,用于稀疏类型参数通信的端口个数 +- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数 + +## 启动计算节点 + +执行以下命令启动使用python编写的trainer程序(文件名为任意文件名,如train.py) + +```bash +$ python train.py +``` + +trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过[环境变量](https://zh.wikipedia.org/wiki/环境变量)或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。 + +使用环境变量: + +```bash +export PADDLE_INIT_USE_GPU=False +export PADDLE_INIT_TRAINER_COUNT=1 +export PADDLE_INIT_PORT=7164 +export PADDLE_INIT_PORTS_NUM=1 +export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1 +export PADDLE_INIT_NUM_GRADIENT_SERVERS=1 +export PADDLE_INIT_TRAINER_ID=0 +export PADDLE_INIT_PSERVERS=127.0.0.1 +``` + +使用参数: + +```python +paddle.init( + use_gpu=False, + trainer_count=1, + port=7164, + ports_num=1, + ports_num_for_sparse=1, + num_gradient_servers=1, + trainer_id=0, + pservers="127.0.0.1") +``` + +参数说明 + +- use_gpu: **可选,默认False**,是否启用GPU训练 +- trainer_count:**必选,默认1**,当前trainer的线程数目 +- port:**必选,默认7164**,连接到pserver的端口 +- ports_num:**必选,默认1**,连接到pserver的端口个数 +- ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数 +- num_gradient_servers:**必选,默认1**,当前训练任务trainer总数 +- trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数 +- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 + +```python +trainer = paddle.trainer.SGD(..., is_local=False) +``` + +参数说明 + +- is_local: **必选, 默认True**, 是否使用PServer更新参数 + +## 准备数据集 + +参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py),准备训练数据和验证数据集,我们使用paddle.dataset.imikolov数据集,并根据分布式训练并发数(trainer节点个数),在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。 + +在线上系统中,通常会使用MapReduce任务的输出结果作为训练结果,这样训练文件的个数会比较多,而且个数并不确定。在trainer中可以使用下面取模的方法为每个trainer分配训练数据文件: + +```python +import os +train_list = [] +flist = os.listdir("/train_data/") +for f in flist: + suffix = int(f.split("-")[1]) + if suffix % TRAINER_COUNT == TRAINER_ID: + train_list.append(f) +``` + +示例程序`prepare.py`会把训练集和测试集分别分割成多个文件(例子中为3个,后缀为`-00000`、`-00001`和`-00002`): + +```bash +train.txt +train.txt-00000 +train.txt-00001 +train.txt-00002 +test.txt +test.txt-00000 +test.txt-00001 +test.txt-00002 +``` + +在进行分布式训练时,每个trainer进程需要能够读取属于自己的一份数据。在一些分布式系统中,系统会提供一个分布式存储服务,这样保存在分布式存储中的数据可以被集群中的每个节点读取到。如果不使用分布式存储,则需要手动拷贝属于每个trainer节点的训练数据到对应的节点上。 + 
+对于不同的训练任务,训练数据格式和训练程序的`reader()`会大不相同,所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。 + +## 准备训练程序 + +我们会对每个训练任务都会在每个节点上创建一个工作空间(workspace),其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。 + +最后,工作空间应如下所示: + +```bash +. +|-- my_lib.py +|-- word_dict.pickle +|-- train.py +|-- train_data_dir/ +| |-- train.txt-00000 +| |-- train.txt-00001 +| |-- train.txt-00002 +`-- test_data_dir/ + |-- test.txt-00000 + |-- test.txt-00001 + `-- test.txt-00002 +``` + +- `my_lib.py`:会被`train.py`调用的一些用户定义的库函数,比如PIL库等。 +- `word_dict.pickle`:在`train.py`中会使用到的字典数据文件。 +- `train.py`:训练程序,代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py)。***注意:*** 对于本样例代码,在使用不同的分布式计算平台时,您可能需要修改`train.py`开头的部分(如下),以便获得训练数据的位置和获取环境变量配置: + + ```python + cluster_train_file = "./train_data_dir/train/train.txt" + cluster_test_file = "./test_data_dir/test/test.txt" + node_id = os.getenv("OMPI_COMM_WORLD_RANK") + if not node_id: + raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") + ``` + +- `train_data_dir`:包含训练数据的目录,可以是从分布式存储挂载过来的,也可以是在任务启动前下载到本地的。 +- `test_data_dir`:包含测试数据集的目录。 + +## 异步 SGD 更新 + +我们可以通过设置 `optimize` 的参数使之支持异步SGD更新。 +例如,设置 `AdaGrad` optimize 的 `is_async` 和 `async_lagged_grad_discard_ratio` 参数: + +```python +adagrad = paddle.optimizer.AdaGrad( + is_async=True, + async_lagged_grad_discard_ratio=1.6, + learning_rate=3e-3, + regularization=paddle.optimizer.L2Regularization(8e-4)) +``` + +- `is_async`: 是否为异步SGD更新模式。 +- `async_lagged_grad_discard_ratio`: 异步SGD更新的步长控制,接收到足够的gradient( + `async_lagged_grad_discard_ratio * num_gradient_servers`)之后,后面的gradient + 将会被抛弃。 diff --git a/doc/v2/howto/cluster/cmd_argument_en.md b/doc/v2/howto/cluster/cmd_argument_en.md new file mode 100644 index 0000000000000000000000000000000000000000..df1381a00fa0fa129eecffe002164c489a4183aa --- /dev/null +++ b/doc/v2/howto/cluster/cmd_argument_en.md @@ -0,0 +1,169 @@ +# Command-line arguments + +We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API. + +## Starting parameter server + +Type the below command to start a parameter server which will wait for trainers to connect: + +```bash +$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0 +``` + +If you wish to run parameter servers in background, and save a log file, you can type: + +```bash +$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0 &> pserver.log & +``` + +Parameter Description + +- port: **required, default 7164**, port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput. +- ports_num: **required, default 1**, total number of ports will listen on. +- ports_num_for_sparse: **required, default 0**, number of ports which serves sparse parameter update. +- num_gradient_servers: **required, default 1**, total number of gradient servers. +- nics: **optional, default xgbe0,xgbe1**, network device name which paramter server will listen on. + +## Starting trainer + +Type the command below to start the trainer(name the file whatever you want, like "train.py") + +```bash +$ python train.py +``` + +Trainers' network need to be connected with parameter servers' network to finish the job. Trainers need to know port and IPs to locate parameter servers. 
You can pass arguments to trainers through [environment variables](https://en.wikipedia.org/wiki/Environment_variable) or pass to `paddle.init()` function. Arguments passed to the `paddle.init()` function will overwrite environment variables. + +Use environment viriables: + +```bash +export PADDLE_INIT_USE_GPU=False +export PADDLE_INIT_TRAINER_COUNT=1 +export PADDLE_INIT_PORT=7164 +export PADDLE_INIT_PORTS_NUM=1 +export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1 +export PADDLE_INIT_NUM_GRADIENT_SERVERS=1 +export PADDLE_INIT_TRAINER_ID=0 +export PADDLE_INIT_PSERVERS=127.0.0.1 +python train.py +``` + +Pass arguments: + +```python +paddle.init( + use_gpu=False, + trainer_count=1, + port=7164, + ports_num=1, + ports_num_for_sparse=1, + num_gradient_servers=1, + trainer_id=0, + pservers="127.0.0.1") +``` + +Parameter Description + +- use_gpu: **optional, default False**, set to "True" to enable GPU training. +- trainer_count: **required, default 1**, number of threads in current trainer. +- port: **required, default 7164**, port to connect to parameter server. +- ports_num: **required, default 1**, number of ports for communication. +- ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation. +- num_gradient_servers: **required, default 1**, number of trainers in current job. +- trainer_id: **required, default 0**, ID for every trainer, start from 0. +- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",". + +```python +trainer = paddle.trainer.SGD(..., is_local=False) +``` + +Parameter Description + +- is_local: **required, default True**, whether update parameters by PServer. + +## Prepare Training Dataset + +Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the begining of `prepare.py` to change the count of output files. + +In the real world, we often use `MapReduce` job's output as training data, so there will be lots of files. You can use `mod` to assign training file to trainers: + +```python +import os +train_list = [] +flist = os.listdir("/train_data/") +for f in flist: + suffix = int(f.split("-")[1]) + if suffix % TRAINER_COUNT == TRAINER_ID: + train_list.append(f) +``` + +Example code `prepare.py` will split training data and testing data into 3 files with digital suffix like `-00000`, `-00001` and`-00002`: + +```bash +train.txt +train.txt-00000 +train.txt-00001 +train.txt-00002 +test.txt +test.txt-00000 +test.txt-00001 +test.txt-00002 +``` + +When job started, every trainer needs to get it's own part of data. In some distributed systems a storage service will be provided, so the date under that path can be accessed by all the trainer nodes. Without the storage service, you must copy the training data to each trainer node. + +Different training jobs may have different data format and `reader()` function, developers may need to write different data prepare scripts and `reader()` functions for their job. + +## Prepare Training program + +We'll create a *workspace* directory on each node, storing your training program, dependencies, mounted or downloaded dataset directory. + +Your workspace may looks like: + +```bash +. 
+|-- my_lib.py +|-- word_dict.pickle +|-- train.py +|-- train_data_dir/ +| |-- train.txt-00000 +| |-- train.txt-00001 +| |-- train.txt-00002 +`-- test_data_dir/ + |-- test.txt-00000 + |-- test.txt-00001 + `-- test.txt-00002 +``` + +- `my_lib.py`: user defined libraries, like PIL libs. This is optional. +- `word_dict.pickle`: dict file for training word embeding. +- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables: + + ```python + cluster_train_file = "./train_data_dir/train/train.txt" + cluster_test_file = "./test_data_dir/test/test.txt" + node_id = os.getenv("OMPI_COMM_WORLD_RANK") + if not node_id: + raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") + ``` + +- `train_data_dir`: containing training data. Mount from storage service or copy trainning data to here. +- `test_data_dir`: containing testing data. + +## Async SGD Update + +We can set some parameters of the optimizer to make it support async SGD update. +For example, we can set the `is_async` and `async_lagged_grad_discard_ratio` of the `AdaGrad` optimizer: + +```python +adagrad = paddle.optimizer.AdaGrad( + is_async=True, + async_lagged_grad_discard_ratio=1.6, + learning_rate=3e-3, + regularization=paddle.optimizer.L2Regularization(8e-4)) +``` + +- `is_async`: Is Async-SGD or not. +- `async_lagged_grad_discard_ratio`: For async SGD gradient commit control. + when `async_lagged_grad_discard_ratio * num_gradient_servers` commit passed, + current async gradient will be discard silently. diff --git a/doc/v2/howto/cluster/index_cn.rst b/doc/v2/howto/cluster/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..2583457c54116b7a1d797d4f7b7c2c4789c6d882 --- /dev/null +++ b/doc/v2/howto/cluster/index_cn.rst @@ -0,0 +1,36 @@ +分布式训练 +========== + +深度学习模型的效果好坏与数据量的大小往往有直接的关系:相同的模型,在增大训练数据集后一般都能取得更好的效果。但是当数据量增大到一定程度后,单台计算机已经难以承受。这时,使用多台计算机进行分布式训练就是一个很自然的解决方案。在分布式训练中,训练数据被分割为多份,参与训练的多台机器分别读取自己的数据进行训练,并协同对整体模型的参数进行更新。 + +分布式训练一般有着如下图所示的架构: + +.. image:: src/ps_cn.png + :width: 500 + +- 数据分片(Data shard): 用于训练神经网络的数据,被切分成多个部分,每个部分分别给每个trainer使用。 +- 计算节点(Trainer): 每个trainer启动后读取切分好的一部分数据,开始神经网络的“前馈”和“后馈”计算,并和参数服务器通信。在完成一定量数据的训练后,上传计算得出的梯度(gradients),然后下载优化更新后的神经网络参数(parameters)。 +- 参数服务器(Parameter server):每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度,并完成参数优化更新,再将更新后的参数下发到每个计算节点。 + +通过计算节点和参数服务器的分布式协作,可以完成神经网络的同步随机梯度下降(SGD)方法的训练。PaddlePaddle同时支持同步随机梯度下降(SGD)和异步随机梯度下降(ASGD)。 + +在开始集群训练之前,需要先进行集群配置、PaddlePaddle安装等准备工作,了解如何通过这些步骤来配置分布式训练所需的基本环境: + +.. toctree:: + :maxdepth: 1 + + preparations_cn.md + +集群训练有大量可配置的参数,例如使用的机器数量、通信端口等。了解如何通过设置启动参数的方式,对分布式训练的过程进行配置: + +.. toctree:: + :maxdepth: 1 + + cmd_argument_cn.md + +PaddlePaddle可以兼容各种不同的集群。每种集群各有优势,使用的具体方式也有区别: + +.. toctree:: + :maxdepth: 1 + + multi_cluster/index_cn.rst diff --git a/doc/v2/howto/cluster/index_en.rst b/doc/v2/howto/cluster/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..31eda57c4fb3947d92df45ea8dbb9274c9814140 --- /dev/null +++ b/doc/v2/howto/cluster/index_en.rst @@ -0,0 +1,38 @@ +Distributed Training +==================== + +The effectiveness of the deep learning model is often directly related to the scale of the data: it can generally achieve better results after increasing the size of the dataset on the same model. 
However, it can not fit in one single computer when the amount of data increases to a certain extent. At this point, using multiple computers for distributed training is a natural solution. In distributed training, the training data is divided into multiple copies (sharding), and multiple machines participating in the training read their own data for training and collaboratively update the parameters of the overall model. + +Distributed training generally has framwork as shown below: + +.. image:: src/ps_en.png + :width: 500 + +- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job. +- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training. +- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers. + +The training of synchronous random gradient descent for neural network can be achieved by cooperation of trainers and parameter servers. + +PaddlePaddle supports both synchronize stochastic gradient descent (SGD) and asynchronous SGD. + +Before starting the cluster training, you need to prepare the cluster configuration, PaddlePaddle installation, and other preparations. To understand how to configure the basic environment for distributed training, check the link below: + +.. toctree:: + :maxdepth: 1 + + preparations_en.md + +Cluster training has a large number of configurable parameters, such as the number of machines used, communication ports, etc. To learn how to configure the distributed training process by setting startup these parameters, check the link below: + +.. toctree:: + :maxdepth: 1 + + cmd_argument_en.md + +PaddlePaddle is compatible with a variety of different clusters. Each cluster has its own advantages, To learn how to run PaddlePaddle in different types of them, check the link below: + +.. 
toctree:: + :maxdepth: 1 + + multi_cluster/index_en.rst diff --git a/doc/v2/howto/cluster/multi_cluster/fabric_cn.md b/doc/v2/howto/cluster/multi_cluster/fabric_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..0385e401b399a51fad112e604dc56cb2f84c0a4b --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/fabric_cn.md @@ -0,0 +1,42 @@ +# 使用fabric启动集群训练 + +## 准备一个Linux集群 +可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下,执行`kubectl -f ssh_servers.yaml`启动一个测试集群,并使用`kubectl get po -o wide`获得这些节点的IP地址。 + +## 启动集群作业 + +`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下,所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。 + +`paddle.py` 为方便作业启动提供了两个独特的命令选项。 + +- `job_dispatch_package` 设为本地 `workspace` 目录,它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担,否则频繁的多节点工作空间部署可能会很麻烦。 +- `job_workspace` 设为已部署的工作空间目录,`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。 + +`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务,只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`,然后: +``` +sh run.sh +``` + +集群作业将会在几秒后启动。 + +## 终止集群作业 +`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。 + +## 检查集群训练结果 +详细信息请检查 $workspace/log 里的日志,每一个节点都有相同的日志结构。 + +`paddle_trainer.INFO` +提供几乎所有训练的内部输出日志,与本地训练相同。这里检验运行时间模型的收敛。 + +`paddle_pserver2.INFO` +提供 pserver 运行日志,有助于诊断分布式错误。 + +`server.log` +提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。 + +`train.log` +提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。 + +## 检查模型输出 +运行完成后,模型文件将被写入节点 0 的 `output` 目录中。 +工作空间中的 `nodefile` 表示当前集群作业的节点 ID。 diff --git a/doc/v2/howto/cluster/multi_cluster/fabric_en.md b/doc/v2/howto/cluster/multi_cluster/fabric_en.md new file mode 100644 index 0000000000000000000000000000000000000000..bac9ffe1526a06a3a23b1d8acf33a5fb74b7e50d --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/fabric_en.md @@ -0,0 +1,43 @@ +# Fabric + +## Prepare a Linux cluster + +Run `kubectl -f ssh_servers.yaml` under the directory: `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes. + +## Launching Cluster Job +`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes. + +`paddle.py`provides two distinguished command option for easy job launching. + +- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying. +- `job_workspace` set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy +dispatch latency. + +`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then: +``` +sh run.sh +``` + +The cluster Job will start in several seconds. + +## Kill Cluster Job +`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed. 
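+
+If a crashed run leaves orphan processes on some nodes, they can be cleaned up by hand. A minimal sketch, assuming the process names match the log files described in the next section (`paddle_trainer`, `paddle_pserver2`):
+
+```
+# Kill any leftover PaddlePaddle processes on a node after a crashed run.
+pkill -f paddle_trainer
+pkill -f paddle_pserver2
+```
+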
+ +## Check Cluster Training Result +Check log in $workspace/log for details, each node owns same log structure. + +`paddle_trainer.INFO` +It provides almost all internal output log for training, same as local training. Check runtime model convergence here. + +`paddle_pserver2.INFO` +It provides parameter server running log, which could help to diagnose distributed error. + +`server.log` +It provides stderr and stdout of parameter server process. Check error log if training crashes. + +`train.log` +It provides stderr and stdout of trainer process. Check error log if training crashes. + +## Check Model Output +After one pass finished, model files will be written in `output` directory in node 0. +`nodefile` in workspace indicates the node id of current cluster job. diff --git a/doc/v2/howto/cluster/multi_cluster/index_cn.rst b/doc/v2/howto/cluster/multi_cluster/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..eabf95eda0b20f91913201a6b4e5b56fa440597e --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/index_cn.rst @@ -0,0 +1,35 @@ +在不同集群中运行 +================ +用户的集群环境不尽相同,为了方便大家的部署,我们提供了多种的集群部署方式,方便提交集群训练任务,以下将一一介绍: + +`Kubernetes `_ 是Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。以下指南展示了PaddlePaddle对Kubernetes的支持: + +.. toctree:: + :maxdepth: 1 + + k8s_cn.md + k8s_distributed_cn.md + +`OpenMPI `_ 是成熟的高性能并行计算框架,在HPC领域使用非常的广泛。以下指南介绍了如何使用OpenMPI来搭建PaddlePaddle的集群训练任务: + +.. toctree:: + :maxdepth: 1 + + openmpi_cn.md + +`Fabric `_ 是一个方便的程序部署和管理工具。我们提供了使用Fabric 进行部署、管理的方法,如果想详细了解,请阅读以下指南: + +.. toctree:: + :maxdepth: 1 + + fabric_cn.md + +我们也支持在AWS上部署PaddlePaddle,详细请了解: + +.. toctree:: + :maxdepth: 1 + + k8s_aws_cn.md + +您可以在 `cluster_train_v2 `_ 找到以上相关的例子。 + diff --git a/doc/v2/howto/cluster/multi_cluster/index_en.rst b/doc/v2/howto/cluster/multi_cluster/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..9bc1eb2e3796d95dd69b165e916e263ea34b87f6 --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/index_en.rst @@ -0,0 +1,35 @@ +Use different clusters +====================== + +The user's cluster environment is not the same. To facilitate everyone's deployment, we provide a variety of cluster deployment methods to facilitate the submission of cluster training tasks, which will be introduced as follows: + +`Kubernetes `_ is a scheduling framework of Google open source container cluster, supporting a complete cluster solution for large-scale cluster production environment. The following guidelines show PaddlePaddle's support for Kubernetes: + +.. toctree:: + :maxdepth: 1 + + k8s_en.md + k8s_distributed_en.md + +`OpenMPI `_ is a mature high-performance parallel computing framework, which is widely used in the field of HPC. The following guide describes how to use OpenMPI to build PaddlePaddle's cluster training task: + +.. toctree:: + :maxdepth: 1 + + openmpi_en.md + +`Fabric `_ is a convenient tool for program deployment and management. We provide a way to deploy and manage with Fabric. If you want to know more about it, please read the following guidelines: + +.. toctree:: + :maxdepth: 1 + + fabric_en.md + +We also support the deployment of PaddlePaddle on AWS. Learn more about: + +.. toctree:: + :maxdepth: 1 + + k8s_aws_en.md + +The examples can be found under `cluster_train_v2 `_ . 
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..afc753aa42f19631c49a451a797f28365e65ed1d --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md @@ -0,0 +1,672 @@ +# Kubernetes on AWS + +我们将向你展示怎么样在AWS的Kubernetes集群上运行分布式PaddlePaddle训练,让我们从核心概念开始 + +## PaddlePaddle分布式训练的核心概念 + +### 分布式训练任务 + +一个分布式训练任务可以看做是一个Kubernetes任务 +每一个Kubernetes任务都有相应的配置文件,此配置文件指定了像任务的pod个数之类的环境变量信息 + +在分布式训练任务中,我们可以如下操作: + +1. 在分布式文件系统中,准备分块数据和配置文件(在此次教学中,我们会用到亚马逊分布式存储服务(EFS)) +2. 创建和提交一个kubernetes任务配置到集群中开始训练 + +### Parameter Server和Trainer + +在paddlepaddle集群中有两个角色:参数服务器(pserver)者和trainer, 每一个参数服务器过程都会保存一部分模型的参数。每一个trainer都保存一份完整的模型参数,并可以利用本地数据更新模型。在这个训练过程中,trainer发送模型更新到参数服务器中,参数服务器职责就是聚合这些更新,以便于trainer可以把全局模型同步到本地。 + +为了能够和pserver通信,trainer需要每一个pserver的IP地址。在Kubernetes中利用服务发现机制(比如:DNS、hostname)要比静态的IP地址要好一些,因为任何一个pod都会被杀掉然后新的pod被重启到另一个不同IP地址的node上。现在我们可以先用静态的IP地址方式,这种方式是可以更改的。 + +参数服务器和trainer一块被打包成一个docker镜像,这个镜像会运行在被Kubernetes集群调度的pod中。 + +### 训练者ID + +每一个训练过程都需要一个训练ID,以0作为基础值,作为命令行参数传递。训练过程因此用这个ID去读取数据分片。 + +### 训练 + +PaddlePaddle容器的入口是一个shell脚本,这个脚本可以读取Kubernetes内预置的环境变量。这里可以定义任务identity,在任务中identity可以用来远程访问包含所有pod的Kubernetes apiserver服务。 + +每一个pod通过ip来排序。每一个pod的序列作为“pod id”。因为我们会在每一个pod中运行训练和参数服务,可以用“pod id”作为训练ID。入口脚本详细工作流程如下: + +1. 查找apiserver得到pod信息,通过ip排序来分配一个trainer_id。 +2. 从EFS持久化卷中复制训练数据到容器中。 +3. 从环境变量中解析paddle pserver和 paddle trainer的启动参数,然后开始启动流程。 +4. 以trainer_id来训练将自动把结果写入到EFS卷中。 + + +## AWS的Kubernetes中的PaddlePaddle + +### 选择AWS服务区域 +这个教程需要多个AWS服务工作在一个区域中。在AWS创建任何东西之前,请检查链接https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/ 选择一个可以提供如下服务的区域:EC2, EFS, VPS, CloudFormation, KMS, VPC, S3。在教程中我们使用“Oregon(us-west-2)”作为例子。 + +### 创建aws账户和IAM账户 + +在每一个aws账户下可以创建多个IAM用户。允许为每一个IAM用户赋予权限,作为IAM用户可以创建/操作aws集群 + +注册aws账户,请遵循用户指南。在AWS账户下创建IAM用户和用户组,请遵循用户指南 + +请注意此教程需要如下的IAM用户权限: + +- AmazonEC2FullAccess +- AmazonS3FullAccess +- AmazonRoute53FullAccess +- AmazonRoute53DomainsFullAccess +- AmazonElasticFileSystemFullAccess +- AmazonVPCFullAccess +- IAMUserSSHKeys +- IAMFullAccess +- NetworkAdministrator +- AWSKeyManagementServicePowerUser + + +### 下载kube-aws and kubectl + +#### kube-aws + +在AWS中[kube-aws](https://github.com/coreos/kube-aws)是一个自动部署集群的CLI工具 + +##### kube-aws完整性验证 +提示:如果你用的是非官方版本(e.g RC release)的kube-aws,可以跳过这一步骤。引入coreos的应用程序签名公钥: + +``` +gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E +``` + +指纹验证: + +``` +gpg2 --fingerprint FC8A365E +``` +正确的指纹是: `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E` + +我们可以从发布页面中下载kube-aws,教程使用0.9.1版本 [release page](https://github.com/coreos/kube-aws/releases). 
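+
+For example, on Linux the 0.9.1 tarball and its signature can be fetched as follows (a sketch that assumes the standard GitHub release asset layout; switch `PLATFORM` to `darwin-amd64` on macOS):
+
+```
+PLATFORM=linux-amd64
+VERSION=v0.9.1
+curl -LO https://github.com/coreos/kube-aws/releases/download/${VERSION}/kube-aws-${PLATFORM}.tar.gz
+curl -LO https://github.com/coreos/kube-aws/releases/download/${VERSION}/kube-aws-${PLATFORM}.tar.gz.sig
+```
+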
+ +验证tar包的GPG签名: + +``` +PLATFORM=linux-amd64 + # Or +PLATFORM=darwin-amd64 + +gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz +``` +##### 安装kube-aws +解压: + +``` +tar zxvf kube-aws-${PLATFORM}.tar.gz +``` + +添加到环境变量: + +``` +mv ${PLATFORM}/kube-aws /usr/local/bin +``` + + +#### kubectl + +[kubectl](https://Kubernetes.io/docs/user-guide/kubectl-overview/) 是一个操作Kubernetes集群的命令行接口 + +利用`curl`工具从Kubernetes发布页面中下载`kubectl` + +``` +# OS X +curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl + +# Linux +curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl +``` + +为了能是kubectl运行必须将之添加到环境变量中 (e.g. `/usr/local/bin`): + +``` +chmod +x ./kubectl +sudo mv ./kubectl /usr/local/bin/kubectl +``` + +### 配置AWS证书 + +首先检查这里 [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) 安装AWS命令行工具 + +然后配置aws账户信息: + +``` +aws configure +``` + + +添加如下信息: + + +``` +AWS Access Key ID: YOUR_ACCESS_KEY_ID +AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY +Default region name: us-west-2 +Default output format: json +``` + +`YOUR_ACCESS_KEY_ID`, and `YOUR_SECRETE_ACCESS_KEY` 是创建aws账户和IAM账户的IAM的key和密码 [Create AWS Account and IAM Account](#create-aws-account-and-iam-account) + +描述任何运行在你账户中的实例来验证凭据是否工作: + +``` +aws ec2 describe-instances +``` + +### 定义集群参数 + +#### EC2秘钥对 + +秘钥对将认证ssh访问你的EC2实例。秘钥对的公钥部分将配置到每一个COREOS节点中。 + +遵循 [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) Keypair用户指南来创建EC2秘钥对 + +你可以使用创建好的秘钥对名称来配置集群. + +在同一工作区中秘钥对为EC2实例唯一码。在教程中使用 us-west-2 ,所以请确认在这个区域(Oregon)中创建秘钥对。 + +在浏览器中下载一个`key-name.pem`文件用来访问EC2实例,我们待会会用到. + + +#### KMS秘钥 + +亚马逊的KMS秘钥在TLS秘钥管理服务中用来加密和解密集群。如果你已经有可用的KMS秘钥,你可以跳过创建新秘钥这一步,提供现存秘钥的ARN字符串。 + +利用aws命令行创建kms秘钥: + +``` +aws kms --region=us-west-2 create-key --description="kube-aws assets" +{ + "KeyMetadata": { + "CreationDate": 1458235139.724, + "KeyState": "Enabled", + "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx", + "AWSAccountId": "xxxxxxxxxxxxx", + "Enabled": true, + "KeyUsage": "ENCRYPT_DECRYPT", + "KeyId": "xxxxxxxxx", + "Description": "kube-aws assets" + } +} +``` + +我们稍后用到`Arn` 的值. + +在IAM用户许可中添加多个内联策略. + +进入[IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home)。点击`Users`按钮,点击刚才创建的用户,然后点击`Add inline policy`按钮,选择`Custom Policy` + +粘贴内联策略: + +``` + (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node){ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "Stmt1482205552000", + "Effect": "Allow", + "Action": [ + "kms:Decrypt", + "kms:Encrypt" + ], + "Resource": [ + "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*" + ] + }, + { + "Sid": "Stmt1482205746000", + "Effect": "Allow", + "Action": [ + "cloudformation:CreateStack", + "cloudformation:UpdateStack", + "cloudformation:DeleteStack", + "cloudformation:DescribeStacks", + "cloudformation:DescribeStackResource", + "cloudformation:GetTemplate", + "cloudformation:DescribeStackEvents" + ], + "Resource": [ + "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*" + ] + } + ] +} +``` +`Version` : 值必须是"2012-10-17". 
+`AWS_ACCOUNT_ID`: 你可以从命令行中获取: + +``` +aws sts get-caller-identity --output text --query Account +``` + +`MY_CLUSTER_NAME`: 选择一个你喜欢的MY_CLUSTER_NAME,稍后会用到。 +请注意,堆栈名称必须是正则表达式:[a-zA-Z][-a-zA-Z0-9*]*, 在名称中不能有"_"或者"-",否则kube-aws在下面步骤中会抛出异常 + +#### 外部DNS名称 + +当集群被创建后,基于DNS名称控制器将会暴露安全的TLS API. + +DNS名称含有CNAME指向到集群DNS名称或者记录指向集群的IP地址。 + +我们稍后会用到DNS名称,如果没有DNS名称的话,你可以选择一个(比如:`paddle`)还可以修改`/etc/hosts`用本机的DNS名称和集群IP关联。还可以在AWS上增加一个名称服务来关联paddle集群IP,稍后步骤中会查找集群IP. + +#### S3 bucket + +在启动Kubernetes集群前需要创建一个S3 bucket + +在AWS上创建s3 bucket会有许多的bugs,所以使用[s3 console](https://console.aws.amazon.com/s3/home?region=us-west-2)。 + +链接到 `Create Bucket`,确保在us-west-2 (Oregon)上创建一个唯一的BUCKET_NAME。 + +#### 初始化assets + +在本机创建一个目录用来存放产生的assets: + +``` +$ mkdir my-cluster +$ cd my-cluster +``` + +利用KMS Arn、秘钥对名称和前一步产生的DNS名称来初始化集群的CloudFormation栈: + +``` +kube-aws init \ +--cluster-name=MY_CLUSTER_NAME \ +--external-dns-name=MY_EXTERNAL_DNS_NAME \ +--region=us-west-2 \ +--availability-zone=us-west-2a \ +--key-name=KEY_PAIR_NAME \ +--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx" +``` + +`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key) + +`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name) + +`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair) + +`--kms-key-arn`: the "Arn" in [KMS key](#kms-key) + +这里的`us-west-2a`用于参数`--availability-zone`,但必须在AWS账户的有效可用区中 + +如果不能切换到其他的有效可用区(e.g., `us-west-2a`, or `us-west-2b`),请检查`us-west-2a`是支持`aws ec2 --region us-west-2 describe-availability-zones`。 + +现在在asset目录中就有了集群的主配置文件cluster.yaml。 + +默认情况下kube-aws会创建一个工作节点,修改`cluster.yaml`让`workerCount`从1个节点变成3个节点. + +#### 呈现asset目录内容 + +在这个简单的例子中,你可以使用kuber-aws生成TLS身份和证书 + +``` +kube-aws render credentials --generate-ca +``` + +下一步在asset目录中生成一组集群assets. + +``` +kube-aws render stack +``` +asserts(模板和凭证)用于创建、更新和当前目录被创建的Kubernetes集群相关联 + +### 启动Kubernetes集群 + +#### 创建一个在CloudFormation模板上定义好的实例 + +现在让我们创建集群(在命令行中选择任意的 `PREFIX`) + +``` +kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX +``` + +`BUCKET_NAME`: t在[S3 bucket](#s3-bucket)上使用的bucket名称 + + +#### 配置DNS + +你可以执行命令 `kube-aws status`来查看创建后集群的API. + +``` +$ kube-aws status +Cluster Name: paddle-cluster +Controller DNS Name: paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com +``` +如果你用DNS名称,在ip上设置任何记录或是安装CNAME点到`Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`) + +##### 查询IP地址 + +用命令`dig`去检查负载均衡器的域名来获取ip地址. + +``` +$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com + +;; QUESTION SECTION: +;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A + +;; ANSWER SECTION: +paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52 +paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 
59 IN A 54.67.102.112 +``` + +在上面的例子中,`54.241.164.52`, `54.67.102.112`这两个ip都将是工作状态 + +*如果你有DNS名称*,设置记录到ip上,然后你可以跳过“Access the cluster”这一步 + +*如果没有自己的DNS名称* + +编辑/etc/hosts文件用DNS关联IP + +##### 更新本地的DNS关联 +编辑`/etc/hosts`文件用DNS关联IP +##### 在VPC上添加route53私有名称服务 + - 打开[Route53 Console](https://console.aws.amazon.com/route53/home) + - 根据配置创建域名zone + - domain名称为: "paddle" + - Type: "Private hosted zone for amazon VPC" + - VPC ID: `` + + ![route53 zone setting](src/route53_create_zone.png) + - 添加记录 + - 点击zone中刚创建的“paddle” + - 点击按钮“Create record set” + - Name : leave blank + - type: "A" + - Value: `` + + ![route53 create recordset](src/route53_create_recordset.png) + - 检查名称服务 + - 连接通过kube-aws via ssh创建的任何实例 + - 运行命令"host paddle",看看是否ip为返回的kube-controller的私有IP + +#### 进入集群 + +集群运行后如下命令会看到: + +``` +$ kubectl --kubeconfig=kubeconfig get nodes +NAME STATUS AGE +ip-10-0-0-134.us-west-2.compute.internal Ready 6m +ip-10-0-0-238.us-west-2.compute.internal Ready 6m +ip-10-0-0-50.us-west-2.compute.internal Ready 6m +ip-10-0-0-55.us-west-2.compute.internal Ready 6m +``` + + +### 集群安装弹性文件系统 + +训练数据存放在AWS上的EFS分布式文件系统中. + +1. 在[security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId)为EFS创建一个安全组 + 1. 可以看到`paddle-cluster-sg-worker` (在sg-055ee37d镜像中)安全组id +
![](src/worker_security_group.png)
+ + 2. 增加安全组`paddle-efs` ,以`paddle-cluster-sg-worker`的group id作为用户源和`ALL TCP`入栈规则。增加vpc `paddle-cluster-vpc`, 确保可用区是在[Initialize Assets](#initialize-assets)的时候用到的那一个. +
![](src/add_security_group.png)
+ +2. 利用`paddle-cluster-vpc`私有网络在[EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) 中创建弹性文件系统, 确定子网为`paddle-cluster-Subnet0`和安全区为`paddle-efs`. +
![](src/create_efs.png)
+ + +### 开始在AWS上进行paddlepaddle的训练 + +#### 配置Kubernetes卷指向EFS + +首先需要创建一个持久卷[PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) 到EFS上 + +用 `pv.yaml`形式来保存 +``` +apiVersion: v1 +kind: PersistentVolume +metadata: + name: efsvol +spec: + capacity: + storage: 100Gi + accessModes: + - ReadWriteMany + nfs: + server: EFS_DNS_NAME + path: "/" +``` + +`EFS_DNS_NAME`: DNS名称最好能描述我们创建的`paddle-efs`,看起来像`fs-2cbf7385.efs.us-west-2.amazonaws.com` + +运行下面的命令来创建持久卷: +``` +kubectl --kubeconfig=kubeconfig create -f pv.yaml +``` +下一步创建 [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/)来声明持久卷 + +用`pvc.yaml`来保存. +``` +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: efsvol +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 50Gi +``` + +行下面命令来创建持久卷声明: +``` +kubectl --kubeconfig=kubeconfig create -f pvc.yaml +``` + +#### 准备训练数据 + +启动Kubernetes job在我们创建的持久层上进行下载、保存并均匀拆分训练数据为3份. + +用`paddle-data-job.yaml`保存 +``` +apiVersion: batch/v1 +kind: Job +metadata: + name: paddle-data +spec: + template: + metadata: + name: pi + spec: + containers: + - name: paddle-data + image: paddlepaddle/paddle-tutorial:k8s_data + imagePullPolicy: Always + volumeMounts: + - mountPath: "/efs" + name: efs + env: + - name: OUT_DIR + value: /efs/paddle-cluster-job + - name: SPLIT_COUNT + value: "3" + volumes: + - name: efs + persistentVolumeClaim: + claimName: efsvol + restartPolicy: Never +``` + +运行下面的命令来启动任务: +``` +kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml +``` +任务运行大概需要7分钟,可以使用下面命令查看任务状态,直到`paddle-data`任务的`SUCCESSFUL`状态为`1`时成功,这里here有怎样创建镜像的源码 +``` +$ kubectl --kubeconfig=kubeconfig get jobs +NAME DESIRED SUCCESSFUL AGE +paddle-data 1 1 6m +``` +数据准备完成后的结果是以镜像`paddlepaddle/paddle-tutorial:k8s_data`存放,可以点击这里[here](src/k8s_data/README.md)查看如何创建docker镜像源码 + +#### 开始训练 + +现在可以开始运行paddle的训练任务,用`paddle-cluster-job.yaml`进行保存 +``` +apiVersion: batch/v1 +kind: Job +metadata: + name: paddle-cluster-job +spec: + parallelism: 3 + completions: 3 + template: + metadata: + name: paddle-cluster-job + spec: + volumes: + - name: efs + persistentVolumeClaim: + claimName: efsvol + containers: + - name: trainer + image: paddlepaddle/paddle-tutorial:k8s_train + command: ["bin/bash", "-c", "/root/start.sh"] + env: + - name: JOB_NAME + value: paddle-cluster-job + - name: JOB_PATH + value: /home/jobpath + - name: JOB_NAMESPACE + value: default + - name: TRAIN_CONFIG_DIR + value: quick_start + - name: CONF_PADDLE_NIC + value: eth0 + - name: CONF_PADDLE_PORT + value: "7164" + - name: CONF_PADDLE_PORTS_NUM + value: "2" + - name: CONF_PADDLE_PORTS_NUM_SPARSE + value: "2" + - name: CONF_PADDLE_GRADIENT_NUM + value: "3" + - name: TRAINER_COUNT + value: "3" + volumeMounts: + - mountPath: "/home/jobpath" + name: efs + ports: + - name: jobport0 + hostPort: 7164 + containerPort: 7164 + - name: jobport1 + hostPort: 7165 + containerPort: 7165 + - name: jobport2 + hostPort: 7166 + containerPort: 7166 + - name: jobport3 + hostPort: 7167 + containerPort: 7167 + restartPolicy: Never +``` + +`parallelism: 3, completions: 3` 意思是这个任务会同时开启3个paddlepaddle的pod,当pod启动后3个任务将被完成。 + +`env` 参数代表容器的环境变量,在这里指定paddlepaddle的参数. + +`ports` 指定TCP端口7164 - 7167和`pserver`进行连接,port从`CONF_PADDLE_PORT`(7164)到`CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1`(7167)。我们使用多个端口密集和稀疏参数的更新来提高延迟 + +运行下面命令来启动任务. 
+``` +kubectl --kubeconfig=kubeconfig create -f paddle-claster-job.yaml +``` + +检查pods信息 + +``` +$ kubectl --kubeconfig=kubeconfig get pods +NAME READY STATUS RESTARTS AGE +paddle-cluster-job-cm469 1/1 Running 0 9m +paddle-cluster-job-fnt03 1/1 Running 0 9m +paddle-cluster-job-jx4xr 1/1 Running 0 9m +``` + +检查指定pod的控制台输出 +``` +kubectl --kubeconfig=kubeconfig log -f POD_NAME +``` + +`POD_NAME`: 任何一个pod的名称 (e.g., `paddle-cluster-job-cm469`). + +运行`kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job`来检查训练任务的状态,将会在大约20分钟完成 + +`pserver`和`trainer`的细节都隐藏在docker镜像`paddlepaddle/paddle-tutorial:k8s_train`中,这里[here](src/k8s_train/README.md) 有创建docker镜像的源码. + +#### 检查训练输出 + +训练输出(模型快照和日志)将被保存在EFS上。我们可以用ssh登录到EC2的工作节点上,查看mount过的EFS和训练输出. + +1. ssh登录EC2工作节点 +``` +chmod 400 key-name.pem +ssh -i key-name.pem core@INSTANCE_IP +``` + +`INSTANCE_IP`: EC2上Kubernetes工作节点的公共IP地址,进入[EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) 中检查任何`paddle-cluster-kube-aws-worker`实例的 `public IP` + +2. 挂载EFS +``` +mkdir efs +sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs +``` + +`EFS_DNS_NAME`: DNS名称最好能描述我们创建的`paddle-efs`,看起来像`fs-2cbf7385.efs.us-west-2.amazonaws.com`. + +文件夹`efs`上有这结构相似的node信息: +``` +-- paddle-cluster-job + |-- ... + |-- output + | |-- node_0 + | | |-- server.log + | | `-- train.log + | |-- node_1 + | | |-- server.log + | | `-- train.log + | |-- node_2 + | | |-- server.log + | | `-- train.log + | |-- pass-00000 + | | |-- ___fc_layer_0__.w0 + | | |-- ___fc_layer_0__.wbias + | | |-- done + | | |-- path.txt + | | `-- trainer_config.lr.py + | |-- pass-00001... +``` +`server.log` 是`pserver`的log日志,`train.log`是`trainer`的log日志,模型快照和描述存放在`pass-0000*`. + +### Kubernetes集群卸载或删除 + +#### 删除EFS + +到[EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) 中删除创建的EFS卷 + +#### 删除安全组 + +去[Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) 删除安全组`paddle-efs`. + +#### 删除S3 bucket + +进入 [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#)删除S3 bucket + +#### 销毁集群 + +``` +kube-aws destroy +``` + +命令会立刻返回,但需要大约5分钟来销毁集群 + +可以进入 [CludFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active)检查销毁的过程。 diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_en.md new file mode 100644 index 0000000000000000000000000000000000000000..8e8e87be711bd45177ed77c81c531606e801d1f0 --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/k8s_aws_en.md @@ -0,0 +1,688 @@ +# Kubernetes on AWS + +We will show you step by step on how to run distributed PaddlePaddle training on AWS cluster with Kubernetes. Let's start from core concepts. + +## Distributed PaddlePaddle Training Core Concepts + +### Distributed Training Job + +A distributed training job is represented by a [Kubernetes job](https://kubernetes.io/docs/user-guide/jobs/#what-is-a-job). + +Each Kuberentes job is described by a job config file, which specifies the information like the number of [pods](https://kubernetes.io/docs/user-guide/pods/#what-is-a-pod) in the job and environment variables. + +In a distributed training job, we would: + +1. prepare partitioned training data and configuration file on a distributed file system (in this tutorial we use Amazon Elastic File System), and +1. 
create and submit the Kubernetes job config to the Kubernetes cluster to start the training job. + +### Parameter Servers and Trainers + +There are two roles in a PaddlePaddle cluster: *parameter server (pserver)* and *trainer*. Each parameter server process maintains a shard of the global model. Each trainer has its local copy of the model, and uses its local data to update the model. During the training process, trainers send model updates to parameter servers, parameter servers are responsible for aggregating these updates, so that trainers can synchronize their local copy with the global model. + +
![Model is partitioned into two shards, managed by two parameter servers respectively.](src/pserver_and_trainer.png)
+ +In order to communicate with pserver, trainer needs to know the ip address of each pserver. In kubernetes it's better to use a service discovery mechanism (e.g., DNS hostname) rather than static ip address, since any pserver's pod may be killed and a new pod could be schduled onto another node of different ip address. However, now we are using static ip. This will be improved. + +Parameter server and trainer are packaged into a same docker image. They will run once pod is scheduled by kubernetes job. + +### Trainer ID + +Each trainer process requires a trainer ID, a zero-based index value, passed in as a command-line parameter. The trainer process thus reads the data partition indexed by this ID. + +### Training + +The entry-point of a container is a shell script. It can see some environment variables pre-defined by Kubernetes. This includes one that gives the job's identity, which can be used in a remote call to the Kubernetes apiserver that lists all pods in the job. + +We rank each pod by sorting them by their ips. The rank of each pod could be the "pod ID". Because we run one trainer and one parameter server in each pod, we can use this "pod ID" as the trainer ID. A detailed workflow of the entry-point script is as follows: + +1. Query the api server to get pod information, and assign the `trainer_id` by sorting the ip. +1. Copy the training data from EFS persistent volume into container. +1. Parse the `paddle pserver` and `paddle trainer` startup parameters from environment variables, and then start up the processes. +1. Trainer with `train_id` 0 will automatically write results onto EFS volume. + + +## PaddlePaddle on AWS with Kubernetes + +### Choose AWS Service Region +This tutorial requires several AWS services work in the same region. Before we create anything in AWS, please check the following link +https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/ +Choose a region which has the following services available: EC2, EFS, VPS, CloudFormation, KMS, VPC, S3. +In this tutorial, we use "Oregon(us-west-2)" as example. + +### Create AWS Account and IAM Account + +Under each AWS account, we can create multiple [IAM](http://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html) users. This allows us to grant some privileges to each IAM user and to create/operate AWS clusters as an IAM user. + +To sign up an AWS account, please +follow +[this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html). +To create IAM users and user groups under an AWS account, please +follow +[this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html). + +Please be aware that this tutorial needs the following privileges for the user in IAM: + +- AmazonEC2FullAccess +- AmazonS3FullAccess +- AmazonRoute53FullAccess +- AmazonRoute53DomainsFullAccess +- AmazonElasticFileSystemFullAccess +- AmazonVPCFullAccess +- IAMUserSSHKeys +- IAMFullAccess +- NetworkAdministrator +- AWSKeyManagementServicePowerUser + + +### Download kube-aws and kubectl + +#### kube-aws + +[kube-aws](https://github.com/coreos/kube-aws) is a CLI tool to automate cluster deployment to AWS. +##### Verify kube-aws integrity +Note: if you are using a non-official release (e.g RC release) kube-aws, you can skip this setp. 
+Import the CoreOS Application Signing Public Key: + +``` +gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E +``` + +Validate the key fingerprint: + +``` +gpg2 --fingerprint FC8A365E +``` +The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E` + +We can download `kube-aws` from its [release page](https://github.com/coreos/kube-aws/releases). In this tutorial, we use version 0.9.1 + +Validate the tarball's GPG signature: + +``` +PLATFORM=linux-amd64 + # Or +PLATFORM=darwin-amd64 + +gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz +``` +##### Install kube-aws +Extract the binary: + +``` +tar zxvf kube-aws-${PLATFORM}.tar.gz +``` + +Add kube-aws to your path: + +``` +mv ${PLATFORM}/kube-aws /usr/local/bin +``` + + +#### kubectl + +[kubectl](https://kubernetes.io/docs/user-guide/kubectl-overview/) is a command line interface for running commands against Kubernetes clusters. + +Download `kubectl` from the Kubernetes release artifact site with the `curl` tool. + +``` +# OS X +curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl + +# Linux +curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl +``` + +Make the kubectl binary executable and move it to your PATH (e.g. `/usr/local/bin`): + +``` +chmod +x ./kubectl +sudo mv ./kubectl /usr/local/bin/kubectl +``` + +### Configure AWS Credentials + +First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface. + +And then configure your AWS account information: + +``` +aws configure +``` + + +Fill in the required fields: + + +``` +AWS Access Key ID: YOUR_ACCESS_KEY_ID +AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY +Default region name: us-west-2 +Default output format: json +``` + +`YOUR_ACCESS_KEY_ID`, and `YOUR_SECRETE_ACCESS_KEY` is the IAM key and secret from [Create AWS Account and IAM Account](#create-aws-account-and-iam-account) + +Verify that your credentials work by describing any instances you may already have running on your account: + +``` +aws ec2 describe-instances +``` + +### Define Cluster Parameters + +#### EC2 key pair + +The keypair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node. + +Follow [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) to create a EC2 key pair + +After creating a key pair, you will use the key pair name to configure the cluster. + +Key pairs are only available to EC2 instances in the same region. We are using us-west-2 in our tutorial, so make sure to creat key pairs in that region (Oregon). + +Your browser will download a `key-name.pem` file which is the key to access the EC2 instances. We will use it later. + + +#### KMS key + +Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key. 
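+
+If you plan to reuse an existing key, its Arn can be looked up with the AWS CLI first, for example:
+
+```
+# List the KMS keys visible to your credentials and note the "KeyArn" you want to reuse.
+aws kms list-keys --region=us-west-2
+```
+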
+ +You can create a KMS key with the aws command line tool: + +``` +aws kms --region=us-west-2 create-key --description="kube-aws assets" +{ + "KeyMetadata": { + "CreationDate": 1458235139.724, + "KeyState": "Enabled", + "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx", + "AWSAccountId": "xxxxxxxxxxxxx", + "Enabled": true, + "KeyUsage": "ENCRYPT_DECRYPT", + "KeyId": "xxxxxxxxx", + "Description": "kube-aws assets" + } +} +``` + +We will need to use the value of `Arn` later. + +And then let's add several inline policies in your IAM user permission. + +Go to [IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home). Click on button `Users`, click user that we just created, and then click on `Add inline policy` button, and select `Custom Policy`. + +Paste into following inline policies: + +``` + (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node){ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "Stmt1482205552000", + "Effect": "Allow", + "Action": [ + "kms:Decrypt", + "kms:Encrypt" + ], + "Resource": [ + "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*" + ] + }, + { + "Sid": "Stmt1482205746000", + "Effect": "Allow", + "Action": [ + "cloudformation:CreateStack", + "cloudformation:UpdateStack", + "cloudformation:DeleteStack", + "cloudformation:DescribeStacks", + "cloudformation:DescribeStackResource", + "cloudformation:GetTemplate", + "cloudformation:DescribeStackEvents" + ], + "Resource": [ + "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*" + ] + } + ] +} +``` +`Version` : Its value has to be exactly "2012-10-17". +`AWS_ACCOUNT_ID`: You can get it from following command line: + +``` +aws sts get-caller-identity --output text --query Account +``` + +`MY_CLUSTER_NAME`: Pick a MY_CLUSTER_NAME that you like, you will use it later as well. +Please note, stack name must satisfy regular expression pattern: [a-zA-Z][-a-zA-Z0-9*]*, which means no "_" or "-" in stack name, or kube-aws will throw error in later steps. + +#### External DNS name + +When the cluster is created, the controller will expose the TLS-secured API on a DNS name. + +DNS name should have a CNAME points to cluster DNS name or an A record points to the cluster IP address. + +We will need to use DNS name later in tutorial. If you don't already own one, you can choose any DNS name (e.g., `paddle`) and modify `/etc/hosts` to associate cluster IP with that DNS name for your local machine. And add name service (route53) in aws to associate the IP to paddle for cluster. We will find the cluster IP in later steps. + +#### S3 bucket + +You need to create an S3 bucket before startup the Kubernetes cluster. + +There are some bugs in aws cli in creating S3 bucket, so let's use the [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2). + +Click on `Create Bucket`, fill in a unique BUCKET_NAME, and make sure region is us-west-2 (Oregon). 
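+
+After the bucket is created, you can optionally confirm that it is visible to your credentials (a small sanity check; `BUCKET_NAME` is the name you just chose):
+
+```
+# Should succeed and print nothing for a freshly created, empty bucket.
+aws s3 ls s3://BUCKET_NAME
+```
+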
+ + +#### Initialize Assets + +Create a directory on your local machine to hold the generated assets: + +``` +$ mkdir my-cluster +$ cd my-cluster +``` + +Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step: + +``` +kube-aws init \ +--cluster-name=MY_CLUSTER_NAME \ +--external-dns-name=MY_EXTERNAL_DNS_NAME \ +--region=us-west-2 \ +--availability-zone=us-west-2a \ +--key-name=KEY_PAIR_NAME \ +--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx" +``` + +`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key) + +`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name) + +`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair) + +`--kms-key-arn`: the "Arn" in [KMS key](#kms-key) + +Here `us-west-2a` is used for parameter `--availability-zone`, but supported availability zone varies among AWS accounts. + +Please check if `us-west-2a` is supported by `aws ec2 --region us-west-2 describe-availability-zones`, if not switch to other supported availability zone. (e.g., `us-west-2a`, or `us-west-2b`) + + +There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster. + +By default `kube-aws` will only create one worker node. Let's edit `cluster.yaml` and change `workerCount` from 1 to 3. + + +#### Render contents of the asset directory + +In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you. + +``` +kube-aws render credentials --generate-ca +``` + +The next command generates the default set of cluster assets in your asset directory. + +``` +kube-aws render stack +``` +Assets (templates and credentials) that are used to create, update and interact with your Kubernetes cluster will be created under your current folder. + + +### Kubernetes Cluster Start Up + +#### Create the instances defined in the CloudFormation template + +Now let's create your cluster (choose any `PREFIX` for the command below): + +``` +kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX +``` + +`BUCKET_NAME`: the bucket name that you used in [S3 bucket](#s3-bucket) + + +#### Configure DNS + +You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation. + +``` +$ kube-aws status +Cluster Name: paddle-cluster +Controller DNS Name: paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com +``` + +If you own a DNS name, set the A record to any of the above ip. __Or__ you can set up CNAME point to `Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`) + +##### Find IP address + +Use command `dig` to check the load balancer hostname to get the ip address. + +``` +$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com + +;; QUESTION SECTION: +;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A + +;; ANSWER SECTION: +paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52 +paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.67.102.112 +``` + +In the above output, both ip `54.241.164.52`, `54.67.102.112` will work. + +*If you own a DNS name*, set the A record to any of the above ip. Then you can skip to the step "Access the cluster". + +*If you do not own a DNS name*: +##### Update local DNS association +Edit `/etc/hosts` to associate above ip with the DNS name. 
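+
+For example, assuming you picked the DNS name `paddle` and use one of the load balancer IPs resolved by `dig` above:
+
+```
+# Append a host entry so that "paddle" resolves to the cluster controller.
+echo "54.241.164.52 paddle" | sudo tee -a /etc/hosts
+```
+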
+##### Add Route53 private name service in VPC + - Open [Route53 Console](https://console.aws.amazon.com/route53/home) + - Create hosted zone with following config + - Domain name: "paddle" + - Type: "Private hosted zone for amazon VPC" + - VPC ID: `` + + ![route53 zone setting](src/route53_create_zone.png) + - Add A record + - Click on the zone "paddle" just created + - Click the button "Create record set" + - Name : leave blank + - type: "A" + - Value: `` + + ![route53 create recordset](src/route53_create_recordset.png) + - Verify name service + - Connect to any instance created by kube-aws via ssh + - Run command "host paddle", see if the ip returned is the private ip of kube-controller + +#### Access the cluster + +Once the API server is running, you should see: + +``` +$ kubectl --kubeconfig=kubeconfig get nodes +NAME STATUS AGE +ip-10-0-0-134.us-west-2.compute.internal Ready 6m +ip-10-0-0-238.us-west-2.compute.internal Ready 6m +ip-10-0-0-50.us-west-2.compute.internal Ready 6m +ip-10-0-0-55.us-west-2.compute.internal Ready 6m +``` + + +### Setup Elastic File System for Cluster + +Training data is usually served on a distributed filesystem, we use Elastic File System (EFS) on AWS. + +1. Create security group for EFS in [security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) + 1. Look up security group id for `paddle-cluster-sg-worker` (`sg-055ee37d` in the image below) +
![](src/worker_security_group.png)
+ 2. Add a security group `paddle-efs` with an `ALL TCP` inbound rule, using the group id of `paddle-cluster-sg-worker` as the custom source, and the VPC `paddle-cluster-vpc`. Make sure the availability zone is the same as the one you used in [Initialize Assets](#initialize-assets). +
![](src/add_security_group.png)
+ +2. Create the Elastic File System in [EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) with the `paddle-cluster-vpc` VPC. Make sure the subnet is `paddle-cluster-Subnet0` and the security group is `paddle-efs`. +
![](src/create_efs.png)
+ + +### Start PaddlePaddle Training Demo on AWS + +#### Configure Kubernetes Volume that Points to EFS + +First we need to create a [PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) to provision EFS volumn. + +Save following snippet as `pv.yaml` +``` +apiVersion: v1 +kind: PersistentVolume +metadata: + name: efsvol +spec: + capacity: + storage: 100Gi + accessModes: + - ReadWriteMany + nfs: + server: EFS_DNS_NAME + path: "/" +``` + +`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Looks similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com` + +Run following command to create a persistent volumn: +``` +kubectl --kubeconfig=kubeconfig create -f pv.yaml +``` + +Next let's create a [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/) to claim the persistent volume. + +Save following snippet as `pvc.yaml`. +``` +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: efsvol +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 50Gi +``` + +Run following command to create a persistent volumn claim: +``` +kubectl --kubeconfig=kubeconfig create -f pvc.yaml +``` + +#### Prepare Training Data + +We will now launch a kubernetes job that downloads, saves and evenly splits training data into 3 shards on the persistent volumn that we just created. + +save following snippet as `paddle-data-job.yaml` +``` +apiVersion: batch/v1 +kind: Job +metadata: + name: paddle-data +spec: + template: + metadata: + name: pi + spec: + containers: + - name: paddle-data + image: paddlepaddle/paddle-tutorial:k8s_data + imagePullPolicy: Always + volumeMounts: + - mountPath: "/efs" + name: efs + env: + - name: OUT_DIR + value: /efs/paddle-cluster-job + - name: SPLIT_COUNT + value: "3" + volumes: + - name: efs + persistentVolumeClaim: + claimName: efsvol + restartPolicy: Never +``` + +Run following command to launch the job: +``` +kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml +``` + +Job may take 7 min to finish, use following command to check job status. Do not proceed until `SUCCESSFUL` for `paddle-data` job is `1` +``` +$ kubectl --kubeconfig=kubeconfig get jobs +NAME DESIRED SUCCESSFUL AGE +paddle-data 1 1 6m +``` + +Data preparation is done by docker image `paddlepaddle/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code. + +#### Start Training + +Now we are ready to start paddle training job. 
Save following snippet as `paddle-cluster-job.yaml` +``` +apiVersion: batch/v1 +kind: Job +metadata: + name: paddle-cluster-job +spec: + parallelism: 3 + completions: 3 + template: + metadata: + name: paddle-cluster-job + spec: + volumes: + - name: efs + persistentVolumeClaim: + claimName: efsvol + containers: + - name: trainer + image: paddlepaddle/paddle-tutorial:k8s_train + command: ["bin/bash", "-c", "/root/start.sh"] + env: + - name: JOB_NAME + value: paddle-cluster-job + - name: JOB_PATH + value: /home/jobpath + - name: JOB_NAMESPACE + value: default + - name: TRAIN_CONFIG_DIR + value: quick_start + - name: CONF_PADDLE_NIC + value: eth0 + - name: CONF_PADDLE_PORT + value: "7164" + - name: CONF_PADDLE_PORTS_NUM + value: "2" + - name: CONF_PADDLE_PORTS_NUM_SPARSE + value: "2" + - name: CONF_PADDLE_GRADIENT_NUM + value: "3" + - name: TRAINER_COUNT + value: "3" + volumeMounts: + - mountPath: "/home/jobpath" + name: efs + ports: + - name: jobport0 + hostPort: 7164 + containerPort: 7164 + - name: jobport1 + hostPort: 7165 + containerPort: 7165 + - name: jobport2 + hostPort: 7166 + containerPort: 7166 + - name: jobport3 + hostPort: 7167 + containerPort: 7167 + restartPolicy: Never +``` + +`parallelism: 3, completions: 3` means this job will simultaneously start 3 PaddlePaddle pods, and this job will be finished when there are 3 finished pods. + +`env` field represents container's environment variables, we specify PaddlePaddle parameters by environment variables. + +`ports` indicates that TCP port 7164 - 7167 are exposed for communication between `pserver` ans trainer. port starts continously from `CONF_PADDLE_PORT` (7164) to `CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1` (7167). We use multiple ports for dense and sparse paramter updates to improve latency. + +Run following command to launch the job. +``` +kubectl --kubeconfig=kubeconfig create -f paddle-claster-job.yaml +``` + +Inspect individual pods + +``` +$ kubectl --kubeconfig=kubeconfig get pods +NAME READY STATUS RESTARTS AGE +paddle-cluster-job-cm469 1/1 Running 0 9m +paddle-cluster-job-fnt03 1/1 Running 0 9m +paddle-cluster-job-jx4xr 1/1 Running 0 9m +``` + +Inspect individual console output +``` +kubectl --kubeconfig=kubeconfig log -f POD_NAME +``` + +`POD_NAME`: name of any pod (e.g., `paddle-cluster-job-cm469`). + +Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check training job status. It will complete in around 20 minutes. + +The details for start `pserver` and `trainer` are hidden inside docker image `paddlepaddle/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code. + +#### Inspect Training Output + +Training output (model snapshot and logs) will be saved in EFS. We can ssh into worker EC2 instance, mount EFS and check training output. + +1. ssh Into Worker EC2 instance +``` +chmod 400 key-name.pem +ssh -i key-name.pem core@INSTANCE_IP +``` + +`INSTANCE_IP`: public IP address of EC2 kubernetes worker node. Go to [EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) and check `public IP` of any `paddle-cluster-kube-aws-worker` instance. + +2. Mount EFS +``` +mkdir efs +sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs +``` + +`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Look similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com`. 
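+
+Optionally verify that the mount succeeded before looking for the training output (a small sanity check):
+
+```
+# Should report the EFS filesystem rather than the local root volume.
+df -h efs
+```
+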
+
+Now the folder `efs` will have a structure similar to:
+```
+-- paddle-cluster-job
+    |-- ...
+    |-- output
+    |   |-- node_0
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_1
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_2
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- pass-00000
+    |   |   |-- ___fc_layer_0__.w0
+    |   |   |-- ___fc_layer_0__.wbias
+    |   |   |-- done
+    |   |   |-- path.txt
+    |   |   `-- trainer_config.lr.py
+    |   |-- pass-00001...
+```
+`server.log` contains the logs of `pserver`, and `train.log` contains the logs of `trainer`. Model descriptions and snapshots are stored in `pass-0000*`.
+
+### Kubernetes Cluster Tear Down
+
+#### Delete EFS
+
+Go to the [EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) and delete the EFS volume that we created.
+
+#### Delete Security Group
+
+Go to the [Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) and delete the security group `paddle-efs`.
+
+
+#### Delete S3 Bucket
+
+Go to the [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#) and delete the S3 bucket that we created.
+
+#### Destroy Cluster
+
+```
+kube-aws destroy
+```
+
+The command will return immediately, but it might take about 5 minutes to tear down the whole cluster.
+
+You can go to the [CloudFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active) to check the destroy process.
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..c1a11f7165a2f9da9dd044641274447e7943a597
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_cn.md
@@ -0,0 +1,206 @@
+# Kubernetes单机训练
+
+在这篇文档里,我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的PaddlePaddle训练作业。在下一篇中,我们将介绍如何启动分布式训练作业。
+
+## 制作Docker镜像
+
+在一个功能齐全的Kubernetes机群里,通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话,一个分布式PaddlePaddle训练任务中
+的每个进程都可以从Ceph读取数据。在这个例子里,我们只演示一个单机作业,所以可以简化对环境的要求,把训练数据直接放在
+PaddlePaddle的Docker Image里。为此,我们需要制作一个包含训练数据的PaddlePaddle镜像。
+
+PaddlePaddle的 `paddlepaddle/paddle:cpu-demo-latest` 镜像里有PaddlePaddle的源码与demo,
+(请注意,默认的PaddlePaddle生产环境镜像 `paddlepaddle/paddle:latest` 是不包括源码的,PaddlePaddle的各版本镜像可以参考
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)),
+下面我们使用这个镜像来下载数据到Docker Container中,并把这个包含了训练数据的Container保存为一个新的镜像。
+
+### 运行容器
+
+```
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
+```
+
+### 下载数据
+
+进入容器`/root/paddle/demo/quick_start/data`目录,使用`get_data.sh`下载数据
+
+```
+$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
+
+Downloading Amazon Electronics reviews data...
+--2016-10-31 01:33:43-- http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
+Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
+HTTP request sent, awaiting response...
200 OK +Length: 495854086 (473M) [application/x-gzip] +Saving to: 'reviews_Electronics_5.json.gz' + + 10% [=======> ] 874,279 64.7KB/s eta 2h 13m + +``` + +### 修改启动脚本 + +下载完数据后,修改`/root/paddle/demo/quick_start/train.sh`文件,内容如下(增加了一条cd命令) +``` +set -e +cd /root/paddle/demo/quick_start +cfg=trainer_config.lr.py +#cfg=trainer_config.emb.py +#cfg=trainer_config.cnn.py +#cfg=trainer_config.lstm.py +#cfg=trainer_config.bidi-lstm.py +#cfg=trainer_config.db-lstm.py +paddle train \ + --config=$cfg \ + --save_dir=./output \ + --trainer_count=4 \ + --log_period=20 \ + --num_passes=15 \ + --use_gpu=false \ + --show_parameter_stats_period=100 \ + --test_all_data_in_one_period=1 \ + 2>&1 | tee 'train.log' +``` + +### 提交镜像 + +修改启动脚本后,退出容器,使用`docker commit`命令创建新镜像。 + +``` +$ docker commit quick_start_data mypaddle/paddle:quickstart +``` + +## 使用 Kubernetes 进行训练 + +>针对任务运行完成后容器自动退出的场景,Kubernetes有Job类型的资源来支持。下文就是用Job类型的资源来进行训练。 + +### 编写yaml文件 + +在训练时,输出结果可能会随着容器的消耗而被删除,需要在创建容器前挂载卷以便我们保存训练结果。使用我们之前构造的镜像,可以创建一个 [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job),简单的yaml文件如下: + +``` +apiVersion: batch/v1 +kind: Job +metadata: + name: quickstart +spec: + parallelism: 1 + completions: 1 + template: + metadata: + name: quickstart + spec: + volumes: + - name: output + hostPath: + path: /home/work/paddle_output + containers: + - name: pi + image: mypaddle/paddle:quickstart + command: ["bin/bash", "-c", "/root/paddle/demo/quick_start/train.sh"] + volumeMounts: + - name: output + mountPath: /root/paddle/demo/quick_start/output + restartPolicy: Never +``` + +### 创建PaddlePaddle Job + +使用上文创建的yaml文件创建Kubernetes Job,命令为: + +``` +$ kubectl create -f paddle.yaml +``` + +查看job的详细情况: + +``` +$ kubectl get job +NAME DESIRED SUCCESSFUL AGE +quickstart 1 0 58s + +$ kubectl describe job quickstart +Name: quickstart +Namespace: default +Image(s): registry.baidu.com/public/paddle:cpu-demo-latest +Selector: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84 +Parallelism: 1 +Completions: 1 +Start Time: Mon, 31 Oct 2016 11:20:16 +0800 +Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart +Pods Statuses: 0 Running / 1 Succeeded / 0 Failed +Volumes: + output: + Type: HostPath (bare host directory volume) + Path: /home/work/paddle_output +Events: + FirstSeen LastSeen Count From SubobjectPath Type Reason Message + --------- -------- ----- ---- ------------- -------- ------ ------- + 1m 1m 1 {job-controller } Normal SuccessfulCreate Created pod: quickstart-fa0wx +``` + +### 查看训练结果 + +根据Job对应的Pod信息,可以查看此Pod运行的宿主机。 + +``` +kubectl describe pod quickstart-fa0wx +Name: quickstart-fa0wx +Namespace: default +Node: paddle-demo-let02/10.206.202.44 +Start Time: Mon, 31 Oct 2016 11:20:17 +0800 +Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart +Status: Succeeded +IP: 10.0.0.9 +Controllers: Job/quickstart +Containers: + quickstart: + Container ID: docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465 + Image: registry.baidu.com/public/paddle:cpu-demo-latest + Image ID: docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750 + Port: + Command: + bin/bash + -c + /root/paddle/demo/quick_start/train.sh + QoS Tier: + cpu: BestEffort + memory: BestEffort + State: Terminated + Reason: Completed + Exit Code: 0 + Started: Mon, 31 Oct 2016 11:20:20 +0800 + Finished: Mon, 31 Oct 2016 11:21:46 +0800 + Ready: False + Restart Count: 0 + Environment Variables: +Conditions: + Type Status + Ready False +Volumes: + output: + Type: HostPath (bare 
host directory volume) + Path: /home/work/paddle_output +``` + +我们还可以登录到宿主机上查看训练结果。 + +``` +[root@paddle-demo-let02 paddle_output]# ll +total 60 +drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000 +drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014 +``` diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..167089b8074b33e3b094fa3ec8e377630cec42ac --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_cn.md @@ -0,0 +1,312 @@ +# Kubernetes分布式训练 + +前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里,我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练,文章 [Cluster Training](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cluster/cluster_train_cn.html)介绍了一种通过SSH远程分发任务,进行分布式训练的方法,与此不同的是,本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群,进行分布式训练的方案。 + +## 整体方案 + +在训练之前,用户将配置与训练数据切分好放在分布式文件系统预先分配好的目录中(不同的分布式文件系统,需要使用其制定的方式挂载后并导入数据),训练时,程序从此目录拷贝文件到容器内进行训练,将结果保存到此目录里。整体的结构图如下: + +![paddle on kubernetes结构图](src/k8s-paddle-arch.png) + +上图描述了一个3节点的分布式训练场景,在每个Pod上都通过volume方式挂载分布式文件系统的一个目录用于保存训练数据和输出结果。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行,每个pod包含一个PaddlePaddle容器。在容器创建后,会启动pserver与trainer进程,读取volume中的数据进行这次分布式训练。 + +根据前文的描述,要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练,按照下面步骤即可: + +1. [制作PaddlePaddle镜像](#制作镜像) +1. [将训练文件与切分好的数据上传到共享存储](#上传训练文件) +1. [编写本次训练的YAML文件,创建一个Kubernetes job](#创建Job) +1. [训练结束后查看输出结果](#查看输出) + +下面就根据这几个步骤分别介绍。 + +### 制作镜像 + +PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行环境,用这个镜像创建的容器需要有以下两个功能: + +- 拷贝训练文件到容器内 +- 生成`paddle pserver`与`paddle train`进程的启动参数,并且启动训练 + +因为官方镜像 `paddlepaddle/paddle:latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能,所以我们可以在这个基础上,添加启动脚本,制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile)。 + +```bash +$ cd doc/howto/usage/k8s/src/k8s_train +$ docker build -t [YOUR_REPO]/paddle:mypaddle . +``` + +然后将构建成功的镜像上传到镜像仓库。 + +```bash +docker push [YOUR_REPO]/paddle:mypaddle +``` + +注意上述命令中`[YOUR_REPO]`表示读者所使用的Docker镜像仓库地址,读者需要替换成自己使用的仓库地址。下文使用`[YOUR_REPO]/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。 + +### 准备训练数据 + +这里我们通过在Kubernetes集群上启动一个Job来下载并切割数据,也可以通过修改[k8s_train](./src/k8s_train/README.md)的内容来定制image. + +在启动Job之前,需要根据不同的分布式存储来绑定一个[persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/),生成的数据将会存储在这个volume下. 
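+
+下面是一个 PersistentVolumeClaim 的示例(仅作示意:假设集群里已经有能够满足该申领的 PersistentVolume 或默认的 StorageClass;名称 `mfs` 与后面 Job 中的 `claimName: mfs` 对应,容量请按实际情况调整):
+
+```yaml
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: mfs
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 10Gi
+```
+
+绑定好 volume 之后,就可以用下面的 Job 下载并切分训练数据: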
+ +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: paddle-data +spec: + template: + metadata: + name: pi + spec: + hostNetwork: true + containers: + - name: paddle-data + image: paddlepaddle/paddle-tutorial:k8s_data + imagePullPolicy: Always + volumeMounts: + - mountPath: "/mnt" + name: nfs + env: + - name: OUT_DIR + value: /home/work/mfs/paddle-cluster-job + - name: SPLIT_COUNT + value: "3" + volumes: + - name: nfs + persistentVolumeClaim: + claimName: mfs + restartPolicy: Never +``` + +完成后volume中的文件内容大致如下: +```base +[root@paddle-kubernetes-node0 nfsdir]$ tree -d +. +`-- paddle-cluster-job + |-- 0 + | `-- data + |-- 1 + | `-- data + |-- 2 + | `-- data + |-- output + |-- quick_start +``` + +目录中paddle-cluster-job是本次训练对应的job name,本次训练要求有3个PaddlePaddle节点,在paddle-cluster-job/data目录中存放切分好的数据,文件夹0,1,2分别代表3个节点的trainer_id。recommendation文件夹内存放训练文件,output文件夹存放训练结果与日志。 + +### 创建Job + +Kubernetes可以通过YAML文件来创建相关对象,然后可以使用命令行工具创建job。 + +Job YAML文件描述了这次训练使用的Docker镜像,需要启动的节点个数以及 `paddle pserver`与 `paddle train`进程启动的必要参数,也描述了容器需要使用的存储卷挂载的情况。YAML文件中各个字段的具体含义,可以查看[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job)。例如,本次训练的YAML文件可以写成: + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: paddle-cluster-job +spec: + parallelism: 3 + completions: 3 + template: + metadata: + name: paddle-cluster-job + spec: + volumes: + - name: jobpath + hostPath: + path: /home/work/mfs + containers: + - name: trainer + image: [YOUR_REPO]/paddle:mypaddle + command: ["bin/bash", "-c", "/root/start.sh"] + env: + - name: JOB_NAME + value: paddle-cluster-job + - name: JOB_PATH + value: /home/jobpath + - name: JOB_NAMESPACE + value: default + - name: TRAIN_CONFIG_DIR + value: recommendation + - name: CONF_PADDLE_NIC + value: eth0 + - name: CONF_PADDLE_PORT + value: "7164" + - name: CONF_PADDLE_PORTS_NUM + value: "2" + - name: CONF_PADDLE_PORTS_NUM_SPARSE + value: "2" + - name: CONF_PADDLE_GRADIENT_NUM + value: "3" + volumeMounts: + - name: jobpath + mountPath: /home/jobpath + restartPolicy: Never +``` + +文件中,`metadata`下的`name`表示这个job的名字。`parallelism,completions`字段表示这个job会同时开启3个PaddlePaddle节点,成功训练且退出的pod数目为3时,这个job才算成功结束。然后申明一个存储卷`jobpath`,代表宿主机目录`/home/work/mfs`,在对容器的描述`containers`字段中,将此目录挂载为容器的`/home/jobpath`目录,这样容器的`/home/jobpath`目录就成为了共享存储,放在这个目录里的文件其实是保存到了MFS上。 + +`env`字段表示容器的环境变量,我们将`paddle`运行的一些参数通过这种方式传递到容器内: + + +- JOB_PATH:共享存储挂在的路径 +- JOB_NAME:Job的名字 +- TRAIN_CONFIG_DIR:本次训练文件所在目录,与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径 +- CONF_PADDLE_NIC:`paddle pserver`进程需要的`--nics`参数,即网卡名 +- CONF_PADDLE_PORT:`paddle paserver`的`--port`参数 +- CONF_PADDLE_PORTS_NUM:稠密更新的端口数量,即`--ports_num`参数 +- CONF_PADDLE_PORTS_NUM_SPARSE:稀疏更新的端口数量,即`--ports_num_for_sparse`参数 +- CONF_PADDLE_GRADIENT_NUM:训练节点数量,即`--num_gradient_servers参数` + +这些参数的具体描述,读者可以查看[这里](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html)。 + +编写完YAML文件后,可以使用Kubernetes的命令行工具创建job。 + +```bash +kubectl create -f job.yaml +``` + +创建成功后,Kubernetes就会创建3个pod作为PaddlePaddle节点然后拉取镜像,启动容器开始训练。 + + +### 查看输出 + +在训练过程中,可以在共享存储上查看输出的日志和模型,例如output目录下就存放了输出结果。注意node_0,node_1,node_2这几个目录表示PaddlePaddle节点与trainer_id,并不是Kubernetes中的node概念。 + +```bash +[root@paddle-kubernetes-node0 output]# tree -d +. +├── node_0 +│   ├── server.log +│   └── train.log +├── node_1 +│   ├── server.log +│   └── train.log +├── node_2 +...... +├── pass-00002 +│   ├── done +│   ├── ___embedding_0__.w0 +│   ├── ___embedding_1__.w0 +...... 
+``` + +我们可以通过日志查看容器训练的情况,例如: + +```bash +[root@paddle-kubernetes-node0 node_0]# cat train.log +I1116 09:10:17.123121 50 Util.cpp:155] commandline: + /usr/local/bin/../opt/paddle/bin/paddle_trainer + --nics=eth0 --port=7164 + --ports_num=2 --comment=paddle_process_by_paddle + --pservers=192.168.129.66,192.168.223.143,192.168.129.71 + --ports_num_for_sparse=2 --config=./trainer_config.py + --trainer_count=4 --num_passes=10 --use_gpu=0 + --log_period=50 --dot_period=10 --saving_period=1 + --local=0 --trainer_id=0 + --save_dir=/home/jobpath/paddle-cluster-job/output +I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions +I1116 09:10:17.123764 50 Util.cpp:143] Call runInitFunctions done. +[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config. +[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating] +[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__] +I1116 09:10:17.392917 50 Trainer.cpp:170] trainer mode: Normal +I1116 09:10:17.613910 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process +I1116 09:10:17.680917 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process +I1116 09:10:17.681543 50 GradientMachine.cpp:134] Initing parameters.. +I1116 09:10:18.012390 50 GradientMachine.cpp:141] Init parameters done. +I1116 09:10:18.018641 50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164 +I1116 09:10:18.018950 50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165 +I1116 09:10:18.019069 50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164 +I1116 09:10:18.019492 50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165 +I1116 09:10:18.019716 50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164 +I1116 09:10:18.019836 50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165 +``` + + +## 一些细节的补充 + +### 使用环境变量 + +使用容器方式运行训练任务的Kubernetes Job,通常会使用环境变量配置Job的配置信息`start_paddle.py`提供了一个启动脚本,将环境变量转换成paddle的命令行参数: +``` +API = "/api/v1/namespaces/" +JOBSELECTOR = "labelSelector=job-name=" +JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME") +JOB_PATH_OUTPUT = JOB_PATH + "/output" +JOBNAME = os.getenv("JOB_NAME") +NAMESPACE = os.getenv("JOB_NAMESPACE") +PADDLE_NIC = os.getenv("CONF_PADDLE_NIC") +PADDLE_PORT = os.getenv("CONF_PADDLE_PORT") +PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM") +PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE") +PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM") +``` + +### Pod间通信 +`start_paddle.py`脚本开始时,会先进行参数的初始化与解析。 + +```python +parser = argparse.ArgumentParser(prog="start_paddle.py", + description='simple tool for k8s') + args, train_args_list = parser.parse_known_args() + train_args = refine_unknown_args(train_args_list) + train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2])) + podlist = getPodList() +``` + +然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态(容器运行都运行)时,再通过函数`getIdMap(podlist)`获取trainer_id。 + +```python + podlist = getPodList() + # need to wait until all pods are running + while not isPodAllRunning(podlist): + time.sleep(10) + podlist = getPodList() + idMap = getIdMap(podlist) +``` +* *注意*: `getPodList()`会获取当前namespace下的所有pod,如果已经有pod运行,可能会导致出错。这种集群节点管理方式会在将来使用[statfulsets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets/)代替。 + +在函数`getIdMap(podlist)`内部,我们通过读取`podlist`中每个pod的IP地址,将IP排序生成的序号作为trainer_id。 + +```python +def 
getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+在得到`idMap`后,通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。
+
+### 启动任务
+
+在函数`startPaddle`中,最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析,解析环境变量得到`PADDLE_NIC`,`PADDLE_PORT`,`PADDLE_PORTS_NUM`等参数,然后通过自身的IP地址在`idMap`中获取`trainerId`。
+
+```python
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..b2dc4da8451af317df76c5b3df328b6f58429610
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
@@ -0,0 +1,372 @@
+# Distributed Training on Kubernetes
+
+We introduced how to create a PaddlePaddle Job with a single node on Kubernetes in the
+previous document.
+In this article, we will introduce how to create a PaddlePaddle job with multiple nodes
+on a Kubernetes cluster.
+
+## Overall Architecture
+
+Before creating a training job, the user needs to slice the training data and deploy
+the Python scripts along with it into the distributed file system
+(we can use different types of Kubernetes Volumes to mount different distributed
+file systems). Before training starts, the program copies the training data into the
+container and also saves the models at the same path during training. The global architecture
+is as follows:
+
+![PaddlePaddle on Kubernetes Architecture](src/k8s-paddle-arch.png)
+
+The above figure describes a distributed training architecture with 3 nodes: each
+Pod mounts a folder of the distributed file system through a Kubernetes Volume to save training data and models.
+Kubernetes creates 3 Pods for this training phase and schedules them on
+3 nodes, and each Pod runs a PaddlePaddle container. After the containers are created,
+PaddlePaddle starts the communication between PServer and Trainer and reads the training
+data for this training job.
+
+As described above, we can start up a PaddlePaddle distributed training job on a
+Kubernetes-ready cluster with the following steps:
+
+1. [Build PaddlePaddle Docker Image](#Build a Docker Image)
+1. [Split training data and upload to the distributed file system](#Upload Training Data)
+1. [Edit a YAML file and create a Kubernetes Job](#Create a Job)
+1. [Check the output](#Check The Output)
+
+We will introduce these steps as follows:
+
+### Build a Docker Image
+
+The training Docker image needs to package the `paddle pserver` and `paddle trainer` runtimes, and it also has to handle two more tasks before we can kick off the training:
+
+- Copying the training data into the container.
+- Generating the initialization arguments for the `Paddle PServer` and `Paddle Trainer` processes.
+
+Since the official PaddlePaddle Docker image already has the runtimes we need, we'll take it as the base image and pack some additional scripts for the tasks mentioned above to build our training image. For more detail, please refer to the following link:
+- https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
+
+
+```bash
+$ cd doc/howto/usage/k8s/src/k8s_train
+$ docker build -t [YOUR_REPO]/paddle:mypaddle .
+```
+
+And then upload the new Docker image to a Docker registry:
+
+```bash
+docker push [YOUR_REPO]/paddle:mypaddle
+```
+
+**[NOTE]**: in the above commands, `[YOUR_REPO]` represents your Docker repository; replace it with the repository address you actually use. In the rest of this document, `[YOUR_REPO]/paddle:mypaddle` refers to the image built in this step.
+
+### Prepare Training Data
+
+We can download and split the training data by creating a Kubernetes Job, or customize the image
+by editing [k8s_train](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train).
+
+Before creating the Job, we need to bind a [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes) appropriate for the distributed file system in use; the generated dataset will be saved on this volume.
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      hostNetwork: true
+      containers:
+      - name: paddle-data
+        image: paddlepaddle/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/mnt"
+          name: nfs
+        env:
+        - name: OUT_DIR
+          value: /home/work/mfs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: nfs
+          persistentVolumeClaim:
+            claimName: mfs
+      restartPolicy: Never
+```
+
+Create the Job with the following command:
+
+```bash
+> kubectl create -f xxx.yaml
+```
+
+If created successfully, you can see some information like this:
+
+```bash
+[root@paddle-kubernetes-node0 nfsdir]$ tree -d
+.
+`-- paddle-cluster-job
+    |-- 0
+    |   `-- data
+    |-- 1
+    |   `-- data
+    |-- 2
+    |   `-- data
+    |-- output
+    |-- quick_start
+```
+
+The `paddle-cluster-job` above is the job name for this training job; we need 3
+PaddlePaddle training nodes and save the split training data under the `paddle-cluster-job` path.
+The folders `0`, `1` and `2` represent the `trainer_id` of each node, the `quick_start` folder is used to store training data, and the `output` folder is used to store the models and logs.
+
+
+### Create a Job
+
+Kubernetes allows users to create objects with YAML files, and we can use a command-line tool
+to create them.
+
+The Job YAML file describes which Docker image is used in this training job, how many nodes will be created, the startup arguments of the `Paddle PServer/Trainer` processes, and the type of Volumes. You can find the details of the YAML fields in the
+[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job).
+The following is an example for this training job: + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: paddle-cluster-job +spec: + parallelism: 3 + completions: 3 + template: + metadata: + name: paddle-cluster-job + spec: + volumes: + - name: jobpath + hostPath: + path: /home/work/mfs + containers: + - name: trainer + image: [YOUR_REPO]/paddle:mypaddle + command: ["bin/bash", "-c", "/root/start.sh"] + env: + - name: JOB_NAME + value: paddle-cluster-job + - name: JOB_PATH + value: /home/jobpath + - name: JOB_NAMESPACE + value: default + - name: TRAIN_CONFIG_DIR + value: recommendation + - name: CONF_PADDLE_NIC + value: eth0 + - name: CONF_PADDLE_PORT + value: "7164" + - name: CONF_PADDLE_PORTS_NUM + value: "2" + - name: CONF_PADDLE_PORTS_NUM_SPARSE + value: "2" + - name: CONF_PADDLE_GRADIENT_NUM + value: "3" + volumeMounts: + - name: jobpath + mountPath: /home/jobpath + restartPolicy: Never +``` + +In the above YAML file: +- `metadata.name`, The job name. +- `parallelism`, Whether the Kubernetes Job would create `parallelism` Pods at the same time. +- `completions`, The Job would become the success status only when the number of successful Pod(the exit code is 0) + is equal to `completions`. +- `volumeMounts`, the name field `jobpath` is a key, the `mountPath` field represents + the path in the container, and we can define the `jobpath` in `volumes` filed, use `hostPath` + to configure the host path we want to mount. +- `env`, the environment variables in the Container, we pass some startup arguments by + this approach, some details are as following: + - JOB_PATH:the mount path in the container + - JOB_NAME:the job name + - TRAIN_CONFIG_DIR:the job path in the container, we can find the training data path by + combine with JOB_NAME. + - CONF_PADDLE_NIC: the argument `--nics` of `Paddle PServer` process, the network + device name. + - CONF_PADDLE_PORT: the argument `--port` of `Paddle PServer` process. + - CONF_PADDLE_PORTS_NUM: the argument `--ports_num` of `Paddle PServer`, the port number + for dense prameter update. + - CONF_PADDLE_PORTS_NUM_SPARSE:the argument `--ports_num_for_sparse` of `Paddle PServer`, + the port number for sparse parameter update. + - CONF_PADDLE_GRADIENT_NUM:the number of training node, the argument + `--num_gradient_servers` of `Paddle PServer` and `Paddle Trainer`. + +You can find some details information at [here] +(http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html)。 + +We can use the command-line tool of Kubernetes to create a Job when we finish the YAML file: + +```bash +kubectl create -f job.yaml +``` + +Upon successful creation, Kubernetes would create 3 Pods as PaddlePaddle training node, +pull the Docker image and begin to train. + + +### Checkout the Output + +At the process of training, we can check the logs and the output models which is stored in +the `output` folder. + +**NOTE**, `node_0`, `node_1` and `node_2` represent the +`trainer_id` of the PaddlePaddle training job rather than the node id of Kubernetes. + +```bash +[root@paddle-kubernetes-node0 output]# tree -d +. +├── node_0 +│   ├── server.log +│   └── train.log +├── node_1 +│   ├── server.log +│   └── train.log +├── node_2 +...... +├── pass-00002 +│   ├── done +│   ├── ___embedding_0__.w0 +│   ├── ___embedding_1__.w0 +...... 
+``` + +We can checkout the status of each training Pod by viewing the logs: + +```bash +[root@paddle-kubernetes-node0 node_0]# cat train.log +I1116 09:10:17.123121 50 Util.cpp:155] commandline: + /usr/local/bin/../opt/paddle/bin/paddle_trainer + --nics=eth0 --port=7164 + --ports_num=2 --comment=paddle_process_by_paddle + --pservers=192.168.129.66,192.168.223.143,192.168.129.71 + --ports_num_for_sparse=2 --config=./trainer_config.py + --trainer_count=4 --num_passes=10 --use_gpu=0 + --log_period=50 --dot_period=10 --saving_period=1 + --local=0 --trainer_id=0 + --save_dir=/home/jobpath/paddle-cluster-job/output +I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions +I1116 09:10:17.123764 50 Util.cpp:143] Call runInitFunctions done. +[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config. +[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating] +[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__] +I1116 09:10:17.392917 50 Trainer.cpp:170] trainer mode: Normal +I1116 09:10:17.613910 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process +I1116 09:10:17.680917 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process +I1116 09:10:17.681543 50 GradientMachine.cpp:134] Initing parameters.. +I1116 09:10:18.012390 50 GradientMachine.cpp:141] Init parameters done. +I1116 09:10:18.018641 50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164 +I1116 09:10:18.018950 50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165 +I1116 09:10:18.019069 50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164 +I1116 09:10:18.019492 50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165 +I1116 09:10:18.019716 50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164 +I1116 09:10:18.019836 50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165 +``` + +## Some Additional Details + +### Using Environment Variables + +Usually we use the environment varialbes to configurate the PaddlePaddle Job which runs in +Kubernetes, `start_paddle.py` provides a start up script to convert the environment variable +to the start up arguments of PaddlePaddle process: + +```bash +API = "/api/v1/namespaces/" +JOBSELECTOR = "labelSelector=job-name=" +JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME") +JOB_PATH_OUTPUT = JOB_PATH + "/output" +JOBNAME = os.getenv("JOB_NAME") +NAMESPACE = os.getenv("JOB_NAMESPACE") +PADDLE_NIC = os.getenv("CONF_PADDLE_NIC") +PADDLE_PORT = os.getenv("CONF_PADDLE_PORT") +PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM") +PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE") +PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM") +``` + +### Communication between Pods + +At the begin of `start_paddle.py`, it would initializes and parses the arguments. + +```python +parser = argparse.ArgumentParser(prog="start_paddle.py", + description='simple tool for k8s') + args, train_args_list = parser.parse_known_args() + train_args = refine_unknown_args(train_args_list) + train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2])) + podlist = getPodList() +``` + +And then query the status of all the other Pods of this Job by the function `getPodList()`, and fetch `triner_id` by the function `getIdMap(podlist)` if all the Pods status is `RUNNING`. 
+
+```python
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+
+**NOTE**: `getPodList()` fetches all the Pods in the current namespace, so if some unrelated
+Pods are already running, it may cause an error. We will use [StatefulSets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets) instead of
+Kubernetes Pods or ReplicaSets in the future.
+
+The function `getIdMap(podlist)` fetches the IP addresses of the Pods in `podlist` and then sorts them
+to generate the `trainer_id` of each node.
+
+```python
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+After getting the `idMap`, we can generate the arguments of `Paddle PServer` and `Paddle Trainer`,
+so that we can start them up by calling `startPaddle(idMap, train_args_dict)`.
+
+### Start Up Processes
+
+The main goal of `startPaddle` is generating the arguments of the `Paddle PServer` and
+`Paddle Trainer` processes. Take `Paddle Trainer` as an example: we parse the
+environment variables to get `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM`, etc.,
+and finally look up `trainerId` in `idMap` according to the Pod's own IP address.
+
+```python
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..96ff652705726fc56fa0078593cd2a695fcdb5e2
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_en.md
@@ -0,0 +1,210 @@
+# Kubernetes
+
+In this article, we will introduce how to run a PaddlePaddle training job on a single CPU machine using Kubernetes. In the next article, we will introduce how to run a PaddlePaddle training job on a distributed cluster.
+
+## Build Docker Image
+
+In a distributed Kubernetes cluster, we will use Ceph or another distributed
+storage system for storing training-related data so that all processes in
+PaddlePaddle training can retrieve data from Ceph. In this example, we will
+only demo a training job on a single machine. In order to simplify the requirement
+of the environment, we will directly put the training data into the PaddlePaddle Docker image,
+so we need to create a PaddlePaddle Docker image that includes the training data.
+
+The Docker image `paddlepaddle/paddle:cpu-demo-latest` has the PaddlePaddle
+source code and demo.
(Caution: Default PaddlePaddle Docker Image `paddlepaddle/paddle:latest` doesn't include +the source code, PaddlePaddle's different versions of Docker Image can be referred here: +[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_en.html)), +so we run this Docker Image and download the training data, and then commit the whole +Container to be a new Docker Image. + +### Run Docker Container + +``` +$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest +``` + +### Download Training Data + +Getting into `/root/paddle/demo/quick_start/data` Directory,using `get_data.sh` to download training data. +Then getting into `/root/paddle/demo/quick_start` Directory, using `preprocess.sh` to pre-process training data. + +``` +$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh + +Downloading Amazon Electronics reviews data... +--2016-10-31 01:33:43-- http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz +Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80 +Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected. +HTTP request sent, awaiting response... 200 OK +Length: 495854086 (473M) [application/x-gzip] +Saving to: 'reviews_Electronics_5.json.gz' + + 10% [=======> ] 874,279 64.7KB/s eta 2h 13m + +``` + +### Modify Startup Script + +After downloading the data,modify `/root/paddle/demo/quick_start/train.sh` file contents are as follows (one more cd cmd): +``` +set -e +cd /root/paddle/demo/quick_start +cfg=trainer_config.lr.py +#cfg=trainer_config.emb.py +#cfg=trainer_config.cnn.py +#cfg=trainer_config.lstm.py +#cfg=trainer_config.bidi-lstm.py +#cfg=trainer_config.db-lstm.py +paddle train \ + --config=$cfg \ + --save_dir=./output \ + --trainer_count=4 \ + --log_period=20 \ + --num_passes=15 \ + --use_gpu=false \ + --show_parameter_stats_period=100 \ + --test_all_data_in_one_period=1 \ + 2>&1 | tee 'train.log' +``` + +### Commit Docker Image + +``` +$ docker commit quick_start_data mypaddle/paddle:quickstart +``` + +## Use Kubernetes For Training + +We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes. + +### Create Yaml Files + +The output result in container will be demolished when job finished (container stopped running), so we need to mount the volume out to the local disk when creating the container to store the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job), the yaml contents are as follows: + +``` +apiVersion: batch/v1 +kind: Job +metadata: + name: quickstart +spec: + parallelism: 1 + completions: 1 + template: + metadata: + name: quickstart + spec: + volumes: + - name: output + hostPath: + path: /home/work/paddle_output + containers: + - name: pi + image: mypaddle/paddle:quickstart + command: ["bin/bash", "-c", "/root/paddle/demo/quick_start/train.sh"] + volumeMounts: + - name: output + mountPath: /root/paddle/demo/quick_start/output + restartPolicy: Never +``` + +### Start PaddlePaddle Job + +Using the above yaml file to start the Kubernetes job. 
+ +``` +$ kubectl create -f paddle.yaml +``` + +Get the detailed status of the job: + +``` +$ kubectl get job +NAME DESIRED SUCCESSFUL AGE +quickstart 1 0 58s + +$ kubectl describe job quickstart +Name: quickstart +Namespace: default +Image(s): registry.baidu.com/public/paddle:cpu-demo-latest +Selector: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84 +Parallelism: 1 +Completions: 1 +Start Time: Mon, 31 Oct 2016 11:20:16 +0800 +Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart +Pods Statuses: 0 Running / 1 Succeeded / 0 Failed +Volumes: + output: + Type: HostPath (bare host directory volume) + Path: /home/work/paddle_output +Events: + FirstSeen LastSeen Count From SubobjectPath Type Reason Message + --------- -------- ----- ---- ------------- -------- ------ ------- + 1m 1m 1 {job-controller } Normal SuccessfulCreate Created pod: quickstart-fa0wx +``` + +### Get Training Result + +We can use kubectl command to take a look at the status of related pod. + +``` +$ kubectl describe pod quickstart-fa0wx +Name: quickstart-fa0wx +Namespace: default +Node: paddle-demo-let02/10.206.202.44 +Start Time: Mon, 31 Oct 2016 11:20:17 +0800 +Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart +Status: Succeeded +IP: 10.0.0.9 +Controllers: Job/quickstart +Containers: + quickstart: + Container ID: docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465 + Image: registry.baidu.com/public/paddle:cpu-demo-latest + Image ID: docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750 + Port: + Command: + bin/bash + -c + /root/paddle/demo/quick_start/train.sh + QoS Tier: + cpu: BestEffort + memory: BestEffort + State: Terminated + Reason: Completed + Exit Code: 0 + Started: Mon, 31 Oct 2016 11:20:20 +0800 + Finished: Mon, 31 Oct 2016 11:21:46 +0800 + Ready: False + Restart Count: 0 + Environment Variables: +Conditions: + Type Status + Ready False +Volumes: + output: + Type: HostPath (bare host directory volume) + Path: /home/work/paddle_output +``` + +We can also ssh to Kubernetes node to take a look at the training result. 
+ +``` +[root@paddle-demo-let02 paddle_output]# ll +total 60 +drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000 +drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013 +drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014 +``` diff --git a/doc/v2/howto/cluster/multi_cluster/openmpi_cn.md b/doc/v2/howto/cluster/multi_cluster/openmpi_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..954b2215cc3136ec5b3e1cdc2f6d3f508f814516 --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/openmpi_cn.md @@ -0,0 +1,41 @@ +# 在OpenMPI集群中启动训练 + +## 准备OpenMPI集群 + +执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点: + +```bash +paddle/scripts/cluster_train_v2/openmpi/docker_cluster +kubectl create -f head.yaml +kubectl create -f mpi-nodes.yaml +``` + +然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。 + +## 启动集群作业 + +您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务: + +```bash +# 获得head和node节点的IP地址 +kubectl get po -o wide +# 将node节点的IP地址保存到machines文件中 +kubectl get po -o wide | grep nodes | awk '{print $6}' > machines +# 拷贝必要的文件到head节点 +scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~ +# ssh 登录到head节点 +ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP] +# --------------- 以下操作均在head节点中执行 --------------- +# 准备训练数据 +python prepare.py +# 拷贝训练程序和字典文件到每台MPI节点 +cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial +# 创建日志目录 +mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs +# 拷贝训练数据到各自的节点 +scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial +scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial +scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial +# 启动训练任务 +mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh +``` diff --git a/doc/v2/howto/cluster/multi_cluster/openmpi_en.md b/doc/v2/howto/cluster/multi_cluster/openmpi_en.md new file mode 100644 index 0000000000000000000000000000000000000000..a5c02b336b8a974f546499acae32edac24219be9 --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/openmpi_en.md @@ -0,0 +1,41 @@ +# OpenMPI + +## Prepare an OpenMPI cluster + +Run the following command to start a 3-node MPI cluster and one "head" node. + +```bash +cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster +kubectl create -f head.yaml +kubectl create -f mpi-nodes.yaml +``` + +Then you can log in to every OpenMPI node using ssh without input any passwords. 
+ +## Launching Cluster Job + +Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:\ + +```bash +# find out node IP addresses +kubectl get po -o wide +# generate a "machines" file containing node IP addresses +kubectl get po -o wide | grep nodes | awk '{print $6}' > machines +# copy necessary files onto "head" node +scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~ +# login to head node using ssh +ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP] +# --------------- in head node --------------- +# prepare training data +python prepare.py +# copy training data and dict file to MPI nodes +cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial +# creat a directory for storing log files +mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs +# copy training data to every node +scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial +scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial +scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial +# start the job +mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh +``` diff --git a/doc/v2/howto/cluster/multi_cluster/src/add_security_group.png b/doc/v2/howto/cluster/multi_cluster/src/add_security_group.png new file mode 100644 index 0000000000000000000000000000000000000000..bd34f46c9b0ada7027fd53e553e7d033255d25fc Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/add_security_group.png differ diff --git a/doc/v2/howto/cluster/multi_cluster/src/create_efs.png b/doc/v2/howto/cluster/multi_cluster/src/create_efs.png new file mode 100644 index 0000000000000000000000000000000000000000..e5f1526033d1daf401700989af1d25919bcb7675 Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/create_efs.png differ diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s-paddle-arch.png b/doc/v2/howto/cluster/multi_cluster/src/k8s-paddle-arch.png new file mode 100644 index 0000000000000000000000000000000000000000..b3800c4fe81302d35e49f7dbacb9221c4dfa5cde Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/k8s-paddle-arch.png differ diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_data/Dockerfile b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..6d3a12ae393aa594b8e6e9a5f726109426937284 --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/Dockerfile @@ -0,0 +1,7 @@ +FROM alpine + +RUN apk update && apk upgrade && apk add coreutils +ADD quick_start /quick_start +ADD get_data.sh /bin/ +RUN chmod +x /bin/get_data.sh +ENTRYPOINT ["/bin/get_data.sh"] diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_data/README.md b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..83cef7affd0ac4d3a1ca08ea5b046fa81e1bc630 --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/README.md @@ -0,0 +1,6 @@ +To build PaddlePaddle data preparation image in tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run following commands: + +``` +cp -r ../../../../../../demo/quick_start . +docker build . 
-t prepare-data-image-name +``` diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_data/get_data.sh b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/get_data.sh new file mode 100755 index 0000000000000000000000000000000000000000..d187ba5ac8d03f69dfdefd4f63610ed7921575be --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/get_data.sh @@ -0,0 +1,26 @@ +#!/bin/sh + +out_dir=$OUT_DIR +split_count=$SPLIT_COUNT + +set -e + +mkdir -p $out_dir +cp -r /quick_start $out_dir/ + +mkdir -p $out_dir/0/data +cd $out_dir/0/data +wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz +tar zxvf preprocessed_data.tar.gz +rm preprocessed_data.tar.gz + +split -d --number=l/$split_count -a 5 train.txt train. +mv train.00000 train.txt + +cd $out_dir +end=$(expr $split_count - 1) +for i in $(seq 1 $end); do + mkdir -p $i/data + cp -r 0/data/* $i/data + mv $i/data/train.`printf %05d $i` $i/data/train.txt +done; diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..77f021a89a70d934bf70424eaa3c6dc3f7c93a28 --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile @@ -0,0 +1,6 @@ +FROM paddlepaddle/paddle:latest + +COPY start.sh /root/ +COPY start_paddle.py /root/ +RUN chmod +x /root/start.sh +CMD ["bash"," -c","/root/start.sh"] diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/README.md b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/README.md new file mode 100644 index 0000000000000000000000000000000000000000..96bf65497ffa23e90c4c9350504f86367b48daf2 --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/README.md @@ -0,0 +1,5 @@ +To build PaddlePaddle training image in tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run following command: + +``` +docker build . -t train-image-name +``` diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start.sh b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start.sh new file mode 100755 index 0000000000000000000000000000000000000000..12dfe1e6386885a6989d3887f21c6922f137a9ae --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +set -eu + +jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR} +cd /root +cp -rf $jobconfig/* . + +python /root/start_paddle.py \ + --dot_period=10 \ + --ports_num=$CONF_PADDLE_PORTS_NUM \ + --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \ + --log_period=50 \ + --num_passes=10 \ + --trainer_count=$TRAINER_COUNT \ + --saving_period=1 \ + --local=0 \ + --config=trainer_config.lr.py \ + --use_gpu=0 diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py new file mode 100755 index 0000000000000000000000000000000000000000..935c12bb67e1fe08bc135a7a2220fcd43c548482 --- /dev/null +++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py @@ -0,0 +1,170 @@ +#!/usr/bin/python +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import requests +import time +import socket +import os +import argparse + +# configuration for cluster +API = "/api/v1/namespaces/" +JOBSELECTOR = "labelSelector=job-name=" +JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME") +JOB_PATH_OUTPUT = JOB_PATH + "/output" +JOBNAME = os.getenv("JOB_NAME") +NAMESPACE = os.getenv("JOB_NAMESPACE") +PADDLE_NIC = os.getenv("CONF_PADDLE_NIC") +PADDLE_PORT = os.getenv("CONF_PADDLE_PORT") +PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM") +PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE") +PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM") + +tokenpath = '/var/run/secrets/kubernetes.io/serviceaccount/token' + + +def refine_unknown_args(cmd_args): + ''' + refine unknown parameters to handle some special parameters + ''' + new_args = [] + for arg in cmd_args: + if arg.startswith("--") and arg.find("=") != -1: + equal_pos = arg.find("=") # find first = pos + arglist = list(arg) + arglist[equal_pos] = " " + arg = "".join(arglist) + arg = arg.lstrip("-") + new_args += arg.split(" ") + elif arg.startswith("--") and arg.find("=") == -1: + arg = arg.lstrip("-") + new_args.append(arg) + else: + new_args.append(arg) + return new_args + + +def isPodAllRunning(podlist): + ''' + check all pod is running + ''' + require = len(podlist["items"]) + running = 0 + for pod in podlist["items"]: + if pod["status"]["phase"] == "Running": + running += 1 + print "waiting for pods running, require:", require, "running:", running + if require == running: + return True + return False + + +def getPodList(): + ''' + get all container status of the job + ''' + apiserver = "https://" + \ + os.getenv("KUBERNETES_SERVICE_HOST") + ":" + \ + os.getenv("KUBERNETES_SERVICE_PORT_HTTPS") + + pod = API + NAMESPACE + "/pods?" 
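+    # The request built below queries the Kubernetes API server from inside the Pod, e.g.
+    # GET https://$KUBERNETES_SERVICE_HOST:$KUBERNETES_SERVICE_PORT_HTTPS
+    #     /api/v1/namespaces/<JOB_NAMESPACE>/pods?labelSelector=job-name=<JOB_NAME>
+    # When a service-account token is mounted, it is sent as a "Bearer" Authorization header.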
+ job = JOBNAME + if os.path.isfile(tokenpath): + tokenfile = open(tokenpath, mode='r') + token = tokenfile.read() + Bearer = "Bearer " + token + headers = {"Authorization": Bearer} + return requests.get(apiserver + pod + JOBSELECTOR + job, + headers=headers, + verify=False).json() + else: + return requests.get(apiserver + pod + JOBSELECTOR + job, + verify=False).json() + + +def getIdMap(podlist): + ''' + generate tainer_id by ip + ''' + ips = [] + for pod in podlist["items"]: + ips.append(pod["status"]["podIP"]) + ips.sort() + idMap = {} + for i in range(len(ips)): + idMap[ips[i]] = i + return idMap + + +def startPaddle(idMap={}, train_args_dict=None): + ''' + start paddle pserver and trainer + ''' + program = 'paddle train' + args = " --nics=" + PADDLE_NIC + args += " --port=" + str(PADDLE_PORT) + args += " --ports_num=" + str(PADDLE_PORTS_NUM) + args += " --comment=" + "paddle_process_by_paddle" + ip_string = "" + for ip in idMap.keys(): + ip_string += (ip + ",") + ip_string = ip_string.rstrip(",") + args += " --pservers=" + ip_string + args_ext = "" + for key, value in train_args_dict.items(): + args_ext += (' --' + key + '=' + value) + localIP = socket.gethostbyname(socket.gethostname()) + trainerId = idMap[localIP] + args += " " + args_ext + " --trainer_id=" + \ + str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT + logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId) + if not os.path.exists(JOB_PATH_OUTPUT): + os.makedirs(JOB_PATH_OUTPUT) + if not os.path.exists(logDir): + os.mkdir(logDir) + copyCommand = 'cp -rf ' + JOB_PATH + \ + "/" + str(trainerId) + "/data/*" + " ./data/" + os.system(copyCommand) + startPserver = 'nohup paddle pserver' + \ + " --port=" + str(PADDLE_PORT) + \ + " --ports_num=" + str(PADDLE_PORTS_NUM) + \ + " --ports_num_for_sparse=" + str(PADDLE_PORTS_NUM_SPARSE) + \ + " --nics=" + PADDLE_NIC + \ + " --comment=" + "paddle_process_by_paddle" + \ + " --num_gradient_servers=" + str(PADDLE_SERVER_NUM) +\ + " > " + logDir + "/server.log 2>&1 &" + print startPserver + os.system(startPserver) + # wait until pservers completely start + time.sleep(20) + startTrainer = program + args + " 2>&1 | tee " + \ + logDir + "/train.log" + print startTrainer + os.system(startTrainer) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + prog="start_paddle.py", description='simple tool for k8s') + args, train_args_list = parser.parse_known_args() + train_args = refine_unknown_args(train_args_list) + train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2])) + podlist = getPodList() + # need to wait until all pods are running + while not isPodAllRunning(podlist): + time.sleep(20) + podlist = getPodList() + idMap = getIdMap(podlist) + startPaddle(idMap, train_args_dict) diff --git a/doc/v2/howto/cluster/multi_cluster/src/pserver_and_trainer.png b/doc/v2/howto/cluster/multi_cluster/src/pserver_and_trainer.png new file mode 100644 index 0000000000000000000000000000000000000000..f41fe48920590333ad332bb51eb18e03dc251541 Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/pserver_and_trainer.png differ diff --git a/doc/v2/howto/cluster/multi_cluster/src/route53_create_recordset.png b/doc/v2/howto/cluster/multi_cluster/src/route53_create_recordset.png new file mode 100644 index 0000000000000000000000000000000000000000..34e476c7beac30fcdde13fccc4cc8d08b4be3d35 Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/route53_create_recordset.png differ diff --git a/doc/v2/howto/cluster/multi_cluster/src/route53_create_zone.png 
b/doc/v2/howto/cluster/multi_cluster/src/route53_create_zone.png new file mode 100644 index 0000000000000000000000000000000000000000..25b7ddb831c5cba97f4b2edddd27da3234d621af Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/route53_create_zone.png differ diff --git a/doc/v2/howto/cluster/multi_cluster/src/worker_security_group.png b/doc/v2/howto/cluster/multi_cluster/src/worker_security_group.png new file mode 100644 index 0000000000000000000000000000000000000000..57eb0265a34ad4223b69600d2a3dd355482e0bf5 Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/worker_security_group.png differ diff --git a/doc/v2/howto/cluster/preparations_cn.md b/doc/v2/howto/cluster/preparations_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..ce40697e703503b66f6306e15ebdb0ce1329991d --- /dev/null +++ b/doc/v2/howto/cluster/preparations_cn.md @@ -0,0 +1,16 @@ +## 环境准备 + +1. 准备您的计算集群。计算集群通常由一组(几台到几千台规模)的Linux服务器组成。服务器之间可以通过局域网(LAN)联通,每台服务器具有集群中唯一的IP地址(或者可被DNS解析的主机名)。集群中的每台计算机通常被成为一个“节点”。 +1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。 + +安装完成之后,执行下面的命令可以查看已经安装的版本(docker安装方式可以进入docker容器执行:`docker run -it paddlepaddle/paddle:[tag] /bin/bash`): +```bash +$ paddle version +PaddlePaddle 0.10.0, compiled with + with_avx: ON + with_gpu: OFF + with_double: OFF + with_python: ON + with_rdma: OFF + with_timer: OFF +``` diff --git a/doc/v2/howto/cluster/preparations_en.md b/doc/v2/howto/cluster/preparations_en.md new file mode 100644 index 0000000000000000000000000000000000000000..4b77b293907ae0548134fc65ceed3aa0ed0b845d --- /dev/null +++ b/doc/v2/howto/cluster/preparations_en.md @@ -0,0 +1,17 @@ +## Preparations + +1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes". +2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html). 
+ +After installation, you can check the version by typing the below command (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`): + +```bash +$ paddle version +PaddlePaddle 0.10.0rc, compiled with + with_avx: ON + with_gpu: OFF + with_double: OFF + with_python: ON + with_rdma: OFF + with_timer: OFF +``` diff --git a/doc/v2/howto/cluster/src/Dockerfile b/doc/v2/howto/cluster/src/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..e178bf4da0f32fca9586b5b69a2c7419de5d9cb1 --- /dev/null +++ b/doc/v2/howto/cluster/src/Dockerfile @@ -0,0 +1,7 @@ +FROM paddlepaddle/paddle:latest + +MAINTAINER zjsxzong89@gmail.com + +COPY start.sh /root/ +COPY start_paddle.py /root/ +CMD ["bash"," -c","/root/start.sh"] \ No newline at end of file diff --git a/doc/v2/howto/cluster/src/efs_mount.png b/doc/v2/howto/cluster/src/efs_mount.png new file mode 100644 index 0000000000000000000000000000000000000000..0f9e3cab98445707e5e9baa18ddabe15cdf04576 Binary files /dev/null and b/doc/v2/howto/cluster/src/efs_mount.png differ diff --git a/doc/v2/howto/cluster/src/managed_policy.png b/doc/v2/howto/cluster/src/managed_policy.png new file mode 100644 index 0000000000000000000000000000000000000000..c7ecda555b81d7750e9292a9ab72d2f517f76a2a Binary files /dev/null and b/doc/v2/howto/cluster/src/managed_policy.png differ diff --git a/doc/v2/howto/cluster/src/ps_cn.png b/doc/v2/howto/cluster/src/ps_cn.png new file mode 100644 index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc Binary files /dev/null and b/doc/v2/howto/cluster/src/ps_cn.png differ diff --git a/doc/v2/howto/cluster/src/ps_en.png b/doc/v2/howto/cluster/src/ps_en.png new file mode 100644 index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0 Binary files /dev/null and b/doc/v2/howto/cluster/src/ps_en.png differ diff --git a/doc/v2/howto/cluster/src/trainer.png b/doc/v2/howto/cluster/src/trainer.png new file mode 100644 index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0 Binary files /dev/null and b/doc/v2/howto/cluster/src/trainer.png differ diff --git a/doc/v2/howto/cluster/src/trainer_cn.png b/doc/v2/howto/cluster/src/trainer_cn.png new file mode 100644 index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc Binary files /dev/null and b/doc/v2/howto/cluster/src/trainer_cn.png differ diff --git a/doc/v2/howto/cluster/src/word2vec/api_train_v2.py b/doc/v2/howto/cluster/src/word2vec/api_train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..9107e24c175f1fbf29d86e222e4b66031a5b505e --- /dev/null +++ b/doc/v2/howto/cluster/src/word2vec/api_train_v2.py @@ -0,0 +1,114 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gzip +import math + +import paddle.v2 as paddle + +embsize = 32 +hiddensize = 256 +N = 5 + + +def wordemb(inlayer): + wordemb = paddle.layer.embedding( + input=inlayer, + size=embsize, + param_attr=paddle.attr.Param( + name="_proj", + initial_std=0.001, + learning_rate=1, + l2_rate=0, + sparse_update=True)) + return wordemb + + +def main(): + # for local training + cluster_train = False + + if not cluster_train: + paddle.init(use_gpu=False, trainer_count=1) + else: + paddle.init( + use_gpu=False, + trainer_count=2, + port=7164, + ports_num=1, + ports_num_for_sparse=1, + num_gradient_servers=1) + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + firstword = paddle.layer.data( + name="firstw", type=paddle.data_type.integer_value(dict_size)) + secondword = paddle.layer.data( + name="secondw", type=paddle.data_type.integer_value(dict_size)) + thirdword = paddle.layer.data( + name="thirdw", type=paddle.data_type.integer_value(dict_size)) + fourthword = paddle.layer.data( + name="fourthw", type=paddle.data_type.integer_value(dict_size)) + nextword = paddle.layer.data( + name="fifthw", type=paddle.data_type.integer_value(dict_size)) + + Efirst = wordemb(firstword) + Esecond = wordemb(secondword) + Ethird = wordemb(thirdword) + Efourth = wordemb(fourthword) + + contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) + hidden1 = paddle.layer.fc(input=contextemb, + size=hiddensize, + act=paddle.activation.Sigmoid(), + layer_attr=paddle.attr.Extra(drop_rate=0.5), + bias_attr=paddle.attr.Param(learning_rate=2), + param_attr=paddle.attr.Param( + initial_std=1. / math.sqrt(embsize * 8), + learning_rate=1)) + predictword = paddle.layer.fc(input=hidden1, + size=dict_size, + bias_attr=paddle.attr.Param(learning_rate=2), + act=paddle.activation.Softmax()) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + with gzip.open("batch-" + str(event.batch_id) + ".tar.gz", + 'w') as f: + trainer.save_parameter_to_tar(f) + result = trainer.test( + paddle.batch( + paddle.dataset.imikolov.test(word_dict, N), 32)) + print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics, + result.metrics) + + cost = paddle.layer.classification_cost(input=predictword, label=nextword) + + parameters = paddle.parameters.create(cost) + adagrad = paddle.optimizer.AdaGrad( + learning_rate=3e-3, + regularization=paddle.optimizer.L2Regularization(8e-4)) + trainer = paddle.trainer.SGD(cost, + parameters, + adagrad, + is_local=not cluster_train) + trainer.train( + paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32), + num_passes=30, + event_handler=event_handler) + + +if __name__ == '__main__': + main() diff --git a/doc/v2/howto/cluster/src/word2vec/api_train_v2_cluster.py b/doc/v2/howto/cluster/src/word2vec/api_train_v2_cluster.py new file mode 100644 index 0000000000000000000000000000000000000000..791504094f3ecae925226ff1d90f20f91d4c018d --- /dev/null +++ b/doc/v2/howto/cluster/src/word2vec/api_train_v2_cluster.py @@ -0,0 +1,137 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import os +import paddle.v2 as paddle +import pickle + +embsize = 32 +hiddensize = 256 +N = 5 +cluster_train_file = "./train_data_dir/train/train.txt" +cluster_test_file = "./test_data_dir/test/test.txt" +node_id = os.getenv("OMPI_COMM_WORLD_RANK") +if not node_id: + raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") + + +def wordemb(inlayer): + wordemb = paddle.layer.embedding( + input=inlayer, + size=embsize, + param_attr=paddle.attr.Param( + name="_proj", + initial_std=0.001, + learning_rate=1, + l2_rate=0, + sparse_update=True)) + return wordemb + + +def cluster_reader_cluster(filename, node_id): + def cluster_reader(): + with open("-".join([filename, "%05d" % int(node_id)]), "r") as f: + for l in f: + csv_data = [int(cell) for cell in l.split(",")] + yield tuple(csv_data) + + return cluster_reader + + +def main(): + # get arguments from env + + # for local training + TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"] + cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH + use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False") + + if not cluster_train: + paddle.init( + use_gpu=use_gpu, + trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1"))) + else: + paddle.init( + use_gpu=use_gpu, + trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")), + port=int(os.getenv("PADDLE_INIT_PORT", "7164")), + ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")), + ports_num_for_sparse=int( + os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")), + num_gradient_servers=int( + os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")), + trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")), + pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1")) + fn = open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r") + word_dict = pickle.load(fn) + fn.close() + dict_size = len(word_dict) + firstword = paddle.layer.data( + name="firstw", type=paddle.data_type.integer_value(dict_size)) + secondword = paddle.layer.data( + name="secondw", type=paddle.data_type.integer_value(dict_size)) + thirdword = paddle.layer.data( + name="thirdw", type=paddle.data_type.integer_value(dict_size)) + fourthword = paddle.layer.data( + name="fourthw", type=paddle.data_type.integer_value(dict_size)) + nextword = paddle.layer.data( + name="fifthw", type=paddle.data_type.integer_value(dict_size)) + + Efirst = wordemb(firstword) + Esecond = wordemb(secondword) + Ethird = wordemb(thirdword) + Efourth = wordemb(fourthword) + + contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) + hidden1 = paddle.layer.fc(input=contextemb, + size=hiddensize, + act=paddle.activation.Sigmoid(), + layer_attr=paddle.attr.Extra(drop_rate=0.5), + bias_attr=paddle.attr.Param(learning_rate=2), + param_attr=paddle.attr.Param( + initial_std=1. 
/ math.sqrt(embsize * 8), + learning_rate=1)) + predictword = paddle.layer.fc(input=hidden1, + size=dict_size, + bias_attr=paddle.attr.Param(learning_rate=2), + act=paddle.activation.Softmax()) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + result = trainer.test( + paddle.batch( + cluster_reader_cluster(cluster_test_file, node_id), 32)) + print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics, + result.metrics) + + cost = paddle.layer.classification_cost(input=predictword, label=nextword) + parameters = paddle.parameters.create(cost) + adagrad = paddle.optimizer.AdaGrad( + learning_rate=3e-3, + regularization=paddle.optimizer.L2Regularization(8e-4)) + trainer = paddle.trainer.SGD(cost, + parameters, + adagrad, + is_local=not cluster_train) + trainer.train( + paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32), + num_passes=30, + event_handler=event_handler) + + +if __name__ == '__main__': + main() diff --git a/doc/v2/howto/cluster/src/word2vec/prepare.py b/doc/v2/howto/cluster/src/word2vec/prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..a42548fbf03a0298e1e397c868e4d531801ec89a --- /dev/null +++ b/doc/v2/howto/cluster/src/word2vec/prepare.py @@ -0,0 +1,55 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
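+
+# Data preparation helper for the word2vec cluster example: it builds the
+# imikolov word dictionary, pickles it to word_dict.pickle, dumps the train
+# and test readers to train.txt / test.txt, and shells out to `split` to cut
+# each file into SPLIT_COUNT shards (train.txt-00000, ...) so that every
+# trainer node can read its own shard.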
+
+import paddle.v2 as paddle
+import tarfile
+import os
+import pickle
+
+SPLIT_COUNT = 3
+N = 5
+
+
+def file_len(fd):
+    for i, l in enumerate(fd):
+        pass
+    return i + 1
+
+
+def split_from_reader_by_line(filename, reader, split_count):
+    fn = open(filename, "w")
+    for batch_id, batch_data in enumerate(reader()):
+        batch_data_str = [str(d) for d in batch_data]
+        fn.write(",".join(batch_data_str))
+        fn.write("\n")
+    fn.close()
+
+    fn = open(filename, "r")
+    total_line_count = file_len(fn)
+    fn.close()
+    per_file_lines = total_line_count / split_count + 1
+    cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename)
+    os.system(cmd)
+
+
+word_dict = paddle.dataset.imikolov.build_dict()
+with open("word_dict.pickle", "w") as dict_f:
+    pickle.dump(word_dict, dict_f)
+
+split_from_reader_by_line("train.txt",
+                          paddle.dataset.imikolov.train(word_dict, N),
+                          SPLIT_COUNT)
+split_from_reader_by_line("test.txt",
+                          paddle.dataset.imikolov.test(word_dict, N),
+                          SPLIT_COUNT)
diff --git a/doc/v2/howto/cmd_parameter/arguments_cn.md b/doc/v2/howto/cmd_parameter/arguments_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..2dea231ca5487978d59a4d0a570431722ed6b3bf
--- /dev/null
+++ b/doc/v2/howto/cmd_parameter/arguments_cn.md
@@ -0,0 +1,394 @@
+# 参数概述
+
+虽然Paddle看起来包含了众多参数,但是大部分参数是为开发者提供的,或者已经在集群提交环境中自动设置,因此用户并不需要关心它们。在此,根据这些参数的使用场合,我们将它们划分为不同的类别。例如,`通用`类别中的参数可用于所有场合。某些参数只可用于特定的层中,而有些参数需要在集群多机训练中使用等。
+
+下表按类别列出各参数(使用场合包括:本地训练、集群训练、本地测试、集群测试):
+
+| 参数类别 | 参数 |
+| --- | --- |
+| 通用 | `job`、`use_gpu`、`local`、`config`、`config_args`、`num_passes`、`trainer_count`、`version`、`show_layer_stat` |
+| 训练 | `dot_period`、`test_period`、`saving_period`、`show_parameter_stats_period`、`init_model_path`、`load_missing_parameter_strategy`、`saving_period_by_batches`、`use_old_updater`、`enable_grad_share`、`grad_share_block_num`、`log_error_clipping`、`log_clipping`、`save_only_one`、`start_pass` |
+| 训练/测试 | `save_dir` |
+| 训练过程中测试 | `test_period`、`average_test_period` |
+| 测试 | `model_list`、`test_wait`、`test_pass`、`predict_output_dir`、`distribute_test` |
+| Auc/正负对验证(PnpairValidation) | `predict_file` |
+| GPU | `gpu_id`、`parallel_nn`、`allow_only_one_model_on_one_gpu`、`cudnn_dir`、`cuda_dir`、`cudnn_conv_workspace_limit_in_mb` |
+| 递归神经网络(RNN) | `beam_size`、`rnn_use_batch`、`prev_batch_state`、`diy_beam_search_prob_so` |
+| 参数服务器(PServer) | `start_pserver`、`pservers`、`port`、`port_num`、`ports_num_for_sparse`、`nics`、`rdma_tcp`、`small_messages`、`loadsave_parameters_in_pserver`、`log_period_server`、`pserver_num_threads`、`sock_send_buf_size`、`sock_recv_buf_size`、`num_gradient_servers`、`parameter_block_size`、`parameter_block_size_for_sparse` |
+| 异步随机梯度下降(Async SGD) | `async_count`、`async_lagged_ratio_min`、`async_lagged_ratio_default` |
+| 性能调优(Performance Tuning) | `log_barrier_abstract`、`log_barrier_lowest_nodes`、`log_barrier_show_log`、`check_sparse_distribution_batches`、`check_sparse_distribution_ratio`、`check_sparse_distribution_unbalance_degree`、`check_sparse_distribution_in_pserver`、`show_check_sparse_distribution_log` |
+| 数据提供器(Data Provider) | `memory_threshold_on_load_data` |
+| 随机数 | `seed`、`thread_local_rand_use_global_seed` |
+| 单元测试 | `checkgrad_eps` |
+| 矩阵/向量 | `enable_parallel_vector` |
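+
+表中的 `config_args` 会以 `key1=value1,key2=value2` 的形式把参数传入网络配置。下面是一段仅作示意的解析代码(并非 PaddlePaddle 自带,`parse_config_args` 为假设的函数名),用来说明这种格式与网络配置中 `get_config_arg(name, type, default_value)` 取值方式的对应关系:
+
+```python
+def parse_config_args(config_args):
+    # "generating=1,beam_size=5" -> {"generating": "1", "beam_size": "5"}
+    pairs = [kv.split("=", 1) for kv in config_args.split(",") if kv]
+    return dict(pairs)
+
+
+args = parse_config_args("generating=1,beam_size=5,layer_num=10")
+generating = bool(int(args.get("generating", "0")))  # 对应 get_config_arg('generating', bool, False)
+beam_size = int(args.get("beam_size", "3"))          # 对应 get_config_arg('beam_size', int, 3)
+print("generating=%s beam_size=%d" % (generating, beam_size))
+```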
+
diff --git a/doc/v2/howto/cmd_parameter/arguments_en.md b/doc/v2/howto/cmd_parameter/arguments_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..d1963067bda949b11ececefed3db7db1432c6223
--- /dev/null
+++ b/doc/v2/howto/cmd_parameter/arguments_en.md
@@ -0,0 +1,394 @@
+# Argument Outline
+
+PaddlePaddle exposes a large number of command-line arguments, but most of them are intended for developers or are set automatically in the cluster submission environment, so users do not need to care about them. Here we divide these arguments into several classes according to the scenario in which they are used. For example, the arguments in `common` can be used in all scenarios, some arguments can only be used in certain layers, and some are only needed for multi-machine training in a cluster.
+
+The table below lists the arguments in each class (the usage scenarios are: local train, cluster train, local test and cluster test):
+
+| Class | Arguments |
+| --- | --- |
+| common | `job`, `use_gpu`, `local`, `config`, `config_args`, `num_passes`, `trainer_count`, `version`, `show_layer_stat` |
+| train | `dot_period`, `test_period`, `saving_period`, `show_parameter_stats_period`, `init_model_path`, `load_missing_parameter_strategy`, `saving_period_by_batches`, `use_old_updater`, `enable_grad_share`, `grad_share_block_num`, `log_error_clipping`, `log_clipping`, `save_only_one`, `start_pass` |
+| train/test | `save_dir` |
+| testing during training | `test_period`, `average_test_period` |
+| test | `model_list`, `test_wait`, `test_pass`, `predict_output_dir`, `distribute_test` |
+| Auc/PnpairValidation | `predict_file` |
+| GPU | `gpu_id`, `parallel_nn`, `allow_only_one_model_on_one_gpu`, `cudnn_dir`, `cuda_dir`, `cudnn_conv_workspace_limit_in_mb` |
+| RNN | `beam_size`, `rnn_use_batch`, `prev_batch_state`, `diy_beam_search_prob_so` |
+| PServer | `start_pserver`, `pservers`, `port`, `port_num`, `ports_num_for_sparse`, `nics`, `rdma_tcp`, `small_messages`, `loadsave_parameters_in_pserver`, `log_period_server`, `pserver_num_threads`, `sock_send_buf_size`, `sock_recv_buf_size`, `num_gradient_servers`, `parameter_block_size`, `parameter_block_size_for_sparse` |
+| Async SGD | `async_count`, `async_lagged_ratio_min`, `async_lagged_ratio_default` |
+| Performance Tuning | `log_barrier_abstract`, `log_barrier_lowest_nodes`, `log_barrier_show_log`, `check_sparse_distribution_batches`, `check_sparse_distribution_ratio`, `check_sparse_distribution_unbalance_degree`, `check_sparse_distribution_in_pserver`, `show_check_sparse_distribution_log` |
+| Data Provider | `memory_threshold_on_load_data` |
+| RandomNumber | `seed`, `thread_local_rand_use_global_seed` |
+| UnitTest | `checkgrad_eps` |
+| Matrix/Vector | `enable_parallel_vector` |
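+
+To make the outline concrete, the sketch below (illustrative only, not part of PaddlePaddle; all flag values are made up) assembles a few of the `common` and `train` arguments from the table into the kind of `paddle train` command line shown in the use case document:
+
+```python
+# Illustrative values only; see the detail description document for the
+# meaning and default of every flag.
+flags = {
+    "use_gpu": 1,               # common
+    "trainer_count": 4,         # common
+    "config": "trainer_config.py",
+    "num_passes": 30,           # common
+    "save_dir": "./output",     # train/test
+    "dot_period": 10,           # train
+    "saving_period": 1,         # train
+}
+
+cmd = "paddle train " + " ".join("--%s=%s" % (k, v) for k, v in flags.items())
+print(cmd)  # pass this string to a shell or subprocess call to launch training
+```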
+ diff --git a/doc/v2/howto/cmd_parameter/detail_introduction_cn.md b/doc/v2/howto/cmd_parameter/detail_introduction_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..b4625ba68cf23e5697554ba94efaf0b873f2c1de --- /dev/null +++ b/doc/v2/howto/cmd_parameter/detail_introduction_cn.md @@ -0,0 +1,323 @@ +# 细节描述 + +## 通用 + +* `--job` + - 工作模式,包括: **train, test, checkgrad**,其中checkgrad主要为开发者使用,使用者不需要关心。 + - 类型: string (默认: train) + +* `--config` + - 用于指定网络配置文件。 + - 类型: string (默认: null). + +* `--use_gpu` + - 训练过程是否使用GPU,设置为true使用GPU模式,否则使用CPU模式。 + - 类型: bool (默认: 1). + +* `--local` +  - 训练过程是否为本地模式,设置为true使用本地训练或者使用集群上的一个节点,否则使用多机训练。 + - 类型: bool (默认: 1). + +* `--trainer_count` + - 指定一台机器上使用的线程数。例如,trainer_count = 4, 意思是在GPU模式下使用4个GPU,或者在CPU模式下使用4个线程。每个线程(或GPU)分配到当前数据块样本数的四分之一。也就是说,如果在训练配置中设置batch_size为512,每个线程分配到128个样本用于训练。 + - 类型: int32 (默认: 1). + +* `--num_passes` + - 当模式为`--job=train`时, 该参数的意思是训练num_passes轮。每轮会将数据集中的所有训练样本使用一次。当模式为`--job=test`时,意思是使用第test_pass个模型到第 num_passes-1 个模型测试数据。 + - 类型: int32 (默认: 100). + +* `--config_args` + - 传递给配置文件的参数。格式: key1=value1,key2=value2. + - 类型: string (默认: null). + +* `--version` + - 是否打印版本信息。 + - 类型: bool (默认: 0). + +* `--show_layer_stat` + - 是否显示**每个批次数据**中每层的数值统计. + - 类型: bool (默认: 0). + +## 训练 + +* `--log_period` + - 每log_period个批次打印日志进度. + - 类型: int32 (默认: 100). + +* `--dot_period` + - 每dot_period个批次输出符号'.'. + - 类型: int32 (默认: 1). + +* `--saving_period` + - 每saving_period轮保存训练参数. + - 类型: int32 (默认: 1). + +* `--save_dir` + - 保存模型参数的目录,需要明确指定,但不需要提前创建。 + - 类型: string (默认: null). + +* `--start_pass` + - 从start_pass轮开始训练,会加载上一轮的参数。 + - 类型: int32 (默认: 0). + +* `--show_parameter_stats_period` + - 在训练过程中每show_parameter_stats_period个批次输出参数统计。默认不显示。 + - 类型: int32 (默认: 0). + +* `--save_only_one` + - 只保存最后一轮的参数,而之前的参数将会被删除。 + - 类型: bool (默认: 0). + +* `--load_missing_parameter_strategy` + - 当模型参数不存在时,指定加载的方式。目前支持fail/rand/zero三种操作. + - `fail`: 程序直接退出. + - `rand`: 根据网络配置中的**initial\_strategy**采用均匀分布或者高斯分布初始化。均匀分布的范围是: **[mean - std, mean + std]**, 其中mean和std是训练配置中的参数. + - `zero`: 所有参数置为零. + - 类型: string (默认: fail). + +* `--init_model_path` + - 初始化模型的路径。如果设置该参数,start\_pass将不起作用。同样也可以在测试模式中指定模型路径。 + - 类型: string (默认: null). + +* `--saving_period_by_batches` + - 在一轮中每saving_period_by_batches个批次保存一次参数。 + - 类型: int32 (默认: 0). + +* `--log_error_clipping` + - 当在网络层配置中设置**error_clipping_threshold**时,该参数指示是否打印错误截断日志。如果为true,**每批次**的反向传播将会打印日志信息。该截断会影响**输出的梯度**. + - 类型: bool (默认: 0). + +* `--log_clipping` + - 当在训练配置中设置**gradient_clipping_threshold**时,该参数指示是否打印日志截断信息。该截断会影响**权重更新的梯度**. + - 类型: bool (默认: 0). + +* `--use_old_updater` + - 是否使用旧的RemoteParameterUpdater。 默认使用ConcurrentRemoteParameterUpdater,主要为开发者使用,使用者通常无需关心. + - 类型: bool (默认: 0). + +* `--enable_grad_share` + - 启用梯度参数的阈值,在多CPU训练时共享该参数. + - 类型: int32 (默认: 100 \* 1024 \* 1024). + +* `--grad_share_block_num` + - 梯度参数的分块数目,在多CPU训练时共享该参数. + - 类型: int32 (默认: 64). + +## 测试 + +* `--test_pass` + - 加载test_pass轮的模型用于测试. + - 类型: int32 (默认: -1). + +* `--test_period` + - 如果为0,每轮结束时对所有测试数据进行测试;如果不为0,每test_period个批次对所有测试数据进行测试. + - 类型: int32 (默认: 0). + +* `--test_wait` + - 指示当指定轮的测试模型不存在时,是否需要等待该轮模型参数。如果在训练期间同时发起另外一个进程进行测试,可以使用该参数. + - 类型: bool (默认: 0). + +* `--model_list` + - 测试时指定的存储模型列表的文件. + - 类型: string (默认: "", null). + +* `--predict_output_dir` + - 保存网络层输出结果的目录。该参数在网络配置的Outputs()中指定,默认为null,意思是不保存结果。在测试阶段,如果你想要保存某些层的特征图,请指定该目录。需要注意的是,网络层的输出是经过激活函数之后的值. + - 类型: string (默认: "", null). 
+ +* `--average_test_period` + - 使用`average_test_period`个批次的参数平均值进行测试。该参数必须能被FLAGS_log_period整除,默认为0,意思是不使用平均参数执行测试. + - 类型: int32 (默认: 0). + +* `--distribute_test` + - 在分布式环境中测试,将多台机器的测试结果合并. + - 类型: bool (默认: 0). + +* `--predict_file` + - 保存预测结果的文件名。该参数默认为null,意思是不保存结果。目前该参数仅用于AucValidationLayer和PnpairValidationLayer层,每轮都会保存预测结果. + - 类型: string (默认: "", null). + +## GPU + +* `--gpu_id` + - 指示使用哪个GPU核. + - 类型: int32 (默认: 0). + +* `--allow_only_one_model_on_one_gpu` + - 如果为true,一个GPU设备上不允许配置多个模型. + - 类型: bool (默认: 1). + +* `--parallel_nn` + - 指示是否使用多线程来计算一个神经网络。如果为false,设置gpu_id指定使用哪个GPU核(训练配置中的设备属性将会无效)。如果为true,GPU核在训练配置中指定(gpu_id无效). + - 类型: bool (默认: 0). + +* `--cudnn_dir` + - 选择路径来动态加载NVIDIA CuDNN库,例如,/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH + - 类型: string (默认: "", null) + +* `--cuda_dir` + - 选择路径来动态加载NVIDIA CUDA库,例如,/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH + - 类型: string (默认: "", null) + +* `--cudnn_conv_workspace_limit_in_mb` + - 指定cuDNN的最大工作空间容限,单位是MB,默认为4096MB=4GB. + - 类型: int32 (默认: 4096MB=4GB) + +## 自然语言处理(NLP): RNN/LSTM/GRU +* `--rnn_use_batch` + - 指示在简单的RecurrentLayer层的计算中是否使用批处理方法. + - 类型: bool (默认: 0). + +* `--prev_batch_state` + - 标识是否为连续的batch计算. + - 类型: bool (默认: 0). + +* `--beam_size` + - 集束搜索使用广度优先搜索的方式构建查找树。在树的每一层上,都会产生当前层状态的所有继承结果,按启发式损失的大小递增排序。然而,每层上只能保存固定数目个最好的状态,该数目是提前定义好的,称之为集束大小. + - 类型: int32 (默认: 1). + +* `--diy_beam_search_prob_so` +  - 用户可以自定义beam search的方法,编译成动态库,供PaddlePaddle加载。 该参数用于指定动态库路径. + - 类型: string (默认: "", null). + +## 数据支持(DataProvider) + +* `--memory_threshold_on_load_data` + - 内存容限阈值,当超过该阈值时,停止加载数据. + - 类型: double (默认: 1.0). + +## 单元测试 + +* `--checkgrad_eps` + - 使用checkgrad模式时的参数变化大小. + - 类型: double (默认: 1e-05). + +## 参数服务器和分布式通信 + +* `--start_pserver` + - 指示是否开启参数服务器(parameter server). + - 类型: bool (默认: 0). + +* `--pservers` + - 参数服务器的IP地址,以逗号间隔. + - 类型: string (默认: "127.0.0.1"). + +* `--port` + - 参数服务器的监听端口. + - 类型: int32 (默认: 20134). + +* `--ports_num` + - 发送参数的端口号,根据默认端口号递增. + - 类型: int32 (默认: 1). + +* `--trainer_id` +  - 在分布式训练中,每个训练节点必须指定一个唯一的id号,从0到num_trainers-1。0号训练节点是主训练节点。使用者无需关心这个参数. + - 类型: int32 (默认: 0). + +* `--num_gradient_servers` + - 梯度服务器的数量,该参数在集群提交环境中自动设置. + - 类型: int32 (默认: 1). + +* `--small_messages` + - 如果消息数据太小,建议将该参数设为true,启动快速应答,无延迟. + - 类型: bool (默认: 0). + +* `--sock_send_buf_size` + - 限制套接字发送缓冲区的大小。如果仔细设置的话,可以有效减小网络的阻塞. + - 类型: int32 (默认: 1024 \* 1024 \* 40). + +* `--sock_recv_buf_size` + - 限制套接字接收缓冲区的大小. + - 类型: int32 (默认: 1024 \* 1024 \* 40). + +* `--parameter_block_size` + - 参数服务器的参数分块大小。如果未设置,将会自动计算出一个合适的值. + - 类型: int32 (默认: 0). + +* `--parameter_block_size_for_sparse` + - 参数服务器稀疏更新的参数分块大小。如果未设置,将会自动计算出一个合适的值. + - 类型: int32 (默认: 0). + +* `--log_period_server` + - 在参数服务器终端每log_period_server个批次打印日志进度. + - 类型: int32 (默认: 500). + +* `--loadsave_parameters_in_pserver` + - 在参数服务器上加载和保存参数,只有当设置了sparse_remote_update参数时才有效. + - 类型: bool (默认: 0). + +* `--pserver_num_threads` + - 同步执行操作的线程数. + - 类型: bool (默认: 1). + +* `--ports_num_for_sparse` + - 发送参数的端口号,根据默认值递增(port + ports_num),用于稀疏训练中. + - 类型: int32 (默认: 0). + +* `--nics` + - 参数服务器的网络设备名称,已经在集群提交环境中完成设置. + - 类型: string (默认: "xgbe0,xgbe1"). + +* `--rdma_tcp` + - 使用rdma还是tcp传输协议,该参数已经在集群提交环境中完成设置. + - 类型: string (默认: "tcp"). + +## 异步随机梯度下降(Async SGD) +* `--async_count` + - 定义异步训练的长度,如果为0,则使用同步训练. + - 类型: int32 (默认: 0). + +* `--async_lagged_ratio_min` + - 控制`config_.async_lagged_grad_discard_ratio()`的最小值. + - 类型: double (默认: 1.0). + +* `--async_lagged_ratio_default` + - 如果在网络配置中未设置async_lagged_grad_discard_ratio,则使用该参数作为默认值. + - 类型: double (默认: 1.5). 
+ +## 性能调优(Performance Tuning) + +* `--log_barrier_abstract` + - 如果为true,则显示阻隔性能的摘要信息. + - 类型: bool (默认: 1). + +* `--log_barrier_show_log` + - 如果为true,则总会显示阻隔摘要信息,即使间隔很小. + - 类型: bool (默认: 0). + +* `--log_barrier_lowest_nodes` + - 最少显示多少个节点. + - 类型: int32 (默认: 5). + +* `--check_sparse_distribution_in_pserver` + - 指示是否检查所有参数服务器上的稀疏参数的分布是均匀的. + - 类型: bool (默认: 0). + +* `--show_check_sparse_distribution_log` + - 指示是否显示参数服务器上的稀疏参数分布的日志细节. + - 类型: bool (默认: 0). + +* `--check_sparse_distribution_batches` + - 每运行多少个批次执行一次稀疏参数分布的检查. + - 类型: int32 (默认: 100). + +* `--check_sparse_distribution_ratio` + - 如果检查到分配在不同参数服务器上的参数的分布不均匀次数大于check_sparse_distribution_ratio * check_sparse_distribution_batches次,程序停止. + - 类型: double (默认: 0.6). + +* `--check_sparse_distribution_unbalance_degree` + - 不同参数服务器上数据大小的最大值与最小值的比率. + - 类型: double (默认: 2). + +## 矩阵/向量/随机数 +* `--enable_parallel_vector` + - 启动并行向量的阈值. + - 类型: int32 (默认: 0). + +* `--seed` + - 随机数的种子。srand(time)的为0. + - 类型: int32 (默认: 1) + +* `--thread_local_rand_use_global_seed` + - 是否将全局种子应用于本地线程的随机数. + - 类型: bool (默认: 0). diff --git a/doc/v2/howto/cmd_parameter/detail_introduction_en.md b/doc/v2/howto/cmd_parameter/detail_introduction_en.md new file mode 100644 index 0000000000000000000000000000000000000000..b681ebc81a355dfc1a7638a4463dff6979929a45 --- /dev/null +++ b/doc/v2/howto/cmd_parameter/detail_introduction_en.md @@ -0,0 +1,327 @@ +```eval_rst +.. _cmd_detail_introduction: +``` + +# Detail Description + +## Common + +* `--job` + - Job mode, including: **train, test, checkgrad**, where checkgrad is mainly for developers and users do not need to care about. + - type: string (default: train) + +* `--config` + - Use to specfiy network configure file. + - type: string (default: null). + +* `--use_gpu` + - Whether to use GPU for training, false is cpu mode and true is gpu mode. + - type: bool (default: 1). + +* `--local` + - Whether the training is in local mode or not. True when training locally or using one node in cluster. False when using multiple machines in cluster. + - type: bool (default: 1). + +* `--trainer_count` + - Define the number of threads used in one machine. For example, trainer_count = 4, means use 4 GPU in GPU mode and 4 threads in CPU mode. Each thread (or GPU) is assigned to 1/4 samples in current batch. That is to say, if setting batch_size of 512 in trainer config, each thread train 128 samples. + - type: int32 (default: 1). + +* `--num_passes` + - When `--job=train`, means training for num_passes passes. One pass means training all samples in dataset one time. When `--job=test`, means testing data from model of test_pass to model of (num_passes - 1). + - type: int32 (default: 100). + +* `--config_args` + - arguments passed to config file. Format: key1=value1,key2=value2. + - type: string (default: null). + +* `--version` + - Whether to print version information. + - type: bool (default: 0). + +* `--show_layer_stat` + - Whether to show the statistics of each layer **per batch**. + - type: bool (default: 0). + +## Train + +* `--log_period` + - Log progress every log_period batches. + - type: int32 (default: 100). + +* `--dot_period` + - Print '.' every dot_period batches. + - type: int32 (default: 1). + +* `--saving_period` + - Save parameters every saving_period passes + - type: int32 (default: 1). + +* `--save_dir` + - Directory for saving model parameters. It needs to be specified, but no need to be created in advance. + - type: string (default: null). + +* `--start_pass` + - Start training from this pass. 
It will load parameters from the previous pass. + - type: int32 (default: 0). + +* `--show_parameter_stats_period` + - Show parameter statistic during training every show_parameter_stats_period batches. It will not show by default. + - type: int32 (default: 0). + +* `--save_only_one` + - Save the parameters only in last pass, while the previous parameters will be removed. + - type: bool (default: 0). + +* `--load_missing_parameter_strategy` + - Specify the loading operation when model file is missing. Now support fail/rand/zero three operations. + - `fail`: program will exit. + - `rand`: uniform or normal distribution according to **initial\_strategy** in network config. Uniform range is: **[mean - std, mean + std]**, where mean and std are configures in trainer config. + - `zero`: all parameters are zero. + - type: string (default: fail). + +* `--init_model_path` + - Path of the initialization model. If it was set, start\_pass will be ignored. It can be used to specify model path in testing mode as well. + - type: string (default: null). + +* `--saving_period_by_batches` + - Save parameters every saving_period_by_batches batches in one pass. + - type: int32 (default: 0). + +* `--log_error_clipping` + - Whether to print error clipping log when setting **error_clipping_threshold** in layer config. If it is true, log will be printed in backward propagation **per batch**. This clipping effects on **gradient of output**. + - type: bool (default: 0). + +* `--log_clipping` + - Enable print log clipping or not when setting **gradient_clipping_threshold** in trainer config. This clipping effects on **gradient w.r.t. (with respect to) weight**. + - type: bool (default: 0). + +* `--use_old_updater` + - Whether to use the old RemoteParameterUpdater. Default use ConcurrentRemoteParameterUpdater. It is mainly for deverlopers and users usually do not need to care about. + - type: bool (default: 0). + +* `--enable_grad_share` + - threshold for enable gradient parameter, which is shared for batch multi-cpu training. + - type: int32 (default: 100 \* 1024 \* 1024). + +* `--grad_share_block_num` + - block number of gradient parameter, which is shared for batch multi-cpu training. + - type: int32 (default: 64). + +## Test + +* `--test_pass` + - Load parameter from this pass to test. + - type: int32 (default: -1). + +* `--test_period` + - if equal 0, do test on all test data at the end of each pass. While if equal non-zero, do test on all test data every test_period batches. + - type: int32 (default: 0). + +* `--test_wait` +  - Whether to wait for parameter per pass if not exist. It can be used when user launch another process to perfom testing during the training process. + - type: bool (default: 0). + +* `--model_list` + - File that saves the model list when testing. + - type: string (default: "", null). + +* `--predict_output_dir` + - Directory that saves the layer output. It is configured in Outputs() in network config. Default, this argument is null, meaning save nothing. Specify this directory if you want to save feature map of some layers in testing mode. Note that, layer outputs are values after activation function. + - type: string (default: "", null). + +* `--average_test_period` + - Do test on average parameter every `average_test_period` batches. It MUST be devided by FLAGS_log_period. Default 0 means do not test on average parameter. + - type: int32 (default: 0). + +* `--distribute_test` + - Testing in distribute environment will merge results from multiple machines. + - type: bool (default: 0). 
+ +* `--predict_file` + - File name for saving predicted result. Default, this argument is null, meaning save nothing. Now, this argument is only used in AucValidationLayer and PnpairValidationLayer, and saves predicted result every pass. + - type: string (default: "", null). + +## GPU + +* `--gpu_id` + - Which gpu core to use. + - type: int32 (default: 0). + +* `--allow_only_one_model_on_one_gpu` + - If true, do not allow multiple models on one GPU device. + - type: bool (default: 1). + +* `--parallel_nn` + - Whether to use multi-thread to calculate one neural network or not. If false, use gpu_id specify which gpu core to use (the device property in trainer config will be ingored). If true, the gpu core is specified in trainer config (gpu_id will be ignored). + - type: bool (default: 0). + +* `--cudnn_dir` + - Choose path to dynamic load NVIDIA CuDNN library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH + - type: string (default: "", null) + +* `--cuda_dir` + - Choose path to dynamic load NVIDIA CUDA library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH + - type: string (default: "", null) + +* `--cudnn_conv_workspace_limit_in_mb` + - Specify cuDNN max workspace limit, in units MB, 4096MB=4GB by default. + - type: int32 (default: 4096MB=4GB) + +## NLP: RNN/LSTM/GRU +* `--rnn_use_batch` + - Whether to use batch method for calculation in simple RecurrentLayer. + - type: bool (default: 0). + +* `--prev_batch_state` + - batch is continue with next batch. + - type: bool (default: 0). + +* `--beam_size` + - Beam search uses breadth-first search to build its search tree. At each level of the tree, it generates all successors of the states at the current level, sorting them in increasing order of heuristic cost. However, it only stores a predetermined number of best states at each level (called the beam size). + - type: int32 (default: 1). + +* `--diy_beam_search_prob_so` + - Specify shared dynamic library. It can be defined out of paddle by user. + - type: string (default: "", null). + +## DataProvider + +* `--memory_threshold_on_load_data` + - Stop loading data when memory is not sufficient. + - type: double (default: 1.0). + +## Unit Test + +* `--checkgrad_eps` + - parameter change size for checkgrad. + - type: double (default: 1e-05). + +## Parameter Server and Distributed Communication + +* `--start_pserver` + - Whether to start pserver (parameter server). + - type: bool (default: 0). + +* `--pservers` + - Comma separated IP addresses of pservers. + - type: string (default: "127.0.0.1"). + +* `--port` + - Listening port for pserver. + - type: int32 (default: 20134). + +* `--ports_num` + - The ports number for parameter send, increment based on default port number. + - type: int32 (default: 1). + +* `--trainer_id` + - In distributed training, each trainer must be given an unique id ranging from 0 to num_trainers-1. Trainer 0 is the master trainer. User do not need to care this flag. + - type: int32 (default: 0). + +* `--num_gradient_servers` + - Numbers of gradient servers. This arguments is set automatically in cluster submitting environment. + - type: int32 (default: 1). + +* `--small_messages` + - If message size is small, recommend set it True to enable quick ACK and no delay + - type: bool (default: 0). + +* `--sock_send_buf_size` + - Restrict socket send buffer size. It can reduce network congestion if set carefully. + - type: int32 (default: 1024 \* 1024 \* 40). + +* `--sock_recv_buf_size` + - Restrict socket recieve buffer size. 
+  - type: int32 (default: 1024 \* 1024 \* 40).
+
+* `--parameter_block_size`
+  - Parameter block size for pserver, will automatically calculate a suitable value if it's not set.
+  - type: int32 (default: 0).
+
+* `--parameter_block_size_for_sparse`
+  - Parameter block size for sparse update pserver, will automatically calculate a suitable value if it's not set.
+  - type: int32 (default: 0).
+
+* `--log_period_server`
+  - Log progress every log_period_server batches at pserver end.
+  - type: int32 (default: 500).
+
+* `--loadsave_parameters_in_pserver`
+  - Load and save parameters in pserver. It only takes effect when sparse_remote_update is set for the parameter.
+  - type: bool (default: 0).
+
+* `--pserver_num_threads`
+  - Number of threads for synchronous op execution.
+  - type: bool (default: 1).
+
+* `--ports_num_for_sparse`
+  - The number of ports for parameter send, incremented from the default (port + ports_num). It is used by sparse training.
+  - type: int32 (default: 0).
+
+* `--nics`
+  - Network device name for pservers, already set in cluster submitting environment.
+  - type: string (default: "xgbe0,xgbe1").
+
+* `--rdma_tcp`
+  - Use rdma or tcp transport protocol, already set in cluster submitting environment.
+  - type: string (default: "tcp").
+
+## Async SGD
+* `--async_count`
+  - Defines the asynchronous training length; if 0, synchronized training is used.
+  - type: int32 (default: 0).
+
+* `--async_lagged_ratio_min`
+  - Controls the minimum value of `config_.async_lagged_grad_discard_ratio()`.
+  - type: double (default: 1.0).
+
+* `--async_lagged_ratio_default`
+  - If async_lagged_grad_discard_ratio is not set in network config, use this as the default value.
+  - type: double (default: 1.5).
+
+## Performance Tuning
+
+* `--log_barrier_abstract`
+  - If true, show abstract barrier performance information.
+  - type: bool (default: 1).
+
+* `--log_barrier_show_log`
+  - If true, always show the barrier abstract even when the gap is small.
+  - type: bool (default: 0).
+
+* `--log_barrier_lowest_nodes`
+  - How many of the slowest nodes will be logged.
+  - type: int32 (default: 5).
+
+* `--check_sparse_distribution_in_pserver`
+  - Whether to check that the distribution of sparse parameters over all pservers is balanced.
+  - type: bool (default: 0).
+
+* `--show_check_sparse_distribution_log`
+  - Show log details for the sparse parameter distribution in pserver.
+  - type: bool (default: 0).
+
+* `--check_sparse_distribution_batches`
+  - Run the sparse parameter distribution check every so many batches.
+  - type: int32 (default: 100).
+
+* `--check_sparse_distribution_ratio`
+  - If parameters dispatched to different pservers have an unbalanced distribution for check_sparse_distribution_ratio * check_sparse_distribution_batches times, crash the program.
+  - type: double (default: 0.6).
+
+* `--check_sparse_distribution_unbalance_degree`
+  - The ratio of the maximum data size to the minimum data size across different pservers.
+  - type: double (default: 2).
+
+## Matrix/Vector/RandomNumber
+* `--enable_parallel_vector`
+  - Threshold for enabling parallel vector.
+  - type: int32 (default: 0).
+
+* `--seed`
+  - Random number seed; 0 means seeding with srand(time).
+  - type: int32 (default: 1)
+
+* `--thread_local_rand_use_global_seed`
+  - Whether to use the global seed for thread-local random number generation.
+  - type: bool (default: 0).
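+
+Most of the distributed flags above are normally filled in by the cluster submission tooling rather than typed by hand. The sketch below is illustrative only and not part of PaddlePaddle: the node list and the NODE_RANK environment variable are assumptions of the example. It shows how a launcher could derive per-node values for these flags and compose a `paddle train` command:
+
+```python
+import os
+
+# Assumed layout: every node runs one pserver and one trainer, and the
+# launcher exports NODE_RANK as this node's 0-based index.
+nodes = ["192.168.1.10", "192.168.1.11", "192.168.1.12"]
+rank = int(os.getenv("NODE_RANK", "0"))
+
+flags = {
+    "local": 0,                          # multi-machine training
+    "start_pserver": 1,                  # also start a pserver on this node
+    "pservers": ",".join(nodes),         # comma separated pserver IPs
+    "port": 7164,                        # pserver listening port
+    "ports_num": 1,
+    "ports_num_for_sparse": 1,
+    "num_gradient_servers": len(nodes),  # one trainer per node
+    "trainer_id": rank,                  # unique id in 0..num_trainers-1
+    "nics": "eth0",                      # network device used for communication
+}
+
+cmd = "paddle train " + " ".join("--%s=%s" % (k, v) for k, v in flags.items())
+print(cmd)
+```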
diff --git a/doc/v2/howto/cmd_parameter/index_cn.rst b/doc/v2/howto/cmd_parameter/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6900bb1443e611d326e8d5640e794ac2b9079beb
--- /dev/null
+++ b/doc/v2/howto/cmd_parameter/index_cn.rst
@@ -0,0 +1,26 @@
+.. _cmd_line_index:
+
+命令行参数设置
+===============
+深度学习算法的实现有着多样化的特点,运行环境、运行阶段、模型结构、训练策略等等这些都是常见的变化因素。PaddlePaddle支持用户灵活地设置各种命令行参数,以实现对模型训练或预测流程的控制。
+
+在这一部分,首先以几个实际场景为例,展示了部分命令行参数的使用:
+
+.. toctree::
+   :maxdepth: 1
+
+   use_case_cn.md
+
+接着对所有参数的使用场合进行概述和分类:
+
+.. toctree::
+   :maxdepth: 1
+
+   arguments_cn.md
+
+最后给出细节描述,详细解释这些参数的属性和意义:
+
+.. toctree::
+   :maxdepth: 1
+
+   detail_introduction_cn.md
diff --git a/doc/v2/howto/cmd_parameter/index_en.rst b/doc/v2/howto/cmd_parameter/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f49683948ef78f363e2439cc25332431830eeb24
--- /dev/null
+++ b/doc/v2/howto/cmd_parameter/index_en.rst
@@ -0,0 +1,26 @@
+.. _cmd_line_index:
+
+Set Command-line Parameters
+===========================
+The implementation of deep learning algorithms has a variety of characteristics, such as the running environment, the running stage, the structure of the model and the training strategy. PaddlePaddle allows users to set various command-line parameters flexibly, which helps to control the model training or prediction process.
+
+In this part, we first take several practical scenarios as examples to show how some of the command-line parameters are used:
+
+.. toctree::
+   :maxdepth: 1
+
+   use_case_en.md
+
+Then, we summarize and classify the use of all command-line parameters:
+
+.. toctree::
+   :maxdepth: 1
+
+   arguments_en.md
+
+Finally, the detailed descriptions are given, and we explain the properties and significance of these command-line parameters in detail:
+
+..
toctree:: + :maxdepth: 1 + + detail_introduction_en.md diff --git a/doc/v2/howto/cmd_parameter/use_case_cn.md b/doc/v2/howto/cmd_parameter/use_case_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..db8c39d950771726346ff9c9481990abc13036cf --- /dev/null +++ b/doc/v2/howto/cmd_parameter/use_case_cn.md @@ -0,0 +1,182 @@ +# 使用案例 + +## 本地训练 + +本地训练的实验,诸如图像分类,自然语言处理等,通常都会使用下面这些命令行参数。 + +``` +paddle train \ + --use_gpu=1/0 \ #1:GPU,0:CPU(默认为1) + --config=network_config \ + --save_dir=output \ + --trainer_count=COUNT \ #(默认为1) + --test_period=M \ #(默认为0) + --num_passes=N \ #(默认为100) + --log_period=K \ #(默认为100) + --dot_period=1000 \ #(默认为1) + #[--show_parameter_stats_period=100] \ #(默认为0) + #[--saving_period_by_batches=200] \ #(默认为0) +``` +根据你的任务,可以选择是否使用参数`show_parameter_stats_period`和`saving_period_by_batches`。 + +### 1) 将命令参数传给网络配置 + +`config_args`是一个很有用的参数,用于将参数传递给网络配置。 + +``` +--config_args=generating=1,beam_size=5,layer_num=10 \ +``` +`get_config_arg`可用于在网络配置中解析这些参数,如下所示: + +``` +generating = get_config_arg('generating', bool, False) +beam_size = get_config_arg('beam_size', int, 3) +layer_num = get_config_arg('layer_num', int, 8) +``` + +`get_config_arg`: + +``` +get_config_arg(name, type, default_value) +``` +- name: `--config_args`中指定的名字 +- type: 值类型,包括bool, int, str, float等 +- default_value: 默认值 + +### 2) 使用模型初始化网络 + +增加如下参数: + +``` +--init_model_path=model_path +--load_missing_parameter_strategy=rand +``` + +## 本地测试 + +方法一: + +``` +paddle train --job=test \ + --use_gpu=1/0 \ + --config=network_config \ + --trainer_count=COUNT \ + --init_model_path=model_path \ +``` +- 使用init\_model\_path指定测试的模型 +- 只能测试单个模型 + +方法二: + +``` +paddle train --job=test \ + --use_gpu=1/0 \ + --config=network_config \ + --trainer_count=COUNT \ + --model_list=model.list \ +``` +- 使用model_list指定测试的模型列表 +- 可以测试多个模型,文件model.list如下所示: + +``` +./alexnet_pass1 +./alexnet_pass2 +``` + +方法三: + +``` +paddle train --job=test \ + --use_gpu=1/0 \ + --config=network_config \ + --trainer_count=COUNT \ + --save_dir=model \ + --test_pass=M \ + --num_passes=N \ +``` +这种方式必须使用Paddle存储的模型路径格式,如:`model/pass-%5d`。测试的模型包括从第M轮到第N-1轮存储的所有模型。例如,M=12,N=14这种写法将会测试模型`model/pass-00012`和`model/pass-00013`。 + +## 稀疏训练 + +当输入是维度很高的稀疏数据时,通常使用稀疏训练来加速计算过程。例如,输入数据的字典维数是1百万,但是每个样本仅包含几个词。在Paddle中,稀疏矩阵的乘积应用于前向传播过程,而稀疏更新在反向传播之后的权重更新时进行。 + +### 1) 本地训练 + +用户需要在网络配置中指定**sparse\_update=True**。请参照网络配置的文档了解更详细的信息。 + +### 2) 集群训练 + +在集群上训练一个稀疏模型需要加上下面的参数。同时用户需要在网络配置中指定**sparse\_remote\_update=True**。请参照网络配置的文档了解更详细的信息。 + +``` +--ports_num_for_sparse=1 #(默认为0) +``` + +## parallel_nn +用户可以设置`parallel_nn`来混合使用GPU和CPU计算网络层的参数。也就是说,你可以将网络配置成某些层使用GPU计算,而其他层使用CPU计算。另一种方式是将网络层划分到不同的GPU上去计算,这样可以减小GPU内存,或者采用并行计算来加速某些层的更新。 + +如果你想使用这些特性,你需要在网络配置中指定设备的ID号(表示为deviceId),并且加上下面的命令行参数: + +``` +--parallel_nn=true +``` +### 案例一:GPU和CPU混合使用 +请看下面的例子: + +``` +#command line: +paddle train --use_gpu=true --parallel_nn=true trainer_count=COUNT + +default_device(0) + +fc1=fc_layer(...) +fc2=fc_layer(...) 
+fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1)) + +``` +- default_device(0): 设置默认设备号为0。这意味着除了指定device=-1的层之外,其他所有层都会使用GPU计算,每层使用的GPU号依赖于参数trainer\_count和gpu\_id(默认为0)。在此,fc1和fc2层在GPU上计算。 + +- device=-1: fc3层使用CPU计算。 + +- trainer_count: + - trainer_count=1: 如果未设置gpu\_id,那么fc1和fc2层将会使用第1个GPU来计算。否则使用gpu\_id指定的GPU。 + + - trainer_count>1: 在trainer\_count个GPU上使用数据并行来计算某一层。例如,trainer\_count=2意味着0号和1号GPU将会使用数据并行来计算fc1和fc2层。 + +### 案例二:在不同设备上指定层 + +``` +#command line: +paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT + +#network: +fc2=fc_layer(input=l1, layer_attr=ExtraAttr(device=0), ...) +fc3=fc_layer(input=l1, layer_attr=ExtraAttr(device=1), ...) +fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...) +``` +在本例中,我们假设一台机器上有4个GPU。 + +- trainer_count=1: + - 使用0号GPU计算fc2层。 + - 使用1号GPU计算fc3层。 + - 使用CPU计算fc4层。 + +- trainer_count=2: + - 使用0号和1号GPU计算fc2层。 + - 使用2号和3号GPU计算fc3层。 + - 使用CPU两线程计算fc4层。 + +- trainer_count=4: + - 运行失败(注意到我们已经假设机器上有4个GPU),因为参数`allow_only_one_model_on_one_gpu`默认设置为真。 + +**当`device!=-1`时设备ID号的分配:** + +``` +(deviceId + gpu_id + threadId * numLogicalDevices_) % numDevices_ + +deviceId: 在层中指定 +gpu_id: 默认为0 +threadId: 线程ID号,范围: 0,1,..., trainer_count-1 +numDevices_: 机器的设备(GPU)数目 +numLogicalDevices_: min(max(deviceId + 1), numDevices_) +``` diff --git a/doc/v2/howto/cmd_parameter/use_case_en.md b/doc/v2/howto/cmd_parameter/use_case_en.md new file mode 100644 index 0000000000000000000000000000000000000000..e287f0c4b9617cbc6504596512bf408c56dc10f9 --- /dev/null +++ b/doc/v2/howto/cmd_parameter/use_case_en.md @@ -0,0 +1,182 @@ +# Use Case + +## Local Training + +These command line arguments are commonly used by local training experiments, such as image classification, natural language processing, et al. + +``` +paddle train \ + --use_gpu=1/0 \ #1:GPU,0:CPU(default:true) + --config=network_config \ + --save_dir=output \ + --trainer_count=COUNT \ #(default:1) + --test_period=M \ #(default:0) + --num_passes=N \ #(defalut:100) + --log_period=K \ #(default:100) + --dot_period=1000 \ #(default:1) + #[--show_parameter_stats_period=100] \ #(default:0) + #[--saving_period_by_batches=200] \ #(default:0) +``` +`show_parameter_stats_period` and `saving_period_by_batches` are optional according to your task. + +### 1) Pass Command Argument to Network config + +`config_args` is a useful parameter to pass arguments to network config. + +``` +--config_args=generating=1,beam_size=5,layer_num=10 \ +``` +And `get_config_arg` can be used to parse these arguments in network config as follows: + +``` +generating = get_config_arg('generating', bool, False) +beam_size = get_config_arg('beam_size', int, 3) +layer_num = get_config_arg('layer_num', int, 8) +``` + +`get_config_arg`: + +``` +get_config_arg(name, type, default_value) +``` +- name: the name specified in the `--config_args` +- type: value type, bool, int, str, float etc. +- default_value: default value if not set. + +### 2) Use Model to Initialize Network + +add argument: + +``` +--init_model_path=model_path +--load_missing_parameter_strategy=rand +``` + +## Local Testing + +Method 1: + +``` +paddle train --job=test \ + --use_gpu=1/0 \ + --config=network_config \ + --trainer_count=COUNT \ + --init_model_path=model_path \ +``` +- use init\_model\_path to specify test model. +- only can test one model. 
+ +Method 2: + +``` +paddle train --job=test \ + --use_gpu=1/0 \ + --config=network_config \ + --trainer_count=COUNT \ + --model_list=model.list \ +``` +- use model_list to specify test models +- can test several models, where model.list likes: + +``` +./alexnet_pass1 +./alexnet_pass2 +``` + +Method 3: + +``` +paddle train --job=test \ + --use_gpu=1/0 \ + --config=network_config \ + --trainer_count=COUNT \ + --save_dir=model \ + --test_pass=M \ + --num_passes=N \ +``` +This way must use model path saved by Paddle like this: `model/pass-%5d`. Testing model is from M-th pass to (N-1)-th pass. For example: M=12 and N=14 will test `model/pass-00012` and `model/pass-00013`. + +## Sparse Training + +Sparse training is usually used to accelerate calculation when input is sparse data with highly dimension. For example, dictionary dimension of input data is 1 million, but one sample just have several words. In paddle, sparse matrix multiplication is used in forward propagation and sparse updating is perfomed on weight updating after backward propagation. + +### 1) Local training + +You need to set **sparse\_update=True** in network config. Check the network config documentation for more details. + +### 2) cluster training + +Add the following argument for cluster training of a sparse model. At the same time you need to set **sparse\_remote\_update=True** in network config. Check the network config documentation for more details. + +``` +--ports_num_for_sparse=1 #(default: 0) +``` + +## parallel_nn +`parallel_nn` can be set to mixed use of GPUs and CPUs to compute layers. That is to say, you can deploy network to use a GPU to compute some layers and use a CPU to compute other layers. The other way is to split layers into different GPUs, which can **reduce GPU memory** or **use parallel computation to accelerate some layers**. + +If you want to use these characteristics, you need to specify device ID in network config (denote it as deviceId) and add command line argument: + +``` +--parallel_nn=true +``` +### case 1: Mixed Use of GPU and CPU +Consider the following example: + +``` +#command line: +paddle train --use_gpu=true --parallel_nn=true trainer_count=COUNT + +default_device(0) + +fc1=fc_layer(...) +fc2=fc_layer(...) +fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1)) + +``` +- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer fc1 and fc2 are computed on the GPU. + +- device=-1: use the CPU for layer fc3. + +- trainer_count: + - trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers fc1 and fc2. Otherwise use the GPU with gpu\_id. + + - trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer fc1 and fc2. + +### Case 2: Specify Layers in Different Devices + +``` +#command line: +paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT + +#network: +fc2=fc_layer(input=l1, layer_attr=ExtraAttr(device=0), ...) +fc3=fc_layer(input=l1, layer_attr=ExtraAttr(device=1), ...) +fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...) +``` +In this case, we assume that there are 4 GPUs in one machine. + +- trainer_count=1: + - Use GPU 0 to compute layer fc2. + - Use GPU 1 to compute layer fc3. + - Use CPU to compute layer fc4. 
+ +- trainer_count=2: + - Use GPU 0 and 1 to compute layer fc2. + - Use GPU 2 and 3 to compute layer fc3. + - Use CPU to compute fc4 in two threads. + +- trainer_count=4: + - It will fail (note, we have assumed that there are 4 GPUs in machine), because argument `allow_only_one_model_on_one_gpu` is true by default. + +**Allocation of device ID when `device!=-1`**: + +``` +(deviceId + gpu_id + threadId * numLogicalDevices_) % numDevices_ + +deviceId: specified in layer. +gpu_id: 0 by default. +threadId: thread ID, range: 0,1,..., trainer_count-1 +numDevices_: device (GPU) count in machine. +numLogicalDevices_: min(max(deviceId + 1), numDevices_) +``` diff --git a/doc/v2/howto/index_cn.rst b/doc/v2/howto/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..b0268907bceb11cd53a4630c3f8b8e0424abe247 --- /dev/null +++ b/doc/v2/howto/index_cn.rst @@ -0,0 +1,37 @@ +进阶使用 +======== + +PaddlePaddle支持用户灵活地设置各种命令行参数,以实现对模型训练或预测流程的控制。使用方式请参考: + +.. toctree:: + :maxdepth: 1 + + cmd_parameter/index_cn.rst + +PaddlePaddle支持在fabric集群、MPI集群、kubernetes集群上分布式训练任务,具体环境配置和使用说明请参考: + +.. toctree:: + :maxdepth: 1 + + cluster/index_cn.rst + +PaddlePaddle提供了用于预测的C-API,关于C-API的使用,我们提供了如下指南: + +.. toctree:: + :maxdepth: 1 + + capi/index_cn.rst + +PaddlePaddle支持多种灵活和高效的循环神经网络,具体配置使用方式请参考: + +.. toctree:: + :maxdepth: 1 + + rnn/index_cn.rst + +关于如何使用内置的定时工具、nvprof 或 nvvp 来运行性能分析和调优,请参考: + +.. toctree:: + :maxdepth: 1 + + optimization/gpu_profiling_cn.rst diff --git a/doc/v2/howto/index_en.rst b/doc/v2/howto/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..35ef197f58f1f865e2cdbdebb567d5637284637a --- /dev/null +++ b/doc/v2/howto/index_en.rst @@ -0,0 +1,37 @@ +HOW TO +======== + +PaddlePaddle provides the users the ability to flexibly set various command line parameters to control the model training and inference process. Please refer to the following instructions on using PaddlePaddle: + +.. toctree:: + :maxdepth: 1 + + cmd_parameter/index_en.rst + +PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to: + +.. toctree:: + :maxdepth: 1 + + cluster/index_en.rst + +PaddlePaddle provides a C-API for inference. We provide the following guidelines for using the C-API: + +.. toctree:: + :maxdepth: 1 + + capi/index_en.rst + +PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to: + +.. toctree:: + :maxdepth: 1 + + rnn/index_en.rst + +How to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to: + +.. toctree:: + :maxdepth: 1 + + optimization/gpu_profiling_en.rst diff --git a/doc/v2/howto/optimization/gpu_profiling_cn.rst b/doc/v2/howto/optimization/gpu_profiling_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..f2396716bddd4810fa77c738d41f5482aa6d6055 --- /dev/null +++ b/doc/v2/howto/optimization/gpu_profiling_cn.rst @@ -0,0 +1,242 @@ +============ +GPU性能调优 +============ + +.. contents:: + +此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。 + +- 什么是性能分析? +- 为什么需要性能分析? +- 如何进行性能分析? +- 性能分析工具介绍 +- 详细教程 +- 性能分析小技巧 + +什么是性能分析? +================ +在软件工程的范畴里,性能分析(Profiling)是一个动态程序分析的术语,它可以指测量一个程序的空间(内存)复杂度或时间复杂度, +也可以说是某些特定指令的使用情况,或者是函数调用的频率和耗时等。通常情况下,分析得到的信息用于协助进行程序的优化。 + +简单来说,性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为,那程序分析工具是必不可少的利器。简单的性能分析,可以告诉您某个操作到底花了多长时间?而更深入的分析,甚至能解释为什么某个操作花了很长时间? + +为什么需要性能分析? 
+============================ +训练好一个深层神经网络通常要耗费非常长的时间,所以性能也就逐步变成了深度学习领域最重要的指标。 +而优化性能的首要任务,是需要了解哪些步骤拖慢了整体。 +如果某一块根本就不怎么耗时,那也就不需要急着优化性能啦! + +如何进行性能分析? +======================== +为了达到性能最优,您可以采用下面五个步骤: + +- 对代码进行性能分析 +- 找到运行慢的部分 +- 找到运行慢的原因 +- 修改成更快的版本 +- 再次对代码进行性能分析 + +Usually, processor has two key performance limits include float point throughput and +memory throughput. For GPU, it also need more parallelism to fulfill its potential. +This is why they can be so fast. + +通常情况下,处理器有两个关键性能限制:一个是浮点计算量,另一个是内存操作量。 +GPU则还需要高并行性,才能发挥其全部能力。这正是它们速度快的原因。 + +性能分析工具介绍 +====================== +就通常的GPU性能分析来说,市面上已经有NVIDIA或第三方提供的众多工具。 + +**nvprof** 是Nvidia性能分析工具, **nvvp** 则是带GUI的Nvidia可视化性能分析工具。 +在这个教程中,我们主要会介绍nvprof和nvvp。 + +:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate +above profilers. + +:code:`paddle/legacy/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。 + +.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :linenos: + +上述的代码片段包含了两种方法,您可以任意使用一个或两个来对感兴趣的代码段做性能分析。 + +1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装,可以用来计算CPU函数或cuda内核的时间消耗。 + +2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid +program crashes when CPU version of PaddlePaddle invokes them. + +3. :code:`REGISTER_GPU_PROFILER` 是一个封装对象,封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfileStop` 两个操作;同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。 + +您会在接下来的部分中获得更多的细节介绍。 + +详细教程 +============ + +内置定时器 +------------ + +如果想要启用PaddlePaddle的内置定时器,您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。 +接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。 +下面举个简单的例子: + +1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数(如高亮部分)。 + + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :emphasize-lines: 8-12,14 + :linenos: + +2. cmake配置中将 **WITH_TIMER** 打开,重新编译PaddlePaddle。 + + .. code-block:: bash + + cmake .. -DWITH_TIMER=ON + make + +3. 执行您的代码,并观察结果(如高亮部分)。 + + .. code-block:: bash + :emphasize-lines: 1,12-15 + + > ./paddle/legacy/math/tests/test_GpuProfiler + I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler + I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions + I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. + [==========] Running 1 test from 1 test case. + [----------] Global test environment set-up. + [----------] 1 test from Profiler + [ RUN ] Profiler.BilinearFwdBwd + I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im + gSizeX = 64, imgSizeY = 64" + I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751 + I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ====== + I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1 + I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ====== + I1117 11:13:42.981575 2522362816 Stat.cpp:154] -------------------------------------------------- + [ OK ] Profiler.BilinearFwdBwd (136 ms) + [----------] 1 test from Profiler (136 ms total) + + [----------] Global test environment tear-down + [==========] 1 test from 1 test case ran. (136 ms total) + [ PASSED ] 1 test. 
+ +nvprof 工具 +---------------- + +要使用命令行分析工具 **nvprof**,您按如下步骤操作即可: + +1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中(参考强调部分)。 + + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :emphasize-lines: 6-7 + :linenos: + +2. cmake中将 **WITH_PROFILER** 配置打开,重新编译PaddlePaddle。 + + .. code-block:: bash + + cmake .. -DWITH_PROFILER=ON + make + +3. 使用 **nvprof** 来分析执行文件。 + + .. code-block:: bash + + nvprof ./paddle/legacy/math/tests/test_GpuProfiler + +然后,您就能获得如下的分析结果: + +.. code-block:: bash + + ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler + ==78544== Profiling result: + Time(%) Time Calls Avg Min Max Name + 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] + 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw + 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw + 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH] + + ==78544== API calls: + Time(%) Time Calls Avg Min Max Name + 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags + 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree + 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate + 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy + 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize + 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc + 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc + 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice + 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags + 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute + 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount + 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties + 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch + 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName + 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem + 0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice + 0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate + 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute + 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart + 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall + 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError + 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument + 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet + 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount + 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion + 0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit + 0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion + + +nvvp 工具 +-------------- + +如果想使用可视化的分析器 **nvvp**,您可以导入 :code:`nvprof -o ...` 的输出,或者从工具的界面里运行您的应用。 + +**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启) + +.. image:: nvvp1.png + :align: center + :scale: 33% + +从内核函数的角度, **nvvp** 可以精确说明一个长耗时操作的具体原因。 +同时,如下图所示, **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。 + + +.. image:: nvvp2.png + :align: center + :scale: 33% + +而从应用的角度, **nvvp** 可以帮您提供一些定位性能瓶颈的建议。 +例如,下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议,为您做性能调优提供了方向。 + +.. image:: nvvp3.png + :align: center + :scale: 33% + +.. 
image:: nvvp4.png + :align: center + :scale: 33% + +性能分析小技巧 +================== + +- 开始阶段,从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。 +- 接下来可以考虑下时间线的分析。 +- 如果真想挖掘内核深处的某个秘密,您最好先确认:这一块的耗时比例真的太高,值得深入分析。 +- 可能的情况下,试着让输出的分析数据和理论值对应。 + + 1) 例如,如果我知道内核花了10ms来移动1GB数据,那我会期望分析工具统计到速度是100GB/s。 + 2) 若有不一致之处,很有可能实际应用就是没有按照您的预期情况运行。 +- 了解您的硬件:如果您的GPU理论可以达到6 TFLOPs(6万亿次浮点运算每秒),而当前已经有5.5 TFLOPs了,那估计这里的潜力就没啥好挖的了…… + +性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果! +当然,具体情况因人而异。 + +参考资料 +=========== +Jeremy Appleyard, `GPU Profiling for Deep Learning `_, 2015 diff --git a/doc/v2/howto/optimization/gpu_profiling_en.rst b/doc/v2/howto/optimization/gpu_profiling_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..6e439be9bba8935cdd65f1c131cfd3725530ec0e --- /dev/null +++ b/doc/v2/howto/optimization/gpu_profiling_en.rst @@ -0,0 +1,240 @@ +==================== +Tune GPU Performance +==================== + +.. contents:: + +This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using built-in timer, **nvprof** and **nvvp**. + +- What is profiling? +- Why we need profiling? +- How to do profiling? +- Profile tools +- Hands-on Tutorial +- Profiling tips + +What's profiling? +================= +In software engineering, profiling is a form of dynamic program analysis that measures the space (memory) or time +complexity of a program, the usage of particular instructions, or the frequency and duration of function calls. +Most commonly, profiling information serves to aid program optimization. + +Briefly, profiler is used to measure application performance. Program analysis tools are extremely important for +understanding program behavior. Simple profiling can tell you that how long does an operation take? For advanced +profiling, it can interpret why does an operation take a long time? + +Why we need profiling? +====================== +Since training deep neural network typically take a very long time to get over, performance is gradually becoming +the most important thing in deep learning field. The first step to improve performance is to understand what parts +are slow. There is no point in improving performance of a region which doesn’t take much time! + + +How to do profiling? +==================== +To achieve maximum performance, there are five steps you can take to reach your goals. + +- Profile the code +- Find the slow parts +- Work out why they’re slow +- Make them fast +- Profile the code again + +Usually, processor has two key performance limits include float point throughput and +memory throughput. For GPU, it also need more parallelism to fulfill its potential. +This is why they can be so fast. + +Profiler Tools +============== +For general GPU profiling, a bunch of tools are provided from both NVIDIA and third party. + +**nvprof** is Nvidia profiler and **nvvp** is (GUI based) Nvidia visual profiler. +In this tutorial, we will focus on nvprof and nvvp. + +:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate +above profilers. + +.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :linenos: + +The above code snippet includes two methods, you can use any of them to profile the regions of interest. + +1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both cpu functions and cuda kernels. + +2. 
:code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid +program crashes when CPU version of PaddlePaddle invokes them. + +You can find more details about how to use both of them in the next session. + +Hands-on Approach +================= + +Built-in Timer +-------------- + +To enable built-in timer in PaddlePaddle, first you have to add :code:`REGISTER_TIMER_INFO` into the regions of you interest. +Then, all information could be stamped in the console via :code:`printStatus` or :code:`printAllStatus` function. +As a simple example, consider the following: + +1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines). + + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :emphasize-lines: 8-12,14 + :linenos: + +2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle. + + .. code-block:: bash + + cmake .. -DWITH_TIMER=ON + make + +3. Execute your code and observe the results (see the emphasize-lines). + + .. code-block:: bash + :emphasize-lines: 1,12-15 + + > ./paddle/legacy/math/tests/test_GpuProfiler + I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler + I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions + I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. + [==========] Running 1 test from 1 test case. + [----------] Global test environment set-up. + [----------] 1 test from Profiler + [ RUN ] Profiler.BilinearFwdBwd + I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im + gSizeX = 64, imgSizeY = 64" + I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751 + I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ====== + I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1 + I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ====== + I1117 11:13:42.981575 2522362816 Stat.cpp:154] -------------------------------------------------- + [ OK ] Profiler.BilinearFwdBwd (136 ms) + [----------] 1 test from Profiler (136 ms total) + + [----------] Global test environment tear-down + [==========] 1 test from 1 test case ran. (136 ms total) + [ PASSED ] 1 test. + +nvprof profiler +--------------- + +To use this command line profiler **nvprof**, you can simply issue the following command: + +1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines). + + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 137-151 + :emphasize-lines: 6-7 + :linenos: + +2. Configure cmake with **WITH_PROFILER** and recompile PaddlePaddle. + + .. code-block:: bash + + cmake .. -DWITH_PROFILER=ON + make + +3. Use Nvidia profiler **nvprof** to profile the binary. + + .. code-block:: bash + + nvprof ./paddle/legacy/math/tests/test_GpuProfiler + +Then, you can get the following profiling result: + +.. 
code-block:: bash + + ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler + ==78544== Profiling result: + Time(%) Time Calls Avg Min Max Name + 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] + 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw + 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw + 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH] + + ==78544== API calls: + Time(%) Time Calls Avg Min Max Name + 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags + 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree + 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate + 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy + 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize + 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc + 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc + 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice + 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags + 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute + 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount + 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties + 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch + 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName + 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem + 0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice + 0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate + 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute + 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart + 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall + 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError + 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument + 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet + 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount + 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion + 0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit + 0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion + + +nvvp profiler +------------- + +For visual profiler **nvvp**, you can either import the output of :code:`nvprof –o ...` or +run application through GUI. + +**Note: nvvp also support CPU profiling** (Click the box in nvvp to enable profile execution on CPU). + +.. image:: nvvp1.png + :align: center + :scale: 33% + +From the perspective of kernel functions, **nvvp** can even illustrate why does an operation take a long time? +As shown in the following figure, kernel's block usage, register usage and shared memory usage from :code:`nvvp` +allow us to fully utilize all warps on the GPU. + +.. image:: nvvp2.png + :align: center + :scale: 33% + +From the perspective of application, **nvvp** can give you some suggestions to address performance bottleneck. +For instance, some advice in data movement and compute utilization from the below figure can guide you to tune performance. + +.. image:: nvvp3.png + :align: center + :scale: 33% + +.. image:: nvvp4.png + :align: center + :scale: 33% + +Profiling tips +============== + +- The **nvprof** and **nvvp** output is a very good place to start. +- The timeline is a good place to go next. +- Only dig deep into a kernel if it’s taking a significant amount of your time. +- Where possible, try to match profiler output with theory. + 1) For example, if I know I’m moving 1GB, and my kernel takes 10ms, I expect the profiler to report 100GB/s. 
+ 2) Discrepancies are likely to mean your application isn’t doing what you thought it was. +- Know your hardware: If your GPU can do 6 TFLOPs, and you’re already doing 5.5 TFLOPs, you won’t go much faster! + + +Profiling is a key step in optimization. Sometimes quite simple changes can lead to big improvements in performance. +Your mileage may vary! + +Reference +========= +Jeremy Appleyard, `GPU Profiling for Deep Learning `_, 2015 diff --git a/doc/v2/howto/optimization/nvvp1.png b/doc/v2/howto/optimization/nvvp1.png new file mode 100644 index 0000000000000000000000000000000000000000..1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77 Binary files /dev/null and b/doc/v2/howto/optimization/nvvp1.png differ diff --git a/doc/v2/howto/optimization/nvvp2.png b/doc/v2/howto/optimization/nvvp2.png new file mode 100644 index 0000000000000000000000000000000000000000..177c9db708da6863d1075f3e615f5962dbe18b29 Binary files /dev/null and b/doc/v2/howto/optimization/nvvp2.png differ diff --git a/doc/v2/howto/optimization/nvvp3.png b/doc/v2/howto/optimization/nvvp3.png new file mode 100644 index 0000000000000000000000000000000000000000..d8f393667d6569b6f1e61ffccac43fae5888b6db Binary files /dev/null and b/doc/v2/howto/optimization/nvvp3.png differ diff --git a/doc/v2/howto/optimization/nvvp4.png b/doc/v2/howto/optimization/nvvp4.png new file mode 100644 index 0000000000000000000000000000000000000000..51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01 Binary files /dev/null and b/doc/v2/howto/optimization/nvvp4.png differ diff --git a/doc/v2/howto/rnn/hierarchical_layer_cn.rst b/doc/v2/howto/rnn/hierarchical_layer_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..2f8f408b40299890da694862a7b9418cf9ff07f2 --- /dev/null +++ b/doc/v2/howto/rnn/hierarchical_layer_cn.rst @@ -0,0 +1,89 @@ +########################### +支持双层序列作为输入的Layer +########################### + +.. contents:: + +概述 +==== + +在自然语言处理任务中,序列是一种常见的数据类型。一个独立的词语,可以看作是一个非序列输入,或者,我们称之为一个0层的序列;由词语构成的句子,是一个单层序列;若干个句子构成一个段落,是一个双层的序列。 + +双层序列是一个嵌套的序列,它的每一个元素,又是一个单层的序列。这是一种非常灵活的数据组织方式,帮助我们构造一些复杂的输入信息。 + +我们可以按照如下层次定义非序列,单层序列,以及双层序列。 + ++ 0层序列:一个独立的元素,类型可以是PaddlePaddle支持的任意输入数据类型 ++ 单层序列:排成一列的多个元素,每个元素是一个0层序列,元素之间的顺序是重要的输入信息 ++ 双层序列:排成一列的多个元素,每个元素是一个单层序列,称之为双层序列的一个子序列(subseq),subseq的每个元素是一个0层序列 + +在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。 + +pooling +======== + +pooling 的使用示例如下。 + +.. code-block:: bash + + seq_pool = pooling(input=layer, + pooling_type=pooling.Max(), + agg_level=AggregateLevel.TO_SEQUENCE) + +- `pooling_type` 目前支持两种,分别是:pooling.Max()和pooling.Avg()。 + +- `agg_level=AggregateLevel.TO_NO_SEQUENCE` 时(默认值): + + - 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列 + - 输入:一个双层序列,或一个单层序列 + - 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值) + +- `agg_level=AggregateLevel.TO_SEQUENCE` 时: + + - 作用:一个双层序列经过运算变成一个单层序列 + - 输入:必须是一个双层序列 + - 输出:一个单层序列,序列的每个元素是原来双层序列每个subseq元素的平均值(或最大值) + +last_seq 和 first_seq +===================== + +last_seq 的使用示例如下(first_seq 类似)。 + +.. code-block:: bash + + last = last_seq(input=layer, + agg_level=AggregateLevel.TO_SEQUENCE) + +- `agg_level=AggregateLevel.TO_NO_SEQUENCE` 时(默认值): + + - 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列 + - 输入:一个双层序列或一个单层序列 + - 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。 + +- `agg_level=AggregateLevel.TO_SEQUENCE` 时: + - 作用:一个双层序列经过运算变成一个单层序列 + - 输入:必须是一个双层序列 + - 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。 + +expand +====== + +expand 的使用示例如下。 + +.. 
code-block:: bash + + ex = expand(input=layer1, + expand_as=layer2, + expand_level=ExpandLevel.FROM_NO_SEQUENCE) + +- `expand_level=ExpandLevel.FROM_NO_SEQUENCE` 时(默认值): + + - 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列 + - 输入:layer1必须是一个0层序列,是待扩展的数据;layer2 可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息 + - 输出:一个单层序列或一个双层序列,输出序列的类型(双层序列或单层序列)和序列中含有元素的数目同 layer2 一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝 + +- `expand_level=ExpandLevel.FROM_SEQUENCE` 时: + + - 作用:一个单层序列经过运算扩展成一个双层序列 + - 输入:layer1必须是一个单层序列,是待扩展的数据;layer2 必须是一个双层序列,提供扩展的长度信息 + - 输出:一个双层序列,序列中含有元素的数目同 layer2 一致。要求单层序列含有元素的数目(0层序列)和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个 subseq 。 diff --git a/doc/v2/howto/rnn/hierarchical_layer_en.rst b/doc/v2/howto/rnn/hierarchical_layer_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..fb668f1babb47f49b2dab6d2411565e99599d8b0 --- /dev/null +++ b/doc/v2/howto/rnn/hierarchical_layer_en.rst @@ -0,0 +1,89 @@ +########################### +Layers that Support Hierarchical Sequences as Input +########################### +  +.. contents:: +  +Overview +==== +  +A sequence is a common data type in natural language processing tasks. An independent word can be regarded as a non-sequential input or a 0-level sequence. A sentence made up of words is a single-level sequence; a number of sentences make up a paragraph, which is a double-level sequence. +  +A double-level sequence is a nested sequence where each element is a single-level sequence. This is a very flexible way of organizing data that helps us construct some complex input information. +  +We can define non-sequences, single-level sequences, and double-level sequences at the following levels. +  ++ 0-level sequence: an independent element. Its type can be any input data type supported by PaddlePaddle; ++ Single-level sequence: multiple elements arranged in a row; each element is a 0-level sequence. The order of elements is an important input information; ++ Double-level sequence: multiple elements arranged in a row; each element is a single-layer sequence called a subseq of a double-level sequence, and each element of the subseq is a 0-level sequence. +  +In PaddlePaddle, the following layers accept double-layer sequences as input and perform corresponding calculations. +  +`pooling` +======== +  +The use of pooling is as follows: +  +.. code-block:: bash +  +        Seq_pool = pooling(input=layer, +                           Pooling_type=pooling.Max(), +                           Agg_level=AggregateLevel.TO_SEQUENCE) +         +- `pooling_type` currently supports two types: pooling.Max() and pooling.Avg(). +  +- When ʻagg_level=AggregateLevel.TO_NO_SEQUENCE` (default): +  +  - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence  +  - Input: a double-level sequence or a single-level sequence +  - Output: a 0-level sequence which is the average (or maximum) of the entire input sequence (single or double) +  +- When ʻagg_level=AggregateLevel.TO_SEQUENCE`: +  +  - Effect: a double-level sequence will be transformed into a single-level sequence +  - Input: a double-level sequence +  - Output: a single-level sequence where each element of the sequence is the average (or maximum) value of each subseq element of the original double-level sequence. +  +`last_seq` and `first_seq` +===================== +  +An example of using `last_seq` is as follows (usage of `first_seq` is similar). +  +.. 
code-block:: bash +  +        Last = last_seq(input=layer, +                        Agg_level=AggregateLevel.TO_SEQUENCE) +         +- When ʻagg_level=AggregateLevel.TO_NO_SEQUENCE` (default): +  +  - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence +  - Input: a double-level sequence or a single-level sequence +  - Output: a 0-level sequence, which is the last or the first element of the input sequence (double or single level). +  +- When ʻagg_level=AggregateLevel.TO_SEQUENCE`: +  - Effect: a double-level sequence will be transformed into a single-level sequence +  - Input: a double-level sequence +  - Output: a single-layer sequence in which each element is the last (or first) element of each subseq in a double-level sequence. +  +`expand` +====== +  +The use of expand is as follows. +  +.. code-block:: bash +  +        Ex = expand(input=layer1, +                    Expand_as=layer2, +                    Expand_level=ExpandLevel.FROM_NO_SEQUENCE) +         +- When `expand_level=ExpandLevel.FROM_NO_SEQUENCE` (default): +  +  - Effect: a 0-level sequence is extended to a single-level sequence or a double-level sequence +  - Input: layer1 must be a 0-level sequence to be extended; layer2 can be a single-level sequence or a double-level sequence that provides the extended length information +  - Output: a single-level sequence or a double-level sequence; the type of the output sequence and the number of elements contained in the sequence are the same as layer2. If the output is a single-level sequence, each element of the single-level sequence will be a copy of the layer1 element. If the output is a double-level sequence, each element in the double-level sequence will be a copy of the layer1 element +  +- When `expand_level=ExpandLevel.FROM_SEQUENCE`: +  +  - Effect: a single-level sequence is extended to a double-level sequence +  - Input: layer1 must be a single-level sequence to be extended; layer2 must be a double-level sequence providing extended length information +  - Output: a double-level sequence with the same number of elements as that of layer2. It is required that the number of elements in the single-level sequence be the same as the number of subseq in the double-level sequences. The i-th element of the single-level sequence (the 0-level sequence) is expanded into a single-level sequence that constitutes the i-th subseq of the output, the double-level sequence. diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..9d6d417075485dceb1ee71f527b408aa6a6638ea --- /dev/null +++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst @@ -0,0 +1,226 @@ +.. 
_algo_hrnn_rnn_api_compare: + +##################### +单双层RNN API对比介绍 +##################### + +本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp `_\ 。 + +示例1:双层RNN,子序列间无Memory +================================ + +在双层RNN中的经典情况是将内层的每一个时间序列数据,分别进行序列操作;并且内层的序列操作之间独立无依赖,即不需要使用Memory\ 。 + +在本示例中,单层RNN和双层RNN的网络配置,都是将每一句分好词后的句子,使用LSTM作为encoder,压缩成一个向量。区别是RNN使用两层序列模型,将多句话看成一个整体同时使用encoder压缩。二者语意上完全一致。这组语义相同的示例配置如下: + +* 单层RNN\: `sequence_layer_group.conf `_ +* 双层RNN\: `sequence_nest_layer_group.conf `_ + + +读取双层序列数据 +---------------- + +首先,本示例中使用的原始数据如下\: + +- 本例中的原始数据一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。这个数据也被单层RNN网络直接使用。 + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg + :language: text + + +- 双层序列数据一共有4个样本。 每个样本间用空行分开,整体数据和原始数据完全一样。但于双层序列的LSTM来说,第一个样本同时encode两条数据成两个向量。这四条数据同时处理的句子数量为\ :code:`[2, 3, 2, 3]`\ 。 + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest + :language: text + +其次,对于两种不同的输入数据类型,不同DataProvider对比如下(`sequenceGen.py `_)\: + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py + :language: python + :lines: 21-39 + :linenos: + +- 这是普通的单层时间序列的DataProvider代码,其说明如下: + + * DataProvider共返回两个数据,分别是words和label。即上述代码中的第19行。 + + - words是原始数据中的每一句话,所对应的词表index数组。它是integer_value_sequence类型的,即整数数组。words即为这个数据中的单层时间序列。 + - label是原始数据中对于每一句话的分类标签,它是integer_value类型的。 + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py + :language: python + :lines: 42-71 + :linenos: + +- 对于同样的数据,双层时间序列的DataProvider的代码。其说明如下: + + - DataProvider共返回两组数据,分别是sentences和labels。即在双层序列的原始数据中,每一组内的所有句子和labels + - sentences是双层时间序列的数据。由于它内部包含了每组数据中的所有句子,且每个句子表示为对应的词表索引数组,因此它是integer_value_sub_sequence 类型的,即双层时间序列。 + - labels是每组内每个句子的标签,故而是一个单层时间序列。 + + +模型配置的模型配置 +------------------------------------------ + +首先,我们看一下单层RNN的配置。代码中9-15行(高亮部分)即为单层RNN序列的使用代码。这里使用了PaddlePaddle预定义好的RNN处理函数。在这个函数中,RNN对于每一个时间步通过了一个LSTM网络。 + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf + :language: python + :lines: 38-63 + :linenos: + :emphasize-lines: 9-15 + + +其次,我们看一下语义相同的双层RNN的网络配置\: + +* PaddlePaddle中的许多layer并不在意输入是否是时间序列,例如\ :code:`embedding_layer`\ 。在这些layer中,所有的操作都是针对每一个时间步来进行的。 + +* 在该配置的7-26行(高亮部分),将双层时间序列数据先变换成单层时间序列数据,再对每一个单层时间序列进行处理。 + + * 使用\ :code:`recurrent_group`\ 这个函数进行变换,在变换时需要将输入序列传入。由于我们想要的变换是双层时间序列=> 单层时间序列,所以我们需要将输入数据标记成\ :code:`SubsequenceInput`\ 。 + + * 在本例中,我们将原始数据的每一组,通过\ :code:`recurrent_group`\ 进行拆解,拆解成的每一句话再通过一个LSTM网络。这和单层RNN的配置是等价的。 + +* 与单层RNN的配置类似,我们只需要使用LSTM encode成的最后一个向量。所以对\ :code:`recurrent_group`\ 进行了\ :code:`last_seq`\ 操作。但和单层RNN不同,我们是对每一个子序列取最后一个元素,因此\ :code:`agg_level=AggregateLevel.TO_SEQUENCE`\ 。 + +* 至此,\ :code:`lstm_last`\ 便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。 + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf + :language: python + :lines: 38-64 + :linenos: + :emphasize-lines: 7-26 + +示例2:双层RNN,子序列间有Memory +================================ + +本示例意图使用单层RNN和双层RNN实现两个完全等价的全连接RNN。 + +* 对于单层RNN,输入数据为一个完整的时间序列,例如\ :code:`[4, 5, 2, 0, 9, 8, 1, 4]`\ 。 + +* 对于双层RNN,输入数据为在单层RNN数据里面,任意将一些数据组合成双层时间序列,例如\ :code:`[ [4, 5, 2], [0, 9], [8, 1, 4]]`。 + +模型配置的模型配置 +------------------ + +我们选取单双层序列配置中的不同部分,来对比分析两者语义相同的原因。 + +- 单层RNN:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全链接。 + +.. 
literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf + :language: python + :lines: 36-48 + +- 双层RNN,外层memory是一个元素: + + - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。 + - 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每个时间步都用了上一个时间步的输出结果”一致。 + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf + :language: python + :lines: 39-66 + +.. warning:: + PaddlePaddle目前只支持在每个时间步中,Memory的时间序列长度一致的情况。 + +示例3:双层RNN,输入不等长 +========================== + +.. role:: red + +.. raw:: html + + + +**输入不等长** 是指recurrent_group的多个输入序列,在每个时间步的子序列长度可以不相等。但序列输出时,需要指定与某一个输入的序列信息是一致的。使用\ :red:`targetInlink`\ 可以指定哪一个输入和输出序列信息一致,默认指定第一个输入。 + +示例3的配置分别为\ `单层不等长RNN `_\ 和\ `双层不等长RNN `_\ 。 + +示例3对于单层RNN和双层RNN数据完全相同。 + +* 对于单层RNN的数据一共有两个样本,他们分别是\ :code:`[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]`\ 和\ :code:`[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]`\ 。对于每一个单层RNN的数据,均有两组特征。 + +* 在单层数据的基础上,双层RNN数据随意加了一些隔断,例如将第一条数据转化为\ :code:`[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]`\ 。 + +* 需要注意的是PaddlePaddle目前只支持子序列数目一样的多输入双层RNN。例如本例中的两个特征,均有三个子序列。每个子序列长度可以不一致,但是子序列的数目必须一样。 + + +模型配置 +-------- + +和示例2中的配置类似,示例3的配置使用了单层RNN和双层RNN,实现两个完全等价的全连接RNN。 + +* 单层RNN\: + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py + :language: python + :lines: 42-59 + :linenos: + +* 双层RNN\ \: + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py + :language: python + :lines: 41-80 + :linenos: + +在上面代码中,单层和双层序列的使用和示例2中的示例类似,区别是同时处理了两个输入。而对于双层序列,两个输入的子序列长度也并不相同。但是,我们使用了\ :code:`targetInlink`\ 参数设置了外层\ :code:`recurrent_group`\ 的输出格式。所以外层输出的序列形状,和\ :code:`emb2`\ 的序列形状一致。 + + +词汇表 +====== + +.. _glossary_memory: + +Memory +------ + +Memory是PaddlePaddle实现RNN时候使用的一个概念。RNN即时间递归神经网络,通常要求时间步之间具有一些依赖性,即当前时间步下的神经网络依赖前一个时间步神经网络中某一个神经元输出。如下图所示。 + +.. graphviz:: src/glossary_rnn.dot + +上图中虚线的连接,即是跨越时间步的网络连接。PaddlePaddle在实现RNN的时候,将这种跨越时间步的连接用一个特殊的神经网络单元实现。这个神经网络单元就叫Memory。Memory可以缓存上一个时刻某一个神经元的输出,然后在下一个时间步输入给另一个神经元。使用Memory的RNN实现便如下图所示。 + +.. graphviz:: src/glossary_rnn_with_memory.dot + +使用这种方式,PaddlePaddle可以比较简单的判断哪些输出是应该跨越时间步的,哪些不是。 + +.. _glossary_timestep: + +时间步 +------ + +参考时间序列。 + + +.. _glossary_sequence: + +时间序列 +-------- + +时间序列(time series)是指一系列的特征数据。这些特征数据之间的顺序是有意义的。即特征的数组,而不是特征的集合。而这每一个数组元素,或者每一个系列里的特征数据,即为一个时间步(time step)。值得注意的是,时间序列、时间步的概念,并不真正的和『时间』有关。只要一系列特征数据中的『顺序』是有意义的,即为时间序列的输入。 + +举例说明,例如文本分类中,我们通常将一句话理解成一个时间序列。比如一句话中的每一个单词,会变成词表中的位置。而这一句话就可以表示成这些位置的数组。例如 :code:`[9, 2, 3, 5, 3]` 。 + +关于时间序列(time series)的更详细准确的定义,可以参考 `维基百科页面 Time series `_ 或者 `维基百科中文页面 时间序列 `_ 。 + +另外,Paddle中经常会将时间序列成为 :code:`Sequence` 。他们在Paddle的文档和API中是一个概念。 + +.. _glossary_RNN: + +RNN +--- + +RNN 在PaddlePaddle的文档中,一般表示 :code:`Recurrent neural network`,即时间递归神经网络。详细介绍可以参考 `维基百科页面 Recurrent neural network `_ 或者 `中文维基百科页面 `_ 中关于时间递归神经网络的介绍。 + +RNN 一般在PaddlePaddle中,指对于一个时间序列输入数据,每一个时间步之间的神经网络具有一定的相关性。例如,某一个神经元的一个输入为上一个时间步网络中某一个神经元的输出。或者,从每一个时间步来看,神经网络的网络结构中具有有向环结构。 + +.. 
_glossary_双层RNN: + +双层RNN +------- + +双层RNN顾名思义,即RNN之间有一次嵌套关系。输入数据整体上是一个时间序列,而对于每一个内层特征数据而言,也是一个时间序列。即二维数组,或者数组的数组这个概念。 而双层RNN是可以处理这种输入数据的网络结构。 + +例如,对于段落的文本分类,即将一段话进行分类。我们将一段话看成句子的数组,每个句子又是单词的数组。这便是一种双层RNN的输入数据。而将这个段落的每一句话用lstm编码成一个向量,再对每一句话的编码向量用lstm编码成一个段落的向量。再对这个段落向量进行分类,即为这个双层RNN的网络结构。 + diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..a4485f7b5edf21871444801230ab1ee191b1137b --- /dev/null +++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst @@ -0,0 +1,226 @@ +.. _algo_hrnn_rnn_api_compare: + +##################### +API comparision between RNN and hierarchical RNN +##################### + +This article takes PaddlePaddle's hierarchical RNN unit test as an example. We will use several examples to illestrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for single-layer, and the other for hierarchical RNN. Although the implementations are different, both the two model configurations' effects are the same. All of the examples in this article only describe the API interface of the hierarchical RNN, while we do not use this hierarchical RNN to solve practical problems. If you want to understand the use of hierarchical RNN in specific issues, please refer to \ :ref:`algo_hrnn_demo`\ 。The unit test file used in this article's example is \ `test_RecurrentGradientMachine.cpp `_\ 。 + +Example 1:Hierarchical RNN without Memory between subsequences +================================ + +The classical case in the hierarchical RNN is to perform sequence operations on each time series data in the inner layers seperately. And the sequence operations in the inner layers is independent, that is, it does not need to use Memory. + +In this example, the network configuration of single-layer RNNs and hierarchical RNNs are all to use LSTM as en encoder to compress a word-segmented sentence into a vector. The difference is that, RNN uses a hierarchical RNN model, treating multiple sentences as a whole to use encoder to compress simultaneously. They are completely consistent in their semantic meanings. This pair of semantically identical example configurations is as follows: + +* RNN\: `sequence_layer_group.conf `_ +* Hierarchical RNN\: `sequence_nest_layer_group.conf `_ + + +Reading hierarchical sequence data +---------------- + +Firstly, the original data in this example is as follows \: + +- The original data in this example has 10 samples. Each of the sample includes two components: a lable(all 2 here), and a word-segmented sentence. This data is used by single RNN as well. + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg + :language: text + + +- The data for hierarchical RNN has 4 samples. Every sample is seperated by a blank line, while the content of the data is the same as the original data. But as for hierarchical LSTM, the first sample will encode two sentences into two vectors simultaneously. The sentence count dealed simultaneously by this 4 samples are \ :code:`[2, 3, 2, 3]`\ . + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest + :language: text + +Secondly, as for these two types of different input data formats, the contrast of different DataProviders are as follows (`sequenceGen.py `_)\: + +.. 
literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py + :language: python + :lines: 21-39 + :linenos: + +- This is the DataProvider code for an ordinary single-layer time series. Its description is as follows: + + * DataProvider returns two parts, that are "words" and "label",as line 19 in the above code. + + - "words" is a list of word table indices corresponding to each word in the sentence in the original data. Its data type is integer_value_sequence, that is integer list. So, "words" is a singler-layer time series in the data. + - "label" is the categorical label of each sentence, whose data type is integer_value. + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py + :language: python + :lines: 42-71 + :linenos: + +- As for the same data, the DataProvider code for hierarchical time series. Its description is as follows: + + - DataProvider returns two lists of data, that are "sentences" and "labels", corresponding to the sentences and labels in each group in the original data of hierarchical time series. + - "sentences" comes from the hierarchical time series original data. As it contains every sentences in each group internally, and each sentences are represented by a list of word table indices, so its data type is integer_value_sub_sequence, which is hierarchical time series. + - "labels" is the categorical lable of each sentence, so it is a sigle-layer time series. + + +Model configuration +------------------------------------------ + +Firstly, let's look at the configuration of single-layer RNN. The hightlighted part of line 9 to line 15 is the usage of single-layer RNN. Here we use the pre-defined RNN process function in PaddlePaddle. In this function, for each time step, RNN passes through an LSTM network. + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf + :language: python + :lines: 38-63 + :linenos: + :emphasize-lines: 9-15 + + +Secondly, let's look at the model configuration of hierarchical RNN which has the same semantic meaning. \: + +* Most layers in PaddlePaddle do not care about whether the input is time series or not, e.g. \ :code:`embedding_layer`\ . In these layers, every operation is processed on each time step. + +* In the hightlighted part of line 7 to line 26 of this configuration, we transform the hierarchical time series data into single-layer time series data, then process each single-layer time series. + + * Use the function \ :code:`recurrent_group`\ to transform. Input sequences need to be passed in when transforming. As we want to transform hierarchical time series into single-layer sequences, we need to lable the input data as \ :code:`SubsequenceInput`\ . + + * In this example, we disassemble every group of the original data into sentences using \ :code:`recurrent_group`\ . Each of the disassembled sentences passes through an LSTM network. This is equivalent to single-layer RNN configuration. + +* Similar to single-layer RNN configuration, we only use the last vector after the encode of LSTM. So we use the operation of \ :code:`last_seq`\ to \ :code:`recurrent_group`\ . But unlike single-layer RNN, we use the last element of every subsequence, so we need to set \ :code:`agg_level=AggregateLevel.TO_SEQUENCE`\ . + +* Till now, \ :code:`lstm_last`\ has the same result as \ :code:`lstm_last`\ in single-layer RNN configuration. + +.. 
literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf + :language: python + :lines: 38-64 + :linenos: + :emphasize-lines: 7-26 + +Example 2:Hierarchical RNN with Memory between subsequences +================================ + +This example is intended to implement two fully-equivalent fully-connected RNNs using single-layer RNN and hierarchical RNN. + +* As for single-layer RNN, input is a full time series, e.g. \ :code:`[4, 5, 2, 0, 9, 8, 1, 4]`\ . + +* As for hierarchical RNN, input is a hierarchical time series which elements are arbitrarily combination of data in single-layer RNN, e.g. \ :code:`[ [4, 5, 2], [0, 9], [8, 1, 4]]`. + +model configuration +------------------ + +We select the different parts between single-layer RNN and hierarchical RNN configurations, to compare and analyze the reason why they have same semantic meanings. + +- single-layer RNN:passes through a simple recurrent_group. For each time step, the current input y and the last time step's output rnn_state pass through a fully-connected layer. + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf + :language: python + :lines: 36-48 + +- hierarchical RNN, the outer layer's memory is an element. + + - The recurrent_group of inner layer's inner_step is nearly the same as single-layer sequence, except for the case of boot_layer=outer_mem, which means using the outer layer's outer_mem as the initial state for the inner layer's memory. In the outer layer's out_step, outer_mem is the last vector of a subsequence, that is, the whole hierarchical group uses the last vector of the previous subsequence as the initial state for the next subsequence's memory. + - From the aspect of the input data, sentences from single-layer and hierarchical RNN are the same. The only difference is that, hierarchical RNN disassembes the sequence into subsequences. So in the hierarchical RNN configuration, we must use the last element of the previous subsequence as a boot_layer for the memory of the next subsequence, so that it makes no difference with "every time step uses the output of last time step" in the sigle-layer RNN configuration. + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf + :language: python + :lines: 39-66 + +.. warning:: + Currently PaddlePaddle only supports the case that the lengths of the time series of Memory in each time step are the same. + +Example 3:hierarchical RNN with unequal length inputs +========================== + +.. role:: red + +.. raw:: html + + + +**unequal length inputs** means in the multiple input sequences of recurrent_group, the lengths of subsequences can be unequal. But the output of the sequence, needs to be consistent with one of the input sequences. Using \ :red:`targetInlink`\ can help you specify which of the input sequences and the output sequence can be consistent, by default is the first input. + +The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs `_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs `_\ . + +The data for the configurations of Example 3's single-layer RNN and hierarchical RNN are exactly the same. + +* For the single-layer RNN, the data has two samples, which are \ :code:`[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]`\ and \ :code:`[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]`\ . Each of the data for the single-layer RNN has two group of features. + +* On the basis of the single-layer's data, hierarchical RNN's data randomly adds some partitions. 
For example, the first sample is transformed to \ :code:`[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]`\ . + +* You need to pay attention that, PaddlePaddle only supports multiple input hierarchical RNNs that have same amount of subsequences currently. In this example, the two features both have 3 subsequences. Although the length of each subsequence can be different, the amount of subsequences should be the same. + + +model configuration +-------- + +Similar to Example 2's configuration, Example 3's configuration uses single-layer and hierarchical RNN to implement 2 fully-equivalent fully-connected RNNs. + +* single-layer RNN\: + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py + :language: python + :lines: 42-59 + :linenos: + +* hierarchical RNN\ \: + +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py + :language: python + :lines: 41-80 + :linenos: + +In the above code, the usage of single-layer and hierarchical RNNs are similar to Example 2, which difference is that it processes 2 inputs simultaneously. As for the hierarchical RNN, the lengths of the 2 input's subsequences are not equal. But we use the parameter \ :code:`targetInlink` \ to set the outper layer's \ :code:`recurrent_group` \ 's output format, so the shape of outer layer's output is the same as the shape of \ :code:`emb2`\ . + + +Glossary +====== + +.. _glossary_memory: + +Memory +------ + +Memory is a concept when PaddlePaddle is implementing RNN. RNN, recurrent neural network, usually requires some dependency between time steps, that is, the neural network in current time step depends on one of the neurons in the neural network in previous time steps, as the following figure shows: + +.. graphviz:: src/glossary_rnn.dot + +The dotted connections in the figure, is the network connections across time steps. When PaddlePaddle is implementing RNN, this connection accross time steps is implemented using a special neural network unit, called Memory. Memory can cache the output of one of the neurons in previous time step, then can be passed to another neuron in next time step. The implementation of an RNN using Memory is as follows: + +.. graphviz:: src/glossary_rnn_with_memory.dot + +With this method, PaddlePaddle can easily determine which outputs should cross time steps, and which should not. + +.. _glossary_timestep: + +time step +------ + +refers to time series + + +.. _glossary_sequence: + +time series +-------- + +Time series is a series of featured data. The order among these featured data is meaningful. So it is a list of features, not a set of features. As for each element of this list, or the featured data in each series, is called a time step. It must be noted that, the concepts of time series and time steps, are not necessarrily related to "time". As long as the "order" in a series of featured data is meaningful, it can be the input of time series. + +For example, in text classification task, we regard a sentence as a time series. So, each word in the sentence can become the index of the word in the word table. So this sentence can be represented as a list of these indices, e.g.:code:`[9, 2, 3, 5, 3]` . + +For a more detailed and accurate definition of the time series, please refer to `Wikipedia of Time series `_ or `Chinese Wikipedia of time series `_ . + +In additioin, Paddle always calls time series as :code:`Sequence` . They are a same concept in Paddle's documentations and APIs. + +.. 
_glossary_RNN: + +RNN +--- + +In PaddlePaddle's documentations, RNN is usually represented as :code:`Recurrent neural network` . For more information, please refer to `Wikipedia Recurrent neural network `_ or `Chinese Wikipedia `_ . + +In PaddlePaddle, RNN usually means, for the input data of a time series, the neural network between each time steps has a certain relevance. For example, the input of a certain neuron is the output of a certain neuron in the neural network of the last time step. Or, as for each time step, the network structure of the neural network has a directed ring structure. + +.. _glossary_hierarchical_RNN: + +hierarchical RNN +------- + +Hierarchical RNN, as the name suggests, means there is a nested relationship in RNNs. The input data is a time series, but for each of the inner featured data, it is also a time series, namely 2-dimentional array, or, array of array. Hierarchical RNN is a neural network that can process this type of input data. + +For example, the task of text classification of a paragragh, meaning to classify a paragraph of sentences. We can treat a paragraph as an array of sentences, and each sentence is an array of words. This is a type of the input data for the hierarchical RNN. We encode each sentence of this paragraph into a vector using LSTM, then encode each of the encoded vectors into a vector of this paragraph using LSTM. Finally we use this paragraph vector perform classification, which is the neural network structure of this hierarchical RNN. + diff --git a/doc/v2/howto/rnn/index_cn.rst b/doc/v2/howto/rnn/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..2032fb9e296ab024c68da1348064580c8c88d5be --- /dev/null +++ b/doc/v2/howto/rnn/index_cn.rst @@ -0,0 +1,34 @@ +RNN模型 +=========== +循环神经网络(RNN)是对序列数据建模的重要工具。PaddlePaddle提供了灵活的接口以支持复杂循环神经网络的构建。 +这里将分为以下四个部分详细介绍如何使用PaddlePaddle搭建循环神经网络。 + +第一部分由浅入深的展示了使用PaddlePaddle搭建循环神经网络的全貌:首先以简单的循环神经网络(vanilla RNN)为例, +说明如何封装配置循环神经网络组件;然后更进一步的通过序列到序列(sequence to sequence)模型,逐步讲解如何构建完整而复杂的循环神经网络模型。 + +.. toctree:: + :maxdepth: 1 + + rnn_config_cn.rst + +Recurrent Group是PaddlePaddle中实现复杂循环神经网络的关键,第二部分阐述了PaddlePaddle中Recurrent Group的相关概念和原理, +对Recurrent Group接口进行了详细说明。另外,对双层RNN(对应的输入为双层序列)及Recurrent Group在其中的使用进行了介绍。 + +.. toctree:: + :maxdepth: 1 + + recurrent_group_cn.md + +第三部分对双层序列进行了解释说明,列出了PaddlePaddle中支持双层序列作为输入的Layer,并对其使用进行了逐一介绍。 + +.. toctree:: + :maxdepth: 1 + + hierarchical_layer_cn.rst + +第四部分以PaddlePaddle的双层RNN单元测试中的网络配置为示例,辅以效果相同的单层RNN网络配置作为对比,讲解了多种情况下双层RNN的使用。 + +.. toctree:: + :maxdepth: 1 + + hrnn_rnn_api_compare_cn.rst diff --git a/doc/v2/howto/rnn/index_en.rst b/doc/v2/howto/rnn/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..6e8b5c61b23ca2725dc0c9761c8dd4165033973c --- /dev/null +++ b/doc/v2/howto/rnn/index_en.rst @@ -0,0 +1,32 @@ +RNN Models +========== +Recurrent neural networks(RNN) are an important tool to model sequential data. PaddlePaddle provides flexible interface for building complex recurrent neural network. We will demonstrate how to use PaddlePaddle to build RNN models in the following 4 parts. + +In the first part, we will guide you how to configure recurrent neural network in PaddlePaddle from simple to complex. First, we will use a vanilla recurrent neural network as an example to show how to configure recurrent neural network architecture. Then We will use the sequence to sequence model as an example to demonstrate how you can configure complex recurrent neural network models gradually. + +.. 
toctree:: + :maxdepth: 1 + + rnn_config_en.rst + +Recurrent Group is the key unit to build complex recurrent neural network models. The second part describes related concepts and Basic principles of Recurrent Group, and give a detailed description of Recurrent Group API interface. In addition, it also introduces Sequence-level RNN(hierarchical sequence as input) and the usage of Recurrent Group in it. + +.. toctree:: + :maxdepth: 1 + + recurrent_group_en.md + +In the third part, two-level sequence is demonstrated briefly and then layers supporting two-level sequence as input are listed and described respectively. + +.. toctree:: + :maxdepth: 1 + + hierarchical_layer_en.rst + +In the last part, the unit test of hierarchical RNN is presented as an example to explain how to use hierarchical RNN. We will use two-level sequence RNN and single-layer sequence RNN which have same effects with former as the network configuration seperately in unit test. + +.. toctree:: + :maxdepth: 1 + + hrnn_rnn_api_compare_en.rst + diff --git a/doc/v2/howto/rnn/recurrent_group_cn.md b/doc/v2/howto/rnn/recurrent_group_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..06dc9e089ab2b2b926fcb1bd034262f2c846f06f --- /dev/null +++ b/doc/v2/howto/rnn/recurrent_group_cn.md @@ -0,0 +1,96 @@ +# Recurrent Group教程 + +## 概述 + +序列数据是自然语言处理任务面对的一种主要输入数据类型。 + +一句话是由词语构成的序列,多句话进一步构成了段落。因此,段落可以看作是一个嵌套的双层的序列,这个序列的每个元素又是一个序列。 + +双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式,帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入,我们可以设计搭建一个灵活的、层次化的RNN,分别从词语和句子级别编码输入数据,同时也能够引入更加复杂的记忆机制,更好地完成一些复杂的语言理解任务。 + +在PaddlePaddle中,`recurrent_group`是一种任意复杂的RNN单元,用户只需定义RNN在一个时间步内完成的计算,PaddlePaddle负责完成信息和误差在时间序列上的传播。 + +更进一步,`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算,最终实现一个层次化的复杂RNN。 + +目前,在PaddlePaddle中,能够对双向序列进行处理的有`recurrent_group`和部分Layer,具体可参考文档:支持双层序列作为输入的Layer。 + +## 相关概念 + +### 基本原理 +`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算,PaddlePaddle负责完成信息和梯度在时间序列上的传播。 + +PaddlePaddle中,`recurrent_group`的一个简单调用如下: + +``` python +recurrent_group(step, input, reverse) +``` +- step:一个可调用的函数,定义一个时间步之内RNN单元完成的计算 +- input:输入,必须是一个单层序列,或者一个双层序列 +- reverse:是否以逆序处理输入序列 + +使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer,完成任意的运算逻辑。`recurrent_group` 的输入(即input)会成为step函数的输入,由于step 函数只关注于RNN一个时间步之内的计算,在这里`recurrent_group`替我们完成了原始输入数据的拆分。 + +### 输入 +`recurrent_group`处理的输入序列主要分为以下三种类型: + +- **数据输入**:一个双层序列进入`recurrent_group`会被拆解为一个单层序列,一个单层序列进入`recurrent_group`会被拆解为非序列,然后交给step函数,这一过程对用户是完全透明的。可以有以下两种:1)通过data_layer拿到的用户输入;2)其它layer的输出。 + +- **只读Memory输入**:`StaticInput` 定义了一个只读的Memory,由`StaticInput`指定的输入不会被`recurrent_group`拆解,`recurrent_group` 循环展开的每个时间步总是能够引用所有输入,可以是一个非序列,或者一个单层序列。 + +- **序列生成任务的输入**:`GeneratedInput`只用于在序列生成任务中指定输入数据。 + +### 输入示例 + +序列生成任务大多遵循encoder-decoer架构,encoder和decoder可以是能够处理序列的任意神经网络单元,而RNN是最流行的选择。 + +给定encoder输出和当前词,decoder每次预测产生下一个最可能的词语。在这种结构中,decoder接受两个输入: + +- 要生成的目标序列:是decoder的数据输入,也是decoder循环展开的依据,`recurrent_group`会对这类输入进行拆解。 + +- encoder输出,可以是一个非序列,或者一个单层序列:是一个unbounded memory,decoder循环展开的每一个时间步会引用全部结果,不应该被拆解,这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turning Machine](https://arxiv.org/abs/1410.5401)。 + +在序列生成任务中,decoder RNN总是引用上一时刻预测出的词的词向量,作为当前时刻输入。`GeneratedInput`自动完成这一过程。 + +### 输出 +`step`函数必须返回一个或多个Layer的输出,这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中,`recurrent_group` 会将每个时间步的输出拼接,这个过程对用户也是透明的。 + +### memory 
+memory只能在`recurrent_group`中定义和使用。memory不能独立存在,必须指向一个PaddlePaddle定义的Layer。引用memory得到这layer上一时刻输出,因此,可以将memory理解为一个时延操作。 + +可以显示地指定一个layer的输出用于初始化memory。不指定时,memory默认初始化为0。 + +## 双层RNN介绍 +`recurrent_group`帮助我们完成对输入序列的拆分,对输出的合并,以及计算逻辑在序列上的循环展开。 + +利用这种特性,两个嵌套的`recurrent_group`能够处理双层序列,实现词语和句子两个级别的双层RNN结构。 + +- 单层(word-level)RNN:每个状态(state)对应一个词(word)。 +- 双层(sequence-level)RNN:一个双层RNN由多个单层RNN组成,每个单层RNN(即双层RNN的每个状态)对应一个子句(subseq)。 + +为了描述方便,下文以NLP任务为例,将含有子句(subseq)的段落定义为一个双层序列,将含有词语的句子定义为一个单层序列,那么0层序列即为一个词语。 + +## 双层RNN的使用 + +### 训练流程的使用方法 +使用 `recurrent_group`需要遵循以下约定: + +- **单进单出**:输入和输出都是单层序列。 + - 如果有多个输入,不同输入序列含有的词语数必须严格相等。 + - 输出一个单层序列,输出序列的词语数和输入序列一致。 + - memory:在step函数中定义 memory指向一个layer,通过引用memory得到这个layer上一个时刻输出,形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory,每个时间步之内的运算是独立的。 + - boot_layer:memory的初始状态,默认初始状为0,memory的is_seq参数必须为false。 + +- **双进双出**:输入和输出都是双层序列。 + - 如果有多个输入序列,不同输入含有的子句(subseq)数必须严格相等,但子句含有的词语数可以不相等。 + - 输出一个双层序列,子句(subseq)数、子句的单词数和指定的一个输入序列一致,默认为第一个输入。 + - memory:在step函数中定义memory,指向一个layer,通过引用memory得到这个layer上一个时刻的输出,形成recurrent连接。定义在外层`recurrent_group` step函数中的memory,能够记录上一个subseq 的状态,可以是一个单层序列(只作为read-only memory),也可以是一个词语。如果没有定义memory,那么 subseq 之间的运算是独立的。 + - boot_layer:memory 初始状态,可以是一个单层序列(只作为read-only memory)或一个向量。默认不设置,即初始状态为0。 + +- **双进单出**:目前还未支持,会报错"In hierachical RNN, all out links should be from sequences now"。 + + +### 生成流程的使用方法 +使用`beam_search`需要遵循以下约定: + +- 单层RNN:从一个word生成下一个word。 +- 双层RNN:即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看,也不存在一个subseq直接生成下一个subseq的情况。 diff --git a/doc/v2/howto/rnn/recurrent_group_en.md b/doc/v2/howto/rnn/recurrent_group_en.md new file mode 100644 index 0000000000000000000000000000000000000000..de6b60f29eb97029a54609cd2194bb7faf3ffec5 --- /dev/null +++ b/doc/v2/howto/rnn/recurrent_group_en.md @@ -0,0 +1,96 @@ +# Recurrent Group Tutorial + +## Overview + +Sequential data is common in natural language processing. + +A sentence is a sequence of words and many sentences form a paragraph further. Therefore, a paragraph can be viewed as a nested sequence with two level, where each element of the sequence is another sequence. That is to say, sequential data could be recursive. An example of two-level recursive sequential data is that an article is composed of a sequence of sentences, and each sentence a sequence of words. + +PaddlePaddle and PaddlePaddle v2 support two-level recursive sequential data. The two-level sequence is a very flexible data, which helps us to better describe more complex language data such as discribing paragraphs and several rounds of dialogues. Based on two-level sequence input, we can design and build a flexible, hierarchical RNN model that encodes input data from the word and sentence level. For the support of arbitrary levels, please refer to PaddlePaddle Fluid. + +In PaddlePaddle, `recurrent_group` is an arbitrarily complex RNN unit. The user only needs to define the calculation that the RNN will complete in one time step. PaddlePaddle is responsible for the propagation of information and error in time series. + +Furthermore, `recurrent_group` can also be extended to handle two-level sequence. By defining two nested `recurrent_group` operations at the clause level and the word level respectively, a hierarchical and complex RNN is finally achieved. + +Currently, in the PaddlePaddle, there are `recurrent_group` and some Layers that can process bidirectional sequences. For details, refer to the document: Layers for supporting double-layer sequences as input. 
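+
+To make this two-level organization concrete before diving into the API, here is a minimal, framework-free Python sketch. The toy vocabulary and variable names are invented for illustration only; they are not part of PaddlePaddle.
+
+``` python
+# A toy vocabulary; a real model would build this from its training corpus.
+word_dict = {"<unk>": 0, "paddle": 1, "supports": 2, "nested": 3,
+             "sequences": 4, "this": 5, "is": 6, "flexible": 7}
+
+def to_ids(sentence):
+    """Map a whitespace-separated sentence to a sequence of word ids."""
+    return [word_dict.get(w, word_dict["<unk>"]) for w in sentence.split()]
+
+# Single-level sequence: one sentence -> a list of word ids.
+sentence = to_ids("paddle supports nested sequences")
+
+# Two-level sequence: one paragraph -> a list of sentences,
+# i.e. a list of lists of word ids (one subseq per sentence).
+paragraph = [to_ids("paddle supports nested sequences"),
+             to_ids("this is flexible")]
+
+print(sentence)   # [1, 2, 3, 4]
+print(paragraph)  # [[1, 2, 3, 4], [5, 6, 7]]
+```
+
+An outer `recurrent_group` would step over the subsequences of such a paragraph (one sentence at a time), while a nested inner `recurrent_group` steps over the word ids inside each sentence; this is exactly the hierarchical RNN structure described in the following sections.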
+ +## Related Concepts + +### Basic Principle +`recurrent_group` is an arbitrarily complex RNN unit supported by PaddlePaddle. The user only needs to focus on the calculations that the RNN is designed to complete within a single time step. The PaddlePaddle is responsible for completing the propagation of information and gradients over time. + +In PaddlePaddle, a simple call to `recurrent_group` is as follows: + +``` python +recurrent_group(step, input, reverse) +``` +- step: A callable function that defines the calculations completed by the RNN unit within a time step +- input: The input must be a single-layer sequence or a double-layer sequence +- reverse: Whether to process the input sequence in reverse order + +The core of using `recurrent_group` is to design the logic of the step function. The step function can be freely combined with various layers supported by PaddlePaddle to complete arbitrary arithmetic logic. The input of `recurrent_group` (input) becomes the input of the step function. Since the step function only focuses on the calculation within one time step of RNN, here `recurrent_group` completes the splitting of the original input data for us. + +### Input +The input sequence processed by `recurrent_group` is mainly divided into the following three types: + +- **Input Data**: When putting a two-level sequence into `recurrent_group`, it will be disassembled into a single-level sequence. When putting a single-level sequence into `recurrent_group`, it will be disassembled into a non-sequence and then passed to the step function. This process is completely transparent to the user. There are two possible types: 1) User input via data_layer; 2) Output from other layers. + +- **Read-only Memory Input**: `StaticInput` defines a read-only Memory. The input specified by `StaticInput` will not be disassembled by `recurrent_group`, and each time step of the `recurrent_group` loop will always be able to reference all inputs. It may be a non-sequence or a single-layer sequence. + +- **Input of Sequence Generation Task**: `GeneratedInput` is only used to specify input data in a sequence generation task. + +### Input Example + +Sequence generation tasks mostly follow the encoder-decoer architecture. The encoder and decoder can be arbitrary neural network units capable of processing sequences and RNN is the most popular choice. + +Given the encoder output and the current word, the decoder predicts the next most likely word each time. In this structure, the decoder accepts two inputs: + +- Target sequence to be generated: a input of the decoder and the basis of the decoder loop. `recurrent_group` will disassemble this input type. + +- Encoder output, an non-sequencce or single-sequence: a unbounded memory. Each time step in the decoder loop will reference the entire result and should not be disassembled. This type of input must be specified via `StaticInput`. For more discussion on Unbounded Memory, please refer to the paper [Neural Turning Machine](https://arxiv.org/abs/1410.5401). + +In a sequence generation task, the decoder RNN always refers to the word vector of the word predicted at the previous moment as the current time input. `GeneratedInput` will automate this process. + +### Output +The `step` function must return the output of one or more Layers. The output of this Layer will be the final output of the entire `recurrent_group`. In the output process, `recurrent_group` will concatenate the output of each time step, which is also transparent to the user. 
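+
+Before turning to memory, the following plain-Python sketch summarizes the contract described in the Input and Output sections above. It is not PaddlePaddle code and the names are invented for illustration: `recurrent_group` splits the input into time steps, calls the user-defined step function once per step, and concatenates the per-step results into the output sequence.
+
+``` python
+def recurrent_group_sim(step, input_seq, reverse=False):
+    """Framework-free sketch of the recurrent_group contract:
+    split the input, call `step` once per time step, concatenate the outputs."""
+    steps = reversed(input_seq) if reverse else input_seq
+    outputs = []
+    for ipt in steps:             # the splitting recurrent_group does for you
+        outputs.append(step(ipt))
+    return outputs                # per-step outputs joined into a sequence
+
+# A toy step function: it only ever sees a single time step.
+# The running sum stands in for the state a memory would carry in the real API.
+state = {"sum": 0}
+def step(x):
+    state["sum"] += x
+    return state["sum"]
+
+print(recurrent_group_sim(step, [1, 2, 3, 4]))  # [1, 3, 6, 10]
+```
+
+The running sum in this toy step function plays the role that Memory plays in the real API: it is the only way information crosses from one time step to the next, which is the topic of the next section.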
+ +### Memory +Memory can only be defined and used in `recurrent_group`. Memory cannot exist independently and must point to a layer defined by PaddlePaddle. Memory is referenced to get a momentary output from this layer, so memory can be interpreted as a delay operation. + +The user can explicitly specify the output of a layer to initialize the memory. When not specified, memory is initialized to 0 by default. + +## Sequence-level RNN Introduction + +`recurrent_group` helps us to split the input sequence, merge the output, and loop through the sequence of computational logic. + +Using this feature, the two nested `recurrent_group` can handle the nested two-level sequences, implementing sequence-level RNN structures at both the word and sentence levels. + +- Word-level RNN: each state corresponds to a word. +- Sequence-level RNN: a sequence-layer RNN consists of multiple word-layer RNNs. Each word-layer RNN (ie, each state of a sequence-layer RNN) has a subsequence. + +For convenience of description, the following takes the NLP task as an example. A paragraph containing a subsequence is defined as a two-level sequence, and a sentence containing a word is defined as a single-layer sequence. Then, the zero-level sequence is a word. + +## Usage of Sequence-level RNN + +### Usage of Training Process +Using `recurrent_group` requires the following conventions: + +- **Single-input Single-output**: Both input and output are single layer sequences. + - If there are multiple inputs, the number of words in different input sequences must be exactly equal. + - A single-layer sequence is output, and the number of words in the output sequence is the same as the input sequence. + - memory: define memory to point to a layer in the step function, get a moment output from this layer by referencing memory to form a recurrent connection. The is_seq parameter of memory must be false. If memory is not defined, the operations within each time step are independent. + - boot_layer: the initial state of memory, set 0 by default. is_seq in memory must be false. + +- **Double-input Double-output**: Both input and output are two-level sequence. + - If there are multiple input sequences, the number of subsequence contained in different inputs must be strictly equal, but the number of words in the subsequence may not be equal. + - output a two-level sequence. The number of subsequence and the number of words are the same as the specified input sequence and the first input is default. + - memory: defining memory in the step function, pointing to a layer, by referring to the memory to get the output of this layer at a time, forming a recurrent connection. The memory defined in the outer `recurrent_group` step function can record the state of the previous subsequence, either as a single-level sequence (only as read-only memory) or as a word. If memory is not defined, the operations between subsequence are independent. + - boot_layer: the initial state of memory. It is either a single-level sequence (only as read-only memory) or a vector. The default is not set, that is, the initial state is 0. + +- **Double-input Single-output**: not support for now, and output the error with "In hierachical RNN, all out links should be from sequences now". + +### Usage of Generation Process +Using `beam_search` need follow those conventions: + +- Word-level RNN: generate the next word from a word. +- Sequence-level RNN: the single-layer RNN generated subsequence is concatenated into a new double-layer sequence. 
Semantically, there is no case where a subsequence generates the next subseq directly. diff --git a/doc/v2/howto/rnn/rnn_config_cn.rst b/doc/v2/howto/rnn/rnn_config_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..63fa161fafed0f3a8ec8799af21304cbec62d813 --- /dev/null +++ b/doc/v2/howto/rnn/rnn_config_cn.rst @@ -0,0 +1,261 @@ +RNN配置 +======== + +本教程将指导你如何在 PaddlePaddle +中配置循环神经网络(RNN)。PaddlePaddle +高度支持灵活和高效的循环神经网络配置。 在本教程中,您将了解如何: + +- 配置循环神经网络架构。 +- 使用学习完成的循环神经网络模型生成序列。 + +我们将使用 vanilla 循环神经网络和 sequence to sequence +模型来指导你完成这些步骤。sequence to sequence +模型的代码可以在 `book/08.machine_translation `_ 找到。 +wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py `_ 。 + +配置循环神经网络架构 +-------------------- + +简单门控循环神经网络(Gated Recurrent Neural Network) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。 + +.. image:: src/bi_lstm.jpg + :align: center + +一般来说,循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。 + +.. math:: + + x_{t+1} = f_x(x_t), y_t = f_y(x_t) + +其中 :math:`f_x(.)` 称为\ **单步函数**\ (即单时间步执行的函数,step +function),而 :math:`f_y(.)` 称为\ **输出函数**\ 。在 vanilla +循环神经网络中,单步函数和输出函数都非常简单。然而,PaddlePaddle +可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to +sequence +模型演示如何配置复杂的循环神经网络模型。在本节中,我们将使用简单的 +vanilla +循环神经网络作为使用\ ``recurrent_group``\ 配置简单循环神经网络的例子。 +注意,如果你只需要使用简单的RNN,GRU或LSTM,那么推荐使用\ ``grumemory``\ 和\ ``lstmemory``\ ,因为它们的计算效率比\ ``recurrent_group``\ 更高。 + +对于 vanilla RNN,在每个时间步长,\ **单步函数**\ 为: + +.. math:: + + x_{t+1} = W_x x_t + W_i I_t + b + +其中 :math:`x_t` 是RNN状态,并且 :math:`I_t` 是输入,:math:`W_x` 和 +:math:`W_i` 分别是RNN状态和输入的变换矩阵。:math:`b` 是偏差。它的\ **输出函数**\ 只需要 :math:`x_t` 作为输出。 + +``recurrent_group``\ 是构建循环神经网络的最重要的工具。 +它定义了\ **单步函数**\ ,\ **输出函数**\ 和循环神经网络的输入。注意,这个函数的\ ``step``\ 参数需要实现\ ``step function``\ (单步函数)和\ ``output function``\ (输出函数): + +.. code:: python + + def simple_rnn(input, + size=None, + name=None, + reverse=False, + rnn_bias_attr=None, + act=None, + rnn_layer_attr=None): + def __rnn_step__(ipt): + out_mem = paddle.layer.memory(name=name, size=size) + rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt), + paddle.layer.full_matrix_projection(input=out_mem)], + name = name, + bias_attr = rnn_bias_attr, + act = act, + layer_attr = rnn_layer_attr, + size = size) + return rnn_out + return paddle.layer.recurrent_group(name='%s_recurrent_group' % name, + step=__rnn_step__, + reverse=reverse, + input=input) + +PaddlePaddle +使用“Memory”(记忆模块)实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。 +Memory是在单步函数中循环使用的状态,例如 :math:`x_{t+1} = f_x(x_t)` 。 +一个Memory包含\ **输出**\ 和\ **输入**\ 。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有\ **boot +layer(引导层)**\ ,其输出被用作Memory的初始值。 +在我们的例子中,门控循环单元的输出被用作输出Memory。请注意,\ ``rnn_out``\ 层的名称与\ ``out_mem``\ 的名称相同。这意味着\ ``rnn_out`` +(*x*\ \ *t* + 1)的输出被用作\ ``out_mem``\ Memory的\ **输出**\ 。 + +Memory也可以是序列。在这种情况下,在每个时间步中,我们有一个序列作为循环神经网络的状态。这在构造非常复杂的循环神经网络时是有用的。 +其他高级功能包括定义多个Memory,以及使用子序列来定义分级循环神经网络架构。 + +我们在函数的结尾返回\ ``rnn_out``\ 。 这意味着 ``rnn_out`` +层的输出被用作门控循环神经网络的\ **输出**\ 函数。 + +Sequence to Sequence Model with Attention +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +我们将使用 sequence to sequence model with attention +作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。 + +.. 
image:: src/encoder-decoder-attention-model.png + :align: center + +在这个模型中,源序列 :math:`S = \{s_1, \dots, s_T\}` +用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态 +:math:`H_S = \{H_1, \dots, H_T\}` 被称为 +*编码向量*\ 。解码器是门控循环神经网络。当解读每一个 :math:`y_t` 时, +这个门控循环神经网络生成一系列权重 :math:`W_S^t = \{W_1^t, \dots, W_T^t\}` , +用于计算编码向量的加权和。加权和用来生成 :math:`y_t` 。 + +模型的编码器部分如下所示。它叫做\ ``grumemory``\ 来表示门控循环神经网络。如果网络架构简单,那么推荐使用循环神经网络的方法,因为它比 +``recurrent_group`` +更快。我们已经实现了大多数常用的循环神经网络架构,可以参考 :ref:`api_trainer_config_helpers_layers` 了解更多细节。 + +我们还将编码向量投射到 ``decoder_size`` +维空间。这通过获得反向循环网络的第一个实例,并将其投射到 +``decoder_size`` 维空间完成: + +.. code:: python + + # 定义源语句的数据层 + src_word_id = paddle.layer.data( + name='source_language_word', + type=paddle.data_type.integer_value_sequence(source_dict_dim)) + # 计算每个词的词向量 + src_embedding = paddle.layer.embedding( + input=src_word_id, + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) + # 应用前向循环神经网络 + src_forward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size) + # 应用反向递归神经网络(reverse=True表示反向循环神经网络) + src_backward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) + # 将循环神经网络的前向和反向部分混合在一起 + encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) + + # 投射编码向量到 decoder_size + encoded_proj = paddle.layer.mixed( + size=decoder_size, + input=paddle.layer.full_matrix_projection(encoded_vector)) + + # 计算反向RNN的第一个实例 + backward_first = paddle.layer.first_seq(input=src_backward) + + # 投射反向RNN的第一个实例到 decoder size + decoder_boot = paddle.layer.mixed( + size=decoder_size, + act=paddle.activation.Tanh(), + input=paddle.layer.full_matrix_projection(backward_first)) + +解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在 +``gru_decoder_with_attention`` 中定义: + +.. code:: python + + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) + group_inputs.append(trg_embedding) + + # 对于配备有注意力机制的解码器,在训练中, + # 目标向量(groudtruth)是数据输入, + # 而源序列的编码向量可以被无边界的memory访问 + # StaticInput 意味着不同时间步的输入都是相同的值, + # 否则它以一个序列输入,不同时间步的输入是不同的。 + # 所有输入序列应该有相同的长度。 + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) + +单步函数的实现如下所示。首先,它定义解码网络的\ **Memory**\ 。然后定义 +attention,门控循环单元单步函数和输出函数: + +.. 
code:: python + + def gru_decoder_with_attention(enc_vec, enc_proj, current_word): + # 定义解码器的Memory + # Memory的输出定义在 gru_step 内 + # 注意 gru_step 应该与它的Memory名字相同 + decoder_mem = paddle.layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) + # 计算 attention 加权编码向量 + context = paddle.networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) + # 混合当前词向量和attention加权编码向量 + decoder_inputs = paddle.layer.mixed( + size=decoder_size * 3, + input=[ + paddle.layer.full_matrix_projection(input=context), + paddle.layer.full_matrix_projection(input=current_word) + ]) + # 定义门控循环单元循环神经网络单步函数 + gru_step = paddle.layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) + # 定义输出函数 + out = paddle.layer.mixed( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax(), + input=paddle.layer.full_matrix_projection(input=gru_step)) + return out + +生成序列 +-------- + +训练模型后,我们可以使用它来生成序列。通常的做法是使用\ **beam search** +生成序列。以下代码片段定义 beam search 算法。注意,\ ``beam_search`` +函数假设 ``step`` 的输出函数返回的是下一个时刻输出词的 softmax +归一化概率向量。我们对模型进行了以下更改。 + +- 使用 ``GeneratedInput`` 来表示 trg\_embedding。 ``GeneratedInput`` + 将上一时间步所生成的词的向量来作为当前时间步的输入。 +- 使用 ``beam_search`` 函数。这个函数需要设置: + + - ``bos_id``: 开始标记。每个句子都以开始标记开头。 + - ``eos_id``: 结束标记。每个句子都以结束标记结尾。 + - ``beam_size``: beam search 算法中的beam大小。 + - ``max_length``: 生成序列的最大长度。 + +代码如下: + +.. code:: python + + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] + # 在生成时,解码器基于编码源序列和最后生成的目标词预测下一目标词。 + # 编码源序列(编码器输出)必须由只读Memory的 StaticInput 指定。 + # 这里, GeneratedInputs 自动获取上一个生成的词,并在最开始初始化为起始词,如 。 + trg_embedding = paddle.layer.GeneratedInput( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) + group_inputs.append(trg_embedding) + beam_gen = paddle.layer.beam_search( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, # Beginnning token. + eos_id=1, # End of sentence token. + beam_size=beam_size, + max_length=max_length) + + return beam_gen + +注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 `book/06.understand_sentiment `_ 了解更多详细信息。 + +完整的配置文件在 `book/08.machine_translation/train.py `_ 。 diff --git a/doc/v2/howto/rnn/rnn_config_en.rst b/doc/v2/howto/rnn/rnn_config_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..f92edd108ff5c10a31b5f181f0f6dcb7a3f119f3 --- /dev/null +++ b/doc/v2/howto/rnn/rnn_config_en.rst @@ -0,0 +1,235 @@ +RNN Configuration +================= + +This tutorial will guide you how to configure recurrent neural network in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configuration. In this tutorial, you will learn how to: + +- configure recurrent neural network architecture. +- generate sequence with learned recurrent neural network models. + +We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at `book/08.machine_translation `_ . 
+And the data preparation for this model can be found at `python/paddle/v2/dataset/wmt14.py `_
+
+===============================================
+Configure Recurrent Neural Network Architecture
+===============================================
+
+-------------------------------------
+Simple Gated Recurrent Neural Network
+-------------------------------------
+
+A recurrent neural network processes a sequence one time step at a time. An example of the architecture of an LSTM is shown below.
+
+.. image:: src/bi_lstm.jpg
+    :align: center
+
+Generally speaking, a recurrent network performs the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`.
+
+.. math::
+
+    x_{t+1} = f_x(x_t), y_t = f_y(x_t)
+
+
+where :math:`f_x(.)` is called the **step function**, and :math:`f_y(.)` is called the **output function**. In a vanilla recurrent neural network, both the step function and the output function are very simple. However, PaddlePaddle supports the configuration of very complex architectures by modifying these two functions. We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. In this section, we will use a simple vanilla recurrent neural network as an example of configuring a simple recurrent neural network using :code:`recurrent_group`. Notice that if you only need a simple RNN, GRU, or LSTM, then :code:`grumemory` and :code:`lstmemory` are recommended because they are more computationally efficient than :code:`recurrent_group`.
+
+For a vanilla RNN, at each time step, the **step function** is:
+
+.. math::
+
+    x_{t+1} = W_x x_t + W_i I_t + b
+
+where :math:`x_t` is the RNN state, :math:`I_t` is the input, :math:`W_x` and :math:`W_i` are transformation matrices for the RNN state and the input, respectively, and :math:`b` is the bias.
+Its **output function** simply takes :math:`x_t` as the output.
+
+:code:`recurrent_group` is the most important tool for constructing recurrent neural networks. It defines the **step function**, the **output function** and the inputs of the recurrent neural network. Notice that the :code:`step` argument of this function implements both the :code:`step function` and the :code:`output function`:
+
+.. code-block:: python
+
+    def simple_rnn(input,
+                   size=None,
+                   name=None,
+                   reverse=False,
+                   rnn_bias_attr=None,
+                   act=None,
+                   rnn_layer_attr=None):
+        def __rnn_step__(ipt):
+            out_mem = paddle.layer.memory(name=name, size=size)
+            rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
+                                                  paddle.layer.full_matrix_projection(input=out_mem)],
+                                         name = name,
+                                         bias_attr = rnn_bias_attr,
+                                         act = act,
+                                         layer_attr = rnn_layer_attr,
+                                         size = size)
+            return rnn_out
+        return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
+                                            step=__rnn_step__,
+                                            reverse=reverse,
+                                            input=input)
+
+
+PaddlePaddle uses memory to construct the step function. **Memory** is the most important concept when constructing recurrent neural networks in PaddlePaddle. A memory is a state that is used recurrently in step functions, such as :math:`x_{t+1} = f_x(x_t)`. One memory contains an **output** and an **input**. The output of the memory at the current time step is used as the input of the memory at the next time step. A memory can also have a **boot layer**, whose output is used as the initial value of the memory. In our case, the output of the gated recurrent unit is employed as the output memory.
+Notice that the name of the layer :code:`rnn_out` is the same as the name of :code:`out_mem`. This means that the output of the layer :code:`rnn_out` (:math:`x_{t+1}`) is used as the **output** of the :code:`out_mem` memory.
+
+A memory can also be a sequence. In this case, at each time step, we have a sequence as the state of the recurrent neural network. This can be useful when constructing very complex recurrent neural networks. Other advanced features include defining multiple memories and defining hierarchical recurrent neural network architectures using sub-sequences.
+
+We return :code:`rnn_out` at the end of the function. This means that the output of the layer :code:`rnn_out` is used as the **output** function of the gated recurrent neural network.
+
+-----------------------------------------
+Sequence to Sequence Model with Attention
+-----------------------------------------
+We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. An illustration of the sequence to sequence model with attention is shown in the following figure.
+
+.. image:: src/encoder-decoder-attention-model.png
+    :align: center
+
+In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural network. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` are called the *encoder vector*. The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is used to condition the generation of the token :math:`y_t`.
+
+The encoder part of the model is listed below. It calls :code:`grumemory` to represent the gated recurrent neural network. This is the recommended way of using a recurrent neural network if the network architecture is simple, because it is faster than :code:`recurrent_group`. We have implemented most of the commonly used recurrent neural network architectures; you can refer to :ref:`api_trainer_config_helpers_layers` for more details.
+
+We also project the encoder vector to a :code:`decoder_size` dimensional space, get the first instance of the backward recurrent network, and project it to a :code:`decoder_size` dimensional space:
+
+.. code-block:: python
+
+    # Define the data layer of the source sentence.
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
+    # Calculate the word embedding of each word.
+    src_embedding = paddle.layer.embedding(
+        input=src_word_id,
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+    # Apply the forward recurrent neural network.
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
+    # Apply the backward recurrent neural network (reverse=True means backward).
+    src_backward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size, reverse=True)
+    # Mix the forward and backward parts of the recurrent neural network together.
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
+
+    # Project the encoding vector to decoder_size.
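+    # A mixed layer whose only inputs are full_matrix_projections computes a
+    # learned linear projection, here mapping the concatenated forward/backward
+    # encoder states into a decoder_size-dimensional space.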
+    encoded_proj = paddle.layer.mixed(
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))
+
+    # Compute the first instance of the backward RNN.
+    backward_first = paddle.layer.first_seq(input=src_backward)
+
+    # Project the first instance of the backward RNN to decoder size.
+    decoder_boot = paddle.layer.mixed(
+        size=decoder_size,
+        act=paddle.activation.Tanh(),
+        input=paddle.layer.full_matrix_projection(backward_first))
+
+
+The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`:
+
+.. code-block:: python
+
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+    trg_embedding = paddle.layer.embedding(
+        input=paddle.layer.data(
+            name='target_language_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+    group_inputs.append(trg_embedding)
+
+    # For a decoder equipped with an attention mechanism, in training,
+    # the target embedding (the ground truth) is the data input,
+    # while the encoded source sequence is accessed as an unbounded memory.
+    # StaticInput means the same value is used at different time steps.
+    # Otherwise, it is a sequence input: inputs at different time steps differ.
+    # All sequence inputs should have the same length.
+    decoder = paddle.layer.recurrent_group(
+        name=decoder_group_name,
+        step=gru_decoder_with_attention,
+        input=group_inputs)
+
+
+The implementation of the step function is listed below. First, it defines the **memory** of the decoder network. Then it defines the attention mechanism, the gated recurrent unit step function, and the output function:
+
+.. code-block:: python
+
+    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
+        # Define the memory of the decoder.
+        # The output of this memory is defined in gru_step.
+        # Notice that the name of gru_step should be the same as the name of this memory.
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+        # Compute the attention-weighted encoder vector.
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
+        # Mix the current word embedding and the attention-weighted encoder vector.
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])
+        # Define the gated recurrent unit step function.
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
+        # Define the output function.
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
+        return out
+
+
+=================
+Generate Sequence
+=================
+After training the model, we can use it to generate sequences. A common practice is to use **beam search** to generate sequences. The following code snippet defines a beam search algorithm.
+Notice that the :code:`beam_search` function assumes that the output function of :code:`step` returns a softmax-normalized probability vector of the next token. We made the following changes to the model.
+
+* Use :code:`GeneratedInput` for trg_embedding. :code:`GeneratedInput` computes the embedding of the token generated at the last time step and uses it as the input at the current time step.
+* Use the :code:`beam_search` function. This function needs to set:
+
+  - :code:`bos_id`: the start token. Every sentence starts with the start token.
+  - :code:`eos_id`: the end token. Every sentence ends with the end token.
+  - :code:`beam_size`: the beam size used in beam search.
+  - :code:`max_length`: the maximum length of the generated sentences.
+
+The code is listed below:
+
+.. code-block:: python
+
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+    # In generation, the decoder predicts the next target word based on
+    # the encoded source sequence and the last generated target word.
+    # The encoded source sequence (the encoder's output) must be specified by
+    # StaticInput, which is a read-only memory.
+    # Here, GeneratedInput automatically fetches the last generated word,
+    # which is initialized by a start mark, such as .
+    trg_embedding = paddle.layer.GeneratedInput(
+        size=target_dict_dim,
+        embedding_name='_target_language_embedding',
+        embedding_size=word_vector_dim)
+    group_inputs.append(trg_embedding)
+    beam_gen = paddle.layer.beam_search(
+        name=decoder_group_name,
+        step=gru_decoder_with_attention,
+        input=group_inputs,
+        bos_id=0,  # Beginning token.
+        eos_id=1,  # End of sentence token.
+        beam_size=beam_size,
+        max_length=max_length)
+
+    return beam_gen
+
+
+Notice that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment `_ for more details.
+
+The full configuration file is located at `book/08.machine_translation/train.py `_ .
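+
+As a rough, hypothetical sketch of how the generator might be run (the authoritative decoding script is the train.py linked above), the :code:`beam_gen` layer can be passed to :code:`paddle.infer` together with trained parameters. Here :code:`parameters` and :code:`test_batch` are assumed to have been prepared elsewhere:
+
+.. code-block:: python
+
+    # Hypothetical decoding call: 'parameters' holds the trained parameters and
+    # 'test_batch' is a batch of source sentences prepared elsewhere.
+    beam_result = paddle.infer(
+        output_layer=beam_gen,
+        parameters=parameters,
+        input=test_batch,
+        field=['prob', 'id'])
+    # 'prob' holds the scores of the generated candidates and 'id' holds the
+    # generated word ids, which can be mapped back to words with the target
+    # dictionary.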
diff --git a/doc/v2/howto/rnn/src/bi_lstm.jpg b/doc/v2/howto/rnn/src/bi_lstm.jpg new file mode 100644 index 0000000000000000000000000000000000000000..adec1606d64d6e35ffe7e62abfa9a09309b05c84 Binary files /dev/null and b/doc/v2/howto/rnn/src/bi_lstm.jpg differ diff --git a/doc/v2/howto/rnn/src/encoder-decoder-attention-model.png b/doc/v2/howto/rnn/src/encoder-decoder-attention-model.png new file mode 100644 index 0000000000000000000000000000000000000000..79f911d4ba12ac0c0d1a936c9df639c302786914 Binary files /dev/null and b/doc/v2/howto/rnn/src/encoder-decoder-attention-model.png differ diff --git a/doc/v2/howto/rnn/src/glossary_rnn.dot b/doc/v2/howto/rnn/src/glossary_rnn.dot new file mode 100644 index 0000000000000000000000000000000000000000..2cd0fb1820c44b0e8e0b869f9d39fcad27efa758 --- /dev/null +++ b/doc/v2/howto/rnn/src/glossary_rnn.dot @@ -0,0 +1,42 @@ +digraph G{ + subgraph cluster_timestep0 { + label="recurrent timestep i-1" + bgcolor=lightgray + node [style=filled,color=white] + fc0_0 [label="fc 0"] + fc0_1 [label="fc 1"] + fc0_2 [label="fc 2"] + + fc0_0 -> fc0_1 + fc0_1 -> fc0_2 + } + + subgraph cluster_timestep1 { + label="recurrent timestep i" + node [style=filled]; + fc1_0 [label="fc 0"] + fc1_1 [label="fc 1"] + fc1_2 [label="fc 2"] + color=blue + + fc1_0 -> fc1_1 + fc1_1 -> fc1_2 + } + + subgraph cluster_timestep2 { + label="recurrent timestep i+1" + bgcolor=lightgray + node [style=filled,color=white] + fc2_0 [label="fc 0"] + fc2_1 [label="fc 1"] + fc2_2 [label="fc 2"] + + fc2_0 -> fc2_1 + fc2_1 -> fc2_2 + } + + + fc0_1 -> fc1_1 [style="dotted" constraint=false] + fc1_1 -> fc2_1 [style="dotted" constraint=false] + +} \ No newline at end of file diff --git a/doc/v2/howto/rnn/src/glossary_rnn_with_memory.dot b/doc/v2/howto/rnn/src/glossary_rnn_with_memory.dot new file mode 100644 index 0000000000000000000000000000000000000000..0f101ec2d8f15aec76c57f328046b6b55cf0c7eb --- /dev/null +++ b/doc/v2/howto/rnn/src/glossary_rnn_with_memory.dot @@ -0,0 +1,48 @@ +digraph G{ + subgraph cluster_timestep0 { + label="recurrent timestep i-1" + bgcolor=lightgray + node [style=filled,color=white] + fc0_0 [label="fc 0"] + fc0_1 [label="fc 1"] + fc0_2 [label="fc 2"] + m0 [label="memory"] + fc0_0 -> fc0_1 + fc0_1 -> fc0_2 + fc0_1 -> m0 + m0 -> fc0_1 + } + + subgraph cluster_timestep1 { + label="recurrent timestep i" + node [style=filled]; + fc1_0 [label="fc 0"] + fc1_1 [label="fc 1"] + fc1_2 [label="fc 2"] + m1 [label="memory"] + color=blue + fc1_0 -> fc1_1 + fc1_1 -> fc1_2 + fc1_1 -> m1 + m1 -> fc1_1 + } + + subgraph cluster_timestep2 { + label="recurrent timestep i+1" + bgcolor=lightgray + node [style=filled,color=white] + fc2_0 [label="fc 0"] + fc2_1 [label="fc 1"] + fc2_2 [label="fc 2"] + m2 [label="memory"] + fc2_0 -> fc2_1 + fc2_1 -> fc2_2 + fc2_1 -> m2 + m2 -> fc2_1 + } + + + m0 -> m1 [style="dotted" constraint=false] + m1 -> m2 [style="dotted" constraint=false] + +} \ No newline at end of file diff --git a/doc/v2/howto/rnn/src/simple_full_hierarchical_recurrent.dot b/doc/v2/howto/rnn/src/simple_full_hierarchical_recurrent.dot new file mode 100644 index 0000000000000000000000000000000000000000..ff278a0323bb2c3ef07bf6f016a3a8df05783581 --- /dev/null +++ b/doc/v2/howto/rnn/src/simple_full_hierarchical_recurrent.dot @@ -0,0 +1,30 @@ +digraph G { + rankdir=LR; + + subgraph cluster_t0 { + a [label="4"] + b [label="5"] + c [label="2"] + } + + subgraph cluster_t1 { + d [label="0"] + e [label="9"] + } + + subgraph cluster_t2 { + f [label="8"] + g [label="1"] + h [label="4"] + } + + a -> b; + b -> c; + 
c -> d [constraint=false]; + + d -> e; + e -> f [constraint=false]; + + f -> g; + g -> h; +} \ No newline at end of file diff --git a/doc/v2/howto/rnn/src/simple_full_recurrent.dot b/doc/v2/howto/rnn/src/simple_full_recurrent.dot new file mode 100644 index 0000000000000000000000000000000000000000..cee281fbac993afbd0cc3416570f95965cdf0a59 --- /dev/null +++ b/doc/v2/howto/rnn/src/simple_full_recurrent.dot @@ -0,0 +1,19 @@ +digraph G { + rankdir=LR; + a [label="4"] + b [label="5"] + c [label="2"] + d [label="0"] + e [label="9"] + f [label="8"] + g [label="1"] + h [label="4"] + + a -> b; + b -> c; + c -> d; + d -> e; + e -> f; + f -> g; + g -> h; +} \ No newline at end of file diff --git a/doc/v2/images/FullyConnected.jpg b/doc/v2/images/FullyConnected.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b2241f401434e527f95ee4e0e541a3f2ff78fd1e Binary files /dev/null and b/doc/v2/images/FullyConnected.jpg differ diff --git a/doc/v2/images/add_security_group.png b/doc/v2/images/add_security_group.png new file mode 100644 index 0000000000000000000000000000000000000000..bd34f46c9b0ada7027fd53e553e7d033255d25fc Binary files /dev/null and b/doc/v2/images/add_security_group.png differ diff --git a/doc/v2/images/bi_lstm.jpg b/doc/v2/images/bi_lstm.jpg new file mode 100644 index 0000000000000000000000000000000000000000..adec1606d64d6e35ffe7e62abfa9a09309b05c84 Binary files /dev/null and b/doc/v2/images/bi_lstm.jpg differ diff --git a/doc/v2/images/checkpointing.png b/doc/v2/images/checkpointing.png new file mode 100644 index 0000000000000000000000000000000000000000..c221e8474f90f37e31416cbb19c9452207a0d14c Binary files /dev/null and b/doc/v2/images/checkpointing.png differ diff --git a/doc/v2/images/create_efs.png b/doc/v2/images/create_efs.png new file mode 100644 index 0000000000000000000000000000000000000000..e5f1526033d1daf401700989af1d25919bcb7675 Binary files /dev/null and b/doc/v2/images/create_efs.png differ diff --git a/doc/v2/images/csr.png b/doc/v2/images/csr.png new file mode 100644 index 0000000000000000000000000000000000000000..3dc10b8de4f6d3f517624956b1694b689405a031 Binary files /dev/null and b/doc/v2/images/csr.png differ diff --git a/doc/v2/images/data_dispatch.png b/doc/v2/images/data_dispatch.png new file mode 100644 index 0000000000000000000000000000000000000000..5bdcc24d6a6d193cb014f8c38b362451fded5e54 Binary files /dev/null and b/doc/v2/images/data_dispatch.png differ diff --git a/doc/v2/images/dataset.graffle b/doc/v2/images/dataset.graffle new file mode 100644 index 0000000000000000000000000000000000000000..c10a423ed16a23229a9ee33d11bfc82bb59646c8 Binary files /dev/null and b/doc/v2/images/dataset.graffle differ diff --git a/doc/v2/images/dataset.png b/doc/v2/images/dataset.png new file mode 100644 index 0000000000000000000000000000000000000000..2fb7f1cce3b6dd21489392557826e95a9f207c34 Binary files /dev/null and b/doc/v2/images/dataset.png differ diff --git a/doc/v2/images/doc_en.png b/doc/v2/images/doc_en.png new file mode 100644 index 0000000000000000000000000000000000000000..ed6b9178fba91a3bdf45ae797a9924f84146fbc8 Binary files /dev/null and b/doc/v2/images/doc_en.png differ diff --git a/doc/v2/images/efs_mount.png b/doc/v2/images/efs_mount.png new file mode 100644 index 0000000000000000000000000000000000000000..0f9e3cab98445707e5e9baa18ddabe15cdf04576 Binary files /dev/null and b/doc/v2/images/efs_mount.png differ diff --git a/doc/v2/images/encoder-decoder-attention-model.png b/doc/v2/images/encoder-decoder-attention-model.png new file mode 100644 index 
0000000000000000000000000000000000000000..79f911d4ba12ac0c0d1a936c9df639c302786914 Binary files /dev/null and b/doc/v2/images/encoder-decoder-attention-model.png differ diff --git a/doc/v2/images/engine.png b/doc/v2/images/engine.png new file mode 100644 index 0000000000000000000000000000000000000000..1f5f65c2cc765a514a3ba9e7b7f468e1dc4b0c3b Binary files /dev/null and b/doc/v2/images/engine.png differ diff --git a/doc/v2/images/file_storage.graffle b/doc/v2/images/file_storage.graffle new file mode 100644 index 0000000000000000000000000000000000000000..50a17e70fa255495337c529a3bf12a5c0024a5be Binary files /dev/null and b/doc/v2/images/file_storage.graffle differ diff --git a/doc/v2/images/file_storage.png b/doc/v2/images/file_storage.png new file mode 100644 index 0000000000000000000000000000000000000000..fccb4e3e7e738224c7f1584326bd5f351ce799aa Binary files /dev/null and b/doc/v2/images/file_storage.png differ diff --git a/doc/v2/images/glossary_rnn.dot b/doc/v2/images/glossary_rnn.dot new file mode 100644 index 0000000000000000000000000000000000000000..2cd0fb1820c44b0e8e0b869f9d39fcad27efa758 --- /dev/null +++ b/doc/v2/images/glossary_rnn.dot @@ -0,0 +1,42 @@ +digraph G{ + subgraph cluster_timestep0 { + label="recurrent timestep i-1" + bgcolor=lightgray + node [style=filled,color=white] + fc0_0 [label="fc 0"] + fc0_1 [label="fc 1"] + fc0_2 [label="fc 2"] + + fc0_0 -> fc0_1 + fc0_1 -> fc0_2 + } + + subgraph cluster_timestep1 { + label="recurrent timestep i" + node [style=filled]; + fc1_0 [label="fc 0"] + fc1_1 [label="fc 1"] + fc1_2 [label="fc 2"] + color=blue + + fc1_0 -> fc1_1 + fc1_1 -> fc1_2 + } + + subgraph cluster_timestep2 { + label="recurrent timestep i+1" + bgcolor=lightgray + node [style=filled,color=white] + fc2_0 [label="fc 0"] + fc2_1 [label="fc 1"] + fc2_2 [label="fc 2"] + + fc2_0 -> fc2_1 + fc2_1 -> fc2_2 + } + + + fc0_1 -> fc1_1 [style="dotted" constraint=false] + fc1_1 -> fc2_1 [style="dotted" constraint=false] + +} \ No newline at end of file diff --git a/doc/v2/images/glossary_rnn_with_memory.dot b/doc/v2/images/glossary_rnn_with_memory.dot new file mode 100644 index 0000000000000000000000000000000000000000..0f101ec2d8f15aec76c57f328046b6b55cf0c7eb --- /dev/null +++ b/doc/v2/images/glossary_rnn_with_memory.dot @@ -0,0 +1,48 @@ +digraph G{ + subgraph cluster_timestep0 { + label="recurrent timestep i-1" + bgcolor=lightgray + node [style=filled,color=white] + fc0_0 [label="fc 0"] + fc0_1 [label="fc 1"] + fc0_2 [label="fc 2"] + m0 [label="memory"] + fc0_0 -> fc0_1 + fc0_1 -> fc0_2 + fc0_1 -> m0 + m0 -> fc0_1 + } + + subgraph cluster_timestep1 { + label="recurrent timestep i" + node [style=filled]; + fc1_0 [label="fc 0"] + fc1_1 [label="fc 1"] + fc1_2 [label="fc 2"] + m1 [label="memory"] + color=blue + fc1_0 -> fc1_1 + fc1_1 -> fc1_2 + fc1_1 -> m1 + m1 -> fc1_1 + } + + subgraph cluster_timestep2 { + label="recurrent timestep i+1" + bgcolor=lightgray + node [style=filled,color=white] + fc2_0 [label="fc 0"] + fc2_1 [label="fc 1"] + fc2_2 [label="fc 2"] + m2 [label="memory"] + fc2_0 -> fc2_1 + fc2_1 -> fc2_2 + fc2_1 -> m2 + m2 -> fc2_1 + } + + + m0 -> m1 [style="dotted" constraint=false] + m1 -> m2 [style="dotted" constraint=false] + +} \ No newline at end of file diff --git a/doc/v2/images/gradients.png b/doc/v2/images/gradients.png new file mode 100644 index 0000000000000000000000000000000000000000..f031bcf8e4cec14e63075b8b9d2c7bbd9f1b1a3c Binary files /dev/null and b/doc/v2/images/gradients.png differ diff --git a/doc/v2/images/init_lock.graffle 
b/doc/v2/images/init_lock.graffle new file mode 100644 index 0000000000000000000000000000000000000000..fa9149f21b1311eed48ef72ec55e556559d0fc94 Binary files /dev/null and b/doc/v2/images/init_lock.graffle differ diff --git a/doc/v2/images/init_lock.png b/doc/v2/images/init_lock.png new file mode 100644 index 0000000000000000000000000000000000000000..92404ee6d6c0f9a7727952bae3c869ba338ecd7f Binary files /dev/null and b/doc/v2/images/init_lock.png differ diff --git a/doc/v2/images/k8s-paddle-arch.png b/doc/v2/images/k8s-paddle-arch.png new file mode 100644 index 0000000000000000000000000000000000000000..b3800c4fe81302d35e49f7dbacb9221c4dfa5cde Binary files /dev/null and b/doc/v2/images/k8s-paddle-arch.png differ diff --git a/doc/v2/images/layers.png b/doc/v2/images/layers.png new file mode 100644 index 0000000000000000000000000000000000000000..306f79b7a844610915eb8944128f57d2b7a3065a Binary files /dev/null and b/doc/v2/images/layers.png differ diff --git a/doc/v2/images/managed_policy.png b/doc/v2/images/managed_policy.png new file mode 100644 index 0000000000000000000000000000000000000000..c7ecda555b81d7750e9292a9ab72d2f517f76a2a Binary files /dev/null and b/doc/v2/images/managed_policy.png differ diff --git a/doc/v2/images/matrix.png b/doc/v2/images/matrix.png new file mode 100644 index 0000000000000000000000000000000000000000..c33ce9cf0335e47cc8c1253304d0fe179186e6f2 Binary files /dev/null and b/doc/v2/images/matrix.png differ diff --git a/doc/v2/images/nvvp1.png b/doc/v2/images/nvvp1.png new file mode 100644 index 0000000000000000000000000000000000000000..1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77 Binary files /dev/null and b/doc/v2/images/nvvp1.png differ diff --git a/doc/v2/images/nvvp2.png b/doc/v2/images/nvvp2.png new file mode 100644 index 0000000000000000000000000000000000000000..177c9db708da6863d1075f3e615f5962dbe18b29 Binary files /dev/null and b/doc/v2/images/nvvp2.png differ diff --git a/doc/v2/images/nvvp3.png b/doc/v2/images/nvvp3.png new file mode 100644 index 0000000000000000000000000000000000000000..d8f393667d6569b6f1e61ffccac43fae5888b6db Binary files /dev/null and b/doc/v2/images/nvvp3.png differ diff --git a/doc/v2/images/nvvp4.png b/doc/v2/images/nvvp4.png new file mode 100644 index 0000000000000000000000000000000000000000..51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01 Binary files /dev/null and b/doc/v2/images/nvvp4.png differ diff --git a/doc/v2/images/overview.png b/doc/v2/images/overview.png new file mode 100644 index 0000000000000000000000000000000000000000..8fb7bbb9dd654bf363d701d0c8cd4a557043d188 Binary files /dev/null and b/doc/v2/images/overview.png differ diff --git a/doc/v2/images/paddle-cloud-in-data-center.png b/doc/v2/images/paddle-cloud-in-data-center.png new file mode 100644 index 0000000000000000000000000000000000000000..da5d1a77562480ad1d886f5f21dbd84001d3d508 Binary files /dev/null and b/doc/v2/images/paddle-cloud-in-data-center.png differ diff --git a/doc/v2/images/paddle-etcd.graffle b/doc/v2/images/paddle-etcd.graffle new file mode 100644 index 0000000000000000000000000000000000000000..f973dc9b9dbf72e9bc31e2d32822916cd281f8d9 Binary files /dev/null and b/doc/v2/images/paddle-etcd.graffle differ diff --git a/doc/v2/images/paddle-etcd.png b/doc/v2/images/paddle-etcd.png new file mode 100644 index 0000000000000000000000000000000000000000..57981ceb4b94f0f7d6dfa63f3d28c0402bf9cc31 Binary files /dev/null and b/doc/v2/images/paddle-etcd.png differ diff --git a/doc/v2/images/paddle-model-sharding.graffle b/doc/v2/images/paddle-model-sharding.graffle new file 
mode 100644 index 0000000000000000000000000000000000000000..fba30f0ca2b47f0d202a432821d95e55aac37ec8 Binary files /dev/null and b/doc/v2/images/paddle-model-sharding.graffle differ diff --git a/doc/v2/images/paddle-model-sharding.png b/doc/v2/images/paddle-model-sharding.png new file mode 100644 index 0000000000000000000000000000000000000000..8c3f6724ef46c6527e63a4cd8cb0b50fe0167124 Binary files /dev/null and b/doc/v2/images/paddle-model-sharding.png differ diff --git a/doc/v2/images/paddle-ps-0.png b/doc/v2/images/paddle-ps-0.png new file mode 100644 index 0000000000000000000000000000000000000000..47ef32806f182cab003da77f1556823b3f6d1721 Binary files /dev/null and b/doc/v2/images/paddle-ps-0.png differ diff --git a/doc/v2/images/paddle-ps-1.png b/doc/v2/images/paddle-ps-1.png new file mode 100644 index 0000000000000000000000000000000000000000..f3125db73096c52bac6e7c60e1675552857c0774 Binary files /dev/null and b/doc/v2/images/paddle-ps-1.png differ diff --git a/doc/v2/images/paddle-ps.graffle b/doc/v2/images/paddle-ps.graffle new file mode 100644 index 0000000000000000000000000000000000000000..0e536ffdd91cd696008b4c01bad3cb53edebdc16 Binary files /dev/null and b/doc/v2/images/paddle-ps.graffle differ diff --git a/doc/v2/images/paddle-task-queues.graffle b/doc/v2/images/paddle-task-queues.graffle new file mode 100644 index 0000000000000000000000000000000000000000..4263ed8bfd2ef0e55058828bf23f2fac3595e5fd Binary files /dev/null and b/doc/v2/images/paddle-task-queues.graffle differ diff --git a/doc/v2/images/paddle-task-queues.png b/doc/v2/images/paddle-task-queues.png new file mode 100644 index 0000000000000000000000000000000000000000..5f980266795776752cebd0c346b85c4a75a47780 Binary files /dev/null and b/doc/v2/images/paddle-task-queues.png differ diff --git a/doc/v2/images/paddle-task-states.graffle b/doc/v2/images/paddle-task-states.graffle new file mode 100644 index 0000000000000000000000000000000000000000..cf1a0b9246d9386a949d2dbb8c32fe84f72eea83 Binary files /dev/null and b/doc/v2/images/paddle-task-states.graffle differ diff --git a/doc/v2/images/paddle-task-states.png b/doc/v2/images/paddle-task-states.png new file mode 100644 index 0000000000000000000000000000000000000000..4ae43cb66c071aee9eb90d875e2373b29af9c3e0 Binary files /dev/null and b/doc/v2/images/paddle-task-states.png differ diff --git a/doc/v2/images/ps_cn.png b/doc/v2/images/ps_cn.png new file mode 100644 index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc Binary files /dev/null and b/doc/v2/images/ps_cn.png differ diff --git a/doc/v2/images/ps_en.png b/doc/v2/images/ps_en.png new file mode 100644 index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0 Binary files /dev/null and b/doc/v2/images/ps_en.png differ diff --git a/doc/v2/images/pserver_and_trainer.png b/doc/v2/images/pserver_and_trainer.png new file mode 100644 index 0000000000000000000000000000000000000000..f41fe48920590333ad332bb51eb18e03dc251541 Binary files /dev/null and b/doc/v2/images/pserver_and_trainer.png differ diff --git a/doc/v2/images/pserver_init.graffle b/doc/v2/images/pserver_init.graffle new file mode 100644 index 0000000000000000000000000000000000000000..5f3f1f52be8aa7f9049a8fcd6b7c93c8560c1676 Binary files /dev/null and b/doc/v2/images/pserver_init.graffle differ diff --git a/doc/v2/images/pserver_init.png b/doc/v2/images/pserver_init.png new file mode 100644 index 0000000000000000000000000000000000000000..dfe491ff98dd7db1c336093c80964a260df2cd90 Binary files /dev/null and 
b/doc/v2/images/pserver_init.png differ diff --git a/doc/v2/images/route53_create_recordset.png b/doc/v2/images/route53_create_recordset.png new file mode 100644 index 0000000000000000000000000000000000000000..34e476c7beac30fcdde13fccc4cc8d08b4be3d35 Binary files /dev/null and b/doc/v2/images/route53_create_recordset.png differ diff --git a/doc/v2/images/route53_create_zone.png b/doc/v2/images/route53_create_zone.png new file mode 100644 index 0000000000000000000000000000000000000000..25b7ddb831c5cba97f4b2edddd27da3234d621af Binary files /dev/null and b/doc/v2/images/route53_create_zone.png differ diff --git a/doc/v2/images/sequence_data.png b/doc/v2/images/sequence_data.png new file mode 100644 index 0000000000000000000000000000000000000000..6e47a46b8955dfe977e85898fe3c9f33ed28de7e Binary files /dev/null and b/doc/v2/images/sequence_data.png differ diff --git a/doc/v2/images/simple_full_hierarchical_recurrent.dot b/doc/v2/images/simple_full_hierarchical_recurrent.dot new file mode 100644 index 0000000000000000000000000000000000000000..ff278a0323bb2c3ef07bf6f016a3a8df05783581 --- /dev/null +++ b/doc/v2/images/simple_full_hierarchical_recurrent.dot @@ -0,0 +1,30 @@ +digraph G { + rankdir=LR; + + subgraph cluster_t0 { + a [label="4"] + b [label="5"] + c [label="2"] + } + + subgraph cluster_t1 { + d [label="0"] + e [label="9"] + } + + subgraph cluster_t2 { + f [label="8"] + g [label="1"] + h [label="4"] + } + + a -> b; + b -> c; + c -> d [constraint=false]; + + d -> e; + e -> f [constraint=false]; + + f -> g; + g -> h; +} \ No newline at end of file diff --git a/doc/v2/images/simple_full_recurrent.dot b/doc/v2/images/simple_full_recurrent.dot new file mode 100644 index 0000000000000000000000000000000000000000..cee281fbac993afbd0cc3416570f95965cdf0a59 --- /dev/null +++ b/doc/v2/images/simple_full_recurrent.dot @@ -0,0 +1,19 @@ +digraph G { + rankdir=LR; + a [label="4"] + b [label="5"] + c [label="2"] + d [label="0"] + e [label="9"] + f [label="8"] + g [label="1"] + h [label="4"] + + a -> b; + b -> c; + c -> d; + d -> e; + e -> f; + f -> g; + g -> h; +} \ No newline at end of file diff --git a/doc/v2/images/submit-job.graffle b/doc/v2/images/submit-job.graffle new file mode 100644 index 0000000000000000000000000000000000000000..677cdfb6d9a32168bf71729eb841fa1ca0dd31d6 Binary files /dev/null and b/doc/v2/images/submit-job.graffle differ diff --git a/doc/v2/images/submit-job.png b/doc/v2/images/submit-job.png new file mode 100644 index 0000000000000000000000000000000000000000..3046a460a7ba708079e88a560debaa215a694680 Binary files /dev/null and b/doc/v2/images/submit-job.png differ diff --git a/doc/v2/images/trainer.graffle b/doc/v2/images/trainer.graffle new file mode 100644 index 0000000000000000000000000000000000000000..43415ed8cf61a5acfa34f8e56b9577f338dbf254 Binary files /dev/null and b/doc/v2/images/trainer.graffle differ diff --git a/doc/v2/images/trainer.png b/doc/v2/images/trainer.png new file mode 100644 index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0 Binary files /dev/null and b/doc/v2/images/trainer.png differ diff --git a/doc/v2/images/trainer_cn.png b/doc/v2/images/trainer_cn.png new file mode 100644 index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc Binary files /dev/null and b/doc/v2/images/trainer_cn.png differ diff --git a/doc/v2/images/worker_security_group.png b/doc/v2/images/worker_security_group.png new file mode 100644 index 
0000000000000000000000000000000000000000..57eb0265a34ad4223b69600d2a3dd355482e0bf5 Binary files /dev/null and b/doc/v2/images/worker_security_group.png differ diff --git a/doc/v2/images/workflow_of_CAPI.png b/doc/v2/images/workflow_of_CAPI.png new file mode 100644 index 0000000000000000000000000000000000000000..a4399ade048b3fe10d2d9c714bc34333ca068edb Binary files /dev/null and b/doc/v2/images/workflow_of_CAPI.png differ diff --git a/doc/v2/index_cn.rst b/doc/v2/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..0f645db6fc5d0f84bbe0cbb335677752e3a355ea --- /dev/null +++ b/doc/v2/index_cn.rst @@ -0,0 +1,11 @@ +PaddlePaddle 文档 +====================== + +.. toctree:: + :maxdepth: 1 + + getstarted/index_cn.rst + build_and_install/index_cn.rst + howto/index_cn.rst + dev/index_cn.rst + faq/index_cn.rst diff --git a/doc/v2/index_en.rst b/doc/v2/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..909f035cca3db2a02fd38462acc451375eceff40 --- /dev/null +++ b/doc/v2/index_en.rst @@ -0,0 +1,11 @@ +PaddlePaddle Documentation +========================== + +.. toctree:: + :maxdepth: 1 + + getstarted/index_en.rst + build_and_install/index_en.rst + howto/index_en.rst + dev/index_en.rst + faq/index_en.rst diff --git a/external/Anakin b/external/Anakin new file mode 160000 index 0000000000000000000000000000000000000000..beec126e4cfe762e4b6b542496069323dca35ee7 --- /dev/null +++ b/external/Anakin @@ -0,0 +1 @@ +Subproject commit beec126e4cfe762e4b6b542496069323dca35ee7 diff --git a/external/Paddle b/external/Paddle new file mode 160000 index 0000000000000000000000000000000000000000..cb27a9219d8dfc02be49484ce697495886a3e6fb --- /dev/null +++ b/external/Paddle @@ -0,0 +1 @@ +Subproject commit cb27a9219d8dfc02be49484ce697495886a3e6fb diff --git a/external/book b/external/book new file mode 160000 index 0000000000000000000000000000000000000000..13c6f692513771afb50e86a5de2d1cf9a3a53975 --- /dev/null +++ b/external/book @@ -0,0 +1 @@ +Subproject commit 13c6f692513771afb50e86a5de2d1cf9a3a53975 diff --git a/external/models b/external/models new file mode 160000 index 0000000000000000000000000000000000000000..bc0200b971b0e951b4a3f13822a1e1db33388b29 --- /dev/null +++ b/external/models @@ -0,0 +1 @@ +Subproject commit bc0200b971b0e951b4a3f13822a1e1db33388b29 diff --git a/external/paddle-mobile b/external/paddle-mobile new file mode 160000 index 0000000000000000000000000000000000000000..73e2f989e78e59e6fafbf5d973e36ad17418c64a --- /dev/null +++ b/external/paddle-mobile @@ -0,0 +1 @@ +Subproject commit 73e2f989e78e59e6fafbf5d973e36ad17418c64a diff --git a/paddle b/paddle deleted file mode 160000 index 3ff9ba0e6ba1eec282b6e89fb7bea2e2046f01c5..0000000000000000000000000000000000000000 --- a/paddle +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3ff9ba0e6ba1eec282b6e89fb7bea2e2046f01c5 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 16435dc01885bdd17fcbadc9d95055a9750bffad..0000000000000000000000000000000000000000 --- a/requirements.txt +++ /dev/null @@ -1,42 +0,0 @@ -alabaster==0.7.10 -Babel==2.6.0 -backports.functools-lru-cache==1.5 -certifi==2018.4.16 -chardet==3.0.4 -CommonMark==0.5.4 -cycler==0.10.0 -docutils==0.14 -graphviz==0.8.3 -idna==2.6 -imagesize==1.0.0 -Jinja2==2.10 -kiwisolver==1.0.1 -LinkChecker==9.3 -Markdown==2.6.11 -MarkupSafe==1.0 -matplotlib==2.2.2 -nltk==3.3 -numpy==1.14.4 -opencv-python==3.4.1.15 -packaging==17.1 -paddlepaddle -Pillow==5.1.0 -protobuf==3.1.0 -Pygments==2.2.0 -pyparsing==2.2.0 
-python-dateutil==2.7.3 -pytz==2018.4 -rarfile==3.0 -recommonmark==0.4.0 -recordio==0.1.5 -requests==2.9.2 -scipy==1.1.0 -six==1.11.0 -snowballstemmer==1.2.1 -Sphinx==1.7.5 -sphinx-markdown-tables==0.0.3 -sphinx-rtd-theme==0.4.0 -sphinxcontrib-websupport==1.1.0 -subprocess32==3.5.1 -typing==3.6.4 -urllib3==1.22 diff --git a/scripts/deploy_docs.sh b/scripts/deploy_docs.sh new file mode 100755 index 0000000000000000000000000000000000000000..31132552ac78a193f493132d46be48ff2b59f849 --- /dev/null +++ b/scripts/deploy_docs.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +exit_code=0 + +if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit $exit_code; fi; + +# Deploy to the the content server if its a "develop" or "release/version" branch +# The "develop_doc" branch is reserved to test full deploy process without impacting the real content. +if [ "$TRAVIS_BRANCH" == "develop_doc" ]; then + PPO_SCRIPT_BRANCH=develop +elif [[ "$TRAVIS_BRANCH" == "develop" || "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then + PPO_SCRIPT_BRANCH=master +else + # Early exit, this branch doesn't require documentation build + echo "This branch doesn't require documentation build"; + exit $exit_code; +fi + +echo "Build Paddle library $1. This step is needed to compile Paddle API documents" +cd external/Paddle +git branch +paddle/scripts/paddle_docker_build.sh gen_doc_lib $1 +cd ../.. + +if [[ "$1" == "pybind" || "$1" == "proto" ]]; then + echo "Finish building lite library"; + exit $exit_code; +fi + +export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/$PPO_SCRIPT_BRANCH/scripts/deploy/deploy_docs.sh + +echo "Deploy under docker environment" +docker run -it \ + -e CONTENT_DEC_PASSWD=$CONTENT_DEC_PASSWD \ + -e TRAVIS_BRANCH=$TRAVIS_BRANCH \ + -e DEPLOY_DOCS_SH=$DEPLOY_DOCS_SH \ + -e TRAVIS_PULL_REQUEST=$TRAVIS_PULL_REQUEST \ + -e PPO_SCRIPT_BRANCH=$PPO_SCRIPT_BRANCH \ + -e PADDLE_ROOT=/FluidDoc/external/Paddle \ + -e PYTHONPATH=/FluidDoc/external/Paddle/build/python \ + -v "$PWD:/FluidDoc" \ + -w /FluidDoc \ + paddlepaddle/paddle:latest-dev \ + /bin/bash -c 'curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH /FluidDoc /FluidDoc/build/doc/ $PPO_SCRIPT_BRANCH' || exit_code=$(( exit_code | $? )) + +exit $exit_code diff --git a/scripts/deploy_en_external_docs.sh b/scripts/deploy_en_external_docs.sh new file mode 100755 index 0000000000000000000000000000000000000000..c275adf4612c9683981d887a98ff2cb914e7f6b0 --- /dev/null +++ b/scripts/deploy_en_external_docs.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script is used to deploy the English specific documents. +# EX: Book, Mobile and Models. They are not yet consolidated into one Doc tree. + +exit_code=0 + +if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit $exit_code; fi; + +# Deploy to the the content server if its a "develop" or "release/version" branch +# The "develop_doc" branch is reserved to test full deploy process without impacting the real content. +if [ "$TRAVIS_BRANCH" == "develop_doc" ]; then + PPO_SCRIPT_BRANCH=develop +elif [[ "$TRAVIS_BRANCH" == "develop" || "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then + PPO_SCRIPT_BRANCH=master +else + # Early exit, this branch doesn't require documentation build + echo "This branch doesn't require documentation build" + exit $exit_code; +fi + +export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/$PPO_SCRIPT_BRANCH/scripts/deploy/deploy_docs.sh + +echo "Deploy book under docker environment" +docker run -it \ + -e CONTENT_DEC_PASSWD=$CONTENT_DEC_PASSWD \ + -e TRAVIS_BRANCH=$TRAVIS_BRANCH \ + -e DEPLOY_DOCS_SH=$DEPLOY_DOCS_SH \ + -e TRAVIS_PULL_REQUEST=$TRAVIS_PULL_REQUEST \ + -e PPO_SCRIPT_BRANCH=$PPO_SCRIPT_BRANCH \ + -e PADDLE_ROOT=/FluidDoc/external/Paddle \ + -e PYTHONPATH=/FluidDoc/external/Paddle/build/python \ + -v "$PWD:/FluidDoc" \ + -w /FluidDoc \ + paddlepaddle/paddle:latest-dev \ + /bin/bash -c 'curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH /FluidDoc/external /FluidDoc/external $PPO_SCRIPT_BRANCH' || exit_code=$(( exit_code | $? )) + +exit $exit_code diff --git a/source/advanced_usage/deploy/index.rst b/source/advanced_usage/deploy/index.rst deleted file mode 100644 index 0a391acfd38e36cb98d8eedbc40b1330e498e62c..0000000000000000000000000000000000000000 --- a/source/advanced_usage/deploy/index.rst +++ /dev/null @@ -1,10 +0,0 @@ -######## -预测部署 -######## - -服务端 -###### - - -移动端 -###### \ No newline at end of file diff --git a/source/advanced_usage/development/index.rst b/source/advanced_usage/development/index.rst deleted file mode 100644 index efd5413234a63449f0c76b5b1682552931cda9a3..0000000000000000000000000000000000000000 --- a/source/advanced_usage/development/index.rst +++ /dev/null @@ -1,19 +0,0 @@ -#################### -如何开发PaddlePaddle -#################### - - -如何贡献代码 -############ - -如何贡献文档 -############ - -如何写新的operator -################## - -CPU性能调优 -########### - -GPU性能调优 -########### \ No newline at end of file diff --git a/source/advanced_usage/index.rst b/source/advanced_usage/index.rst deleted file mode 100644 index ef05a7c36927c6ca5362e2c9608c1e795df2b7d8..0000000000000000000000000000000000000000 --- a/source/advanced_usage/index.rst +++ /dev/null @@ -1,14 +0,0 @@ -######## -进阶使用 -######## - - -.. todo:: - - Complete this guide - -.. 
toctree:: - :maxdepth: 2 - - deploy/index.rst - development/index.rst \ No newline at end of file diff --git a/source/api_guides/high_level/index.rst b/source/api_guides/high_level/index.rst deleted file mode 100644 index 946e2156e05f082907655195a8a456a4bcb9cda0..0000000000000000000000000000000000000000 --- a/source/api_guides/high_level/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -############## -High level API -############## - -.. todo:: - - Complete this doc \ No newline at end of file diff --git a/source/api_guides/index.rst b/source/api_guides/index.rst deleted file mode 100644 index fe624a6ae20297d4cc4f4b91d51d7cee721ef82a..0000000000000000000000000000000000000000 --- a/source/api_guides/index.rst +++ /dev/null @@ -1,14 +0,0 @@ -######### -API Guide -######### - -.. todo:: - - Complete this doc - - -.. toctree:: - :maxdepth: 4 - - high_level/index.rst - low_level/index.rst \ No newline at end of file diff --git a/source/api_guides/low_level/executor/executor.rst b/source/api_guides/low_level/executor/executor.rst deleted file mode 100644 index 090dde218a064a8208c6eac9e173b3961b8d3312..0000000000000000000000000000000000000000 --- a/source/api_guides/low_level/executor/executor.rst +++ /dev/null @@ -1,3 +0,0 @@ -######## -Executor -######## \ No newline at end of file diff --git a/source/api_guides/low_level/executor/parallel_executor.rst b/source/api_guides/low_level/executor/parallel_executor.rst deleted file mode 100644 index 5f14a6b3646012ed8522a981f464c324f7d19d18..0000000000000000000000000000000000000000 --- a/source/api_guides/low_level/executor/parallel_executor.rst +++ /dev/null @@ -1,3 +0,0 @@ -################ -ParallelExecutor -################ \ No newline at end of file diff --git a/source/api_guides/low_level/index.rst b/source/api_guides/low_level/index.rst deleted file mode 100644 index 3f2c5a2fc490c23c3faf5ed0cc9b0020992f8ca7..0000000000000000000000000000000000000000 --- a/source/api_guides/low_level/index.rst +++ /dev/null @@ -1,69 +0,0 @@ -############# -Low level API -############# - -Layers -###### - -神经网络的主体API是一些层函数,他们包括 - -.. toctree:: - :maxdepth: 1 - - layers/math.rst - layers/activations.rst - layers/convolution.rst - layers/pooling.rst - layers/preprocessing.rst - layers/io.rst - layers/metrics.rst - layers/detection.rst - -执行引擎 -######## - -.. 
toctree:: - - executor/executor.rst - executor/parallel_executor.rst - -数据读取 -######## - -参数属性与参数初始化(ParamAttr) -############################### - - -预测引擎 -######## - -Program/Block/Variable -###################### - -Scope -##### - -CreateOperator -############## - -Backward -######## - -模型平均(Model Average) -####################### - -Optimizers -########## - -正则化 -###### - -Transpiler -########## - -Gradient Clipping -################# - -调试工具/VisualDL -################# - diff --git a/source/api_guides/low_level/layers/activations.rst b/source/api_guides/low_level/layers/activations.rst deleted file mode 100644 index 88191af24241abe525547fe1069e502ccdc4d9c1..0000000000000000000000000000000000000000 --- a/source/api_guides/low_level/layers/activations.rst +++ /dev/null @@ -1,4 +0,0 @@ -######## -激活函数 -######## - diff --git a/source/api_guides/low_level/layers/convolution.rst b/source/api_guides/low_level/layers/convolution.rst deleted file mode 100644 index 64df924a86171020adc90f6fd17d98d3e2e671d8..0000000000000000000000000000000000000000 --- a/source/api_guides/low_level/layers/convolution.rst +++ /dev/null @@ -1,3 +0,0 @@ -######## -卷积操作 -######## diff --git a/source/api_guides/low_level/layers/detection.rst b/source/api_guides/low_level/layers/detection.rst deleted file mode 100644 index 00a3fc40cb62d349f0d94b16c782cc263ddc9739..0000000000000000000000000000000000000000 --- a/source/api_guides/low_level/layers/detection.rst +++ /dev/null @@ -1,4 +0,0 @@ -######## -图像检测 -######## - diff --git a/source/api_guides/low_level/layers/io.rst b/source/api_guides/low_level/layers/io.rst deleted file mode 100644 index e48bafe1c2bd38a7afcb66cf3b1c7801f774d1c7..0000000000000000000000000000000000000000 --- a/source/api_guides/low_level/layers/io.rst +++ /dev/null @@ -1,3 +0,0 @@ -######## -输入输出 -######## \ No newline at end of file diff --git a/source/api_guides/low_level/layers/math.rst b/source/api_guides/low_level/layers/math.rst deleted file mode 100644 index 2c8cc56091a7adcf10f2bbc24a32ca8c522b93cc..0000000000000000000000000000000000000000 --- a/source/api_guides/low_level/layers/math.rst +++ /dev/null @@ -1,4 +0,0 @@ -######## -数学算子 -######## - diff --git a/source/api_guides/low_level/layers/metrics.rst b/source/api_guides/low_level/layers/metrics.rst deleted file mode 100644 index 42a82f606a862144e4a92af08cc3e99f2d689148..0000000000000000000000000000000000000000 --- a/source/api_guides/low_level/layers/metrics.rst +++ /dev/null @@ -1,3 +0,0 @@ -######## -评价指标 -######## \ No newline at end of file diff --git a/source/api_guides/low_level/layers/pooling.rst b/source/api_guides/low_level/layers/pooling.rst deleted file mode 100644 index ecca8a465debba9173e6ab1b1cb2c40ecc641f06..0000000000000000000000000000000000000000 --- a/source/api_guides/low_level/layers/pooling.rst +++ /dev/null @@ -1,5 +0,0 @@ -######## -池化操作 -######## - - diff --git a/source/api_guides/low_level/layers/preprocessing.rst b/source/api_guides/low_level/layers/preprocessing.rst deleted file mode 100644 index 7ab81de5759a2de1dbdad27c794ab7d99ae89373..0000000000000000000000000000000000000000 --- a/source/api_guides/low_level/layers/preprocessing.rst +++ /dev/null @@ -1,12 +0,0 @@ -########## -预处理操作 -########## - - - -图像预处理操作 -############## - - -语音预处理操作 -############## \ No newline at end of file diff --git a/source/api_reference/data b/source/api_reference/data deleted file mode 120000 index 5aa0dd5cb56f08380cc5a6a23ad057f8c1184fa2..0000000000000000000000000000000000000000 --- a/source/api_reference/data +++ /dev/null @@ -1 
+0,0 @@ -../../paddle/doc/fluid/api/data \ No newline at end of file diff --git a/source/api_reference/gen_doc.py b/source/api_reference/gen_doc.py deleted file mode 120000 index e804d42315f0b4ef3a911af79690aa92c8ac73b9..0000000000000000000000000000000000000000 --- a/source/api_reference/gen_doc.py +++ /dev/null @@ -1 +0,0 @@ -../../paddle/doc/fluid/api/gen_doc.py \ No newline at end of file diff --git a/source/api_reference/initializer.rst b/source/api_reference/initializer.rst deleted file mode 100644 index c49a98c744cdf907630ea8c74791ff2021d996e8..0000000000000000000000000000000000000000 --- a/source/api_reference/initializer.rst +++ /dev/null @@ -1,75 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -=========== -initializer -=========== - -Constant --------- - -.. autoclass:: paddle.fluid.initializer.Constant - :members: - :noindex: - -Uniform -------- - -.. autoclass:: paddle.fluid.initializer.Uniform - :members: - :noindex: - -Normal ------- - -.. autoclass:: paddle.fluid.initializer.Normal - :members: - :noindex: - -Xavier ------- - -.. autoclass:: paddle.fluid.initializer.Xavier - :members: - :noindex: - -force_init_on_cpu ------------------ - -.. autofunction:: paddle.fluid.initializer.force_init_on_cpu - :noindex: - -init_on_cpu ------------ - -.. autofunction:: paddle.fluid.initializer.init_on_cpu - :noindex: - -ConstantInitializer -------------------- - -.. autoclass:: paddle.fluid.initializer.ConstantInitializer - :members: - :noindex: - -UniformInitializer ------------------- - -.. autoclass:: paddle.fluid.initializer.UniformInitializer - :members: - :noindex: - -NormalInitializer ------------------ - -.. autoclass:: paddle.fluid.initializer.NormalInitializer - :members: - :noindex: - -XavierInitializer ------------------ - -.. autoclass:: paddle.fluid.initializer.XavierInitializer - :members: - :noindex: - diff --git a/source/conf.py b/source/conf.py deleted file mode 100644 index b5f5bfa95b653853d6891c023dab5690912857ba..0000000000000000000000000000000000000000 --- a/source/conf.py +++ /dev/null @@ -1,307 +0,0 @@ -# -*- coding: utf-8 -*- -# -# PaddlePaddle Fluid documentation build configuration file, created by -# sphinx-quickstart on Thu Jun 7 17:04:53 2018. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys -import os -import shlex -from recommonmark.parser import CommonMarkParser -from recommonmark.transform import AutoStructify -import paddle.fluid -import sphinx.ext.napoleon - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. 
-extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.mathjax', - 'sphinx.ext.viewcode', - 'sphinx.ext.todo', - 'sphinx_markdown_tables', - 'sphinx.ext.napoleon', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -source_parsers = { - '.md': CommonMarkParser, -} - -source_suffix = ['.rst', '.md'] - -# The encoding of source files. -#source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'PaddlePaddle Fluid' -copyright = u'2018, paddle-dev@baidu.com' -author = u'paddle-dev@baidu.com' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '0.13.0' -# The full version, including alpha/beta/rc tags. -release = '0.13.0' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = 'zh_CN' - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = [] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'sphinx_rtd_theme' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -#html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. 
This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -#html_extra_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -#html_domain_indices = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None - -# Language to be used for generating the HTML full-text search index. -# Sphinx supports the following languages: -# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' -# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' -#html_search_language = 'en' - -# A dictionary with options for the search language support, empty by default. -# Now only 'ja' uses this config value -#html_search_options = {'type': 'default'} - -# The name of a javascript file (relative to the configuration directory) that -# implements a search results scorer. If empty, the default will be used. -#html_search_scorer = 'scorer.js' - -# Output file base name for HTML help builder. -htmlhelp_basename = 'PaddlePaddleFluiddoc' - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', - -# Latex figure (float) alignment -#'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'PaddlePaddleFluid.tex', u'PaddlePaddle Fluid Documentation', - u'paddle-dev@baidu.com', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. 
-#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# If true, show page references after internal links. -#latex_show_pagerefs = False - -# If true, show URL addresses after external links. -#latex_show_urls = False - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'paddlepaddlefluid', u'PaddlePaddle Fluid Documentation', - [author], 1) -] - -# If true, show URL addresses after external links. -#man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'PaddlePaddleFluid', u'PaddlePaddle Fluid Documentation', - author, 'PaddlePaddleFluid', 'One line description of project.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -#texinfo_appendices = [] - -# If false, no module index is generated. -#texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False - -def setup(app): - app.add_config_value('recommonmark_config', { - 'auto_toc_tree_section': 'Contents', - 'enable_inline_math': True, - 'enable_eval_rst': True, - 'enable_math': True - }, True) - app.add_transform(AutoStructify) diff --git a/source/faq.rst b/source/faq.rst deleted file mode 100644 index 9d43c91a8544c3b281b2e8d556cb8b8e069d7e0a..0000000000000000000000000000000000000000 --- a/source/faq.rst +++ /dev/null @@ -1,3 +0,0 @@ -### -FAQ -### diff --git a/source/quick_start/fit_a_line/image/predictions.png b/source/quick_start/fit_a_line/image/predictions.png deleted file mode 120000 index 502fae7606695048cb344877adc346b664a43133..0000000000000000000000000000000000000000 --- a/source/quick_start/fit_a_line/image/predictions.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/01.fit_a_line/image/predictions.png \ No newline at end of file diff --git a/source/quick_start/fit_a_line/image/ranges.png b/source/quick_start/fit_a_line/image/ranges.png deleted file mode 120000 index b2c08b70d73c46ba59c696acc87347c306fe90c0..0000000000000000000000000000000000000000 --- a/source/quick_start/fit_a_line/image/ranges.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/01.fit_a_line/image/ranges.png \ No newline at end of file diff --git a/source/quick_start/fit_a_line/image/train_and_test.png b/source/quick_start/fit_a_line/image/train_and_test.png deleted file mode 120000 index 717de74cac70afc83976e3b25f0ec1637c8dfdab..0000000000000000000000000000000000000000 --- a/source/quick_start/fit_a_line/image/train_and_test.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/01.fit_a_line/image/train_and_test.png \ No newline at end of file diff --git a/source/quick_start/fit_a_line/index.md b/source/quick_start/fit_a_line/index.md deleted file mode 120000 index 651911b2858c0edfd8a3b86459d1de83a4d98a8b..0000000000000000000000000000000000000000 --- a/source/quick_start/fit_a_line/index.md +++ /dev/null @@ 
-1 +0,0 @@ -../../../book/01.fit_a_line/README.cn.md \ No newline at end of file diff --git a/source/quick_start/index.rst b/source/quick_start/index.rst deleted file mode 100644 index add34a76ee41202a5e6e88f25d4cd7a80314d459..0000000000000000000000000000000000000000 --- a/source/quick_start/index.rst +++ /dev/null @@ -1,14 +0,0 @@ -######## -新手入门 -######## - -.. todo:: - - 新手入门的导引文字,需要完善。 - -.. toctree:: - :maxdepth: 2 - - install/index.rst - quick_start.rst - theoretical_background.rst \ No newline at end of file diff --git a/source/quick_start/install/build_from_source_cn.rst b/source/quick_start/install/build_from_source_cn.rst deleted file mode 120000 index 1a3d2eb36dcc1aaaafc15fe694b8dcd29051e368..0000000000000000000000000000000000000000 --- a/source/quick_start/install/build_from_source_cn.rst +++ /dev/null @@ -1 +0,0 @@ -../../../paddle/doc/fluid/build_and_install/build_from_source_cn.rst \ No newline at end of file diff --git a/source/quick_start/install/docker_install_cn.rst b/source/quick_start/install/docker_install_cn.rst deleted file mode 120000 index 341a52cf5e0d79adf5379abc72ab8cb8fc2566e8..0000000000000000000000000000000000000000 --- a/source/quick_start/install/docker_install_cn.rst +++ /dev/null @@ -1 +0,0 @@ -../../../paddle/doc/fluid/build_and_install/docker_install_cn.rst \ No newline at end of file diff --git a/source/quick_start/install/index.rst b/source/quick_start/install/index.rst deleted file mode 120000 index 1169dd292e070de20bce4161a80a22f2a7a5863a..0000000000000000000000000000000000000000 --- a/source/quick_start/install/index.rst +++ /dev/null @@ -1 +0,0 @@ -../../../paddle/doc/fluid/build_and_install/index_cn.rst \ No newline at end of file diff --git a/source/quick_start/install/paddleci.png b/source/quick_start/install/paddleci.png deleted file mode 120000 index 8b2c075b22d09ff01e9ead7613dab5ab9090e9cd..0000000000000000000000000000000000000000 --- a/source/quick_start/install/paddleci.png +++ /dev/null @@ -1 +0,0 @@ -../../../paddle/doc/fluid/build_and_install/paddleci.png \ No newline at end of file diff --git a/source/quick_start/install/pip_install_cn.rst b/source/quick_start/install/pip_install_cn.rst deleted file mode 120000 index 4270f25a6321a3005043136711586f37e8426222..0000000000000000000000000000000000000000 --- a/source/quick_start/install/pip_install_cn.rst +++ /dev/null @@ -1 +0,0 @@ -../../../paddle/doc/fluid/build_and_install/pip_install_cn.rst \ No newline at end of file diff --git a/source/quick_start/quick_start.rst b/source/quick_start/quick_start.rst deleted file mode 100644 index 08a5937f9d83f04d1be5b982f46707b94f4e9adc..0000000000000000000000000000000000000000 --- a/source/quick_start/quick_start.rst +++ /dev/null @@ -1,13 +0,0 @@ -######## -快速入门 -######## - -.. todo:: - - 概述 - -.. 
toctree:: - :maxdepth: 2 - - fit_a_line/index.md - recognize_digits/index.md \ No newline at end of file diff --git a/source/quick_start/recognize_digits/image/cnn.png b/source/quick_start/recognize_digits/image/cnn.png deleted file mode 120000 index 05f7e4e577d9a19031c61a6e74e10f5de3c4469b..0000000000000000000000000000000000000000 --- a/source/quick_start/recognize_digits/image/cnn.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/02.recognize_digits/image/cnn.png \ No newline at end of file diff --git a/source/quick_start/recognize_digits/image/cnn_train_log.png b/source/quick_start/recognize_digits/image/cnn_train_log.png deleted file mode 120000 index 0f19ea6d651ba943dea06978f3e7b6fdda9c7f0e..0000000000000000000000000000000000000000 --- a/source/quick_start/recognize_digits/image/cnn_train_log.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/02.recognize_digits/image/cnn_train_log.png \ No newline at end of file diff --git a/source/quick_start/recognize_digits/image/conv_layer.png b/source/quick_start/recognize_digits/image/conv_layer.png deleted file mode 120000 index 03d7d4301a9de2efa5d47a10f973b963dc12bbda..0000000000000000000000000000000000000000 --- a/source/quick_start/recognize_digits/image/conv_layer.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/02.recognize_digits/image/conv_layer.png \ No newline at end of file diff --git a/source/quick_start/recognize_digits/image/infer_3.png b/source/quick_start/recognize_digits/image/infer_3.png deleted file mode 120000 index b668970d975c09af68b46b16c1237fcc2fc0fdd2..0000000000000000000000000000000000000000 --- a/source/quick_start/recognize_digits/image/infer_3.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/02.recognize_digits/image/infer_3.png \ No newline at end of file diff --git a/source/quick_start/recognize_digits/image/max_pooling.png b/source/quick_start/recognize_digits/image/max_pooling.png deleted file mode 120000 index ba1b46d602e74c8e518995917f11d2683249ec18..0000000000000000000000000000000000000000 --- a/source/quick_start/recognize_digits/image/max_pooling.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/02.recognize_digits/image/max_pooling.png \ No newline at end of file diff --git a/source/quick_start/recognize_digits/image/mlp.png b/source/quick_start/recognize_digits/image/mlp.png deleted file mode 120000 index 8ead6e38ff66f92abf40c745c083b279b710ea88..0000000000000000000000000000000000000000 --- a/source/quick_start/recognize_digits/image/mlp.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/02.recognize_digits/image/mlp.png \ No newline at end of file diff --git a/source/quick_start/recognize_digits/image/mlp_train_log.png b/source/quick_start/recognize_digits/image/mlp_train_log.png deleted file mode 120000 index c26d9576aca3f285715241e2528d7c912ed10d72..0000000000000000000000000000000000000000 --- a/source/quick_start/recognize_digits/image/mlp_train_log.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/02.recognize_digits/image/mlp_train_log.png \ No newline at end of file diff --git a/source/quick_start/recognize_digits/image/mnist_example_image.png b/source/quick_start/recognize_digits/image/mnist_example_image.png deleted file mode 120000 index 4040f4011f6c259201257d1bc1a2e4414d3c07e8..0000000000000000000000000000000000000000 --- a/source/quick_start/recognize_digits/image/mnist_example_image.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/02.recognize_digits/image/mnist_example_image.png \ No newline at end of file diff --git a/source/quick_start/recognize_digits/image/softmax_regression.png 
b/source/quick_start/recognize_digits/image/softmax_regression.png deleted file mode 120000 index 5d049f681102552fc322cef7f8bc68190ba032f2..0000000000000000000000000000000000000000 --- a/source/quick_start/recognize_digits/image/softmax_regression.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/02.recognize_digits/image/softmax_regression.png \ No newline at end of file diff --git a/source/quick_start/recognize_digits/image/softmax_train_log.png b/source/quick_start/recognize_digits/image/softmax_train_log.png deleted file mode 120000 index 2edd1fb0e3d78b91dcfd0c55198c33c37af253db..0000000000000000000000000000000000000000 --- a/source/quick_start/recognize_digits/image/softmax_train_log.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/02.recognize_digits/image/softmax_train_log.png \ No newline at end of file diff --git a/source/quick_start/recognize_digits/image/train_and_test.png b/source/quick_start/recognize_digits/image/train_and_test.png deleted file mode 120000 index 6b61f05c6eef390b4433e07d9947f3a97789985f..0000000000000000000000000000000000000000 --- a/source/quick_start/recognize_digits/image/train_and_test.png +++ /dev/null @@ -1 +0,0 @@ -../../../../book/02.recognize_digits/image/train_and_test.png \ No newline at end of file diff --git a/source/quick_start/recognize_digits/index.md b/source/quick_start/recognize_digits/index.md deleted file mode 120000 index 80cb36835c230d6930b4f05c1cdb83fe20844544..0000000000000000000000000000000000000000 --- a/source/quick_start/recognize_digits/index.md +++ /dev/null @@ -1 +0,0 @@ -../../../book/02.recognize_digits/README.cn.md \ No newline at end of file diff --git a/source/quick_start/theoretical_background.rst b/source/quick_start/theoretical_background.rst deleted file mode 100644 index 3ecfd03a8dc89886d82968f0ac2adb536f210e5f..0000000000000000000000000000000000000000 --- a/source/quick_start/theoretical_background.rst +++ /dev/null @@ -1,8 +0,0 @@ -######## -理论知识 -######## - - -.. todo:: - - 完善这个页面 \ No newline at end of file diff --git a/source/user_guides/howto/index.rst b/source/user_guides/howto/index.rst deleted file mode 100644 index 4b6988665901bacf3fcdf993faf913bf163d7ba4..0000000000000000000000000000000000000000 --- a/source/user_guides/howto/index.rst +++ /dev/null @@ -1,29 +0,0 @@ -#################### -如何使用PaddlePaddle -#################### - - -概述 -#### - - - -数据预处理 -########## - - -配置简单的网络 -############## - - -训练 -#### - - - -调试 -#### - -模型评估 -######## - diff --git a/source/user_guides/index.rst b/source/user_guides/index.rst deleted file mode 100644 index 64c8f3bd4e6896e84be86bbf76fcb43e57700f17..0000000000000000000000000000000000000000 --- a/source/user_guides/index.rst +++ /dev/null @@ -1,14 +0,0 @@ -######## -使用指南 -######## - - -.. todo:: - - 完善导引介绍 - -.. toctree:: - :maxdepth: 2 - - howto/index.rst - model_bank/index.rst diff --git a/source/user_guides/model_bank/index.rst b/source/user_guides/model_bank/index.rst deleted file mode 100644 index cf1691245ff4895eb699c9216d3bb91a8a5d9d2c..0000000000000000000000000000000000000000 --- a/source/user_guides/model_bank/index.rst +++ /dev/null @@ -1,18 +0,0 @@ -###### -模型库 -###### - - -图像 -#### - - -NLP -### - - -语音 -#### - -其他 -#### \ No newline at end of file
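
For orientation, the bulk of this change removes the standalone Sphinx tree under source/. Below is a condensed sketch of the key settings carried by the deleted source/conf.py (Markdown parsing through recommonmark, the sphinx_rtd_theme, and the AutoStructify transform). The values are copied from the removed file; this is a summary for reference, not a drop-in replacement configuration.

# Condensed sketch of the Sphinx settings removed with source/conf.py.
# All values below are copied from the deleted file; it assumes the same
# recommonmark / sphinx_rtd_theme dependencies that file imported.
from recommonmark.parser import CommonMarkParser
from recommonmark.transform import AutoStructify

project = u'PaddlePaddle Fluid'
version = release = '0.13.0'
language = 'zh_CN'
master_doc = 'index'

extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.mathjax',
    'sphinx.ext.viewcode',
    'sphinx.ext.todo',
    'sphinx_markdown_tables',
    'sphinx.ext.napoleon',
]

# Markdown sources are parsed alongside reST via recommonmark.
source_parsers = {'.md': CommonMarkParser}
source_suffix = ['.rst', '.md']

html_theme = 'sphinx_rtd_theme'

def setup(app):
    # AutoStructify lets the Markdown chapters embed reST (eval_rst) and math.
    app.add_config_value('recommonmark_config', {
        'auto_toc_tree_section': 'Contents',
        'enable_inline_math': True,
        'enable_eval_rst': True,
        'enable_math': True,
    }, True)
    app.add_transform(AutoStructify)

The deleted source/api_reference/initializer.rst generated autodoc pages for paddle.fluid.initializer (Constant, Uniform, Normal, Xavier and their *Initializer aliases). As a minimal, hedged usage sketch of what those pages documented, the following assumes the Fluid 0.13-era Python API (fluid.layers.data, fluid.layers.fc, fluid.ParamAttr), which is not itself shown in this diff:

import paddle.fluid as fluid

# Hedged sketch: attach the initializers documented in the removed
# initializer.rst to a layer's parameters. Layer and ParamAttr signatures
# are assumed from the Fluid 0.13-era API.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.fc(
    input=x,
    size=1,
    param_attr=fluid.ParamAttr(initializer=fluid.initializer.Xavier()),
    bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(value=0.0)))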