diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index 378cc88c9cc708fd797c97a91ca59f0d57bd570a..0000000000000000000000000000000000000000
--- a/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-.env
-.DS_Store
-._.DS_Store
-*.mo
diff --git a/.gitmodules b/.gitmodules
index ea6254755af221ea0d76d82bbf0bef054587c96e..3bc190175db8837a22f2b255a00f66176415ec9c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,12 +1,15 @@
-[submodule "paddle"]
-	path = paddle
-	url = https://github.com/PaddlePaddle/Paddle.git
-[submodule "book"]
-	path = book
-	url = https://github.com/PaddlePaddle/book.git
-[submodule "anakin"]
-	path = anakin
-	url = https://github.com/PaddlePaddle/Anakin.git
-[submodule "mobile"]
-	path = mobile
-	url = https://github.com/PaddlePaddle/paddle-mobile.git
+[submodule "external/Paddle"]
+	path = external/Paddle
+	url = https://github.com/PaddlePaddle/Paddle
+[submodule "external/book"]
+	path = external/book
+	url = https://github.com/PaddlePaddle/book
+[submodule "external/Anakin"]
+	path = external/Anakin
+	url = https://github.com/PaddlePaddle/Anakin
+[submodule "external/paddle-mobile"]
+	path = external/paddle-mobile
+	url = https://github.com/PaddlePaddle/paddle-mobile
+[submodule "external/models"]
+	path = external/models
+	url = https://github.com/PaddlePaddle/models
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ccab516189044741e083b7034530f3737b5470e9
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,51 @@
+language: cpp
+cache:
+  bundler: true
+  directories:
+    - $HOME/.ccache
+    - $HOME/.cache/pip
+    - $HOME/docker
+ #   - $TRAVIS_BUILD_DIR/external/
+    - $TRAVIS_BUILD_DIR/external/Paddle/build/third_party
+
+sudo: required
+dist: trusty
+services:
+  - docker
+os:
+  - linux
+env:
+  - JOB=doc
+  - JOB=lite_lib
+
+addons:
+  apt:
+    packages:
+      - git
+      - python
+      - python-pip
+      - python2.7-dev
+      - golang
+  ssh_known_hosts: 13.229.163.131
+before_install:
+  -  sudo pip install pylint pytest astroid isort 
+  # Load cached docker images
+  #- if [[ -d $HOME/docker ]]; then ls $HOME/docker/*.tar.gz | xargs -I {file} sh -c "zcat {file} | docker load"; fi
+  
+script:
+  # Dispatch on JOB with a single if/elif so the step's exit status is that of
+  # the script that actually ran; a trailing second `if` would reset it to 0.
+  - |
+     if [ "$JOB" == "doc" ]; then scripts/deploy_docs.sh
+     elif [ "$JOB" == "lite_lib" ]; then scripts/build_doc_lib_lite.sh
+     fi
+#before_cache:
+#  # Save tagged docker images
+#  - >
+#    mkdir -p $HOME/docker && docker images -a --filter='dangling=false' --format 'paddlepaddle/paddle:latest-dev {{.ID}}'
+#    | xargs -n 2 -t sh -c 'test -e $HOME/docker/$1.tar.gz || docker save $0 | gzip -2 > $HOME/docker/$1.tar.gz'
+    
+notifications:
+  email:
+    on_success: change
+    on_failure: always
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 36ee0a07dbab2ea7c9c1e7031ab3a871bcf1a008..0000000000000000000000000000000000000000
--- a/Makefile
+++ /dev/null
@@ -1,192 +0,0 @@
-# Makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-PAPER         =
-BUILDDIR      = build
-
-# User-friendly check for sphinx-build
-ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
-$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
-endif
-
-# Internal variables.
-PAPEROPT_a4     = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
-# the i18n builder cannot share the environment and doctrees with the others
-I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
-
-.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
-
-help:
-	@echo "Please use \`make <target>' where <target> is one of"
-	@echo "  html       to make standalone HTML files"
-	@echo "  dirhtml    to make HTML files named index.html in directories"
-	@echo "  singlehtml to make a single large HTML file"
-	@echo "  pickle     to make pickle files"
-	@echo "  json       to make JSON files"
-	@echo "  htmlhelp   to make HTML files and a HTML help project"
-	@echo "  qthelp     to make HTML files and a qthelp project"
-	@echo "  applehelp  to make an Apple Help Book"
-	@echo "  devhelp    to make HTML files and a Devhelp project"
-	@echo "  epub       to make an epub"
-	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
-	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
-	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
-	@echo "  text       to make text files"
-	@echo "  man        to make manual pages"
-	@echo "  texinfo    to make Texinfo files"
-	@echo "  info       to make Texinfo files and run them through makeinfo"
-	@echo "  gettext    to make PO message catalogs"
-	@echo "  changes    to make an overview of all changed/added/deprecated items"
-	@echo "  xml        to make Docutils-native XML files"
-	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
-	@echo "  linkcheck  to check all external links for integrity"
-	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
-	@echo "  coverage   to run coverage check of the documentation (if enabled)"
-
-clean:
-	rm -rf $(BUILDDIR)/*
-
-html:
-	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
-
-dirhtml:
-	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
-
-singlehtml:
-	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
-	@echo
-	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
-
-pickle:
-	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
-	@echo
-	@echo "Build finished; now you can process the pickle files."
-
-json:
-	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
-	@echo
-	@echo "Build finished; now you can process the JSON files."
-
-htmlhelp:
-	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
-	@echo
-	@echo "Build finished; now you can run HTML Help Workshop with the" \
-	      ".hhp project file in $(BUILDDIR)/htmlhelp."
-
-qthelp:
-	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
-	@echo
-	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
-	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
-	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PaddlePaddleFluid.qhcp"
-	@echo "To view the help file:"
-	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PaddlePaddleFluid.qhc"
-
-applehelp:
-	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
-	@echo
-	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
-	@echo "N.B. You won't be able to view it unless you put it in" \
-	      "~/Library/Documentation/Help or install it in your application" \
-	      "bundle."
-
-devhelp:
-	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
-	@echo
-	@echo "Build finished."
-	@echo "To view the help file:"
-	@echo "# mkdir -p $$HOME/.local/share/devhelp/PaddlePaddleFluid"
-	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PaddlePaddleFluid"
-	@echo "# devhelp"
-
-epub:
-	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
-	@echo
-	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
-
-latex:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo
-	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
-	@echo "Run \`make' in that directory to run these through (pdf)latex" \
-	      "(use \`make latexpdf' here to do that automatically)."
-
-latexpdf:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo "Running LaTeX files through pdflatex..."
-	$(MAKE) -C $(BUILDDIR)/latex all-pdf
-	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-latexpdfja:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo "Running LaTeX files through platex and dvipdfmx..."
-	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
-	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-text:
-	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
-	@echo
-	@echo "Build finished. The text files are in $(BUILDDIR)/text."
-
-man:
-	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
-	@echo
-	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
-
-texinfo:
-	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
-	@echo
-	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
-	@echo "Run \`make' in that directory to run these through makeinfo" \
-	      "(use \`make info' here to do that automatically)."
-
-info:
-	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
-	@echo "Running Texinfo files through makeinfo..."
-	make -C $(BUILDDIR)/texinfo info
-	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
-
-gettext:
-	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
-	@echo
-	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
-
-changes:
-	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
-	@echo
-	@echo "The overview file is in $(BUILDDIR)/changes."
-
-linkcheck:
-	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
-	@echo
-	@echo "Link check complete; look for any errors in the above output " \
-	      "or in $(BUILDDIR)/linkcheck/output.txt."
-
-doctest:
-	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
-	@echo "Testing of doctests in the sources finished, look at the " \
-	      "results in $(BUILDDIR)/doctest/output.txt."
-
-coverage:
-	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
-	@echo "Testing of coverage in the sources finished, look at the " \
-	      "results in $(BUILDDIR)/coverage/python.txt."
-
-xml:
-	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
-	@echo
-	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
-
-pseudoxml:
-	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
-	@echo
-	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/README.md b/README.md
deleted file mode 100644
index e744e193bd198d3f6f6b001f0a906c580173cd38..0000000000000000000000000000000000000000
--- a/README.md
+++ /dev/null
@@ -1,57 +0,0 @@
-# Fluid Documentation Skeleton
-
-## Build
-
-To build documentation, you need have a linux machine and have python2, virtualenv, gmake installed.
-
-### Preparation
-
-You need to create a `virtualenv` instead of polute the global python library path
-
-```bash
-virtualenv .env
-```
-
-You can enter virtualenv by
-
-```bash
-source .env/bin/activate
-```
-
-You can exit virtualenv by
-
-```bash
-deactivate
-```
-
-### Install dependencies
-
-```bash
-# enter virtualenv
-source .env/bin/activate
-# install dependencies
-pip install -r requirements.txt
-```
-
-### Make HTML
-
-```bash
-# make clean  # make clean to regenerate toctree. Just `make html` may have a cache.
-make html
-```
-and the html files will be generated to `build/html`. You can open `build/html/index.html` with your browser to see the documentation.
-
-## Edit
-
-### Edit documentation
-
-It is suggested to use `reStructuredText` because it is the only official markup language supportted by our documentation generating system, sphinx. `markdown` can also be used. However, since the `markdown` has so many dialects, there is no guarantee that the `markdown` source file can be rendered well.
-
-The `reStructuredText` cheatsheet is [here](http://docutils.sourceforge.net/docs/user/rst/quickref.html).
-
-
-### Edit structure
-
-The `sphinx` (our documentation generating system) uses `toctree` to organize documentation. `toctree` means `table of content tree`. 
-
-Please see the [sphinx documentation](http://www.sphinx-doc.org/en/master/), especially [`toctree` directives](http://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html)
diff --git a/anakin b/anakin
deleted file mode 160000
index b9d95555a73f3e02aa169251cd319053b6d7d642..0000000000000000000000000000000000000000
--- a/anakin
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit b9d95555a73f3e02aa169251cd319053b6d7d642
diff --git a/book b/book
deleted file mode 160000
index f4b5cc835ef77e55cfc001d51f8f77565475dc45..0000000000000000000000000000000000000000
--- a/book
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit f4b5cc835ef77e55cfc001d51f8f77565475dc45
diff --git a/build/.gitignore b/build/.gitignore
deleted file mode 100644
index 72e8ffc0db8aad71a934dd11e5968bd5109e54b4..0000000000000000000000000000000000000000
--- a/build/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*
diff --git a/doc/about/about_us.rst b/doc/about/about_us.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f67d8b8130030db8d7e7d10b30271a913bd6272a
--- /dev/null
+++ b/doc/about/about_us.rst
@@ -0,0 +1,53 @@
+=========
+关于我们
+=========
+
+什么是PaddlePaddle
+--------------------
+
+- PaddlePaddle是百度自主研发并开源的深度学习框架，它能够让开发者和企业安全、快速地实现自己的AI想法
+
+- 项目团队汇聚了全球顶级的深度学习科学家，致力于为开发者和企业提供最好的深度学习研发体验
+
+- 框架具有易学、易用、安全、高效四大特性，是最适合中国开发者和企业的深度学习工具
+
+PaddlePaddle的技术特色
+-------------------------
+
+- 新一代深度学习框架： PaddlePaddle是基于“深度学习编程语言”的新一代深度学习框架，在保证性能的同时，极大的提升了框架对模型的表达能力，能够描述任意潜在可能出现的模型
+
+- 对大规模计算更加友好：经过百度内多种大规模计算业务的打磨，PaddlePaddle在分布式计算上表现优异，基于EDL技术能够节约大量计算资源，同时也能支持大规模稀疏模型的训练
+
+- 提供可视化的深度学习：通过Visual DL可以帮助开发者方便的观测训练整体趋势、数据样本质量和中间结果、参数分布和变化趋势、以及模型的结构，帮助开发者更便捷的完成编程过程
+
+提供基于PaddlePaddle的教育体系
+--------------------------------
+
+- 深度学习课程：百度与中国市场顶级的教育、培训机构共同开发了深度学习精品课程以及学习教材，帮助开发者从零掌握深度学习
+
+- 深度学习实训：对于目的是科研和学习的用户，PaddlePaddle提供了无需安装、线上运行的开发环境，并提供算法、算力、数据支持
+
+- 线下培训：提供丰富、高质量的线下教育活动，如青年教师培训、线下实战营、沙龙等多种形式的培训和交流
+
+
+提供基于PaddlePaddle的AI服务
+------------------------------
+
+- EasyDL：可以帮助零算法基础的企业快速完成一个深度学习任务，只需少量的数据即可得到优质的模型
+
+- AI市场：提供标准化的AI 能力、产品的交易机制，帮助企业快速找到所需，有效开展AI业务
+
+- 深度学习竞赛： PaddlePaddle汇聚顶尖深度学习开发者，企业可以发布自己的商业问题，通过竞赛方式快速找到最优的解决方案
+
+你对PaddlePaddle有任何的问题都可以通过以下方式联系到我们
+-----------------------------------------------------------
+
+- 学习/使用问题：可以在 `PaddlePaddle开源社区 <https://github.com/PaddlePaddle/Paddle/issues>`_，以及 `PaddlePaddle中文社区 <http://ai.baidu.com/forum/topic/list/168>`_ 向我们反馈
+
+- 对PaddlePaddle框架发展的建议：可发送邮件至Paddle-better@baidu.com
+
+我们期待与你一起打造世界顶级深度学习框架，共同推动AI技术的进步
+
+
+
+PaddlePaddle团队
diff --git a/source/advanced_usage/benchmark.rst b/doc/fluid/advanced_usage/benchmark.rst
similarity index 100%
rename from source/advanced_usage/benchmark.rst
rename to doc/fluid/advanced_usage/benchmark.rst
diff --git a/source/advanced_usage/deploy/anakin_arm_benchmark.md b/doc/fluid/advanced_usage/deploy/anakin_arm_benchmark.md
similarity index 100%
rename from source/advanced_usage/deploy/anakin_arm_benchmark.md
rename to doc/fluid/advanced_usage/deploy/anakin_arm_benchmark.md
diff --git a/doc/fluid/advanced_usage/deploy/anakin_example.md b/doc/fluid/advanced_usage/deploy/anakin_example.md
new file mode 100644
index 0000000000000000000000000000000000000000..e6b9e18fe2d64b3fda6382bb23a6a818a3e17fbe
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/anakin_example.md
@@ -0,0 +1,28 @@
+# Example
+Anakin目前只支持NCHW的格式
+示例文件在test/framework/net下
+
+## 在NV的GPU上运行CNN模型
+示例文件为example_nv_cnn_net.cpp，整体流程如下：
+- 将模型的path设置为anakin模型的路径，初始化NV平台的图对象。 anakin模型可以通过转换器转化caffe或fluid的模型得到
+- 根据模型设置网络图的输入尺寸，进行图优化
+- 根据优化后的网络图初始化网络执行器
+- 取出网络的输入tensor，将数据拷贝到输入tensor
+- 运行推导
+- 取出网络的输出tensor
+
+以NV平台为例演示Anakin框架的使用方法，注意编译时需要打开GPU编译开关
+
+## 在X86上运行RNN模型
+示例文件为example_x86_rnn_net.cpp
+整体流程与在NV的GPU上运行CNN模型相似，不同之处如下：
+- 使用X86标识初始化图对象和网络执行器对象
+- rnn模型的输入尺寸是可变的，初始化图时的输入维度是维度的最大值，输入维度N代表总的词的个数。还需要设置输入tensor的seq_offset来标示这些词是如何划分为句子的,如{0,5,12}表示共有12个词，其中第0到第4个词是第一句话，第5到第11个词是第二句话
+
+以X86平台为例演示Anakin框架的使用方法，注意编译时需要打开X86编译开关
+
+## 在NV的GPU上使用Anakin的线程池运行CNN模型
+示例文件为example_nv_cnn_net_multi_thread.cpp ，示例使用worker的同步预测接口
+整体流程与在NV的GPU上运行CNN模型相似，不同之处如下：
+- 用模型地址和线程池大小初始化worker对象
+- 将输入tensor注入任务队列,获得输出tensor
diff --git a/source/advanced_usage/deploy/anakin_gpu_benchmark.md b/doc/fluid/advanced_usage/deploy/anakin_gpu_benchmark.md
similarity index 100%
rename from source/advanced_usage/deploy/anakin_gpu_benchmark.md
rename to doc/fluid/advanced_usage/deploy/anakin_gpu_benchmark.md
diff --git a/doc/fluid/advanced_usage/deploy/anakin_tutorial.md b/doc/fluid/advanced_usage/deploy/anakin_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..5efbc89abd469871b318c306e8cb03dd95f0c85b
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/anakin_tutorial.md
@@ -0,0 +1,639 @@
+# Anakin 使用教程 ##
+
+本教程将会简略的介绍Anakin的工作原理，一些基本的Anakin API，以及如何调用这些API。
+  
+## 内容 ###
+
+- [Anakin的工作原理](#principle)
+- [Anakin APIs](#api)
+- [示例代码](#example)
+
+## <span id = 'principle'> Anakin的工作原理</span> ###
+
+![Anakin_principle](../pics/anakin_fm_ch.png)
+
+用Anakin来进行前向计算主要分为三个步骤：
+
+- 将外部模型通过[Anakin Parser](Converter_ch.md)解析为Anakin模型  
+  在使用Anakin之前，用户必须将所有其他模型转换成Anakin模型，我们提供了转换脚本，用户可通过[Anakin Parser](Converter_ch.md)进行模型转换。
+- 生成Anakin计算图
+  加载Anakin模型生成原始计算图，然后需要对原始计算图进行优化。你只需要调用相应的API优化即可。
+- 执行计算图  
+  Anakin会选择不同硬件平台执行计算图。
+
+
+## <span id ='api'>Anakin APIs </span> ###
+### Tensor ####
+
+`Tensor`提供基础的数据操作和管理，为ops提供统一的数据接口。`Tensor`包含以下几个属性：   
+
+- Buffer  
+   数据存储区
+- Shape  
+   数据的维度信息
+- Event  
+   用于异步计算的同步
+
+ `Tensor` 类包含三个`Shape`对象， 分别是`_shape`, `_valid_shape`和 `_offset`。 `_shape`为`tensor`真正空间信息，`_valid_shape`表示当前`tensor`使用的空间信息， `_offset`表示当前`tensor`数据指针相对于真正数据空间的偏移信息。 `Tensor`的不同维度分别与数学中的向量、矩阵等相对应，如下表所示。
+
+
+Dimensions | Math entity |
+ :----: | :----:
+1 | vector
+2 | matrix
+3 | 3-tensor
+n | n-tensor
+
+#### 声明tensor对象
+
+`Tensor`接受三个模板参数:
+
+
+```c++
+ template<typename TargetType, DataType datatype, typename LayOutType = NCHW>
+ class Tensor .../* Inherit other class */{
+  //some implements
+  ...
+ };
+```
+
+TargetType是平台类型，如X86，GPU等等，在Anakin内部有相应的标识与之对应；datatype是普通的数据类型，在Anakin内部也有相应的标志与之对应；[LayOutType](#layout)是数据分布类型，如batch x channel x height x width [NxCxHxW], 在Anakin内部用一个struct来标识。 Anakin中数据类型与基本数据类型的对应如下:
+
+1. <span id='target'>TargetType</span>
+
+ Anakin TargetType | platform
+  :----: | :----:|
+  NV | NVIDIA GPU
+  ARM | ARM
+  AMD | AMD GPU
+  X86 | X86
+  NVHX86 | NVIDIA GPU with Pinned Memory
+
+2. <span id='datatype'>DataType</span>
+
+Anakin DataType | C++ | Description 
+:---: | :---: | :---: |
+AK_HALF | short | fp16
+AK_FLOAT | float | fp32
+AK_DOUBLE | double | fp64
+AK_INT8 | char | int8
+AK_INT16 | short | int16
+AK_INT32 | int | int32
+AK_INT64 | long | int64
+AK_UINT8 | unsigned char | uint8
+AK_UINT16 | unsigned short | uint16
+AK_UINT32 | unsigned int | uint32
+AK_STRING | std::string | /
+AK_BOOL | bool | /
+AK_SHAPE | / | Anakin Shape 
+AK_TENSOR | / | Anakin Tensor 
+
+
+3. <span id = 'layout'>LayOutType </span>
+
+Anakin LayOutType ( Tensor LayOut ) | Tensor Dimension | Tensor Support | Op Support
+:---: | :---: | :---: | :---: |
+W | 1-D | YES | NO
+HW | 2-D | YES | NO
+WH | 2-D | YES | NO
+NW | 2-D | YES | YES
+NHW | 3-D | YES |YES
+NCHW ( default ) | 4-D | YES | YES
+NHWC | 4-D | YES | NO
+NCHW_C4 | 5-D | YES | YES
+
+
+理论上，Anakin支持申明1维以上的tensor，但是对于Anakin中的Op来说，只支持NW、NHW、NCHW、NCHW_C4这四种LayOut，其中NCHW是默认的LayOutType，NCHW_C4是专门针对于int8这种数据类型的。
+
+
+例子
+
+> 下面的代码将展示如何使用tensor， 我们建议先看看这些示例。
+
+> 要想获得更多关于tensor的信息， 请参考 *source_path/core/tensor.h*
+
+> 1. 使用shape对象初始化tensor
+``` c++  
+  //create a null tensor. A null tensor holds for nothing.
+  //tensor's buffer  is resident at CPU and its datatype is AK_FLOAT.
+  //tensor's Layout is NCHW(default)
+   Tensor<X86, AK_FLOAT> mytensor;
+
+   //1. using shape object to create a tensor.
+   Shape shape1(NUM); //1-D shape. NUM is the number of dimention.
+   Tensor<X86, AK_FLOAT, W> mytensor1(shape1); //1-D tensor.
+
+  // A 4-D shape
+   Shape shape2(N, C, H, W); // batch x channel x height x width
+```
+
+>`注意：Shape的维度必须和tensor的`[LayoutType](#layout)`相同，比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW，否则会出错。如下列代码所示`  
+
+
+```c++
+   // A 4-D tensor.
+   Tensor<X86, AK_FLOAT> mytensor2(shape2);  //right
+
+   //A 4-D tensor which is resident at GPU and its datatype is AK_INT8
+   Tensor<NV, AK_INT8> mytensor3(shape2);   //right
+   
+   Tensor<X86, AK_FLOAT, NHW> mytensor4(shape2); //wrong!! shape's dimetion must be equal to tensor's Layout.
+   Tensor<NV, AK_FLOAT, NCHW_C4> mytensor5(shape2); //wrong!!!!
+
+```
+
+> 2. 使用现有的数据和shape初始化tensor
+
+```c++
+
+   /**
+   *  A construtor of Tensor.
+   *  data_ptr is a pointer to any data type of data
+   *  TargetType is type of a platform [Anakin TargetType]
+   *  id : device id
+   *  shape: a Anakin shape
+   */
+   Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape);
+
+   //using existing data feed to a tensor
+   Tensor<X86, AK_FLOAT> mytensor(data_ptr, TargetType, device_id, shape); //shape must has dimention (N, C, H, W).
+
+```
+
+> 3. 使用tensor初始化tensor
+
+```c++
+   Tensor<NV, AK_FLOAT> tensor(exist_tensor);
+```
+
+
+> 提示： 你可以用` typedef Tensor<X86, AK_FLOAT> Tensor4d_X86 `方便定义tensor
+
+
+#### 填充tensor数据区
+
+
+填充数据区得看你申明tensor的方式， 下面展示了如何填充tensor的数据区。
+
+```c++
+首先来看看tensor的四种声明方式：
+
+1. Tensor<X86, AK_FLOAT> mytensor;
+2. Tensor<X86, AK_FLOAT, W> mytensor1(shape1);
+3. Tensor<X86, AK_FLOAT> mytensor(data_ptr, TargetType, device_id, shape);
+4. Tensor<NV, AK_FLOAT> tensor(exist_tensor);
+
+
+相关的声明方式的数据填充方法如下：
+
+1：声明一个空的tensor，此时没有为其分配内存，所以，我们需要手动的为其分配内存。
+            
+            //parama shape
+            mytensor.re_alloc(Shape shape); 
+
+            //Get writable pointer to mytensor.
+            //parama index (int): where you start to write.
+            //Dtype is your data type such int, float or double.
+            Dtype *p = mytensor.mutable_data(index/*=0*/);
+            //write data to mytensor
+            for(int i = 0; i < mytensor.size(); i++){
+              p[i] = 1.0f;
+            }
+            //do something ...
+
+2: 这种声明方式会自动分配内存 
+
+          //Get writable pointer to mytensor.
+          //parama index (int): where you start to write.
+          //Dtype is your data type such int, float or double.
+          Dtype *p = mytensor1.mutable_data(index/*=0*/);
+          //write data to mytensor
+          for(int i = 0; i < mytensor.size(); i++){
+            p[i] = 1.0f;
+          }
+          //do something ...
+
+ 
+3：在该种声明方式中，我们仍不需要手动为其分配内存。但在构造函数内部是否为其分配内存，得依情况而定。如果data_ptr和申明的
+tensor都在都一个目标平台上，那么该tensor就会与data_ptr共享内存空间，相反，如果他们不在同一个平台上（如data_ptr在X86上，而
+tensor在GPU上），那么此时tensor就会开辟一个新的内存空间，并将data_ptr所指向的数据拷贝到tensor的buffer中。
+
+          //Get writable pointer to mytensor.
+          //parama index (int): where you start to write.
+          //Dtype is your data type such int, float or double.
+          Dtype *p = mytensor.mutable_data(index/*=0*/);
+          //write data to mytensor
+          for(int i = 0; i < mytensor.size(); i++){
+            p[i] = 1.0f;
+          }
+          //do something ...
+
+4：该种方式仍不需要手动分配内存
+
+          //Get writable pointer to mytensor.
+          //parama index (int): where you start to write.
+          //Dtype is your data type such int, float or double.
+          Dtype *p = mytensor.mutable_data(index/*=0*/);
+          //write data to mytensor
+          for(int i = 0; i < mytensor.size(); i++){
+            p[i] = 1.0f;
+          }
+          //do something ...
+
+
+另外，你还可以获取一个tensor的可读指针，示例如下：
+        //Get read-only pointer to mytensor.
+        //parama index (int): where you start to read.
+        //Dtype is your data type such int, float or double.
+         Dtype *p = mytensor.data(index/*=0*/);
+        //do something ...
+```
+
+如果想更详细的了解tensor，请查阅*source_path/saber/core/tensor.h*
+
+#### 获取tensor的shape
+
+```c++
+//some declarations
+// ...
+Shape shape = mytensor.shape();
+
+//Get a first dimetion size of tesor, if it has.
+int d1 = shape[0];
+
+//Get a second dimention size of tensor, if it has.
+int d2 = shape[1];
+
+...
+
+//Get a n-th dimention size of tensor, if it has.
+int dn = shape[n-1];
+
+
+//Get a tensor's dimention
+int dims = mytensor.dims();
+
+//Get the size of tensor.
+//size = d1 x d2 x ... x dn.
+int size = mytensor.size();
+
+//Get the size of tensor at interval [Di, Dj)
+// form i-th dimention to j-th dimention, but not including the j-th dimention.
+// which means di x (di+1) x ... x (dj -1)
+int size = mytensor.count(start, end);
+```
+
+#### 设置tensor的shape
+
+我们可以用tensor的成员函数set_shape来设置tensor的shape。 下面是set_shape的定义
+
+
+```c++
+/**
+ * \brief set a tensor's shape
+ * \param valid_shape [a Shape object]
+ * \param shape [a Shape object]
+ * \param offset [a Shape object]
+ * \return the status of this operation, that means whether it success * or not.
+ */
+SaberStatus set_shape(Shape valid_shape, Shape shape = Shape::zero(TensorAPI::layout_dims::value), Shape offset = Shape::minusone(TensorAPI::layout_dims::value)); 
+```
+
+这个成员函数只设置tensor的shape。这些shape对象(valid_shape, shape, offset)的[LayOutType](#layout)必须和当前的tensor的相应三个shape对象的LayOutType相同，如果不同就会出错，返回SaberInvalidValue。 如果相同，那么将成功设置tensor的shape。
+
+```c++
+
+// some declarations
+// ...
+//valid_shape, shape , offset are Shape object;
+//All these Shape object's LayOutType must be equal to mytensor's.
+mytensor.set_shape(valid_shape, shape, offset);
+
+```
+
+#### 重置 tensor的shape
+
+```c++
+//some declarations
+Shape shape, valid_shape, offset;
+
+//do some initializations
+... 
+mytensor.reshape(valid_shape, shape, offset);
+```
+
+注意： Reshape操作仍然需要shape的[LayOutType](#layout) 与tensor的相同
+
+
+### Graph ###
+
+`Graph`类负责加载Anakin模型生成计算图、对图进行优化、存储模型等操作。
+
+#### 图的声明
+
+与`Tensor`一样，graph也接受三个模板参数。
+
+```c++
+
+template<typename TargetType, DataType Dtype, Precision Ptype>
+class Graph ... /* inherit other class*/{
+  
+  //some implements
+  ...
+
+};
+```
+
+前面已经介绍过[TargetType](#target)和[DataType](#datatype)是Anakin内部自定义数据类型。[TargetType](#target)表示平台类型 (如NV、X86), [DataType](#datatype)是Anakin基本数据类型与C++/C中的基本数据类型相对应。 [Precision](#precision)为op所支持的精度类型, 稍后我们再介绍它。
+
+
+```c++
+
+//Create a empty graph object.
+Graph graph = Graph<NV, AK_FLOAT, Precision::FP32> tmp();
+
+//Create a pointer to a empty graph.
+Graph *graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+
+//Create a pointer to a empty graph.
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+
+```
+
+#### 加载 Anakin 模型
+
+```c++
+//some declarations
+...
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+std::string model_path = "the/path/to/where/your/models/are";
+const char *model_path1 = "the/path/to/where/your/models/are";
+
+//Loading Anakin model to generate a compute graph.
+auto status = graph->load(model_path);
+
+//Or this way.
+auto status = graph->load(model_path1);
+//Check whether load operation success.
+if(!status){
+  std::cout << "error" << endl;
+  //do something...
+}
+
+```
+
+#### 优化计算图
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+//According to the ops of loaded graph, optimize compute graph.
+graph->Optimize();
+
+```
+
+> 注意： 第一次加载原始图，必须要优化。
+
+#### 保存模型
+
+你可以在任何时候保存模型， 特别的， 你可以保存一个优化的模型，这样，下次再加载模型时，就不必进行优化操作。
+
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+// save a model
+//save_model_path: the path to where your model is.
+auto status = graph->save(save_model_path);
+
+//Checking
+if(!status){
+  cout << "error" << endl;
+  //do somethin...
+}
+```
+
+#### 重新设置计算图里的tensor的shape
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+vector<int> shape{10, 256, 256, 10};
+//input_name : std::string.
+//Reshape a tensor named input_name.
+graph->Reshape(input_name, shape);//Note: shape is a vector, not a Shape object.
+```
+
+#### 设置 batch size
+
+`Graph` 支持重新设置batch size的大小。
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+//input_name : std::string.
+//Reset a tensor named input_name.
+int new_batch_size = 4;
+graph->ResetBatchSize(input_name, new_batch_size);
+```
+
+###  Net ###
+
+
+`Net` 是计算图的执行器。你可以通过Net对象获得输入和输出
+#### Creating a graph executor
+
+`Net`接受四个模板参数。  
+
+
+```c++
+template<typename TargetType, DataType Dtype, Precision PType, OpRunType RunType = OpRunType::ASYNC>
+class Net{
+  //some implements
+  ...
+
+};
+```
+由于有些Op可能支持多种精度，我们可以通过Precision来指定。OpRunType表示同步或异步类型，异步是默认类型。OpRunType::SYNC表示同步，在GPU上只有单个流；OpRunType::ASYNC表示异步，在GPU上有多个流并以异步方式执行。实际上，Precision和OpRunType都是enum class, 详细设计请参考*source_root/framework/core/types.h*.
+
+
+1. <span id = 'precision'> Precision </span>
+
+Precision | Op support
+:---: | :---:
+Precision::INT4 | NO
+Precision::INT8 | NO
+Precision::FP16 | NO
+Precision::FP32 | YES
+Precision::FP64 | NO
+
+现在Op的精度只支持FP32， 但在将来我们会支持剩下的Precision.
+
+
+
+2. OpRunType
+
+OpRunType | Sync/Async |Description
+:---: | :---: | :---:
+OpRunType::SYNC | Synchronization | single-stream on GPU
+OpRunType::ASYNC | Asynchronization | multi-stream on GPU
+
+用graph对象创建一个执行器。
+```c++
+//some declarations
+...
+//Create a pointer to a graph.
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+//do something...
+...
+
+//create a executor
+Net<NV, AK_FLOAT, Precision::FP32> executor(*graph);
+
+```
+
+#### 获取输入输出tensor
+
+
+获取输入输出tensor，并填充输入tensor的buffer。如果想要获取输入和输出tensor，那么必须指定输入的名字，如"input_0", "input_1", "input_2", ..., 必须传入如上字符串才能够获得输入tensor。另外，如果想知道input_i对应哪个输入，你需要去dash board查看，如何使用dash board请看[Anakin Parser](Converter_ch.md)。请看如下示例代码
+
+```c++
+//some declarations
+...
+
+//create a executor
+//TargetType is NV [NVIDIA GPU]
+Net<NV, AK_FLOAT, Precision::FP32> executor(*graph);
+
+//Get the first input tensor.
+//The following tensors(tensor_in0, tensor_in2 ...) are resident at GPU.
+//Note: Member function get_in returns an pointer to tensor.
+Tensor<NV, AK_FLOAT>* tensor_in0 = executor.get_in("input_0");
+
+//If you have multiple input tensors
+//You just type this code below.
+Tensor<NV, AK_FLOAT>* tensor_in1 = executor.get_in("input_1");
+...
+auto tensor_inn = executor.get_in("input_n");
+```
+
+当得到输入tensor之后，就可以填充它的数据区了。
+
+```c++
+//This tensor is resident at GPU.
+auto tensor_d_in = executor.get_in("input_0");
+
+//If we want to feed above tensor, we must feed the tensor which is resident at host. And then copy the host tensor to the device's one.
+
+//using Tensor4d = Tensor<Ttype, Dtype>;
+Tensor4d<X86, AK_FLOAT> tensor_h_in; //host tensor;
+//Tensor<X86, AK_FLOAT> tensor_h_in; 
+
+//Allocate memory for host tensor.
+tensor_h_in.re_alloc(tensor_d_in->valid_shape());
+//Get a writable pointer to tensor.
+float *h_data = tensor_h_in.mutable_data();
+
+//Feed your tensor.
+/** example
+for(int i = 0; i < tensor_h_in.size(); i++){
+  h_data[i] = 1.0f;
+}
+*/
+//Copy host tensor's data to device tensor.
+tensor_d_in->copy_from(tensor_h_in);
+
+// And then
+```
+
+
+类似的，我们可以利用成员函数get_out来获得输出tensor。但与获得输入tensor不同的是， 我们需要指定输出tensor结点的名字，这个可以从dash board中看到，请从[Anakin Parser](Converter_ch.md)中查看dash board的使用方法。假如有个输出结点叫pred_out, 那么我们可以通过如下代码获得相应的输出tensor：
+```c++
+//Note: this tensor are resident at GPU.
+Tensor<NV, AK_FLOAT>* tensor_out_d = executor.get_out("pred_out");
+
+```
+
+
+#### Executing graph
+
+
+当一切准备就绪后，我们就可以执行真正的计算了！
+```c++
+executor.prediction();
+```
+ 
+## <span id='example'> 示例代码 </span> ##
+
+下面的例子展示了如何调用Anakin。
+
+在这儿之前， 请确保你已经有了Anakin模型。如果还没有，那么请使用[Anakin Parser](Converter_ch.md)转换你的模型。
+
+### Single-thread
+
+单线程例子在 *source_root/test/framework/net/net_exec_test.cpp*
+
+```c++
+
+std::string model_path = "your_Anakin_models/xxxxx.anakin.bin";
+// Create an empty graph object.
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+// Load Anakin model.
+auto status = graph->load(model_path);
+if(!status ) {
+    LOG(FATAL) << " [ERROR] " << status.info();
+}
+// Reshape
+graph->Reshape("input_0", {10, 384, 960, 10});
+// You must optimize graph for the first time.
+graph->Optimize();
+// Create a executer.
+Net<NV, AK_FLOAT, Precision::FP32> net_executer(*graph);
+
+//Get your input tensors through some specific string such as "input_0", "input_1", and 
+//so on. 
+//And then, feed the input tensor.
+//If you don't know Which input do these specific string ("input_0", "input_1") correspond with, you can launch dash board to find out.
+auto d_tensor_in_p = net_executer.get_in("input_0");
+Tensor4d<X86, AK_FLOAT> h_tensor_in;
+auto valid_shape_in = d_tensor_in_p->valid_shape();
+for (int i=0; i<valid_shape_in.size(); i++) {
+    LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i]; //see tensor's dimentions
+}
+h_tensor_in.re_alloc(valid_shape_in);
+float* h_data = h_tensor_in.mutable_data();
+for (int i=0; i<h_tensor_in.size(); i++) {
+    h_data[i] = 1.0f;
+}
+d_tensor_in_p->copy_from(h_tensor_in);
+
+//Do inference.
+net_executer.prediction();
+
+//Get result tensor through the name of output node.
+//And also, you need to see the dash board again to find out how many output nodes are and remember their name.
+
+//For example, you've got an output node named obj_pred_out
+//Then, you can get an output tensor.
+auto d_tensor_out_0_p = net_executer.get_out("obj_pred_out"); //get_out returns a pointer to output tensor.
+auto d_tensor_out_1_p = net_executer.get_out("lc_pred_out"); //get_out returns a pointer to output tensor.
+//......
+// do something else ...
+//...
+//save model.
+//You might not optimize the graph when you load the saved model again.
+std::string save_model_path = model_path + std::string(".saved");
+auto status = graph->save(save_model_path);
+if (!status ) {
+    LOG(FATAL) << " [ERROR] " << status.info();
+}
+
+```
diff --git a/source/advanced_usage/deploy/build_and_install_lib_cn.rst b/doc/fluid/advanced_usage/deploy/build_and_install_lib_cn.rst
similarity index 100%
rename from source/advanced_usage/deploy/build_and_install_lib_cn.rst
rename to doc/fluid/advanced_usage/deploy/build_and_install_lib_cn.rst
diff --git a/doc/fluid/advanced_usage/deploy/convert_paddle_to_anakin.md b/doc/fluid/advanced_usage/deploy/convert_paddle_to_anakin.md
new file mode 100644
index 0000000000000000000000000000000000000000..56ca582b2b47f404ede777712830731ea7f4e9b5
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/convert_paddle_to_anakin.md
@@ -0,0 +1,73 @@
+# 模型转换指南
+
+Anakin 支持不同框架的模型预测。但由于格式的差别，Anakin 需要您预先转换模型。本文档介绍如何转换模型。
+
+## 简介
+
+Anakin 模型转换器输入支持 Caffe 和 Fluid 两种格式的预测模型，模型包含网络结构（model 或 prototxt）和权重参数（param 或 caffemodel）。   
+
+模型转换的输出是一个 bin 文件，它作为 Anakin 框架的 graph 参数导入。   
+
+您还可以使用模型转换器的 launch board 功能生成网络结构的 HTML 预览。   
+
+
+## 系统要求
+
+- python 2.7+
+- pyyaml
+- flask
+- protobuf 3.5+
+
+
+## 用法
+
+### 1、环境
+转换器所需的依赖标注于 *系统要求* 一节。
+
+### 2、配置
+您需要对 *config.yaml* 文件进行修改以告知您的需求。工程中给出了 *config.yaml* 示例，下面作进一步说明。
+
+#### config.yaml
+```yaml
+OPTIONS:
+    Framework: CAFFE       # 依框架类型填写 CAFFE 或 FLUID
+    SavePath: ./output     # 转换结束后模型的保存位置
+    ResultName: googlenet  # 输出模型的名字
+    Config:
+        LaunchBoard: ON    # 是否生成网络结构预览页面
+        Server:
+            ip: 0.0.0.0
+            port: 8888     # 从一个可用端口访问预览页面
+        OptimizedGraph:    # 当您使用了 Anakin 框架的 Optimized 功能时，才应该打开此项
+            enable: OFF
+            path: /path/to/anakin_optimized_anakin_model/googlenet.anakin.bin.saved
+    LOGGER:
+        LogToPath: ./log/  # 生成日志的路径
+        WithColor: ON
+
+TARGET:
+    CAFFE:
+        # 当 Framework 为 CAFFE 时需填写
+        ProtoPaths:
+            - /path/to/caffe/src/caffe/proto/caffe.proto
+        PrototxtPath: /path/to/your/googlenet.prototxt
+        ModelPath: /path/to/your/googlenet.caffemodel
+
+    FLUID:
+        # 当 Framework 为 FLUID 时需填写
+        Debug: NULL
+        ProtoPaths:
+            - /
+        PrototxtPath: /path/to/fluid/inference_model
+        ModelPath: /path/to/fluid/inference_model
+	# ...
+```
+
+### 3、转换
+在完成配置文件的修改后，您只需执行 ```python converter.py``` 就可以进行模型转换了。
+
+
+### 4、预览
+最后一步，就是在浏览器中查看令人振奋的转换结果！网址是在 *config.yaml* 中配置的，例如 http://0.0.0.0:8888 。
+
+> 注意：若您使用了默认的 IP 地址 0.0.0.0，请在预览时使用真实的服务器地址 real_ip:port 替代它。
diff --git a/doc/fluid/advanced_usage/deploy/how_to_add_anakin_op.md b/doc/fluid/advanced_usage/deploy/how_to_add_anakin_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..f2783eb9f591a31443f2a692ce0eb1bcc9b1063a
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/how_to_add_anakin_op.md
@@ -0,0 +1,405 @@
+# 如何增加新的Operator
+
+## 基本概念
+
+简单介绍下几个同Operator相关的基本概念，详情请参考设计文档。
+
+```framework```: 上层的逻辑代码，负责从parser中获取参数及weights，添加op时主要修改framework/operator目录下的内容。
+
+```saber```: 底层的实现代码，Anakin通过saber封装了不同的backends，不同的实现(impl)分别特化出自己的实现，外层framework通过不同的template进入各自的impl完成调用。各个op的parameter放在saber/saber_funcs_param.h文件中，增加op主要修改saber/funcs下的内容。
+
+saber的文件结构：
+* saber/funcs下的是各个funcs的外部接口，这一层的op与具体的设备实现无关，只与各op完成的功能有关。由于跟实现(impl)无关，本层文件名均不带impl。
+* saber/funcs/impl下是各个op的impl声明，特定设备需要完成该层声明的特化版本，如saber/funcs/impl/x86实现了上一层impl声明的x86特化版本，saber/funcs/impl/cuda实现了上一层impl声明的NV特化版本。当增加新的backends时需要特化出新的实现。本层代码同实现相关，均带有```impl_```前缀。
+* saber/funcs/impl/cuda/base/cuda_c内有cuda```.cu```扩展名的文件，添加cuda的kernel需要在该文件目录下添加。
+* saber/funcs/impl/cuda/base/sass 内有不同架构的汇编代码编译的静态库。
+
+### 涉及到的基类及各个类之间的关系
+
+简单介绍相关的基类
+
+* ```anakin::Operator```: framework的operator基类，位于framework/core/operator/operator.h
+
+* ```anakin::saber::BaseFunc```: saber对外的op接口基类，提供统一的对外接口，位于saber/funcs/base.h。BaseFunc的```compute_output_shape```接口只根据input的shape和param的参数计算输出的shape，并通过```tensor```的```set_shape```接口(只设置shape，不分配空间)设置到output中。```operator()```接口为各个op的计算接口。
+
+* ```anakin::saber::ImplBase```: saber设备实现的op的接口，所有设备相关实现的基类。位于saber/funcs/impl/impl_base.h。实现版本中这里分为两类，一类以```vender_```为前缀，带有```vender_```的代码意为使用第三方库来实现该op，如cudnn的conv，或mkl的conv等等，这类op的性能我们难以调优，因此单独列为一类。另一类是带有源码的saber实现，这些实现都以```saber_```为前缀，此类实现带有源码，能够通过后续优化不断提升性能，实现起名时需要注意这一点。
+
+## 添加operator
+
+添加一个新的op需要以下几步：
+
+1. 添加saber的param
+2. 定义saber的Operator类
+3. 定义新的impl声明
+4. 完成新的impl实现
+5. 增加framework的实现或特化
+
+接下来就针对这几步，以一个简单例子为例介绍实现。
+
+例如我们要添加新的Mul op。给出计算公式如下：$$Out = alpha \cdot X * Y$$
+
+### 为operator增加param
+
+涉及到的文件：```saber/saber_funcs_param.h```。如果之前已经存在需要添加的op的param，这一步可以跳过。
+这里```XXXParam```是一个```struct```。包含一个无参数的构造函数，含参数的构造函数，复制构造函数，```operator=()```及```operator==()```。
+```
+template <typename opTensor> // 能够获得target, datatype, layout
+struct MulParam{
+  MulParam()
+    : alpha(0)
+  {}
+  MulParam(float alpha_in)
+    : alpha(alpha_in)
+  {}
+  MulParam(const MulParam& right)
+    : alpha(right.alpha)
+  {}
+  MulParam &operator=(const MulParam &right) {
+    alpha = right.alpha;
+  }
+  bool operator==(const MulParam &right) {
+    return alpha == right.alpha;
+  }
+  float alpha;
+};
+```
+
+### 定义Operator类
+涉及到的文件:```saber/funcs/mul.h```。如果之前定义过该op的类，这里需要修改输入的impl定义头文件。
+下面给出一个相对完整的定义结构供参考。
+```
+//不同的设备需要包含对应的operator实现.[详见](#impl)
+#ifdef NVIDIA_GPU
+#include "saber/funcs/impl/cuda/saber_mul.h"
+#include "saber/funcs/impl/cuda/vender_mul.h"
+#endif
+//如果一个设备现在还没有对应的operator实现，需要包含声明。[详见](#declare)
+#ifdef USE_X86_PLACE
+#include "saber/funcs/impl/impl_mul.h"
+#endif
+namespace anakin {
+namespace saber {
+template<typename TargetType,
+        DataType OpDtype,
+        DataType inDtype = AK_FLOAT,
+        DataType outDtype = AK_FLOAT,
+        typename LayOutType_op = NCHW,
+        typename LayOutType_in = NCHW,
+        typename LayOutType_out = NCHW>
+class Mul : public BaseFunc<
+        Tensor<TargetType, inDtype, LayOutType_in>,
+        Tensor<TargetType, outDtype, LayOutType_out>,
+        Tensor<TargetType, OpDtype, LayOutType_op>,
+        ImplBase, MulParam> {
+public:
+    using BaseFunc<
+            Tensor<TargetType, inDtype, LayOutType_in>,
+            Tensor<TargetType, outDtype, LayOutType_out>,
+            Tensor<TargetType, OpDtype, LayOutType_op>,
+            ImplBase, MulParam>::BaseFunc;
+    Mul() = default;
+    typedef Tensor<TargetType, inDtype, LayOutType_in> InDataTensor;
+    typedef Tensor<TargetType, outDtype, LayOutType_out> OutDataTensor;
+    typedef Tensor<TargetType, OpDtype, LayOutType_op> OpTensor;
+    typedef MulParam<OpTensor> Param_t;
+    typedef std::vector<InDataTensor *> Input_v;
+    typedef std::vector<OutDataTensor *> Output_v;
+    typedef std::vector<Shape> Shape_v;
+
+    virtual SaberStatus compute_output_shape(const Input_v &input,
+                                             Output_v &output, Param_t &param) override {
+        //计算输出的shape，
+        Shape output_shape = (input[0]->valid_shape());
+        /* code */
+        return output[0]->set_shape(output_shape);
+    }
+    virtual SaberStatus init_impl(ImplEnum implenum) override {
+      // 不同设备均使用此init_impl, 此接口创建对应impl的实现。
+      switch (implenum) {
+            case VENDER_IMPL:
+                this->_impl.push_back(new VenderMul <TargetType,
+                OpDtype, inDtype, outDtype,
+                LayOutType_op, LayOutType_in, LayOutType_out>);
+                return SaberSuccess;
+            case SABER_IMPL:
+                this->_impl.push_back(new SaberMul <TargetType,
+                OpDtype, inDtype, outDtype,
+                LayOutType_op, LayOutType_in, LayOutType_out>);
+                return SaberSuccess;
+            default:
+                return SaberUnImplError;
+        }
+    }
+private:
+    virtual void pick_best_static() override {
+        if (true) // some condition?
+            this->_best_impl = this->_impl[0];
+    }
+    virtual void pick_best_specify(ImplEnum implenum) override {
+        this->_best_impl = this->_impl[0];
+    }
+};
+} // namespace saber
+} // namespace anakin
+```
+
+### 为operator增加新的impl<span id="declare">声明</span>
+
+涉及的文件:```saber/funcs/impl/impl_mul.h```。不同的设备都特化同一个声明，特化版本放在对应的文件夹下，这里的声明就是给出所有设备的统一声明。下面给出一个参考。
+```
+#include "saber/funcs/impl/impl_macro.h"
+namespace anakin{
+namespace saber{
+DEFINE_OP_CLASS(Mul, MulParam); // 第一个参数是op的名字，第二个是对应param的名字
+}
+}
+```
+
+### 完成新的operator特定后端<span id="impl">实现</span>
+
+涉及的文件:```saber/funcs/impl/xxx/vender_mul.h```或```saber/funcs/impl/xxx/saber_mul.h```
+这里```xxx```指代特定的一种设备。```vender```指的是使用第三方库实现的op，```saber```指的是源码实现的op。这里以cuda的vender实现为例，简单介绍一下特化出的函数的几个基本接口。
+
+```
+// include 对应的声明
+#include "saber/funcs/impl/impl_mul.h"
+
+namespace anakin{
+namespace saber{
+template <DataType OpDtype,
+    DataType inDtype,
+    DataType outDtype,
+    typename LayOutType_op,
+    typename LayOutType_in,
+    typename LayOutType_out>
+class VenderMul<NV, //偏特化出需要的后端。
+    OpDtype, inDtype, outDtype,
+    LayOutType_op, LayOutType_in, LayOutType_out> :
+    public ImplBase<
+        Tensor<NV, inDtype, LayOutType_in>,
+        Tensor<NV, outDtype, LayOutType_out>,
+        Tensor<NV, OpDtype, LayOutType_op>,
+        MulParam<Tensor<NV, OpDtype, LayOutType_op> > >
+{
+public:
+    typedef Tensor<NV, inDtype, LayOutType_in> DataTensor_in;
+    typedef Tensor<NV, outDtype, LayOutType_out> DataTensor_out;
+    typedef Tensor<NV, OpDtype, LayOutType_op> OpTensor;
+    typedef typename DataTensor_in::Dtype InDataType;
+    typedef typename DataTensor_out::Dtype OutDataType;
+    typedef typename OpTensor::Dtype OpDataType;
+    VenderMul(){}
+    ~VenderMul() {}
+
+    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            MulParam<OpTensor>& param, Context<NV>& ctx) {
+        this->_ctx = ctx;
+        create(inputs, outputs, param, ctx);
+    }
+
+    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
+                            std::vector<DataTensor_out *>& outputs,
+                            MulParam<OpTensor>& param, Context<NV>& ctx) {
+        // set内部参数
+    }
+
+    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
+                          std::vector<DataTensor_out*>& outputs,
+                        MulParam<OpTensor>& param) {
+        // dispatch kernel.
+    }
+
+private:
+};
+}
+}
+```
+```init```和```create```的区别：```init```接口是第一次初始化op的时候进入的接口，此函数只在第一次初始化op时调用，这个接口一般放一些只需要执行一次的代码，如malloc或者create之类的函数。```create```函数除了第一次init执行外，在输入发生变化或者param发生变化时会再次触发，create一般放置set函数，设置内部变量，当input发生变化时这里执行一些同input或weights直接相关的代码。但create因为触发位置在网络内，如果```create```函数执行了一些严重耗时的操作，这里会拖慢整个op的执行时间，需要慎重选择操作放置的位置。
+### 添加framework的特化
+
+涉及的文件:```framework/operators/mul.h```和```framework/operators/mul.cpp```。
+这里简单介绍下如何添加或修改framework内的operator
+
+```
+#include "framework/core/base.h"
+#include "framework/core/data_types.h"
+#include "framework/core/operator/operator.h"
+#include "utils/logger/logger.h"
+#include "saber/funcs/mul.h" // 需要包含对应的saber头文件
+namespace anakin {
+namespace ops {
+template<typename Ttype, DataType Dtype, Precision Ptype>
+class MulHelper;
+
+template<typename Ttype, DataType Dtype, Precision Ptype>
+class Mul : public Operator<Ttype, Dtype, Ptype> {
+public:
+    Mul() {}
+    /// forward impl
+    virtual void operator() (OpContext<Ttype> &ctx,
+                             const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+                             std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
+        LOG(ERROR) << "Not Impl Yet Operator power<TargetType:"<<"unknown"<<","
+                   <<type_id<typename DataTypeWarpper<Dtype>::type>().type_info()<<">";
+    }
+    friend class MulHelper<Ttype, Dtype, Ptype>;
+};
+template<typename Ttype, DataType Dtype, Precision Ptype>
+class MulHelper : public OperatorHelper<Ttype, Dtype, Ptype> {
+public:
+    MulHelper() = default;
+    ~MulHelper();
+    Status InitParam() override;
+
+    Status Init(OpContext<Ttype> &ctx,
+                const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+                std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) override;
+    Status InferShape(const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+                      std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) override;
+
+public:
+    saber::MulParam<Tensor4d<Ttype, Dtype>> _param_mul;
+    saber::Mul<Ttype, Dtype> _funcs_mul;
+};
+}
+} /* namespace anakin */
+```
+对应的```.cpp```文件如下：
+```
+#include "framework/operators/mul.h"
+
+namespace anakin {
+namespace ops {
+
+#ifdef USE_CUDA
+template<>
+void Mul<NV, AK_FLOAT, Precision::FP32>::operator()(
+    OpContext<NV>& ctx,
+    const std::vector<Tensor4dPtr<NV, AK_FLOAT> >& ins,
+    std::vector<Tensor4dPtr<NV, AK_FLOAT> >& outs) {
+    auto* impl =
+        static_cast<MulHelper<NV, AK_FLOAT, Precision::FP32>*>(this->_helper);
+    auto& param =
+        static_cast<MulHelper<NV, AK_FLOAT, Precision::FP32>*>(this->_helper)->_param_mul;
+    impl->_funcs_mul(ins, outs, param, ctx);
+}
+#endif
+
+template<typename Ttype, DataType Dtype, Precision Ptype>
+Status MulHelper<Ttype, Dtype, Ptype>::InitParam() {
+    auto alpha = GET_PARAMETER(float, alpha);
+    MulParam<Tensor4d<Ttype, Dtype>> param_mul(alpha);
+    _param_mul = param_mul;
+    return Status::OK();
+}
+
+template<typename Ttype, DataType Dtype, Precision Ptype>
+Status MulHelper<Ttype, Dtype, Ptype>::Init(OpContext<Ttype>& ctx,
+        const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+        std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
+
+    SABER_CHECK(_funcs_mul.init(ins, outs, _param_mul, SPECIFY, VENDER_IMPL, ctx));
+    return Status::OK();
+}
+
+template<typename Ttype, DataType Dtype, Precision Ptype>
+Status MulHelper<Ttype, Dtype, Ptype>::InferShape(const
+        std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
+        std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
+    SABER_CHECK(_funcs_mul.compute_output_shape(ins, outs, _param_mul));
+    return Status::OK();
+}
+
+#ifdef USE_CUDA
+template class MulHelper<NV, AK_FLOAT, Precision::FP32>;
+#endif
+#ifdef USE_ARM_PLACE
+template class MulHelper<ARM, AK_FLOAT, Precision::FP32>;
+#endif
+// register helper
+#ifdef USE_CUDA
+ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, NV, AK_FLOAT, Precision::FP32);
+#endif
+#ifdef USE_ARM_PLACE
+ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, ARM, AK_FLOAT, Precision::FP32);
+#endif
+//! register op
+ANAKIN_REGISTER_OP(Mul)
+.Doc("Mul operator")
+#ifdef USE_CUDA
+.__alias__<NV, AK_FLOAT, Precision::FP32>("mul")
+#endif
+#ifdef USE_ARM_PLACE
+.__alias__<ARM, AK_FLOAT, Precision::FP32>("mul")
+#endif
+.num_in(1)
+.num_out(1)
+.Args<float>("alpha", " alpha of Mul "); //注册
+
+} /* namespace ops */
+
+} /* namespace anakin */
+```
+
+## 实现单元测试
+涉及的文件:```test/saber/xxx/test_saber_funcs_mul_xxx.cpp```
+在对应的test下需要添加新的单元测试
+
+```
+TEST(TestSaberFuncNV, test_depthwise_conv) {
+
+    // init tensors and some param.
+
+    // start Reshape & doInfer
+    Context<NV> ctx1(0, 1, 1);
+
+    // create param
+    MulParam<Tensor<NV, AK_FLOAT, NCHW> > param(alpha);
+
+    std::vector<Tensor<NV, AK_FLOAT, NCHW>*> input;
+    std::vector<Tensor<NV, AK_FLOAT, NCHW>*> output;
+
+    // create saber op
+    Mul<NV, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> mul;
+
+    // compute output shape
+    mul.compute_output_shape(input, output, param);
+
+    // re_alloc output tensors memory based on output shape
+    output[0]->re_alloc(output[0]->shape());
+
+    // init saber op(calling init and create)
+    mul.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+
+    // call operator()
+    mul(input, output, param, ctx1);
+
+    // cuda specified, record events
+    cudaStream_t cuda_stream = ctx1.get_compute_stream();
+    output[0]->record_event(cuda_stream);
+    output_dev.sync();
+    
+    // param changed 
+    param.alpha = 2.0;
+    // auto calling saber op(create and dispatch)
+    mul(input, output, param, ctx1);
+
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaPeekAtLastError());
+}
+
+int main(int argc, const char** argv){
+    anakin::saber::Env<NV>::env_init();
+
+    // initial logger
+    //logger::init(argv[0]);
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+    return 0;
+}
+
+```
+## 调试及注意事项
+
+一个op需要有对外的op接口和内部实现，由于存在saber/funcs/impl的非特化版本声明，当有op在某种设备下没有对应实现时，也能够编译，但此时是没有任何实现的空实现。
diff --git a/doc/fluid/advanced_usage/deploy/how_to_support_new_device_in_anakin.md b/doc/fluid/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
new file mode 100644
index 0000000000000000000000000000000000000000..a1f75f5e95cfb90f26d3782ba30a6d1887a70424
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
@@ -0,0 +1,459 @@
+# 如何支持一个新的设备
+
+## 概览
+
+添加一个新的设备需要以下3个步骤：
+
+* [在`CMakeLists.txt`中添加设备的支持](#0001)
+* [在`saber`中添加设备的实现](#0002)
+* [在`framework`中添加设备的具体化或实例化](#0003)
+
+假设新设备的名称为`TNEW`, 以下将以这个设备名称进行演示。
+
+## <span id = '0001'> 在`CMakeLists.txt`中添加设备的支持 </span> ##
+
+* 修改根目录`CMakeLists.txt`
+```cmake
+#select the platform to build
+anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." NO)
+anakin_option(USE_X86_PLACE "Select the build mode for X86 place." NO)
+anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO)
+anakin_option(USE_TNEW_PLACE "Select the build mode for TNEW place." YES)
+```
+
+* 修改`saber/CMakeLists.txt`
+
+根据新增设备的目录完善`saber`目录下的`CMakeLists.txt`。
+```cmake
+if(USE_TNEW_PLACE)
+    anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
+    anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
+endif()
+```
+
+* 修改`test/CMakeLists.txt`
+
+新增设备的单测文件放在`test/saber/tnew`目录下，修改`test`目录下的`CMakeLists.txt`。
+```cmake
+if(USE_TNEW_PLACE)
+    anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/tnew "cpp" ANAKIN_TEST_CASE_SRC)
+endif()
+```
+
+* 修改`cmake/anakin_config.h.in`
+```c++
+// platform to use
+#cmakedefine USE_GPU_PLACE
+
+#cmakedefine USE_X86_PLACE
+
+#cmakedefine USE_ARM_PLACE
+
+#cmakedefine USE_TNEW_PLACE
+```
+
+* 其他依赖和编译选项    
+修改`cmake`目录下的`compiler_options.cmake`和`find_modules.cmake`
+
+
+## <span id = '0002'> 在`saber`中添加设备的实现 </span> ##
+`saber`是`Anakin`的基础计算库，对外提供设备无关的统一的API，设备相关的实现都会封装到`TargetWrapper`中。
+
+### 在`saber/saber_types.h`中添加设备
+
+```c++
+enum TargetTypeEnum {
+    eINVALID = -1,
+    eNV = 1,
+    eAMD = 2,
+    eARM = 3,
+    eX86 = 4,
+    eNVHX86 = 5,
+    eTNEW = 6
+};
+
+typedef TargetType<eNV> NV;
+typedef TargetType<eARM> ARM;
+typedef TargetType<eAMD> AMD;
+typedef TargetType<eX86> X86;
+typedef TargetType<eTNEW> TNEW;
+
+```
+
+### 在`saber/core`中添加设备的实现
+
+1. 在`target_traits.h`中添加新设备
+
+* 增加设备类型
+```c++
+struct __cuda_device{};
+struct __arm_device{};
+struct __amd_device{};
+struct __x86_device{};
+struct __tnew_device{};
+```
+
+* `TargetTypeTraits`模板具体化
+```c++
+template <>
+struct TargetTypeTraits<TNEW> {
+    typedef __xxx_target target_category;//根据实际设备是host端还是device端进行选择
+    typedef __tnew_device target_type;
+};
+```
+
+2. 在`data_traits.h`中特化`DataTrait`模板类
+
+如果设备需要特殊的数据类型，则特化出设备的`DataTrait`类的实现，例如opencl数据类型的实现如下：
+```c++
+#ifdef USE_OPENCL
+struct ClMem{
+    ClMem(){
+        dmem = nullptr;
+        offset = 0;
+    }
+
+    ClMem(cl_mem* mem_in, int offset_in = 0) {
+        dmem = mem_in;
+        offset = offset_in;
+    }
+
+    ClMem(ClMem& right) {
+        dmem = right.dmem;
+        offset = right.offset;
+    }
+
+    ClMem& operator=(ClMem& right) {
+        this->dmem = right.dmem;
+        this->offset = right.offset;
+        return *this;
+    }
+
+    ClMem& operator+(int offset_in) {
+        this->offset += offset_in;
+        return *this;
+    }
+
+    int offset{0};
+    cl_mem* dmem;
+};
+
+template <>
+struct DataTrait<AMD, AK_FLOAT> {
+    typedef ClMem Dtype;
+    typedef float dtype;
+};
+
+template <>
+struct DataTrait<AMD, AK_DOUBLE> {
+    typedef ClMem Dtype;
+    typedef double dtype;
+};
+
+template <>
+struct DataTrait<AMD, AK_INT8> {
+    typedef ClMem Dtype;
+    typedef char dtype;
+};
+#endif //use_opencl
+```
+
+3. 在`target_wrapper.h`中特化`TargetWrapper`模板类
+
+特化`TargetWrapper`模板类，在`target_wrapper.h`中声明函数，具体如下：
+```c++
+template <>
+struct TargetWrapper<TNEW, __xxx_target> { //根据TNEW的具体类型修改__xxx_target，__host_target或者__device_target
+
+    typedef xxx_event event_t;          //根据设备实现xxx_event
+    typedef xxx_stream stream_t;        //根据设备实现xxx_stream
+
+    static void get_device_count(int& count);
+
+    static void set_device(int id);
+
+    //We should add strategy to avoid malloc directly
+    static void mem_alloc(void** ptr, size_t n);
+
+    static void mem_free(void* ptr);
+
+    static void mem_set(void* ptr, int value, size_t n);
+
+    static void create_event(event_t& event, bool flag = false);
+
+    static void create_stream(stream_t& stream);
+
+    static void create_stream_with_flag(stream_t& stream, unsigned int flag);
+
+    static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority);
+
+    static void destroy_stream(stream_t& stream);
+
+    static void destroy_event(event_t& event);
+
+    static void record_event(event_t& event, stream_t stream);
+
+    static void query_event(event_t& event);
+
+    static void sync_event(event_t& event);
+
+    static void sync_stream(event_t& event, stream_t& stream);
+
+    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+                            size_t count, __DtoD);
+
+    static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+                             size_t count, stream_t& stream, __DtoD);
+
+    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+                            size_t count, __HtoD);
+
+    static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+                             size_t count, stream_t& stream, __HtoD);
+
+    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+                            size_t count, __DtoH);
+
+    static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+                             size_t count, stream_t& stream, __DtoH);
+
+    static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+                                int src_dev, size_t count);
+
+    static void async_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+                                 int src_dev, size_t count, stream_t& stream);
+
+    static int get_device_id();
+};
+
+```
+
+4. 在`impl/`目录下添加设备目录和实现
+
+在`saber/core/impl`目录下添加设备目录`tnew`。
+* 实现`TargetWrapper<TNEW, __xxx_target>`结构体中各函数的定义。    
+如果`TargetWrapper<TNEW, __xxx_target>`的实现与默认的模板类一致，则不用特化出该类。
+
+```c++
+typedef TargetWrapper<TNEW, __xxx_target> TNEW_API;
+void TNEW_API::get_device_count(int &count) {
+    // add implementation
+}
+
+void TNEW_API::set_device(int id){
+    // add implementation
+}
+        
+void TNEW_API::mem_alloc(void** ptr, size_t n){
+    // add implementation
+}
+        
+void TNEW_API::mem_free(void* ptr){
+    if(ptr != nullptr){
+        // add implementation
+    }
+}
+...
+
+```
+
+* 特化实现`device.h`中的`Device<TNEW>`
+
+```c++
+template <>
+void Device<TNEW>::create_stream() {
+    // add implementation
+}
+
+template <>
+void Device<TNEW>::get_info() {
+
+    // add implementation
+}
+
+```
+
+### 在`saber/funcs`中实现设备相关的op
+
+参考[如何增加新的Operator](how_to_add_anakin_op.md)
+
+
+## <span id = '0003'> 在`framework`中添加设备的具体化或实例化 </span> ##
+
+### `framework/core`
+
+* `net.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class Net<TNEW, AK_FLOAT, Precision::FP32, OpRunType::ASYNC>;
+template class Net<TNEW, AK_FLOAT, Precision::FP32, OpRunType::SYNC>;
+#endif
+```
+
+* `operator_func.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class OperatorFunc<TNEW, AK_FLOAT, Precision::FP32>;
+#endif
+```
+
+* `worker.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class Worker<TNEW, AK_FLOAT, Precision::FP32, OpRunType::ASYNC>;
+template class Worker<TNEW, AK_FLOAT, Precision::FP32, OpRunType::SYNC>;
+#endif
+```
+
+* `operator_attr.cpp`中添加实例化
+
+```c++
+template
+OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::FP32>(const std::string& op_name);
+template
+OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::FP16>(const std::string& op_name);
+template
+OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::INT8>(const std::string& op_name);
+```
+
+* `parameter.h`中添加设备的实现
+
+```c++
+#ifdef USE_TNEW_PLACE
+template<typename Dtype>
+class PBlock<Dtype, TNEW> {
+public:
+	typedef Tensor4d<TNEW, DataTypeRecover<Dtype>::type> type;
+
+	PBlock() {
+		_inner_tensor = std::make_shared<type>(); 
+	}
+	...
+}
+#endif //TNEW
+```
+
+* `type_traits_extend.h`中添加设备的实现
+
+```c++
+template<>
+struct target_host<saber::TNEW> {
+    typedef saber::X86 type; //根据TNEW选择正确的host type
+};
+```
+
+### `framework/graph`
+
+* `graph.cpp`中添加实例化
+  
+```c++
+  #ifdef USE_TNEW_PLACE
+  template class Graph<TNEW, AK_FLOAT, Precision::FP32>;
+  template class Graph<TNEW, AK_FLOAT, Precision::FP16>;
+  template class Graph<TNEW, AK_FLOAT, Precision::INT8>;
+  #endif
+```
+
+### `framework/model_parser`
+
+* `parser.cpp`中添加实例化
+  
+```c++
+  #ifdef USE_TNEW_PLACE
+  template
+  Status load<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+          const char* model_path);
+  template
+  Status load<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+          const char* model_path);
+  template
+  Status load<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+          const char* model_path);
+  
+  template
+  Status save<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+          std::string& model_path);
+  template
+  Status save<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+          std::string& model_path);
+  template
+  Status save<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+          std::string& model_path);
+  
+  template
+  Status load<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+          std::string& model_path);
+  template
+  Status load<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+          std::string& model_path);
+  template
+  Status load<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+          std::string& model_path);
+  
+  template
+  Status save<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+          const char* model_path);
+  template
+  Status save<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+          const char* model_path);
+  template
+  Status save<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+          const char* model_path);
+  #endif
+```
+
+* `model_io.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class NodeIO<TNEW, AK_FLOAT, Precision::FP32>;
+template class NodeIO<TNEW, AK_FLOAT, Precision::FP16>;
+template class NodeIO<TNEW, AK_FLOAT, Precision::INT8>;
+#endif
+```
+
+### `framework/operators`
+
+为`framework/operators`目录下所有op添加实例化或具体化
+以`activation.cpp`为例，实例化如下：
+
+```c++
+#ifdef USE_TNEW_PLACE
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8);
+template class ActivationHelper<TNEW, AK_FLOAT, Precision::FP32>;
+ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32);
+#endif
+```
+
+如果TNEW设备函数的实现与现有模板实现不一致，可以特化实现如下（以init()为例）：
+```c++
+#ifdef USE_TNEW_PLACE
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8);
+template <>
+Status ActivationHelper<TNEW, AK_FLOAT, Precision::FP32>::Init(OpContext<TNEW> &ctx,\
+        const std::vector<Tensor4dPtr<TNEW, AK_FLOAT> >& ins, \
+                std::vector<Tensor4dPtr<TNEW, AK_FLOAT> >& outs) {
+    SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, SPECIFY, SABER_IMPL, ctx)); //在这里选择实现方式
+    return Status::OK();
+}
+ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32);
+#endif
+```
+
+在`ANAKIN_REGISTER_OP(Activation)`中添加TNEW的注册
+
+```c++
+#ifdef USE_TNEW_PLACE
+.__alias__<TNEW, AK_FLOAT, Precision::FP32>("activation")
+#endif
+```
+
+## 注意事项
+不要修改`Tensor`/`Buffer`/`Env`/`Context`这些类函数的接口和实现
diff --git a/source/advanced_usage/deploy/index_anakin.rst b/doc/fluid/advanced_usage/deploy/index_anakin.rst
similarity index 100%
rename from source/advanced_usage/deploy/index_anakin.rst
rename to doc/fluid/advanced_usage/deploy/index_anakin.rst
diff --git a/source/advanced_usage/deploy/index_mobile.rst b/doc/fluid/advanced_usage/deploy/index_mobile.rst
similarity index 100%
rename from source/advanced_usage/deploy/index_mobile.rst
rename to doc/fluid/advanced_usage/deploy/index_mobile.rst
diff --git a/source/advanced_usage/deploy/index_native.rst b/doc/fluid/advanced_usage/deploy/index_native.rst
similarity index 100%
rename from source/advanced_usage/deploy/index_native.rst
rename to doc/fluid/advanced_usage/deploy/index_native.rst
diff --git a/doc/fluid/advanced_usage/deploy/install_anakin.md b/doc/fluid/advanced_usage/deploy/install_anakin.md
new file mode 100644
index 0000000000000000000000000000000000000000..bb7c1950308622e3de292268a718e6ec688e6ae6
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/install_anakin.md
@@ -0,0 +1,69 @@
+## 从源码编译安装Anakin ##
+
+我们已经在CentOS 7.3上成功地安装和测试了Anakin，对于其他操作系统，我们将很快支持。
+
+### 安装概览 ###
+
+* [在CentOS上安装 Anakin]()
+* [在Ubuntu上安装 Anakin]()
+* [在ARM上安装 Anakin](run_on_arm_ch.md)
+* [验证安装]()
+
+
+### 在CentOS上安装 Anakin ###
+#### 1. 系统要求 ####
+
+*  make 3.82+
+*  cmake 2.8.12+
+*  gcc 4.8.2+
+*  g++ 4.8.2+
+*  其他需要补充的。。。
+
+#### 2. 编译CPU版Anakin ####
+
+暂时不支持
+
+#### 3. 编译支持NVIDIA GPU的Anakin ####
+
+- 3.1. 安装依赖
+  - 3.1.1 protobuf  
+    >$ git clone https://github.com/google/protobuf  
+    >$ cd protobuf  
+    >$ git submodule update --init --recursive  
+    >$ ./autogen.sh  
+    >$ ./configure --prefix=/path/to/your/install_dir  
+    >$ make  
+    >$ make check  
+    >$ make install  
+    >$ sudo ldconfig
+
+
+    如安装protobuf遇到任何问题，请访问[这里](https://github.com/google/protobuf/blob/master/src/README.md)
+
+- 3.2 CUDA Toolkit
+  - [CUDA 8.0](https://developer.nvidia.com/cuda-zone) or higher. 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
+  - [cuDNN v7](https://developer.nvidia.com/cudnn). 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/). 
+- 3.3  编译Anakin
+  >$ git clone https://github.com/PaddlePaddle/Anakin  
+  >$ cd Anakin  
+  >$ mkdir build && cd build  
+  >$ cmake ..  
+  >$ make
+
+
+#### 4. 编译支持AMD GPU的Anakin ####
+
+暂时还不支持
+
+
+### 在Ubuntu上安装 Anakin ###
+
+暂时还不支持
+
+
+### 在ARM上安装 Anakin ###
+
+暂时还不支持
+
+### 验证安装 ###
+Coming soon...
diff --git a/doc/fluid/advanced_usage/deploy/mobile_build.md b/doc/fluid/advanced_usage/deploy/mobile_build.md
new file mode 100644
index 0000000000000000000000000000000000000000..e51593164987d548e256ddebbc5fa8d960fb5255
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/mobile_build.md
@@ -0,0 +1,59 @@
+# 环境搭建
+## 使用 docker
+### 1. 安装 docker
+安装 docker 的方式，参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/)
+### 2. 使用 docker 搭建构建环境
+首先进入 paddle-mobile 的目录下，执行 `docker build`
+以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行)
+```
+$ docker build -t paddle-mobile:dev - < Dockerfile
+```
+使用 `docker images` 可以看到我们新建的 image
+```
+$ docker images
+REPOSITORY      TAG     IMAGE ID       CREATED         SIZE
+paddle-mobile   dev     33b146787711   45 hours ago    372MB
+```
+### 3. 使用 docker 构建
+进入 paddle-mobile 目录，执行 docker run
+```
+$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
+root@5affd29d4fc5:/ # cd /paddle-mobile
+# 生成构建 android 产出的 Makefile
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
+# 生成构建 linux 产出的 Makefile
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
+```
+### 4. 设置编译选项
+可以通过 ccmake 设置编译选项
+```
+root@5affd29d4fc5:/ # ccmake .
+                                                     Page 1 of 1
+ CMAKE_ASM_FLAGS
+ CMAKE_ASM_FLAGS_DEBUG
+ CMAKE_ASM_FLAGS_RELEASE
+ CMAKE_BUILD_TYPE
+ CMAKE_INSTALL_PREFIX             /usr/local
+ CMAKE_TOOLCHAIN_FILE             /paddle-mobile/tools/toolchains/arm-android-neon.cmake
+ CPU                              ON
+ DEBUGING                         ON
+ FPGA                             OFF
+ LOG_PROFILE                      ON
+ MALI_GPU                         OFF
+ NET                              googlenet
+ USE_EXCEPTION                    ON
+ USE_OPENMP                       OFF
+```
+修改选项后，按 `c`, `g` 更新 Makefile
+### 5. 构建
+使用 make 命令进行构建
+```
+root@5affd29d4fc5:/ # make
+```
+### 6. 查看构建产出
+构建产出可以在 host 机器上查看，在 paddle-mobile 的目录下，build 以及 test/build 下，可以使用 adb 指令或者 scp 传输到 device 上执行
+
+## 不使用 docker
+不使用 docker 的方法，可以直接用 cmake 生成 makefile 后构建。使用 ndk 构建 android 应用需要正确设置 NDK_ROOT。构建 linux 应用需要安装 arm-linux-gnueabi-gcc 或者类似的交叉编译工具，可能需要设置 CC，CXX 环境变量，或者在 tools/toolchains/ 中修改 arm-linux-gnueabi.cmake，或者增加自己需要的 toolchain file。
diff --git a/doc/fluid/advanced_usage/deploy/mobile_dev.md b/doc/fluid/advanced_usage/deploy/mobile_dev.md
new file mode 100644
index 0000000000000000000000000000000000000000..474380f9dbfd2fb8a06630cb1ca3ca5cd14ca9d9
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/mobile_dev.md
@@ -0,0 +1,72 @@
+# iOS开发文档
+
+## 编译
+
+### 一. 使用 build.sh 编译
+
+```sh 
+sh build.sh ios
+
+# 如果只想编译某个特定模型的 op, 则需执行以下命令
+sh build.sh ios googlenet
+
+# 在这个文件夹下, 你可以拿到生成的 .a 库
+cd ../build/release/ios/build
+
+```
+
+### 二. 使用 xcode 编译
+
+我们提供了 ios 开发更为熟悉的 xcode 编译环境:
+在 ios/ 目录下打开 PaddleMobile.xcworkspace 即可编译 PaddleMobile 或者 运行 Demo
+
+### 三. 集成
+
+#### 如使用 c++ 接口
+将 
+
+```
+libpaddle-mobile.a 
+io.h  
+program.h 
+types.h 
+lod_tensor.h 
+tensor.h
+```
+拖入工程, io.h 为接口文件, 可在 [github](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/io.h)上查看接口注释
+
+#### 如使用 oc 接口
+将在xcode 编译生成的
+```
+libPaddleMobile.a 
+PaddleMobile.h
+```
+拖入工程, 接口如下:
+
+```
+/*
+	创建单例对象
+*/
++ (instancetype)sharedInstance;
+
+/*
+	load 模型, 开辟内存
+*/
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+
+/*
+	进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
+*/
+- (NSArray *)predict:(CGImageRef)image means:(NSArray<NSNumber *> *)means scale:(float)scale;
+
+/*
+	进行预测
+*/
+- (NSArray *)predict:(CGImageRef)image;
+
+/*
+	清理内存
+*/
+- (void)clear;
+
+```
diff --git a/doc/fluid/advanced_usage/deploy/native_infer.rst b/doc/fluid/advanced_usage/deploy/native_infer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..aa9377c112856693cda72779bd399f2415d716f0
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/native_infer.rst
@@ -0,0 +1,105 @@
+Paddle 预测 API
+===============
+
+为了更简单方便的预测部署，Fluid 提供了一套高层 API
+用来隐藏底层不同的优化实现。
+
+`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`__
+包括
+
+-  头文件 ``paddle_inference_api.h`` 定义了所有的接口
+-  库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``
+
+编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。
+
+下面是一些 API 概念的介绍
+
+PaddleTensor
+------------
+
+PaddleTensor 定义了预测最基本的输入输出的数据格式，其定义是
+
+.. code:: cpp
+
+    struct PaddleTensor {
+      std::string name;  // variable name.
+      std::vector<int> shape;
+      PaddleBuf data;  // blob of data.
+      PaddleDType dtype;
+    };
+
+-  ``name`` 用于指定输入数据对应的 模型中variable 的名字
+   （暂时没有用，但会在后续支持任意 target 时启用）
+-  ``shape`` 表示一个 Tensor 的 shape
+-  ``data`` 数据以连续内存的方式存储在\ ``PaddleBuf``
+   中，\ ``PaddleBuf``
+   可以接收外面的数据或者独立\ ``malloc``\ 内存，详细可以参考头文件中相关定义。
+-  ``dtype`` 表示 Tensor 的数据类型
+
+engine
+------
+
+高层 API 底层有多种优化实现，我们称之为 engine，目前有三种 engine
+
+-  原生 engine，由 paddle 原生的 forward operator
+   组成，可以天然支持所有paddle 训练出的模型，
+-  Anakin engine，封装了
+   `Anakin <https://github.com/PaddlePaddle/Anakin>`__
+   ，在某些模型上性能不错，但只能接受自带模型格式，无法支持所有 paddle
+   模型，
+-  TensorRT mixed engine，用子图的方式支持了
+   `TensorRT <https://developer.nvidia.com/tensorrt>`__ ，支持所有paddle
+   模型，并自动切割部分计算子图到 TensorRT 上加速（WIP）
+
+其实现为
+
+.. code:: cpp
+
+    enum class PaddleEngineKind {
+      kNative = 0,       // Use the native Fluid facility.
+      kAnakin,           // Use Anakin for inference.
+      kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops.
+    };
+
+预测部署过程
+------------
+
+总体上分为以下步骤
+
+1. 用合适的配置创建 ``PaddlePredictor``
+2. 创建输入用的 ``PaddleTensor``\ ，传入到 ``PaddlePredictor`` 中
+3. 获取输出的 ``PaddleTensor`` ，将结果取出
+
+下面完整演示一个简单的模型，部分细节代码隐去
+
+.. code:: cpp
+
+    #include "paddle_inference_api.h"
+
+    // 创建一个 config，并修改相关设置
+    paddle::NativeConfig config;
+    config.model_dir = "xxx";
+    config.use_gpu = false;
+    // 创建一个原生的 PaddlePredictor
+    auto predictor =
+          paddle::CreatePaddlePredictor<paddle::NativeConfig, paddle::PaddleEngineKind::kNative>(config);
+    // 创建输入 tensor
+    int64_t data[4] = {1, 2, 3, 4};
+    paddle::PaddleTensor tensor{.name = "",
+                                .shape = std::vector<int>({4, 1}),
+                                .data = paddle::PaddleBuf(data, sizeof(data)),
+                                .dtype = paddle::PaddleDType::INT64};
+    // 创建输出 tensor，输出 tensor 的内存可以复用
+    std::vector<paddle::PaddleTensor> outputs;
+    // 执行预测
+    CHECK(predictor->Run({tensor}, &outputs));
+    // 获取 outputs ...
+
+编译时，联编 ``libpaddle_fluid.a/.so`` 即可。
+
+详细代码参考
+------------
+
+-  `inference
+   demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api/demo_ci>`__
+-  `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/api_impl_tester.cc>`__
diff --git a/doc/fluid/advanced_usage/deploy/run_anakin_on_arm.md b/doc/fluid/advanced_usage/deploy/run_anakin_on_arm.md
new file mode 100644
index 0000000000000000000000000000000000000000..ebeb38f534ebfc8cb5a41d103abe3bb1de7e379a
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/run_anakin_on_arm.md
@@ -0,0 +1,151 @@
+## 源码编译 Anakin ##
+
+目前Anakin支持ARM Android平台，采用Android NDK交叉编译工具链，已在mac os和centos上编译和测试通过。
+
+### 安装概览 ###
+
+* [系统需求](#0001)
+* [安装第三方依赖](#0002)
+* [Anakin源码编译](#0003)
+* [验证安装](#0004)
+
+
+### <span id = '0001'> 1. 系统需求 </span> ###
+
+*  宿主机: linux, mac    
+*  cmake 3.8.2+    
+*  Android NDK r14, Linux 版本[从这里下载](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip)
+
+### <span id = '0002'> 2. 安装第三方依赖 </span> ###
+
+- 2.1 protobuf3.4.0     
+   源码从这里[下载](https://github.com/google/protobuf/releases/tag/v3.4.0)    
+ - 2.1.1 为宿主机编译protobuf     
+ ```bash
+   $ tar -xzf protobuf-3.4.0.tar.gz  
+   $ cd protobuf-3.4.0   
+   $ ./autogen.sh  
+   $ ./configure    
+   $ make  
+   $ make check   
+   $ make install
+   ```
+   上述 $make install 执行后，可在 /usr/local/include/google 找到 libprotobuf 所需的头文件,将整个google文件夹拷贝至Anakin/third-party/arm-android/protobuf/下，
+   如有问题，请点[这里](https://github.com/google/protobuf/blob/v3.4.0/src/README.md)。
+   然后将已经生成文件清除。
+ ```bash
+   $ make distclean
+   ```
+ - 2.1.2 交叉编译Android`armeabi-v7a`的protobuf，注意设置ANDROID_NDK的路径，以及ARCH_ABI、HOSTOSN的值：   
+ ```bash
+
+   $ export ANDROID_NDK=your_ndk_path 
+   $ ARCH_ABI="arm-linux-androideabi-4.9"
+   $ HOSTOSN="darwin-x86_64"
+   $ export SYSROOT=$ANDROID_NDK/platforms/android-9/arch-arm  
+   $ export PREBUILT=$ANDROID_NDK/toolchains/$ARCH_ABI
+   $ export LDFLAGS="--sysroot=$SYSROOT"
+   $ export LD="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/arm-linux-androideabi/bin/ld $LDFLAGS"
+   $ export LIBS="-llog $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/libgnustl_static.a"
+   $ export CPPFLAGS=""
+   $ export INCLUDES="-I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/include/ -I$ANDROID_NDK/platforms/android-9/arch-arm/usr/include/ -I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/include/"
+   $ export CXXFLAGS="-march=armv7-a -mfloat-abi=softfp -DGOOGLE_PROTOBUF_NO_RTTI --sysroot=$SYSROOT"
+   $ export CCFLAGS="$CXXFLAGS"
+   $ export CXX="$PREBUILT/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-g++ $CXXFLAGS"
+   $ export CC="$CXX"
+   $ export RANLIB="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-ranlib"  
+   $ ./autogen.sh  
+   $ ./configure --host=arm-linux-androideabi --with-sysroot=$SYSROOT --enable-cross-compile --with-protoc=protoc --disable-shared CXX="$CXX" CC="$CC" LD="$LD"  
+   $ make
+  ```
+  
+  编译生成 *.a 静态库，若希望编译*.so 动态链接库 ，请在./configure参数中改--disable-shared为--disable-static --enable-shared。  
+  生成文件在src/.libs/下，将生成的文件拷贝至Anakin/third-party/arm-android/protobuf/lib下。  
+  在[cmake](../../cmake/find_modules.cmake)中更新`ARM_RPOTO_ROOT`的路径。        
+  ```cmake
+  set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf")
+  ```
+  
+- 2.2 opencv 2.4.3+(optional)    
+    Anakin只在examples示例中使用opencv   
+    Android系统的opencv从[这里下载](https://opencv.org/releases.html)    
+    解压后将 `3rdparty/libs/armeabi-v7a`中的库文件拷贝到`libs/armeabi-v7a`    
+    在[cmake](../../cmake/find_modules.cmake)中搜索`anakin_find_opencv`, 
+    并设置 `include_directories` 和 `LINK_DIRECTORIES`为自己安装的库的路径。   
+    ```cmake
+    include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/)
+    LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/)
+    ```
+### <span id = '0003'> 3. Anakin源码编译 </span> ###
+
+#### 编译Android版本
+
+   克隆[源码](https://github.com/PaddlePaddle/Anakin/tree/arm)
+```bash
+    cd your_dir
+    git clone https://github.com/PaddlePaddle/Anakin.git
+    cd Anakin
+    git fetch origin arm
+    git checkout arm
+  ```
+  修改`android_build.sh`    
+- 修改NDK路径    
+  ```bash
+    #modify "your_ndk_path" to your NDK path
+    export ANDROID_NDK=your_ndk_path
+  ```
+- 修改ARM 处理器架构     
+  对于32位ARM处理器, 将ANDROID_ABI 设置为 `armeabi-v7a with NEON`， 
+  对于64位ARM处理器, 可以将ANDROID_ABI 设置为 `armeabi-v7a with NEON`或者`arm64-v8a`。        
+  目前我们只支持 `armeabi-v7a with NEON`；`arm64-v8a` 还在开发中。      
+  ```bash
+      -DANDROID_ABI="armeabi-v7a with NEON"
+  ```
+- 设置Android API    
+  根据Android系统的版本设置API level， 例如API Level 21 -> Android 5.0.1    
+  ```bash
+      -DANDROID_NATIVE_API_LEVEL=21
+  ```
+
+- 选择编译静态库或动态库    
+  设置`BUILD_SHARED=NO`编译静态库    
+  设置`BUILD_SHARED=YES`编译动态库    
+  ```bash
+      -DBUILD_SHARED=NO
+  ```
+- OpenMP多线程支持    
+  设置`USE_OPENMP=YES`开启OpenMP多线程    
+  ```bash
+      -DUSE_OPENMP=YES
+  ```
+  
+- 编译单测文件    
+  设置`BUILD_WITH_UNIT_TEST=YES`将会编译单测文件    
+    ```bash
+        -DBUILD_WITH_UNIT_TEST=YES
+    ```
+
+- 编译示例文件    
+  设置`BUILD_EXAMPLES=YES`将会编译示例文件    
+    ```bash
+        -DBUILD_EXAMPLES=YES
+    ```
+  
+- 开启opencv    
+  如果使用opencv，设置`USE_OPENCV=YES`    
+    ```bash
+        -DUSE_OPENCV=YES
+    ```
+    
+- 开始编译    
+  运行脚本 `android_build.sh` 将自动编译Anakin     
+  ```bash
+      ./android_build.sh
+  ```
+
+### <span id = '0004'> 4. 验证安装 </span> ###    
+  编译好的库会放在目录`${Anakin_root}/output`下；    
+  编译好的单测文件会放在`${Anakin_root}/output/unit_test`目录下；    
+  编译好的示例文件会放在`${Anakin_root}/output/examples`目录下。
+  
+  对于Android系统，打开设备的调试模式，通过ADB可以访问的目录是`data/local/tmp`，通过ADB push将测试文件、模型和数据发送到设备目录， 运行测试文件。
diff --git a/doc/fluid/advanced_usage/development/contribute_to_paddle.md b/doc/fluid/advanced_usage/development/contribute_to_paddle.md
new file mode 100644
index 0000000000000000000000000000000000000000..3244eedf918b93f9351258f1218dfb2d507c1a9c
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/contribute_to_paddle.md
@@ -0,0 +1,243 @@
+# 如何贡献代码
+
+我们真诚地感谢您的贡献，欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
+
+## 代码要求
+- 代码注释请遵守 [Doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
+- 确保编译器选项 `WITH_STYLE_CHECK` 已打开，并且编译能通过代码样式检查。
+- 所有代码必须具有单元测试。
+- 通过所有单元测试。
+- 请遵守[提交代码的一些约定](#提交代码的一些约定)。
+
+以下教程将指导您提交代码。
+## [Fork](https://help.github.com/articles/fork-a-repo/)
+
+跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页，然后单击 `Fork` 按钮，生成自己目录下的仓库，比如 <https://github.com/USERNAME/Paddle>。
+
+## 克隆（Clone）
+
+将远程仓库 clone 到本地：
+
+```bash
+➜  git clone https://github.com/USERNAME/Paddle
+➜  cd Paddle
+```
+
+
+## 创建本地分支
+
+Paddle 目前使用[Git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发，测试，发行和维护，具体请参考 [Paddle 分支规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范)。
+
+所有的 feature 和 bug fix 的开发工作都应该在一个新的分支上完成，一般从 `develop` 分支上创建新分支。
+
+使用 `git checkout -b` 创建并切换到新分支。
+
+```bash
+➜  git checkout -b my-cool-stuff
+```
+
+值得注意的是，在 checkout 之前，需要保持当前分支目录 clean，否则会把 untracked 的文件也带到新分支上，这可以通过 `git status` 查看。
+
+## 使用 `pre-commit` 钩子
+
+Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码（C++，Python），在提交（commit）前自动检查一些基本事宜（如每个文件只有一个 EOL，Git 中不要添加大文件等）。
+
+`pre-commit`测试是 Travis-CI 中单元测试的一部分，不满足钩子的 PR 不能被提交到 Paddle，首先安装并在当前目录运行它：
+
+```bash
+➜  pip install pre-commit
+➜  pre-commit install
+```
+
+Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式，请确保 `clang-format` 版本在 3.8 以上。
+
+注：通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同，Paddle 开发人员使用的是`pip install pre-commit`。
+
+## 开始开发
+
+在本例中，我删除了 README.md 中的一行，并创建了一个新文件。
+
+通过 `git status` 查看当前状态，这会提示当前目录的一些变化，同时也可以通过 `git diff` 查看文件具体被修改的内容。
+
+```bash
+➜  git status
+On branch my-cool-stuff
+Changes not staged for commit:
+  (use "git add <file>..." to update what will be committed)
+  (use "git checkout -- <file>..." to discard changes in working directory)
+
+	modified:   README.md
+
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+	test
+
+no changes added to commit (use "git add" and/or "git commit -a")
+```
+
+## 构建和测试
+
+编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家，我们的标准开发流程是把这些工具都装进一个Docker image，称为*开发镜像*，通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方（比如IDE配置里）都用 `docker run paddle:latest-dev`来代替。
+
+如要build这个开发镜像，在源码目录树的根目录中运行：
+
+```bash
+➜  docker build -t paddle:latest-dev .
+```
+
+随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU，但是支持AVX指令集，并且包括unit tests的PaddlePaddle，可以：
+
+```bash
+➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev
+```
+
+这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`，并且输出一个 `./build/paddle.deb`文件之外，还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*（`paddle:prod`）：
+
+```bash
+➜  docker build -t paddle:prod -f build/Dockerfile .
+```
+
+如果要运行所有的单元测试，可以用如下命令：
+
+```bash
+➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
+```
+
+关于构建和测试的更多信息，请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。
+
+## 提交（commit）
+
+接下来我们取消对 README.md 文件的改变，然后提交新添加的 test 文件。
+
+```bash
+➜  git checkout -- README.md
+➜  git status
+On branch my-cool-stuff
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+	test
+
+nothing added to commit but untracked files present (use "git add" to track)
+➜  git add test
+```
+
+Git 每次提交代码，都需要写提交说明，这可以让其他人知道这次提交做了哪些改变，这可以通过`git commit` 完成。
+
+```bash
+➜  git commit
+CRLF end-lines remover...............................(no files to check)Skipped
+yapf.................................................(no files to check)Skipped
+Check for added large files..............................................Passed
+Check for merge conflicts................................................Passed
+Check for broken symlinks................................................Passed
+Detect Private Key...................................(no files to check)Skipped
+Fix End of Files.....................................(no files to check)Skipped
+clang-formater.......................................(no files to check)Skipped
+[my-cool-stuff c703c041] add test file
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 test
+```
+
+## 保持本地仓库最新
+
+在准备发起 Pull Request 之前，需要同步原仓库（<https://github.com/PaddlePaddle/Paddle>）最新的代码。
+
+首先通过 `git remote` 查看当前远程仓库的名字。
+
+```bash
+➜  git remote
+origin
+➜  git remote -v
+origin	https://github.com/USERNAME/Paddle (fetch)
+origin	https://github.com/USERNAME/Paddle (push)
+```
+
+这里 origin 是我们 clone 的远程仓库的名字，也就是自己用户名下的 Paddle，接下来我们创建一个原始 Paddle 仓库的远程主机，命名为 upstream。
+
+```bash
+➜  git remote add upstream https://github.com/PaddlePaddle/Paddle
+➜  git remote
+origin
+upstream
+```
+
+获取 upstream 的最新代码并更新当前分支。
+
+```bash
+➜  git fetch upstream
+➜  git pull upstream develop
+```
+
+## Push 到远程仓库
+
+将本地的修改推送到 GitHub 上，也就是 https://github.com/USERNAME/Paddle。
+
+```bash
+# 推送到远程仓库 origin 的 my-cool-stuff 分支上
+➜  git push origin my-cool-stuff
+```
+
+## 建立 Issue 并完成 Pull Request
+
+建立一个 Issue 描述问题，并记录它的编号。
+
+切换到所建分支，然后点击 `New pull request`。
+
+<img width="295" alt="screen shot 2017-04-26 at 9 09 28 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436054/a6d98c66-2ac4-11e7-9cb1-18dd13150230.png">
+
+选择目标分支：
+
+<img width="750" alt="screen shot 2017-04-26 at 9 11 52 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436139/f83b1e6c-2ac4-11e7-8c0e-add499023c46.png">
+
+在 PR 的描述说明中，填写 `resolve #Issue编号` 可以在这个 PR 被 merge 后，自动关闭对应的 Issue，具体请见 <https://help.github.com/articles/closing-issues-via-commit-messages/>。
+
+接下来等待 review，如果有需要修改的地方，参照上述步骤更新 origin 中的对应分支即可。
+
+## 删除远程分支
+
+在 PR 被 merge 进主仓库后，我们可以在 PR 的页面删除远程仓库的分支。
+
+<img width="775" alt="screen shot 2017-04-26 at 9 18 24 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436457/e4cdd472-2ac5-11e7-9272-badc76c4a23e.png">
+
+也可以使用 `git push origin :分支名` 删除远程分支，如：
+
+```bash
+➜  git push origin :my-cool-stuff
+```
+
+## 删除本地分支
+
+最后，删除本地分支。
+
+```bash
+# 切换到 develop 分支
+➜  git checkout develop 
+
+# 删除 my-cool-stuff 分支
+➜  git branch -D my-cool-stuff
+```
+
+至此，我们就完成了一次代码贡献的过程。
+
+## 提交代码的一些约定
+
+为了使评审人在评审代码时更好地专注于代码本身，请您每次提交代码时，遵守以下约定：
+
+1. 请保证Travis-CI 中单元测试能顺利通过。如果没过，说明提交的代码存在问题，评审人一般不做评审。
+2. 提交Pull Request前：
+   - 请注意commit的数量：
+     - 原因：如果仅仅修改一个文件但提交了十几个commit，每个commit只做了少量的修改，这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改，且不排除commit之间的修改存在相互覆盖的情况。
+     - 建议：每次提交时，保持尽量少的commit，可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit，可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。
+   - 请注意每个commit的名称：应能反映当前commit的内容，不能太随意。
+3. 如果解决了某个Issue的问题，请在该Pull Request的**第一个**评论框中加上：`fix #issue_number`，这样当该Pull Request被合并后，会自动关闭对应的Issue。关键词包括：close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved，请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
+
+此外，在回复评审人意见时，请您遵守以下约定：
+
+1. 评审人的每个意见都必须回复（这是开源社区的基本礼貌，别人帮了忙，应该说谢谢）：
+   - 对评审意见同意且按其修改完的，给个简单的`Done`即可；
+   - 对评审意见不同意的，请给出您自己的反驳理由。
+2. 如果评审意见比较多：
+   - 请给出总体的修改情况。
+   - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复，而非直接回复的方式。原因是每个回复都会发送一封邮件，会造成邮件灾难。
diff --git a/doc/fluid/advanced_usage/development/cpu_profiling_cn.md b/doc/fluid/advanced_usage/development/cpu_profiling_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..198a05a79e19227e90eaafe116217a164cd51a7d
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/cpu_profiling_cn.md
@@ -0,0 +1,183 @@
+# CPU性能调优
+
+此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优（performance tuning）。
+
+Profiling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。
+
+PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分:
+
+* Python 代码的性能分析
+* Python 与 C++ 混合代码的性能分析
+
+
+## Python代码的性能分析
+
+### 生成性能分析文件
+
+Python标准库中提供了性能分析的工具包，[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
+
+```bash
+python -m cProfile -o profile.out main.py
+```
+
+其中 `main.py` 是我们要分析的程序，`-o`标识了一个输出的文件名，用来存储本次性能分析的结果。如果不指定这个文件，`cProfile`会打印到标准输出。
+
+### 查看性能分析文件
+
+`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务，将性能分析结果以网页的形式展示出来：
+
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+
+其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
+
+用Web浏览器访问对应网址，即可显示性能分析的结果：
+
+```
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+
+每一列的含义是:
+
+<table>
+<thead>
+<tr>
+<th>列名</th>
+<th>含义 </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> ncalls</td>
+<td> 函数的调用次数</td>
+</tr>
+<tr>
+<td>tottime</td>
+<td> 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td> percall </td>
+<td> tottime的每次调用平均时间</td>
+</tr>
+<tr>
+<td> cumtime</td>
+<td> 函数总时间。包含这个函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td> percall</td>
+<td> cumtime的每次调用平均时间</td>
+</tr>
+<tr>
+<td> filename:lineno(function) </td>
+<td> 文件名, 行号，函数名 </td>
+</tr>
+</tbody>
+</table>
+
+
+### 寻找性能瓶颈
+
+通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
+
+将性能分析结果按照tottime排序，效果如下:
+
+```text
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1(<module>)
+```
+
+可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
+
+```text
+Called By:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+
+Function                                                                                                 was called by...
+                                                                                                             ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward)
+
+
+Called:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+
+通常观察热点函数间的调用关系，和对应行的代码，就可以了解到问题代码在哪里。当我们做出性能修正后，再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
+
+
+
+## Python与C++混合代码的性能分析
+
+### 生成性能分析文件
+
+C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析
+
+使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为
+
+```bash
+apt update
+apt install libgoogle-perftools-dev
+pip install yep
+```
+
+安装完毕后，我们可以通过
+
+```bash
+python -m yep -v main.py
+```
+
+生成性能分析文件。生成的性能分析文件为`main.py.prof`。
+
+命令行中的`-v`指定在生成性能分析文件之后，在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同，编译时可能会去掉调试信息，运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果，可以采取下面几点措施:
+
+1. 编译时指定`-g`生成调试信息。使用cmake的话，可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
+2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
+3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+
+### 查看性能分析文件
+
+在运行完性能分析后，会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意，这里使用了用`Go`语言重构后的`pprof`，因为这个工具具有web服务界面，且展示效果更好。
+
+安装`pprof`的命令和一般的`Go`程序是一样的，其命令如下:
+
+```bash
+go get github.com/google/pprof
+```
+
+进而我们可以使用如下命令开启一个HTTP服务:
+
+```bash
+pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
+```
+
+这行命令中，`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径，进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
+
+访问对应的网址，我们可以查看性能分析的结果。结果如下图所示:
+
+![result](./pprof_1.png)
+
+
+### 寻找性能瓶颈
+
+与寻找Python代码的性能瓶颈类似，寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
+
+例如下图中，
+
+![kernel_perf](./pprof_2.png)
+
+在一次训练中，乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然，`MomentumOp`的性能有问题。
+
+在`pprof`中，对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题，再检查其他部分的性能问题，可以更有次序的完成性能的优化。
diff --git a/doc/fluid/advanced_usage/development/gpu_profiling_cn.rst b/doc/fluid/advanced_usage/development/gpu_profiling_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f2396716bddd4810fa77c738d41f5482aa6d6055
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/gpu_profiling_cn.rst
@@ -0,0 +1,242 @@
+============
+GPU性能调优
+============
+
+..  contents::
+
+此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。
+
+- 什么是性能分析？
+- 为什么需要性能分析？
+- 如何进行性能分析？
+- 性能分析工具介绍
+- 详细教程
+- 性能分析小技巧
+
+什么是性能分析？
+================
+在软件工程的范畴里，性能分析（Profiling）是一个动态程序分析的术语，它可以指测量一个程序的空间（内存）复杂度或时间复杂度，
+也可以说是某些特定指令的使用情况，或者是函数调用的频率和耗时等。通常情况下，分析得到的信息用于协助进行程序的优化。
+
+简单来说，性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为，那程序分析工具是必不可少的利器。简单的性能分析，可以告诉您某个操作到底花了多长时间？而更深入的分析，甚至能解释为什么某个操作花了很长时间？
+
+为什么需要性能分析？
+============================
+训练好一个深层神经网络通常要耗费非常长的时间，所以性能也就逐步变成了深度学习领域最重要的指标。
+而优化性能的首要任务，是需要了解哪些步骤拖慢了整体。
+如果某一块根本就不怎么耗时，那也就不需要急着优化性能啦！
+
+如何进行性能分析？
+========================
+为了达到性能最优，您可以采用下面五个步骤：
+
+- 对代码进行性能分析
+- 找到运行慢的部分
+- 找到运行慢的原因
+- 修改成更快的版本
+- 再次对代码进行性能分析
+
+Usually, a processor has two key performance limits: floating-point throughput and
+memory throughput. A GPU also needs a high degree of parallelism to fulfill its potential.
+This is why GPUs can be so fast.
+
+通常情况下，处理器有两个关键性能限制：一个是浮点计算量，另一个是内存操作量。
+GPU则还需要高并行性，才能发挥其全部能力。这正是它们速度快的原因。
+
+性能分析工具介绍
+======================
+就通常的GPU性能分析来说，市面上已经有NVIDIA或第三方提供的众多工具。
+
+**nvprof** 是Nvidia性能分析工具， **nvvp** 则是带GUI的Nvidia可视化性能分析工具。
+在这个教程中，我们主要会介绍nvprof和nvvp。
+
+:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate
+above profilers.
+
+:code:`paddle/legacy/math/tests` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
+
+.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+   :language: c++
+   :lines: 137-151
+   :linenos:
+
+上述的代码片段包含了两种方法，您可以任意使用一个或两个来对感兴趣的代码段做性能分析。
+
+1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装，可以用来计算CPU函数或cuda内核的时间消耗。
+
+2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
+program crashes when CPU version of PaddlePaddle invokes them.
+
+3. :code:`REGISTER_GPU_PROFILER` 是一个封装对象，封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfilerStop` 两个操作；同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。
+
+您会在接下来的部分中获得更多的细节介绍。
+
+详细教程
+============
+
+内置定时器
+------------
+
+如果想要启用PaddlePaddle的内置定时器，您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。
+接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。
+下面举个简单的例子：
+
+1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数（如高亮部分）。
+
+    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
+        :linenos:
+
+2. cmake配置中将 **WITH_TIMER** 打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_TIMER=ON
+        make
+
+3. 执行您的代码，并观察结果(如高亮部分）。
+
+    .. code-block:: bash
+        :emphasize-lines: 1,12-15
+
+        > ./paddle/legacy/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
+        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
+        [  PASSED  ] 1 test.
+
+nvprof 工具
+----------------
+
+要使用命令行分析工具 **nvprof**，您按如下步骤操作即可：
+
+1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中（参考强调部分）。
+
+    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 6-7
+        :linenos:
+
+2. cmake中将 **WITH_PROFILER** 配置打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_PROFILER=ON
+        make
+
+3. 使用 **nvprof** 来分析执行文件。
+
+    .. code-block:: bash
+
+        nvprof  ./paddle/legacy/math/tests/test_GpuProfiler
+
+然后，您就能获得如下的分析结果：
+
+.. code-block:: bash
+
+    ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
+    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
+
+
+nvvp 工具
+--------------
+
+如果想使用可视化的分析器 **nvvp**，您可以导入 :code:`nvprof -o ...` 的输出，或者从工具的界面里运行您的应用。
+
+**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启）
+
+..  image:: nvvp1.png
+    :align: center
+    :scale: 33%
+
+从内核函数的角度， **nvvp** 可以精确说明一个长耗时操作的具体原因。
+同时，如下图所示， **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。
+
+
+..  image:: nvvp2.png
+    :align: center
+    :scale: 33%
+
+而从应用的角度， **nvvp** 可以帮您提供一些定位性能瓶颈的建议。
+例如，下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议，为您做性能调优提供了方向。
+
+..  image:: nvvp3.png
+    :align: center
+    :scale: 33%
+
+..  image:: nvvp4.png
+    :align: center
+    :scale: 33%
+
+性能分析小技巧
+==================
+
+- 开始阶段，从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。
+- 接下来可以考虑下时间线的分析。
+- 如果真想挖掘内核深处的某个秘密，您最好先确认：这一块的耗时比例真的太高，值得深入分析。
+- 可能的情况下，试着让输出的分析数据和理论值对应。
+
+    1) 例如，如果我知道内核花了10ms来移动1GB数据，那我会期望分析工具统计到速度是100GB/s。
+    2) 若有不一致之处，很有可能实际应用就是没有按照您的预期情况运行。
+- 了解您的硬件：如果您的GPU理论可以达到6 TFLOPs（6万亿次浮点运算每秒），而当前已经有5.5 TFLOPs了，那估计这里的潜力就没啥好挖的了……
+
+性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果！
+当然，具体情况因人而异。
+
+参考资料
+===========
+Jeremy Appleyard, `GPU Profiling for Deep Learning <http://www.robots.ox.ac.uk/~seminars/seminars/Extra/2015_10_08_JeremyAppleyard.pdf>`_, 2015
diff --git a/doc/fluid/advanced_usage/development/host_memory_profiling_cn.md b/doc/fluid/advanced_usage/development/host_memory_profiling_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..7fb0883dd937465d15479b29df95078edb50e069
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/host_memory_profiling_cn.md
@@ -0,0 +1,89 @@
+# 堆内存分析和优化
+
+计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放，随着程序的运行占用的内存越来越大，一方面会影响程序的稳定性，可能让运行速度越来越慢，或者造成OOM；另一方面甚至会影响运行程序的机器的稳定性，造成宕机。
+
+
+目前有很多内存泄漏分析工具，比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro), [gperftools](https://gperftools.github.io/gperftools/)。
+
+因为Fluid是用Python驱动C++ core来运行，valgrind直接分析非常困难，需要自己编译debug版本的、带valgrind支持的专用Python版本，而且输出的信息中大部分是Python自己的符号和调用信息，分析起来很困难，另外使用valgrind会让程序运行速度变得非常慢，所以不建议使用。
+
+本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。
+
+gperftools主要支持以下四个功能：
+
+- thread-caching malloc
+- heap-checking using tcmalloc
+- heap-profiling using tcmalloc
+- CPU profiler
+
+Paddle也提供了基于gperftools的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。
+
+对于堆内存的分析，主要用到thread-caching malloc和heap-profiling using tcmalloc。
+
+## 环境
+
+本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev，基于Ubuntu 16.04.4 LTS环境。
+
+## 使用流程
+
+- 安装google-perftools
+
+```
+apt-get install libunwind-dev 
+apt-get install google-perftools
+```
+
+- 安装pprof
+
+```
+go get -u github.com/google/pprof
+```
+
+- 设置运行环境
+
+```
+export PPROF_PATH=/root/gopath/bin/pprof
+export PPROF_BINARY_PATH=/root/gopath/bin/pprof
+export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
+```
+
+- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。
+
+```
+# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀
+# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少存储dump一次，默认1GB
+env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py
+```
+
+随着程序的运行，会在perf_log这个文件夹下生成很多文件，如下：
+
+```
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0001.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0002.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0003.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0004.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0005.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0006.heap
+```
+
+- 使用pprof对heap文件进行分析。分析有两种模式：
+	- 完整模式。会对当前heap做一个分析，显示目前分配内存的一些调用路径。
+
+	```
+	pprof --pdf python test.log.0012.heap
+	```
+	上述命令会生成一个profile00x.pdf的文件，可以直接打开，例如：[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出，在CPU版本fluid的运行过程中，分配存储最多的模块是CPUAllocator。而别的模块相对而言分配内存较少，所以被忽略了，这对于分析内存泄漏是很不方便的，因为泄漏是一个缓慢的过程，在这种图中是无法看到的。
+	
+	![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png)
+	
+	- Diff模式。可以对两个时刻的heap做diff，把一些内存分配没有发生变化的模块去掉，而把增量部分显示出来。
+	```
+	pprof --pdf --base test.log.0010.heap python test.log.1045.heap
+	```
+	生成的结果为：[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf)
+	
+	从图中可以看出：ProgramDesc这个结构，在两个版本之间增长了200MB+，所以这里有很大的内存泄漏的可能性，最终结果也确实证明是这里造成了泄漏。
+	
+	![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png)
+	![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png)
+	
diff --git a/doc/fluid/advanced_usage/development/new_op.md b/doc/fluid/advanced_usage/development/new_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..ff7408111fa20a7a6a3a2fe9f9ba20835918f399
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/new_op.md
@@ -0,0 +1,435 @@
+# 如何写新的Operator
+
+ - [概念简介](#概念简介)
+ - [实现C++类](#实现c类)
+   - [定义ProtoMaker类](#定义protomaker类)
+   - [定义Operator类](#定义operator类)
+   - [定义OpKernel类](#定义opkernel类)
+   - [注册Operator](#注册operator)
+   - [编译](#编译)
+ - [绑定Python](#绑定python)
+ - [实现单元测试](#实现单元测试)
+   - [前向Operator单测](#前向operator单测)
+   - [反向Operator单测](#反向operator单测)
+   - [编译和执行](#编译和执行)
+ - [注意事项](#注意事项)
+
+
+## 概念简介
+
+简单介绍需要用到的基类，详细介绍请参考设计文档。
+
+- `framework::OperatorBase`: Operator(简写，Op)基类。
+- `framework::OpKernel`: Op计算函数的基类，称作Kernel。
+- `framework::OperatorWithKernel`：继承自OperatorBase，Op有计算函数，称作有Kernel。
+- `class OpProtoAndCheckerMaker`：描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
+
+依据是否包含kernel，可以将Op分为两种：包含Kernel的Op和不包含kernel的Op，前者Op的定义继承自`OperatorWithKernel`，后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写，简单总结Op需要包含的内容如下：
+
+<table>
+<thead>
+<tr>
+<th>内容</th>
+<th>定义位置</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>OpProtoMaker定义 </td>
+<td>.cc 文件，Backward Op不需要定义OpProtoMaker </td>
+</tr>
+<tr>
+<td>Op定义 </td>
+<td> .cc 文件</td>
+</tr>
+<tr>
+<td>Kernel实现 </td>
+<td> CPU、CUDA共享Kernel实现在.h 文件中，否则，CPU 实现在.cc 文件中，CUDA 实现在.cu 文件中。</td>
+</tr>
+<tr>
+<td>注册Op </td>
+<td> Op注册实现在.cc 文件；Kernel注册CPU实现在.cc 文件中，CUDA实现在.cu 文件中</td>
+</tr>
+</tbody>
+</table>
+
+
+实现新的op都添加至目录[paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
+
+
+下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
+
+
+## 实现C++类
+
+
+### 定义ProtoMaker类
+
+矩阵乘法的公式：$Out = X * Y$, 可见该计算由两个输入，一个输出组成。
+
+首先定义`ProtoMaker`来描述该Op的输入、输出，并添加注释：
+
+```cpp
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
+    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
+    AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+```
+
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)继承自`framework::OpProtoAndCheckerMaker`，构造函数含有2个参数：
+
+   - `framework::OpProto` ： 前者存储Op的输入输出和参数属性，将用于Python API接口的生成。
+   - `framework::OpAttrChecker` ：后者用于检查参数属性的合法性。
+
+构造函数里通过`AddInput`添加输入参数，通过`AddOutput`添加输出参数，通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。
+
+上面的代码在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md)。
+
+
+再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55)为例：
+
+```cpp
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input tensor of scale operator.");
+    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
+    AddComment(R"DOC(
+Scale operator
+$$Out = scale*X$$
+)DOC");
+    AddAttr<AttrType>("scale",
+                      "(float, default 1.0)"
+                      "The scaling factor of the scale operator.")
+        .SetDefault(1.0);
+  }
+};
+```
+
+这个例子有`AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
+
+### 定义GradProtoMaker类
+每个Op必须有一个对应的GradProtoMaker，若未定制对应前向Op的GradProtoMaker，fluid提供了DefaultGradProtoMaker，默认注册会使用全部输入输出，包括Input, Output, Output@Grad等，使用不需要的变量会造成显存浪费。
+下面示例定义了ScaleOp的GradProtoMaker。
+
+```cpp
+class ScaleGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("scale");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("scale", GetAttr("scale"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+```
+
+### 定义Operator类
+
+下面实现了MulOp的定义：
+
+```cpp
+class MulOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto dim0 = ctx.Input<Tensor>("X")->dims();
+    auto dim1 = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+                      "input X(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("X"));
+    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("Y"));
+    PADDLE_ENFORCE_EQ(
+        dim0[1], dim1[0],
+        "First matrix's width must be equal with second matrix's height.");
+    ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
+  }
+};
+```
+
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员：
+
+```cpp
+using framework::OperatorWithKernel::OperatorWithKernel;
+```
+
+这句表示使用基类`OperatorWithKernel`的构造函数，也可写成：
+
+```cpp
+MulOp(const std::string &type, const framework::VariableNameMap &inputs,
+      const framework::VariableNameMap &outputs,
+      const framework::AttributeMap &attrs)
+  : OperatorWithKernel(type, inputs, outputs, attrs) {}
+```
+
+还需要重写`InferShape`接口。`InferShape`为const函数，不能修改Op的成员变量，参数为`const framework::InferShapeContext &ctx`，通过该参数可获取到输入输出以及属性。它的功能是：
+
+  - 1). 做检查， 尽早报错：检查输入数据维度、类型等是否合法。
+  - 2). 设置输出Tensor的形状。
+
+通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中，和下面将要介绍的注册函数一起放在`.cc`中
+
+### 定义OpKernel类
+
+`MulKernel`继承自`framework::OpKernel`，带有下面两个模板参数:
+
+- `typename DeviceContext`: 表示设备类型，不同设备(CPU、CUDA)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
+
+- `typename T` : 表示数据类型，如`float`, `double`等。
+
+需要为`MulKernel`类重写`Compute`接口。
+- `Compute`接受一个输入参数：`const framework::ExecutionContext& context`。
+- 与`InferShapeContext`相比，`ExecutionContext`增加了设备类型，同样可获取到输入输出和属性参数。
+- `Compute`函数里实现`OpKernel`的具体计算逻辑。
+
+下面是 `MulKernel` `Compute`的实现：
+
+  ```cpp
+  template <typename DeviceContext, typename T>
+  class MulKernel : public framework::OpKernel {
+  public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Input<Tensor>("Y");
+    auto* Z = context.Output<Tensor>("Out");
+    Z->mutable_data<T>(context.GetPlace());
+    auto& device_context = context.template device_context<DeviceContext>();
+    math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+  }
+  };
+  ```
+
+需要注意：**不同设备(CPU、CUDA)共享一个Op定义，是否共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。**
+
+`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
+
+为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。
+
+到此，前向Op实现完成。接下来，需要在`.cc`文件中注册该op和kernel。
+反向Op类的定义，反向OpKernel的定义与前向Op类似，这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。
+
+### 注册Operator
+
+- 在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
+
+    ```cpp
+    namespace ops = paddle::operators;
+    REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>)
+    REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
+    REGISTER_OP_CPU_KERNEL(mul_grad,
+                  ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ```
+
+   在上面的代码中：
+
+    - `REGISTER_OPERATOR` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulGradOp`，类型名为`mul_grad`。
+    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUDeviceContext`和`float`类型，同理，注册`ops::MulGradKernel`类。
+
+
+- 在 `.cu`文件中注册CUDA Kernel。
+    - 请注意，如果CUDA Kernel的实现基于Eigen unsupported模块，那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`，代码示例如下：
+
+    ```cpp
+    // if use Eigen unsupported module before include head files
+    #define EIGEN_USE_GPU
+
+    namespace ops = paddle::operators;
+    REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
+    REGISTER_OP_CUDA_KERNEL(mul_grad,
+                           ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ```
+
+### 编译
+
+运行下面命令可以进行编译：
+
+```
+make mul_op
+```
+
+## 绑定Python
+
+系统会对新增的op自动绑定Python，并链接到生成的lib库中。
+
+## 实现单元测试
+
+单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py)。
+
+### 前向Operator单测
+
+Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator，需要：
+
+1. 在`setUp`函数定义输入、输出，以及相关的属性参数。
+2. 生成随机的输入数据。
+3. 在Python脚本中实现与前向operator相同的计算逻辑，得到输出值，与operator前向计算的输出进行对比。
+4. 反向计算已经自动集成进测试框架，直接调用相应接口即可。
+
+
+  ```python
+  import unittest
+  import numpy as np
+  from op_test import OpTest
+
+
+  class TestMulOp(OpTest):
+      def setUp(self):
+          self.op_type = "mul"
+          self.inputs = {
+              'X': np.random.random((32, 84)).astype("float32"),
+              'Y': np.random.random((84, 100)).astype("float32")
+          }
+          self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+
+      def test_check_output(self):
+          self.check_output()
+
+      def test_check_grad_normal(self):
+          self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+
+      def test_check_grad_ingore_x(self):
+          self.check_grad(
+              ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+      def test_check_grad_ingore_y(self):
+          self.check_grad(
+              ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+  ```
+
+上面的代码首先导入依赖的包，下面是对`setUp`函数中操作的重要变量的详细解释：
+
+- `self.op_type = "mul" ` : 定义类型，与operator注册时注册的类型一致。
+- `self.inputs` : 定义输入，类型为`numpy.array`，并初始化。
+- `self.outputs` : 定义输出，并在Python脚本中完成与operator同样的计算逻辑，返回Python端的计算结果。
+
+### 反向operator单测
+
+而反向测试中：
+- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
+  - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
+  - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
+  - 第三个参数`max_relative_error`：指定检测梯度时能容忍的最大错误值。
+- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
+
+
+### 编译和执行
+
+`python/paddle/fluid/tests/unittests/` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
+
+请注意，**不同于Op的编译测试，运行单元测试时需要编译整个工程**，并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后，执行下面的命令来运行单元测试：
+
+```bash
+make test ARGS="-R test_mul_op -V"
+```
+
+或者:
+
+```bash
+ctest -R test_mul_op
+```
+
+## 注意事项
+
+- 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OPERATOR(B, ...)`等，这将会导致单元测试出错。
+- 如果Op没有实现CUDA Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
+- 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
+
+### PADDLE_ENFORCE使用注意
+
+实现Op时检查数据的合法性需要使用PADDLE_ENFORCE以及PADDLE_ENFORCE_EQ等宏定义，基本格式如下：
+
+```
+PADDLE_ENFORCE(表达式, 错误提示信息)
+PADDLE_ENFORCE_EQ(比较对象A, 比较对象B, 错误提示信息)
+```
+
+如果表达式为真，或者比较对象A=B，则检查通过，否则会终止程序运行，向用户反馈相应的错误提示信息。
+为了确保提示友好易懂，开发者需要注意其使用方法。
+
+#### 总体原则
+
+任何使用了`PADDLE_ENFORCE`与`PADDLE_ENFORCE_*`检查的地方，必须有详略得当的备注解释！**错误提示信息**不能为空！
+
+#### 提示信息书写标准
+
+1. [required] 哪里错了？为什么错了？
+    - 例如：`ValueError: Mismatched label shape`
+2. [optional] 期望的输入是什么样的？实际的输入是怎样的？
+    - 例如：`Expected labels dimension=1. Received 4.`
+3. [optional] 能否给出修改意见？
+    - 例如：`Suggested Fix: If your classifier expects one-hot encoding label, check your n_classes argument to the estimator and/or the shape of your label. Otherwise, check the shape of your label.`
+
+如果并非必要或者简洁的描述即可表达清楚以上要点，根据情况书写亦可。
+
+##### FAQ 典型问题
+
+1. 无报错信息或报错信息过于简单，不能给用户提供有效的提示！
+
+问题示例1 ：未写提示信息
+```
+PADDLE_ENFORCE(ctx->HasInput("X"), "");
+```
+问题示例2 ：提示信息过于简单
+```
+PADDLE_ENFORCE(i != nullptr, "i must be set"); // i是什么？
+```
+
+2. 在报错信息中使用开发人员定义的变量缩写，不易理解！
+
+问题示例：
+```
+PADDLE_ENFORCE(forward_pd != nullptr,
+                    "Fail to find eltwise_fwd_pd in device context");  //eltwise_fwd_pd用户可能看不懂
+```
+
+3. OP内部调用非法接口：Op内部如果出现Output = ShareDataWith(Input) 
+问题示例：
+```cpp
+auto *out = ctx.Output<framework::LoDTensor>("Out");
+auto *in = ctx.Input<framework::LoDTensor>("X");
+out->ShareDataWith(*in);
+```
+Op内部如果出现Output = ShareDataWith(Input)，相当于operator图中有一条隐藏边，连接了Input和Output，这条边无法在图分析中表达，引发基于图优化的错误。
+
+4. OP实现的性能实践
+调用了eigen的broadcast, chop等操作，性能会比手写cuda kernel差几倍以上。此时cpu的实现可以复用eigen，gpu实现可以实现cuda kernel.
+
+
+#### OP InferShape检查提示信息特别说明
+
+- 检查输入输出变量，请统一遵循以下格式
+`Input(变量名) of OP名 operator should not be null.`  
+
+正确示例：
+```
+PADDLE_ENFORCE(ctx->HasInput("Input"),
+                        "Input(Input) of LSTMP operator should not be null.");
+```
+
+- 反向Op的输入输出检查，要写明反向Op的名字
+
+正确示例：
+```
+PADDLE_ENFORCE(ctx->HasInput("X"),
+                        "Input(X) of LoDResetGrad operator should not be null.");
+```
diff --git a/source/advanced_usage/development/nvvp1.png b/doc/fluid/advanced_usage/development/nvvp1.png
similarity index 100%
rename from source/advanced_usage/development/nvvp1.png
rename to doc/fluid/advanced_usage/development/nvvp1.png
diff --git a/source/advanced_usage/development/nvvp2.png b/doc/fluid/advanced_usage/development/nvvp2.png
similarity index 100%
rename from source/advanced_usage/development/nvvp2.png
rename to doc/fluid/advanced_usage/development/nvvp2.png
diff --git a/source/advanced_usage/development/nvvp3.png b/doc/fluid/advanced_usage/development/nvvp3.png
similarity index 100%
rename from source/advanced_usage/development/nvvp3.png
rename to doc/fluid/advanced_usage/development/nvvp3.png
diff --git a/source/advanced_usage/development/nvvp4.png b/doc/fluid/advanced_usage/development/nvvp4.png
similarity index 100%
rename from source/advanced_usage/development/nvvp4.png
rename to doc/fluid/advanced_usage/development/nvvp4.png
diff --git a/source/advanced_usage/development/pprof_1.png b/doc/fluid/advanced_usage/development/pprof_1.png
similarity index 100%
rename from source/advanced_usage/development/pprof_1.png
rename to doc/fluid/advanced_usage/development/pprof_1.png
diff --git a/source/advanced_usage/development/pprof_2.png b/doc/fluid/advanced_usage/development/pprof_2.png
similarity index 100%
rename from source/advanced_usage/development/pprof_2.png
rename to doc/fluid/advanced_usage/development/pprof_2.png
diff --git a/source/advanced_usage/development/timeline.jpeg b/doc/fluid/advanced_usage/development/timeline.jpeg
similarity index 100%
rename from source/advanced_usage/development/timeline.jpeg
rename to doc/fluid/advanced_usage/development/timeline.jpeg
diff --git a/doc/fluid/advanced_usage/development/timeline_cn.md b/doc/fluid/advanced_usage/development/timeline_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..faf39f276dbddcd4961407ba2d082c9826051cbe
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/timeline_cn.md
@@ -0,0 +1,32 @@
+# 如何使用timeline工具做性能分析
+
+1. 在训练的主循环外加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后，代码会在`/tmp/profile`目录下生成一个profile的记录文件。
+
+	**提示：**
+	请不要在timeline记录信息时运行太多次迭代，因为timeline中的记录数量和迭代次数是成正比的。
+
+	```python
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
+	            ...
+	```
+
+1. 运行`python Paddle/tools/timeline.py`来处理`/tmp/profile`，这个程序默认会生成一个`/tmp/timeline`文件，你也可以用命令行参数来修改这个路径，请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。
+```bash
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
+1. 打开chrome浏览器，访问<chrome://tracing/>，用`load`按钮来加载生成的`timeline`文件。
+
+	![chrome tracing](./tracing.jpeg)
+
+1. 结果如下图所示，可以放大来查看timeline的细节信息。
+
+	![chrome timeline](./timeline.jpeg)
diff --git a/source/advanced_usage/development/tracing.jpeg b/doc/fluid/advanced_usage/development/tracing.jpeg
similarity index 100%
rename from source/advanced_usage/development/tracing.jpeg
rename to doc/fluid/advanced_usage/development/tracing.jpeg
diff --git a/doc/fluid/advanced_usage/development/write_docs.rst b/doc/fluid/advanced_usage/development/write_docs.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4231f2bb5cd800c0cd86835b5d07e491fcde4989
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/write_docs.rst
@@ -0,0 +1,136 @@
+#############
+如何贡献文档
+#############
+
+PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成的，PaddlePaddle.org工具可以帮助我们实现这一编译过程，并提供更好的预览效果。
+
+如何构建文档
+============
+
+PaddlePaddle的文档构建有两种方式，分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具，两种方式都有各自的优点，前者方便预览，后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。
+
+我们建议使用PaddlePaddle.org工具来构建文档。
+
+使用PaddlePaddle.org工具
+------------------------
+这个是目前推荐的使用方法。除了可以自动编译文档，还可以直接在网页中预览文档，需要注意的是，采用后续说明的其它方式虽然也可以预览文档，但是文档的样式与官网文档是不一致的，使用PaddlePaddle.org工具进行编译才能产生与官网文档样式一致的预览效果。
+
+PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后即可用以下命令启动工具
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令。
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+
+如果不想使用Docker，你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+
+工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请将PaddlePaddle工作目录指定给环境变量 CONTENT_DIR。
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+
+想了解更多PaddlePaddle.org工具的详细信息，可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。
+
+不使用PaddlePaddle.org工具
+--------------------------
+
+使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。该方法与 `从源码编译PaddlePaddle <http://paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html>`_ 相似，通过从源码中构建可用于编译PaddlePaddle文档的Docker镜像并运行，在进入Docker容器后使用源码中的脚本构建PaddlePaddle文档，具体步骤如下：
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+
+   # 从源码中构建可用于编译PaddlePaddle文档的Docker镜像
+   docker build -t paddle:dev .
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+   # 进入Docker容器后使用build.sh脚本构建PaddlePaddle文档
+   bash -x /paddle/paddle/scripts/docker/build.sh
+
+注：上述命令把当前目录（源码根目录）映射为 container 里的 :code:`/paddle` 目录。
+
+编译完成后，会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录，在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 、 ``api/en/html`` 共三个子目录，分别进入这些目录下，执行以下命令：
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。
+
+如果不想使用Docker，也可以使用以下命令直接构建PaddlePaddle文档，即
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   mkdir -p build
+   cd build
+   cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+
+   # 如果只需要构建使用文档，则执行以下命令
+   make -j $processors paddle_docs
+
+   # 如果只需要构建API，则执行以下命令
+   make -j $processors paddle_apis
+
+其中$processors代表启动和CPU核一样多的进程来并行编译，可以根据本机的CPU核数设置相应的值。
+
+编译完成后，同样会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录，如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录，选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录，分别进入这些子目录下，执行以下命令：
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。下图为生成的 ``v2`` 英文文档首页示例。注意，示例中由于使用了sphinx的原始主题，所以页面的风格与官网并不一致，但这并不影响开发者进行调试。
+
+..  image:: src/doc_en.png
+    :align: center
+    :scale: 60 %
+
+如何书写文档
+============
+
+PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程进行书写。
+
+如何更新www.paddlepaddle.org
+============================
+
+更新的文档以PR的形式提交到github中，提交方式参见 `如何贡献文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/dev/write_docs_cn.html>`_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ 和
+`英文文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。
+
+
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/source/advanced_usage/index.rst b/doc/fluid/advanced_usage/index.rst
similarity index 100%
rename from source/advanced_usage/index.rst
rename to doc/fluid/advanced_usage/index.rst
diff --git a/source/advanced_usage/pics/anakin_fm_ch.png b/doc/fluid/advanced_usage/pics/anakin_fm_ch.png
similarity index 100%
rename from source/advanced_usage/pics/anakin_fm_ch.png
rename to doc/fluid/advanced_usage/pics/anakin_fm_ch.png
diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..435d6e10fb02e9b2a8147f37da33e8848cc9b98a
--- /dev/null
+++ b/doc/fluid/api/CMakeLists.txt
@@ -0,0 +1,25 @@
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+set(IMPORT_PADDLE_STRING "import paddle")
+set(IMPORT_PADDLEV2_STRING "import paddle.v2")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_fluid_apis
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_fluid_apis  gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
diff --git a/doc/fluid/api/average.rst b/doc/fluid/api/average.rst
new file mode 100644
index 0000000000000000000000000000000000000000..496f5b29875443f0c44f50fcb3ca837f4e7bcd12
--- /dev/null
+++ b/doc/fluid/api/average.rst
@@ -0,0 +1,16 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=============
+fluid.average
+=============
+
+.. _api_fluid_average_WeightedAverage:
+
+WeightedAverage
+---------------
+
+..  autoclass:: paddle.fluid.average.WeightedAverage
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/backward.rst b/doc/fluid/api/backward.rst
new file mode 100644
index 0000000000000000000000000000000000000000..115e0d24b39928cfc349f72e0a21d6374cd8cd75
--- /dev/null
+++ b/doc/fluid/api/backward.rst
@@ -0,0 +1,23 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==============
+fluid.backward
+==============
+
+.. _api_fluid_backward_append_backward:
+
+append_backward
+---------------
+
+..  autofunction:: paddle.fluid.backward.append_backward
+    :noindex:
+
+.. _api_fluid_backward_calc_gradient:
+
+calc_gradient
+-------------
+
+..  autofunction:: paddle.fluid.backward.calc_gradient
+    :noindex:
+
diff --git a/doc/fluid/api/clip.rst b/doc/fluid/api/clip.rst
new file mode 100644
index 0000000000000000000000000000000000000000..aeefbb95a46e5d5ed46375e388a720fad2711779
--- /dev/null
+++ b/doc/fluid/api/clip.rst
@@ -0,0 +1,43 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+fluid.clip
+==========
+
+.. _api_fluid_clip_ErrorClipByValue:
+
+ErrorClipByValue
+----------------
+
+..  autoclass:: paddle.fluid.clip.ErrorClipByValue
+    :members:
+    :noindex:
+
+.. _api_fluid_clip_GradientClipByValue:
+
+GradientClipByValue
+-------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByValue
+    :members:
+    :noindex:
+
+.. _api_fluid_clip_GradientClipByNorm:
+
+GradientClipByNorm
+------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByNorm
+    :members:
+    :noindex:
+
+.. _api_fluid_clip_GradientClipByGlobalNorm:
+
+GradientClipByGlobalNorm
+------------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/data/data_reader.rst b/doc/fluid/api/data/data_reader.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1a35d0bbc8f9d751f49c7e1fc26feb1bcb3ae7f0
--- /dev/null
+++ b/doc/fluid/api/data/data_reader.rst
@@ -0,0 +1,72 @@
+=====================
+Data Reader Interface
+=====================
+
+
+DataTypes
+=========
+
+..  autofunction:: paddle.v2.data_type.dense_array
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_non_value_slot
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_value_slot
+    :noindex:
+
+..  autoclass:: paddle.v2.data_type.InputType
+    :members:
+    :noindex:
+
+DataFeeder
+==========
+
+..  automodule:: paddle.v2.data_feeder
+    :members:
+    :noindex:
+
+Reader
+======
+
+..  automodule:: paddle.reader
+    :members:
+    :noindex:
+
+..  automodule:: paddle.reader.creator
+    :members:
+    :noindex:
+
+minibatch
+=========
+
+..  automodule:: paddle.v2.minibatch
+    :members:
+    :noindex:
diff --git a/doc/fluid/api/data/dataset.rst b/doc/fluid/api/data/dataset.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e7c8be4452bf55e0967d750c2e624e8e316e9330
--- /dev/null
+++ b/doc/fluid/api/data/dataset.rst
@@ -0,0 +1,82 @@
+Dataset
+=======
+
+..  automodule:: paddle.dataset
+    :members:
+    :noindex:
+
+mnist
++++++
+
+..  automodule:: paddle.dataset.mnist
+    :members:
+    :noindex:
+
+cifar
++++++
+
+..  automodule:: paddle.dataset.cifar
+    :members:
+    :noindex:
+
+conll05
++++++++
+
+..  automodule:: paddle.dataset.conll05
+    :members: get_dict,get_embedding,test
+    :noindex:
+
+imdb
+++++
+
+..  automodule:: paddle.dataset.imdb
+    :members:
+    :noindex:
+
+imikolov
+++++++++
+
+..  automodule:: paddle.dataset.imikolov
+    :members:
+    :noindex:
+
+movielens
++++++++++
+
+..  automodule:: paddle.dataset.movielens
+    :members:
+    :noindex:
+
+..  autoclass:: paddle.dataset.movielens.MovieInfo
+    :noindex:
+
+..  autoclass:: paddle.dataset.movielens.UserInfo
+    :noindex:
+
+sentiment
++++++++++
+
+..  automodule:: paddle.dataset.sentiment
+    :members:
+    :noindex:
+
+uci_housing
++++++++++++
+
+..  automodule:: paddle.dataset.uci_housing
+    :members:
+    :noindex:
+
+wmt14
++++++
+
+..  automodule:: paddle.dataset.wmt14
+    :members:
+    :noindex:
+
+wmt16
++++++
+
+..  automodule:: paddle.dataset.wmt16
+    :members:
+    :noindex:
diff --git a/doc/fluid/api/data/image.rst b/doc/fluid/api/data/image.rst
new file mode 100644
index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49
--- /dev/null
+++ b/doc/fluid/api/data/image.rst
@@ -0,0 +1,5 @@
+Image Interface
+===============
+
+..  automodule:: paddle.v2.image
+    :members:
diff --git a/doc/fluid/api/data_feeder.rst b/doc/fluid/api/data_feeder.rst
new file mode 100644
index 0000000000000000000000000000000000000000..11d2890f5b3446e37c3ef31e5a17ebebe169dbc8
--- /dev/null
+++ b/doc/fluid/api/data_feeder.rst
@@ -0,0 +1,16 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=================
+fluid.data_feeder
+=================
+
+.. _api_fluid_data_feeder_DataFeeder:
+
+DataFeeder
+----------
+
+..  autoclass:: paddle.fluid.data_feeder.DataFeeder
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f23ecc1f80030f20359ce9675130a167722606c9
--- /dev/null
+++ b/doc/fluid/api/executor.rst
@@ -0,0 +1,40 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==============
+fluid.executor
+==============
+
+.. _api_fluid_executor_Executor:
+
+Executor
+--------
+
+..  autoclass:: paddle.fluid.executor.Executor
+    :members:
+    :noindex:
+
+.. _api_fluid_executor_global_scope:
+
+global_scope
+------------
+
+..  autofunction:: paddle.fluid.executor.global_scope
+    :noindex:
+
+.. _api_fluid_executor_scope_guard:
+
+scope_guard
+-----------
+
+..  autofunction:: paddle.fluid.executor.scope_guard
+    :noindex:
+
+.. _api_fluid_executor__switch_scope:
+
+_switch_scope
+-------------
+
+..  autofunction:: paddle.fluid.executor._switch_scope
+    :noindex:
+
diff --git a/doc/fluid/api/fluid.rst b/doc/fluid/api/fluid.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7eab58355c3648d929d3b5d98984adce9034f016
--- /dev/null
+++ b/doc/fluid/api/fluid.rst
@@ -0,0 +1,362 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=====
+fluid
+=====
+
+.. _api_fluid_Block:
+
+Block
+-----
+
+..  autoclass:: paddle.fluid.Block
+    :members:
+    :noindex:
+
+.. _api_fluid_Variable:
+
+Variable
+--------
+
+..  autoclass:: paddle.fluid.Variable
+    :members:
+    :noindex:
+
+.. _api_fluid_Program:
+
+Program
+-------
+
+..  autoclass:: paddle.fluid.Program
+    :members:
+    :noindex:
+
+.. _api_fluid_Operator:
+
+Operator
+--------
+
+..  autoclass:: paddle.fluid.Operator
+    :members:
+    :noindex:
+
+.. _api_fluid_default_startup_program:
+
+default_startup_program
+-----------------------
+
+..  autofunction:: paddle.fluid.default_startup_program
+    :noindex:
+
+.. _api_fluid_default_main_program:
+
+default_main_program
+--------------------
+
+..  autofunction:: paddle.fluid.default_main_program
+    :noindex:
+
+.. _api_fluid_program_guard:
+
+program_guard
+-------------
+
+..  autofunction:: paddle.fluid.program_guard
+    :noindex:
+
+.. _api_fluid_get_var:
+
+get_var
+-------
+
+..  autofunction:: paddle.fluid.get_var
+    :noindex:
+
+.. _api_fluid_Executor:
+
+Executor
+--------
+
+..  autoclass:: paddle.fluid.Executor
+    :members:
+    :noindex:
+
+.. _api_fluid_global_scope:
+
+global_scope
+------------
+
+..  autofunction:: paddle.fluid.global_scope
+    :noindex:
+
+.. _api_fluid_scope_guard:
+
+scope_guard
+-----------
+
+..  autofunction:: paddle.fluid.scope_guard
+    :noindex:
+
+.. _api_fluid__switch_scope:
+
+_switch_scope
+-------------
+
+..  autofunction:: paddle.fluid._switch_scope
+    :noindex:
+
+
+.. _api_fluid_make_channel:
+
+make_channel
+------------
+
+..  autofunction:: paddle.fluid.make_channel
+    :noindex:
+
+.. _api_fluid_channel_send:
+
+channel_send
+------------
+
+..  autofunction:: paddle.fluid.channel_send
+    :noindex:
+
+.. _api_fluid_channel_recv:
+
+channel_recv
+------------
+
+..  autofunction:: paddle.fluid.channel_recv
+    :noindex:
+
+.. _api_fluid_channel_close:
+
+channel_close
+-------------
+
+..  autofunction:: paddle.fluid.channel_close
+    :noindex:
+
+.. _api_fluid_Select:
+
+Select
+------
+
+..  autoclass:: paddle.fluid.Select
+    :members:
+    :noindex:
+
+.. _api_fluid_Trainer:
+
+Trainer
+-------
+
+..  autoclass:: paddle.fluid.Trainer
+    :members:
+    :noindex:
+
+.. _api_fluid_BeginEpochEvent:
+
+BeginEpochEvent
+---------------
+
+..  autoclass:: paddle.fluid.BeginEpochEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_EndEpochEvent:
+
+EndEpochEvent
+-------------
+
+..  autoclass:: paddle.fluid.EndEpochEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_BeginStepEvent:
+
+BeginStepEvent
+--------------
+
+..  autoclass:: paddle.fluid.BeginStepEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_EndStepEvent:
+
+EndStepEvent
+------------
+
+..  autoclass:: paddle.fluid.EndStepEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_CheckpointConfig:
+
+CheckpointConfig
+----------------
+
+..  autoclass:: paddle.fluid.CheckpointConfig
+    :members:
+    :noindex:
+
+.. _api_fluid_Inferencer:
+
+Inferencer
+----------
+
+..  autoclass:: paddle.fluid.Inferencer
+    :members:
+    :noindex:
+
+.. _api_fluid_DistributeTranspiler:
+
+DistributeTranspiler
+--------------------
+
+..  autoclass:: paddle.fluid.DistributeTranspiler
+    :members:
+    :noindex:
+
+.. _api_fluid_memory_optimize:
+
+memory_optimize
+---------------
+
+..  autofunction:: paddle.fluid.memory_optimize
+    :noindex:
+
+.. _api_fluid_release_memory:
+
+release_memory
+--------------
+
+..  autofunction:: paddle.fluid.release_memory
+    :noindex:
+
+.. _api_fluid_ParallelExecutor:
+
+ParallelExecutor
+----------------
+
+..  autoclass:: paddle.fluid.ParallelExecutor
+    :members:
+    :noindex:
+
+.. _api_fluid_ExecutionStrategy:
+
+ExecutionStrategy
+-----------------
+
+..  autoclass:: paddle.fluid.ExecutionStrategy
+    :members:
+    :noindex:
+
+.. _api_fluid_BuildStrategy:
+
+BuildStrategy
+-------------
+
+..  autoclass:: paddle.fluid.BuildStrategy
+    :members:
+    :noindex:
+
+.. _api_fluid_create_lod_tensor:
+
+create_lod_tensor
+-----------------
+
+..  autofunction:: paddle.fluid.create_lod_tensor
+    :noindex:
+
+.. _api_fluid_create_random_int_lodtensor:
+
+create_random_int_lodtensor
+---------------------------
+
+..  autofunction:: paddle.fluid.create_random_int_lodtensor
+    :noindex:
+
+.. _api_fluid_LoDTensor:
+
+LoDTensor
+---------
+
+..  autoclass:: paddle.fluid.LoDTensor
+    :members:
+    :noindex:
+
+.. _api_fluid_CPUPlace:
+
+CPUPlace
+--------
+
+..  autoclass:: paddle.fluid.CPUPlace
+    :members:
+    :noindex:
+
+.. _api_fluid_CUDAPlace:
+
+CUDAPlace
+---------
+
+..  autoclass:: paddle.fluid.CUDAPlace
+    :members:
+    :noindex:
+
+.. _api_fluid_CUDAPinnedPlace:
+
+CUDAPinnedPlace
+---------------
+
+..  autoclass:: paddle.fluid.CUDAPinnedPlace
+    :members:
+    :noindex:
+
+.. _api_fluid_Tensor:
+
+Tensor
+------
+
+..  autoclass:: paddle.fluid.Tensor
+    :members:
+    :noindex:
+
+.. _api_fluid_ParamAttr:
+
+ParamAttr
+---------
+
+..  autoclass:: paddle.fluid.ParamAttr
+    :members:
+    :noindex:
+
+.. _api_fluid_WeightNormParamAttr:
+
+WeightNormParamAttr
+-------------------
+
+..  autoclass:: paddle.fluid.WeightNormParamAttr
+    :members:
+    :noindex:
+
+.. _api_fluid_DataFeeder:
+
+DataFeeder
+----------
+
+..  autoclass:: paddle.fluid.DataFeeder
+    :members:
+    :noindex:
+
+.. _api_fluid_Scope:
+
+Scope
+-----
+
+..  autoclass:: paddle.fluid.Scope
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/gen_doc.py b/doc/fluid/api/gen_doc.py
new file mode 100644
index 0000000000000000000000000000000000000000..02efce2bf8392c62a7600c272bedcadc6563f927
--- /dev/null
+++ b/doc/fluid/api/gen_doc.py
@@ -0,0 +1,172 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Write Sphinx autodoc reStructuredText for ``paddle.fluid`` or one of its
+attributes to a stream (standard output by default)."""
+
+from __future__ import print_function
+import argparse
+import sys
+import types
+
+import paddle.fluid as fluid
+
+
+def parse_arg():
+    """Parse the command line.
+
+    Returns:
+        argparse.Namespace: ``module`` holds the positional module name and
+        ``submodules`` the values given to ``--submodules`` (``None`` when
+        the option is absent).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--submodules', nargs="*")
+    parser.add_argument(
+        'module', type=str, help='Generate the documentation of which module')
+    return parser.parse_args()
+
+
+class DocGenerator(object):
+    """Writes autodoc entries for the names in a module's ``__all__``.
+
+    Constructing an instance immediately writes the "generated file" banner
+    and the page title to ``stream``.
+    """
+
+    def __init__(self, module_name=None, stream=sys.stdout):
+        """
+        Args:
+            module_name (str|None): attribute of ``paddle.fluid`` to document;
+                ``None`` or ``""`` selects ``paddle.fluid`` itself.
+            stream: object whose ``write`` method receives the output.
+
+        Raises:
+            ValueError: ``paddle.fluid`` has no attribute ``module_name``.
+        """
+        self.stream = stream
+        if module_name is None or module_name == "":
+            self.module_name = "fluid"
+            self.module = fluid
+        elif hasattr(fluid, module_name):
+            self.module_name = "fluid." + module_name
+            self.module = getattr(fluid, module_name)
+        else:
+            raise ValueError("Cannot find fluid.{0}".format(module_name))
+        self.stream.write('''..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+''')
+
+        self._print_header_(self.module_name, dot='=', is_title=True)
+
+    def print_submodule(self, submodule_name):
+        """Write a section heading for one submodule, then an entry for each
+        name in its ``__all__``.
+
+        Raises:
+            ValueError: the current module has no attribute ``submodule_name``.
+        """
+        # Two-argument getattr would raise AttributeError before the None
+        # check below could run, so pass an explicit default.
+        submodule = getattr(self.module, submodule_name, None)
+        if submodule is None:
+            raise ValueError("Cannot find submodule {0}".format(submodule_name))
+        self.print_section(submodule_name)
+
+        # print_item resolves each name on self.module, not on the submodule.
+        for item in submodule.__all__:
+            self.print_item(item)
+
+    def print_current_module(self):
+        """Write an entry for each name in the current module's ``__all__``."""
+        for item in self.module.__all__:
+            self.print_item(item)
+
+    def print_section(self, name):
+        """Write ``name`` as a section heading underlined with ``=``."""
+        self._print_header_(name, dot='=', is_title=False)
+
+    def print_item(self, name):
+        """Write the entry for attribute ``name`` of the current module.
+
+        Classes and plain Python functions are documented; a missing or
+        ``None`` attribute and any other kind of object are skipped silently.
+        """
+        item = getattr(self.module, name, None)
+        if item is None:
+            return
+        # ``type`` instead of ``types.TypeType``: the latter is Python 2 only,
+        # where it is merely an alias of ``type``.
+        if isinstance(item, type):
+            self.print_class(name)
+        elif isinstance(item, types.FunctionType):
+            self.print_method(name)
+
+    def print_class(self, name):
+        """Write the label, heading and ``autoclass`` entry for ``name``."""
+        self._print_ref_(name)
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autoclass:: paddle.{0}.{1}
+    :members:
+    :noindex:
+
+'''.format(self.module_name, name))
+
+    def print_method(self, name):
+        """Write the label, heading and ``autofunction`` entry for ``name``."""
+        self._print_ref_(name)
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autofunction:: paddle.{0}.{1}
+    :noindex:
+
+'''.format(self.module_name, name))
+
+    def _print_header_(self, name, dot, is_title):
+        """Write ``name`` underlined with ``dot`` repeated to its length.
+
+        When ``is_title`` is true the same line is also written above the
+        name. A blank line follows the heading.
+        """
+        dot_line = dot * len(name)
+        if is_title:
+            self.stream.write(dot_line)
+            self.stream.write('\n')
+        self.stream.write(name)
+        self.stream.write('\n')
+        self.stream.write(dot_line)
+        self.stream.write('\n')
+        self.stream.write('\n')
+
+    def _print_ref_(self, name):
+        """Write the label ``.. _api_<module>_<name>:`` with dots in the module
+        name replaced by underscores."""
+        self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join(
+            self.module_name.split(".")), name))
+
+
+def main():
+    """Document the whole module, or only the submodules named with
+    ``--submodules`` when that option is given."""
+    args = parse_arg()
+    gen = DocGenerator(args.module)
+    if args.submodules is None:
+        gen.print_current_module()
+    else:
+        for submodule_name in args.submodules:
+            gen.print_submodule(submodule_name)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/source/api_reference/gen_doc.sh b/doc/fluid/api/gen_doc.sh
similarity index 100%
rename from source/api_reference/gen_doc.sh
rename to doc/fluid/api/gen_doc.sh
diff --git a/source/api_reference/index.rst b/doc/fluid/api/index_en.rst
similarity index 100%
rename from source/api_reference/index.rst
rename to doc/fluid/api/index_en.rst
diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dc0b52b14fd242dfaded1cb9a8e0ab9eb66b0607
--- /dev/null
+++ b/doc/fluid/api/initializer.rst
@@ -0,0 +1,131 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=================
+fluid.initializer
+=================
+
+.. _api_fluid_initializer_Constant:
+
+Constant
+--------
+
+..  autoclass:: paddle.fluid.initializer.Constant
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_Uniform:
+
+Uniform
+-------
+
+..  autoclass:: paddle.fluid.initializer.Uniform
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_Normal:
+
+Normal
+------
+
+..  autoclass:: paddle.fluid.initializer.Normal
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_Xavier:
+
+Xavier
+------
+
+..  autoclass:: paddle.fluid.initializer.Xavier
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_Bilinear:
+
+Bilinear
+--------
+
+..  autoclass:: paddle.fluid.initializer.Bilinear
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_MSRA:
+
+MSRA
+----
+
+..  autoclass:: paddle.fluid.initializer.MSRA
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_force_init_on_cpu:
+
+force_init_on_cpu
+-----------------
+
+..  autofunction:: paddle.fluid.initializer.force_init_on_cpu
+    :noindex:
+
+.. _api_fluid_initializer_init_on_cpu:
+
+init_on_cpu
+-----------
+
+..  autofunction:: paddle.fluid.initializer.init_on_cpu
+    :noindex:
+
+.. _api_fluid_initializer_ConstantInitializer:
+
+ConstantInitializer
+-------------------
+
+..  autoclass:: paddle.fluid.initializer.ConstantInitializer
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_UniformInitializer:
+
+UniformInitializer
+------------------
+
+..  autoclass:: paddle.fluid.initializer.UniformInitializer
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_NormalInitializer:
+
+NormalInitializer
+-----------------
+
+..  autoclass:: paddle.fluid.initializer.NormalInitializer
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_XavierInitializer:
+
+XavierInitializer
+-----------------
+
+..  autoclass:: paddle.fluid.initializer.XavierInitializer
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_BilinearInitializer:
+
+BilinearInitializer
+-------------------
+
+..  autoclass:: paddle.fluid.initializer.BilinearInitializer
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_MSRAInitializer:
+
+MSRAInitializer
+---------------
+
+..  autoclass:: paddle.fluid.initializer.MSRAInitializer
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7cee0bc4d9aa2c51517d23a381f14a8f63cc3681
--- /dev/null
+++ b/doc/fluid/api/io.rst
@@ -0,0 +1,127 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+========
+fluid.io
+========
+
+.. _api_fluid_io_save_vars:
+
+save_vars
+---------
+
+..  autofunction:: paddle.fluid.io.save_vars
+    :noindex:
+
+.. _api_fluid_io_save_params:
+
+save_params
+-----------
+
+..  autofunction:: paddle.fluid.io.save_params
+    :noindex:
+
+.. _api_fluid_io_save_persistables:
+
+save_persistables
+-----------------
+
+..  autofunction:: paddle.fluid.io.save_persistables
+    :noindex:
+
+.. _api_fluid_io_load_vars:
+
+load_vars
+---------
+
+..  autofunction:: paddle.fluid.io.load_vars
+    :noindex:
+
+.. _api_fluid_io_load_params:
+
+load_params
+-----------
+
+..  autofunction:: paddle.fluid.io.load_params
+    :noindex:
+
+.. _api_fluid_io_load_persistables:
+
+load_persistables
+-----------------
+
+..  autofunction:: paddle.fluid.io.load_persistables
+    :noindex:
+
+.. _api_fluid_io_save_inference_model:
+
+save_inference_model
+--------------------
+
+..  autofunction:: paddle.fluid.io.save_inference_model
+    :noindex:
+
+.. _api_fluid_io_load_inference_model:
+
+load_inference_model
+--------------------
+
+..  autofunction:: paddle.fluid.io.load_inference_model
+    :noindex:
+
+.. _api_fluid_io_get_inference_program:
+
+get_inference_program
+---------------------
+
+..  autofunction:: paddle.fluid.io.get_inference_program
+    :noindex:
+
+.. _api_fluid_io_save_checkpoint:
+
+save_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.save_checkpoint
+    :noindex:
+
+.. _api_fluid_io_load_checkpoint:
+
+load_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.load_checkpoint
+    :noindex:
+
+.. _api_fluid_io_clean_checkpoint:
+
+clean_checkpoint
+----------------
+
+..  autofunction:: paddle.fluid.io.clean_checkpoint
+    :noindex:
+
+.. _api_fluid_io_load_persist_vars_without_grad:
+
+load_persist_vars_without_grad
+------------------------------
+
+..  autofunction:: paddle.fluid.io.load_persist_vars_without_grad
+    :noindex:
+
+.. _api_fluid_io_save_persist_vars_without_grad:
+
+save_persist_vars_without_grad
+------------------------------
+
+..  autofunction:: paddle.fluid.io.save_persist_vars_without_grad
+    :noindex:
+
+.. _api_fluid_io_get_latest_checkpoint_serial:
+
+get_latest_checkpoint_serial
+----------------------------
+
+..  autofunction:: paddle.fluid.io.get_latest_checkpoint_serial
+    :noindex:
+
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ecbd8191ccf5aa6046e7875fe8afa2ed0105e4a0
--- /dev/null
+++ b/doc/fluid/api/layers.rst
@@ -0,0 +1,1778 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+============
+fluid.layers
+============
+
+control_flow
+============
+
+.. _api_fluid_layers_split_lod_tensor:
+
+split_lod_tensor
+----------------
+
+..  autofunction:: paddle.fluid.layers.split_lod_tensor
+    :noindex:
+
+.. _api_fluid_layers_merge_lod_tensor:
+
+merge_lod_tensor
+----------------
+
+..  autofunction:: paddle.fluid.layers.merge_lod_tensor
+    :noindex:
+
+.. _api_fluid_layers_BlockGuard:
+
+BlockGuard
+----------
+
+..  autoclass:: paddle.fluid.layers.BlockGuard
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_BlockGuardWithCompletion:
+
+BlockGuardWithCompletion
+------------------------
+
+..  autoclass:: paddle.fluid.layers.BlockGuardWithCompletion
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_WhileGuard:
+
+WhileGuard
+----------
+
+..  autoclass:: paddle.fluid.layers.WhileGuard
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_While:
+
+While
+-----
+
+..  autoclass:: paddle.fluid.layers.While
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_Switch:
+
+Switch
+------
+
+..  autoclass:: paddle.fluid.layers.Switch
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_lod_rank_table:
+
+lod_rank_table
+--------------
+
+..  autofunction:: paddle.fluid.layers.lod_rank_table
+    :noindex:
+
+.. _api_fluid_layers_max_sequence_len:
+
+max_sequence_len
+----------------
+
+..  autofunction:: paddle.fluid.layers.max_sequence_len
+    :noindex:
+
+.. _api_fluid_layers_lod_tensor_to_array:
+
+lod_tensor_to_array
+-------------------
+
+..  autofunction:: paddle.fluid.layers.lod_tensor_to_array
+    :noindex:
+
+.. _api_fluid_layers_array_to_lod_tensor:
+
+array_to_lod_tensor
+-------------------
+
+..  autofunction:: paddle.fluid.layers.array_to_lod_tensor
+    :noindex:
+
+.. _api_fluid_layers_increment:
+
+increment
+---------
+
+..  autofunction:: paddle.fluid.layers.increment
+    :noindex:
+
+.. _api_fluid_layers_array_write:
+
+array_write
+-----------
+
+..  autofunction:: paddle.fluid.layers.array_write
+    :noindex:
+
+.. _api_fluid_layers_create_array:
+
+create_array
+------------
+
+..  autofunction:: paddle.fluid.layers.create_array
+    :noindex:
+
+.. _api_fluid_layers_less_than:
+
+less_than
+---------
+
+..  autofunction:: paddle.fluid.layers.less_than
+    :noindex:
+
+.. _api_fluid_layers_equal:
+
+equal
+-----
+
+..  autofunction:: paddle.fluid.layers.equal
+    :noindex:
+
+.. _api_fluid_layers_array_read:
+
+array_read
+----------
+
+..  autofunction:: paddle.fluid.layers.array_read
+    :noindex:
+
+.. _api_fluid_layers_shrink_memory:
+
+shrink_memory
+-------------
+
+..  autofunction:: paddle.fluid.layers.shrink_memory
+    :noindex:
+
+.. _api_fluid_layers_array_length:
+
+array_length
+------------
+
+..  autofunction:: paddle.fluid.layers.array_length
+    :noindex:
+
+.. _api_fluid_layers_IfElse:
+
+IfElse
+------
+
+..  autoclass:: paddle.fluid.layers.IfElse
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_DynamicRNN:
+
+DynamicRNN
+----------
+
+..  autoclass:: paddle.fluid.layers.DynamicRNN
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_ConditionalBlock:
+
+ConditionalBlock
+----------------
+
+..  autoclass:: paddle.fluid.layers.ConditionalBlock
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_StaticRNN:
+
+StaticRNN
+---------
+
+..  autoclass:: paddle.fluid.layers.StaticRNN
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_reorder_lod_tensor_by_rank:
+
+reorder_lod_tensor_by_rank
+--------------------------
+
+..  autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank
+    :noindex:
+
+.. _api_fluid_layers_ParallelDo:
+
+ParallelDo
+----------
+
+..  autoclass:: paddle.fluid.layers.ParallelDo
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_Print:
+
+Print
+-----
+
+..  autofunction:: paddle.fluid.layers.Print
+    :noindex:
+
+.. _api_fluid_layers_is_empty:
+
+is_empty
+--------
+
+..  autofunction:: paddle.fluid.layers.is_empty
+    :noindex:
+
+device
+======
+
+.. _api_fluid_layers_get_places:
+
+get_places
+----------
+
+..  autofunction:: paddle.fluid.layers.get_places
+    :noindex:
+
+io
+==
+
+.. _api_fluid_layers_data:
+
+data
+----
+
+..  autofunction:: paddle.fluid.layers.data
+    :noindex:
+
+.. _api_fluid_layers_BlockGuardServ:
+
+BlockGuardServ
+--------------
+
+..  autoclass:: paddle.fluid.layers.BlockGuardServ
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_ListenAndServ:
+
+ListenAndServ
+-------------
+
+..  autoclass:: paddle.fluid.layers.ListenAndServ
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_Send:
+
+Send
+----
+
+..  autofunction:: paddle.fluid.layers.Send
+    :noindex:
+
+.. _api_fluid_layers_Recv:
+
+Recv
+----
+
+..  autofunction:: paddle.fluid.layers.Recv
+    :noindex:
+
+.. _api_fluid_layers_open_recordio_file:
+
+open_recordio_file
+------------------
+
+..  autofunction:: paddle.fluid.layers.open_recordio_file
+    :noindex:
+
+.. _api_fluid_layers_open_files:
+
+open_files
+----------
+
+..  autofunction:: paddle.fluid.layers.open_files
+    :noindex:
+
+.. _api_fluid_layers_read_file:
+
+read_file
+---------
+
+..  autofunction:: paddle.fluid.layers.read_file
+    :noindex:
+
+.. _api_fluid_layers_shuffle:
+
+shuffle
+-------
+
+..  autofunction:: paddle.fluid.layers.shuffle
+    :noindex:
+
+.. _api_fluid_layers_batch:
+
+batch
+-----
+
+..  autofunction:: paddle.fluid.layers.batch
+    :noindex:
+
+.. _api_fluid_layers_double_buffer:
+
+double_buffer
+-------------
+
+..  autofunction:: paddle.fluid.layers.double_buffer
+    :noindex:
+
+.. _api_fluid_layers_random_data_generator:
+
+random_data_generator
+---------------------
+
+..  autofunction:: paddle.fluid.layers.random_data_generator
+    :noindex:
+
+.. _api_fluid_layers_Preprocessor:
+
+Preprocessor
+------------
+
+..  autoclass:: paddle.fluid.layers.Preprocessor
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_load:
+
+load
+----
+
+..  autofunction:: paddle.fluid.layers.load
+    :noindex:
+
+nn
+==
+
+.. _api_fluid_layers_fc:
+
+fc
+--
+
+..  autofunction:: paddle.fluid.layers.fc
+    :noindex:
+
+.. _api_fluid_layers_embedding:
+
+embedding
+---------
+
+..  autofunction:: paddle.fluid.layers.embedding
+    :noindex:
+
+.. _api_fluid_layers_dynamic_lstm:
+
+dynamic_lstm
+------------
+
+..  autofunction:: paddle.fluid.layers.dynamic_lstm
+    :noindex:
+
+.. _api_fluid_layers_dynamic_lstmp:
+
+dynamic_lstmp
+-------------
+
+..  autofunction:: paddle.fluid.layers.dynamic_lstmp
+    :noindex:
+
+.. _api_fluid_layers_dynamic_gru:
+
+dynamic_gru
+-----------
+
+..  autofunction:: paddle.fluid.layers.dynamic_gru
+    :noindex:
+
+.. _api_fluid_layers_gru_unit:
+
+gru_unit
+--------
+
+..  autofunction:: paddle.fluid.layers.gru_unit
+    :noindex:
+
+.. _api_fluid_layers_linear_chain_crf:
+
+linear_chain_crf
+----------------
+
+..  autofunction:: paddle.fluid.layers.linear_chain_crf
+    :noindex:
+
+.. _api_fluid_layers_crf_decoding:
+
+crf_decoding
+------------
+
+..  autofunction:: paddle.fluid.layers.crf_decoding
+    :noindex:
+
+.. _api_fluid_layers_cos_sim:
+
+cos_sim
+-------
+
+..  autofunction:: paddle.fluid.layers.cos_sim
+    :noindex:
+
+.. _api_fluid_layers_cross_entropy:
+
+cross_entropy
+-------------
+
+..  autofunction:: paddle.fluid.layers.cross_entropy
+    :noindex:
+
+.. _api_fluid_layers_square_error_cost:
+
+square_error_cost
+-----------------
+
+..  autofunction:: paddle.fluid.layers.square_error_cost
+    :noindex:
+
+.. _api_fluid_layers_chunk_eval:
+
+chunk_eval
+----------
+
+..  autofunction:: paddle.fluid.layers.chunk_eval
+    :noindex:
+
+.. _api_fluid_layers_sequence_conv:
+
+sequence_conv
+-------------
+
+..  autofunction:: paddle.fluid.layers.sequence_conv
+    :noindex:
+
+.. _api_fluid_layers_conv2d:
+
+conv2d
+------
+
+..  autofunction:: paddle.fluid.layers.conv2d
+    :noindex:
+
+.. _api_fluid_layers_conv3d:
+
+conv3d
+------
+
+..  autofunction:: paddle.fluid.layers.conv3d
+    :noindex:
+
+.. _api_fluid_layers_sequence_pool:
+
+sequence_pool
+-------------
+
+..  autofunction:: paddle.fluid.layers.sequence_pool
+    :noindex:
+
+.. _api_fluid_layers_sequence_softmax:
+
+sequence_softmax
+----------------
+
+..  autofunction:: paddle.fluid.layers.sequence_softmax
+    :noindex:
+
+.. _api_fluid_layers_softmax:
+
+softmax
+-------
+
+..  autofunction:: paddle.fluid.layers.softmax
+    :noindex:
+
+.. _api_fluid_layers_pool2d:
+
+pool2d
+------
+
+..  autofunction:: paddle.fluid.layers.pool2d
+    :noindex:
+
+.. _api_fluid_layers_pool3d:
+
+pool3d
+------
+
+..  autofunction:: paddle.fluid.layers.pool3d
+    :noindex:
+
+.. _api_fluid_layers_batch_norm:
+
+batch_norm
+----------
+
+..  autofunction:: paddle.fluid.layers.batch_norm
+    :noindex:
+
+.. _api_fluid_layers_beam_search_decode:
+
+beam_search_decode
+------------------
+
+..  autofunction:: paddle.fluid.layers.beam_search_decode
+    :noindex:
+
+.. _api_fluid_layers_conv2d_transpose:
+
+conv2d_transpose
+----------------
+
+..  autofunction:: paddle.fluid.layers.conv2d_transpose
+    :noindex:
+
+.. _api_fluid_layers_conv3d_transpose:
+
+conv3d_transpose
+----------------
+
+..  autofunction:: paddle.fluid.layers.conv3d_transpose
+    :noindex:
+
+.. _api_fluid_layers_sequence_expand:
+
+sequence_expand
+---------------
+
+..  autofunction:: paddle.fluid.layers.sequence_expand
+    :noindex:
+
+.. _api_fluid_layers_lstm_unit:
+
+lstm_unit
+---------
+
+..  autofunction:: paddle.fluid.layers.lstm_unit
+    :noindex:
+
+.. _api_fluid_layers_reduce_sum:
+
+reduce_sum
+----------
+
+..  autofunction:: paddle.fluid.layers.reduce_sum
+    :noindex:
+
+.. _api_fluid_layers_reduce_mean:
+
+reduce_mean
+-----------
+
+..  autofunction:: paddle.fluid.layers.reduce_mean
+    :noindex:
+
+.. _api_fluid_layers_reduce_max:
+
+reduce_max
+----------
+
+..  autofunction:: paddle.fluid.layers.reduce_max
+    :noindex:
+
+.. _api_fluid_layers_reduce_min:
+
+reduce_min
+----------
+
+..  autofunction:: paddle.fluid.layers.reduce_min
+    :noindex:
+
+.. _api_fluid_layers_reduce_prod:
+
+reduce_prod
+-----------
+
+..  autofunction:: paddle.fluid.layers.reduce_prod
+    :noindex:
+
+.. _api_fluid_layers_sequence_first_step:
+
+sequence_first_step
+-------------------
+
+..  autofunction:: paddle.fluid.layers.sequence_first_step
+    :noindex:
+
+.. _api_fluid_layers_sequence_last_step:
+
+sequence_last_step
+------------------
+
+..  autofunction:: paddle.fluid.layers.sequence_last_step
+    :noindex:
+
+.. _api_fluid_layers_dropout:
+
+dropout
+-------
+
+..  autofunction:: paddle.fluid.layers.dropout
+    :noindex:
+
+.. _api_fluid_layers_split:
+
+split
+-----
+
+..  autofunction:: paddle.fluid.layers.split
+    :noindex:
+
+.. _api_fluid_layers_ctc_greedy_decoder:
+
+ctc_greedy_decoder
+------------------
+
+..  autofunction:: paddle.fluid.layers.ctc_greedy_decoder
+    :noindex:
+
+.. _api_fluid_layers_edit_distance:
+
+edit_distance
+-------------
+
+..  autofunction:: paddle.fluid.layers.edit_distance
+    :noindex:
+
+.. _api_fluid_layers_l2_normalize:
+
+l2_normalize
+------------
+
+..  autofunction:: paddle.fluid.layers.l2_normalize
+    :noindex:
+
+.. _api_fluid_layers_matmul:
+
+matmul
+------
+
+..  autofunction:: paddle.fluid.layers.matmul
+    :noindex:
+
+.. _api_fluid_layers_topk:
+
+topk
+----
+
+..  autofunction:: paddle.fluid.layers.topk
+    :noindex:
+
+.. _api_fluid_layers_warpctc:
+
+warpctc
+-------
+
+..  autofunction:: paddle.fluid.layers.warpctc
+    :noindex:
+
+.. _api_fluid_layers_sequence_reshape:
+
+sequence_reshape
+----------------
+
+..  autofunction:: paddle.fluid.layers.sequence_reshape
+    :noindex:
+
+.. _api_fluid_layers_transpose:
+
+transpose
+---------
+
+..  autofunction:: paddle.fluid.layers.transpose
+    :noindex:
+
+.. _api_fluid_layers_im2sequence:
+
+im2sequence
+-----------
+
+..  autofunction:: paddle.fluid.layers.im2sequence
+    :noindex:
+
+.. _api_fluid_layers_nce:
+
+nce
+---
+
+..  autofunction:: paddle.fluid.layers.nce
+    :noindex:
+
+.. _api_fluid_layers_beam_search:
+
+beam_search
+-----------
+
+..  autofunction:: paddle.fluid.layers.beam_search
+    :noindex:
+
+.. _api_fluid_layers_row_conv:
+
+row_conv
+--------
+
+..  autofunction:: paddle.fluid.layers.row_conv
+    :noindex:
+
+.. _api_fluid_layers_multiplex:
+
+multiplex
+---------
+
+..  autofunction:: paddle.fluid.layers.multiplex
+    :noindex:
+
+.. _api_fluid_layers_layer_norm:
+
+layer_norm
+----------
+
+..  autofunction:: paddle.fluid.layers.layer_norm
+    :noindex:
+
+.. _api_fluid_layers_softmax_with_cross_entropy:
+
+softmax_with_cross_entropy
+--------------------------
+
+..  autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
+    :noindex:
+
+.. _api_fluid_layers_smooth_l1:
+
+smooth_l1
+---------
+
+..  autofunction:: paddle.fluid.layers.smooth_l1
+    :noindex:
+
+.. _api_fluid_layers_one_hot:
+
+one_hot
+-------
+
+..  autofunction:: paddle.fluid.layers.one_hot
+    :noindex:
+
+.. _api_fluid_layers_autoincreased_step_counter:
+
+autoincreased_step_counter
+--------------------------
+
+..  autofunction:: paddle.fluid.layers.autoincreased_step_counter
+    :noindex:
+
+.. _api_fluid_layers_reshape:
+
+reshape
+-------
+
+..  autofunction:: paddle.fluid.layers.reshape
+    :noindex:
+
+.. _api_fluid_layers_lod_reset:
+
+lod_reset
+---------
+
+..  autofunction:: paddle.fluid.layers.lod_reset
+    :noindex:
+
+.. _api_fluid_layers_lrn:
+
+lrn
+---
+
+..  autofunction:: paddle.fluid.layers.lrn
+    :noindex:
+
+.. _api_fluid_layers_pad:
+
+pad
+---
+
+..  autofunction:: paddle.fluid.layers.pad
+    :noindex:
+
+.. _api_fluid_layers_label_smooth:
+
+label_smooth
+------------
+
+..  autofunction:: paddle.fluid.layers.label_smooth
+    :noindex:
+
+.. _api_fluid_layers_roi_pool:
+
+roi_pool
+--------
+
+..  autofunction:: paddle.fluid.layers.roi_pool
+    :noindex:
+
+.. _api_fluid_layers_dice_loss:
+
+dice_loss
+---------
+
+..  autofunction:: paddle.fluid.layers.dice_loss
+    :noindex:
+
+.. _api_fluid_layers_image_resize:
+
+image_resize
+------------
+
+..  autofunction:: paddle.fluid.layers.image_resize
+    :noindex:
+
+.. _api_fluid_layers_image_resize_short:
+
+image_resize_short
+------------------
+
+..  autofunction:: paddle.fluid.layers.image_resize_short
+    :noindex:
+
+.. _api_fluid_layers_resize_bilinear:
+
+resize_bilinear
+---------------
+
+..  autofunction:: paddle.fluid.layers.resize_bilinear
+    :noindex:
+
+.. _api_fluid_layers_gather:
+
+gather
+------
+
+..  autofunction:: paddle.fluid.layers.gather
+    :noindex:
+
+.. _api_fluid_layers_random_crop:
+
+random_crop
+-----------
+
+..  autofunction:: paddle.fluid.layers.random_crop
+    :noindex:
+
+.. _api_fluid_layers_mean_iou:
+
+mean_iou
+--------
+
+..  autofunction:: paddle.fluid.layers.mean_iou
+    :noindex:
+
+.. _api_fluid_layers_relu:
+
+relu
+----
+
+..  autofunction:: paddle.fluid.layers.relu
+    :noindex:
+
+.. _api_fluid_layers_log:
+
+log
+---
+
+..  autofunction:: paddle.fluid.layers.log
+    :noindex:
+
+.. _api_fluid_layers_crop:
+
+crop
+----
+
+..  autofunction:: paddle.fluid.layers.crop
+    :noindex:
+
+ops
+===
+
+.. _api_fluid_layers_mean:
+
+mean
+----
+
+..  autofunction:: paddle.fluid.layers.mean
+    :noindex:
+
+.. _api_fluid_layers_mul:
+
+mul
+---
+
+..  autofunction:: paddle.fluid.layers.mul
+    :noindex:
+
+.. _api_fluid_layers_scale:
+
+scale
+-----
+
+..  autofunction:: paddle.fluid.layers.scale
+    :noindex:
+
+.. _api_fluid_layers_sigmoid_cross_entropy_with_logits:
+
+sigmoid_cross_entropy_with_logits
+---------------------------------
+
+..  autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
+    :noindex:
+
+.. _api_fluid_layers_elementwise_add:
+
+elementwise_add
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_add
+    :noindex:
+
+.. _api_fluid_layers_elementwise_div:
+
+elementwise_div
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_div
+    :noindex:
+
+.. _api_fluid_layers_elementwise_sub:
+
+elementwise_sub
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_sub
+    :noindex:
+
+.. _api_fluid_layers_elementwise_mul:
+
+elementwise_mul
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_mul
+    :noindex:
+
+.. _api_fluid_layers_elementwise_max:
+
+elementwise_max
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_max
+    :noindex:
+
+.. _api_fluid_layers_elementwise_min:
+
+elementwise_min
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_min
+    :noindex:
+
+.. _api_fluid_layers_elementwise_pow:
+
+elementwise_pow
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_pow
+    :noindex:
+
+.. _api_fluid_layers_clip:
+
+clip
+----
+
+..  autofunction:: paddle.fluid.layers.clip
+    :noindex:
+
+.. _api_fluid_layers_clip_by_norm:
+
+clip_by_norm
+------------
+
+..  autofunction:: paddle.fluid.layers.clip_by_norm
+    :noindex:
+
+.. _api_fluid_layers_logical_and:
+
+logical_and
+-----------
+
+..  autofunction:: paddle.fluid.layers.logical_and
+    :noindex:
+
+.. _api_fluid_layers_logical_or:
+
+logical_or
+----------
+
+..  autofunction:: paddle.fluid.layers.logical_or
+    :noindex:
+
+.. _api_fluid_layers_logical_xor:
+
+logical_xor
+-----------
+
+..  autofunction:: paddle.fluid.layers.logical_xor
+    :noindex:
+
+.. _api_fluid_layers_logical_not:
+
+logical_not
+-----------
+
+..  autofunction:: paddle.fluid.layers.logical_not
+    :noindex:
+
+.. _api_fluid_layers_uniform_random_batch_size_like:
+
+uniform_random_batch_size_like
+------------------------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random_batch_size_like
+    :noindex:
+
+.. _api_fluid_layers_gaussian_random:
+
+gaussian_random
+---------------
+
+..  autofunction:: paddle.fluid.layers.gaussian_random
+    :noindex:
+
+.. _api_fluid_layers_gaussian_random_batch_size_like:
+
+gaussian_random_batch_size_like
+-------------------------------
+
+..  autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
+    :noindex:
+
+.. _api_fluid_layers_scatter:
+
+scatter
+-------
+
+..  autofunction:: paddle.fluid.layers.scatter
+    :noindex:
+
+.. _api_fluid_layers_sum:
+
+sum
+---
+
+..  autofunction:: paddle.fluid.layers.sum
+    :noindex:
+
+.. _api_fluid_layers_slice:
+
+slice
+-----
+
+..  autofunction:: paddle.fluid.layers.slice
+    :noindex:
+
+.. _api_fluid_layers_polygon_box_transform:
+
+polygon_box_transform
+---------------------
+
+..  autofunction:: paddle.fluid.layers.polygon_box_transform
+    :noindex:
+
+.. _api_fluid_layers_shape:
+
+shape
+-----
+
+..  autofunction:: paddle.fluid.layers.shape
+    :noindex:
+
+.. _api_fluid_layers_iou_similarity:
+
+iou_similarity
+--------------
+
+..  autofunction:: paddle.fluid.layers.iou_similarity
+    :noindex:
+
+.. _api_fluid_layers_maxout:
+
+maxout
+------
+
+..  autofunction:: paddle.fluid.layers.maxout
+    :noindex:
+
+.. _api_fluid_layers_sigmoid:
+
+sigmoid
+-------
+
+..  autofunction:: paddle.fluid.layers.sigmoid
+    :noindex:
+
+.. _api_fluid_layers_logsigmoid:
+
+logsigmoid
+----------
+
+..  autofunction:: paddle.fluid.layers.logsigmoid
+    :noindex:
+
+.. _api_fluid_layers_exp:
+
+exp
+---
+
+..  autofunction:: paddle.fluid.layers.exp
+    :noindex:
+
+.. _api_fluid_layers_tanh:
+
+tanh
+----
+
+..  autofunction:: paddle.fluid.layers.tanh
+    :noindex:
+
+.. _api_fluid_layers_tanh_shrink:
+
+tanh_shrink
+-----------
+
+..  autofunction:: paddle.fluid.layers.tanh_shrink
+    :noindex:
+
+.. _api_fluid_layers_softshrink:
+
+softshrink
+----------
+
+..  autofunction:: paddle.fluid.layers.softshrink
+    :noindex:
+
+.. _api_fluid_layers_sqrt:
+
+sqrt
+----
+
+..  autofunction:: paddle.fluid.layers.sqrt
+    :noindex:
+
+.. _api_fluid_layers_abs:
+
+abs
+---
+
+..  autofunction:: paddle.fluid.layers.abs
+    :noindex:
+
+.. _api_fluid_layers_ceil:
+
+ceil
+----
+
+..  autofunction:: paddle.fluid.layers.ceil
+    :noindex:
+
+.. _api_fluid_layers_floor:
+
+floor
+-----
+
+..  autofunction:: paddle.fluid.layers.floor
+    :noindex:
+
+.. _api_fluid_layers_cos:
+
+cos
+---
+
+..  autofunction:: paddle.fluid.layers.cos
+    :noindex:
+
+.. _api_fluid_layers_sin:
+
+sin
+---
+
+..  autofunction:: paddle.fluid.layers.sin
+    :noindex:
+
+.. _api_fluid_layers_round:
+
+round
+-----
+
+..  autofunction:: paddle.fluid.layers.round
+    :noindex:
+
+.. _api_fluid_layers_reciprocal:
+
+reciprocal
+----------
+
+..  autofunction:: paddle.fluid.layers.reciprocal
+    :noindex:
+
+.. _api_fluid_layers_square:
+
+square
+------
+
+..  autofunction:: paddle.fluid.layers.square
+    :noindex:
+
+.. _api_fluid_layers_softplus:
+
+softplus
+--------
+
+..  autofunction:: paddle.fluid.layers.softplus
+    :noindex:
+
+.. _api_fluid_layers_softsign:
+
+softsign
+--------
+
+..  autofunction:: paddle.fluid.layers.softsign
+    :noindex:
+
+.. _api_fluid_layers_brelu:
+
+brelu
+-----
+
+..  autofunction:: paddle.fluid.layers.brelu
+    :noindex:
+
+.. _api_fluid_layers_leaky_relu:
+
+leaky_relu
+----------
+
+..  autofunction:: paddle.fluid.layers.leaky_relu
+    :noindex:
+
+.. _api_fluid_layers_soft_relu:
+
+soft_relu
+---------
+
+..  autofunction:: paddle.fluid.layers.soft_relu
+    :noindex:
+
+.. _api_fluid_layers_elu:
+
+elu
+---
+
+..  autofunction:: paddle.fluid.layers.elu
+    :noindex:
+
+.. _api_fluid_layers_relu6:
+
+relu6
+-----
+
+..  autofunction:: paddle.fluid.layers.relu6
+    :noindex:
+
+.. _api_fluid_layers_pow:
+
+pow
+---
+
+..  autofunction:: paddle.fluid.layers.pow
+    :noindex:
+
+.. _api_fluid_layers_stanh:
+
+stanh
+-----
+
+..  autofunction:: paddle.fluid.layers.stanh
+    :noindex:
+
+.. _api_fluid_layers_hard_sigmoid:
+
+hard_sigmoid
+------------
+
+..  autofunction:: paddle.fluid.layers.hard_sigmoid
+    :noindex:
+
+.. _api_fluid_layers_swish:
+
+swish
+-----
+
+..  autofunction:: paddle.fluid.layers.swish
+    :noindex:
+
+.. _api_fluid_layers_uniform_random:
+
+uniform_random
+--------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random
+    :noindex:
+
+.. _api_fluid_layers_hard_shrink:
+
+hard_shrink
+-----------
+
+..  autofunction:: paddle.fluid.layers.hard_shrink
+    :noindex:
+
+.. _api_fluid_layers_cumsum:
+
+cumsum
+------
+
+..  autofunction:: paddle.fluid.layers.cumsum
+    :noindex:
+
+.. _api_fluid_layers_thresholded_relu:
+
+thresholded_relu
+----------------
+
+..  autofunction:: paddle.fluid.layers.thresholded_relu
+    :noindex:
+
+tensor
+======
+
+.. _api_fluid_layers_create_tensor:
+
+create_tensor
+-------------
+
+..  autofunction:: paddle.fluid.layers.create_tensor
+    :noindex:
+
+.. _api_fluid_layers_create_parameter:
+
+create_parameter
+----------------
+
+..  autofunction:: paddle.fluid.layers.create_parameter
+    :noindex:
+
+.. _api_fluid_layers_create_global_var:
+
+create_global_var
+-----------------
+
+..  autofunction:: paddle.fluid.layers.create_global_var
+    :noindex:
+
+.. _api_fluid_layers_cast:
+
+cast
+----
+
+..  autofunction:: paddle.fluid.layers.cast
+    :noindex:
+
+.. _api_fluid_layers_concat:
+
+concat
+------
+
+..  autofunction:: paddle.fluid.layers.concat
+    :noindex:
+
+.. _api_fluid_layers_sums:
+
+sums
+----
+
+..  autofunction:: paddle.fluid.layers.sums
+    :noindex:
+
+.. _api_fluid_layers_assign:
+
+assign
+------
+
+..  autofunction:: paddle.fluid.layers.assign
+    :noindex:
+
+.. _api_fluid_layers_fill_constant_batch_size_like:
+
+fill_constant_batch_size_like
+-----------------------------
+
+..  autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
+    :noindex:
+
+.. _api_fluid_layers_fill_constant:
+
+fill_constant
+-------------
+
+..  autofunction:: paddle.fluid.layers.fill_constant
+    :noindex:
+
+.. _api_fluid_layers_argmin:
+
+argmin
+------
+
+..  autofunction:: paddle.fluid.layers.argmin
+    :noindex:
+
+.. _api_fluid_layers_argmax:
+
+argmax
+------
+
+..  autofunction:: paddle.fluid.layers.argmax
+    :noindex:
+
+.. _api_fluid_layers_argsort:
+
+argsort
+-------
+
+..  autofunction:: paddle.fluid.layers.argsort
+    :noindex:
+
+.. _api_fluid_layers_ones:
+
+ones
+----
+
+..  autofunction:: paddle.fluid.layers.ones
+    :noindex:
+
+.. _api_fluid_layers_zeros:
+
+zeros
+-----
+
+..  autofunction:: paddle.fluid.layers.zeros
+    :noindex:
+
+.. _api_fluid_layers_reverse:
+
+reverse
+-------
+
+..  autofunction:: paddle.fluid.layers.reverse
+    :noindex:
+
+learning_rate_scheduler
+=======================
+
+.. _api_fluid_layers_exponential_decay:
+
+exponential_decay
+-----------------
+
+..  autofunction:: paddle.fluid.layers.exponential_decay
+    :noindex:
+
+.. _api_fluid_layers_natural_exp_decay:
+
+natural_exp_decay
+-----------------
+
+..  autofunction:: paddle.fluid.layers.natural_exp_decay
+    :noindex:
+
+.. _api_fluid_layers_inverse_time_decay:
+
+inverse_time_decay
+------------------
+
+..  autofunction:: paddle.fluid.layers.inverse_time_decay
+    :noindex:
+
+.. _api_fluid_layers_polynomial_decay:
+
+polynomial_decay
+----------------
+
+..  autofunction:: paddle.fluid.layers.polynomial_decay
+    :noindex:
+
+.. _api_fluid_layers_piecewise_decay:
+
+piecewise_decay
+---------------
+
+..  autofunction:: paddle.fluid.layers.piecewise_decay
+    :noindex:
+
+.. _api_fluid_layers_noam_decay:
+
+noam_decay
+----------
+
+..  autofunction:: paddle.fluid.layers.noam_decay
+    :noindex:
+
+.. _api_fluid_layers_append_LARS:
+
+append_LARS
+-----------
+
+..  autofunction:: paddle.fluid.layers.append_LARS
+    :noindex:
+
+detection
+=========
+
+.. _api_fluid_layers_prior_box:
+
+prior_box
+---------
+
+..  autofunction:: paddle.fluid.layers.prior_box
+    :noindex:
+
+.. _api_fluid_layers_multi_box_head:
+
+multi_box_head
+--------------
+
+..  autofunction:: paddle.fluid.layers.multi_box_head
+    :noindex:
+
+.. _api_fluid_layers_bipartite_match:
+
+bipartite_match
+---------------
+
+..  autofunction:: paddle.fluid.layers.bipartite_match
+    :noindex:
+
+.. _api_fluid_layers_target_assign:
+
+target_assign
+-------------
+
+..  autofunction:: paddle.fluid.layers.target_assign
+    :noindex:
+
+.. _api_fluid_layers_detection_output:
+
+detection_output
+----------------
+
+..  autofunction:: paddle.fluid.layers.detection_output
+    :noindex:
+
+.. _api_fluid_layers_ssd_loss:
+
+ssd_loss
+--------
+
+..  autofunction:: paddle.fluid.layers.ssd_loss
+    :noindex:
+
+.. _api_fluid_layers_detection_map:
+
+detection_map
+-------------
+
+..  autofunction:: paddle.fluid.layers.detection_map
+    :noindex:
+
+.. _api_fluid_layers_iou_similarity:
+
+iou_similarity
+--------------
+
+..  autofunction:: paddle.fluid.layers.iou_similarity
+    :noindex:
+
+.. _api_fluid_layers_box_coder:
+
+box_coder
+---------
+
+..  autofunction:: paddle.fluid.layers.box_coder
+    :noindex:
+
+metric_op
+=========
+
+.. _api_fluid_layers_accuracy:
+
+accuracy
+--------
+
+..  autofunction:: paddle.fluid.layers.accuracy
+    :noindex:
+
+.. _api_fluid_layers_auc:
+
+auc
+---
+
+..  autofunction:: paddle.fluid.layers.auc
+    :noindex:
+
+tensor
+======
+
+.. _api_fluid_layers_create_tensor:
+
+create_tensor
+-------------
+
+..  autofunction:: paddle.fluid.layers.create_tensor
+    :noindex:
+
+.. _api_fluid_layers_create_parameter:
+
+create_parameter
+----------------
+
+..  autofunction:: paddle.fluid.layers.create_parameter
+    :noindex:
+
+.. _api_fluid_layers_create_global_var:
+
+create_global_var
+-----------------
+
+..  autofunction:: paddle.fluid.layers.create_global_var
+    :noindex:
+
+.. _api_fluid_layers_cast:
+
+cast
+----
+
+..  autofunction:: paddle.fluid.layers.cast
+    :noindex:
+
+.. _api_fluid_layers_concat:
+
+concat
+------
+
+..  autofunction:: paddle.fluid.layers.concat
+    :noindex:
+
+.. _api_fluid_layers_sums:
+
+sums
+----
+
+..  autofunction:: paddle.fluid.layers.sums
+    :noindex:
+
+.. _api_fluid_layers_assign:
+
+assign
+------
+
+..  autofunction:: paddle.fluid.layers.assign
+    :noindex:
+
+.. _api_fluid_layers_fill_constant_batch_size_like:
+
+fill_constant_batch_size_like
+-----------------------------
+
+..  autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
+    :noindex:
+
+.. _api_fluid_layers_fill_constant:
+
+fill_constant
+-------------
+
+..  autofunction:: paddle.fluid.layers.fill_constant
+    :noindex:
+
+.. _api_fluid_layers_argmin:
+
+argmin
+------
+
+..  autofunction:: paddle.fluid.layers.argmin
+    :noindex:
+
+.. _api_fluid_layers_argmax:
+
+argmax
+------
+
+..  autofunction:: paddle.fluid.layers.argmax
+    :noindex:
+
+.. _api_fluid_layers_ones:
+
+ones
+----
+
+..  autofunction:: paddle.fluid.layers.ones
+    :noindex:
+
+.. _api_fluid_layers_zeros:
+
+zeros
+-----
+
+..  autofunction:: paddle.fluid.layers.zeros
+    :noindex:
+
+.. _api_fluid_layers_reverse:
+
+reverse
+-------
+
+..  autofunction:: paddle.fluid.layers.reverse
+    :noindex:
+
+.. _api_fluid_layers_rank_loss:
+
+rank_loss
+---------
+
+..  autofunction:: paddle.fluid.layers.rank_loss
+    :noindex:
+
diff --git a/doc/fluid/api/metrics.rst b/doc/fluid/api/metrics.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0f54b2e2eb7ead353215c5dbd529293794e37123
--- /dev/null
+++ b/doc/fluid/api/metrics.rst
@@ -0,0 +1,88 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=============
+fluid.metrics
+=============
+
+.. _api_fluid_metrics_MetricBase:
+
+MetricBase
+----------
+
+..  autoclass:: paddle.fluid.metrics.MetricBase
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_CompositeMetric:
+
+CompositeMetric
+---------------
+
+..  autoclass:: paddle.fluid.metrics.CompositeMetric
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_Precision:
+
+Precision
+---------
+
+..  autoclass:: paddle.fluid.metrics.Precision
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_Recall:
+
+Recall
+------
+
+..  autoclass:: paddle.fluid.metrics.Recall
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_Accuracy:
+
+Accuracy
+--------
+
+..  autoclass:: paddle.fluid.metrics.Accuracy
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_ChunkEvaluator:
+
+ChunkEvaluator
+--------------
+
+..  autoclass:: paddle.fluid.metrics.ChunkEvaluator
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_EditDistance:
+
+EditDistance
+------------
+
+..  autoclass:: paddle.fluid.metrics.EditDistance
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_DetectionMAP:
+
+DetectionMAP
+------------
+
+..  autoclass:: paddle.fluid.metrics.DetectionMAP
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_Auc:
+
+Auc
+---
+
+..  autoclass:: paddle.fluid.metrics.Auc
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/nets.rst b/doc/fluid/api/nets.rst
new file mode 100644
index 0000000000000000000000000000000000000000..059733af18517257b6821d95fd628a9e13e6e98e
--- /dev/null
+++ b/doc/fluid/api/nets.rst
@@ -0,0 +1,39 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+fluid.nets
+==========
+
+.. _api_fluid_nets_simple_img_conv_pool:
+
+simple_img_conv_pool
+--------------------
+
+..  autofunction:: paddle.fluid.nets.simple_img_conv_pool
+    :noindex:
+
+.. _api_fluid_nets_sequence_conv_pool:
+
+sequence_conv_pool
+------------------
+
+..  autofunction:: paddle.fluid.nets.sequence_conv_pool
+    :noindex:
+
+.. _api_fluid_nets_glu:
+
+glu
+---
+
+..  autofunction:: paddle.fluid.nets.glu
+    :noindex:
+
+.. _api_fluid_nets_scaled_dot_product_attention:
+
+scaled_dot_product_attention
+----------------------------
+
+..  autofunction:: paddle.fluid.nets.scaled_dot_product_attention
+    :noindex:
+
diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8d792120f2f16a8c92606b343eb4c3d4368bed14
--- /dev/null
+++ b/doc/fluid/api/optimizer.rst
@@ -0,0 +1,178 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+===============
+fluid.optimizer
+===============
+
+.. _api_fluid_optimizer_SGD:
+
+SGD
+---
+
+..  autoclass:: paddle.fluid.optimizer.SGD
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Momentum:
+
+Momentum
+--------
+
+..  autoclass:: paddle.fluid.optimizer.Momentum
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Adagrad:
+
+Adagrad
+-------
+
+..  autoclass:: paddle.fluid.optimizer.Adagrad
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Adam:
+
+Adam
+----
+
+..  autoclass:: paddle.fluid.optimizer.Adam
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Adamax:
+
+Adamax
+------
+
+..  autoclass:: paddle.fluid.optimizer.Adamax
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_DecayedAdagrad:
+
+DecayedAdagrad
+--------------
+
+..  autoclass:: paddle.fluid.optimizer.DecayedAdagrad
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Ftrl:
+
+Ftrl
+----
+
+..  autoclass:: paddle.fluid.optimizer.Ftrl
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_SGDOptimizer:
+
+SGDOptimizer
+------------
+
+..  autoclass:: paddle.fluid.optimizer.SGDOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_MomentumOptimizer:
+
+MomentumOptimizer
+-----------------
+
+..  autoclass:: paddle.fluid.optimizer.MomentumOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_AdagradOptimizer:
+
+AdagradOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.AdagradOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_AdamOptimizer:
+
+AdamOptimizer
+-------------
+
+..  autoclass:: paddle.fluid.optimizer.AdamOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_AdamaxOptimizer:
+
+AdamaxOptimizer
+---------------
+
+..  autoclass:: paddle.fluid.optimizer.AdamaxOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_DecayedAdagradOptimizer:
+
+DecayedAdagradOptimizer
+-----------------------
+
+..  autoclass:: paddle.fluid.optimizer.DecayedAdagradOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_RMSPropOptimizer:
+
+RMSPropOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_FtrlOptimizer:
+
+FtrlOptimizer
+-------------
+
+..  autoclass:: paddle.fluid.optimizer.FtrlOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Adadelta:
+
+Adadelta
+--------
+
+..  autoclass:: paddle.fluid.optimizer.Adadelta
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_ModelAverage:
+
+ModelAverage
+------------
+
+..  autoclass:: paddle.fluid.optimizer.ModelAverage
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Optimizer:
+
+Optimizer
+---------
+
+..  autoclass:: paddle.fluid.optimizer.Optimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_RMSPropOptimizer:
+
+RMSPropOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/param_attr.rst b/doc/fluid/api/param_attr.rst
new file mode 100644
index 0000000000000000000000000000000000000000..33035bbc7ca5c8d000adeaf1cb79806a3ea64604
--- /dev/null
+++ b/doc/fluid/api/param_attr.rst
@@ -0,0 +1,25 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+================
+fluid.param_attr
+================
+
+.. _api_fluid_param_attr_ParamAttr:
+
+ParamAttr
+---------
+
+..  autoclass:: paddle.fluid.param_attr.ParamAttr
+    :members:
+    :noindex:
+
+.. _api_fluid_param_attr_WeightNormParamAttr:
+
+WeightNormParamAttr
+-------------------
+
+..  autoclass:: paddle.fluid.param_attr.WeightNormParamAttr
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c750a2d588df56728ac7f73051ab7a9e44dee232
--- /dev/null
+++ b/doc/fluid/api/profiler.rst
@@ -0,0 +1,47 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==============
+fluid.profiler
+==============
+
+.. _api_fluid_profiler_cuda_profiler:
+
+cuda_profiler
+-------------
+
+..  autofunction:: paddle.fluid.profiler.cuda_profiler
+    :noindex:
+
+.. _api_fluid_profiler_reset_profiler:
+
+reset_profiler
+--------------
+
+..  autofunction:: paddle.fluid.profiler.reset_profiler
+    :noindex:
+
+.. _api_fluid_profiler_profiler:
+
+profiler
+--------
+
+..  autofunction:: paddle.fluid.profiler.profiler
+    :noindex:
+
+.. _api_fluid_profiler_start_profiler:
+
+start_profiler
+--------------
+
+..  autofunction:: paddle.fluid.profiler.start_profiler
+    :noindex:
+
+.. _api_fluid_profiler_stop_profiler:
+
+stop_profiler
+-------------
+
+..  autofunction:: paddle.fluid.profiler.stop_profiler
+    :noindex:
+
diff --git a/doc/fluid/api/recordio_writer.rst b/doc/fluid/api/recordio_writer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f0c12fd115478a29fbd178b533b7490b2f663717
--- /dev/null
+++ b/doc/fluid/api/recordio_writer.rst
@@ -0,0 +1,23 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=====================
+fluid.recordio_writer
+=====================
+
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_file:
+
+convert_reader_to_recordio_file
+-------------------------------
+
+..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
+    :noindex:
+
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_files:
+
+convert_reader_to_recordio_files
+--------------------------------
+
+..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
+    :noindex:
+
diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..987eaea903520d91c284c8da7a8cb066a1648069
--- /dev/null
+++ b/doc/fluid/api/regularizer.rst
@@ -0,0 +1,51 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=================
+fluid.regularizer
+=================
+
+.. _api_fluid_regularizer_append_regularization_ops:
+
+append_regularization_ops
+-------------------------
+
+..  autofunction:: paddle.fluid.regularizer.append_regularization_ops
+    :noindex:
+
+.. _api_fluid_regularizer_L1Decay:
+
+L1Decay
+-------
+
+..  autoclass:: paddle.fluid.regularizer.L1Decay
+    :members:
+    :noindex:
+
+.. _api_fluid_regularizer_L2Decay:
+
+L2Decay
+-------
+
+..  autoclass:: paddle.fluid.regularizer.L2Decay
+    :members:
+    :noindex:
+
+.. _api_fluid_regularizer_L1DecayRegularizer:
+
+L1DecayRegularizer
+------------------
+
+..  autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
+    :members:
+    :noindex:
+
+.. _api_fluid_regularizer_L2DecayRegularizer:
+
+L2DecayRegularizer
+------------------
+
+..  autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d2ac04f1449c32cb414cea1b76d7469bbe9ccb85
--- /dev/null
+++ b/doc/fluid/api/transpiler.rst
@@ -0,0 +1,59 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+================
+fluid.transpiler
+================
+
+.. _api_fluid_transpiler_DistributeTranspiler:
+
+DistributeTranspiler
+--------------------
+
+..  autoclass:: paddle.fluid.transpiler.DistributeTranspiler
+    :members:
+    :noindex:
+
+.. _api_fluid_transpiler_InferenceTranspiler:
+
+InferenceTranspiler
+-------------------
+
+..  autoclass:: paddle.fluid.transpiler.InferenceTranspiler
+    :members:
+    :noindex:
+
+.. _api_fluid_transpiler_memory_optimize:
+
+memory_optimize
+---------------
+
+..  autofunction:: paddle.fluid.transpiler.memory_optimize
+    :noindex:
+
+.. _api_fluid_transpiler_release_memory:
+
+release_memory
+--------------
+
+..  autofunction:: paddle.fluid.transpiler.release_memory
+    :noindex:
+
+.. _api_fluid_transpiler_HashName:
+
+HashName
+--------
+
+..  autoclass:: paddle.fluid.transpiler.HashName
+    :members:
+    :noindex:
+
+.. _api_fluid_transpiler_RoundRobin:
+
+RoundRobin
+----------
+
+..  autoclass:: paddle.fluid.transpiler.RoundRobin
+    :members:
+    :noindex:
+
diff --git a/source/beginners_guide/basics/image_classification/.gitignore b/doc/fluid/beginners_guide/basics/image_classification/.gitignore
similarity index 100%
rename from source/beginners_guide/basics/image_classification/.gitignore
rename to doc/fluid/beginners_guide/basics/image_classification/.gitignore
diff --git a/source/beginners_guide/basics/image_classification/image/dog.png b/doc/fluid/beginners_guide/basics/image_classification/image/dog.png
similarity index 100%
rename from source/beginners_guide/basics/image_classification/image/dog.png
rename to doc/fluid/beginners_guide/basics/image_classification/image/dog.png
diff --git a/source/beginners_guide/basics/image_classification/image/dog_cat.png b/doc/fluid/beginners_guide/basics/image_classification/image/dog_cat.png
similarity index 100%
rename from source/beginners_guide/basics/image_classification/image/dog_cat.png
rename to doc/fluid/beginners_guide/basics/image_classification/image/dog_cat.png
diff --git a/source/beginners_guide/basics/image_classification/image/fea_conv0.png b/doc/fluid/beginners_guide/basics/image_classification/image/fea_conv0.png
similarity index 100%
rename from source/beginners_guide/basics/image_classification/image/fea_conv0.png
rename to doc/fluid/beginners_guide/basics/image_classification/image/fea_conv0.png
diff --git a/source/beginners_guide/basics/image_classification/image/flowers.png b/doc/fluid/beginners_guide/basics/image_classification/image/flowers.png
similarity index 100%
rename from source/beginners_guide/basics/image_classification/image/flowers.png
rename to doc/fluid/beginners_guide/basics/image_classification/image/flowers.png
diff --git a/source/beginners_guide/basics/image_classification/image/googlenet.jpeg b/doc/fluid/beginners_guide/basics/image_classification/image/googlenet.jpeg
similarity index 100%
rename from source/beginners_guide/basics/image_classification/image/googlenet.jpeg
rename to doc/fluid/beginners_guide/basics/image_classification/image/googlenet.jpeg
diff --git a/source/beginners_guide/basics/image_classification/image/ilsvrc.png b/doc/fluid/beginners_guide/basics/image_classification/image/ilsvrc.png
similarity index 100%
rename from source/beginners_guide/basics/image_classification/image/ilsvrc.png
rename to doc/fluid/beginners_guide/basics/image_classification/image/ilsvrc.png
diff --git a/source/beginners_guide/basics/image_classification/image/inception.png b/doc/fluid/beginners_guide/basics/image_classification/image/inception.png
similarity index 100%
rename from source/beginners_guide/basics/image_classification/image/inception.png
rename to doc/fluid/beginners_guide/basics/image_classification/image/inception.png
diff --git a/source/beginners_guide/basics/image_classification/image/lenet.png b/doc/fluid/beginners_guide/basics/image_classification/image/lenet.png
similarity index 100%
rename from source/beginners_guide/basics/image_classification/image/lenet.png
rename to doc/fluid/beginners_guide/basics/image_classification/image/lenet.png
diff --git a/source/beginners_guide/basics/image_classification/image/plot.png b/doc/fluid/beginners_guide/basics/image_classification/image/plot.png
similarity index 100%
rename from source/beginners_guide/basics/image_classification/image/plot.png
rename to doc/fluid/beginners_guide/basics/image_classification/image/plot.png
diff --git a/source/beginners_guide/basics/image_classification/image/resnet.png b/doc/fluid/beginners_guide/basics/image_classification/image/resnet.png
similarity index 100%
rename from source/beginners_guide/basics/image_classification/image/resnet.png
rename to doc/fluid/beginners_guide/basics/image_classification/image/resnet.png
diff --git a/source/beginners_guide/basics/image_classification/image/resnet_block.jpg b/doc/fluid/beginners_guide/basics/image_classification/image/resnet_block.jpg
similarity index 100%
rename from source/beginners_guide/basics/image_classification/image/resnet_block.jpg
rename to doc/fluid/beginners_guide/basics/image_classification/image/resnet_block.jpg
diff --git a/source/beginners_guide/basics/image_classification/image/train_and_test.png b/doc/fluid/beginners_guide/basics/image_classification/image/train_and_test.png
similarity index 100%
rename from source/beginners_guide/basics/image_classification/image/train_and_test.png
rename to doc/fluid/beginners_guide/basics/image_classification/image/train_and_test.png
diff --git a/source/beginners_guide/basics/image_classification/image/vgg16.png b/doc/fluid/beginners_guide/basics/image_classification/image/vgg16.png
similarity index 100%
rename from source/beginners_guide/basics/image_classification/image/vgg16.png
rename to doc/fluid/beginners_guide/basics/image_classification/image/vgg16.png
diff --git a/doc/fluid/beginners_guide/basics/image_classification/index.md b/doc/fluid/beginners_guide/basics/image_classification/index.md
new file mode 120000
index 0000000000000000000000000000000000000000..18ab749ec38e835f14299c09c03192919bda41bb
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/image_classification/index.md
@@ -0,0 +1 @@
+../../../../../external/book/03.image_classification/README.cn.md
\ No newline at end of file
diff --git a/source/beginners_guide/basics/index.rst b/doc/fluid/beginners_guide/basics/index.rst
similarity index 100%
rename from source/beginners_guide/basics/index.rst
rename to doc/fluid/beginners_guide/basics/index.rst
diff --git a/source/beginners_guide/basics/label_semantic_roles/.gitignore b/doc/fluid/beginners_guide/basics/label_semantic_roles/.gitignore
similarity index 100%
rename from source/beginners_guide/basics/label_semantic_roles/.gitignore
rename to doc/fluid/beginners_guide/basics/label_semantic_roles/.gitignore
diff --git a/source/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png b/doc/fluid/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png
similarity index 100%
rename from source/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png
rename to doc/fluid/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png
diff --git a/source/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png b/doc/fluid/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png
similarity index 100%
rename from source/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png
rename to doc/fluid/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png
diff --git a/source/beginners_guide/basics/label_semantic_roles/image/bio_example.png b/doc/fluid/beginners_guide/basics/label_semantic_roles/image/bio_example.png
similarity index 100%
rename from source/beginners_guide/basics/label_semantic_roles/image/bio_example.png
rename to doc/fluid/beginners_guide/basics/label_semantic_roles/image/bio_example.png
diff --git a/source/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png b/doc/fluid/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png
similarity index 100%
rename from source/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png
rename to doc/fluid/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png
diff --git a/source/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png b/doc/fluid/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png
similarity index 100%
rename from source/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png
rename to doc/fluid/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png
diff --git a/source/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png b/doc/fluid/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png
similarity index 100%
rename from source/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png
rename to doc/fluid/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png
diff --git a/source/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png b/doc/fluid/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png
similarity index 100%
rename from source/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png
rename to doc/fluid/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png
diff --git a/source/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png b/doc/fluid/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png
similarity index 100%
rename from source/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png
rename to doc/fluid/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png
diff --git a/source/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png b/doc/fluid/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png
similarity index 100%
rename from source/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png
rename to doc/fluid/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png
diff --git a/source/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png b/doc/fluid/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png
similarity index 100%
rename from source/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png
rename to doc/fluid/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png
diff --git a/source/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png b/doc/fluid/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png
similarity index 100%
rename from source/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png
rename to doc/fluid/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png
diff --git a/doc/fluid/beginners_guide/basics/label_semantic_roles/index.md b/doc/fluid/beginners_guide/basics/label_semantic_roles/index.md
new file mode 120000
index 0000000000000000000000000000000000000000..8e482e13129cade3153b79fc4c334a8bff858af5
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/label_semantic_roles/index.md
@@ -0,0 +1 @@
+../../../../../external/book/07.label_semantic_roles/README.cn.md
\ No newline at end of file
diff --git a/source/beginners_guide/basics/learning_materials.md b/doc/fluid/beginners_guide/basics/learning_materials.md
similarity index 100%
rename from source/beginners_guide/basics/learning_materials.md
rename to doc/fluid/beginners_guide/basics/learning_materials.md
diff --git a/source/beginners_guide/basics/machine_translation/.gitignore b/doc/fluid/beginners_guide/basics/machine_translation/.gitignore
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/.gitignore
rename to doc/fluid/beginners_guide/basics/machine_translation/.gitignore
diff --git a/source/beginners_guide/basics/machine_translation/image/bi_rnn.png b/doc/fluid/beginners_guide/basics/machine_translation/image/bi_rnn.png
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/image/bi_rnn.png
rename to doc/fluid/beginners_guide/basics/machine_translation/image/bi_rnn.png
diff --git a/source/beginners_guide/basics/machine_translation/image/bi_rnn_en.png b/doc/fluid/beginners_guide/basics/machine_translation/image/bi_rnn_en.png
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/image/bi_rnn_en.png
rename to doc/fluid/beginners_guide/basics/machine_translation/image/bi_rnn_en.png
diff --git a/source/beginners_guide/basics/machine_translation/image/decoder_attention.png b/doc/fluid/beginners_guide/basics/machine_translation/image/decoder_attention.png
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/image/decoder_attention.png
rename to doc/fluid/beginners_guide/basics/machine_translation/image/decoder_attention.png
diff --git a/source/beginners_guide/basics/machine_translation/image/decoder_attention_en.png b/doc/fluid/beginners_guide/basics/machine_translation/image/decoder_attention_en.png
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/image/decoder_attention_en.png
rename to doc/fluid/beginners_guide/basics/machine_translation/image/decoder_attention_en.png
diff --git a/source/beginners_guide/basics/machine_translation/image/encoder_attention.png b/doc/fluid/beginners_guide/basics/machine_translation/image/encoder_attention.png
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/image/encoder_attention.png
rename to doc/fluid/beginners_guide/basics/machine_translation/image/encoder_attention.png
diff --git a/source/beginners_guide/basics/machine_translation/image/encoder_attention_en.png b/doc/fluid/beginners_guide/basics/machine_translation/image/encoder_attention_en.png
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/image/encoder_attention_en.png
rename to doc/fluid/beginners_guide/basics/machine_translation/image/encoder_attention_en.png
diff --git a/source/beginners_guide/basics/machine_translation/image/encoder_decoder.png b/doc/fluid/beginners_guide/basics/machine_translation/image/encoder_decoder.png
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/image/encoder_decoder.png
rename to doc/fluid/beginners_guide/basics/machine_translation/image/encoder_decoder.png
diff --git a/source/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png b/doc/fluid/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png
rename to doc/fluid/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png
diff --git a/source/beginners_guide/basics/machine_translation/image/gru.png b/doc/fluid/beginners_guide/basics/machine_translation/image/gru.png
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/image/gru.png
rename to doc/fluid/beginners_guide/basics/machine_translation/image/gru.png
diff --git a/source/beginners_guide/basics/machine_translation/image/gru_en.png b/doc/fluid/beginners_guide/basics/machine_translation/image/gru_en.png
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/image/gru_en.png
rename to doc/fluid/beginners_guide/basics/machine_translation/image/gru_en.png
diff --git a/source/beginners_guide/basics/machine_translation/image/nmt.png b/doc/fluid/beginners_guide/basics/machine_translation/image/nmt.png
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/image/nmt.png
rename to doc/fluid/beginners_guide/basics/machine_translation/image/nmt.png
diff --git a/source/beginners_guide/basics/machine_translation/image/nmt_en.png b/doc/fluid/beginners_guide/basics/machine_translation/image/nmt_en.png
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/image/nmt_en.png
rename to doc/fluid/beginners_guide/basics/machine_translation/image/nmt_en.png
diff --git a/doc/fluid/beginners_guide/basics/machine_translation/index.md b/doc/fluid/beginners_guide/basics/machine_translation/index.md
new file mode 120000
index 0000000000000000000000000000000000000000..fad1225ac49b1084e9d9a6e8e1df9367053c346b
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/machine_translation/index.md
@@ -0,0 +1 @@
+../../../../../external/book/08.machine_translation/README.cn.md
\ No newline at end of file
diff --git a/source/beginners_guide/basics/recommender_system/.gitignore b/doc/fluid/beginners_guide/basics/recommender_system/.gitignore
similarity index 100%
rename from source/beginners_guide/basics/recommender_system/.gitignore
rename to doc/fluid/beginners_guide/basics/recommender_system/.gitignore
diff --git a/source/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png b/doc/fluid/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png
similarity index 100%
rename from source/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png
rename to doc/fluid/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png
diff --git a/source/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png b/doc/fluid/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png
similarity index 100%
rename from source/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png
rename to doc/fluid/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png
diff --git a/source/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png b/doc/fluid/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png
similarity index 100%
rename from source/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png
rename to doc/fluid/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png
diff --git a/source/beginners_guide/basics/recommender_system/image/YouTube_Overview.png b/doc/fluid/beginners_guide/basics/recommender_system/image/YouTube_Overview.png
similarity index 100%
rename from source/beginners_guide/basics/recommender_system/image/YouTube_Overview.png
rename to doc/fluid/beginners_guide/basics/recommender_system/image/YouTube_Overview.png
diff --git a/source/beginners_guide/basics/recommender_system/image/output_32_0.png b/doc/fluid/beginners_guide/basics/recommender_system/image/output_32_0.png
similarity index 100%
rename from source/beginners_guide/basics/recommender_system/image/output_32_0.png
rename to doc/fluid/beginners_guide/basics/recommender_system/image/output_32_0.png
diff --git a/source/beginners_guide/basics/recommender_system/image/rec_regression_network.png b/doc/fluid/beginners_guide/basics/recommender_system/image/rec_regression_network.png
similarity index 100%
rename from source/beginners_guide/basics/recommender_system/image/rec_regression_network.png
rename to doc/fluid/beginners_guide/basics/recommender_system/image/rec_regression_network.png
diff --git a/source/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png b/doc/fluid/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png
similarity index 100%
rename from source/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png
rename to doc/fluid/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png
diff --git a/source/beginners_guide/basics/recommender_system/image/text_cnn.png b/doc/fluid/beginners_guide/basics/recommender_system/image/text_cnn.png
similarity index 100%
rename from source/beginners_guide/basics/recommender_system/image/text_cnn.png
rename to doc/fluid/beginners_guide/basics/recommender_system/image/text_cnn.png
diff --git a/source/beginners_guide/basics/recommender_system/image/text_cnn_en.png b/doc/fluid/beginners_guide/basics/recommender_system/image/text_cnn_en.png
similarity index 100%
rename from source/beginners_guide/basics/recommender_system/image/text_cnn_en.png
rename to doc/fluid/beginners_guide/basics/recommender_system/image/text_cnn_en.png
diff --git a/doc/fluid/beginners_guide/basics/recommender_system/index.md b/doc/fluid/beginners_guide/basics/recommender_system/index.md
new file mode 120000
index 0000000000000000000000000000000000000000..2bbbdc54e0b27d2a437530b255091312390371d0
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/recommender_system/index.md
@@ -0,0 +1 @@
+../../../../../external/book/05.recommender_system/README.cn.md
\ No newline at end of file
diff --git a/source/beginners_guide/basics/understand_sentiment/.gitignore b/doc/fluid/beginners_guide/basics/understand_sentiment/.gitignore
similarity index 100%
rename from source/beginners_guide/basics/understand_sentiment/.gitignore
rename to doc/fluid/beginners_guide/basics/understand_sentiment/.gitignore
diff --git a/source/beginners_guide/basics/understand_sentiment/image/lstm.png b/doc/fluid/beginners_guide/basics/understand_sentiment/image/lstm.png
similarity index 100%
rename from source/beginners_guide/basics/understand_sentiment/image/lstm.png
rename to doc/fluid/beginners_guide/basics/understand_sentiment/image/lstm.png
diff --git a/source/beginners_guide/basics/understand_sentiment/image/lstm_en.png b/doc/fluid/beginners_guide/basics/understand_sentiment/image/lstm_en.png
similarity index 100%
rename from source/beginners_guide/basics/understand_sentiment/image/lstm_en.png
rename to doc/fluid/beginners_guide/basics/understand_sentiment/image/lstm_en.png
diff --git a/source/beginners_guide/basics/understand_sentiment/image/rnn.png b/doc/fluid/beginners_guide/basics/understand_sentiment/image/rnn.png
similarity index 100%
rename from source/beginners_guide/basics/understand_sentiment/image/rnn.png
rename to doc/fluid/beginners_guide/basics/understand_sentiment/image/rnn.png
diff --git a/source/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg b/doc/fluid/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg
similarity index 100%
rename from source/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg
rename to doc/fluid/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg
diff --git a/source/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png b/doc/fluid/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png
similarity index 100%
rename from source/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png
rename to doc/fluid/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png
diff --git a/doc/fluid/beginners_guide/basics/understand_sentiment/index.md b/doc/fluid/beginners_guide/basics/understand_sentiment/index.md
new file mode 120000
index 0000000000000000000000000000000000000000..db728d7ba2f547d759dd9854546cb818974920d5
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/understand_sentiment/index.md
@@ -0,0 +1 @@
+../../../../../external/book/06.understand_sentiment/README.cn.md
\ No newline at end of file
diff --git a/source/beginners_guide/basics/word2vec/.gitignore b/doc/fluid/beginners_guide/basics/word2vec/.gitignore
similarity index 100%
rename from source/beginners_guide/basics/word2vec/.gitignore
rename to doc/fluid/beginners_guide/basics/word2vec/.gitignore
diff --git a/source/beginners_guide/basics/word2vec/image/2d_similarity.png b/doc/fluid/beginners_guide/basics/word2vec/image/2d_similarity.png
similarity index 100%
rename from source/beginners_guide/basics/word2vec/image/2d_similarity.png
rename to doc/fluid/beginners_guide/basics/word2vec/image/2d_similarity.png
diff --git a/source/beginners_guide/basics/word2vec/image/cbow.png b/doc/fluid/beginners_guide/basics/word2vec/image/cbow.png
similarity index 100%
rename from source/beginners_guide/basics/word2vec/image/cbow.png
rename to doc/fluid/beginners_guide/basics/word2vec/image/cbow.png
diff --git a/source/beginners_guide/basics/word2vec/image/cbow_en.png b/doc/fluid/beginners_guide/basics/word2vec/image/cbow_en.png
similarity index 100%
rename from source/beginners_guide/basics/word2vec/image/cbow_en.png
rename to doc/fluid/beginners_guide/basics/word2vec/image/cbow_en.png
diff --git a/source/beginners_guide/basics/word2vec/image/ngram.en.png b/doc/fluid/beginners_guide/basics/word2vec/image/ngram.en.png
similarity index 100%
rename from source/beginners_guide/basics/word2vec/image/ngram.en.png
rename to doc/fluid/beginners_guide/basics/word2vec/image/ngram.en.png
diff --git a/source/beginners_guide/basics/word2vec/image/ngram.png b/doc/fluid/beginners_guide/basics/word2vec/image/ngram.png
similarity index 100%
rename from source/beginners_guide/basics/word2vec/image/ngram.png
rename to doc/fluid/beginners_guide/basics/word2vec/image/ngram.png
diff --git a/source/beginners_guide/basics/word2vec/image/nnlm.png b/doc/fluid/beginners_guide/basics/word2vec/image/nnlm.png
similarity index 100%
rename from source/beginners_guide/basics/word2vec/image/nnlm.png
rename to doc/fluid/beginners_guide/basics/word2vec/image/nnlm.png
diff --git a/source/beginners_guide/basics/word2vec/image/nnlm_en.png b/doc/fluid/beginners_guide/basics/word2vec/image/nnlm_en.png
similarity index 100%
rename from source/beginners_guide/basics/word2vec/image/nnlm_en.png
rename to doc/fluid/beginners_guide/basics/word2vec/image/nnlm_en.png
diff --git a/source/beginners_guide/basics/word2vec/image/sentence_emb.png b/doc/fluid/beginners_guide/basics/word2vec/image/sentence_emb.png
similarity index 100%
rename from source/beginners_guide/basics/word2vec/image/sentence_emb.png
rename to doc/fluid/beginners_guide/basics/word2vec/image/sentence_emb.png
diff --git a/source/beginners_guide/basics/word2vec/image/skipgram.png b/doc/fluid/beginners_guide/basics/word2vec/image/skipgram.png
similarity index 100%
rename from source/beginners_guide/basics/word2vec/image/skipgram.png
rename to doc/fluid/beginners_guide/basics/word2vec/image/skipgram.png
diff --git a/source/beginners_guide/basics/word2vec/image/skipgram_en.png b/doc/fluid/beginners_guide/basics/word2vec/image/skipgram_en.png
similarity index 100%
rename from source/beginners_guide/basics/word2vec/image/skipgram_en.png
rename to doc/fluid/beginners_guide/basics/word2vec/image/skipgram_en.png
diff --git a/doc/fluid/beginners_guide/basics/word2vec/index.md b/doc/fluid/beginners_guide/basics/word2vec/index.md
new file mode 120000
index 0000000000000000000000000000000000000000..19186f4fee4a763bc1e4efcfa812694ca3975372
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/word2vec/index.md
@@ -0,0 +1 @@
+../../../../../external/book/04.word2vec/README.cn.md
\ No newline at end of file
diff --git a/source/beginners_guide/index.rst b/doc/fluid/beginners_guide/index.rst
similarity index 100%
rename from source/beginners_guide/index.rst
rename to doc/fluid/beginners_guide/index.rst
diff --git a/doc/fluid/beginners_guide/install/install_doc.rst b/doc/fluid/beginners_guide/install/install_doc.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8a66a95f45ea18dbfdc2450694517d5df8c47efd
--- /dev/null
+++ b/doc/fluid/beginners_guide/install/install_doc.rst
@@ -0,0 +1,543 @@
+.. _how_to_install:
+
+安装说明
+^^^^^^^^
+
+若您的系统为Linux或Windows，您可以使用我们提供的安装包来安装PaddlePaddle。
+
+对于MacOS系统，我们暂未提供安装包，您可以使用 **从源码编译** 的方式安装。
+
+
+.. _install_linux:
+
+在Linux安装PaddlePaddle
+------------------------------
+
+推荐您使用 `pip <https://pypi.org/project/pip/>`_
+安装，它是Linux系统下最简单的安装方式。
+
+注意事项：
+
+- PaddlePaddle Python API 依赖Python 2.7版本。
+
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+您可以通过指定版本号来安装其它版本，例如：
+
+  .. code-block:: bash
+
+      pip install paddlepaddle==0.13.0
+
+
+如果需要安装支持GPU的版本（cuda9.0_cudnn7_avx_openblas），需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+PaddlePaddle针对不同需求提供了更多版本的安装包，部分列表如下：
+
+=================================   ========================================
+版本号                               版本说明
+=================================   ========================================
+paddlepaddle-gpu==0.14.0            使用CUDA 9.0和cuDNN 7编译的0.14.0版本
+paddlepaddle-gpu==0.14.0.post87     使用CUDA 8.0和cuDNN 7编译的0.14.0版本
+paddlepaddle-gpu==0.14.0.post85     使用CUDA 8.0和cuDNN 5编译的0.14.0版本
+paddlepaddle-gpu==0.13.0            使用CUDA 9.0和cuDNN 7编译的0.13.0版本
+paddlepaddle-gpu==0.12.0            使用CUDA 8.0和cuDNN 5编译的0.12.0版本
+paddlepaddle-gpu==0.11.0.post87     使用CUDA 8.0和cuDNN 7编译的0.11.0版本
+paddlepaddle-gpu==0.11.0.post8      使用CUDA 8.0和cuDNN 5编译的0.11.0版本
+paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版本
+=================================   ========================================
+
+您可以在 `Release History <https://pypi.org/project/paddlepaddle-gpu/#history>`_
+中找到paddlepaddle-gpu的各个发行版本。
+
+如果需要获取并安装最新的PaddlePaddle开发分支，可以从我们的 `CI系统 <https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview>`_ 中下载最新的whl安装包和c-api开发包并安装。如需登录，请点击“Log in as guest”。
+
+.. _FAQ:
+
+安装常见问题和解决方法
+======================
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+
+出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。
+请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，
+需要使用最新的pip (>9.0.0) 才可以安装。
+
+可以使用下面的命令更新您的pip：
+
+  .. code-block:: bash
+
+      pip install --upgrade pip
+
+如果仍然存在问题，可以执行：
+
+    .. code-block:: bash
+
+        python -c "import pip; print(pip.pep425tags.get_supported())"
+
+获取当前系统支持的安装包格式，并检查和需安装的包是否匹配。pypi安装包
+可以在 `这里 <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 找到。
+
+如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新；
+如果系统支持 manylinux1_x86_64 而安装包（本地）是 linux_x86_64，
+可以重命名这个whl包为 manylinux1_x86_64 再安装。
+
+
+.. _install_windows:
+
+在Windows安装PaddlePaddle
+------------------------------
+Windows系统需要通过Docker来使用PaddlePaddle。Docker是一个虚拟容器，使用Docker可以简化复杂的环境配置工作。
+
+我们提供了 `PaddlePaddle_Windows快速安装包 <http://paddle-windows.bj.bcebos.com/PaddlePaddle-windows.zip>`_，
+它能够帮助您安装Docker和PaddlePaddle。
+
+* 安装包支持的系统：Windows7，Windows8的所有版本，Windows10的专业版、企业版。
+
+* 如果您希望使用GPU提升训练速度，请使用Linux系统安装，Windows系统暂不支持。
+   
+.. _install_mac:
+
+在MacOS安装PaddlePaddle
+------------------------------
+
+对于MacOS系统，我们暂未提供pip安装方式，您可以使用 **源码编译** 的方式安装。
+
+.. _others:
+
+其他安装方式
+-------------
+
+.. _source:
+源码编译（使用Docker镜像）
+==============================
+
+.. _requirements:
+
+需要的软硬件
+"""""""""""""
+
+为了编译PaddlePaddle，我们需要
+
+1. 一台电脑，可以装的是 Linux, Windows 或者 MacOS 操作系统
+2. Docker
+
+不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker 镜像里。
+
+.. _build_step:
+
+编译方法
+"""""""""""""
+
+PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
+
+
+**I. 编译CPU-Only版本的PaddlePaddle，需要执行：**
+
+.. code-block:: bash
+
+   # 1. 获取源码
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 2. 执行如下命令下载最新版本的docker镜像
+   docker run --name paddle-test -v $PWD:/paddle --network=host -it docker.paddlepaddlehub.com/paddle:latest-dev /bin/bash
+   # 3. 进入docker内执行如下命令编译CPU-Only的二进制安装包
+   mkdir -p /paddle/build && cd /paddle/build
+   cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF
+   make -j$(nproc)
+
+**II. 编译GPU版本的PaddlePaddle，需要执行：**
+
+.. code-block:: bash
+
+  # 1. 获取源码 
+  git clone https://github.com/PaddlePaddle/Paddle.git 
+  cd Paddle
+  # 2. 安装nvidia-docker
+  apt-get install nvidia-docker
+  # 3. 执行如下命令下载支持GPU运行的docker容器
+  nvidia-docker run --name paddle-test-gpu -v $PWD:/paddle --network=host -it docker.paddlepaddlehub.com/paddle:latest-dev /bin/bash
+  # 4. 进入docker内执行如下命令编译GPU版本的PaddlePaddle
+  mkdir -p /paddle/build && cd /paddle/build
+  cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=ON -DWITH_TESTING=OFF
+  make -j$(nproc)
+
+**注意事项：**
+
+* 上述有关 :code:`docker` 的命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。
+* 进入 :code:`docker` 后执行 :code:`cmake` 命令，若是出现 :code:`patchelf not found, please install it.` 错误，则执行 :code:`apt-get install -y patchelf` 命令即可解决问题。
+* 若您在使用Docker编译PaddlePaddle遇到问题时， `这个issue <https://github.com/PaddlePaddle/Paddle/issues/12079>`_ 可能会对您有所帮助。
+
+
+.. _source_without_docker:
+源码编译（不使用Docker镜像）
+==============================
+
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `附录：编译依赖`_ 之后才能开始编译的步骤。
+
+.. _build_step_without_docker:
+
+编译方法
+"""""""""""""
+
+在本机上编译CPU-Only版本的PaddlePaddle，需要执行如下命令：
+
+.. code-block:: bash
+
+   # 1. 使用virtualenvwrapper创建python虚环境并将工作空间切换到虚环境 [可选]
+   mkvirtualenv paddle-venv
+   workon paddle-venv
+   # 2. 获取源码
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 3. 执行下面的命令编译CPU-Only的二进制
+   mkdir build && cd build
+   cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF
+   make -j4 # 根据机器配备CPU的核心数开启相应的多线程进行编译
+
+
+**注意事项：**
+
+* MacOS系统下因为默认安装了cblas库，所以编译时可能会遇到 :code:`use of undeclared identifier 'openblas_set_num_threads'` 错误。因此，在执行cmake命令时需要指定所使用openblas库的头文件路径，具体操作如下：
+
+  .. code-block:: bash
+
+    cd Paddle/build && rm -rf *
+    cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF -DOPENBLAS_INC_DIR=/usr/local/Cellar/openblas/[本机所安装的openblas版本号]/include/
+    make -j4 # 根据机器配备CPU的核心数开启相应的多线程进行编译
+* 若您在MacOS系统下从源码编译PaddlePaddle遇到问题时， `这个issue <https://github.com/PaddlePaddle/Paddle/issues/12078>`_ 可能会对您有所帮助。
+
+编译完成后会在build/python/dist目录下生成输出的whl包，可以选择在当前机器安装，也可以拷贝到目标机器安装：
+
+.. code-block:: bash
+
+   pip install build/python/dist/*.whl
+
+如果机器中已经安装过PaddlePaddle，有两种方法：
+
+.. code-block:: bash
+
+   # 1. 先卸载之前的版本，再重新安装
+   pip uninstall paddlepaddle
+   pip install build/python/dist/*.whl
+
+   # 2. 直接升级到更新的版本
+   pip install build/python/dist/*.whl -U
+
+.. _run_test:
+
+执行单元测试
+"""""""""""""
+
+如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：
+
+设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
+开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" docker.paddlepaddlehub.com/paddle:latest-dev bash -x /paddle/paddle/scripts/paddle_build.sh build
+
+如果期望执行其中一个单元测试，（比如 :code:`test_sum_op` ）：
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" docker.paddlepaddlehub.com/paddle:latest-dev bash -x /paddle/paddle/scripts/paddle_build.sh build
+   cd /paddle/build
+   ctest -R test_sum_op -V
+
+.. _faq_docker:
+
+常见问题
+"""""""""""""
+
+- 什么是 Docker?
+
+  如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。
+
+- Docker 还是虚拟机？
+
+  有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
+
+- 为什么用 Docker?
+
+  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。
+
+  另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。
+
+- 可以选择不用Docker吗？
+
+  当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程，是因为这个流程比其他方法都更简便。
+
+- 学习 Docker 有多难？
+
+  理解 Docker 并不难，大概花十分钟看一下 `这篇文章 <https://zhuanlan.zhihu.com/p/19902938>`_。
+  这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+
+- 可以用 IDE 吗？
+
+  当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
+
+  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 ``~/.emacs`` 配置文件里加两行
+
+  .. code-block:: emacs-lisp
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command
+     "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+
+  就可以按 ``Ctrl-C`` 和 ``c`` 键来启动编译了。
+
+- 可以并行编译吗？
+
+  是的。我们的 Docker image 运行一个 `Bash 脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh>`_。这个脚本调用 :code:`make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+
+- Docker 需要 sudo
+
+  如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 Docker。此外，PaddlePaddle 项目在努力开始支持其他不需要 sudo 的容器技术，比如 rkt。
+
+- 在 Windows/MacOS 上编译很慢
+
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考 `这个issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_。
+
+- 磁盘不够
+
+  本文中的例子里， :code:`docker run` 命令里都用了 :code:`--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 :code:`docker ps -a` 命令看到停止后但是没有删除的 containers。 :code:`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `这篇文章 <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
+
+
+.. _compile_deps:
+
+附录：编译依赖
+""""""""""""""""
+
+PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。
+
+.. csv-table:: PaddlePaddle编译依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "CMake", "3.4", ""
+   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "SWIG", ">=2.0", ""
+   "wget","",""
+   "openblas","",""
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "protobuf","3.1.0",""
+   "wheel","",""
+   "Go", ">=1.8", "可选"
+
+
+.. _build_options:
+
+附录：编译选项
+""""""""""""""""
+
+PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。
+用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考
+`官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+
+在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如：
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: 编译选项说明
+    :header: "选项", "说明", "默认值"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "是否支持GPU", "ON"
+    "WITH_C_API", "是否仅编译CAPI", "OFF"
+    "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
+    "WITH_DSO", "是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。", "ON"
+    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
+    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
+    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
+    "WITH_TESTING", "是否开启单元测试", "OFF"
+    "WITH_DOC", "是否编译中英文文档", "OFF"
+    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
+    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF"
+    "WITH_MKL", "是否使用MKL数学库，如果为否则使用OpenBLAS", "ON"
+
+BLAS
++++++
+
+PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
+`OpenBLAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
+还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+
+如果关闭MKL，则会使用OpenBLAS作为BLAS库。
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
+使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构，加速编译。
+
+PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cuDNN是同一个版本。
+我们推荐使用最新版本的cuDNN。
+
+编译选项的设置
+++++++++++++++
+
+PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（ :code:`rm -rf` ）后，再指定。
+
+.. _install_docker:
+
+使用Docker安装运行
+==================
+
+使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境。
+您可以在 `Docker官网 <https://docs.docker.com/get-started/>`_
+获得基本的Docker安装和使用方法。
+
+在了解Docker的基本使用方法之后，即可开始下面的步骤：
+
+.. _docker_pull:
+
+获取PaddlePaddle的Docker镜像
+""""""""""""""""""""""""""""
+
+执行下面的命令获取最新的PaddlePaddle Docker镜像，版本为cpu_avx_mkl：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle
+
+对于国内用户，我们提供了加速访问的镜像源：
+
+  .. code-block:: bash
+
+     docker pull docker.paddlepaddlehub.com/paddle
+
+下载GPU版本（cuda8.0_cudnn5_avx_mkl）的Docker镜像：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
+
+选择下载使用不同的BLAS库的Docker镜像：
+
+  .. code-block:: bash
+
+     # 默认是使用MKL的镜像
+     docker pull paddlepaddle/paddle
+     # 使用OpenBLAS的镜像
+     docker pull paddlepaddle/paddle:latest-openblas
+
+下载指定版本的Docker镜像，可以从 `DockerHub网站 <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 获取可选的tag，并执行下面的命令：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:[tag]
+     # 比如：
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
+
+.. _docker_run:
+
+在Docker中执行PaddlePaddle训练程序
+"""""""""""""""""""""""""""""""""""
+
+假设您已经在当前目录（比如在/home/work）编写了一个PaddlePaddle的程序 :code:`train.py` （可以参考
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_
+编写），就可以使用下面的命令开始执行训练：
+
+  .. code-block:: bash
+
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+
+上述命令中， :code:`-it` 参数说明容器以交互式运行； :code:`-v $PWD:/work`
+指定将当前路径（Linux中$PWD变量会展开为当前路径的绝对路径）挂载到容器内部的 :code:`/work`
+目录； :code:`paddlepaddle/paddle` 指定需要使用的容器； 最后 :code:`/work/train.py`
+为容器内执行的命令，即运行训练程序。
+
+当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：
+
+  .. code-block:: bash
+
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
+
+**注：PaddlePaddle Docker镜像为了减小体积，默认没有安装vim，您可以在容器中执行** :code:`apt-get install -y vim` **安装后，在容器中编辑代码。**
+
+.. _docker_run_book:
+
+使用Docker启动PaddlePaddle Book教程
+""""""""""""""""""""""""""""""""""""
+
+使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook，可以通过网页浏览。
+PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
+如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
+大家可以通过它阅读教程，或者制作和分享带有代码、公式、图表、文字的交互式文档。
+
+我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
+
+  .. code-block:: bash
+
+     docker run -p 8888:8888 paddlepaddle/book
+
+国内用户可以使用下面的镜像源来加速访问：
+
+  .. code-block:: bash
+
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
+然后在浏览器中输入以下网址：
+
+  .. code-block:: text
+
+     http://localhost:8888/
+
+就这么简单，享受您的旅程！
+
+.. _docker_run_gpu:
+
+使用Docker执行GPU训练
+""""""""""""""""""""""""""""
+
+为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ 来运行镜像。
+请不要忘记提前在物理机上安装GPU最新驱动。
+
+  .. code-block:: bash
+
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
+
+**注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**
+
+  .. code-block:: bash
+
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+**关于AVX：**
+
+AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
+是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
+`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。
+
+以下指令能检查Linux电脑是否支持AVX：
+
+   .. code-block:: bash
+
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+
+如果输出是No，就需要选择使用no-AVX的镜像
diff --git a/source/beginners_guide/install/paddleci.png b/doc/fluid/beginners_guide/install/paddleci.png
similarity index 100%
rename from source/beginners_guide/install/paddleci.png
rename to doc/fluid/beginners_guide/install/paddleci.png
diff --git a/doc/fluid/beginners_guide/quick_start/fit_a_line/README.cn.md b/doc/fluid/beginners_guide/quick_start/fit_a_line/README.cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..0074b2df726b61a02f9a8e98116b639ab7e562e4
--- /dev/null
+++ b/doc/fluid/beginners_guide/quick_start/fit_a_line/README.cn.md
@@ -0,0 +1 @@
+../../../../../external/book/01.fit_a_line/README.cn.md
\ No newline at end of file
diff --git a/source/beginners_guide/quick_start/fit_a_line/image/predictions.png b/doc/fluid/beginners_guide/quick_start/fit_a_line/image/predictions.png
similarity index 100%
rename from source/beginners_guide/quick_start/fit_a_line/image/predictions.png
rename to doc/fluid/beginners_guide/quick_start/fit_a_line/image/predictions.png
diff --git a/source/beginners_guide/quick_start/fit_a_line/image/ranges.png b/doc/fluid/beginners_guide/quick_start/fit_a_line/image/ranges.png
similarity index 100%
rename from source/beginners_guide/quick_start/fit_a_line/image/ranges.png
rename to doc/fluid/beginners_guide/quick_start/fit_a_line/image/ranges.png
diff --git a/source/beginners_guide/quick_start/fit_a_line/image/train_and_test.png b/doc/fluid/beginners_guide/quick_start/fit_a_line/image/train_and_test.png
similarity index 100%
rename from source/beginners_guide/quick_start/fit_a_line/image/train_and_test.png
rename to doc/fluid/beginners_guide/quick_start/fit_a_line/image/train_and_test.png
diff --git a/source/beginners_guide/quick_start/index.rst b/doc/fluid/beginners_guide/quick_start/index.rst
similarity index 100%
rename from source/beginners_guide/quick_start/index.rst
rename to doc/fluid/beginners_guide/quick_start/index.rst
diff --git a/doc/fluid/beginners_guide/quick_start/recognize_digits/README.cn.md b/doc/fluid/beginners_guide/quick_start/recognize_digits/README.cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..c8b9a16180e19dabfebdbc07f8145e7e4c873a63
--- /dev/null
+++ b/doc/fluid/beginners_guide/quick_start/recognize_digits/README.cn.md
@@ -0,0 +1 @@
+../../../../../external/book/02.recognize_digits/README.cn.md
\ No newline at end of file
diff --git a/source/beginners_guide/quick_start/recognize_digits/image/cnn.png b/doc/fluid/beginners_guide/quick_start/recognize_digits/image/cnn.png
similarity index 100%
rename from source/beginners_guide/quick_start/recognize_digits/image/cnn.png
rename to doc/fluid/beginners_guide/quick_start/recognize_digits/image/cnn.png
diff --git a/source/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png b/doc/fluid/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png
similarity index 100%
rename from source/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png
rename to doc/fluid/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png
diff --git a/source/beginners_guide/quick_start/recognize_digits/image/infer_3.png b/doc/fluid/beginners_guide/quick_start/recognize_digits/image/infer_3.png
similarity index 100%
rename from source/beginners_guide/quick_start/recognize_digits/image/infer_3.png
rename to doc/fluid/beginners_guide/quick_start/recognize_digits/image/infer_3.png
diff --git a/source/beginners_guide/quick_start/recognize_digits/image/max_pooling.png b/doc/fluid/beginners_guide/quick_start/recognize_digits/image/max_pooling.png
similarity index 100%
rename from source/beginners_guide/quick_start/recognize_digits/image/max_pooling.png
rename to doc/fluid/beginners_guide/quick_start/recognize_digits/image/max_pooling.png
diff --git a/source/beginners_guide/quick_start/recognize_digits/image/mlp.png b/doc/fluid/beginners_guide/quick_start/recognize_digits/image/mlp.png
similarity index 100%
rename from source/beginners_guide/quick_start/recognize_digits/image/mlp.png
rename to doc/fluid/beginners_guide/quick_start/recognize_digits/image/mlp.png
diff --git a/source/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png b/doc/fluid/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png
similarity index 100%
rename from source/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png
rename to doc/fluid/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png
diff --git a/source/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png b/doc/fluid/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png
similarity index 100%
rename from source/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png
rename to doc/fluid/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png
diff --git a/source/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png b/doc/fluid/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png
similarity index 100%
rename from source/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png
rename to doc/fluid/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png
diff --git a/source/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png b/doc/fluid/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png
similarity index 100%
rename from source/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png
rename to doc/fluid/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png
diff --git a/source/beginners_guide/quick_start/recognize_digits/image/train_and_test.png b/doc/fluid/beginners_guide/quick_start/recognize_digits/image/train_and_test.png
similarity index 100%
rename from source/beginners_guide/quick_start/recognize_digits/image/train_and_test.png
rename to doc/fluid/beginners_guide/quick_start/recognize_digits/image/train_and_test.png
diff --git a/doc/fluid/build_and_install/build_from_source_cn.rst b/doc/fluid/build_and_install/build_from_source_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d0dacb104f148c2aeb323365cbd6f014ae00ed5a
--- /dev/null
+++ b/doc/fluid/build_and_install/build_from_source_cn.rst
@@ -0,0 +1,225 @@
+从源码编译
+======================
+
+.. _requirements:
+
+需要的软硬件
+----------------
+
+为了编译PaddlePaddle，我们需要
+
+1. 一台电脑，可以装的是 Linux, Windows 或者 MacOS 操作系统
+2. Docker
+
+不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker 镜像里。
+
+.. _build_step:
+
+编译方法
+----------------
+
+PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`__ 找到，您也可以
+在 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__ 找到 paddle_manylinux_devel
+镜像的编译以及使用方法。或者参考下述可选步骤，从源码中构建用于编译PaddlePaddle的Docker镜像。
+
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 :ref:`编译依赖 <compile_deps>` 之后才能开始编译的步骤。
+
+编译PaddlePaddle，需要执行：
+
+.. code-block:: bash
+
+   # 1. 获取源码
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 2. 可选步骤：源码中构建用于编译PaddlePaddle的Docker镜像
+   docker build -t paddle:dev .
+   # 3. 执行下面的命令编译CPU-Only的二进制
+   docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
+   # 4. 或者也可以使用为上述可选步骤构建的镜像（必须先执行第2步）
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
+
+注：
+
+- 上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。
+
+- 如果您使用的是 manylinux 的镜像进行编译, 那么您需要通过环境变量 :code:`PYTHON_ABI` 来指定一个 `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__.
+  PaddlePaddle目前支持的 Python ABI 有 :code:`cp27-cp27m` 和 :code:`cp27-cp27mu`.
+
+编译完成后会在build/python/dist目录下生成输出的whl包，可以选择在当前机器安装也可以拷贝到目标机器安装：
+
+.. code-block:: bash
+
+   pip install build/python/dist/*.whl
+
+如果机器中已经安装过PaddlePaddle，有两种方法：
+
+.. code-block:: bash
+
+   # 1. 先卸载之前的版本，再重新安装
+   pip uninstall paddlepaddle
+   pip install build/python/dist/*.whl
+
+   # 2. 直接升级到更新的版本
+   pip install build/python/dist/*.whl -U
+
+.. _run_test:
+
+执行单元测试
+----------------
+
+如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：
+
+设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
+开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
+
+如果期望执行其中一个单元测试，（比如 :code:`test_sum_op` ）：
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   ./paddle/scripts/paddle_build.sh build
+   cd build
+   ctest -R test_sum_op -V
+
+.. _faq_docker:
+
+常见问题
+----------------
+
+- 什么是 Docker?
+
+  如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。
+
+- Docker 还是虚拟机？
+
+  有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
+
+- 为什么用 Docker?
+
+  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。
+
+  另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。
+
+- 我可以选择不用Docker吗？
+
+  当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程，是因为这个流程比其他方法都更简便。
+
+- 学习 Docker 有多难？
+
+  理解 Docker 并不难，大概花十分钟看一下 `如何使用Docker <https://zhuanlan.zhihu.com/p/19902938>`_ 。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+
+- 我可以用 IDE 吗？
+
+  当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
+
+  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
+
+  .. code-block:: emacs
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+
+  就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
+
+- 可以并行编译吗？
+
+  是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+
+- Docker 需要 sudo
+
+  如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 Docker。此外，PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术，比如 rkt。
+
+- 在 Windows/MacOS 上编译很慢
+
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。
+
+- 磁盘不够
+
+  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `如何删除Docker Container <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
+
+
+.. _compile_deps:
+
+附录：编译依赖
+----------------
+
+PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。
+
+.. csv-table:: PaddlePaddle编译依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.2", ""
+   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "可选"
+
+
+.. _build_options:
+
+附录：编译选项
+----------------
+
+PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。
+用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考
+`官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+
+在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如：
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: 编译选项说明
+    :header: "选项", "说明", "默认值"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "是否支持GPU", "ON"
+    "WITH_C_API", "是否仅编译CAPI", "OFF"
+    "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
+    "WITH_DSO", "是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。", "ON"
+    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
+    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
+    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
+    "WITH_TESTING", "是否开启单元测试", "OFF"
+    "WITH_DOC", "是否编译中英文文档", "OFF"
+    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
+    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF"
+    "WITH_MKL", "是否使用MKL数学库，如果为否则是用OpenBLAS", "ON"
+
+BLAS
++++++
+
+PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
+`OpenBLAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
+还会下载MKL-DNN数学库，详细参考 `mkldnn设计文档 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+
+如果关闭MKL，则会使用OpenBLAS作为BLAS库。
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
+使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构，加速编译。
+
+PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cuDNN是同一个版本。
+我们推荐使用最新版本的cuDNN。
+
+编译选项的设置
+++++++++++++++
+
+PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（** :code:`rm -rf` **）后，再指定。**
diff --git a/doc/fluid/build_and_install/build_from_source_en.rst b/doc/fluid/build_and_install/build_from_source_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..664b68da8b7dd3e005ebf3ec34de77729e5ab355
--- /dev/null
+++ b/doc/fluid/build_and_install/build_from_source_en.rst
@@ -0,0 +1,237 @@
+Build from Sources
+==========================
+
+.. _requirements:
+
+Requirements
+----------------
+
+To build PaddlePaddle, you need
+
+1. A computer -- Linux, Windows, MacOS.
+2. Docker.
+
+Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image.
+We run all the tools by running this image.
+
+.. _build_step:
+
+How To Build
+----------------
+
+You need to use Docker to build PaddlePaddle
+to avoid installing dependencies by yourself. We have several pre-built
+Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ ,
+you can also find how to build and use paddle_manylinux_devel Docker image from
+`here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__ .
+Or you can build your own image from source as the optional step below:
+
+If you don't wish to use Docker, you need to manually install the compile dependencies listed in :ref:`Compile Dependencies <compile_deps>` before you can start the compilation.
+
+.. code-block:: bash
+
+   # 1. clone the source code
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 2. Optional: build development docker image from source
+   docker build -t paddle:dev .
+   # 3. Run the following command to build a CPU-Only binaries
+   docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
+   # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
+
+NOTE: 
+
+- The above command mounts the current working directory (the root directory of the source code)
+  into the :code:`/paddle` directory inside the Docker container.
+
+- You need to pass in the required environment variable :code:`PYTHON_ABI` to specify a `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__.
+  The Python ABIs currently supported by PaddlePaddle include :code:`cp27-cp27m` and :code:`cp27-cp27mu` .
+
+When the compile finishes, you can get the output whl package under
+build/python/dist, then you can choose to install the whl on local
+machine or copy it to the target machine.
+
+.. code-block:: bash
+
+   pip install build/python/dist/*.whl
+
+If the machine has installed PaddlePaddle before, there are two methods:
+
+.. code-block:: bash
+
+   # 1. uninstall and reinstall
+   pip uninstall paddlepaddle
+   pip install build/python/dist/*.whl
+
+   # 2. upgrade directly
+   pip install build/python/dist/*.whl -U
+
+.. _run_test:
+
+Run Tests
+----------------
+
+If you wish to run the tests, you may follow the below steps:
+
+When using Docker, setting :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run the tests immediately after the build.
+Setting :code:`WITH_GPU=ON` will also run the tests on GPU.
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
+
+If you wish to run only one unit test, like :code:`test_sum_op`:
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   ./paddle/scripts/paddle_build.sh build
+   cd build
+   ctest -R test_sum_op -V
+
+.. _faq_docker:
+
+Frequently Asked Questions
+---------------------------
+
+- What is Docker?
+
+  If you haven't heard of it, consider it something like Python's virtualenv.
+
+- Docker or virtual machine?
+
+  Some people compare Docker with VMs, but Docker doesn't virtualize any hardware or run a guest OS, which means there is no compromise on performance.
+
+- Why Docker?
+
+  Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
+
+  Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
+
+- Can I choose not to use Docker?
+
+  Sure, you don't have to install build tools into a Docker image; instead, you can install them on your local computer.  This document exists because Docker would make the development way easier.
+
+- How difficult is it to learn Docker?
+
+  It takes about ten minutes to read `an introductory article <https://docs.docker.com/get-started>`_ and saves you more than an hour of installing and configuring all the required build tools, especially when new versions of PaddlePaddle require new tools.  Not to mention the time saved when other people are trying to reproduce an issue you have.
+
+- Can I use my favorite IDE?
+
+  Yes, of course.  The source code resides on your local computer, and you can edit it using whatever editor you like.
+
+  Many PaddlePaddle developers are using Emacs.  They add the following few lines into their `~/.emacs` configure file:
+
+  .. code-block:: emacs
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+
+  so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
+
+- Does Docker do parallel building?
+
+  Yes.  Our build Docker image runs a `Bash script <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ , which calls `make -j$(nproc)` to start as many processes as there are CPU cores.
+
+- Docker requires sudo
+
+  An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly.  If you use a shared computer for development, please ask the administrator to install and configure Docker.  We will do our best to support rkt, another container technology that doesn't require sudo.
+
+- Docker on Windows/MacOS builds slowly
+
+  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM more memory and CPUs so as to make the build efficient.  Please refer to `this issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_ for details.
+
+- Not enough disk space
+
+  Examples in this article use option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to `this article <https://zaiste.net/posts/removing_docker_containers/>`_ .
+
+.. _compile_deps:
+
+Appendix: Compile Dependencies
+-------------------------------
+
+PaddlePaddle needs the following dependencies when compiling; other dependencies
+will be downloaded automatically.
+
+.. csv-table:: PaddlePaddle Compile Dependencies
+   :header: "Dependency", "Version", "Description"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.2", ""
+   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "Optional"
+
+
+.. _build_options:
+
+Appendix: Build Options
+-------------------------
+
+Build options include whether to build binaries for CPU or GPU, which BLAS
+library to use, etc. You may pass these settings when running cmake.
+For a detailed cmake tutorial, please refer to `here <https://cmake.org/cmake-tutorial>`__ .
+
+
+You can add :code:`-D` argument to pass such options, like:
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: Bool Type Options
+    :header: "Option", "Description", "Default"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "Build with GPU support", "ON"
+    "WITH_C_API", "Build only CAPI", "OFF"
+    "WITH_DOUBLE", "Build with double precision", "OFF"
+    "WITH_DSO", "Dynamically load CUDA libraries", "ON"
+    "WITH_AVX", "Build with AVX support", "ON"
+    "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
+    "WITH_STYLE_CHECK", "Check code style when building", "ON"
+    "WITH_TESTING", "Build unit tests", "OFF"
+    "WITH_DOC", "Build documentations", "OFF"
+    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
+    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "OFF"
+    "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
+
+
+BLAS
++++++
+
+PaddlePaddle supports `MKL <https://software.intel.com/en-us/intel-mkl>`_ and
+`OpenBLAS <http://www.openblas.net/>`_ as BLAS libraries. By default it uses MKL.
+If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded
+and used; for more information, see the `details <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ .
+
+If you choose not to use MKL, then OpenBLAS will be used.
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle will automatically find CUDA and cuDNN when compiling and running.
+The parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect the SM architecture
+automatically in order to speed up the build.
+
+PaddlePaddle can be built with any cuDNN version later than v5.1, and we intend to
+keep up with the latest cuDNN versions. Be sure to run with the same version of cuDNN
+that you built with.
+
+Pass Compile Options
+++++++++++++++++++++++
+
+You can pass compile options to use the intended BLAS/CUDA/cuDNN libraries.
+When running the cmake command, it will search system paths like
+:code:`/usr/lib:/usr/local/lib` and then search the paths that you
+passed to cmake, e.g.
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.**
diff --git a/doc/fluid/build_and_install/docker_install_cn.rst b/doc/fluid/build_and_install/docker_install_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..106c86bace075764c84bc2a7f7cb09d466fa8794
--- /dev/null
+++ b/doc/fluid/build_and_install/docker_install_cn.rst
@@ -0,0 +1,146 @@
+使用Docker安装运行
+================================
+
+使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。
+您可以在 `Docker官网 <https://docs.docker.com/get-started/>`_ 获得基本的Docker安装和使用方法。
+
+如果您在使用Windows，可以参考
+`这篇 <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+教程，完成在Windows上安装和使用Docker。
+
+在了解Docker的基本使用方法之后，即可开始下面的步骤：
+
+.. _docker_pull:
+
+获取PaddlePaddle的Docker镜像
+------------------------------
+
+执行下面的命令获取最新的PaddlePaddle Docker镜像，版本为cpu_avx_mkl：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle
+
+对于国内用户，我们提供了加速访问的镜像源：
+
+  .. code-block:: bash
+
+     docker pull docker.paddlepaddlehub.com/paddle
+
+下载GPU版本（cuda8.0_cudnn5_avx_mkl）的Docker镜像：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
+
+选择下载使用不同的BLAS库的Docker镜像：
+
+  .. code-block:: bash
+
+     # 默认是使用MKL的镜像
+     docker pull paddlepaddle/paddle
+     # 使用OpenBLAS的镜像
+     docker pull paddlepaddle/paddle:latest-openblas
+
+下载指定版本的Docker镜像，可以从 `DockerHub网站 <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 获取可选的tag，并执行下面的命令：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:[tag]
+     # 比如：
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
+
+.. _docker_run:
+
+在Docker中执行PaddlePaddle训练程序
+----------------------------------
+
+假设您已经在当前目录（比如在/home/work）编写了一个PaddlePaddle的程序 :code:`train.py` （可以参考
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+编写），就可以使用下面的命令开始执行训练：
+
+  .. code-block:: bash
+
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+ 
+上述命令中， :code:`-it` 参数说明容器以交互式运行； :code:`-v $PWD:/work`
+指定将当前路径（Linux中$PWD变量会展开为当前路径的绝对路径）挂载到容器内部的 :code:`/work`
+目录； :code:`paddlepaddle/paddle` 指定需要使用的容器； 最后 :code:`/work/train.py`
+为容器内执行的命令，即运行训练程序。
+
+当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：
+
+  .. code-block:: bash
+
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
+
+**注：PaddlePaddle Docker镜像为了减小体积，默认没有安装vim，您可以在容器中执行** :code:`apt-get install -y vim` **安装后，在容器中编辑代码。**
+
+.. _docker_run_book:
+
+使用Docker启动PaddlePaddle Book教程
+-----------------------------------
+
+使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook，可以通过网页浏览。
+PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
+如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
+大家可以通过它阅读教程，或者制作和分享带有代码、公式、图表、文字的交互式文档。
+
+我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
+
+  .. code-block:: bash
+
+     docker run -p 8888:8888 paddlepaddle/book
+
+国内用户可以使用下面的镜像源来加速访问：
+
+  .. code-block:: bash
+
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
+然后在浏览器中输入以下网址：
+
+  .. code-block:: text
+
+     http://localhost:8888/
+
+就这么简单，享受您的旅程！
+
+.. _docker_run_gpu:
+
+使用Docker执行GPU训练
+------------------------------
+
+为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ 来运行镜像。
+请不要忘记提前在物理机上安装GPU最新驱动。
+
+  .. code-block:: bash
+
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
+
+**注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**
+
+  .. code-block:: bash
+
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+**关于AVX：**
+
+AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
+是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
+`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。
+
+以下指令能检查Linux电脑是否支持AVX：
+
+   .. code-block:: bash
+
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+
+如果输出是No，就需要选择使用no-AVX的镜像
diff --git a/doc/fluid/build_and_install/docker_install_en.rst b/doc/fluid/build_and_install/docker_install_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..25aecb8d0da9feb00006da6259b529b7011d91cb
--- /dev/null
+++ b/doc/fluid/build_and_install/docker_install_en.rst
@@ -0,0 +1,153 @@
+Run in Docker Containers
+=================================
+
+Run PaddlePaddle in a Docker container so that you don't need to worry about
+runtime dependencies; this also lets you run it on Windows. You can find
+Docker tutorials `here <https://docs.docker.com/get-started/>`_ .
+
+If you are using Windows, please refer to
+`this <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+tutorial to start running docker under windows.
+
+After you've read the above tutorials, you may proceed with the following steps.
+
+.. _docker_pull:
+
+Pull PaddlePaddle Docker Image
+------------------------------
+
+Run the following command to download the latest Docker images, the version is cpu_avx_mkl:
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle
+
+For users in China, we provide a faster mirror:
+
+  .. code-block:: bash
+
+     docker pull docker.paddlepaddlehub.com/paddle
+
+Download GPU version (cuda8.0_cudnn5_avx_mkl) images:
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
+
+Choose between different BLAS versions:
+
+  .. code-block:: bash
+
+     # image using MKL by default
+     docker pull paddlepaddle/paddle
+     # image using OpenBLAS
+     docker pull paddlepaddle/paddle:latest-openblas
+
+
+If you want to use legacy versions, choose a tag from
+`DockerHub <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_
+and run:
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:[tag]
+     # e.g.
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
+
+.. _docker_run:
+
+Launch your training program in Docker
+--------------------------------------
+
+Assume that you have already written a PaddlePaddle program
+named :code:`train.py` under directory :code:`/home/work` (refer to 
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+for more samples), then run the following command:
+
+  .. code-block:: bash
+
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+
+In the above command, :code:`-it` means run the container interactively;
+:code:`-v $PWD:/work` means mount the current directory ($PWD will expand
+to the current absolute path in Linux) under :code:`/work` in the container.
+:code:`paddlepaddle/paddle` specifies the image to use; finally,
+:code:`/work/train.py` is the command to run inside docker.
+
+Also, you can go into the container shell, run or debug your code
+interactively:
+
+  .. code-block:: bash
+
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
+
+**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.**
+
+.. _docker_run_book:
+
+PaddlePaddle Book
+------------------
+
+You can create a container serving PaddlePaddle Book using Jupyter Notebook in
+one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook
+for users and developers. If you want to
+dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
+
+We provide a packaged book image, simply issue the command:
+
+  .. code-block:: bash
+
+     docker run -p 8888:8888 paddlepaddle/book
+
+For users in China, we provide a faster mirror:
+
+  .. code-block:: bash
+
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
+Then open the following address in your local browser:
+
+  .. code-block:: text
+
+     http://localhost:8888/
+
+That's all. Enjoy your journey!
+
+.. _docker_run_gpu:
+
+Train with Docker with GPU
+------------------------------
+
+We recommend using
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_
+to run GPU training jobs. Please ensure you have the latest
+GPU driver installed before moving on.
+
+  .. code-block:: bash
+
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
+
+**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
+
+  .. code-block:: bash
+
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+**About AVX:**
+
+AVX is a CPU instruction set that can accelerate PaddlePaddle's calculations.
+The latest PaddlePaddle Docker image turns AVX on by default, so, if your
+computer doesn't support AVX, you'll probably need to
+`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`.
+
+The following command will tell you whether your computer supports AVX.
+
+   .. code-block:: bash
+
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1a9305ac4b6578c14a962f223c647a71e3b8a72b
--- /dev/null
+++ b/doc/fluid/build_and_install/index_cn.rst
@@ -0,0 +1,56 @@
+安装与编译
+==========
+
+.. _install_steps:
+
+PaddlePaddle针对不同的用户群体提供了多种安装方式。
+
+专注深度学习模型开发
+--------------------
+
+PaddlePaddle提供了多种python wheel包，可通过pip一键安装：
+
+.. toctree::
+	:maxdepth: 1
+
+	pip_install_cn.rst
+
+这是最便捷的安装方式，请根据机器配置和系统选择对应的安装包。
+
+关注底层框架
+-------------
+
+PaddlePaddle提供了基于Docker的安装方式，请参照以下教程：
+
+.. toctree::
+	:maxdepth: 1
+
+	docker_install_cn.rst
+
+我们推荐在Docker中运行PaddlePaddle，该方式具有以下优势：
+
+- 无需单独安装第三方依赖
+- 方便分享运行时环境，易于问题的复现
+
+对于有定制化二进制文件需求的用户，我们同样提供了从源码编译安装PaddlePaddle的方法：
+
+.. toctree::
+    :maxdepth: 1
+
+    build_from_source_cn.rst
+
+.. warning::
+
+	需要提醒的是，这种安装方式会涉及到一些第三方库的下载、编译及安装，整个安装过程耗时较长。
+
+
+常见问题汇总
+--------------
+
+如果在安装过程中遇到了问题，请先尝试在下面的页面寻找答案：
+
+:ref:`常见问题解答 <install_faq>`
+
+如果问题没有得到解决，欢迎向PaddlePaddle社区反馈问题：
+
+`创建issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_
diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7990bacbd6966e88e8763e9c5709e410f7e9fed4
--- /dev/null
+++ b/doc/fluid/build_and_install/index_en.rst
@@ -0,0 +1,56 @@
+Install and Compile
+======================
+
+.. _install_steps:
+
+PaddlePaddle provides various methods of installation for many different users.
+
+Focus on Deep Learning Model Development
+----------------------------------------
+
+PaddlePaddle provides a variety of Python wheel packages that can be installed with pip:
+
+.. toctree::
+	:maxdepth: 1
+
+	pip_install_en.rst
+
+This is the most convenient way of installation. Please choose the right installation package according to your machine configuration and operating system.
+
+Focus on the Underlying Framework
+---------------------------------
+
+PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
+
+.. toctree::
+	:maxdepth: 1
+
+	docker_install_en.rst
+
+We recommend running PaddlePaddle in Docker. This method has the following advantages:
+
+- Does not require installation of third-party dependencies. 
+- Easy to share runtime environment. 
+
+Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below:
+
+.. toctree::
+    :maxdepth: 1
+
+    build_from_source_en.rst
+
+.. warning::
+
+	One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus this process of installation is more time consuming.
+
+
+FAQ
+-----------
+
+For any problems during installation, please refer to the page below for answers:
+
+:ref:`FAQ <install_faq>`
+
+If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community:
+
+`Create an issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_
diff --git a/doc/fluid/build_and_install/paddleci.png b/doc/fluid/build_and_install/paddleci.png
new file mode 100644
index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825
Binary files /dev/null and b/doc/fluid/build_and_install/paddleci.png differ
diff --git a/doc/fluid/build_and_install/pip_install_cn.rst b/doc/fluid/build_and_install/pip_install_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..095da19cd41d29bfa72ab23abd24bec45f925a86
--- /dev/null
+++ b/doc/fluid/build_and_install/pip_install_cn.rst
@@ -0,0 +1,105 @@
+使用pip安装
+================================
+
+PaddlePaddle可以使用常用的Python包管理工具
+`pip <https://pip.pypa.io/en/stable/installing/>`_
+完成安装，并可以在大多数主流的Linux操作系统以及MacOS上执行。
+
+.. _pip_install:
+
+使用pip安装
+------------------------------
+
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+当前的默认版本为0.12.0，cpu_avx_openblas，您可以通过指定版本号来安装其它版本，例如:
+
+  .. code-block:: bash
+
+      pip install paddlepaddle==0.11.0
+
+
+如果需要安装支持GPU的版本（cuda8.0_cudnn5_avx_openblas），需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+当前的默认版本也是0.12.0，PaddlePaddle针对不同需求提供了更多版本的安装包，部分列表如下：
+
+=================================   ========================================
+版本号                               版本说明
+=================================   ========================================
+paddlepaddle-gpu==0.12.0            使用CUDA 8.0和cuDNN 5编译的0.12.0版本
+paddlepaddle-gpu==0.11.0.post87     使用CUDA 8.0和cuDNN 7编译的0.11.0版本
+paddlepaddle-gpu==0.11.0.post8      使用CUDA 8.0和cuDNN 5编译的0.11.0版本
+paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版本
+=================================   ========================================
+
+您可以在 `Release History <https://pypi.org/project/paddlepaddle-gpu/#history>`_ 中找到paddlepaddle-gpu的各个发行版本。
+
+如果需要获取并安装最新的（开发分支）PaddlePaddle，可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装，
+您可以从下面的表格中找到需要的版本：
+
+如果在点击下面链接时出现如下登录界面，点击“Log in as guest”即可开始下载：
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: 各个版本最新的whl包
+    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m"
+    :widths: 1, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+
+.. _pip_dependency:
+
+运行环境依赖
+------------------------------
+
+PaddlePaddle安装包由于不仅仅包含.py程序，而且包含了C++编写的部分，所以我们确保发布的二进制包可以支持主流的Linux操作系统，比如CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上。
+
+PaddlePaddle发布的安装包会尽量对齐 `manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_ 标准，通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上，而且CentOS 5即将停止维护，所以我们默认使用CentOS 6作为标准编译环境。
+
+.. csv-table:: PaddlePaddle环境依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "操作系统", "Linux, MacOS", "CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上"
+   "Python", "2.7.x", "暂时不支持Python3"
+   "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号"
+   "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号"
+
+.. _pip_faq:
+
+安装常见问题和解决方法
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+
+  出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip：
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  如果仍然存在问题，可以执行：
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  获取当前系统支持的安装包格式，并检查和需安装的包是否匹配。pypi安装包可以在 `这个 <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 链接中找到。
+
+  如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新； 如果系统支持 manylinux1_x86_64 而安装包（本地）是 linux_x86_64 ，可以重命名这个whl包为 manylinux1_x86_64 再安装。
diff --git a/doc/fluid/build_and_install/pip_install_en.rst b/doc/fluid/build_and_install/pip_install_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8406e4aa1fbb953c3b615b10d1bcb2c45974dde0
--- /dev/null
+++ b/doc/fluid/build_and_install/pip_install_en.rst
@@ -0,0 +1,123 @@
+Install using pip
+================================
+
+You can use current widely used Python package management
+tool `pip <https://pip.pypa.io/en/stable/installing/>`_
+to install PaddlePaddle. This method can be used in
+most of current Linux systems or MacOS.
+
+.. _pip_install:
+
+Install using pip
+------------------------------
+
+Run the following command to install PaddlePaddle on the current
+machine, it will also download requirements.
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+The default version is 0.12.0 (cpu_avx_openblas). You can specify a different version to suit your needs, for example:
+
+  .. code-block:: bash
+
+      pip install paddlepaddle==0.11.0
+
+If you need to install a GPU-enabled version (cuda8.0_cudnn5_avx_openblas), you need to run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+The default version is also 0.12.0, PaddlePaddle provides several versions of packages for different needs, as shown in the table:
+
+=================================   ========================================
+Version                             Description
+=================================   ========================================
+paddlepaddle-gpu==0.12.0            0.12.0 built with CUDA 8.0 and cuDNN 5
+paddlepaddle-gpu==0.11.0.post87     0.11.0 built with CUDA 8.0 and cuDNN 7
+paddlepaddle-gpu==0.11.0.post8      0.11.0 built with CUDA 8.0 and cuDNN 5
+paddlepaddle-gpu==0.11.0            0.11.0 built with CUDA 7.5 and cuDNN 5
+=================================   ========================================
+
+You can find all released versions of paddlepaddle-gpu in `Release History <https://pypi.org/project/paddlepaddle-gpu/#history>`_ .
+
+If you wish to install the latest develop branch PaddlePaddle,
+you can download the latest whl package from our CI system. Access
+the below links, log in as guest, then click at the "Artifact"
+tab, you'll find the download link of whl packages.
+
+If one of the links below shows a login form, just click "Log in as guest" to start the download:
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: whl package of each version
+    :header: "version", "cp27-cp27mu", "cp27-cp27m"
+    :widths: 1, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+
+.. _pip_dependency:
+
+Runtime Dependency
+------------------------------
+
+PaddlePaddle installation packages (whl) do not only contain .py files,
+but also binaries built from C++ code. We ensure that PaddlePaddle can
+run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04
+and MacOS 10.12.
+
+PaddlePaddle whl packages try to satisfy the
+`manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_
+standard, which uses CentOS 5 as the default build environment. However, CUDA libraries
+generally require CentOS 6 or later, and CentOS 5 is about to reach its end of life,
+so we use CentOS 6 as the default build environment.
+
+.. csv-table:: PaddlePaddle Runtime Deps
+   :header: "Dependency", "version", "description"
+   :widths: 10, 15, 30
+
+   "OS", "Linux, MacOS", "CentOS 6 or later, Ubuntu 14.04 or later, MacOS 10.12 or later"
+   "Python", "2.7.x", "Currently Python3 is not supported"
+   "libc.so", "GLIBC_2.7", "glibc at least include GLIBC_2.7 symbols"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols"
+   "libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols"
+
+.. _pip_faq:
+
+FAQ
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+
+  The main cause of this issue is that your current platform is
+  not supported. Please check that you are using Python 2.7 series.
+  Besides, pypi only supports manylinux1 standard, you'll need to
+  upgrade your pip to >9.0.0. Then run the below command:
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  If the problem still exists, run the following command:
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  Then you'll get supported package suffixes, then check if it matches
+  the file name of the whl package. You can find default whl package at
+  `here <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ .
+
+  If your system supports linux_x86_64 but the whl package is manylinux1_x86_64,
+  you'll need to update pip to the latest version; If your system supports
+  manylinux1_x86_64 but the whl package is linux_x86_64 you can rename the
+  file to manylinux1_x86_64 suffix and then install.
diff --git a/doc/fluid/design/algorithm/images/asgd.gif b/doc/fluid/design/algorithm/images/asgd.gif
new file mode 100644
index 0000000000000000000000000000000000000000..4a0da7bf6df9326a2aab1638b77c5455c18b8c4e
Binary files /dev/null and b/doc/fluid/design/algorithm/images/asgd.gif differ
diff --git a/doc/fluid/design/algorithm/images/theta_star.gif b/doc/fluid/design/algorithm/images/theta_star.gif
new file mode 100644
index 0000000000000000000000000000000000000000..dd24d33e124396be3fc410c9b12f33148f64efe2
Binary files /dev/null and b/doc/fluid/design/algorithm/images/theta_star.gif differ
diff --git a/doc/fluid/design/algorithm/index_cn.rst b/doc/fluid/design/algorithm/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0883a9dc9c457f393ac1bdc930cb47ebcb0a25d9
--- /dev/null
+++ b/doc/fluid/design/algorithm/index_cn.rst
@@ -0,0 +1,7 @@
+梯度更新算法
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  parameter_average.md
diff --git a/doc/fluid/design/algorithm/index_en.rst b/doc/fluid/design/algorithm/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..59fe68dcf79ce2ef90b9adc829a0db45a4f0b3dc
--- /dev/null
+++ b/doc/fluid/design/algorithm/index_en.rst
@@ -0,0 +1,7 @@
+Gradient Update Algorithm
+--------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  parameter_average.md
diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md
new file mode 100644
index 0000000000000000000000000000000000000000..28ad6495d97515442eb8af2050158829814acd33
--- /dev/null
+++ b/doc/fluid/design/algorithm/parameter_average.md
@@ -0,0 +1,74 @@
+# Averaging Parameter in PaddlePaddle
+
+## Why Averaging
+In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible.
+
+Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
+
+Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/theta_star.gif"/><br/> . The averaging is done as follows:
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/asgd.gif"><br />
+</p>
+
+We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
+
+### How to perform Parameter Averaging in PaddlePaddle
+
+Parameter Averaging in PaddlePaddle works in the following way during training :
+1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer
+2. The optimizer itself is responsible for updating the parameters.
+3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
+    1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches.
+    2. However, saving all N instances of the parameters in memory is not feasible.
+    3. Therefore, an approximation algorithm is used.
+
+Hence, overall we have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved.
+
+During the testing/saving the model phase, we perform the following steps:
+1. Perform the delayed operations.
+2. Save current values of the parameters to a temporary variable.
+3. Replace the values of the parameters with the averaged values.
+4. Perform testing and/or save the parameters.
+5. Restore the values of the parameters once done.
+
+### How to implement Averaging of Parameter in PaddlePaddle
+
+We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training.
+
+**Advantages**:
+- Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op.
+- Makes it easy for the users to customize and extend the framework.
+
+**Disadvantages**:
+- Implementation requires re-writing the averaging methodology in Python.
+
+### Low-Level implementation
+
+In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input:
+- the optimizer
+- the window_size to keep the updates
+
+The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
+
+The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
+
+### Python API implementation for ParameterAverageOptimizer
+
+Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following:
+- Any optimizer (RMSProp , AdaGrad etc.)
+- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. Move the averaged value to a buffer when window is full to avoid loss of precision.
+
+Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
+We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.cc)
+
+#### Creation of the ParameterAverageOptimizer operator
+There are two ways for creating the ParameterAverageOptimizer op:
+1. We create the op immediately while building the computation graph.
+2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
+
+The proposal is to add the op immediately while building the computation graph.
+
+#### High-level API
+
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8ded0ad22f4013a521bf3bee260565dc5cf855ae
--- /dev/null
+++ b/doc/fluid/design/concepts/README.md
@@ -0,0 +1,174 @@
+A few months ago when we were trying to replace CMake with Bazel, @emailweixu suggested that we rewrite those handy Bazel functions using CMake. Now it seems that it's the right time to get this done, as we are facing problems from the porting of Majel and the development of the new parameter server using Go and C++.
+
+Here are some initial thoughts. Your comments are welcome!
+
+# Required CMake Function
+
+I think we need only the following few CMake functions to make a project description mean and clean:
+
+<table>
+<thead>
+<tr>
+<th>C++</th>
+<th>CUDA C++</th>
+<th>Go</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>cc_library </td>
+<td>nv_library </td>
+<td>go_library </td>
+</tr>
+<tr>
+<td>cc_binary </td>
+<td>nv_binary </td>
+<td>go_binary </td>
+</tr>
+<tr>
+<td> cc_test </td>
+<td> nv_test </td>
+<td> go_test </td>
+</tr>
+</tbody>
+</table>
+
+
+- The `_library` functions generate  .a files from source code.
+- The `_binary` functions generate executable binary files.
+- The `_test` functions generate executable unit test files. They work like `_binary` but link `-lgtest` and `-lgtest_main`.
+
+The difference between `nv_` functions and `cc_` functions is that the former use `nvcc` instead of the system-default C++ compiler.
+
+Both `nv_` and `cc_` functions enable C++11 (-std=c++11).
+
+Also,
+
+- to describe external dependencies, we need `external_library`.
+- to build shared libraries, we need `shared_library`.
+
+## An Example Project
+
+Suppose that we have aforementioned functions defined in our `/cmake` directory.  The following example `CMakeLists.txt` describes a project including the following source files:
+
+- tensor.h
+- tensor.cc
+- tensor_test.cc
+- ops.h
+- ops.cu
+- ops_test.cu
+- api.go
+- api_test.go
+
+Suppose that ops.cu depends on CUDNN.
+
+```cmake
+# cc_binary parses tensor.cc and figures out that target also depend
+# on tensor.h.
+cc_binary(tensor
+  SRCS
+  tensor.cc)
+
+# The dependency to target tensor implies that if any of
+# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built.
+cc_test(tensor_test
+  SRCS
+  tensor_test.cc
+  DEPS
+  tensor)
+
+# I don't have a clear idea what parameters external_library need to
+# have.  @gangliao as a CMake expert would have better ideas.
+external_library(cudnn
+  ....)
+
+# Suppose that ops.cu depends on external target CUDNN.  Also, ops.cu
+# include global functions that take Tensor as their parameters, so
+# ops depend on tensor.  This implies that if any of tensor.{h.cc},
+# ops.{h,cu} is changed, ops need to be re-built.
+nv_library(ops
+  SRCS
+  ops.cu
+  DEPS
+  tensor
+  cudnn)  # cudnn is defined later.
+
+nv_test(ops_test
+  SRCS
+  ops_test.cu
+  DEPS
+  ops)
+
+# Because api.go defines a GO wrapper to ops and tensor, it depends on
+# both.  This implies that if any of tensor.{h,cc}, ops.{h,cu}, or
+# api.go is changed, api need to be re-built.
+go_library(api
+  SRCS
+  api.go
+  DEPS
+  tensor # Because ops depend on tensor, this line is optional.
+  ops)
+
+go_test(api_test
+  SRCS
+  api_test.go
+  DEPS
+  api)
+
+
+# This builds libapi.so.  shared_library might use CMake target
+# api_shared so to distinguish it from above target api.
+shared_library(api
+  DEPS
+  api)
+
+```
+
+## Implementation
+
+As the above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph.  It also uses this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
+
+## Using Package Manager For Go
+
+Building Go binaries and libraries requires satisfying their dependencies. Generally,
+we can do `go get ./...` to download and compile all external dependencies. The
+problems are:
+
+1. `go get` will always get the latest code from the default branch of the
+    remote repo, so changes in dependencies might break the build. This is very
+    different from what we already have in `cmake/external`, which downloads a
+    specific version or commit id of the dependency.
+1. Some locations can not access external dependencies through the internet, as mentioned
+   in https://github.com/PaddlePaddle/Paddle/issues/2605. Using package management
+   tools can package the dependencies as a "vendor" package, which can be mirrored
+   at many cloud file hosting services, so users who want to compile paddle by themselves can
+   download this "vendor" package from a mirror site.
+
+### Choose A Suitable Tool
+
+As mentioned by @wangkuiyi, [this page](https://github.com/golang/go/wiki/PackageManagementTools)
+lists dozens of Go package managers. We choose the tool using the following principles:
+
+- Most "active" projects with more stars, more pull requests or commits
+- Widely used project
+
+After comparing all these projects, we shall choose between the most popular
+tools: Godep and Glide.
+
+Here's a brief comparison between Godep and Glide:
+https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are
+also many complaints about using `Godep`. A new "official" package
+management tool has also been started at https://github.com/golang/dep to resolve
+such problems, but it's currently at Alpha stage. So the best choice now is
+glide obviously.
+
+### Manage Go Packages
+
+- Dependencies: `go/glide.yaml` will store the dependencies and their versions which
+  are directly imported by paddle. `go/glide.lock` will store all dependencies recursively
+  with their commit id. Builds will "lock" to these packages if we don't `glide up`
+  them.
+- Vendor package: the `go/vendor` directory will be generated when running the `cmake` command. `cmake`
+  will download the code corresponding to `go/glide.lock`. If we put a vendor folder
+  under `go/`, cmake will just check the commit id of the packages under the folder;
+  if the commit id matches, there will be no download at all.
diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md
new file mode 100644
index 0000000000000000000000000000000000000000..3757cd055c818be1e63ee8c0f000f4dd299b59f4
--- /dev/null
+++ b/doc/fluid/design/concepts/block.md
@@ -0,0 +1,375 @@
+# Design Doc: Block and Scope
+
+## The Representation of Computation
+
+Both deep learning systems and programming languages help users describe computation procedures.  These systems use various representations of computation:
+
+- Caffe, Torch, and Paddle: sequences of layers.
+- TensorFlow, Caffe2, Mxnet: graph of operators.
+- PaddlePaddle: nested blocks, like C++ and Java programs.
+
+## Block in Programming Languages and Deep Learning
+
+In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators.
+
+Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
+
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>for, while loop </td>
+<td>RNN, WhileOp </td>
+</tr>
+<tr>
+<td>if, if-else, switch </td>
+<td>IfElseOp, SwitchOp </td>
+</tr>
+<tr>
+<td>sequential execution </td>
+<td>a sequence of layers </td>
+</tr>
+</tbody>
+</table>
+
+
+A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.
+
+## Stack Frames and the Scope Hierarchy
+
+The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
+
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>stack </td>
+<td>scope hierarchy </td>
+</tr>
+<tr>
+<td>stack frame  </td>
+<td>scope </td>
+</tr>
+<tr>
+<td>push at entering block </td>
+<td>push at entering block </td>
+</tr>
+<tr>
+<td>pop at leaving block </td>
+<td>destroy when minibatch completes </td>
+</tr>
+</tbody>
+</table>
+
+
+1. In traditional programs:
+
+   - When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables.
+   - After the execution leaves the right curly brace, the runtime pops the frame.
+   - The maximum number of frames in the stack is the maximum depth of nested blocks.
+
+1. In PaddlePaddle
+
+   - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables.
+   - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass.  So it has a stack forest known as a *scope hierarchy*.
+   - The height of the highest tree is the maximum depth of nested blocks.
+   - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy.
+
+## Use Blocks in C++ and PaddlePaddle Programs
+
+Let us consolidate the discussion by presenting some examples.
+
+### Blocks with `if-else` and `IfElseOp`
+
+The following C++ program shows how blocks are used with the `if-else` structure:
+
+```c++
+namespace pd = paddle;
+
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+  int z = x + y;
+  o1 = z;
+  o2 = pd::layer::softmax(z);
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
+}
+
+```
+
+An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](../execution/if_else_op.md) is as follows:
+
+```python
+import paddle as pd
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+In both examples, the left branch computes `x+y` and `softmax(x+y)`, and the right branch computes `fc(z)` and `fc(z)+1`.
+
+The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.
+
+
+### Blocks with `for` and `RNNOp`
+
+The following RNN model in PaddlePaddle from the [RNN design doc](../dynamic_rnn/rnn.md) :
+
+```python
+x = sequence([10, 20, 30]) # shape=[None, 1]
+m = var(0) # shape=[1]
+W = var(0.314, param=true) # shape=[1]
+U = var(0.375, param=true) # shape=[1]
+
+rnn = pd.rnn()
+with rnn.step():
+  h = rnn.memory(init = m)
+  h_prev = rnn.previous_memory(h)
+  a = layer.fc(W, x)
+  b = layer.fc(U, h_prev)  
+  s = pd.add(a, b)
+  act = pd.sigmoid(s)
+  rnn.update_memory(h, act)
+  rnn.output(a, b)
+o1, o2 = rnn()
+```
+has its equivalent C++ program as follows
+
+```c++
+int* x = {10, 20, 30};
+int* m = {0};
+int* W = {0.314};
+int* U = {0.375};
+
+int mem[sizeof(x) / sizeof(x[0]) + 1];
+int o1[sizeof(x) / sizeof(x[0]) + 1];
+int o2[sizeof(x) / sizeof(x[0]) + 1];
+for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) {
+  int x = x[i-1];
+  if (i == 1) mem[0] = m;
+  int a = W * x;
+  int b = Y * mem[i-1];
+  int s = fc_out + hidden_out;
+  int act = sigmoid(sum);
+  mem[i] = act;
+  o1[i] = act;
+  o2[i] = hidden_out;
+}
+```
+
+## Compilation and Execution
+
+Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference.
+
+The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file.
+
+## The "Binary Executable File Format"
+
+The definition of the protobuf message is as follows:
+
+```protobuf
+message BlockDesc {
+  repeated VarDesc vars = 1;
+  repeated OpDesc ops = 2;
+}
+```
+
+The step net in above RNN example would look like
+
+```
+BlockDesc {
+  vars = {
+    VarDesc {...} // x
+    VarDesc {...} // h
+    VarDesc {...} // fc_out
+    VarDesc {...} // hidden_out
+    VarDesc {...} // sum
+    VarDesc {...} // act
+  }
+  ops = {
+    OpDesc {...} // matmul
+    OpDesc {...} // add_two
+    OpDesc {...} // sigmoid
+  }
+};
+```
+
+Also, the RNN operator in above example is serialized into a protobuf message of type `OpDesc` and would look like:
+
+```
+OpDesc {
+  inputs = {0} // the index of x in vars of BlockDesc above
+  outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
+  attrs {
+    "states" : {1} // the index of h
+    "step_net" : <above step net>
+  }
+};
+```
+
+This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block.
+
+
+## The Compilation of Blocks
+
+During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
+
+VarDesc in a block should have its name scope to avoid local variables affecting parent block's name scope.
+Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that is stored in the parent block. For example:
+
+```python
+a = pd.Variable(shape=[20, 20])
+b = pd.fc(a, params=["fc.w", "fc.b"])
+
+rnn = pd.create_rnn()
+with rnn.stepnet():
+    x = a.as_step_input()
+    # reuse fc's parameter
+    fc_without_b = pd.get_variable("fc.w")
+    rnn.output(fc_without_b)
+
+out = rnn()
+```
+The method `pd.get_variable` can help retrieve a Variable by the name. The Variable may be stored in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance.
+
+In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc.
+
+To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers.
+
+`SymbolTable` can do the following:
+
+- store the definitions (some names and attributes) of variables and operators,
+- verify if a variable was declared,
+- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
+
+
+```c++
+// Information in SymbolTable is enough to trace the dependency graph. So maybe
+// the Eval() interface takes a SymbolTable is enough.
+class SymbolTable {
+ public:
+  SymbolTable(SymbolTable* parent) : parent_(parent) {}
+
+  OpDesc* NewOp(const string& name="");
+
+  // TODO determine whether name is generated by python or C++.
+  // Currently assume that a unique name will be generated by C++ if the
+  // argument name is left default.
+  VarDesc* Var(const string& name="");
+
+  // find a VarDesc by name, if recursive is true, find parent's SymbolTable
+  // recursively.
+  // this interface is introduced to support InferShape, find protobuf messages
+  // of variables and operators, pass pointers into InferShape.
+  //
+  // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should
+  // be proposed and embedded into pybind to enable python operation on C++ pointers.
+  VarDesc* FindVar(const string& name, bool recursive=true);
+
+  OpDesc* FindOp(const string& name);
+
+  BlockDesc Compile() const;
+
+ private:
+  SymbolTable* parent_;
+
+  map<string, OpDesc> ops_;
+  map<string, VarDesc> vars_;
+};
+```
+
+After all the description of variables and operators is added into SymbolTable,
+the block has enough information to run.
+
+The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions.
+
+
+```c++
+namespace {
+
+class Block : OperatorBase {
+public:
+  Block(const BlockDesc& desc) desc_(desc) {}
+
+  void InferShape(const framework::Scope& scope) const override {
+    if (!symbols_ready_) {
+      CreateVariables(scope);
+      CreateOperators();
+    }
+    // should run InferShape first.
+    for (auto& op : runtime_table_.ops()) {
+      op->InferShape(scope);
+    }
+  }
+
+  void Run(const framework::Scope& scope,
+           const platform::Place& place) const override {
+    PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
+    for (auto& op : runtime_table_.ops()) {
+      op->Run(scope, place);
+    }
+  }
+
+  void CreateVariables(const framework::Scope& scope);
+  void CreateOperators();
+
+  // some other necessary interfaces of NetOp are listed below
+  // ...
+
+private:
+  BlockDesc desc_;
+  bool symbols_ready_{false};
+};
+```
+
+## The Execution of Blocks
+
+Block inherits from OperatorBase, which has a Run method.
+Block's Run method will run its operators sequentially.
+
+There is another important interface called `Eval`, which takes some arguments called targets and generates a minimal graph which treats targets as the end points and creates a new Block. After `Run`, `Eval` will get the latest value and return the targets.
+
+The definition of Eval is as follows:
+
+```c++
+// clean a block description by targets using the corresponding dependency graph.
+// return a new BlockDesc with minimal number of operators.
+// NOTE: The return type is not a Block but the block's description so that this can be distributed
+// to a cluster.
+BlockDesc Prune(const BlockDesc& desc, vector<string> targets);
+
+void Block::Eval(const vector<string>& targets,
+                 const framework::Scope& scope,
+                 const platform::DeviceContext& dev_ctx) {
+  BlockDesc min_desc = Prune(desc_, targets);
+  Block min_block(min_desc);
+  min_block.Run(scope, dev_ctx);
+}
+```
diff --git a/doc/fluid/design/concepts/cpp_data_feeding.md b/doc/fluid/design/concepts/cpp_data_feeding.md
new file mode 100644
index 0000000000000000000000000000000000000000..aabc1ba75a67c5767d409bd6e7e6240dec86b16c
--- /dev/null
+++ b/doc/fluid/design/concepts/cpp_data_feeding.md
@@ -0,0 +1,204 @@
+# C++ Data Feeding
+
+While using Paddle V2 API for training, data feeding completely depends on the Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required.
+
+In this document, we show the fundamental design of a C++ data feeding process, which includes data reading, shuffling and batching.
+
+## Overview
+
+![](images/readers.png)
+
+## Reader
+
+In order to handle the above-mentioned problem, a new concept called 'Reader' is introduced. `Reader` is a series of inherited classes which can be held by our `Variable` and they are used to read or process file data.
+
+
+### ReaderBase
+
+`ReaderBase` is the abstract base class for all readers. It defines the interface for all readers.
+
+```cpp
+class ReaderBase {
+ public:
+  // Reads the next batch of data. (A 'batch' can be only one instance)
+  // If the next batch doesn't exist, it throws an exception
+  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+  
+  // Checks whether the next instance exists.
+  virtual bool HasNext() = 0;
+  
+  // Reinitializes the reader and read the file from the beginning.
+  virtual void ReInit() = 0;
+
+  virtual ~ReaderBase();
+};
+```
+
+### FileReader
+
+`FileReader` is derived from the `ReaderBase`. It is still an abstract class and will further be derived by Readers of respective specific format.
+
+```cpp
+class FileReader : public ReaderBase {
+ public:
+  explicit FileReader(const std::vector<DDim>& dims);
+
+  void ReadNext(std::vector<LoDTensor>* out) override;
+
+ protected:
+  virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
+
+ private:
+  std::vector<DDim> dims_;
+};
+```
+
+A file reader binds with a single file and reads one data instance at a time. Each type of file reader shall implement its own `ReadNextImpl()`, `HasNext()` and `ReInit()`.
+
+The `ReadNextImpl()` is invoked by `ReadNext()`. Besides invoking `ReadNextImpl()`, `ReadNext()` is also responsible for checking the output, making sure that each shape of `LoDTensor` in `*out` is consistent with the one in `dims_`.  
+
+### DecoratedReader
+
+A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some processing on them(shuffling,  batching or something else), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
+
+```cpp
+class DecoratedReader : public ReaderBase {
+ public:
+  explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
+    PADDLE_ENFORCE_NOT_NULL(reader_);
+  }
+
+  void ReInit() override { reader_->ReInit(); }
+
+  bool HasNext() const override { return reader_->HasNext(); }
+
+ protected:
+  ReaderBase* reader_;
+};
+```
+
+Both the `FileReader` and `DecoratedReader` share exactly the same interface as defined in `ReaderBase`. So they can be decorated multiple times: We can **shuffle** a reader's outputs and then **batch** the shuffled outputs. The interface consistency also allows related ops to use readers without knowing their underlying type.
+
+### MultipleReader
+
+Each `FileReader` binds to a single file and is single-threaded. However, sometimes we need to read data from more than one file. In this case, it's not enough to only have `FileReader` and `DecoratedReader`.
+
+So `MultipleReader` is introduced. It is also derived from `ReaderBase`. A `MultipleReader` holds several prefetching `FileReaders` and these readers run concurrently. Another pivotal part of a `MultipleReader` is a buffer channel. The channel collects data yielded by all prefetching readers and lets subsequent OPs or decorated readers fetch data without being concerned about how the multiple readers are scheduled.
+
+![](images/multiple_reader.png)
+
+This graph shows how a `MultipleReader` works with three prefetching file readers and two GPUs. There is a queue of files which are going to be read. Each time a prefetching file reader is free (i.e., it has finished reading one file), it fetches a new file from the queue. Each prefetching file reader runs in a separate prefetch thread and dumps its outputs to the same channel.
+
+To the subsequent two decorated readers, the `MultipleReader` is **a single reader**. They don't need to be concerned about how prefetch readers are scheduled. They only need to invoke `MultipleReader::ReadNext()` to get the next data from the buffer channel.
+
+### ReaderHolder
+
+Different readers belong to different class types. This leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code:
+
+```cpp
+var->Get<ReaderBase>("batch_reader");
+```
+
+We would have to write:
+
+```cpp
+var->Get<BatchReader>("batch_reader");
+```
+
+This requires that in order to get a reader from a variable, every time, we must know the reader's type exactly. This is nearly impossible.
+
+To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which hides reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
+
+## Related Operators
+
+To create and invoke readers, some new ops are introduced:
+
+### Operators That Create Readers
+
+Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
+
+However, direct usage of file readers' creation ops is not recommended because a file reader can only read one file via a single thread. Using `OpenFilesOp` is a better choice.
+
+### OpenFilesOp
+
+The `OpenFilesOp` is the creation op of `MultipleReader`. It takes no input but requires a list of file names as one of its attributes. The newly created `MultipleReader` then creates its own prefetching readers according to given file names.
+
+To make sure that created prefetching readers match file formats, we need a name prefix rule to append file format tags to file names, as well as a file reader registry mechanism to map file format tags to their corresponding file readers' constructors.
+
+### HasNextOp
+
+`HasNextOp` is used to check whether the next data batch exists via the reader's `HasNext()` interface.
+
+### ResetOp
+
+`ResetOp` is used to reset a reader via its `ReInit()` interface.
+
+### ReadOp
+
+A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader's `ReadNext()` function and gets a new batch of data (or only one instance of data, if we use a file reader directly). The output data of a reader are in the form of `std::vector<LoDTensor>`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables.
+
+## Program with Readers
+
+A `Program` holds readers as its persistable variables. These variables are created by `CreateReaderOp` or `OpenFilesOp`. These ops shall run only once. So they shall be settled in the `startup_program`. `HasNextOp`, `ResetOp` and `ReadOp` are required by training loop, so they shall be in the `main_program`.
+
+The ops of a `startup_program` with readers would be like this:
+
+```
+multiple_reader = open_files_op(...)
+batch_reader = create_batch_reader_op(multiple_reader)
+double_buffer_reader = create_double_buffer_op(batch_reader)
+... (other initializers)
+```
+
+The forwarding ops of the corresponding `main_program` would be like this:
+
+```
+not_completed = true
+pass_count = 0
+while_op(not_completed) {
+    has_next = has_next_op(double_buffer_reader)
+    if_else_op(has_next) {
+        batch_data = read_op(double_buffer_reader)
+        ... (subsequent training ops)
+    } else {
+        reset_op(double_buffer_reader)
+        increase_op(pass_count)
+        not_completed = less_than_op(pass_count, reqiured_pass_num)
+    }
+}
+```
+
+A few important considerations for these programs are as follows:
+
+1. `not_completed`, `pass_count` and other variables shown above are all Fluid Variables.
+
+2. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
+
+3. All readers exist in both `startup_program` and `main_program`. And they are persistable.
+
+### Simplify Configuration by MultiPassReader
+
+The Program configuration mentioned above is complicated. Users need to be very familiar with the concepts of Program and Block to avoid making mistakes in their code. To make the usage of C++ readers more friendly to new users, we introduce `MultiPassReader`.
+
+`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several training passes. It takes the number of passes to run as one of its attributes ('pass_num') and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches the EOF, the multi-pass reader checks whether it has completed the given number of training passes. If not, the underlying reader will be re-initialized and start a new pass automatically. Before completing the whole training, the return of MultiPassReader's `HasNext()` will always be `true`.
+
+With `MultiPassReader`, the startup program would be like this:
+
+```
+multiple_reader = open_files_op(...)
+batch_reader = create_batch_reader_op(multiple_reader)
+multi_pass_reader = create_multi_pass_reader_op(batch_reader)
+double_buffer_reader = create_double_buffer_op(multi_pass_reader)
+... (other initializers)
+```
+
+The forwarding part of the corresponding `main_program` would be like this:
+
+```
+not_completed = true
+while_op(not_completed) {
+    batch_data = read_op(double_buffer_reader)
+    ... (subsequent training ops)
+    not_completed = has_next_op(double_buffer_reader)
+}
+```
diff --git a/doc/fluid/design/concepts/executor.md b/doc/fluid/design/concepts/executor.md
new file mode 100644
index 0000000000000000000000000000000000000000..3fcddf4dd90f826ee1a16713f4371fb010f8eac5
--- /dev/null
+++ b/doc/fluid/design/concepts/executor.md
@@ -0,0 +1,29 @@
+# Executor Design Doc
+
+## Motivation
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
+[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used; the executor explicitly executes the stored precompiled code.
+
+## Overview
+
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.
+
+## Executor
+
+The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
+It is very similar to how a push stack frame works when entering a block, following which it cleans up all the temporary variables when a mini-batch is finished. It does not, however, have the stack frame pop process.
+
+### The interface
+```c++
+  Executor(places);
+```
+An executor does not own any computing resources; a user can only construct an executor using the specified places.
+
+### Running an Executor
+
+```
+  void Run(ProgramDesc, Scope, block_id, create_local_scope);
+```
+An `Executor` only provides a unified way to execute `ProgramDesc`. `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished.
diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
new file mode 100644
index 0000000000000000000000000000000000000000..1f86b99e5197c3e0b85fd76fe704520ef21b06d3
--- /dev/null
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@@ -0,0 +1,128 @@
+# Design Doc: Functions, Operators, and Layers
+
+In a DL system, we can compose one or more fine grained operators into a coarse grained one.  For example, the FC layer can be composed of a multiplication operator and an add operator.
+
+Historically, some fine grained operations are known as operators, and some coarse level ones are known as layers.  But we need a well-defined separation.
+
+In general, operators are those very fine grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions:
+
+```c++
+template <typename T> T add(T x, T y) { return x + y; }
+template <typename T> T mul(T x, T y) { return x * y; }
+```
+
+Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name.  A C macro can do this. For example, the following macro invocation
+
+```c++
+MAKE_FUNCTION_OPERATOR(mul);
+```
+
+generates
+
+```c++
+template <typename T> class mulOp : public OperatorBase {...};
+REGISTER_OP(mulOp<float32>, "mul");
+```
+
+so that in Python we can create operator mul by:
+
+```python
+X1 = Var()
+X2 = Var()
+Y = Var()
+paddle.cpp.create_operator("mul", input=[X1, X2], output=Y)
+```
+
+Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`:
+
+```c++
+template <typename T>
+class FCOp : public OperatorBase {
+ public:
+  void Run(...) {
+    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b"));
+  }
+};
+REGISTER_OP(FCOp, "fc");
+```
+
+We need to support such composition in Python as well.  To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`.  This higher level operator API should be compatible with the layer API.
+
+Let's explain using an example.  Suppose that we are going to compose the FC using mul and add in Python, we'd like to have Python functions `mul` and `add` defined in module `operator`:
+
+```python
+def operator.mul(X1, X2):
+    O = Var()
+    paddle.cpp.create_operator("mul", input={X1, X2}, output=O)
+    return O
+
+def operator.add(X1, X2):
+    O = Var()
+    paddle.cpp.create_operator("add", input={X1, X2}, output=O)
+    return O
+```
+
+The above code snippets are automatically generated.  Given them, users can define
+
+```python
+def layer.fc(X):
+    W = Var()
+    b = Var()
+    return operator.add(operator.mul(X, W), b)
+```
+
+If we don't have `operator.mul` and `operator.add`, the definition of `layer.fc` would be complicated:
+
+```python
+def layer.fc(X):
+    W = Var()
+    b = Var()
+    O1 = Var()
+    paddle.cpp.create_operator("mul", input=[X, W], output=O1)
+    O2 = Var()
+    paddle.cpp.create_operator("add", input=[O1, b], output=O2)
+    return O2
+```
+
+We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`.  So we have the following concepts in the above illustrative example:
+
+<table>
+<thead>
+<tr>
+<th>C++ functions/functors</th>
+<th>mul</th>
+<th>add</th>
+<th></th>
+<th></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>C++ operator class </td>
+<td>mulOp</td>
+<td>addOp </td>
+<td>FCOp </td>
+<td></td>
+</tr>
+<tr>
+<td>Python binding  </td>
+<td>operator.mul</td>
+<td> operator.add </td>
+<td>operator.fc </td>
+<td></td>
+</tr>
+<tr>
+<td>Python function   </td>
+<td></td>
+<td></td>
+<td> </td>
+<td>layer.fc</td>
+</tr>
+</tbody>
+</table>
+
+
+This is how we differentiate layer and operators in PaddlePaddle:
+
+- those defined in C++ that have a lightweight Python wrapper in module `operator` are operators; whereas
+- those that don't have C++ implementations but have a Python implementation that composes C++ operators are known as layers.
diff --git a/doc/fluid/design/concepts/images/multiple_reader.png b/doc/fluid/design/concepts/images/multiple_reader.png
new file mode 100644
index 0000000000000000000000000000000000000000..b22126b31db4982c13fc3a0827805e6aaf955046
Binary files /dev/null and b/doc/fluid/design/concepts/images/multiple_reader.png differ
diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.dot b/doc/fluid/design/concepts/images/parallel_executor_overview.dot
new file mode 100644
index 0000000000000000000000000000000000000000..40753cb140540c08d9d4c449b8d377e315280436
--- /dev/null
+++ b/doc/fluid/design/concepts/images/parallel_executor_overview.dot
@@ -0,0 +1,83 @@
+digraph G {
+  subgraph cluster_init {
+    label="Initialization"
+    startup_program [label="startup", shape=box]
+    node_w_g0 [label="W\nGPU0"]
+    startup_program -> node_w_g0 [label="Initialize"]
+    node_w_g1 [label="W\nGPU1"]
+    node_w_g0 -> node_w_g1 [label="broadcast"]
+  }
+
+  subgraph cluster_train {
+    label="forward_backward"
+
+    subgraph cluster_gpu0 {
+      label="GPU0"
+      fc_0 [label="fc\nGPU0", shape=box]
+      hidden_0 [label="hidden\nGPU0"]
+      node_w_g0 -> fc_0
+      fc_0 -> hidden_0
+      loss0 [label="loss\nGPU0"]
+      hidden_0 -> loss0 [label="many ops omitted"]
+      scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box]
+      loss_g0 [label="loss_grad\nGPU0"]
+      scale_loss_0->loss_g0
+      
+      fc_g_0 [label="w_grad\nGPU0", shape=box]
+      loss0 -> fc_g_0
+      loss_g0 -> fc_g_0
+      hidden_0 -> fc_g_0
+    }
+
+    subgraph cluster_gpu1 {
+      label="GPU1"
+      fc_1 [label="fc\nGPU1", shape=box]
+      hidden_1 [label="hidden\nGPU1"]
+      node_w_g1 -> fc_1
+      fc_1 -> hidden_1
+      loss1 [label="loss\nGPU1"]
+      hidden_1 -> loss1 [label="many ops omitted"]
+      scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box]
+      loss_g1 [label="loss_grad\nGPU1"]
+      scale_loss_1->loss_g1
+      
+      fc_g_1 [label="w_grad\nGPU1", shape=box]
+      loss1 -> fc_g_1
+      loss_g1 -> fc_g_1
+      hidden_1 -> fc_g_1
+    }
+  }
+
+  all_reduce_w [label="Merge Gradients(AllReduce)", shape=box]
+  fc_g_0 -> all_reduce_w
+  fc_g_1 -> all_reduce_w
+
+  fc_g_0_merged [label="w_grad\nMerged\nGPU0"]
+  fc_g_1_merged [label="w_grad\nMerged\nGPU1"]
+  all_reduce_w -> fc_g_0_merged
+  all_reduce_w -> fc_g_1_merged
+
+  subgraph cluster_optimization {
+    label="Optimization"
+    subgraph cluster_opt_gpu0 {
+      label="GPU0"
+      sgd_0 [label="SGD Op\nGPU0", shape=box]
+
+      fc_g_0_merged -> sgd_0
+      node_w_g0 -> sgd_0
+      optimized_w_0 [label="Optimized W\nGPU0"]
+      sgd_0 -> optimized_w_0
+    }
+    subgraph cluster_opt_gpu1 {
+      label="GPU1"
+      sgd_1 [label="SGD Op\nGPU1", shape=box]
+
+      fc_g_1_merged -> sgd_1
+      node_w_g1 -> sgd_1
+      optimized_w_1 [label="Optimized W\nGPU1"]
+      sgd_1 -> optimized_w_1
+    }
+  }
+
+
+}
diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.png b/doc/fluid/design/concepts/images/parallel_executor_overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..d890c0ffee3b38dc7cb74a2b56c2ab4831532211
Binary files /dev/null and b/doc/fluid/design/concepts/images/parallel_executor_overview.png differ
diff --git a/doc/fluid/design/concepts/images/readers.png b/doc/fluid/design/concepts/images/readers.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd59168ce16c9e2a0ef45303c28c997cfd7740be
Binary files /dev/null and b/doc/fluid/design/concepts/images/readers.png differ
diff --git a/doc/fluid/design/concepts/index_cn.rst b/doc/fluid/design/concepts/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dcdc894937ff328e6002623275ca3c65e87b2bb0
--- /dev/null
+++ b/doc/fluid/design/concepts/index_cn.rst
@@ -0,0 +1,19 @@
+核心概念
+-------------
+
+.. toctree::
+  :maxdepth: 1
+
+  README.md
+  cpp_data_feeding.md
+  functions_operators_layers.md
+  program.md
+  variable.md
+  var_desc.md
+  tensor.md
+  tensor_array.md
+  lod_tensor.md
+  block.md
+  scope.md
+  executor.md
+  parallel_executor.md
diff --git a/doc/fluid/design/concepts/index_en.rst b/doc/fluid/design/concepts/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b85a3055746facaa642e8fc899976b58435f1ef2
--- /dev/null
+++ b/doc/fluid/design/concepts/index_en.rst
@@ -0,0 +1,19 @@
+Core Concepts
+--------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  README.md
+  cpp_data_feeding.md
+  functions_operators_layers.md
+  program.md
+  variable.md
+  var_desc.md
+  tensor.md
+  tensor_array.md
+  lod_tensor.md
+  block.md
+  scope.md
+  executor.md
+  parallel_executor.md
diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
new file mode 100644
index 0000000000000000000000000000000000000000..748488f6d5f2f1272e87b89047570632418da8dc
--- /dev/null
+++ b/doc/fluid/design/concepts/lod_tensor.md
@@ -0,0 +1,211 @@
+# Design Doc: LoD (Level-of-Detail) Tensor
+
+Like other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represents a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus, there is no need to pad sequences with zeros.
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>TensorFlow</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>RNN </td>
+<td>Support </td>
+<td>Support </td>
+</tr>
+<tr>
+<td>recursive RNN </td>
+<td>Support </td>
+<td>Support </td>
+</tr>
+<tr>
+<td>padding zeros </td>
+<td> Must </td>
+<td>No need </td>
+</tr>
+<tr>
+<td> blob data type </td>
+<td> Tensor</td>
+<td> LoDTensor </td>
+</tr>
+</tbody>
+</table>
+
+
+PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators.  The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences.  This document presents the design of LoD and LoDTensor.
+
+
+## The Challenge: Variable-length Sequences
+
+Most deep learning systems represent a mini-batch as a Tensor.  For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor.  Another example is that each mini-batch contains N sentences, where each word is a D-dimensional one-hot vector.  Suppose that all sentences have the same length L, we can represent this mini-batch by a NxLxD tensor.
+
+Both examples show that the elements of sequences are usually of the same size.  In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors.  It doesn't make sense to allow variable-sized images, as that would require transformations like convolution to handle variable-sized Tensors.
+
+The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences.  Also, sequences might consist of sub-sequences.
+
+
+## A Solution: The LoD Index
+
+To understand our solution, it is best to look at some examples.
+
+### A Mini-Batch of Sentences
+
+Let's imagine a mini-batch of 3 variable-length sentences composed of 3, 1, and 2 words, respectively.  We can represent the mini-batch by a (3+1+2)xD tensor plus some index information:
+
+```
+3   1 2
+||| | ||
+```
+
+where each `|` represents a D-dimensional word vector.  The numbers, 3, 1, and 2, form a 1-level LoD.
+
+### Recursive Sequences
+
+Let's check another example of a 2-level LoD Tensor.  Consider a mini-batch of three articles with 3, 1, and 2 sentences, and each sentence consists of a variable number of words:
+
+```
+3           1  2
+3   2  4    1  2  3
+||| || |||| |  || |||
+```
+
+### A Mini-Batch of Videos
+
+LoD tensors generalize to the case where elements are higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  Here is a mini-batch of 3 videos with 3, 1, and 2 frames, respectively.
+
+```
+3     1  2
+口口口 口 口口
+```
+
+The underlying tensor is of size (3+1+2)x640x480, and each `口` represents a 640x480 image.
+
+### A Mini-Batch of Images
+
+In traditional cases like a mini-batch with N fixed-sized images,  the LoD Tensor representation is as
+
+```
+1 1 1 1     1
+口口口口 ... 口
+```
+
+In this case, we don't lose any information by ignoring the many 1's in the index and simply considering this LoD Tensor as a usual Tensor:
+
+```
+口口口口 ... 口
+```
+
+### Model Parameters
+
+A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**.
+
+
+## The LoD Tensor
+
+Let us revisit the above example of the 2-level LoD Tensor
+
+```
+3           1  2
+3   2  4    1  2  3
+||| || |||| |  || |||
+```
+
+It is indeed a tree, where leaves are elementary sequences identified by **branches**.
+
+For example, the third sentence in the above example is identified by branch <0,2>, where 0 indicates the first article with length 3, and 2 indicates the third sentence in this article with length 4.
+
+### The LoD Index
+
+We can save the LoD index in the above example
+
+```
+3           1  2
+3   2  4    1  2  3
+```
+
+in a not-full 2D matrix:
+
+```c++
+typedef std::vector<std::vector<int> > LoD;
+```
+
+where
+
+- `LoD.size()` is the number of levels, or the maximum length of branches,
+- `LoD[i][j]` is the length of the j-th segment at the i-th level.
+
+## The Offset Representation
+
+To quickly access elementary sequences, we adopt an offset representation -- instead of saving the lengths, we save the beginning and ending elements of sequences.
+
+In the above example, we accumulate the length of elementary sequences:
+
+```
+3 2 4 1 2 3
+```
+
+into offsets
+
+```
+0  3  5   9   10  12   15
+   =  =   =   =   =    =
+   3  2+3 4+5 1+9 2+10 3+12
+```
+
+so we know that the first sentence is from word 0 to word 3, and the second sentence from word 3 to word 5.
+
+Similarly, the lengths in the top level LoD
+
+```
+3 1 2
+```
+
+are transformed into offsets of elements/words as follows:
+
+```
+0 3 4   6
+  = =   =
+  3 3+1 4+2
+```
+
+## Slicing of LoD Tensors
+
+
+When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences.  Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
+
+For example, the <2>-slice of the above example is
+
+```
+10      15
+10  12  15
+  || |||
+```
+
+and the <2,0>-slice of the above slice is
+
+```
+10  12
+  ||
+```
+
+## Length Representation vs Offset Representation
+
+The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult.
+Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API. 
+Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python:
+```Python
+# length representation of lod called recursive_sequence_lengths
+recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]]
+# Create a LoDTensor that has the above recursive_sequence_lengths info.
+# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood.
+tensor = fluid.LoDTensor(recursive_seq_lens)
+
+# Set/Change the recursive_sequence_lengths info of LoDTensor
+tensor.set_recursive_sequence_lengths([[3, 1, 2]])
+# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted 
+# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]]
+new_recursive_seq_lens = tensor.recursive_sequence_lengths()
+```
diff --git a/doc/fluid/design/concepts/parallel_executor.md b/doc/fluid/design/concepts/parallel_executor.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f88e27bed722e9f2f535e368926fe49b4e72e56
--- /dev/null
+++ b/doc/fluid/design/concepts/parallel_executor.md
@@ -0,0 +1,104 @@
+# ParallelExecutor
+
+## Background
+
+Neural network models are defined as a `ProgramDesc` in Fluid. The `ProgramDesc` can be executed by an interpreter (i.e. the `executor` concept in Fluid). The instructions or operators in a `Program` will be executed, and the results will be fetched on the Python side.
+
+The executor is a very naive interpreter. It runs operators one by one. We can use `Parallel.Do` to support data parallelism; however, because `ProgramDesc` lacks device information, it is not possible to optimize the performance of `Parallel.Do`.
+
+We want a `ProgramDesc` that can be run on different nodes, so it is better not to include device information in `ProgramDesc`. However, we can write a high-performance interpreter, which can hold an alternative intermediate representation of `ProgramDesc`, to take full advantage of multiple GPUs.
+
+ParallelExecutor is an interpreter of `ProgramDesc` which will [out-of-order execute](https://en.wikipedia.org/wiki/Out-of-order_execution) `Program` in data parallelism mode and maximise the utility of Multi-GPUs.
+
+
+## Overview of MultiGPUs logic
+
+The ParallelExecutor takes the startup program and main program as inputs. The parameters will be initialised on `GPU0` by the startup program and will be broadcast to the other GPUs. The main program will be duplicated onto every GPU. The gradients will be merged during each iteration, and each device will optimize parameters independently. Since the gradients on each device are merged before parameter optimization, the parameters will be the same on each device and there is no need to broadcast the parameters again.
+
+![alt](images/parallel_executor_overview.png)
+
+There are several optimizations for this logic.
+
+1. We use an alternate representation in ParallelExecutor. This is because the device information is critical for performance optimization.
+2. The execution is out-of-order, i.e., an operator will be executed whenever the inputs of the operator are ready.
+   * GPU is a high-performance device; a single CPU thread cannot keep one GPU fully occupied. So there is a thread pool to execute operators.
+   * Out-of-order execution also helps transpilers to generate `ProgramDesc`. There is no need to worry about the best-performing order of operators when implementing a transpiler.
+3. The streams of computation, merge gradients and fetch data are different.
+
+The performance of `ResNeXt152` on `TitanX` with `batch_size=12` is shown below.
+
+| Number of GPUs | 1 | 2 | 3 | 4|
+| --- | --- | --- | --- | --- |
+| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 |
+| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 |
+
+
+## Static single assignment Graph
+
+[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form) (`SSA` for short) is a common form for compiler optimization. To implement concurrent execution, we use an `SSA` graph as an intermediate representation of `ProgramDesc`.
+
+The `Program` is a directed acyclic graph, since a variable can be assigned multiple times. We enforce that a variable is assigned only once by adding a version number to variables. We parse the `Program` into an `SSA` graph. Also, ParallelExecutor duplicates the `Program` onto multiple devices. We also add a device number to variables and insert `NCCLAllReduce` into the graph.
+
+The data structure of `SSA` graph is:
+
+```c++
+struct VarHandleBase {
+  OpHandleBase* generated_op_;
+  vector<OpHandleBase*> pending_ops_;
+  
+  string name;
+  Place place;
+  size_t version;
+};
+
+struct OpHandleBase {
+  vector<OpHandleBase*> inputs_;
+  vector<OpHandleBase*> outputs_;
+};
+
+struct SSAGraph {
+  // vars on each devices. 
+  //   * the vars in each map in vector is on different device.
+  //   * the map is mapping a variable name to variable handles
+  //   with different versions
+  vector<std::unordered_map<string, vector<VarHandleBase>>> vars_;
+  
+  // All ops
+  vector<OpHandleBase> ops_;
+};
+```
+The variable handles are wrappers of `Variables`. The operator handles are wrappers of `OperatorBase`. Some `OpHandle`s are not an `OperatorBase`, such as `NCCLAllReduceOpHandle`, because `NCCLAllReduceOpHandle` will use new device contexts.
+
+When the `ProgramDesc` is converted into an `SSA` graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem also needs to be taken care of. The dummy variables, which represent the dependency between operators, will be manually inserted into the SSA graph to resolve the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem.
+
+## Execute SSA Graph
+
+The SSA graph can be out-of-order executed by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is
+
+1. Maintain a map from each operator to the number of inputs it still needs.
+2. If a variable is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the needed input number of its pending operators.
+3. If there is an operator whose needed input number has decreased to zero, run this operator.
+4. After running this operator, mark the variables as generated and repeat step 2 until all variables are generated.
+
+Operators can be run asynchronously. There is a thread pool to execute an `SSA` graph.
+
+## Synchronize GPU Kernels
+
+The GPU is a non-blocking device. Different streams need to be synchronized when switching streams. In the current implementation, the synchronization is based on the following algorithm:
+
+1. Each `OpHandle` records the `DeviceContext` that it uses.
+2. In `OpHandle::Run`, if the `DeviceContext` of the current operator is different from the `DeviceContext` of any input variable, wait for the operator that generates this input variable.
+
+The `wait` is implemented using one of two strategies:
+
+1. Invoke `DeviceContext->Wait()`. It will wait for all operators on this device context to complete.
+2. Use `cudaStreamWaitEvent` to send an event to the stream. It is a non-blocking call. The wait operations will be executed on the GPU.
+
+Generally, the `cudaStreamWaitEvent` strategy will have better performance. However, the `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed at runtime.
+
+## What's next?
+
+* Merging gradient of dense parameters has been done. However, the merging of sparse parameters has not been done.
+* The CPU version of Parallel Executor has not been implemented. The out-of-order logic will make CPU computation faster, too.
+* A better strategy to merge gradients can be introduced. We can shrink the gradients from `float32` to `int8` or `int4` while merging. It will significantly speed up multi-GPUs training without much loss of precision.
+* Combine with the multi-node implementation. Thanks to out-of-order execution, the send and receive operators can be blocking operators, and the transpiler does not need to be concerned about the best position of these operators.
diff --git a/doc/fluid/design/concepts/program.md b/doc/fluid/design/concepts/program.md
new file mode 100644
index 0000000000000000000000000000000000000000..cfcd21ecdb9d2844bf93ed98a56db09651077c40
--- /dev/null
+++ b/doc/fluid/design/concepts/program.md
@@ -0,0 +1,139 @@
+# Design Doc: PaddlePaddle Programs
+
+## Compile and Execution
+
+A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
+
+A simple example PaddlePaddle program can be found in [graph.md](../others/graph.md):
+
+```python
+x = layer.data("images")
+l = layer.data("label")
+y = layer.fc(x)
+cost = layer.mse(y, l)
+optimize(cost)
+train(cost, reader=mnist.train())
+```
+
+The first five lines of the above PaddlePaddle program generate, or compile, the `ProgramDesc` message.  The last line runs it.
+
+## Programs and Blocks
+
+The basic structure of a PaddlePaddle program is a set of nested blocks, as in a C++ or Java program.
+
+- program: some nested blocks
+- [block](./block.md):
+  - some local variable definitions, and
+  - a sequence of operators
+
+The concept of block comes from usual programs.  For example, the following C++ program has three blocks:
+
+```c++
+int main() { // block 0
+  int i = 0;
+  if (i < 10) { // block 1
+    for (int j = 0; j < 10; j++) { // block 2
+    }
+  }
+  return 0;
+}
+```
+
+The following PaddlePaddle program has three blocks:
+
+```python
+import paddle as pd  # block 0
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():  # block 1
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():  # block 2
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+## `BlockDesc` and `ProgramDesc`
+
+All protobuf messages are defined in `framework.proto`.
+
+`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
+
+```protobuf
+message BlockDesc {
+  required int32 parent = 1;
+  repeated VarDesc vars = 2;
+  repeated OpDesc ops = 3;
+}
+```
+
+The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks.
+
+All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
+
+```protobuf
+message ProgramDesc {
+  repeated BlockDesc blocks = 1;
+}
+```
+
+
+### Global Block
+
+The global block is the first one in the above array.
+
+## Operators that Use Blocks
+
+In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
+
+The definition of `OpDesc` shows that an operator could have some attributes:
+
+```protobuf
+message OpDesc {
+  AttrDesc attrs = 1;
+  ...
+}
+```
+
+and an attribute could be of type block, which is, in fact, a block ID as described above:
+
+```
+message AttrDesc {
+  required string name = 1;
+
+  enum AttrType {
+    INT = 1,
+    STRING = 2,
+    ...
+    BLOCK = ...
+  }
+  required AttrType type = 2;
+
+  optional int32 block = 10; // when type == BLOCK
+  ...
+}
+```
+
+## InferShape
+
+With this design, the InferShape function should take the following parameters:
+
+```c++
+void InferShape(int current_block,
+                int current_operator,
+                ProgramDesc* program // might change VarDesc values.
+                ) {
+  ...
+}
+```
+
+where
+
+- `current_block` indexes into `ProgramDesc::blocks`,
+- `current_operator` indexes into `BlockDesc::ops`.
diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md
new file mode 100644
index 0000000000000000000000000000000000000000..dffee8e02bacbc99bdfa8c54f1a146de340ad778
--- /dev/null
+++ b/doc/fluid/design/concepts/python_data_feeding.md
@@ -0,0 +1,130 @@
+# Python Data Feeding
+
+In the former implementation of Paddle Fluid, there are two ways to feed data:
+
+- Use `reader_op` on the backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For example, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details.
+
+- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance.
+
+In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `LoDTensorBlockingQueue` is designed to be shared by the Python and C++ side, while `LoDTensorArray` is pushed into the queue in Python side and `reader_op` in C++ side reads out the data from the queue.
+
+
+## Design of LoDTensorBlockingQueue
+`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` and accepts `std::vector<framework::LoDTensor>` with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer construction of `LoDTensorBlockingQueue`.
+
+```C++
+class LoDTensorBlockingQueueHolder;
+
+class LoDTensorBlockingQueue {
+  friend class LoDTensorBlockingQueueHolder;
+ private:
+  // `LoDTensorBlockingQueue` can only be constructed by 
+  // `LoDTensorBlockingQueueHolder::InitOnce()`
+  LoDTensorBlockingQueue(size_t capacity, const std::vector<framework::DDim>& dims);
+ 
+ public:
+  size_t Size() const { return queue_.Size(); } // Get the current size of the queue
+
+  size_t Cap() const { return queue_.Cap(); }// Get the capacity of the queue
+
+  void Close() { return queue_.Close(); }
+
+  bool IsClosed() const { return queue_.IsClosed(); }
+
+  // Block if Size() == Cap()
+  // Return false only when queue_.IsClosed() == true
+  bool Push(const std::vector<framework::LoDTensor> &lod_tensor_vec);
+  
+  // Block if Size() == 0.
+  // *Success == false when queue_.IsClosed() == true
+  std::vector<framework::LoDTensor> Pop(bool *success = nullptr);
+ 
+ private:
+  // Use reader::BlockingQueue as the inner data structure
+  BlockingQueue<std::vector<framework::LoDTensor>> queue_;
+  std::vector<framework::DDim> dims_;
+};
+
+class LoDTensorBlockingQueueHolder {
+ public:  
+  // Call the constructor of `LoDTensorBlockingQueue` to create queue_
+  // `InitOnce` can only be called once; otherwise an exception is raised
+  void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) {
+    PADDLE_ENFORCE(queue_ == nullptr);
+    queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
+  }
+
+  const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const { return queue_; }
+
+ private:
+  std::shared_ptr<LoDTensorBlockingQueue> queue_;
+};
+```
+
+There are some major points that must be taken into account:
+- `LoDTensorBlockingQueueHolder` should be a `Variable` in global scope, so that `reader_op` can find it when reading data.
+- A `Variable` of `LoDTensorBlockingQueueHolder` (rather than a `VarDesc`) must be created in Python code before `Executor::Run()` so that `Executor::Run()` can get the feeding data when it is called.
+- `Create_reader_op` should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input.
+
+
+## Release of the GIL in pybind
+`pybind11::gil_scoped_release` is used to release the GIL (Global Interpreter Lock) when the `LoDTensorBlockingQueue::Push()` or `Executor::Run()` methods are invoked from the Python side, allowing `LoDTensorBlockingQueue::Push()` and `Executor::Run()` to run in parallel.
+
+
+## Design of PyReader
+`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object.
+```C++
+class PyReader : public ReaderBase {
+ public:
+  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue);
+  
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    bool success;
+    *out = queue_->Pop(&success);
+    if (!success) out->clear();
+  }
+  
+  void ReInit() override { return; }
+
+ private:
+  std::shared_ptr<LoDTensorBlockingQueue> queue_;
+};
+```
+
+
+## Design of CreatePyReaderOp
+`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue` which indicates the name of the `LoDTensorBlockingQueueHolder` variable.
+```C++
+class CreatePyReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) return;
+    
+    const std::string& queue_name = Input("blocking_queue");
+    auto* queue_holder_var = scope.FindVar(queue_name);
+    PADDLE_ENFORCE(queue_holder_var != nullptr);
+		auto* queue_holder = queue_holder_var
+                    ->template GetMutable<framework::LoDTensorBlockingQueueHolder>();
+    out->Reset(new PyReader(queue_holder->GetQueue()));
+  }
+};
+```
+
+## Design of Python codes
+The design of the Python code is as follows. First, we construct a variable of `LoDTensorBlockingQueueHolder` and initialize it with the given parameters, returning the `LoDTensorBlockingQueue` object after initialization. After that, a layer of `CreatePyReaderOp` is constructed and accepts the name of the `LoDTensorBlockingQueueHolder` variable. The `LoDTensorBlockingQueue` object and the result of the layer are both returned.
+```Python
+def py_reader(capacity, shapes):
+  queue_name = unique_name.generate("lod_tensor_blocking_queue")
+  var = global_scope().var(queue_name) # create LoDTensorBlockingQueueHolder Variable
+  feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) # init the queue
+  out = create_var()
+  create_py_reader_op_with_queue_name(
+      inputs={'blocking_queue': queue_name},
+      outputs={'Out':[out]})  
+  return out, feed_queue
+```
diff --git a/doc/fluid/design/concepts/scope.md b/doc/fluid/design/concepts/scope.md
new file mode 100644
index 0000000000000000000000000000000000000000..dcf76649357aaef80d6bc1a933ece8c4c1063547
--- /dev/null
+++ b/doc/fluid/design/concepts/scope.md
@@ -0,0 +1,124 @@
+# Design of Scope in Paddle
+
+## Overview
+
+Scope is an important concept in programming languages: it defines a program region in which a set of bindings between names and entities applies. In a specific scope, a valid name is uniquely associated with an entity, such as a variable. In another scope, the same name may refer to another entity or to nothing at all. Scope thus clearly restricts the visibility and validity of names in a program. Hence **Scope** is introduced to PaddlePaddle to manage variables in context. Unlike the original abstract concept, however, Scope here is an object with two important attributes:
+
+- Scope is an association of a name to variable.
+- Variables in a parent scope can be retrieved from local scope.
+
+A detailed explanation of these two attributes follows.
+
+
+## Scope is an association of a name to variable.
+
+Scope is an association of a name to variable. All variables belong to `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variables in each scope.
+
+
+1. Scope only contains a map of a name to variable.
+
+   All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state (momentum) etc.
+
+1. A variable can only be created by a Scope and can only be obtained from a Scope. Users cannot create or get a variable outside a scope. This is a constraint of our framework, and it keeps the framework simple and clear.
+
+1. Scope only contains methods that are used to Create and Get Variables. Scope does not contain Operators and has no information to run them.
+    `Net` is designed to drive the computation and Scope only contains a map of variables. There is no computation logic inside a `Scope`. Scope just handles the lifetime management of variables.
+    - `Create` is used to create a Variable by its name and add the mapping relation.
+    - `Get` is used to find a Variable by name.
+
+1. Every variable only belongs to one certain Scope.
+
+   A variable cannot belong to multiple scopes. If you want to use variables from a parent scope, you can use `parent scope`.
+
+1. Scope should destruct all Variables inside it when it is itself destructed. Users should never store a `Variable` pointer somewhere else.
+
+   Because a Variable can only be obtained from a Scope, when destroying a Scope we also need to destroy all the Variables in it. If a user stores a `Variable` pointer in a private data member or a global variable, the pointer becomes invalid when the associated `Scope` is destroyed.
+
+```cpp
+class Scope {
+ public:
+  Variable* Var(const std::string& name);
+  const Variable* FindVar(const std::string& name) const;
+
+ private:
+    std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+
+
+## Parent scope and local scope
+
+Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope.
+
+1.  We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed.
+2.  Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when a user gets a variable from a scope, the scope first searches for this variable in the current scope. If there is no such variable in the local scope, `scope` will keep searching in its parent, until the variable is found or there is no parent.
+
+```cpp
+class Scope {
+ public:
+  Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}
+
+  Variable* FindVar(const std::string& name) const {
+    auto it = vars_.find(name);
+    if (it != vars_.end()) {
+      return it->second.get();
+    } else if (parent_ != nullptr) {
+      return parent_->FindVar(name);
+    } else {
+      return nullptr;
+    }
+  }
+
+ private:
+  std::shared_ptr<Scope> parent_ {nullptr};
+};
+```
+
+In `Scope` class, there is a private data member called `parent_`. `parent_` is a smart pointer to its parent scope. When user `Get` a variable by its `name`, the `name` will be searched inside the current scope. If the variable cannot be found locally and parent scope is not a `nullptr`, the variable will be searched inside that parent scope. `parent_` pointer's default value is `nullptr`. It means that the scope is a global scope when `parent_` is nullptr.
+
+A local scope is very useful when we implement a Recurrent Neural Network. Each timestep of an RNN should be a `Net`. The `Net` of each timestep (`StepNet` for short) should use an independent local scope, just as variables in a while loop live inside a local scope in programming languages. By using a single `StepNet` and changing the local scope, we can implement an RNN easily.
+
+## Interface Design
+
+```cpp
+class Variable {
+ private:
+  Variable() = default;
+  friend class Scope;
+};
+
+class Scope {
+ private:
+  Scope(const std::shared_ptr<Scope>& parent = nullptr);
+
+ public:
+  static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);
+
+  // return nullptr if not found.
+  Variable* FindVar(const std::string& name) const;
+
+  // return if already contains same name variable.
+  Variable* Var(const std::string& name);
+
+ private:
+  std::shared_ptr<Scope> parent_;
+  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+## Only scope can create a variable
+
+To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `Var` can construct `Variable`.
+
+## When scope destroyed, all variables inside this scope should be destroyed together
+
+The scope holds unique pointers to all its variables. A user can `FindVar` from a scope, but should not hold the returned pointer as a member variable, because when the scope is destroyed, all variables inside it are destroyed together.
+
+## Sharing a parent scope
+
+A local scope contains a `parent_` pointer, so scopes form a linked list. `parent_` is a `shared_ptr` because the parents of a local scope must not be destroyed while that local scope is in use.
+
+Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shared pointer. We cannot construct a scope variable, because it cannot be passed to other scope as `parent` pointer.
+
+## Orthogonal interface
+
+`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `Var` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `Var`, we can implement `Var` easily.
diff --git a/doc/fluid/design/concepts/tensor.md b/doc/fluid/design/concepts/tensor.md
new file mode 100644
index 0000000000000000000000000000000000000000..0a27ac9bb6b03649d42e12100fda9e80a56e7f56
--- /dev/null
+++ b/doc/fluid/design/concepts/tensor.md
@@ -0,0 +1,189 @@
+# Tensor: A Unified Data Type in PaddlePaddle
+
+## Pain Point
+
+This week, we discussed several potential weaknesses of PaddlePaddle caused by rapid iteration and development to bring new business products online over the past four years. For instance, the current Matrix/Vector implementations in PaddlePaddle are long and tedious to read, which seriously hinders contributions from both new and experienced engineers. Worse still, they will become too challenging to maintain over time.
+
+
+## Learn from Majel
+
+Consequently, we decided to refactor PaddlePaddle step by step. The first step is to refactor Matrix/Vector and replace them with Tensor, a modern term in deep learning systems. Fortunately, we can learn from Majel how to define a Tensor.
+
+To simplify heterogeneous resource allocation in any dimensions (1-9) and types (double, float, float16), Majel consists of several primitives such as `Dim`, `Place` and `Array`, all of them are standard C++ class templates.
+
+1. `Place`: memory location [e.g. CPU/GPU].
+2. `Allocation`: heterogeneous resource allocator [e.g. 20MB in GPU].
+3. `Dim`: size of each dimension. [e.g. Dim<4>({10, 2, 5, 1})]
+4. `Array`: dynamic array consisting of `Place`, `Dim`, and a pointer to memory.
+
+If you dig deeper into the Majel source code, you will find that Majel makes heavy use of `boost.variant`. The variant class template is a safe, generic, stack-based discriminated union container, **offering a simple solution for manipulating an object from a heterogeneous set of types in a uniform manner**. Whereas standard containers such as std::vector may be thought of as "multi-value, single type," variant is "multi-type, single value."
+
+As a simple example, consider the following:
+
+```c++
+#include "boost/variant.hpp"
+#include <iostream>
+
+class my_visitor : public boost::static_visitor<int>
+{
+public:
+    int operator()(int i) const
+    {
+        return i;
+    }
+    
+    int operator()(const std::string & str) const
+    {
+        return str.length();
+    }
+};
+
+int main()
+{
+    boost::variant< int, std::string > u("hello world");
+    std::cout << u; // output: hello world
+
+    int result = boost::apply_visitor( my_visitor(), u );
+    std::cout << result; // output: 11 (i.e., length of "hello world")
+}
+```
+
+In Majel, `DDimVar` is derived from `Dim`, `DArrayVar` is from `Array`.
+
+```c++
+template<int i>
+struct Dim {
+...    
+int head;
+Dim<i-1> tail;
+}
+```
+
+```c++
+template<typename T, int D>
+class Array : public Buffer {
+    ...
+private:
+    Dim<D> size_;
+    Dim<D> stride_;
+    T* ptr_;
+};
+```
+
+```c++
+typedef boost::variant<CUDAPlace, CpuPlace> Place;
+typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
+                       Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar;
+typedef boost::variant<
+    Array<float, 1>,
+    Array<float, 2>,
+    Array<float, 3>,
+    Array<float, 4>,
+
+    Array<double, 1>,
+    Array<double, 2>,
+    Array<double, 3>,
+    Array<double, 4>,
+
+    Array<float16, 1>,
+    Array<float16, 2>,
+    Array<float16, 3>,
+    Array<float16, 4> > DArrayVar;
+```
+
+Because `variant` may be thought of as "multi-type, single value", we can utilize it to implement unified interfaces for PaddlePaddle.
+
+`DDim` plays two roles in Majel. First, it is used to indicate the size of a tensor. For example, we can construct a new `DArray` in the following way:
+ 
+ ```c++
+ DArray arr = make_darray(make_ddim({2,3}), 0.0f);
+ ```
+ It means that `arr` will be a two-dimensional tensor, or a matrix. The size of its first dimension is 2 and that of the second is 3. All the element values of `arr` will be initialized to 0.0.
+ 
+ The second meaning of `DDim` is tensor index. For example, if we want to access the value in the 1st row and 2nd column of `arr` and set it to 1.0, we can do like this:
+
+ ```c++
+ arr[make_ddim({0, 1})] = 1.0;
+ ```
+
+## Implement Tensor in Paddle
+
+We want to create a Tensor class to replace Vector and Matrix, and to support high-dimensional data. The operations on Tensor are implemented in both CPU and GPU. We also want to make sure that the Tensor interface is friendly to its callers.
+
+Tensor is only responsible for describing computation. It is not in charge of memory allocation policy, handles of CUDA library contexts (e.g. cublasHandle, cudnnHandle), or dispatching CUDA kernels. Paddle has already implemented the initialization and resource management of hardware.
+
+Before writing code, please make sure you have already looked through the Majel source code and grasped the design philosophy of `DArray` in Majel.
+
+
+### Memory Management
+`Allocation` manages a block of memory on a device (CPU/GPU). We use `Place` to describe the memory location. The details of memory allocation and deallocation are implemented in `Allocator` and `DeAllocator`. Related low-level APIs such as `hl_malloc_device()` and `hl_malloc_host()` are provided by Paddle.
+
+### Dim and Array
+#### Dim
+
+`Dim` describes the dimension information of an array.
+
+`DDimVar` is an alias of a specialized class of the boost.variant class template.
+
+`DDim` is introduced to represent a dynamically sized dimension.
+
+For example:
+
+```
+Dim<2> d1 = make_dim(3, 3);
+DDim d2 = make_ddim({1, 2, 3});
+```
+
+You must specify a concrete dimension size for Dim, whereas DDim can represent a dynamically sized dimension.
+#### Array
+
+`Array` represents a tensor with a specific type and size.
+
+`DArrayVar` is an alias of a specialized class of the boost.variant class template.
+
+`DArray` is introduced to represent a dynamically typed array.
+
+For example:
+
+```
+Array<float, 2> a1(Dim<2>(2, 2));
+DArray a2 = make_darray(make_ddim({3, 4}), 0.0, CpuPlace());
+```
+
+You must specify the type and dimension of an Array, whereas DArray can represent a dynamically typed array.
+
+
+Please reference the section of `Learn from Majel` for more details.
+
+### ArrayView
+
+`ViewIterator` is a class template which implements basic iterator operation, including increment(++), decrement(--), dereference(*), equality comparisons(==) and so on.
+
+`ArrayView` is an encapsulation of `Array`, which introduces extra iterator methods, such as `begin()` and `end()`. The `begin()` method returns an iterator pointing to the first element in the ArrayView. And the `end()` method returns an iterator pointing to the past-the-end element in the ArrayView.
+
+`ArrayView` makes visiting and manipulating an array more efficient, flexible and safe.
+
+
+A global function `make_view` is provided to transform an array to corresponding arrayview.
+
+```
+template<typename T, int D>
+ArrayView<T, D> make_view(const Array<T, D>& in) {
+    return in;
+}
+```
+
+A global function `make_iterator` is provided to make iterator of an array.
+
+```
+template<typename T, int D>
+ViewIterator<ArrayView<T, D>> make_iterator(const Array<T, D>& in, Dim<D> idx) {
+    return make_iterator(make_view(in), idx);
+}
+```
+
+### Basic Operations
+
+The operations that manipulate DArray are defined as global functions, such as `ones`, `zeros`, `reshape`, `gemm` and so on.
+
+An array will be transformed into an arrayview and then passed to the operation launched on a specific device (CPU/GPU).
diff --git a/doc/fluid/design/concepts/tensor_array.md b/doc/fluid/design/concepts/tensor_array.md
new file mode 100644
index 0000000000000000000000000000000000000000..37e4f7b90f94fa3eb015e733999cd84c96b2239c
--- /dev/null
+++ b/doc/fluid/design/concepts/tensor_array.md
@@ -0,0 +1,271 @@
+# Design for TensorArray
+This design doc presents the necessity of a new C++ class `TensorArray`.
+In addition to the very simple C++ implementation
+
+```c++
+class TensorArray {
+ public:
+  explicit TensorArray(const LoDTensor&);
+  explicit TensorArray(size_t size);
+
+ private:
+  vector<LoDTensor> values_;
+};
+```
+
+We also need to expose it to PaddlePaddle's Python API,
+because users would want to use it with our very flexible operators `WhileLoop`.
+An example for a RNN based on dynamic operators is 
+
+```python
+input = pd.data(...)
+num_steps = Var(12)
+
+TensorArray states(size=num_steps)
+TensorArray step_inputs(unstack_from=input)
+TensorArray step_outputs(size=num_steps)
+
+W = Tensor(...)
+U = Tensor(...)
+default_state = some_op()
+
+step = Var(1)
+
+wloop = paddle.create_whileloop(loop_vars=[step])
+with wloop.frame():
+    wloop.break_if(pd.equal(step, num_steps)
+    pre_state = states.read(step-1, default_state)
+    step_input = step_inputs.read(step)
+    state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input))
+    states.write(step, state)
+    step_outputs.write(step, state) # output state
+    step.update(state+1)
+
+output = step_outputs.stack()
+```
+
+## Background
+Steps are one of the core concepts of RNN. In each time step of RNN, there should be several input segments, states, and output segments; all these components act like arrays; for example, calling `states[step_id]` will get the state in the `step_id`th time step.
+
+An RNN can be implemented with the following pseudocode
+
+```c++
+Array states;
+Array input_segments;
+Array output_segments;
+Parameter W, U;
+
+step = 1
+seq_len = 12
+while_loop {
+   if (step == seq_len) break;
+    states[step] = sigmoid(W * states[step-1] + U * input_segments[step]);
+    output_segments[step] = states[step] // take state as output
+   step++;
+}
+```
+According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support.
+
+Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`.
+
+
+Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with level of details (`LoDTensor` for short).
+Segmenting the `LoDTensor` is much more complicated than splitting a tensor, which makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support.
+
+As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences.
+
+The implementation is similar to `recurrent_op`. 
+The key difference is the way **the original input `LoDTensors` and outputs are split to get the `input_segments` and the `output_segments`.**
+
+
+Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly,
+the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same.
+
+## Why `TensorArray`
+The logic behind splitting the inputs to segments, states and outputs is similar and can be shared in a separate module.
+
+The array of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudo codes. 
+
+So there should be an array-like container, which can store the segments of a tensor or LoD tensor.
+
+**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor** .
+This is where the notion of `TensorArray` comes from.
+
+## Introduce TensorArray to uniform all the three RNNs
+TensorArray as a new concept is borrowed from TensorFlow, 
+it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`.
+
+This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers, 
+such as `recurrent_op`, `RecurrentGradientMachine`.
+
+In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), 
+`TensorArray` is used to segment inputs and store states in all time steps.
+By providing some methods similar to a C++ array,
+the definition of some state-based dynamic models such as RNN can be more natural and highly flexible.
+
+## Dynamic-operations on TensorArray
+
+`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented
+
+```python
+# several helper operators for TensorArray
+def tensor_array_stack(ta, tensor):
+    '''
+    get a tensor array `ta`, return a packed `tensor`.
+    '''
+    pass
+
+def tensor_array_unstack(tensor, ta):
+    '''
+    get a `tensor`, unstack it and get a tensor array `ta`.
+    '''
+    pass
+
+def tensor_array_write(ta, index, tensor, data_shared):
+    '''
+    get a `tensor` and a scalar tensor `index`, write `tensor` into index-th
+    value of the tensor array `ta`.
+    `data_shared` is an attribute that specifies whether to copy or reference the tensors.
+    '''
+    pass
+
+def tensor_array_read(ta, index, tensor):
+    '''
+    get a tensor array `ta`, a scalar tensor `index`, read the index-th value of
+    `ta` and return as the `tensor`.
+    '''
+    pass
+
+def tensor_array_size(ta, tensor):
+    '''
+    get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`.
+    '''
+    pass
+```
+
+It is tedious for users to use so many low-level operators directly, so some helper methods should be provided in a Python wrapper to make `TensorArray` easier to use,
+for example
+
+```python
+class TensorArray:
+    def __init__(self, name):
+        self.name = name
+        self.desc = TensorArrayDesc()
+
+    def stack(self, name=None):
+        '''
+        Pack the values in a `TensorArray` into a tensor with rank one higher
+        than each tensor in `values`.
+        `stack` can be used to split tensor into time steps for RNN or whileloop.
+
+        @name: str
+            the name of the variable to output.
+        '''
+        tensor = Var(name)
+        tensor_array_stack(self.name, tensor)
+        return tensor
+
+    def unstack(self, input):
+        '''
+        Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
+        `unstack` can be used to concatenate all the time steps for RNN or whileloop.
+
+        @input: str
+            the name of input tensor
+        '''
+        tensor_array_unstack(tensor, self.name)
+
+    def write(self, index, value, data_shared=True):
+        '''
+        Write value into index of the TensorArray.
+        If `data_shared` is set to True, than the index-th value in TensorArray will
+        be shared with the tensor passed in.
+
+        @index: str
+            name of a scalar tensor
+        @value: str
+            name of a tensor
+        @data_shared: bool
+        '''
+        tensor_array_write(self.name, index, value, data_shared)
+
+    def read(self, index, output):
+        '''
+        Read the value at location `index` in the `TensorArray`.
+
+        @index: str
+            name of a scalar tensor
+        @output:
+            name of a output variable
+        '''
+        tensor_array_read(self.name, index, output)
+
+
+    def size(self, output):
+        '''
+        Return the number of values.
+
+        @output: str
+            name of a scalar tensor
+        '''
+        tensor_array_size(self.name, output)
+```
+
+## LoDTensor-related Supports
+The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variable-length sequences as input, and outputs sequences too.
+
+Since each step of RNN can only take a tensor-represented batch of data as input, 
+some preprocessing should be performed on the inputs, such as sorting the sentences by their length in descending order, cutting out each word, and packing the words into new batches.
+
+Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`,
+these two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formatted as a LoD tensor rather than a tensor.
+
+Some definitions are like
+
+```python
+def unpack(level):
+    '''
+    Split LodTensor in some `level` and generate batches, if set `sort_by_length`,
+    will sort by length.
+
+    Returns:
+        - a new `TensorArray`, whose values are LodTensors and represents batches
+          of data.
+        - an int32 Tensor, which stores the map from the new batch's indices to
+          original LoDTensor
+    '''
+    pass
+
+def pack(level, indices_map):
+    '''
+    Recover the original LoD-arranged LoDTensor with the values in a `TensorArray`
+    and `level` and `indices_map`.
+    '''
+    pass
+```
+
+With these two methods, an RNN that supports variable-length sentences can be implemented like
+
+```c++
+// input is the varient-length data
+LodTensor sentence_input(xxx);
+TensorArray ta;
+Tensor indice_map;
+Tensor boot_state = xxx; // to initialize rnn's first state
+TensorArray::unpack(input, 1/*level*/, true/*sort_by_length*/, &ta, &indice_map);
+TensorArray step_outputs;
+TensorArray states;
+
+for (int step = 0; step < ta.size(); step++) {
+  auto state = states.read(step);
+  // rnnstep is a function which acts like a step of RNN
+  auto step_input = ta.read(step);
+  auto step_output = rnnstep(step_input, state);
+  step_outputs.write(step_output, true/*data_shared*/);
+}
+
+// rnn_output is the final output of an rnn
+LoDTensor rnn_output = ta.pack(ta, indice_map);
+```
+The code above shows that by embedding the LoDTensor-related preprocess operations into `TensorArray`,
+the implementation of an RNN that supports variable-length sentences is far more concise than `RecurrentGradientMachine`, because the latter mixes all the code together, making it hard to read and extend.
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
new file mode 100644
index 0000000000000000000000000000000000000000..8db67f6703d142da71cf06bd4f7e2cb13556f9b0
--- /dev/null
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -0,0 +1,100 @@
+# Design Doc: Var_desc
+
+## Background
+PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
+
+PaddlePaddle uses proto message to describe compile time program because :
+
+1. The computation program description must be serializable and saved in a file.
+1. During distributed training, the serialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on a different worker.
+
+The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`)  and  `Operations`. The concept to represent them is in the table below.
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>compile time</th>
+<th>runtime</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data </td>
+<td>VarDesc(proto) </td>
+<td>Variable(cpp) </td>
+</tr>
+<tr>
+<td>Operation </td>
+<td>OpDesc(proto) </td>
+<td>Operator(cpp) </td>
+</tr>
+</tbody>
+</table>
+
+
+## Definition of VarType
+
+A VarDesc should have a name, type and whether or not it is persistable. There are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
+
+```proto
+message VarDesc {
+  required string name = 1;
+  required VarType type = 2;
+  optional bool persistable = 3 [ default = false ];
+}
+```
+
+## Definition of TensorDesc
+
+```proto
+message TensorDesc {
+  // Should only be PODType. Is enforced in C++
+  required Type data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+```
+
+The `Type` here comes from the enum defined inside of `VarType` :
+
+```proto
+enum Type {
+  // Pod Types
+  BOOL = 0;
+  INT16 = 1;
+  INT32 = 2;
+  INT64 = 3;
+  FP16 = 4;
+  FP32 = 5;
+  FP64 = 6;
+
+  // Other types that may need additional descriptions
+  LOD_TENSOR = 7;
+  SELECTED_ROWS = 8;
+  FEED_MINIBATCH = 9;
+  FETCH_LIST = 10;
+  STEP_SCOPES = 11;
+  LOD_RANK_TABLE = 12;
+  LOD_TENSOR_ARRAY = 13;
+  PLACE_LIST = 14;
+  READER = 15;
+  CHANNEL = 16;
+}
+```
+
+A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please reference [`SelectedRows`](./selected_rows.md).
+
+## Definition of LoDTensorDesc
+
+```proto
+message LoDTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2 [ default = 0 ];
+}
+```
+
+A LoDTensorDesc contains a tensor and a lod_level.
+
+## Definition of Variable in Python
+
+For Variable in Python, please reference [`Python API`](./python_api.md).
diff --git a/doc/fluid/design/concepts/variable.md b/doc/fluid/design/concepts/variable.md
new file mode 100644
index 0000000000000000000000000000000000000000..442ef6b718b227d79ca73031efcbb55817558252
--- /dev/null
+++ b/doc/fluid/design/concepts/variable.md
@@ -0,0 +1,52 @@
+# Design Doc: Variable
+
+
+Variable is also known as *blob* in MxNet and Caffe2.  It is the input and output type of operators, where a neural network is a graph of operators.
+
+## Requirements: Lazy Memory Allocation
+
+For the flexibility of a DL system, a variable should be able to contain any typed value -- a tensor in most cases, but could also be some integer IDs or a scope of other variables in the case of RNN.
+
+To use the minimum amount of memory, we would like that a variable allocates memory only when it has to, or, lazy memory allocation.  Let's take the following example:
+
+```cpp
+Variable vr, v1, v2;
+
+Tensor* t1 = new Tensor();
+Tensor* t2 = new Tensor();
+
+Randomize(
+  /* malloc */ v1.GetMutable<Tensor>().mutable_data<float16>(DDim(100,200)),
+  /* size */ t1.Size());
+  
+Randomize(
+  /* malloc */ v2.GetMutable<Tensor>().mutable_data<float16>(DDim(200,300)),
+  /* size */ t2.Size());
+  
+Mult(
+  /*result*/ vr.GetMutable<Tensor>().mutable_data<v1.Type()>(SizeOfMult(v1, v2)),
+  /*input1*/ v1.Get<Tensor>().data(),
+  /*input2*/ v2.Get<Tensor>().data());
+```
+     
+We see that a variable holds nothing until `Variable::GetMutable<Tensor>()` allocates a tensor and puts it in the variable.  Similarly, a tensor does not get its memory until `Tensor::mutable_data()` is called.
+
+This syntax performs lazy memory allocation when we call `Randomize` and `Mult`, the functions that mutate the variable, so it saves us some lines of C++ code.
+
+
+## Implementation: Type Hiding
+
+To make memory allocation lazy, we cannot assume that we know the type held by a variable at definition time.  In other words, `class Variable` cannot be a template `template <T> class Variable`.
+
+Because we don't know the type `T`, we cannot save a `T*` as `Variable's` data member.  Instead, we save an interface object `Placeholder`, which can return the pointer to the saved object via `Placeholder::Ptr()` as `void*`.
+
+But anyway, Variable needs to know `T` so that it can `delete<T>(ptr)` and so that `Variable::Get` can check the expected type against the saved object's type.
+
+We save `T` in `PlaceholderImpl`, the implementation of `Placeholder`.  Please be aware that `PlaceholderImpl` is a class template and `T` is passed in as a template parameter.
+
+Because `PlaceholderImpl` knows `T`, it can save and return `typeid(T)` for the type comparison in `Variable::Get` and `Variable::GetMutable`.
+
+
+## Conclusion
+
+The technique type hiding utilizes C++ class templates, interface and derivation, and C++ RTTI (typeid).  This combination saves us from defining something like `caffe2::TypeMeta`, which takes hundreds of lines of C++ code.
diff --git a/doc/fluid/design/concurrent/channel.md b/doc/fluid/design/concurrent/channel.md
new file mode 100644
index 0000000000000000000000000000000000000000..df67438bcc741ac521b00ee962fc13c93db21182
--- /dev/null
+++ b/doc/fluid/design/concurrent/channel.md
@@ -0,0 +1,139 @@
+# Channel Design
+
+## Introduction
+
+A Channel is a data structure that allows for synchronous interprocess
+communication via message passing.  It is a fundamental component of CSP
+(communicating sequential processes), and allows for users to pass data
+between threads without having to worry about synchronization.
+
+## How to use it
+
+Paddle offers python APIs to open and close channels, along with sending
+and receiving data to/from a channel.
+
+### Create a channel
+
+Creates a new channel that takes in variables of a specific dtype.
+
+- **fluid.make_channel(dtype, capacity=0)**
+  - **dtype**: The data type of variables being sent/received through channel
+  - **capacity**: The capacity of the channel.  A capacity of 0 represents
+    an unbuffered channel.  Capacity > 0 represents a buffered channel
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=10)
+```
+
+### Close a channel
+
+Closes a channel.  Any pending senders and receivers will be awoken during
+this time.  Receivers can still receive from a closed channel, but senders
+are not allowed to send any additional data to the channel (Paddle will
+raise an exception if users try to send to a closed channel.)
+
+- **fluid.channel_close(channel)**
+
+```
+fluid.channel_close(ch)
+```
+
+### Send data to a channel
+
+Sends a variable to a channel.  Currently, variables of dtype `LoDTensor`,
+`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
+`ChannelHolder` are supported.
+
+By default, the data of the Variable is moved from the sender to the receiver,
+however the user can optionally copy the data before performing the send.
+
+- **channel_send(channel, variable, is_copy=False)**
+  - **channel**: The channel to send the variable to
+  - **variable**: The variable to send to the channel
+  - **is_copy**: If set to True, channel_send will perform a variable assign
+  to copy the source variable to a new variable to be sent.
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=100)
+fluid.channel_send(ch, var, True)
+```
+
+### Receive data from a channel
+
+Receives a variable from a channel.  The data of the variable is moved to the
+receiving variable.
+
+- **channel_recv(channel, return_variable)**
+  - **channel**: The channel to receive the variable from
+  - **return_variable**: The destination variable used to store the data of the
+  variable received from the channel
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=-1)
+fluid.channel_recv(ch, var)
+```
+
+## How it Works
+
+Channels provide a simple interface for different threads to share data.
+To support the synchronization requirements, channels utilize a series of
+internal queues, locks, and condition variables.
+
+### QueueMessage
+
+QueueMessage encapsulates the state of the channel send/receive operation to be
+put in the **sendq/recvq**.  It contains a condition variable used to lock the
+thread (when there are no available sends/receives).  In addition, it contains
+a callback function to notify a thread when the QueueMessage is being
+processed by the channel.
+
+### Queues
+
+- **buff_**: This queue holds the data buffer in a buffered channel.  The
+capacity is set to the capacity of the channel.  This data buffer is not
+used in an unbuffered channel.
+
+- **sendq**: This queue holds the QueueMessage of any pending senders of a
+channel.  When a thread performs a channel_send operation on the channel, the
+channel_send operation will put a new QueueMessage on the sendq and block the
+current thread under two conditions:
+  1. The channel is buffered and is full
+  2. The channel is unbuffered and does not have a receiver
+
+- **recvq**:  This queue holds the QueueMessage of any pending receivers of a
+channel.  When a thread performs a channel_recv operation on the channel, the
+channel_recv operation will put a new QueueMessage on the recvq and block the
+current thread under two conditions:
+  1. The channel is buffered and there is no data on the buff_
+  2. The channel is unbuffered and does not have a sender
+
+### State diagram
+
+#### Channel Send
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/channel_send.png"/><br/>
+</p>
+
+#### Channel Receive
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/channel_recv.png"/><br/>
+</p>
+
+## Limitations and Considerations
+
+### Variable Copy
+
+In golang, variables in channels are copied from the sender to the receiver.
+In Paddle, the data from our variables are **moved** from sender to receiver.
+As a result, these variables should not be used after they are sent.  We
+provide a flag in channel_send method to allow users to copy the variable to
+be sent before it is sent.  
+
+Please note that this is achieved by adding an **assign** operator and creating
+a temporary variable that is sent in place of the original variable.  Please
+note that the **assign** operator has limited support for only certain variable
+datatypes.
diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md
new file mode 100644
index 0000000000000000000000000000000000000000..0428e74f9e00a87f6b0972057f48479b8ae56ad6
--- /dev/null
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@@ -0,0 +1,193 @@
+# Design Doc: Concurrent Programming with Fluid
+
+With PaddlePaddle Fluid, users describe a program rather than a model.  The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model.
+
+Many know that when we program TensorFlow, we can specify the device on which each operator runs.  This allows us to create a concurrent/parallel AI application.   An interesting question is **how does a `ProgramDesc` represent a concurrent program?**
+
+The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program.  So users just write a concurrent program as they would with any concurrent programming language, e.g., [Go](https://golang.org).
+
+## An Analogy
+
+The following table compares concepts in Fluid and Go
+
+<table>
+<thead>
+<tr>
+<th>Go</th>
+<th>Fluid</th>
+<th></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>user-defined functions </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid">layers</a></td>
+<td></td>
+</tr>
+<tr>
+<td>control-flow and built-in functions </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators">intrinsics/operators</a></td>
+<td></td>
+</tr>
+<tr>
+<td>goroutines, channels </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework/thread_pool.h">class ThreadPool</a></td>
+<td></td>
+</tr>
+<tr>
+<td>runtime </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h">class Executor</a></td>
+<td></td>
+</tr>
+</tbody>
+</table>
+
+
+## An Example Concurrent Program
+
+To review all the above concepts in an example, let us take a simple program and write its distributed version.
+
+Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors.
+
+```go
+import "fluid"
+
+func paddlepaddle() {
+  X = fluid.read(...)
+  W = fluid.Tensor(...)
+  Y = fluid.mult(X, W)
+}
+```
+
+Please be aware that Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function, which, in this case, is defined in the above program and creates the following `ProgramDesc` message.
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, W, Y],
+    ops = [
+      read(output = X)
+      assign(input = ..., output = W)
+      mult(input = {X, W}, output = Y)
+    ],
+  }
+}
+```
+
+Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in the above `ProgramDesc` message.
+
+The default `main` function is defined as follows:
+
+```go
+func main() {
+  paddlepaddle()
+  fluid.run()
+}
+```
+
+## The Concurrent Version
+
+By parallelizing the above program, we could support a very big tensor X by splitting it into small pieces {x_1, x_2, ...} and sending each piece to a worker process/node for parallel multiplication.
+
+In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for worker processes/nodes.
+
+### The Master Program
+
+The master program could look like the following:
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, L, Y],
+    ops = [
+      read(output = X)
+      kube_get_workers_addrs(output = L)
+      Y = tensor_array(len(L))
+      parallel_for(input = X, output = Y,
+                   attrs = {L, block_id(1)}) # referring to block 1
+    ]
+  }
+
+  block[1] = Block {
+    parent = 0,
+    vars = [x, y, index],
+    ops = [
+      slice(input = [X, index], output = x) # index is initialized by parallel_for
+      send(input = x, attrs = L[index])
+      recv(outputs = y, attrs = L[index])
+      assign(input = y, output = Y[index])
+    ]
+  }
+}
+```
+
+The equivalent Fluid program (calling the Go binding) is:
+
+```go
+func main() {  //// block 0
+  X = fluid.read(...)
+  L = fluid.k8s.get_worker_addrs()
+  Y = fluid.tensor_array(len(L))
+  fluid.parallel_for(X, L,
+                     func(index int) {  //// block 1
+                       x = X[index]
+                       fluid.send(L[index], x)
+                       y = fluid.recv(L[index])
+                       Y[index] = y
+                     })
+}
+```
+
+An explanation of the above program:
+
+- `fluid.k8s` is a package that provides access to Kubernetes API.  
+- `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).  
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
+
+  1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
+  2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread  
+     1. creates an Executor instance, and
+     2. calls `Executor.Run(block)`, where `block` is block 1 as explained above.
+- Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0.
+
+### The Worker Program
+
+The worker program looks like
+
+```go
+func main() {
+  W = Tensor(...)
+  x = fluid.listen_and_do(
+        fluid.k8s.self_addr(),
+        func(input Tensor) {
+          output = fluid.mult(input, W)
+        })
+}
+```
+
+where
+
+- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed,
+  1. listens on the current pod's IP address, as returned by `fluid.k8s.self_addr()`,
+  2. once a connection is established,
+     1. creates a scope of two parameters, "input" and "output",
+     2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h) and saves it into "input",
+     3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`.
+
+## Summarization
+
+From the above example, we see that:
+
+1. Fluid enables the imperative programming paradigm by:
+   1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and
+   2. calling the `fluid.run` function that runs the program implicitly.
+1. The program is described as a `ProgramDesc` protobuf message.
+2. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter.
+3. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message.
+4. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and executes intrinsics/operators' `Run` method sequentially as they appear in the `Block.ops` array.
+5. Intrinsics/operators' `Run` method might create threads.  For example, the `ListenAndDo` operator creates a thread to handle each incoming request.
+6. Threads are not necessarily OS threads; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by ThreadPool.  Multiple green threads might run on the same OS thread.  An example of green threads is Go's [goroutines](https://tour.golang.org/concurrency/1).
diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md
new file mode 100644
index 0000000000000000000000000000000000000000..66d19f44baf861c7847e81ca83f61024ec877faf
--- /dev/null
+++ b/doc/fluid/design/concurrent/csp.md
@@ -0,0 +1,251 @@
+# Design Doc: CSP in PaddlePaddle Fluid
+
+## Motivation
+
+Concurrent programming is important for deep learning.  A few example applications are:
+
+1.  The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
+2.  The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
+
+Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
+
+## Concurrent Programming Models
+
+There are many concurrent programming models, implemented in various forms:
+
+<table>
+<thead>
+<tr>
+<th>concurrent programming model</th>
+<th>implementation</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>mutex </td>
+<td>types and functions in standard libraries </td>
+</tr>
+<tr>
+<td>semaphore </td>
+<td> types and functions in standard libraries </td>
+</tr>
+<tr>
+<td> communicating sequential processes (CSP)  </td>
+<td> Go programming language </td>
+</tr>
+<tr>
+<td> actor model  </td>
+<td> Erlang programming language </td>
+</tr>
+<tr>
+<td> message passing  </td>
+<td> MPI </td>
+</tr>
+<tr>
+<td> bulk synchronous parallel (BSP)   </td>
+<td> Pregel distributed programming framework </td>
+</tr>
+</tbody>
+</table>
+
+
+Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
+
+### CSP v.s. Actor Model
+
+A well-known implementation of Actor Model is the Erlang programming language.  In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs.  We can find the three ingredients, process with ID, send, and recv, in MPI too.  Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code.  Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv.
+
+## CSP in Fluid
+
+Fluid has two fundamental control-flows: *if-else* and *while*.  If we are to implement CSP, we need the following:
+
+1. a new data type: *channel* and operators *send* and *recv*,
+1. *goroutine* or thread, and
+1. a new control-flow: select.
+
+We also need Python wrappers for the above components.
+
+The type *channel* is conceptually the blocking queue.  In Go, it is implemented as a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
+
+The `select` operation has been in OS kernels long before Go language.  All Unix kernels implement system calls *poll* and *select*.  They monitor multiple file descriptors to see if I/O is possible on any of them.  This takes O(N) time.  Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time.  In BSD systems, there is a similar system call *kqueue*.  Go's Linux implementation uses epoll.
+
+It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.
+
+### Type Channel
+
+Fluid supports many data types:
+
+1. Tensor,
+1. Row-sparse Tensor
+1. LoD Tensor,
+1. Tensor array, etc
+
+Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value.  To add a new type channel, we need to add a new type enum.
+
+To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file.  [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example of how we expose the C++ class LoDTensor.
+
+## Syntax Design
+
+### Create Channel
+
+In Go, we create a channel by specifying the element type and buffer size:
+
+```go
+ch  := make(chan int)       // a channel without buffer
+ch1 := make(chan int, 100)  // a channel that can buffer 100 ints.
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch  = fluid.make_channel(dtype=INT)
+ch1 = fluid.make_channel(dtype=INT, 100)
+```
+
+In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
+
+```python
+ch = fluid.make_channel(dtype=Tensor, etype=float16)
+```
+
+or Tensors of Tensors of float16 etc.
+
+The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor<Tensor<...<float16>...> >`.
+
+### Send and Recv
+
+Go's CSP implementation depends on data type *channel*. There are two types of channels:
+
+1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. The sending to buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty.
+1. The blocked channel, or unbuffered channel, is a blocking queue with no buffer.  Both sending and receiving block with unbuffered channels.
+
+There are four types of actions with a channel:
+
+1. Create a channel
+
+   ```go
+   ch := make(chan int) // this is an unbuffered channel
+   ch := make(chan int, 100) // this is a buffered channel of 100 ints.
+   ```
+
+1. Send
+
+   ```go
+   ch <- 111
+   ```
+
+1. Recv
+
+   ```go
+   y, ok := <-ch
+   ```
+
+1. Close
+
+   ```go
+   close(ch)
+   ```
+
+   Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
+
+There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
+
+1. A send to a nil channel blocks forever
+
+1. A receive from a nil channel blocks forever
+
+1. A send to a closed channel panics
+
+1. A receive from a closed channel returns the residual values and then zeros.
+
+In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h)
+
+The following program illustrates the Python syntax for accessing Fluid buffers.
+
+```python
+import fluid
+
+buffer_size = 10
+ch = fluid.make_channel(dtype=INT, buffer_size)
+
+# Now write buffer_size elements to the channel
+with fluid.while(steps=buffer_size):
+  fluid.send(ch, step)
+
+fluid.close_channel(ch)
+
+with fluid.while(steps=buffer_size):
+  fluid.print(fluid.recv(ch))
+```
+
+The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
+
+```python
+import fluid
+
+ch = fluid.make_channel(dtype=INT)
+
+with fluid.go():
+  fluid.send(ch)
+
+y = fluid.recv(ch)
+
+fluid.close_channel(ch)
+```
+
+### Select
+
+In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
+
+```go
+
+ch1  := make(chan int)       
+ch2  := make(chan int, 100)
+
+x := 0
+
+for {
+    select {
+    case ch1 <- x:
+      x := x + 1
+    case y <- ch2:
+      fmt.Println("Received on channel")
+    default:
+      fmt.Println("Default")
+    }
+  }
+
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch1  = fluid.make_chan(dtype=INT)
+ch2 = fluid.make_chan(dtype=INT, 100)
+
+sel = fluid.select()
+
+with sel.case(ch1, 'w', X):
+    fluid.layers.increment(X)
+
+with sel.case(ch2, 'r', Y):
+    fluid.print("Received on Channel")
+
+with sel.default():
+    fluid.print("Default")
+
+```
+
+In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
+
+- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax similar to the write syntax in Python I/O.
+
+- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax similar to the read syntax in Python I/O.
+
+- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
+
+## Example Programs
+
+### 1. RPC between Trainers and Parameter Servers
+
+### 2. Concurrent Minibatch Loading
diff --git a/doc/fluid/design/concurrent/go_op.md b/doc/fluid/design/concurrent/go_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..c18b788e80f432ebb2f14b15229e7823c112001e
--- /dev/null
+++ b/doc/fluid/design/concurrent/go_op.md
@@ -0,0 +1,231 @@
+# go_op Design
+
+## Introduction
+
+The **go_op** allows users of PaddlePaddle to run program blocks on a detached
+thread.  It works in conjunction with CSP operators (channel_send,
+channel_receive, channel_open, channel_close, and select) to allow users to
+concurrently process data and communicate easily between different threads.
+
+## How to use it
+
+```
+channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+with fluid.Go():
+    # Send a tensor of value 99 to "channel" on a detached thread
+    tensor = fill_constant(shape=[1], dtype='int', value=99)
+    tensor.stop_gradient = True
+    fluid.channel_send(channel, tensor)
+    
+# Receive sent tensor from "channel" on the main thread
+result = fill_constant(shape=[1], dtype='int', value=-1)    
+fluid.channel_recv(channel, result)
+```
+
+The go operator can be accessed by using the fluid.Go() control flow.  This
+will create a new sub block, where the user can add additional operators
+to be run on the thread.
+
+**Note:** Since back propagation is currently not supported in the go_op, users
+should ensure that operators in the go block do not require gradient
+calculations.
+
+## How it Works
+
+Similar to other control blocks, go_op will create a sub block and add it
+as a child to the current block.  Operators and variables defined in this
+block will be added to the go sub_block.
+
+In addition, the go operator will create a new child scope whose parent is
+the global scope.  Please refer to [block captures](#block-captures) for more
+information.
+
+When Paddle executor runs go_op, go_op will take the sub_block and pass it to
+the executor.run method (along with a newly created local scope) on a detached
+thread.
+
+An example of the generated program description is shown below.  Take note of
+the **go_op** in particular.  It is added as an operator in the current 
+block (in this example, block0).  The **go_op** contains a `sub_block`
+attribute, which points to the id of the block that will be executed in a 
+detached thread.
+
+```
+blocks {
+  idx: 0
+  parent_idx: -1
+  vars {
+    name: "return_value"
+    type {
+      type: LOD_TENSOR
+      lod_tensor {
+        tensor {
+          data_type: INT64
+        }
+      }
+    }
+  }
+  vars {
+    name: "status_recv"
+    type {
+      type: LOD_TENSOR
+      lod_tensor {
+        tensor {
+          data_type: BOOL
+        }
+      }
+    }
+  }
+  ...
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "channel"
+    }
+    type: "channel_create"
+    attrs {
+      name: "data_type"
+      type: INT
+      i: 7
+    }
+    attrs {
+      name: "capacity"
+      type: INT
+      i: 0
+    }
+  }
+  ops {
+    inputs {
+      parameter: "X"
+      arguments: "channel"
+    }
+    type: "go"
+    attrs {
+      name: "sub_block"
+      type: BLOCK
+      block_idx: 1
+    }
+  }
+  ops {
+    inputs {
+      parameter: "Channel"
+      arguments: "channel"
+    }
+    outputs {
+      parameter: "Out"
+      arguments: "return_value"
+    }
+    outputs {
+      parameter: "Status"
+      arguments: "status_recv"
+    }
+    type: "channel_recv"
+  }
+  ...
+}
+
+blocks {
+  idx: 1
+  parent_idx: 0
+  vars {
+    name: "status"
+    type {
+      type: LOD_TENSOR
+      lod_tensor {
+        tensor {
+          data_type: BOOL
+        }
+      }
+    }
+  }
+  ...
+  
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_1.tmp_0"
+    }
+    type: "fill_constant"
+    attrs {
+      name: "force_cpu"
+      type: BOOLEAN
+      b: false
+    }
+    attrs {
+      name: "value"
+      type: FLOAT
+      f: 99.0
+    }
+    attrs {
+      name: "shape"
+      type: INTS
+      ints: 1
+    }
+    attrs {
+      name: "dtype"
+      type: INT
+      i: 3
+    }
+  }
+  ops {
+    inputs {
+      parameter: "Channel"
+      arguments: "channel"
+    }
+    inputs {
+      parameter: "X"
+      arguments: "fill_constant_1.tmp_0"
+    }
+    outputs {
+      parameter: "Status"
+      arguments: "status"
+    }
+    type: "channel_send"
+    attrs {
+      name: "copy"
+      type: BOOLEAN
+      b: false
+    }
+  }
+```
+
+## Current Limitations
+
+#### <a name="block-captures"></a>Scopes and block captures:
+
+Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a
+block.  When a block is executed, a new local scope is created from the parent
+scope (i.e., the scope derived from the parent block) and associated with the new
+child block.  After the block finishes executing, the local scope and
+all associated variables in the scope are deleted.
+
+This works well in a single threaded scenario; however, with the introduction of
+go_op, a child block may continue to execute even after the parent block has
+exited.  If the go_op tries to access variables located in the parent block's
+scope, it may receive a segmentation fault because the parent scope may have
+been deleted.
+
+We need to implement block closures in order to prevent access to parent
+scope variables from causing a segmentation fault.  As a temporary workaround,
+please ensure that all variables accessed in the go block are not destructed
+before they are accessed.  Currently, the go_op will explicitly enforce
+this requirement and raise an exception if a variable could not be found in 
+the scope.
+
+Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502)
+for more details.
+
+#### Green Threads
+
+Golang utilizes `green threads`, which is a mechanism for the runtime library to
+manage multiple threads (instead of natively by the OS).  Green threads usually
+allow for faster thread creation and switching, as there is less overhead
+when spawning these threads.  For the first version of CSP, we only support
+OS threads.
+
+
+#### Backward Propagation:
+
+go_op currently does not support backward propagation.  Please use go_op with
+non-training operators.
diff --git a/doc/fluid/design/concurrent/images/channel_recv.png b/doc/fluid/design/concurrent/images/channel_recv.png
new file mode 100644
index 0000000000000000000000000000000000000000..c06cd15ae7b8a8c94d5742f6675e389081fcf789
Binary files /dev/null and b/doc/fluid/design/concurrent/images/channel_recv.png differ
diff --git a/doc/fluid/design/concurrent/images/channel_send.png b/doc/fluid/design/concurrent/images/channel_send.png
new file mode 100644
index 0000000000000000000000000000000000000000..006ebb4a5a4bcd32c97847e9fb7729a740255f7c
Binary files /dev/null and b/doc/fluid/design/concurrent/images/channel_send.png differ
diff --git a/doc/fluid/design/concurrent/images/select_op_workflow.png b/doc/fluid/design/concurrent/images/select_op_workflow.png
new file mode 100644
index 0000000000000000000000000000000000000000..719ed76f9d542d6c4f20c30f27656bb53325aa85
Binary files /dev/null and b/doc/fluid/design/concurrent/images/select_op_workflow.png differ
diff --git a/doc/fluid/design/concurrent/index_cn.rst b/doc/fluid/design/concurrent/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e47135e9fc42760898083710e0a6767252a0225b
--- /dev/null
+++ b/doc/fluid/design/concurrent/index_cn.rst
@@ -0,0 +1,8 @@
+并发编程
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  concurrent_programming.md
+  parallel_do.md
diff --git a/doc/fluid/design/concurrent/index_en.rst b/doc/fluid/design/concurrent/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0727e75798b2a869588f80d3cce7a886554e4ffb
--- /dev/null
+++ b/doc/fluid/design/concurrent/index_en.rst
@@ -0,0 +1,8 @@
+Concurrent Programming
+-------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  concurrent_programming.md
+  parallel_do.md
diff --git a/doc/fluid/design/concurrent/parallel_do.md b/doc/fluid/design/concurrent/parallel_do.md
new file mode 100644
index 0000000000000000000000000000000000000000..42bd136f825986d94fafaeaa5f58edb02848a74c
--- /dev/null
+++ b/doc/fluid/design/concurrent/parallel_do.md
@@ -0,0 +1,163 @@
+# Design Doc: Parallel_Do in PaddlePaddle
+
+In PaddlePaddle, we use parallel_do primitive to represent multithread data parallel processing.
+
+## Design overview
+
+The definition of a parallel_do op looks like the following
+
+```c++
+AddInput(kInputs, "Inputs needed to be split onto different devices").AsDuplicable();
+AddInput(kParameters, "Parameters are duplicated over different devices")
+    .AsDuplicable();
+AddInput(kPlaces, "Devices used for parallel processing");
+AddOutput(kOutputs, "Outputs needed to be merged from different devices").AsDuplicable();
+AddOutput(kParallelScopes,
+          "Scopes for all local variables in forward pass. One scope for each device");
+AddAttr<framework::BlockDesc *>(kParallelBlock,
+                                "List of operaters to be executed in parallel");
+```
+
+A vanilla implementation of parallel_do can be shown as the following (`|` means single thread and
+`||||` means multiple threads)
+
+```
+In the forward pass
+  |      Split input onto different devices
+  |      Copy parameter onto different devices
+  ||||   Compute forward pass in parallel
+  |      Merge output from different devices
+
+In the backward pass
+  |      Split output@grad onto different devices
+  ||||   Compute backward pass in parallel
+  |      accumulate param@grad from different devices to the first device
+  |      Merge input@grad from different devices
+  |      Copy param@grad to the place of parallel_do_op
+```
+
+This implementation allows us to write a mixed-device program like this
+
+```python
+W1 = fluid.tensor(size=[100,20], parameter=true)
+W2 = fluid.tensor(size=[20,15], parameter=true)
+
+data = layers.data()
+
+gpu_places = layers.get_place(use_gpu=True)
+# parallel processing on multiple GPUs
+pd = ParallelDo(gpu_places)
+with pd.do(input=data):
+    prediction = softmax(fc(fc(data, W1), W2))
+    write_output(prediction)
+prediction = pd()
+loss = cross_entropy(prediction, label)
+```
+
+And the ProgramDesc looks like the following
+
+```
+# start_program will be run by executor(CPUPlace), all w1, w2 will be allocated on CPU
+start_program
+{
+  vars: w1, w2
+  ops: init(w1), init(w2)
+}
+
+main_program
+{
+block0 {
+  vars: data, places, w1, w2, w1_grad, w2_grad,
+  ops: data, get_place, parallel_do(block1),
+       parallel_do_grad(block2),
+       sgd(w2, w2_grad),
+       sgd(w1, w1_grad)
+}
+block1 { # the forward pass
+  parent_block: 0
+  vars: data, h1, h2, loss
+  ops: fc, fc, softmax
+}
+block2 { # the backward pass
+  parent_block: 1
+  vars: data_grad, h1_grad, h2_grad, loss_grad, local_w1_grad, local_w2_grad
+  ops: softmax_grad,
+       fc_grad
+       fc_grad
+}
+}
+```
+
+## Performance Improvement
+
+There are several places where we can make this parallel_do faster.
+
+### forward: split input onto different devices
+
+If the input of the parallel_do is independent of any prior operators, we can avoid this step by 
+prefetching the input onto different devices in a separate background thread. The Python code
+looks like this.
+```python
+pd = ParallelDo(gpu_places)
+with pd.do():
+    feature = get_data_from_prefetch_queue(gpu_places)
+    prediction = my_net(feature)
+    write_output(prediction)
+```
+
+### forward: Copy parameter onto different devices
+
+We can avoid this step by making each device have a copy of the parameter. This requires:
+
+1. `fluid.default_start_up_program()` to be run on all devices
+1. In the backward, allreduce param@grad at different devices, this requires
+    1. `backward.py` add `allreduce` operators at parallel_do_grad
+    1. `allreduce` operators need to be called in async mode to achieve maximum throughput
+1. apply gradient-related ops (e.g. clipping, normalization, decay, sgd) on different devices in parallel
+
+By doing so, we also avoided "backward: accumulate param@grad from different devices to the first device".
+And the ProgramDesc looks like the following
+
+```
+# w1, w2 will be allocated on all GPUs
+start_program
+{
+block0 {
+  parallel_do(block1)
+}
+block1 {
+  parent_block: 0
+  vars: w1, w2
+  ops: init(w1), init(w2)
+}
+}
+
+main_program
+{
+block0 {
+  vars: data, places, w1, w2
+  ops: data, get_place, parallel_do(block1),
+       parallel_do_grad(block2),      # append_backward
+       parallel_do(block3)            # append_optimization
+       
+}
+block1 {
+  parent_block: 0
+  vars: data, h1, h2, loss
+  ops: fc, fc, softmax
+}
+block2 {
+  parent_block: 1
+  vars: data_grad, h1_grad, h2_grad, loss_grad, w1_grad, w2_grad
+  ops: softmax_grad,
+       fc_grad, allreduce(places, scopes, w1_grad),
+       fc_grad, allreduce(places, scopes, w2_grad)
+}
+block3 {
+  parent_block: 0
+  vars: lr
+  ops: sgd(w2, w2_grad),
+       sgd(w1, w1_grad)
+}
+}
+```
diff --git a/doc/fluid/design/concurrent/select_op.md b/doc/fluid/design/concurrent/select_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..4fcae57cc7932cdaebe549486e7f7cebf0bd038a
--- /dev/null
+++ b/doc/fluid/design/concurrent/select_op.md
@@ -0,0 +1,265 @@
+# select_op Design
+
+## Introduction
+
+In golang, the [**select**](https://golang.org/ref/spec#Select_statements)
+statement lets a goroutine wait on multiple communication operations at the
+same time. The **select** blocks until one of its cases can run, then
+executes the case.  If multiple cases are ready to run, then one case is
+chosen at random to be executed.
+
+With the introduction of CSP for Paddle, we mimic this behavior by
+creating a ***select_op***.
+
+## How to use it
+
+The **select_op** is available as a c++ operator.  However most users
+will prefer to use the much simpler Python API.
+
+- **fluid.Select()**: Creates a select operator and adds it to the current
+block within the main program.  Also creates a sub block and adds it to the
+main program.  This sub block is used to hold all variables and operators
+used by the case statements.
+
+Within the select block, users can add cases by
+calling **select.case** or **select.default** method.
+
+- **fluid.Select.case(channel_action, channel, result_variable)**: Represents
+a fluid channel send/recv case.  This method creates a SelectCase block
+guard and adds it to the Select block.  The arguments to this method tell
+the select which channel operation to listen to.
+
+- **fluid.Select.default()**: Represents the fluid default case.  This default
+case is executed if none of the channel send/recv cases are available to
+execute.
+
+**Example:**
+```
+ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
+
+while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
+while_op = While(cond=while_cond)    
+
+with while_op.block():
+    with fluid.Select() as select:
+        with select.case(fluid.channel_send, ch1, x):
+            # Send x, then perform Fibonacci calculation on x and y
+            x_tmp = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+            assign(input=x, output=x_tmp)
+            assign(input=y, output=x)
+            assign(elementwise_add(x=x_tmp, y=y), output=y)
+        with select.case(fluid.channel_recv, quit_ch, result2):
+            # Exit out of While loop
+            while_false = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False)
+            helper = layer_helper.LayerHelper('assign')
+            helper.append_op(
+                type='assign',
+                inputs={'X': [while_false]},
+                outputs={'Out': [while_cond]})
+```
+
+## How it Works
+
+### Program Description
+
+```
+blocks {
+  idx: 0
+  ...
+  // Create "case_to_execute" variable
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_110.tmp_0"
+    }
+    type: "fill_constant"
+    attrs {
+      name: "force_cpu"
+      type: BOOLEAN
+      b: false
+    }
+    attrs {
+      name: "value"
+      type: FLOAT
+      f: -1.0
+    }
+    attrs {
+      name: "shape"
+      type: INTS
+      ints: 1
+    }
+    attrs {
+      name: "dtype"
+      type: INT
+      i: 2
+    }
+  }
+  // Create "select" operator.
+  // inputs:
+  //   X: All input variables used by operators within the select block
+  //   case_to_execute: Variable filled in by select_op when it determines
+  //     which case to execute.
+  //  
+  // outputs:
+  //   Out: All output variables referenced by operators within select block.
+  //
+  // attrs:
+  //   sub_block: The block id containing the select "cases"
+  //   cases:  Serialized list of all cases in the select op.
+  //     Each case is serialized as: '<index>,<type>,<channel>,<value>'
+  //     where type is 0 for default, 1 for send, and 2 for receive.
+  //     No channel and values are needed for default cases.
+  ops {
+    inputs {
+      parameter: "X"
+      arguments: "fill_constant_103.tmp_0"
+      arguments: "fill_constant_104.tmp_0"
+    }
+    inputs {
+      parameter: "case_to_execute"
+      arguments: "fill_constant_110.tmp_0"
+    }
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_110.tmp_0"
+    }    
+    type: "select"
+    attrs {
+      name: "sub_block"
+      type: BLOCK
+      block_idx: 1
+    }
+    attrs {
+      name: "cases"
+      type: STRINGS
+      strings: "0,1,channel_101,fill_constant_109.tmp_0"
+      strings: "1,2,channel_102,fill_constant_108.tmp_0"
+    }
+  }
+  ...
+}
+```
+
+The python select API will add the **select_op** to the current block.  In addition, it will
+iterate through all its case statements and add any input variables required by case statements
+into **X**.  It will also create a temp variable called **case_to_execute**.  This variable is
+filled in by the select_op after it has completed processing the case statements.
+
+If there are no available cases to execute (ie: all cases are blocked on channel operations, and
+there is no default statement), then the select_op will block the current thread.  The thread will
+unblock once there is a channel operation affecting one of the case statements, at which point, the
+**select_op** will set the **case_to_execute** variable to the index of the case to execute.
+
+Finally the select_op will call executor.run on the **sub_block**.
+
+```
+blocks {
+  idx: 1
+  parent_idx: 0
+  ...
+  // Fill a tensor with the case index (ie: 0,1,2,3,etc.)
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_111.tmp_0"
+    }
+    type: "fill_constant"
+    attrs {
+      name: "force_cpu"
+      type: BOOLEAN
+      b: false
+    }
+    attrs {
+      name: "value"
+      type: FLOAT
+      f: 0.0
+    }
+    attrs {
+      name: "shape"
+      type: INTS
+      ints: 1
+    }
+    attrs {
+      name: "dtype"
+      type: INT
+      i: 2
+    }
+  }
+  // Create an "equal" operator to compare the case index with the "case_to_execute"
+  // tensor (which was filled in by the select op).
+  ops {
+    inputs {
+      parameter: "X"
+      arguments: "fill_constant_111.tmp_0"  // case 0
+    }
+    inputs {
+      parameter: "Y"
+      arguments: "fill_constant_110.tmp_0"  // case_to_execute
+    }
+    outputs {
+      parameter: "Out"
+      arguments: "equal_0.tmp_0"
+    }
+    type: "equal"
+    attrs {
+      name: "axis"
+      type: INT
+      i: -1
+    }
+  }
+  // Use the output of the "equal" operator as a condition for the "conditional_block".
+  // If the condition evaluates to true, then execute the "sub_block" (which represents
+  // the select case's body)
+  ops {
+    inputs {
+      parameter: "Params"
+    }
+    inputs {
+      parameter: "X"
+      arguments: "equal_0.tmp_0"
+    }
+    outputs {
+      parameter: "Out"
+    }
+    outputs {
+      parameter: "Scope"
+      arguments: "_generated_var_0"
+    }
+    type: "conditional_block"
+    attrs {
+      name: "is_scalar_condition"
+      type: BOOLEAN
+      b: true
+    }
+    attrs {
+      name: "sub_block"
+      type: BLOCK
+      block_idx: 4
+    }
+  }
+  ...
+  // Repeat the above operators for each case statements inside the select body
+}
+
+```
+
+Cases are represented by a **conditional_block operator**, whose condition is set as the output of
+equal(**case_to_execute**, **case_index**).  Since each case index is unique in this sub-block,
+only one case will be executed.
+
+### select_op flow
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/select_op_workflow.png"/><br/>
+</p>
+
+The select algorithm is inspired by golang's select routine.  Please refer to
+http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information.
+
+## Backward Pass
+
+TODO
diff --git a/doc/fluid/design/data_type/float16.md b/doc/fluid/design/data_type/float16.md
new file mode 100644
index 0000000000000000000000000000000000000000..844d2aafcf257b85057e1ac200ed3d5cf0be2ff0
--- /dev/null
+++ b/doc/fluid/design/data_type/float16.md
@@ -0,0 +1,183 @@
+# Design Doc: float16
+
+## Why float16
+Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of traditional 32-bit single precision format (float) and has lower precision and smaller range. 
+
+When high precision computation is not required (which is usually the case at least in the deep learning inference stage), using float16 data type could potentially 
+
+- reduce storage space, memory bandwidth, and power usages; 
+- increase the chance of data fitting into a smaller cache of lower latency; 
+- provide arithmetic speed up if supported by hardware. 
+
+## Survey of current float16 support
+A brief survey of float16 support on different compilers, hardware, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info.
+
+The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernels. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier. 
+
+### Compiler
+- nvcc supports `__half` data type after CUDA 7.5.
+- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4.
+- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9.
+
+### Hardware
+- `__half` is supported on GPU with compute capability >= 5.3.
+- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above.
+- `__fp16` is supported as arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which is announced in May 2017. There seems to be no application processors currently available on market that adopts this architecture. It is reported that Qualcomm Snapdragon 845 uses Cortex-A75 design and will be available in mobile devices in early 2018).
+
+### Libraries
+- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
+- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
+
+### CUDA version issue
+There are currently three versions of CUDA that support the `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. 
+CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
+```
+typedef struct __align__(2) {
+   unsigned short x;
+} __half;
+
+typedef __half half;
+```
+This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
+```
+__global__ void Add() {
+  half a, b, c;
+  c = __hadd(a, b); // correct
+  c = a + b; // compiler error: no operator "+" matches these operands
+}
+```
+CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
+
+Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
+```
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __half_raw;
+
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    unsigned short __x;
+public:
+    // constructors and conversion operators from/to 
+    // __half_raw and other built-in data types
+}
+
+typedef __half half;
+
+__device__ __forceinline__ 
+__half operator+(const __half &lh, const __half &rh) { 
+    return __hadd(lh, rh); 
+}
+
+// Other overloaded operators
+``` 
+This new design makes `c = a + b` work correctly for CUDA half data type. 
+
+## Implementation
+The float16 class holds a 16-bit `uint16_t` data internally.
+```
+struct float16 {
+  uint16_t x;
+};
+``` 
+
+float16 supports the following features:
+  - constructors / assignment operators that take input from primitive data types including bool, integers of various length, float, and double. 
+  - constructors / assignment operators that take input from `__half` on cuda, `float16_t` on ARM, and `Eigen::half` on Eigen.
+  - conversion operators to primitive data types and half precision data types on cuda, ARM and Eigen. 
+  - overloaded arithmetic operators for cuda, arm, and non-arm cpu, respectively. These operators will take advantage of the cuda and ARM intrinsics on the corresponding hardware. 
+  
+To support the above features, two fundamental conversion functions are provided:
+```
+float16 float_to_half_rn(float f);  // convert to half precision in round-to-nearest-even mode
+float half_to_float(float16 h);
+```
+which provide one-to-one conversion between float32 and float16. These two functions will use different conversion routines based on the current hardware. CUDA/ARM intrinsics will be used when the corresponding hardware is available. If the hardware or compiler level does not support float32 to float16 conversion, software emulation will be performed to do the conversion.
+
+## float16 inference
+In Fluid, a neural network is represented as a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), whose Python wrapper is a [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program). The basic structure of a program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program desc by executing the sequence of operators in the entrance block of the program one by one.  
+
+### Operator level requirement
+Each operator has many kernels for different data types, devices, and library types. The operator will select the appropriate kernel to run based on, among other things, the data type of the input variables. By default, every Fluid operator has a float data type kernel that takes float variables as input and generates float output. 
+
+This means that if we provide float input to the first operator in a program, then each operator will use the float kernel to compute float output and send it as input to the next operator to trigger the float kernel. Overall, the program will run in float mode and give us a final output of float data type.
+
+The same principle applies if we want a program to run in float16 mode. We provide input variable of float16 data type to the first operator, and then one by one, each operator in the program will run the float16 kernel (provided that each operator in this program has float16 kernels registered) until we finally obtain a float16 output variable.
+
+So the preliminary requirement for float16 inference is to add float16 kernel to operators that are needed in a specific kind of program. For example, float16 inference on an image classification neural network like Vgg or Resnet, typically requires the following operators to have float16 kernels: convolution, pooling, multiplication, addition, batch norm, dropout, relu, and softmax. Please refer to [new_op_en](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) for details of how to add new kernels to an operator.
+
+### Variable level requirement
+Operators including convolution and multiplication (used in fully-connected layers) take as input not only the variables generated by the preceding operators but also [parameter](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#parameter) variables, which contain the trained weights to apply to the input data. These weights are obtained in the Fluid training process and are by default of float data type.
+
+When these operators are running in float16 mode, the float16 kernel requires those parameter variables to contain weights of Fluid float16 data type. Thus, we need a convenient way to convert the original float weights to float16 weights. 
+
+In Fluid, we use tensor to hold actual data for a variable on the c++ end. [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h) is used to bind c++ tensors of certain data type with numpy array of the corresponding numpy data type on the Python end. Each common c++ built-in data type has a corresponding numpy data type of the same name. However, since there is no built-in float16 type in c++, we cannot directly bind numpy float16 data type with the Fluid float16 class. Since both Fluid float16 and numpy float16 use uint16 as the internal data storage type, we use c++ built-in type `uint16_t` and the corresponding numpy uint16 data type to bridge the gap via [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h). 
+
+The following code demonstrates how to do the tensor conversion.
+```Python
+# var is the variable of float weights
+# tensor is a numpy array of data copied from the tensor data in var 
+# fp16_var is the variable that will contain float16 weights converted from var  
+tensor = numpy.array(var.get_tensor())
+fp16_tensor = fp16_var.get_tensor()
+
+# After the original tensor data is converted to numpy float16 data type, 
+# view(numpy.uint16) is used so that the internal memory of the numpy array 
+# will be reinterpreted to be of uint16 data type, which is bound to 
+# Fluid float16 class via pybind with the help of uint16_t built-in c++ type
+fp16_tensor.set(tensor.astype(numpy.float16).view(numpy.uint16), GPUPlace)  
+```
+
+### Consistent API requirement
+The basic inference in float16 mode requires users to feed input and obtain output both of float16 data type. However, in this way, the inference APIs are not consistent between float16 mode and float mode, and users may find it confusing and difficult to use float16 inference since they need to do extra steps to provide float16 input data and convert float16 output data back to float. To have consistent API for different inference modes, we need to transpile the program desc in some way so that we can run float16 inference by feeding and fetching variables of float data type.
+
+This problem can be solved by introducing a type-casting operator which takes an input variable of a certain data type, casts it to another specified data type, and puts the cast data into the output variable. Inserting cast operators where needed makes a program internally run in float16 mode.   
+
+### float16 transpiler
+Keeping all the above requirements in mind, we designed a float16 inference transpiler that can transpile a float32 mode inference program desc to a float16 mode one.
+
+Given a float inference program and the corresponding variables of float32 weights in the [scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md),
+this transpiler mainly does the following modifications:
+
+1. Insert cast operators at the beginning of the program so that the input float data will be converted to float16 data type before feeding to subsequent operators to invoke the float16 kernel. 
+
+2. Insert cast operators at the end of the program so that the output float16 data will be converted back to float data type before users obtain the result.
+
+3. For each parameter variable of float weights, create in the scope a corresponding variable of float16 weights which are converted from the corresponding float weights and add this new float16 variable to the program.
+
+4. Update the operator information in the program so that each relevant operator use the newly created float16 variable instead of its float counterpart.
+
+Below is an example of usage:
+```Python
+# Get the float inference program
+[float_inference_program, feed_target_names,
+ fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+# Prepare the float input data
+tensor_img = numpy.random.rand(1, 3, 32, 32).astype(numpy.float32)
+
+# Running inference_program in float mode
+float_results = exe.run(float_inference_program,
+                        feed={feed_target_names[0]: tensor_img},
+                        fetch_list=fetch_targets)
+
+# Use float16 transpiler to speedup
+float16_inference_program = float_inference_program.clone()
+t = fluid.InferenceTranspiler()
+t.float16_transpile(float16_inference_program, GPUPlace)
+
+# Running 
+float16_results = exe.run(float16_inference_program,
+                          feed={feed_target_names[0]: tensor_img},
+                          fetch_list=fetch_targets)
+```
+
+As we can see from the example above, users can simply use the `float16_transpile` method provided by the inference transpiler class on an existing float inference program to run inference in float16 mode.
+
+### Speedup on GPU
+Currently, Fluid inference in float16 mode is only supported on Nvidia GPU device. There is no motivation to support float16 inference on non-ARM CPUs because float16 is not natively supported there and float16 calculation will only be slower than its float counterpart. 
+
+Nvidia started to support its native float16 data type (which has the same internal memory representation as Fluid float16 class) on CUDA 7.5. Moreover, float16 speedups on common computational intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cublas 7.5 and cuDNN 5.0.
+
+Recently, the introduction of [tensor core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in volta architecture GPUs and the support of tensor core calculation in CUDA 9.0 and cuDNN 7.0 make float16 truly superior to float in certain deep learning applications. Please refer to this [benchmark report](https://github.com/kexinzhao/Paddle_benchmark/blob/master/float16_benchmark.md) for more details.
diff --git a/doc/fluid/design/data_type/index_cn.rst b/doc/fluid/design/data_type/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b60167b6b1599df69dfc5073ebf32bdbb0a316ec
--- /dev/null
+++ b/doc/fluid/design/data_type/index_cn.rst
@@ -0,0 +1,7 @@
+数据类型
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  float16.md
diff --git a/doc/fluid/design/data_type/index_en.rst b/doc/fluid/design/data_type/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6a88d17943f49134a2d00363845e919537ff4545
--- /dev/null
+++ b/doc/fluid/design/data_type/index_en.rst
@@ -0,0 +1,7 @@
+Data Type
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  float16.md
diff --git a/doc/fluid/design/dist_train/README.md b/doc/fluid/design/dist_train/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2dd652d8bdcb8f3b6e759347bd55b217be909386
--- /dev/null
+++ b/doc/fluid/design/dist_train/README.md
@@ -0,0 +1,57 @@
+## Distributed training overview doc
+
+Currently Paddle Fluid uses a parameter server architecture to support distributed training.
+
+For synchronous and asynchronous training, the differences are mostly in the logic of the parameter server. Synchronous training is already supported.
+
+### Synchronous training
+
+The training process of synchronous training is:
+
+![synchronous distributed training](./src/sync_distributed_training.png)
+
+1. Pserver
+	1. set `barrier_condition_` to 0 and waits for trainers to send gradient.
+1. Trainer
+	1. Trainer read minibatch of data, run forward-backward with local parameter copy and get the gradients for parameters.
+	1. Trainer use split op to split all the gradient into blocks. The split method is determined at compile time.
+	1. Trainer use send_op to send all the split gradients to corresponding parameter server.
+	1. After trainer send all the gradients, it will send a `BATCH_BARRIER_MESSAGE` to all pservers.
+	1. Trainer call GetVariable to pserver and wait for `barrier_condition_` on pserver to be 1.
+1. Pserver
+   1. Pserver will count the number of `BATCH_BARRIER_MESSAGE`.
+	1. When the count of `BATCH_BARRIER_MESSAGE` is equal to the number of trainers, Pserver considers that it has received all gradients from all trainers.
+	1. Pserver will run the optimization block to optimize the parameters.
+	1. After optimization, pserver set `barrier_condition_` to 1.
+	1. Pserver wait for `FETCH_BARRIER_MESSAGE`.
+1. Trainer.
+	1. The trainer uses GetVariable to get all the parameters from pserver.
+	1. Trainer sends a `FETCH_BARRIER_MESSAGE` to each pserver.
+1. Pserver.
+	1. When the number of `FETCH_BARRIER_MESSAGE` reaches the number of trainers, Pserver considers that all the parameters have been fetched. It will go back to step 1 to set `barrier_condition_` to 0.
+
+### Asynchronous training
+In the above process. There are two barriers for all trainers to synchronize with each other. In asynchronous training, these two barriers are not needed. The trainer can just send gradients to pserver and then get parameters back.
+
+The training process of asynchronous training can be:
+
+![asynchronous distributed training](./src/async_distributed_training.png)
+
+1. Pserver:
+	1. Each parameter has a queue to receive its gradient from trainers.
+	1. Each parameter has a thread to read data from the queue and run optimize block, using the gradient to optimize the parameter.
+	1. Using an independent thread to handle RPC call `GetVariable` for trainers to get parameters back.(Maybe here we should use a thread pool to speed up fetching the parameters.)
+
+1. Trainer:
+	1. Trainer read a batch of data. Run forward and backward with local parameter copy and get the gradients for parameters.
+	1. Trainer split all gradients to blocks and then send these gradient blocks to pservers(pserver will put them into the queue).
+	2. Trainer gets all parameters back from pserver.
+
+### Note:
+There are also some conditions that need to be considered. For example:
+
+1. If trainer needs to wait for the pserver to apply its gradient and then get the parameters back.
+1. If we need a lock between parameter update and parameter fetch.
+1. If one parameter must be on one server, or it can also be split and send to multiple parameter servers.
+
+The above architecture of asynchronous training can support different mode, we can have a detailed test in the future for these problems.
diff --git a/doc/fluid/design/dist_train/async_update.md b/doc/fluid/design/dist_train/async_update.md
new file mode 100644
index 0000000000000000000000000000000000000000..248d2ec18dafdecac9184527638754b6ba4d85b8
--- /dev/null
+++ b/doc/fluid/design/dist_train/async_update.md
@@ -0,0 +1,61 @@
+# Design Doc: Asynchronous Update With Distributed Training
+
+## Background
+
+For the typical synchronous distributed training, some significant steps are as follows:
+
+1. A trainer process will compute the gradients and **send** them to the parameter server (PS) nodes.
+1. After the PS node has received the gradients from all the Trainers, it will aggregate the
+gradient variables for the same parameter into one gradient variable and then apply the aggregated
+gradient to the respective parameter, finally using an optimization algorithm (SGD, Momentum...)
+to update the parameters.
+1. The Trainer waits until the PS has finished the optimize stage, and then GETs the parameters from the PS,
+so all the Trainers get the same parameters.
+
+In Synchronous Distributed Training, there is a **barrier** on each PS to wait until all trainers processes
+have completed running current mini-batch. After that, all trainers can continue to run the next
+mini-batch. So, we can find that the overall performance of Synchronous Distributed Training depends 
+on the slowest node.
+
+In Asynchronous Distributed Training, we don't need to wait for a global mini-batch, the optimizer on
+the PS will run immediately when the gradient is uploaded to the PS from one trainer. This mode can
+train models with better scalability and higher throughput. In this design doc, we will introduce how to
+implement the Asynchronous Distributed Training based on PaddlePaddle Fluid.
+
+## Design
+
+<img src="./src/async_update.png" width="600"/>
+
+As the figure above, we describe a global view of the asynchronous update process and use
+the parameter `w1` as an example to introduce the steps:
+1. For each gradient variable, its copies may be distributed across different GPU cards; aggregate
+them once they are all calculated.
+1. Split the gradient variable into multiple blocks according to the number of PS
+instances and then send them.
+1. PS would run an `Optimize Block` using a specified optimize algorithm to update
+the specified parameter.
+1. The trainer will fetch the latest parameter from PS before running forward Op which depends
+on the specified parameter.
+1. Broadcast the received variable into multiple GPU cards and continue to run the next
+mini-batch.
+
+### Trainer
+
+- For the multiple devices distributed training, we need to aggregate the gradient
+variables which are placed on different devices first and then schedule a `SendVars` Operator to
+send the gradient variables to the multiple PS instances.
+- Schedule `FetchVars` operator to fetch the latest parameter from PS before running
+the forward ops.
+- There could be a large number of gradient variables to be sent, so we need to use another
+thread pool (IO Threadpool) whose number of schedulable threads is larger than that of the
+computing thread pool to avoid competing for thread resources with computing.
+
+### Parameter Server
+
+<img src="./src/async_pserver.png" width="750"/>
+
+- There may be multiple trainer instances that want to optimize the same parameter at
+the same time; to avoid races, we need one `BlockingQueue` for each gradient
+variable to process them one by one.
+- We need a `Map` structure to map a gradient variable name to the `OptimizeBlock` which
+can optimize the respective parameter.
diff --git a/doc/fluid/design/dist_train/dist_train_nccl2.md b/doc/fluid/design/dist_train/dist_train_nccl2.md
new file mode 100644
index 0000000000000000000000000000000000000000..b8b8427811cddcddf872db5badfd37c96a76c3e3
--- /dev/null
+++ b/doc/fluid/design/dist_train/dist_train_nccl2.md
@@ -0,0 +1,35 @@
+# Distributed Training with NCCL2
+
+We design a pattern that can enable training with `ParallelExecutor` and
+use [NCCL2](https://developer.nvidia.com/nccl) as its collective
+communication library.
+
+In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast`
+to do multi GPU training. And if we initialize NCCL2 communicators as
+ranks in a distributed environment, we can simply run the `ParallelExecutor`
+as a distributed program! The only thing that may be different than in
+the single node version is that we need to broadcast the NCCL unique ID
+to all the nodes and initialize communicators using that ID, so NCCL2
+can know each other as ranks.
+
+To achieve this feature, we introduce a new operator: `gen_nccl_id` op,
+so we are ***not*** "bound to" running NCCL2 with MPI, we can run it in
+whatever platform you like.
+
+It has two running modes:
+
+1. Generate and broadcast mode, which should be used on trainer 0;
+1. Listen and fetch mode, which should be used on trainers other than 0.
+
+In both modes, this op can save the NCCL ID into current scope as a
+persistable variable. Then we can insert this op at the end of
+"startup program" of fluid, so that all workers can get the same ID to
+initialize NCCL communicator objects.
+
+<img src="src/ncc2_design.png">
+
+The above figure indicates the general process when training with NCCL2
+distributed. Each trainer has the number of communicators equal to the
+number of GPUs, but the ranks should match the global ranks number: here
+we have total 8 GPUs, so `nranks==8`, for each trainer, the ranks should
+be from 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1.
diff --git a/doc/fluid/design/dist_train/distributed_architecture.md b/doc/fluid/design/dist_train/distributed_architecture.md
new file mode 100644
index 0000000000000000000000000000000000000000..371bbeebf7559eccc77ba0eea4f6f87a1bc5b54a
--- /dev/null
+++ b/doc/fluid/design/dist_train/distributed_architecture.md
@@ -0,0 +1,197 @@
+# Design Doc: Fluid Distributed Training Architecture
+
+## Abstract
+
+PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has few limitations:
+
+1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc.
+
+2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers.
+
+3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
+
+This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
+
+## Analysis
+
+The assumption is that the user writes the trainer program in either Python or C++.
+
+### Limitation 1
+
+There are two basic functionalities in the trainer program:
+
+1. The training logic such as loading / saving the model and printing out the logs.
+2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
+  optimizer.
+
+When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the
+training logic as well as the neural network computation logic, is replicated.
+
+The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
+replicate the training logic, the limitation mentioned above can be avoided.
+
+### Limitation 2
+
+Model parallelism means that a single model is partitioned into different components and each node runs one of the component separately. This comes at the extra cost of managing the
+inter-model-shard communication between nodes.
+
+PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
+computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
+
+Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/compiler.png"/>
+
+PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/paddle-compile.png"/>
+
+The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
+
+### Limitation 3
+
+The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
+
+This could be fixed by making the parameter server also run an IR, which can be different from the one on the trainer side.
+For a detailed explanation, refer to this document -
+[Design Doc: Parameter Server](./parameter_server.md)
+
+## Distributed Training Architecture
+
+The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/distributed_architecture.png"/>
+
+The major components are: *Python API*, *Distributed Transpiler* and *Remote Executor*.
+
+### Python API
+
+Python API is the Python library that user's Python code invokes, to read the data, build the neural network topology, and start training, etc.
+
+```Python
+images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+...
+predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+optimizer = fluid.optimizer.Adam(learning_rate=0.01)
+optimizer.minimize(avg_cost)
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+for pass_id in range(10):
+    for data in train_reader():
+        loss, acc = exe.run(trainer_prog,
+                            feed=feeder.feed(data),
+                            fetch_list=[avg_cost])
+```
+
+The code above is a typical local training program, the "Training Program" is built using helper functions such as
+`fluid.layers.fc`. The training is done by calling `Executor.run`
+iteratively.
+
+For more details, the implementation of IR is [Program](../program.md), and `ProgramDesc` is the protobuf type.
+
+[Executor](../executor.md) simply runs the `ProgramDesc`. For local training you generally use
+`Executor` to run the program locally. For any kind of distributed training, you can use
+`RemoteExecutor` to specify desired distributed training method with some optional arguments.
+
+### Distributed Transpiler
+
+The Distributed Transpiler automatically converts the IR (in protobuf format) to partitioned IRs. Then
+the Remote Executor dispatches the new IRs to Remote Executors across the cluster.
+Below are the steps that are followed :
+
+1. The user only needs to change `Executor` to `RemoteExecutor` to change a local program into a distributed program.
+1. `RemoteExecutor` calls `Distributed Transpiler` to "transpile" user's program to several IRs representing a
+   distributed training program:
+   1. Parse configurations from `RemoteExecutor`.
+   1. Determine the type of distributed program, can be DataParallelism, ModelParallelism or Streaming.
+   1. Partition the `ProgramDesc` according to type and add `send` / `recv` OP pair on the boundaries. Take
+      DataParallelism type for example, it removes the optimization operators and add a `send` OP to the
+      "trainer" role, then add the optimization operators to the parameter server role within the `recv` OP.
+1. Dispatch the partitioned graph to different `RemoteExecutor` in the cluster.
+1. `RemoteExecutor` on each node runs the received `ProgramDesc` until the end.
+
+
+### RemoteExecutor
+
+As shown in the graph, `RemoteExecutor.run` sends the IR to the cluster for Execution.
+You can also use parameter `fetch_list` to interactively fetch variable back to local for
+log printing.
+
+The Python `RemoteExecutor` is derived from `Executor` class.
+
+```python
+exe = RemoteExecutor(
+    feed=feeder.feed(data),
+    fetch_list=[avg_cost],
+    job_desc=JobDesc(
+      jobname,
+      num_trainer,
+      num_pserver,
+      cpu_per_trainer,
+      gpu_per_trainer,
+      mem_per_trainer,
+      cpu_per_pserver,
+      mem_per_pserver
+    ))
+for data in train_reader():
+    loss, acc = exe.run(trainer_prog,
+                        feed=feeder.feed(data),
+                        fetch_list=[avg_cost])
+```
+
+`JobDesc` object describes the distributed job resource specification to run on
+Cluster environment.
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/remote_executor.png" width="500" align="center" />
+
+`RemoteExecutor.run` sends the `ProgramDesc` and
+[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
+to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
+to start the final Kubernetes Jobs to run the different role of `ProgramDesc` from `ConfigMap`.
+
+
+### Placement Algorithm
+
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
+
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
+
+
+### Local Training Architecture
+
+The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/local_architecture.png"/>
+
+
+### Training Data
+
+In PaddlePaddle v0.10.0, training data is typically read
+with [data reader](./README.md) from Python. This approach is
+no longer efficient when training distributedly since the Python
+process no longer runs on the same node with the trainer processes,
+the Python reader will need to read from the distributed filesystem
+(assuming it has the access) and send to the trainers, doubling the
+network traffic.
+
+When doing distributed training, the user can still use Python data
+reader: the training data are sent with `Executor.run`. However, this should
+be used for debugging purposes only. The users are encouraged to use
+the read data OPs.
+
+
+## References:
+
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
+
+[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf)
diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..e284e1ec5cdd18d0049ce3c1a8349bbe1248cb48
--- /dev/null
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@@ -0,0 +1,89 @@
+# Design Doc: Distributed Lookup Table Operator
+
+A distributed lookup table operator in PaddlePaddle where the table could be out
+of the memory of a computer.
+
+## Background
+
+A lookup table operator is well-used in deep learning for learning the
+representation, or the
+[*embedding*](http://www.cs.toronto.edu/~fritz/absps/ieee-lre.pdf), of
+symbols.
+
+### The Forward Algorithm
+
+The forward algorithm of the lookup table is a multiplication of the
+input vector x and the lookup table matrix W:
+
+$$y = x * W$$
+
+When x is a sparse vector of symbols, the above multiplication
+simplifies into looking up rows in W that correspond to symbols in x,
+denoted by W(x).  Please be aware that W could be huge and out of the
+memory, so we'd need a distributed storage service, which supports the
+lookup of rows.
+
+The following figure illustrates the multiplication of x with two
+non-zero elements, or say two symbols, and a lookup table W:
+
+![lookup table](./src/lookup_table.png)
+
+### The Backward Algorithm
+
+The backward algorithm computes W'(x) using W(x).  W'(x) has the same
+scale of size as W(x) and is much smaller than W.
+
+To optimize W given W', we can do simple SGD update:
+
+$$W = f(W') = \lambda * W'$$
+
+or some more sophisticated algorithms that rely on both W' and W:
+
+$$W = f(W, W')$$
+
+The following figure illustrates the backward pass of the lookup
+operator: ![lookup table training](./src/lookup_table_training.png)
+
+## Distributed Lookup Table
+### Problem 1: The lookup table may be very large.
+
+ In the condition like the search engine and recommendation system, the number of feature Id may be very large, say 100,000,000,000, then for a float value lookup table of size 8, the total size of the table is:
+
+ ```
+ 100,000,000,000 * 8 * 4(Bytes) = 2980.23 GB
+ ```
+
+### Solution: Distributed storage
+
+1. Paddle use [SelectedRows](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/selected_rows.md) as the storage format for the lookup table, the lookup table parameter will be split to multi-machine according to the hash of the feature ID, and data will also be split and send to the same machine to prefetch the parameter.
+
+1. For common parameters, the trainer will get the whole parameter for training, but for the big lookup table, the trainer can not store the whole parameter. Because the input data feature is very sparse, every time we only need a few parameters for training, so we use `prefetch_op` to only prefetch the parameter needed to trainer.
+
+### Problem 2: The Ids in the lookup table are not known before training.
+
+ The feature Id is calculated by the hash function because the feature data source is so large, we can not get all the Id before training. So we can not initialize the table before training.
+
+### Solution: Id auto growth
+
+At the beginning of training, paddle only allocates the memory for the lookup table at parameter server side, the Id and its value will not be initialized. During training, when a parameter server receives an Id, if it is already in the lookup table, it will return the existing parameter, if the Id does not exist, paddle will add it into the lookup table and initialize the value for it.
+
+### Problem 3: parameter load and save
+
+For common parameters, paddle uses the trainer to save and load them. But for the distributed lookup table, the trainer cannot do this because of its large size.
+
+### Solution: Parameter server side save and load
+
+Paddle support parameter server side save and load for distribute lookup table. Each machine of parameter servers will only save and load part of the whole table.
+
+## Architecture
+The whole architecture of the distribute lookup table is as below:
+
+### Training steps:
+1. Read a batch of data, the data is feature ids.
+1. The input ids will be split by `split_ids_op` with the same hash function of the lookup table.
+1. The `prefetch_op` use the split result to prefetch parameters back from the lookup table.
+1. Run forward-backward to get the gradient of the lookup table.
+1. `split_ids_op` split the gradient and then use `send_op` to the parameter server.
+1. parameter server update the table with the received gradient.
+
+![distribute lookup table](./src/distributed_lookup_table.jpeg)
diff --git a/doc/fluid/design/dist_train/distributed_traing_review.md b/doc/fluid/design/dist_train/distributed_traing_review.md
new file mode 100644
index 0000000000000000000000000000000000000000..c09b7c99159ace9b3df989f803ede20bc3585d92
--- /dev/null
+++ b/doc/fluid/design/dist_train/distributed_traing_review.md
@@ -0,0 +1,44 @@
+# Parallelism, Asynchronous,  Synchronous, Codistillation
+
+
+For valuable models, it’s worth using more hardware resources to reduce the training time and improve the final model quality. This doc discusses various solutions, their empirical results and some of the latest research.
+
+# Model Parallelism
+In some situations, larger and more complex models can improve the model quality. Sometimes, such models cannot fit in one device. Sometimes, parts of the model can be executed in parallel to improve speed. Model Parallelism address the issues by partitioning a single model and place the shards on several devices for execution.
+
+A common way of model parallelism is partition the logic of “gradient application” to parameter servers, while leaving the forward and backward computation at training servers.
+
+More flexible model parallelism is challenging. For example, multi-level-single-direction LSTM can be partitioned by layers, while such solution is not helpful for bi-directional LSTM. Different models can have quite different ways of partitioning and the benefits also depend on the underlying hardware. Framework needs to provide flexible APIs for user to define the customized partition scheme. For example, in TensorFlow, user can use tf.device() to specify the device placement. In MxNet, mx.AttrScope(ctx_group='dev1') does similar things. Recent research proposes to automatically find the optimal partition scheme with Reinforcement Learning, which is essentially solution space search algorithm that could cost a lot of extra hardware sources.
+
+# Data Parallelism
+Data Parallelism runs the same model on multiple devices, each taking in a partition of the input batch. It’s more commonly used for a few reasons. It generally applies to common SGD mini-batch training. Compared with model parallelism, which requires users to carefully partition their model and tune for good performance, data parallelism usually involves no more than calling an extra API and speed up is more predictable.
+
+# Asynchronous Training
+In asynchronous training, it usually involves a set of trainers and a set of parameter servers. The parameter servers collectively hold a single copy of shared parameters. While the trainers each holds a unique copy of model and trains the model independently. Each trainer pulls parameters from parameter servers and sends gradients to the parameter servers independently. Similarly the parameter servers applies the gradients to parameters as soon as the gradients are received and sends parameters whenever they are requested.
+
+In theory, asynchronous training is not safe and unstable. Each trainer is very likely using stale copy of parameters and parameters are also likely to apply stale gradients. However, in practice, especially for large-scale nonconvex optimization, it is effective [1]. Compared with synchronous solution, which will be discussed later, asynchronous distributed training is easier to implement and scales to a few dozen workers without losing much performance due to network communication or other overhead. Besides, asynchronous training can make progress even in case of random trainer failure in the cluster.
+
+Many production models, such as [3], are trained with distributed asynchronous solutions due to its scalability and effectiveness in practice. However, asynchronous training has its limitations. Usually, it’s not as stable as synchronous training. A warm-up phase is sometimes needed. Learning rate is usually smaller compared with synchronous training and decay is also often needed. Normally, asynchronous training doesn’t scale beyond 100 trainers. In other words, when putting more trainers beyond that, the model cannot converge faster.
+
+# Synchronous Training
+Unlike asynchronous training, synchronous training requires step barriers. Parameter servers needs to wait for gradients from all trainers before they are applied to parameters and trainers will always pull the latest parameters.
+
+An obvious advantage of synchronous training is that the behavior is more clearly defined. Usually, it's more stable than asynchronous training. Learning rate can be set larger and for some vision tasks, the final accuracy can be slightly higher. (In my practical experience, for some models, it can actually be worse).
+
+Synchronous training usually faces scalability and performance issues, if not carefully implemented or deployed. In [2], native synchronous training can be 20%~40% slower than asynchronous training. A common trick to avoid slowness, discussed in [1] and [2], is to have backups. N+M replicas are scheduled while only the first N are needed for the training step to proceed.
+
+Similar to asynchronous training, the benefit of synchronous training diminishes quickly. Depending on the models, increasing the number of trainers (effectively batch size) beyond a point won’t deliver faster convergence or better final model quality.
+
+# Codistillation
+Codistillation is a technique that tries to scale the training further. A few training instances (each training instance can be distributed) are performed during the same period. Each training instance has extra losses that come from the predictions of other training instances (like teacher and student). The training process converges faster and usually converges to a better model quality. [4]
+
+
+# Reference
+
+[1] Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Andrew Senior, Paul Tucker, Ke Yang, Quoc V Le, et al. Large scale distributed deep networks.
+
+[2] Jianmin Chen, Rajat Monga, Samy Bengio, and Rafal Jozefowicz. Revisiting distributed synchronous SGD.
+
+[3] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine translation system: Bridging the gap between human and machine translation.
+
+[4] LARGE SCALE DISTRIBUTED NEURAL NETWORK TRAINING THROUGH ONLINE DISTILLATION
diff --git a/doc/fluid/design/dist_train/index_cn.rst b/doc/fluid/design/dist_train/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed6f3dda271d2de58d92aa7ec804fa9e68dfc48a
--- /dev/null
+++ b/doc/fluid/design/dist_train/index_cn.rst
@@ -0,0 +1,9 @@
+分布式训练
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  distributed_architecture.md
+  distributed_lookup_table_design.md
+  parameter_server.md
diff --git a/doc/fluid/design/dist_train/index_en.rst b/doc/fluid/design/dist_train/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f84688f168021113bd933802709bcd787b474bca
--- /dev/null
+++ b/doc/fluid/design/dist_train/index_en.rst
@@ -0,0 +1,9 @@
+Distributed Training
+---------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  distributed_architecture.md
+  distributed_lookup_table_design.md
+  parameter_server.md
diff --git a/doc/fluid/design/dist_train/mpi_enabled_design.md b/doc/fluid/design/dist_train/mpi_enabled_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..4ad3afc7b7522c60460c6f1f387f9415d3738778
--- /dev/null
+++ b/doc/fluid/design/dist_train/mpi_enabled_design.md
@@ -0,0 +1,46 @@
+# MPI-enabled PaddlePaddle Design doc
+
+# Background
+When we do distributed multi-GPU training, the communication overhead between servers becomes the major bottleneck, for the following reasons:
+1. Must copy at least once from GPU to CPU memory so that the data can be ready to transfer. And for the pserver side, copy data from CPU to GPU introduce more overhead.
+2. GPU->CPU data transfer is 10 times slower than data transfer between GPUs or between PCIe devices.
+3. TCP connections can not make full use of RDMA 100Gb devices.
+
+We will use OpenMPI API to PaddlePaddle, which can bring two benefits to PaddlePaddle:
+1. Enable RDMA with PaddlePaddle, which brings high-performance low latency networks.
+2. Enable GPUDirect with PaddlePaddle, which brings the highest throughput and lowest latency GPU read and write.
+
+# Change list
+* Compile args: Need add compile args to enable MPI support.
+* Execute args:  Need add execute args to assign when and how to use MPI operations.
+* New ops:  Need new op  ```mpi_send_op``` and ```mpi_listenandserve_op``` to support MPI send and receive.
+* Transpiler optimized: Which can add   ```mpi_send_op``` and ```mpi_listenandserve_op```  to the running graph.
+* MPI utils package: Need MPI utils package as the low-level API supported.
+
+## Compile args
+Because MPI and CUDA need hardware support, we will add compile args to enable MPI support and control compiling. Add the ```WITH_MPI``` compile arg to control whether MPI is used. If ```WITH_MPI``` is ```ON```, the compile system will look for the openMPI code during configuration. We should prepare the openMPI environment before compiling.
+
+## Execute args
+Launch the script using the ```mpirun``` launcher, for example: ```mpirun -np 3 -hosts node1,node2,node3 python train.py```. By doing this, we can number the actors (trainer/pserver/master) with 0 .. (n-1). The node's number is the Rank of the calling process in a group of comm (integer). The MPI processes identify each other using a Rank ID. We have to create a mapping between PaddlePaddle's nodes and their Rank ID so that we can communicate with the correct destinations when using MPI operations.
+
+## New ops
+We won't replace all the gRPC requests with MPI requests: the standard gRPC library is used for all administrative operations and the MPI API will be used to transfer tensors or SelectedRows to Pservers. Based on this idea, we create two new operators to handle requests and receives; the two operators are ```mpi_send_op``` and ```mpi_listenandserve_op```. They are a little similar to [send_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/send_op.cc) and [listen_and_serv_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/listen_and_serv_op.cc). Also, we will build a new module to package the MPI send and receive process.
+
+### mpi_send_op
+Very similar with ```send_op```, we will replace gRPC code which used to send gradient with ```mpi_module```, at the same time, we will wrap it with ```framework::Async```.
+
+### mpi_listenandserve_op
+Very similar with ```listen_and_serv_op```, we will replace gRPC code which used to receive gradient with ```mpi_module```, at the same time, we will wrap it with ```framework::Async```.
+
+## Transpiler optimized
+**We can read the env variables ```OMPI_COMM_WORLD_SIZE``` and ```OMPI_COMM_WORLD_RANK``` to determine whether MPI is in use. If we use openMPI, these variables must exist in the environment.**
+ If MPI is confirmed to be in use, we will change ```send_op``` to ```mpi_send_op``` in distribute_transpiler, and change ```listenandserve_op``` to ```mpi_listenandserve_op``` as well.
+
+## MPI utils package
+In this package, We will write openMPI low-level API to use MPI.
+The API included in this package are:
+* MPI send and receive module: we will build a new module to package the MPI send and receive process. MPI send and receive are different from gRPC: the MPI [receive](https://www.open-mpi.org/doc/v1.8/man3/MPI_Irecv.3.php) must know the receive buffer size and the receive buffer element. For this reason, we have to communicate twice: the first time is to send metadata about the gradient through gRPC, the second is the real communication through MPI, which sends the gradient data to mpi_listenandserve_op.
+The detailed flow is below:
+![](https://github.com/seiriosPlus/Paddle/blob/mpi_enabled/doc/fluid/design/dist_train/src/mpi_module.png)
+* MPI global configurations, which store the Rank ID and the mapping in global variables, for example:
+gRPC client : MPI nodes :``` 127.0.0.1:32004 : 3 ```
diff --git a/doc/fluid/design/dist_train/multi_cpu.md b/doc/fluid/design/dist_train/multi_cpu.md
new file mode 100644
index 0000000000000000000000000000000000000000..38222d083084ebfca3099ce96b47868c42d55101
--- /dev/null
+++ b/doc/fluid/design/dist_train/multi_cpu.md
@@ -0,0 +1,43 @@
+# Design Doc: Execute the Program with Multi CPU
+
+## Abstract
+
+This design doc proposes an approach to run the user-defined Op graph
+on multiple CPUs: we will use an auto transpiler to convert the user-defined
+Op graph to a multi-CPU Op graph, and use the `ParallelDo` Op to run the graph.
+
+## Transpiler
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/single-thread@3x.png" width="300">
+
+After converting:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/multi-threads@3x.png" width="1000">
+
+## Implement
+
+- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph
+  which would be executed with multi-threads.
+- `BlockingCounter` will `Init/Decrement` an atomic counter, and block in `Wait`
+  until the atomic counter becomes `0`:
+  ```cpp
+  BlockingCounter bc(thread_count);
+  for (int i = 0; i < thread_count; ++i) {
+    thread_pool->Start([&bc] {bc.DecrementCount(); });
+  }
+  bc.Wait();
+  ```
+- `ParallelDo` Operator
+  - Initialize a thread pool which is a Singleton.
+  - Use a block id as the input, and run the specified Block in independent scopes
+    with multi-threads.
+  - Initialize a `BlockingCounter` instance and wait until all threads are done.
+- `Split` Operator will split the Input Tensor into a TensorArray.
+- `Merge` Operator will merge all the gradients calculated in different threads
+  with a `mean/sum/max/min...` method, and then run the Optimizer Op to optimize `W`.
+
+## TODO
+
+- Improve the optimizer stage with multi-threads, since we could
+  assign the parameters to the different threads and execute
+  optimizer with multi-threads.
diff --git a/doc/fluid/design/dist_train/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md
new file mode 100644
index 0000000000000000000000000000000000000000..563b70bc0e852bec953eb40dda3c46b3d45d7e68
--- /dev/null
+++ b/doc/fluid/design/dist_train/parameter_server.md
@@ -0,0 +1,106 @@
+# Design Doc: Parameter Server
+
+## Abstract
+
+We propose an approach to implement the parameter server. In this
+approach, there is no fundamental difference between the trainer and
+the parameter server: they both run subgraphs, but subgraphs of
+different purposes.
+
+## Background
+
+The previous implementations of the parameter server do not run a
+fluid sub-program. Parameter initialization, optimizer computation, network
+communication and checkpointing are implemented twice on both the
+trainer as well as the parameter server.
+
+It would be great if we could write the code once and use it on both the
+trainer and the parameter server, since this reduces code duplication and
+improves extensibility. Given that, after the current refactoring, we are
+representing everything as a computation graph on the
+trainer, representing everything as a computation graph on the parameter
+server becomes a natural extension.
+
+## Design
+
+### Distributed Transpiler
+
+The *Distributed Transpiler* converts the user-defined fluid program
+into sub-programs to be scheduled on different nodes with the following
+steps:
+
+1. OP placement: the OPs will be placed on different nodes according
+   to a heuristic that minimizes the estimated total computation
+   time. Currently we will use a simple heuristic that puts parameter
+   variable on parameter server workers and everything else on trainer
+   workers.
+1. Add communication OPs to enable the communication between nodes.
+
+We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
+
+Below is an example of converting the user defined graph to the
+subgraphs for the trainer and the parameter server:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/local-graph.png" width="300"/>
+
+After converting:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/dist-graph.png" width="700"/>
+
+1. The parameter variable W and its optimizer program are placed on the parameter server.
+1. Operators are added to the program.
+   - *Send* sends data to the connected *Recv* operator.  The
+     scheduler on the receive node will only schedule *Recv* operator
+     to run when the *Send* operator has run (the *Send* OP will mark
+     the *Recv* OP runnable automatically).
+   - *Enqueue* enqueues the input variable, it can block until space
+     becomes available in the queue.
+   - *Dequeue* outputs configurable numbers of tensors from the
+     queue. It will block until the queue has the required number of
+     tensors.
+
+### Sparse Update
+
+For embedding layers, the gradient may have many rows containing only 0 when training.
+If the gradient uses a dense tensor to do parameter optimization,
+it could consume unnecessary memory, slow down the calculations and waste
+the bandwidth while doing distributed training.
+In Fluid, we introduce [SelectedRows](../modules/selected_rows.md) to represent a list of rows containing
+non-zero gradient data. So when we do parameter optimization both locally and remotely,
+we only need to send those non-zero rows to the optimizer operators:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/sparse_update.png" width="700" />
+### Benefits
+
+- Model parallelism becomes easier to implement: it is an extension to
+  the trainer - parameter server approach. We can have several "Transpilers"
+  to achieve different goals.
+- User-defined optimizer is easier to add - user can now express it as
+  a sub-program.
+- No more duplicated logic inside the trainer and the parameter
+  server, as mentioned in the background section.
+
+### Challenges
+
+- It is important to balance the parameter shards on multiple
+  parameter servers. If a single parameter is very big (for example: some
+  word-embedding, fully connected, softmax layer), we need to
+  automatically partition the single parameter onto different
+  parameter servers when possible (only element-wise optimizer depends
+  on the parameter variable).
+- In the "Async SGD" figure, the "W" variable on the parameter server
+  could be read and written concurrently. See
+  [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
+  details about concurrent programming in Fluid.
+
+### Discussion
+
+- Can the Enqueue OP be implemented under our current tensor design
+  (put the input tensor into the queue tensor)?
+- *Dequeue* OP will have variable numbers of output (depending on the
+  `min_count` attribute), does our current design support it? (similar
+  question for the *Add* OP)
+
+### References
+
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
diff --git a/doc/fluid/design/dist_train/src/async_distributed_training.png b/doc/fluid/design/dist_train/src/async_distributed_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..3b53ab59c0cd7b44b2956f16f1adc47fe85909d3
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_distributed_training.png differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.graffle b/doc/fluid/design/dist_train/src/async_pserver.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d2301611774fcb3866473e3e6470568d1e1312cf
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.png b/doc/fluid/design/dist_train/src/async_pserver.png
new file mode 100644
index 0000000000000000000000000000000000000000..7d900b0c0eb291c67537b9cf93227c671bafdc73
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.png differ
diff --git a/doc/fluid/design/dist_train/src/async_update.graffle b/doc/fluid/design/dist_train/src/async_update.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..3a631888688a0d564a873fcb16d943958c91223e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_update.png b/doc/fluid/design/dist_train/src/async_update.png
new file mode 100644
index 0000000000000000000000000000000000000000..3e8db973f45d6d9ac8dcce1dc7878067e79e6dcc
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.png differ
diff --git a/doc/fluid/design/dist_train/src/compiler.graffle b/doc/fluid/design/dist_train/src/compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..8cc678fea3c820103e7ce81f7a5d625d6c1d92de
Binary files /dev/null and b/doc/fluid/design/dist_train/src/compiler.graffle differ
diff --git a/doc/fluid/design/dist_train/src/compiler.png b/doc/fluid/design/dist_train/src/compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..65d34f841afce9756def07dd8ecb9ca44e658bfe
Binary files /dev/null and b/doc/fluid/design/dist_train/src/compiler.png differ
diff --git a/doc/fluid/design/dist_train/src/dist-graph.graffle b/doc/fluid/design/dist_train/src/dist-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..941399c6ced8d5f65b6c595522b770c88259df4b
Binary files /dev/null and b/doc/fluid/design/dist_train/src/dist-graph.graffle differ
diff --git a/doc/fluid/design/dist_train/src/dist-graph.png b/doc/fluid/design/dist_train/src/dist-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3546b09f1c2ee3e4f60f519d5e47f823f08051a7
Binary files /dev/null and b/doc/fluid/design/dist_train/src/dist-graph.png differ
diff --git a/doc/fluid/design/dist_train/src/distributed_architecture.graffle b/doc/fluid/design/dist_train/src/distributed_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d1b60141342232e06227c2d430ebc60ec349a907
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_architecture.graffle differ
diff --git a/doc/fluid/design/dist_train/src/distributed_architecture.png b/doc/fluid/design/dist_train/src/distributed_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_architecture.png differ
diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..65dfdbbacd219739db6ddfdf243cc16c3c4e8d1e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle differ
diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..5353a16fd329f62ff893d32706b9c3c0bcc46a07
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg differ
diff --git a/doc/fluid/design/dist_train/src/distributed_training.graffle b/doc/fluid/design/dist_train/src/distributed_training.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..1168801bc1fadfce310a74cb3110695bd1629f6b
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_training.graffle differ
diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..96ca6d48f43bd9f49c6861dab006e2037873db87
Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle differ
diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..afa25ab3b4e427bc595a855b12ab966478e01ed0
Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png differ
diff --git a/doc/fluid/design/dist_train/src/local-graph.graffle b/doc/fluid/design/dist_train/src/local-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..19e509bd9af3c1e9a3f5e0f16ddd281457a339c5
Binary files /dev/null and b/doc/fluid/design/dist_train/src/local-graph.graffle differ
diff --git a/doc/fluid/design/dist_train/src/local-graph.png b/doc/fluid/design/dist_train/src/local-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..ada51200f793a9bb18911e7d63cfdb3244b967d7
Binary files /dev/null and b/doc/fluid/design/dist_train/src/local-graph.png differ
diff --git a/doc/fluid/design/dist_train/src/local_architecture.graffle b/doc/fluid/design/dist_train/src/local_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..49fcc663ebe3824aa234e3a67aadf285cb417877
Binary files /dev/null and b/doc/fluid/design/dist_train/src/local_architecture.graffle differ
diff --git a/doc/fluid/design/dist_train/src/local_architecture.png b/doc/fluid/design/dist_train/src/local_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122
Binary files /dev/null and b/doc/fluid/design/dist_train/src/local_architecture.png differ
diff --git a/doc/fluid/design/dist_train/src/lookup_table.png b/doc/fluid/design/dist_train/src/lookup_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..72dfe3547f731d0d090338afb206b0549dff472e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/lookup_table.png differ
diff --git a/doc/fluid/design/dist_train/src/lookup_table_training.png b/doc/fluid/design/dist_train/src/lookup_table_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/lookup_table_training.png differ
diff --git a/doc/fluid/design/dist_train/src/mpi_module.png b/doc/fluid/design/dist_train/src/mpi_module.png
new file mode 100644
index 0000000000000000000000000000000000000000..e6b6a3e5d6f68baeeb67d7f71154bd8d85f32b6f
Binary files /dev/null and b/doc/fluid/design/dist_train/src/mpi_module.png differ
diff --git a/doc/fluid/design/dist_train/src/multi-threads.graffle b/doc/fluid/design/dist_train/src/multi-threads.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6
Binary files /dev/null and b/doc/fluid/design/dist_train/src/multi-threads.graffle differ
diff --git a/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png b/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9
Binary files /dev/null and b/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png differ
diff --git a/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png b/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a
Binary files /dev/null and b/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png differ
diff --git a/doc/fluid/design/dist_train/src/ncc2_design.graffle b/doc/fluid/design/dist_train/src/ncc2_design.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..7d2753bbb03bc28c7a0054bb0aa424deb072ffbf
Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.graffle differ
diff --git a/doc/fluid/design/dist_train/src/ncc2_design.png b/doc/fluid/design/dist_train/src/ncc2_design.png
new file mode 100644
index 0000000000000000000000000000000000000000..da0d5ee81f5dfeb4ca1356601b0bb5870456e3d6
Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.png differ
diff --git a/doc/fluid/design/dist_train/src/paddle-compile.graffle b/doc/fluid/design/dist_train/src/paddle-compile.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..a6348cc3dbcaca923c6e794681b2edb85cb9f8f6
Binary files /dev/null and b/doc/fluid/design/dist_train/src/paddle-compile.graffle differ
diff --git a/doc/fluid/design/dist_train/src/paddle-compile.png b/doc/fluid/design/dist_train/src/paddle-compile.png
new file mode 100644
index 0000000000000000000000000000000000000000..e0f13d551ac41afaec627a57dea79356464bf0bf
Binary files /dev/null and b/doc/fluid/design/dist_train/src/paddle-compile.png differ
diff --git a/doc/fluid/design/dist_train/src/remote_executor.graffle b/doc/fluid/design/dist_train/src/remote_executor.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..41b2067311694b56d211a4f32d1b76884eeffd2d
Binary files /dev/null and b/doc/fluid/design/dist_train/src/remote_executor.graffle differ
diff --git a/doc/fluid/design/dist_train/src/remote_executor.png b/doc/fluid/design/dist_train/src/remote_executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..744e2fb2e0f1bbe058e991ba7b2a09000965ee79
Binary files /dev/null and b/doc/fluid/design/dist_train/src/remote_executor.png differ
diff --git a/doc/fluid/design/dist_train/src/sparse_update.graffle b/doc/fluid/design/dist_train/src/sparse_update.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..08d689a58f83698d8c1158ee3990ed8abf3a7a9a
Binary files /dev/null and b/doc/fluid/design/dist_train/src/sparse_update.graffle differ
diff --git a/doc/fluid/design/dist_train/src/sparse_update.png b/doc/fluid/design/dist_train/src/sparse_update.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c872e6ac479f7d1b818a4a207956c43155d0ad7
Binary files /dev/null and b/doc/fluid/design/dist_train/src/sparse_update.png differ
diff --git a/doc/fluid/design/dist_train/src/sync_distributed_training.png b/doc/fluid/design/dist_train/src/sync_distributed_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..e4f9a221fea4b7238e8a1d84e609c0371f6ef7a2
Binary files /dev/null and b/doc/fluid/design/dist_train/src/sync_distributed_training.png differ
diff --git a/doc/fluid/design/dynamic_rnn/2_level_rnn.dot b/doc/fluid/design/dynamic_rnn/2_level_rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..5d77865061ca7bbbfcf254dd938f09aef5553505
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/2_level_rnn.dot
@@ -0,0 +1,56 @@
+digraph G {
+
+  rnn [label="1st level RNN" shape=box]
+
+  subgraph cluster0 {
+    label = "time step 0"
+
+    sent0 [label="sentence"]
+    sent1 [label="sentence"]
+
+    rnn1 [label="2nd level RNN" shape=box]
+
+    sent0 -> rnn1
+    sent1 -> rnn1
+  }
+
+  subgraph cluster1 {
+    label = "time step 1"
+
+    sent2 [label="sentence"]
+    sent3 [label="sentence"]
+
+    rnn2 [label="2nd level RNN" shape=box]
+
+    sent2 -> rnn2
+    sent3 -> rnn2
+  }
+
+  subgraph cluster2 {
+    label = "time step 2"
+
+    sent4 [label="sentence"]
+    sent5 [label="sentence"]
+
+    rnn3 [label="2nd level RNN" shape=box]
+
+    sent4 -> rnn3
+    sent5 -> rnn3
+  }
+
+
+  para0 [label="paragraph info 0"]
+  para1 [label="paragraph info 1"]
+  para2 [label="paragraph info 2"]
+
+  rnn1 -> para0
+  rnn2 -> para1
+  rnn3 -> para2
+
+  para0 -> rnn
+  para1 -> rnn
+  para2 -> rnn
+
+  chapter [label="chapter info"]
+  rnn -> chapter
+}
diff --git a/doc/fluid/design/dynamic_rnn/2_level_rnn.png b/doc/fluid/design/dynamic_rnn/2_level_rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..0537a75beb175c0c284717421f7aa908da2a5038
Binary files /dev/null and b/doc/fluid/design/dynamic_rnn/2_level_rnn.png differ
diff --git a/doc/fluid/design/dynamic_rnn/index_cn.rst b/doc/fluid/design/dynamic_rnn/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1d224d22cf7103616f44115db01f0ae55f1cb88a
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/index_cn.rst
@@ -0,0 +1,8 @@
+动态RNN
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  rnn.md
+  rnn_design.md
diff --git a/doc/fluid/design/dynamic_rnn/index_en.rst b/doc/fluid/design/dynamic_rnn/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..568f496e4ffe21a5e730488aef905f7e2d98839e
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/index_en.rst
@@ -0,0 +1,8 @@
+Dynamic RNN
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  rnn.md
+  rnn_design.md
diff --git a/doc/fluid/design/dynamic_rnn/rnn.dot b/doc/fluid/design/dynamic_rnn/rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn.dot
@@ -0,0 +1,87 @@
+digraph G {
+  label = "simple RNN implementation" 
+
+  ranksep=2;
+
+  //graph [nodesep=1, ranksep=1];
+
+  node[nodesep=1]
+
+  subgraph cluster0 {
+    label = "global scope"
+    rankdir = TB
+    W
+    boot_memory
+    input
+    output
+  }
+
+  subgraph cluster1 {
+    label = "step-scope 0"
+    rankdir = TB
+    memory0[label="memory"]
+    prememory0[label="pre-memory"]
+    step_input0[label="step input"]
+    step_output0[label="step output"]
+  }
+
+  subgraph cluster2 {
+    label = "step-scope 1"
+    rankdir = TB
+    memory1[label="memory"]
+    prememory1[label="pre-memory"]
+    step_input1[label="step input"]
+    step_output1[label="step output"]
+  }
+
+  subgraph cluster3 {
+    label = "step-scope 2"
+    rankdir = TB
+    memory2[label="memory"]
+    prememory2[label="pre-memory"]
+    step_input2[label="step input"]
+    step_output2[label="step output"]
+  }
+
+  stepnet [shape=box]
+  stepnet0 [shape=box, style=dashed]
+  stepnet1 [shape=box, style=dashed]
+  stepnet2 [shape=box, style=dashed]
+
+
+  edge[color=blue]
+  boot_memory -> prememory0 [label="init" color="blue"]
+  memory0 -> prememory1  [label="copy/reference" color="blue"]
+  memory1 -> prememory2 [label="copy/reference" color="blue"]
+
+  edge[color=black]
+  W -> stepnet0[constraint=false, style=dashed]
+  W -> stepnet1[constraint=false, style=dashed]
+  W -> stepnet2[constraint=false, style=dashed]
+
+  memory0 -> stepnet0[style=dashed]
+  prememory0 -> stepnet0 -> step_output0[style=dashed]
+
+  memory1 -> stepnet1[style=dashed]
+  prememory1 -> stepnet1 -> step_output1[style=dashed]
+
+  memory2 -> stepnet2[style=dashed]
+  prememory2 -> stepnet2 -> step_output2[style=dashed]
+
+  input -> step_input0
+  input -> step_input1
+  input -> step_input2
+
+  step_input0 -> stepnet0 [style=dashed]
+  step_input1 -> stepnet1[style=dashed]
+  step_input2 -> stepnet2[style=dashed]
+
+  step_output0 -> output
+  step_output1 -> output
+  step_output2 -> output
+
+  stepnet0 -> stepnet[style=dashed]
+  stepnet1 -> stepnet[style=dashed]
+  stepnet2 -> stepnet[style=dashed]
+
+}
diff --git a/doc/fluid/design/dynamic_rnn/rnn.jpg b/doc/fluid/design/dynamic_rnn/rnn.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9867e404cf959df0dce6ded5222b466c788fb840
Binary files /dev/null and b/doc/fluid/design/dynamic_rnn/rnn.jpg differ
diff --git a/doc/fluid/design/dynamic_rnn/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md
new file mode 100644
index 0000000000000000000000000000000000000000..b39ae0675c45e56852293d97f45e91861cf31667
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@@ -0,0 +1,153 @@
+# RNNOp design
+
+This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future.
+
+## RNN Algorithm Implementation
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.jpg"/>
+</p>
+
+The above diagram shows an RNN unrolled into a full network.
+
+There are several important concepts here:
+
+- *step-net*: the sub-graph that runs at each step.
+- *memory*, $h_t$, the state of the current step.
+- *ex-memory*, $h_{t-1}$, the state of the previous step.
+- *initial memory value*, the memory of the first (initial) step.
+
+### Step-scope
+
+There could be local variables defined in each step-net.  PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.png"/><br/>
+Figure 2 illustrates the RNN's data flow
+</p>
+
+Please be aware that every step runs the same step-net.  Each step does the following:
+
+1. Creates the step-scope.
+2. Initializes the local variables including step-outputs, in the step-scope.
+3. Runs the step-net, which uses the above mentioned variables.
+
+The RNN operator will compose its output from step outputs in each of the step scopes.
+
+### Memory and Ex-memory
+
+Let's give more details about memory and ex-memory using a simple example:
+
+$$
+h_t = U h_{t-1} + W x_t
+$$,
+
+where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively.
+
+In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step,
+or copy the memory value of the previous step to the current ex-memory variable.
+
+### Usage in Python
+
+For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md).
+
+We can define an RNN's step-net using a Block:
+
+```python
+import paddle as pd
+
+X = some_op() # X is some operator's output and is a LoDTensor
+a = some_op()
+
+# declare parameters
+W = pd.Variable(shape=[20, 30])
+U = pd.Variable(shape=[20, 30])
+
+rnn = pd.create_rnn_op(output_num=1)
+with rnn.stepnet():
+    x = rnn.add_input(X)
+    # declare a memory (rnn's step)
+    h = rnn.add_memory(init=a)
+    # h.pre_state(), the previous memory of rnn
+    new_state = pd.add_two(pd.matmul(W, x), pd.matmul(U, h.pre_state()))
+    # update current memory
+    h.update(new_state)
+    # indicate that h variables in all step scopes should be merged
+    rnn.add_outputs(h)
+
+out = rnn()
+```
+
+Python API functions in above example:
+
+- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs.
+- `rnn.add_memory`: creates a variable used as the memory.
+- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output.
+
+### Nested RNN and LoDTensor
+
+An RNN whose step-net includes other RNN operators is known as a *nested RNN*.
+
+For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level.
+
+The following figure illustrates feeding text into the lower level, one sentence per step, and feeding the step outputs into the top level. The final top level output is about the whole text.
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.png"/>
+</p>
+
+```python
+import paddle as pd
+
+W = pd.Variable(shape=[20, 30])
+U = pd.Variable(shape=[20, 30])
+
+W0 = pd.Variable(shape=[20, 30])
+U0 = pd.Variable(shape=[20, 30])
+
+# a is output of some op
+a = some_op()
+
+# chapter_data is a set of 128-dim word vectors
+# the first level of LoD is sentence
+# the second level of LoD is a chapter
+chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)
+
+def lower_level_rnn(paragraph):
+    '''
+    paragraph: the input
+    '''
+    rnn = pd.create_rnn_op(output_num=1)
+    with rnn.stepnet():
+        sentence = rnn.add_input(paragraph, level=0)
+        h = rnn.add_memory(shape=[20, 30])
+        h.update(
+            pd.matmul(W, sentence) + pd.matmul(U, h.pre_state()))
+        # get the last state as sentence's info
+        rnn.add_outputs(h)
+    return rnn
+
+top_level_rnn = pd.create_rnn_op(output_num=1)
+with top_level_rnn.stepnet():
+    paragraph_data = top_level_rnn.add_input(chapter_data, level=1)
+    low_rnn = lower_level_rnn(paragraph_data)
+    paragraph_out = low_rnn()
+
+    h = top_level_rnn.add_memory(init=a)
+    h.update(
+        pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state()))
+    top_level_rnn.add_outputs(h)
+
+# output the last step
+chapter_out = top_level_rnn(output_all_steps=False)
+```
+
+In the above example, the construction of the `top_level_rnn` calls  `lower_level_rnn`.  The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
+
+By default, the `RNNOp` will concatenate the outputs from all the time steps.
+If the `output_all_steps` is set to False, it will only output the final time step.
+
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn_2level_data.png"/>
+</p>
diff --git a/doc/fluid/design/dynamic_rnn/rnn.png b/doc/fluid/design/dynamic_rnn/rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..e139e373fe8396782044cfd936fdde624f8c66fe
Binary files /dev/null and b/doc/fluid/design/dynamic_rnn/rnn.png differ
diff --git a/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot b/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot
new file mode 100644
index 0000000000000000000000000000000000000000..1d85ae2617a915ad0ad8288d848b607cc37ad297
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot
@@ -0,0 +1,75 @@
+digraph G {
+  chapter [label="chapter"]
+
+  subgraph cluster0 {
+    label = "paragraph 0"
+
+    top_rnn0[label="top rnn step 0" shape=box]
+
+    p0 [label="paragraph 0"]
+    p1 [label="paragraph 1"]
+  }
+
+  subgraph cluster1{
+    label = "paragraph 1"
+
+    top_rnn1[label="top rnn step 1" shape=box]
+
+    p2 [label="paragraph 0"]
+    p3 [label="paragraph 1"]
+  }
+
+  subgraph cluster_p0 {
+    label = "sentence 0"
+
+    low_rnn0 [label="low rnn step 0" shape=box]
+    s00 [label="sentence 0"]
+    s01 [label="sentence 1"]
+
+    low_rnn0 -> s00
+    low_rnn0 -> s01
+  }
+
+  subgraph cluster_p1 {
+    label = "sentence 1"
+    low_rnn1 [label="low rnn step 1" shape=box]
+    s10 [label="sentence 0"]
+    s11 [label="sentence 1"]
+    low_rnn1 -> s10
+    low_rnn1 -> s11
+  }
+
+  subgraph cluster_p2 {
+    label = "sentence 1"
+    low_rnn2 [label="low rnn step 0" shape=box]
+    s20 [label="sentence 0"]
+    s21 [label="sentence 1"]
+    low_rnn2 -> s20
+    low_rnn2 -> s21
+  }
+
+  subgraph cluster_p3 {
+    label = "sentence 1"
+    low_rnn3 [label="low rnn step 1" shape=box]
+    s30 [label="sentence 0"]
+    s31 [label="sentence 1"]
+    low_rnn3 -> s30
+    low_rnn3 -> s31
+  }
+
+
+  chapter -> top_rnn0
+  chapter -> top_rnn1
+
+  top_rnn0 -> p0
+  top_rnn0 -> p1
+  top_rnn1 -> p2
+  top_rnn1 -> p3
+
+
+  p0 -> low_rnn0
+  p1 -> low_rnn1
+  p2 -> low_rnn2
+  p3 -> low_rnn3
+
+}
diff --git a/doc/fluid/design/dynamic_rnn/rnn_2level_data.png b/doc/fluid/design/dynamic_rnn/rnn_2level_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..4be81b2430717a6a506342a09fc26899568574c6
Binary files /dev/null and b/doc/fluid/design/dynamic_rnn/rnn_2level_data.png differ
diff --git a/doc/fluid/design/dynamic_rnn/rnn_design.md b/doc/fluid/design/dynamic_rnn/rnn_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..cecfcd3307ae4c4fa603220a360e9e124069fa58
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn_design.md
@@ -0,0 +1,242 @@
+# RNN 变长输入设计
+对变长序列的学习，现有主流框架比如 tensorflow, pytorch, caffe2, mxnet 等均使用了padding的方式，
+即将一个mini-batch内不同长度的序列补0到固定长度参与计算。
+
+现有Paddle包括 `RecurrentLayerGroup` 在内的RNN均实现了无padding的变长序列支持，本文也将基于该模块的思路，设计重构后的变长序列支持。
+
+## 背景介绍
+由于tensor必须有明确的shape，因此基于tensor 的主流框架在存储变长序列时，
+必须用zero-padding的方式将变长序列补全为固定shape的tensor。
+
+由于padding是一种框架实现变长序列的妥协， 从用户角度，在使用RNN类模型时自然会比较介意padding的存在，
+因此会有pytorch中对非padding方式变长序列支持长篇的讨论[3]。
+
+由于padding对内存和计算会有额外的消耗，tensorflow和mxnet均使用了bucketing来进行优化[1][2]，
+但不管是padding还是bucket，对于用户都是额外的使用负担。
+
+因此，**paddle原生支持变长序列的方式，能直接满足用户对变长序列的最直接的需求，在当前主流平台中可以算是一大优势**。
+
+但对变长序列的支持，需要对目前框架做一些修改，下面讨论如何在最小修改下支持变长序列。
+
+## 多层序列数据格式 `LODTensor`
+目前 Paddle 会将一个mini-batch内的数据存储在一维的内存上，
+额外使用 `Argument.sequenceStartPositions` 来存储每个句子的信息。
+
+Paddle里使用 `Argument.subSequenceStartPositions` 来存储2层的序列信息，更高维度的序列则无法直接支持；
+
+为了支持 `N-level` 序列的存储，本文将序列信息定义成如下数据结构:
+
+```c++
+std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+```
+
+或者更明确的定义
+
+```c++
+typedef std::vector<int> level_t;
+std::vector<level_t> lod_start_pos;
+```
+
+这里的每一个 `level_t` 存储一个粒度(level)的偏移信息，和paddle目前做法一致。
+
+为了更透明地传递序列信息，我们引入了一种新的tensor 称为 `LODTensor`[4]，
+其关于tensor相关的接口都直接继承自 `Tensor`，但另外添加了序列相关接口。
+如此，在操作一个 `LODTensor` 时，普通 `Op` 直接当成 `Tensor` 使用，
+而操作序列的 `Op` 会额外操作 `LODTensor` 的变长序列操作的相关接口。
+
+`LODTensor` 具体定义如下：
+
+```c++
+class LODTensor : public Tensor {
+public:
+  size_t Levels() const { return lod_start_pos_->size(); }
+  size_t Elements(int level = 0) const {
+    return (*lod_start_pos_)[level].size();
+  }
+  // slice of level[elem_begin: elem_end]
+  // NOTE low performance in slice lod_start_pos_.
+  // TODO should call Tensor's Slice.
+  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
+
+  // slice with tensor's data shared with this.
+  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
+
+  // copy other's lod_start_pos_, to share LOD info.
+  // NOTE the LOD info should not be changed.
+  void ShareConstLODFrom(const LODTensor &other) {
+    lod_start_pos_ = other.lod_start_pos_;
+  }
+  // copy other's lod_start_pos_'s content, free to mutate.
+  void ShareMutableLODFrom(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared<std::vector<std::vector<int>>>(
+        other.lod_start_pos_->begin(), other.lod_start_pos_->end());
+  }
+
+private:
+  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+};
+```
+
+其中， `lod_start_pos_` 使用了 `shared_ptr` 来减少存储和复制的代价，
+可以认为 `LODTensor` 是 `Tensor` 的扩展，几乎完全兼容原始 `Tensor` 的使用。
+
+## 框架支持
+### 框架现有的 `Tensor` 调用替换为 `LODTensor`
+为了实现 `LODTensor` 的传递，框架里很多 `Tensor` 都需要变成 `LODTensor`，
+简单实现，直接 **把之前所有的`Tensor` 全部替换成 `LODTensor`，这里可以直接修改 `pybind.cc` 里面创建`Tensor`的接口**。
+
+此外，用户有可能需要感知序列的存在（比如序列的可视化需要解析模型中输出的序列），因此一些序列操作的API也需要暴露到 python 层。
+
+### `lod_start_pos` 随着Op调用链传递
+框架需要支持下列特性，以实现`lod_start_pos`的传递：
+
+1. 以 `shared_ptr` 的方式实现传递
+    - 不修改 `lod_start_pos` 内容的作为 consumer
+    - 修改 `lod_start_pos` 的作为 producer
+    - 约定 consumer 只需要复制传递过来的 `shared_ptr`
+      - producer 需要创建自己的独立的内存，以存储自己独立的修改，并暴露 `shared_ptr` 给后续 consumer
+    - 由于传递过程是以复制`shared_ptr`的方式实现，因此框架只需要传递一次 `lod_start_pos`
+
+2. 对于不感知 `lod_start_pos` 的Op足够透明
+3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据
+
+具体的设计分为以下3小节
+
+#### `lod_start_pos` 的传递
+
+- 对于不需要修改 `lod_start_pos` 的情况，调用 LODTensor的 `ShareConstLODFrom` 接口实现复制
+- 需要修改的，调用`ShareMutableLODFrom` 接口自己分配内存以存储修改
+
+#### 框架透明
+传递这一步需要加入到网络跑之前的初始化操作中，并且只需要初始化一次，基于当前框架设计的初步方案如下
+
+- 在 Op 的 `attrs` 中添加一项 `do_mutate_lod_info` 的属性，默认为 `false`
+  - 有需要修改 `lod_start_pos` 的Op需要在定义 `OpProto` 时设置为 `true`
+- `OperatorBase` 的 `InferShape` 中会读取 `do_mutate_lod_info` ，并且调用 `LODTensor` 相关的方法实现 `lod_start_pos` 的复制。
+- `OperatorBase` 中添加一个 member `is_lod_inited{false}` 来保证传递只进行一次
+
+一些逻辑如下
+
+```c++
+class OperatorBase {
+public:
+  // ...
+  void InferShape() {
+    if (!is_lod_inited) {
+      bool do_mutate_lod_info = GetAttr<bool>("do_mutate_lod_info");
+      // find a input having LOD to copy
+      auto lod_input = ValidLODInput();
+      for (auto &output : outputs) {
+        if (do_mutate_lod_info) {
+          output.ShareMutableLODFrom(lod_input);
+        } else {
+          output.ShareConstLODFrom(lod_input);
+        }
+      }
+      is_lod_inited = true;
+    }
+
+    // call op's InferShape
+    // ...
+  }
+
+private:
+  // ...
+  bool is_lod_inited{false};
+};
+```
+
+如此，`lod_start_pos` 的信息的传递对非LOD的Op的实现是完全透明的。
+
+#### `lod_start_pos` 的更新
+上一小节介绍到，对于需要修改 `lod_start_pos` 的Op，`OperatorBase` 会分配一块自己的内存以存储修改，
+Op在 `Run` 的实现中，操作更新自己的 `lod_start_pos` ，
+而所有依赖其 outputs 的 op 会通过共享的指针自动获取到其更新。
+
+## 根据长度排序
+按照长度排序后，从前往后的时间步的batch size会自然地递减，可以直接塞入 Net 做batch计算
+
+比如原始的输入：
+
+```
+origin:
+xxxx
+xx
+xxx
+
+-> sorted:
+xxxx
+xxx
+xx
+```
+
+经过 `SegmentInputs` 之后，每个会有4个时间步，每个时间步的输入如下（纵向排列）
+
+```
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+
+为了追踪排序前后序列的变化，这里用
+```c++
+struct SortedSeqItem {
+   void *start{nullptr};
+   void *end{nullptr};
+};
+
+std::vector<SortedSeqItem> sorted_seqs;
+```
+来追踪序列排序后的位置，并添加一个新的接口
+
+```c++
+std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
+```
+
+由于输入序列的顺序变化，以下现有的接口需要针对性地修改：
+
+- InitMemories, memory需要根据 `sorted_seqs` 重新排列
+- SegmentInputs
+- ConcatOutputs
+
+此外，由于 `sorted_seqs` 需要被 `RecurrentGradientOp` 复用，因此会变成 `RecurrentOp` 一个新的output输出，
+之后作为 `RecurrentGradientOp` 的一个输入传入。
+
+## InitMemories
+由于序列顺序的变化，`boot_memories` 的batch上的element的顺序也需要对应重新排列。
+
+## SegmentInputs
+`SegmentInputs` 会依赖 `sorted_seqs` 的信息，将原始的序列按照排序后的序列顺序，从横向切割，转为每个step中的inputs。
+
+即下面的转变：
+```
+origin:
+xxxx
+xx
+xxx
+
+   |
+   |
+  \ /
+   !
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+## ConcatOutputs
+`ConcatOutputs` 需要
+
+- 将每个时间步的输出重新还原为原始输入的序列顺序（以防止Infer阶段顺序打乱）
+- 将每个序列concat 为规则的mini-batch表示
+
+## 参考文献
+1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
+
+2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
+
+3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
+
+4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/doc/fluid/design/dynamic_rnn/rnn_design_en.md b/doc/fluid/design/dynamic_rnn/rnn_design_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..9493908f4f73b3e7d91f5f6364a2a3660257d508
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn_design_en.md
@@ -0,0 +1,175 @@
+# Variable-Length Supported RNN Design
+For the learning of variable length sequences, the existing mainstream frameworks such as tensorflow, pytorch, caffe2, mxnet and so on all use padding.
+
+Different-length sequences in a mini-batch will be padded with zeros and transformed to same length.
+
+The existing RNN implementation in PaddlePaddle, `RecurrentLayerGroup`,
+supports variable-length sequences without padding.
+This doc designs Fluid's RNN based on this idea.
+
+## Multi-layer sequence data format `LODTensor`
+At present, Paddle stores the data of one mini-batch in a one-dimensional array.
+
+`Argument.sequenceStartPositions` is used to store information for each sentence.
+
+In Paddle, `Argument.subSequenceStartPositions` is used to store 2 levels of sequence information, while higher dimensional sequences can not be supported.
+
+In order to support the storage of `N-level` sequences, we define sequence information as the following data structure.
+
+
+```c++
+std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+```
+
+Or more clearly defined here
+
+```c++
+typedef std::vector<int> level_t;
+std::vector<level_t> lod_start_pos;
+```
+Each `level_t` here stores a level of offset information consistent with paddle's current practice.
+
+In order to transmit sequence information more transparently, we have introduced a new tensor called `LODTensor`[1].
+Its tensor-related interfaces all inherit directly from `Tensor`, but it also adds sequence-related interfaces.
+Thus, when working with a `LODTensor`, an ordinary `Op` uses it directly as a `Tensor`,
+while an `Op` that operates on sequences additionally uses the variable-length sequence interfaces of `LODTensor`.
+
+The definition of `LODTensor` is as follows:
+
+
+```c++
+class LODTensor : public Tensor {
+public:
+  size_t Levels() const { return lod_start_pos_->size(); }
+  size_t Elements(int level = 0) const {
+    return (*lod_start_pos_)[level].size();
+  }
+  // slice of level[elem_begin: elem_end]
+  // NOTE low performance in slice lod_start_pos_.
+  // TODO should call Tensor's Slice.
+  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
+
+  // slice with tensor's data shared with this.
+  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
+
+  // copy other's lod_start_pos_, to share LOD info.
+  // NOTE the LOD info should not be changed.
+  void ShareConstLODFrom(const LODTensor &other) {
+    lod_start_pos_ = other.lod_start_pos_;
+  }
+  // copy other's lod_start_pos_'s content, free to mutate.
+  void ShareMutableLODFrom(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared<std::vector<std::vector<int>>>(
+        other.lod_start_pos_->begin(), other.lod_start_pos_->end());
+  }
+
+private:
+  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+};
+```
+Among them, `lod_start_pos_` uses `shared_ptr` to reduce the cost of storage and replication.
+`LODTensor` can be thought as an extension of `Tensor`, which is almost completely compatible with the original `Tensor`.
+
+## How to support the framework
+### Replace `Tensor` with `LoDTensor`
+To implement the passing of `LODTensor`, most `Tensor` in the framework need to be replaced with `LODTensor`.
+A simple implementation is to directly **replace all previous `Tensor` with `LODTensor`**, which can be done by modifying the interface in `pybind.cc` that creates `Tensor`.
+
+In addition, the user may need to be aware of the existence of sequences (for example, visualizing sequences requires parsing the sequences output by the model), so some of the sequence operation APIs also need to be exposed to the python layer.
+
+### Transmit `lod_start_pos` along with the Op call chain
+`lod_start_pos` is passed along with the Op call chain
+The framework needs to support the following features to implement the transmit of `lod_start_pos`:
+
+1. Implement the transfer as `shared_ptr`
+    - An Op that does not modify the contents of `lod_start_pos` acts as a consumer
+    - An Op that modifies `lod_start_pos` acts as a producer
+    - By convention, a consumer only needs to copy the `shared_ptr` passed to it
+    - A producer needs to create its own independent memory to store its own independent modifications and expose a `shared_ptr` to subsequent consumers
+    - Since the transfer process is implemented by copying `shared_ptr`, the framework only needs to pass `lod_start_pos` once.
+
+2. Be transparent enough to Ops that are not aware of `lod_start_pos`
+3. A producer Op that needs to modify `lod_start_pos` can update its own `lod_start_pos` data in `Run`
+
+## sorted by length
+After sorting by length, the batch size of successive time steps naturally decreases, so the steps can be fed directly into Net for batch computation.
+
+For example, the original input:
+
+```
+origin:
+xxxx
+xx
+xxx
+
+-> sorted:
+xxxx
+xxx
+xx
+```
+
+After `SegmentInputs`, there will be 4 time steps, the input of each time step is as follows (vertical arrangement)
+
+```
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+
+In order to track how the sequences change before and after sorting, we use
+
+```c++
+struct SortedSeqItem {
+   void *start{nullptr};
+   void *end{nullptr};
+};
+
+std::vector<SortedSeqItem> sorted_seqs;
+```
+To track the position of the sequence after sorting, and add a new interface
+
+```c++
+std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
+```
+Because the order of the input sequences changes, the following existing interfaces need to be modified accordingly:
+
+- InitMemories, memory needs to be rearranged according to `sorted_seqs`
+- SegmentInputs
+- ConcatOutputs
+
+In addition, because `sorted_seqs` needs to be reused by `RecurrentGradientOp`, it will become a new output of `RecurrentOp`.
+It is passed in as an input to `RecurrentGradientOp`.
+
+## InitMemories
+Due to the sequence change, the order of the elements on the `boot_memories` batch also needs to be rearranged accordingly.
+
+## SegmentInputs
+
+`SegmentInputs` relies on the information in `sorted_seqs` to cut the original sequences horizontally, in sorted order, into the inputs of each step.
+
+the transition is as follows:
+```
+origin:
+xxxx
+xx
+xxx
+
+   |
+   |
+  \ /
+   !
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+## ConcatOutputs
+`ConcatOutputs` needs
+
+- Restore the output of each time step back to the original input sequence order (so that the order is not scrambled in the inference phase)
+- Concat each sequence as a regular mini-batch representation
+
+## References
+1. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/doc/fluid/design/execution/if_else_op.md b/doc/fluid/design/execution/if_else_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..26d140f06db4ecefa86be015eaa731ffddc6910c
--- /dev/null
+++ b/doc/fluid/design/execution/if_else_op.md
@@ -0,0 +1,51 @@
+# The `IfElse` Operator
+
+PaddlePaddle's `IfElse` operator differs from TensorFlow's:
+
+- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas
+- the PaddlePaddle version takes a vector of boolean values as the condition, and instances corresponding to true values go to the true branch, while those corresponding to false values go to the false branch.
+
+## Example
+
+The following PaddlePaddle program shows the usage of the IfElse operator:
+
+```python
+import paddle as pd
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():
+    d = pd.layer.add(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch.
+
+An equivalent C++ program is as follows:
+
+```c++
+namespace pd = paddle;
+
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+  int d = x + y;
+  o1 = d;
+  o2 = pd::layer::softmax(d);
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
+}
+```
diff --git a/doc/fluid/design/execution/index_cn.rst b/doc/fluid/design/execution/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed31b017429d168b2466d8f6b423f48bd5d78d1f
--- /dev/null
+++ b/doc/fluid/design/execution/index_cn.rst
@@ -0,0 +1,8 @@
+执行流程
+-------------
+
+.. toctree::
+  :maxdepth: 1
+
+  switch.md
+  if_else_op.md
diff --git a/doc/fluid/design/execution/index_en.rst b/doc/fluid/design/execution/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fcf846da348ff0bed707c42718e08314998fbac0
--- /dev/null
+++ b/doc/fluid/design/execution/index_en.rst
@@ -0,0 +1,8 @@
+Execution Process
+--------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  switch.md
+  if_else_op.md
diff --git a/doc/fluid/design/execution/switch.md b/doc/fluid/design/execution/switch.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c337bd7159b25e594c2f91f9a143b3f4bc3c8e8
--- /dev/null
+++ b/doc/fluid/design/execution/switch.md
@@ -0,0 +1,31 @@
+# Design Doc: Switch
+
+## Background
+
+Many programming languages provide `switch` as a generalization of `if-elif-else`.  We want to add it to Fluid.
+
+The following example shows the usage of `fluid.switch`.
+
+```python
+a = fluid.Var(10)
+b = fluid.Var(0)
+
+with switch() as switch:
+    with switch.case(fluid.less_equal(a, 10)):
+        fluid.print("Case 1")
+    with switch.case(fluid.larger(a, 0)):
+        fluid.print("Case 2")
+    with switch.default():
+        fluid.print("Case 3")
+```
+
+## The Semantics
+
+1. A `switch` control-flow checks cases one-by-one.
+1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, whose condition could be a vector of boolean values.
+1. It runs the first matched case, or the default case if there is one.
+1. Once it matches a case, it runs the corresponding branch and only that branch.  It's like there is a C's `break` keyword at the end of each case.
+
+The above program should print and print only "Case 1".
+
+The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branch.
diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..31b62a5eb3cd9b5b68d51abcd001fd5b8c39a914
--- /dev/null
+++ b/doc/fluid/design/index_cn.rst
@@ -0,0 +1,19 @@
+设计思想
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  motivation/index_cn.rst
+  execution/index_cn.rst
+  concepts/index_cn.rst
+  data_type/index_cn.rst
+  memory/index_cn.rst
+  multi_devices/index_cn.rst
+  dynamic_rnn/index_cn.rst
+  concurrent/index_cn.rst
+  algorithm/index_cn.rst
+  network/index_cn.rst
+  modules/index_cn.rst
+  interface/index_cn.rst
+  dist_train/index_cn.rst
diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2bfee02ad4626633b08ddff747e2886faf9ba99f
--- /dev/null
+++ b/doc/fluid/design/index_en.rst
@@ -0,0 +1,19 @@
+Design
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  motivation/index_en.rst
+  execution/index_en.rst
+  concepts/index_en.rst
+  data_type/index_en.rst
+  memory/index_en.rst
+  multi_devices/index_en.rst
+  dynamic_rnn/index_en.rst
+  concurrent/index_en.rst
+  algorithm/index_en.rst
+  network/index_en.rst
+  modules/index_en.rst
+  interface/index_en.rst
+  dist_train/index_en.rst
diff --git a/doc/fluid/design/interface/index_cn.rst b/doc/fluid/design/interface/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..69a8d9bad4fe88935b9fa87757abf0105ca8eb75
--- /dev/null
+++ b/doc/fluid/design/interface/index_cn.rst
@@ -0,0 +1,4 @@
+多语言接口
+------------
+
+TBD
diff --git a/doc/fluid/design/interface/index_en.rst b/doc/fluid/design/interface/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..22abc71f984aa5da7151d5ebf0c3bdbcc69a3624
--- /dev/null
+++ b/doc/fluid/design/interface/index_en.rst
@@ -0,0 +1,4 @@
+Multi-Language Interface
+------------------------
+
+TBD
diff --git a/doc/fluid/design/ir/overview.md b/doc/fluid/design/ir/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..83ef97c99efeaf27a27f93f0cd3857c0f1bc812e
--- /dev/null
+++ b/doc/fluid/design/ir/overview.md
@@ -0,0 +1,185 @@
+## Motivation
+
+There is a `gap` between the `Program` defined by
+user and the `Executable` that can be scheduled
+efficiently on heterogeneous hardware, either locally
+or distributedly.
+
+Usually, the `gap` is bridged by
+
+* A series of transformations with a defined order.
+
+* These transformations usually involve
+`insert, delete, clustering, split, dependency analysis`.
+
+* Has a simple way to verify and debug each transformation.
+
+* Flexible to add, remove or customize transformations to fit
+the requirements of various algorithms (models) and hardware scenarios.
+
+Some other events also push us to a better unified pattern.
+
+* The deep learning framework is built around the concepts of graphs.
+To leverage tools such as compilation (e.g. TVM and nGraph) or
+cross-framework conversion (e.g. ONNX), we also need an intermediate
+representation that can be connected to the rest of the ecosystem.
+
+
+We need a unified pattern to naturally support the requirements
+described above. The pattern should fit both training, inference
+and other offline serialized model transformations.
+Learned from LLVM and other deep learning framework, we draft the
+design below.
+
+
+## Design
+
+### Major Concepts
+
+#### Node
+
+`Node` represents an operation that performs some computation or
+a variable that is input or output of operation.
+
+`Node`s are connected to other `Node`s via inputs and outputs.
+
+Other properties (maybe device placement information) can be added
+to `Node` in the future if it's a
+common requirement of many other `Pass`es. Otherwise, it should live
+in a `Node` wrapper class that is private to some `Pass` or be
+a local member of a `Pass`.
+
+#### Graph
+
+`Graph` contains a list of `Node`s, which are connected to
+each other via inputs and outputs.
+
+TODO: Better definitions for the graph.
+
+`Graph` can also contain `Attribute`s. `Attribute`s
+can be `any` thing. For example, it can be a list of "wrapper"
+nodes. The `wrapper` nodes compose `Node`s and provide
+helper method for execution or transformation. `Attribute`
+can also contain other things that describe some properties of
+the `Graph` or `Graph` nodes. `Attribute` can be passed
+across `Pass`. However, it should be used with care.
+
+```cpp
+class Graph {
+ public:
+  explicit Graph(const ProgramDesc &program);
+
+  bool Has(const std::string &attr_name) const;
+
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const;
+
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr);
+  const std::unordered_set<ir::Node *> &Nodes() const;
+
+  // Create a normal variable with non-null VarDesc.
+  ir::Node *CreateVarNode(VarDesc *var_desc);
+
+  // Create a normal runnable operator with OpDesc.
+  ir::Node *CreateOpNode(OpDesc *op_desc);
+
+  // Create a control dependency var that connects 2 operations. The
+  // var doesn't hold any data. Other than that, it's no different from
+  // other var, considering dependency analysis.
+  ir::Node *CreateControlDepVar();
+
+  // A more free style way of creating a graph node. Mostly use for test
+  // or "copy" from another node. Avoid using it if possible.
+  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type);
+
+  // Clear all node information of the graph and return the ownership of the
+  // nodes.
+  std::vector<std::unique_ptr<ir::Node>> ReleaseNodes();
+};
+```
+
+#### Pass
+
+`Pass` represents a transformation of `Graph`. Its input
+is a `Graph` and its output is also a `Graph`. For example,
+a `Pass` can simply print out the `Graph`. A `Pass`
+can also fuse some `Graph`'s `Node`s.
+
+```cpp
+class Pass {
+ public:
+
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const {
+    // Some correctness check.
+    auto new_graph = ApplyImpl(std::move(graph));
+    // Some correctness check.
+    return new_graph;
+  }
+
+  // Get a reference to the attribute previously set.
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const;
+
+  // Set a pointer to the attribute. Pass takes ownership of the attribute.
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr) ;
+
+  // Set a pointer to the attribute. Pass doesn't take ownership. Caller
+  // should delete the attribute.
+  template <typename AttrType>
+  void SetNotOwned(const std::string &attr_name, AttrType *attr);
+
+ protected:
+  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const = 0;
+};
+
+// In my_pass.cc
+class MyPass : public Pass {
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override {
+    // do something.
+    return graph;
+  }
+};
+REGISTER_PASS(my_pass, MyPass)
+.RequirePassAttr("places")
+.RequireGraphAttr("dep_vars");
+
+
+// To use the pass.
+auto my_pass = ir::PassRegistry::Instance().Get("my_pass");
+graph = my_pass->Apply(std::move(graph));
+// Note: to force link my_pass.cc, in the code:
+USE_PASS(my_pass);
+```
+
+#### Optimize
+
+`Optimize` contains a series of `Pass` with defined order.
+`Optimize` transforms a `Graph` that only contains raw
+modeling logic to a `Graph` that can be run efficiently while
+maintaining the original modeling logic.
+
+
+### Optimize Process
+
+* Program is first converted to Graph.
+* Graph goes through a series of Pass
+* Graph is transformed from raw model logic to a
+form that is efficient to execute.
+
+```
+// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
+auto graph = Graph(program);
+graph = PassRegistry::Instance().Get("op_fuse_pass")->Apply(std::move(graph));
+// For more complex Pass, Optimize Process can provide Pass attributes.
+auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass");
+mem_opt_pass->SetNotOwned<int>("optimize_level", 1);
+graph = mem_opt_pass->Apply(std::move(graph));
+graph = PassRegistry::Instance().Get("multi_devices_pass")->Apply(std::move(graph));
+graph = PassRegistry::Instance().Get("multi_devices_check_pass")->Apply(std::move(graph));
+Executor exe;
+exe.Run(graph);
+
+```
diff --git a/doc/fluid/design/memory/README.md b/doc/fluid/design/memory/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7cf61d089b39041b7a15184e0ea9211d14a66f5e
--- /dev/null
+++ b/doc/fluid/design/memory/README.md
@@ -0,0 +1,141 @@
+# Region-based Heterogeneous Memory Management
+## Design
+
+### Usage
+
+To allocate 4KB CPU memory:
+
+```cpp
+p = memory::Alloc(platform::CPUPlace(), 4*1024);
+```
+
+To allocate 4KB memory on the 3rd GPU:
+
+```cpp
+p = memory::Alloc(platform::CUDAPlace(2), 4*1024);
+```
+
+To free memory and check the so-far used amount of memory on a place:
+
+```cpp
+auto pl = platform::CUDAPlace(0);
+p = memory::Alloc(pl, 4*1024);
+cout << memory::Used(pl);
+memory::Free(pl, p);
+```
+
+### API
+
+In `paddle/memory/memory.h` we have:
+
+```cpp
+namespace memory {
+template <typename Place> void* Alloc(Place, size_t);
+template <typename Place> void Free(Place, void*);
+template <typename Place> size_t Used(Place);
+}  // namespace memory
+```
+
+These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`:
+
+```cpp
+template<>
+void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+```
+
+and 
+
+```cpp
+template<>
+void* Alloc<CUDAPlace>(CUDAPlace p, size_t size) {
+  return GetGPUBuddyAllocator(p.id)->Alloc(size);
+}
+```
+
+Similar specializations exist for `Free` and `Used`.
+
+### Implementation
+
+`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletons.
+
+```cpp
+BuddyAllocator* GetCPUBuddyAllocator() {
+  static BuddyAllocator* a = NULL;
+  if (a == NULL) {
+    a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
+  }
+  return a;
+}
+
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    as = new BuddyAllocator*[platform::NumGPUs()];
+    for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
+      as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
+    }
+  }
+  return as[gpu_id];
+}
+```
+
+#### `BuddyAllocator`
+
+`BuddyAllocator` implements the buddy allocation algorithm.  Its constructor takes parameters only related with the algorithm:
+
+```cpp
+BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
+  ...
+}
+```
+
+Please be aware that **`BuddyAllocator` always allocates aligned memory**, aligned on 32 bytes, which can hold a `BuddyAllocator::Block` object:
+
+```cpp
+class BuddyAllocator {
+ private:
+  struct Block {
+    size_t size;
+    Block *left, *right;
+    size_t index; // allocator id
+  };
+  ...
+};
+```
+
+Because BuddyAllocator has the meta-data of each block, it can trace the used memory -- it records the amount returned by `Alloc` and freed in `Free`.  In contrast, `CPUAllocator` and `GPUAllocator` don't know the size of a freed memory block and cannot do the trace.
+
+#### System Allocators
+
+The `GPUAllocator` and `CPUAllocator` are called *system allocators*.  They work as the fallback allocators of `BuddyAllocator`.
+
+## Justification
+
+I got inspiration from Majel and Caffe2, though the above design looks different from both.
+
+### Caffe2
+
+In Caffe2, `Tensor<Context>::mutable_data()` allocates the memory.  In particular, [`Tensor<Context>::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor<Context>::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).
+
+There are two implementations of `Context`:
+
+1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
+
+1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202).  This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member.   `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
+
+### Majel
+
+In Majel, there are basically two allocator types:
+
+1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
+1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.
+
+However, memory allocation is not via these two allocators.  Instead, these two allocators are defined in hidden namespaces.
+
+In Majel there are hidden global variables like:
+
+1. `cpu::SystemAllocator g_cpu_allocator`, and
+1. `vector<gpu::SystemAllocator*> g_gpu_allocators(NUM_GPUS)`.
+
+Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.
diff --git a/doc/fluid/design/memory/images/control_flow_graph.png b/doc/fluid/design/memory/images/control_flow_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3579998e58d07abc50bd3332128d4733a391cb3b
Binary files /dev/null and b/doc/fluid/design/memory/images/control_flow_graph.png differ
diff --git a/doc/fluid/design/memory/images/dataflow_equations.png b/doc/fluid/design/memory/images/dataflow_equations.png
new file mode 100644
index 0000000000000000000000000000000000000000..c10f7f69f4007952e5b0394edaa04efa1cfbb658
Binary files /dev/null and b/doc/fluid/design/memory/images/dataflow_equations.png differ
diff --git a/doc/fluid/design/memory/images/deep_learning.png b/doc/fluid/design/memory/images/deep_learning.png
new file mode 100644
index 0000000000000000000000000000000000000000..026becc4d94e01e407dacb2a5314a0e5723334ff
Binary files /dev/null and b/doc/fluid/design/memory/images/deep_learning.png differ
diff --git a/doc/fluid/design/memory/index_cn.rst b/doc/fluid/design/memory/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c507c638bd1a6eb428175ed2756a6ecfc6cca198
--- /dev/null
+++ b/doc/fluid/design/memory/index_cn.rst
@@ -0,0 +1,7 @@
+内存管理
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  memory_optimization.md
diff --git a/doc/fluid/design/memory/index_en.rst b/doc/fluid/design/memory/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f7526437a73a09b300f05e138084755f5528b242
--- /dev/null
+++ b/doc/fluid/design/memory/index_en.rst
@@ -0,0 +1,7 @@
+Memory Management
+-------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  memory_optimization.md
diff --git a/doc/fluid/design/memory/memory_optimization.md b/doc/fluid/design/memory/memory_optimization.md
new file mode 100644
index 0000000000000000000000000000000000000000..285464ada728d8f7a086a26beca6cfa4418e98e4
--- /dev/null
+++ b/doc/fluid/design/memory/memory_optimization.md
@@ -0,0 +1,217 @@
+# Memory Optimization
+
+
+## Problem
+
+In a lecture, Andrew Ng attributes the recent success of AI to a combination of the following:
+
+- Availability of Big Data
+- Supercomputing power to process this Big Data over very large neural networks
+- Modern algorithms
+
+The following graph shows the details:
+
+![](images/deep_learning.png)
+
+Larger models usually bring better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary for both online and mobile inference.
+
+## Solution
+
+### Basic Strategy
+
+There are some basic strategies to improve memory usage, including in-place operations and memory sharing.
+
+#### In-place Operation
+In a relu activation operator:
+
+$y = \max(x, 0)$
+
+If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x will be the same. In-place operations will save 50% memory occupancy immediately.
+
+#### Memory Sharing
+
+Not all operators support in-place operations. Memory sharing is a more general strategy.
+
+Following is an example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+In this case, variable a is no longer used after op2, and op2 does not support in-place operation. After op2 finishes, we can return the memory of variable a to a memory pool. Then, variable e can reuse the memory of variable a from the pool.
+
+
+### Live Variable Analysis
+
+It's not enough to only have some basic strategies. The pre-requisite of memory optimization is to know if a variable is still "live" after an operation.
+
+In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation. 
+
+In compilers, the front end of the compiler translates programs into an intermediate language with an unbounded number of temporary variables. This program must run on a machine with a bounded number of registers. Two temporary variables a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporary variables can fit in few registers; if they don't all fit, the excess temporary variables can be kept in memory.
+
+Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporary variables are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis. 
+
+We can learn these techniques from compilers. Live variable analysis consists of two main stages:
+
+- construct a control flow graph
+- solve the dataflow equations
+
+
+#### Control Flow Graph
+To perform analysis on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statement x can be followed by statement y, there is an edge from x to y.
+
+Following is the flow graph for a simple loop.
+
+![](images/control_flow_graph.png)
+
+#### Dataflow Analysis
+
+Liveness of variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
+
+A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
+
+- Flow Graph Terminology
+
+A flow graph node has out-edges that lead to successor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of successors.
+In the control flow graph above, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}.
+
+- Uses and Defs
+
+An assignment to a variable or temporary defines that variable. An occurrence of a variable on the right-hand side of an assignment (or in other expressions) uses the variable. We can define the *def* of a variable as the set of graph nodes that define it, or the *def* of a graph node as the set of variables that it defines, and similarly for the *use* of a variable or graph node. In the control flow graph above, *def(3)* = {c}, *use(3)* = {b, c}.
+
+- Liveness
+
+A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node.
+
+
+The calculation of liveness can be solved by iteration until a fixed point is reached. The following are the recursive formulas:
+
+![](images/dataflow_equations.png)
+
+### Memory optimization transpiler
+
+Finally, we combine the basic strategies with the liveness analysis techniques learned from compilers to implement our memory optimization transpiler.
+
+#### add in-place attribute
+
+In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator.
+
+
+#### construct control flow graph
+
+Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py) example.
+
+- Block0:
+
+```
+lookup_table
+mul
+...
+while(sub-block idx 1)
+...
+array_to_lod_tensor
+cross_entropy
+...
+while_grad(sub-block idx 2)
+read_from_array
+array_to_lod_tensor
+...
+```
+
+- Block1
+
+```
+read_from_array
+read_from_array
+...
+write_to_array
+increment
+write_to_array
+less_than
+```
+
+- Block2
+
+```
+read_from_array
+increment
+...
+write_to_array
+write_to_array
+```
+
+We can transfer all the operators and variables in ProgramDesc to build a control flow graph.
+
+```python
+class ControlFlowGraph(object):
+    def __init__(self, Program):
+        self._sucessors = defaultdict(set)
+        self._presucessors = defaultdict(set)
+        self._uses = defaultdict(set)
+        self._defs = defaultdict(set)
+        self._live_in = defaultdict(set)
+        self._live_out = defaultdict(set)
+        self._program = Program
+    
+    def build(self):
+        pass
+    
+    def dataflow_analysis(self):
+        pass
+        
+    def memory_optimization(self):
+        pass
+        
+    def get_program(self):
+        return self._program
+```
+
+#### Make dataflow analysis
+
+We follow the guide from compilers and try to solve the dataflow equation to get liveness of every variable. If the live-in of an operator node is different from the live-out, then we can make memory sharing. 
+
+For example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+The dataflow analysis result is:
+
+```
+live_in(op1) = {b, c, f}
+live_out(op1) = {a, f}
+
+live_in(op2) = {a, f}
+live_out(op2) = {d, f}
+
+live_in(op3) = {d, f}
+live_out(op3) = {}
+```
+
+After op1, we can process variable b and variable c; After op2, we can process variable a. After op3, we can process variable d and variable f.
+
+#### memory sharing policy
+
+A memory pool will be maintained in the stage of memory optimization. Each operator node will be scanned to determine whether memory optimization can be applied. If an operator satisfies the requirement, the following policy will be taken to handle input/output variables.
+
+```
+if op.support_inplace():
+    i --> pool
+    pool --> o
+else:
+    pool --> o
+    i --> pool
+```
+
+
+
+## Reference
+
+- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5)
+- Modern compiler implementation in ML, by Andrew W. Appel
+- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html)
diff --git a/doc/fluid/design/modules/backward.md b/doc/fluid/design/modules/backward.md
new file mode 100644
index 0000000000000000000000000000000000000000..20fda7a98f514a3f1c1c2d0ba7447ec954b21d5a
--- /dev/null
+++ b/doc/fluid/design/modules/backward.md
@@ -0,0 +1,158 @@
+# Backward Building
+
+## Motivation
+
+In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So a mechanism is required by the framework which can complete the model's backward part automatically according to the given forward part.
+
+When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes the gradients of its corresponding `op`'s outputs and calculates the gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op`, and then strings them together in the reverse order of the forward part. In this way, gradients propagate from the end to the beginning of the model, in other words, from the loss to the parameters.
+
+## Challenges
+
+The motivation of backward building is apparent. However, implementing it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested. It means that the `op`s and `variable`s are scattered across different blocks rather than all being gathered in a single graph. Our backward building algorithm shall visit blocks in recursive order and be able to insert `grad_op`s and newly created `variable`s into the right place.
+
+## Usage
+
+Although the whole algorithm is comprised of many functions, only one is exposed as API:
+
+```python
+def append_backward(loss, parameter_list=None, no_grad_set=None):
+    """
+    Append backward part to main_program
+
+    Args:
+        loss(Variable): The variable generated by the cost function.
+        parameter_list(list): Parameters that need to be updated by optimizers.
+            If None, it means all parameters need to be updated.
+
+        no_grad_set(set): Variables that have no gradients in Block 0. 
+            If None, the set will be generated inside the function and 
+            contains all variables with `step_gradient=True` from all blocks.
+        
+    Return:
+        (list[Variable]): list of (parameters, gradients) pair.
+    """
+```
+
+By invoking this API, the framework appends the backward part of the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient is generated and backpropagation starts. `parameter_list` marks all parameters that need updating. If it's `None`, all parameters will be updated by optimizers. `no_grad_set` marks variables without gradients. If all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run.
+
+This API will be invoked automatically before optimizer building. 
+As a result, in most cases, users do not need to invoke the API by themselves to append backward part.
+
+## Implementation
+
+The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables. 
+
+### Creating `grad_op`s
+
+The creating of `grad_op`s is implemented by:
+
+```python
+def _append_backward_ops_(target,
+                          block,
+                          target_block,
+                          no_grad_dict,
+                          grad_to_var):
+    """
+    Create all grad ops, and insert them into given block
+
+    Args:
+        target(Variable): the target variable of forward pass
+        block(Block): the block where forward ops are
+        target_block(Block): the block which is going to hold new generated grad ops
+        no_grad_dict(dict): 
+            key(int)  block index
+            val(set) a set of varibale names. These varibales have no gradient
+        grad_to_var(dict)(output argument):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+    """
+```
+
+Given a `block`, the function traverses all `op`s in this block in reverse order, gets the corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, then appends it to `target_block`.
+
+However, some specific `op`s (e.g. `while_op`, `if_else_op`) can hold their own sub-blocks. Since these sub-blocks contain `op`s as well, the `grad_op` creation should be recursive.
+
+During the reverse traversal, we check each `op` whether it has an attribute named `sub_block`. If so, it means there is a sub-block and we need to deal with it first. After creating a new block whose father is the one in `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to parameter `target_block` and the one in `op`'s attribute to `block`. The *pseudo-code* shows this process:
+
+```
+******* pseudo-code ********
+for op in reversed(block.ops):
+    if op has an attribute named 'sub_block':
+        Get the sub-block(`s_block`) from op's attribute.
+        Create a new block(`grad_s_block`), whose father is `s_block`.
+        Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block`
+    
+    Invoke `core.get_grad_op_desc()` to get op's grad_op.
+    Insert name correspondings between variables and their gradients of the grad_op to grad_to_var
+    Assign grad_s_block to grad_op as it's 'sub_block' attribute.
+    Append grad_op to current target_block.
+```
+
+The first invoking of `_append_backward_ops_()` is initiated by `append_backward()`, in which parameters `block` and `target_block` are all assigned with root block(the block with index 0).
+
+### Corner Cases of `grad_op` Creating
+
+In the previous section, we showed the regular process of `grad_op` creation. However, in some corner cases, the conventional algorithm is not enough to get the correct result and additional handling is required. These additional processes run after the algorithm mentioned above and make some special adjustments to its output `grad_op`s.
+
+#### Shared Variables
+
+If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op` in the following backward pass. To make the gradient result the sum of all `grad_op`s' outputs instead of the output of the last one to run, we assign each output a temporary variable and then add a `sum_op` to add them up.
+
+For debugging convenience, if the final gradient name is `w@GRAD`, its corresponding temporary variables will be named `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`...
+
+See function `_addup_repetitive_outputs_` in `backward.py` for implementation details.
+
+#### No Gradient Variables
+
+In our framework, variables can be marked as *no_gradient*, which means that the gradient of this variable is unnecessary and can be considered as zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in the backward pass.
+
+Another situation is when all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered as zeros. Since `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all gradient inputs are zeros. Therefore the `grad_op` can also be skipped.
+
+It should be noted that all these zero gradients still need to be created and initialized by something, otherwise subsequent `grad_op`s that take these gradients as inputs run the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros.
+
+These features are implemented in the function `_remove_no_grad_branch_`. It checks newly created `grad_op`s one by one, removes those that can be skipped and inserts `fill_zeros_like_op` when it's necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute (True or False).
+
+### Creating Backward Variables
+
+Up to now, we have completed all creating and adjusting jobs of `grad_op`s. However, backward variables have not been created. Now they are only represented by `grad_op`'s input and output arguments. The backward variable creating job will be done by:
+
+```python
+def _append_backward_vars_(block, 
+                           start_op_idx, 
+                           grad_to_var, 
+                           grad_info_map):
+    """
+    Create new variables required by backward pass.
+
+    Args:
+        block(Block): the block where new variables will be created
+        start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
+        grad_to_var(dict):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+            In most cases, this dict is generated by _append_backward_ops_()
+        grad_info_map(dict)(output argument):
+            key(str): forward variable name
+            val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
+    """
+```
+
+Given a `block`, this function traverses all the `grad_op`s in it(The argument `start_op_idx` indicates where the grad_op sequence starts.) and creates all the uncreated outputs. The *pseudo-code* shows this process:
+
+```
+for op in block.ops[start_op_idx : ]:
+
+    if op has an attribute named 'sub_block':
+        Get the sub-block(`s_block`) from op's attribute.
+        Invoke _append_backward_vars_(), with `block=s_block`
+        
+    for var_name in op.all_output_names():
+        if block.has_var_recursive(var_name) or var_name is the name of empty variable:
+            continue
+        create a new variable named 'var_name' in block
+        if grad_to_var.has_key(var_name):
+            set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name, block)
+            
+    do op's var type inference
+    do op's shape inference
+```
diff --git a/doc/fluid/design/modules/batch_norm_op.md b/doc/fluid/design/modules/batch_norm_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..e451ffcc73b5de2b911e1c6de54b42a5d1d54c37
--- /dev/null
+++ b/doc/fluid/design/modules/batch_norm_op.md
@@ -0,0 +1,134 @@
+# Batch Normalization
+
+## What is batch normalization
+
+Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and makes the data distribution easier for the next layer's training.
+
+The principle of batch normalization can be summarized into a simple function:
+
+```
+y = ((x - E[x]) / STD[x]) * scale + bias
+```
+
+`x` is a batch of output data of a certain layer. `E[x]` and `STD[x]` are the mean and standard deviation of `x`, respectively. `scale` and `bias` are two trainable parameters. Training a batch normalization layer amounts to learning the best values of `scale` and `bias`.
+
+In our design, we use a single operator(`batch_norm_op`) to implement the whole batch normalization in C++, and wrap it as a layer in Python.
+
+## Differences with normal operators
+
+`batch_norm_op` is a single operator. However, there are a few differences between `BatchNormOp` and normal operators, which we shall take into consideration in our design.
+
+1. `batch_norm_op` shall behave differently in training and inferencing. For example, during inferencing, there is no batch data and it's impossible to compute `E[x]` and `STD[x]`, so we have to use an `estimated_mean` and an `estimated_variance` instead of them. This requires our framework to be able to inform operators of the current running type (training/inferencing), so that operators can switch their behaviors.
+
+2. `batch_norm_op` shall have the ability to maintain `estimated_mean` and `estimated_variance` across mini-batch. In each mini-batch, `estimated_mean` is iterated by the following equations:
+
+```
+if batch_id == 0
+  estimated_mean = E[x]
+else
+  estimated_mean = estimated_mean * momentum + (1.0 - momentum) * E[x]
+```
+
+The iterating of `estimated_variance` is similar. `momentum` is an attribute, which controls estimated_mean updating speed.
+
+## Implementation
+
+Batch normalization is designed as a single operator in C++, and then wrapped as a layer in Python.
+
+### C++
+
+As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attributes and compute kernels.
+
+#### Inputs
+
+- `x`: The inputs data, which is generated by the previous layer.
+- `estimated_mean`: The estimated mean of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `E[x]`.
+- `estimated_var`: The estimated standard deviation of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `STD[x]`.
+- `scale`: trainable parameter 'scale'
+- `bias`: trainable parameter 'bias'
+
+#### Outputs
+
+- `y`: The output data.
+- `batch_mean`: The mean value of batch data.
+- `batch_var`: The standard deviation value of batch data.
+- `saved_mean`: Updated `estimated_mean` with current batch data. It's supposed to share the memory with input `estimated_mean`.
+- `saved_var`: Updated `estimated_var` with current batch data. It's supposed to share the memory with input `estimated_var`.
+
+#### Attributes
+
+- `is_infer`: *bool*. If true, run `batch_norm_op` in inferencing mode.
+- `use_global_est`: *bool*. If true, use `saved_mean` and `saved_var` instead of `E[x]` and `STD[x]` in training.
+- `epsilon`: *float*. The epsilon value to avoid division by zero.
+- `momentum`: *float*. Factor used in `estimated_mean` and `estimated_var` updating. The usage is shown above.
+
+#### Kernels
+
+The following graph shows the training computational process of `batch_norm_op`:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/batch_norm_op_kernel.png" width="800"/>
+
+cudnn provides APIs to finish the whole series of computation, we can use them in our GPU kernel.
+
+### Python
+
+`batch_norm_op` is wrapped as a layer in Python:
+
+```python
+def batch_norm_layer(net,
+                     input,
+                     output,
+                     scale,
+                     bias,
+                     use_global_est = False,
+                     epsilon = 1e-6,
+                     momentum = 0.99):
+	mean_cache = scope.new_var(name = 'estimated_mean', trainable = False)
+	var_cache = scope.new_var(name = 'estimated_var', trainable = False)
+	batch_mean = scope.new_var(name = 'batch_mean')
+	batch_var = scope.new_var(name = 'batch_var')
+	batch_norm_op = Operator('batch_norm_op',
+	                         x = input,
+	                         estimated_mean = mean_cache,
+	                         estimated_var = var_cache,
+	                         scale = scale,
+	                         bias = bias,
+	                         y = output,
+	                         batch_mean = batch_mean,
+	                         batch_var = batch_var,
+	                         saved_mean = mean_cache,
+	                         saved_var = var_cache,
+	                         is_infer = False,
+	                         use_global_est = use_global_est,
+	                         epsilon = epsilon,
+	                         momentum = momentum)
+	net.append_op(batch_norm_op)
+	return output
+```
+
+Because Python API has not been finally decided, the code above can be regarded as pseudo code. There are a few key points we shall note:
+
+1. `estimated_mean` and `estimated_var` are assigned the same variables as `saved_mean` and `saved_var` respectively, so they share the same memory. The output mean and variance values (`saved_mean` and `saved_var`) of a certain batch will be the inputs (`estimated_mean` and `estimated_var`) of the next batch.
+
+2. `is_infer` decides whether `batch_norm_op` will run in training mode or inferencing mode. However, a network may contain both training and inferencing parts, and the user may switch `batch_norm_op`'s running mode in a Python `for` loop like this:
+
+```python
+for pass_id in range(PASS_NUM):
+    # ...
+    net.train()  # run training model
+    if pass_id % 100 == 0:
+        net.infer(test_image)    # run inferencing model
+    # ...
+```
+
+`is_infer` is an attribute. Once an operator is created, its attributes cannot be changed. This suggests that we shall maintain two `batch_norm_op`s in the model, one whose `is_infer` is `True` (we call it `infer_batch_norm_op`) and another whose `is_infer` is `False` (we call it `train_batch_norm_op`). They share all parameters and variables, but are placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one going through `train_batch_norm_op` and the other going through `infer_batch_norm_op`:
+
+<div align=center>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/batch_norm_fork.png" width="500"/>
+</div>
+
+Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate.
+
+When the net runs in training mode, the end of the left branch will be set as the running target, so the dependency tracking process will ignore right branch automatically. When the net runs in inferencing mode, the process is reversed.
+
+How to set a target is related to Python API design, so I will leave it here waiting for more discussions.
diff --git a/doc/fluid/design/modules/evaluator.md b/doc/fluid/design/modules/evaluator.md
new file mode 100644
index 0000000000000000000000000000000000000000..de9605b0e67a035ab1ef1e4cafbe838f83bc5807
--- /dev/null
+++ b/doc/fluid/design/modules/evaluator.md
@@ -0,0 +1,58 @@
+# Evaluator Design
+
+## Problem Statement
+
+During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants.
+
+## Evaluator Design
+Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
+
+1. Initialize the metric state and add it into the block.
+
+2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a minibatch data if run once.
+
+
+3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
+
+## Implementation
+This design is shown in the Python API.
+Each metric operator needs to calculate the metric statistic and return the batch-aware states. The Python side is responsible for accumulating the states for each pass.
+
+
+```python
+class Evaluator(object):
+    """
+    Evaluator Base class.
+    """
+    def __init__(self, name, **kwargs):
+       """
+       Different evaluator may has different metric states. E.g, Accuracy need two variables, total and right sample counts.
+       Auc need four variables, `true_positives`,
+         `true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create its needed variables and append to main_program
+
+       The initialization of Evaluator should be responsible for:
+       create metric states and append to the main_program
+       """
+       pass
+
+    def _update_ops(self, input, label, **kwargs):
+       """
+       Add mini-batch evaluator caculate operators to the main_program.
+       Add increment operator to accumulate the metric states.
+       """
+
+
+    def reset(self, executor, reset_program=None):
+      """
+      Reset metric states at the begin of each pass/user specified batch number.
+      Execute the reset_program to reset the states.
+      """
+
+
+    def eval(self, executor, eval_program=None):
+      """
+      Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
+      Execute the eval_program and return the result.
+      """
+      return eval_result
+```
diff --git a/doc/fluid/design/modules/images/batch_norm_fork.dot b/doc/fluid/design/modules/images/batch_norm_fork.dot
new file mode 100644
index 0000000000000000000000000000000000000000..4bc47713cba2cb23f1b34fffe6426ef10ac3a9df
--- /dev/null
+++ b/doc/fluid/design/modules/images/batch_norm_fork.dot
@@ -0,0 +1,25 @@
+digraph ImageBatchNormForkGragh {
+  subgraph cluster_before {
+    Prev [label="...", shape=plaintext];
+    Rnn [label="rnn_op", shape=box];
+    BatchNorm [label="batch_norm_op", shape=box];
+    Fc [label="fc_op", shape=box];
+    After [label="...", shape=plaintext];
+    Prev -> Rnn -> BatchNorm -> Fc -> After;
+    label="original";
+  }
+
+  subgraph cluster_after {
+    Prev2 [label="...", shape=plaintext];
+    Rnn2 [label="rnn_op", shape=box];
+    BatchNorm2_1 [label="train_batch_norm_op", shape=box];
+    BatchNorm2_2 [label="infer_batch_norm_op", shape=box];
+    Fc2_1 [label="fc_op", shape=box];
+    Fc2_2 [label="fc_op", shape=box];
+    After2_1 [label="...", shape=plaintext];
+    After2_2 [label="...", shape=plaintext];
+    Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1;
+    Rnn2 -> BatchNorm2_2 ->Fc2_2 ->After2_2
+    label="forked";
+  }
+}
diff --git a/doc/fluid/design/modules/images/batch_norm_fork.png b/doc/fluid/design/modules/images/batch_norm_fork.png
new file mode 100644
index 0000000000000000000000000000000000000000..aded62bce5bc268b7a3ef4dc96c89fe21d6ea955
Binary files /dev/null and b/doc/fluid/design/modules/images/batch_norm_fork.png differ
diff --git a/doc/fluid/design/modules/images/batch_norm_op_kernel.png b/doc/fluid/design/modules/images/batch_norm_op_kernel.png
new file mode 100644
index 0000000000000000000000000000000000000000..a99ce81ff3bf42880ebbd6a1297de3bf038e09b2
Binary files /dev/null and b/doc/fluid/design/modules/images/batch_norm_op_kernel.png differ
diff --git a/doc/fluid/design/modules/images/feed_forward.png b/doc/fluid/design/modules/images/feed_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..d312371a04c26aa6cd196e0bd1f51becb425180b
Binary files /dev/null and b/doc/fluid/design/modules/images/feed_forward.png differ
diff --git a/doc/fluid/design/modules/images/feed_forward_regularized.png b/doc/fluid/design/modules/images/feed_forward_regularized.png
new file mode 100644
index 0000000000000000000000000000000000000000..677e99bfd9f8e72ed9fe4b27127af2ced202f447
Binary files /dev/null and b/doc/fluid/design/modules/images/feed_forward_regularized.png differ
diff --git a/doc/fluid/design/modules/images/l1_regularization.png b/doc/fluid/design/modules/images/l1_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..e1b9c7a44f94dc027598a98da93ddb8133190972
Binary files /dev/null and b/doc/fluid/design/modules/images/l1_regularization.png differ
diff --git a/doc/fluid/design/modules/images/l2_regularization.png b/doc/fluid/design/modules/images/l2_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..d5c2fcbc2ccae75ad083162e5a2dceb0210be298
Binary files /dev/null and b/doc/fluid/design/modules/images/l2_regularization.png differ
diff --git a/doc/fluid/design/modules/images/loss_equation.png b/doc/fluid/design/modules/images/loss_equation.png
new file mode 100644
index 0000000000000000000000000000000000000000..14212ec8d36c803de96bde8a9a4b5591bd20434e
Binary files /dev/null and b/doc/fluid/design/modules/images/loss_equation.png differ
diff --git a/doc/fluid/design/modules/index_cn.rst b/doc/fluid/design/modules/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b25783f0f5120991c29ba31b7b512bd4c183eecf
--- /dev/null
+++ b/doc/fluid/design/modules/index_cn.rst
@@ -0,0 +1,14 @@
+代码结构和重要模块
+------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  backward.md
+  python_api.md
+  regularization.md
+  infer_var_type.md
+  optimizer.md
+  prune.md
+  register_grad_op.md
+  net_op_design.md
diff --git a/doc/fluid/design/modules/index_en.rst b/doc/fluid/design/modules/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2108156e080996916f2650448f0a56f998757204
--- /dev/null
+++ b/doc/fluid/design/modules/index_en.rst
@@ -0,0 +1,14 @@
+Code Structure and Important Modules
+-------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  backward.md
+  python_api.md
+  regularization.md
+  infer_var_type.md
+  optimizer.md
+  prune.md
+  register_grad_op.md
+  net_op_design.md
diff --git a/doc/fluid/design/modules/infer_var_type.md b/doc/fluid/design/modules/infer_var_type.md
new file mode 100644
index 0000000000000000000000000000000000000000..d9d5397becba2ef1806d9341cd49cd9aabbf4a6a
--- /dev/null
+++ b/doc/fluid/design/modules/infer_var_type.md
@@ -0,0 +1,78 @@
+# Design Doc: InferVarType
+
+## The Problem Posed
+
+Variables in our design can hold various types, such as `LoDTensor` and `SelectedRows`. An operator should be able to infer the variable types of its outputs.
+
+For example, a `lookup table` operator takes two `LoDTensor`; one is a float tensor as the embedding table, the other is an int tensor as word ID. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs and will generate a `LoDTensor` if any of its inputs is `LoDTensor`, otherwise, the `sum` operator will generate `SelectedRows` as its output.
+
+The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameter) or be inferred by the operator at compile time.
+
+## Proposed Solution
+
+The `InferVarType` is a compile-time function which is registered to each operator. The interface of that function is:
+
+
+```c++
+using InferVarTypeFN = std::function<
+    void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>;
+```
+
+It takes an operator description as its input, infers the types of the output variables, and stores them in the block description.
+
+The `InferVarTypeFN` will be registered in `OpInfo`, to replace `infer_var_type_` field. The `OpInfo` should be
+
+```cpp
+struct OpInfo {
+  InferVarTypeFN infer_var_type_;
+  ...
+};
+```
+
+The default `InferVarType` will set output type as `LoDTensor`. It can be done by `GetInferVarType()`.
+
+```cpp
+void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) {
+  // set the output type of variable as `LoDTensor`.
+  // ...
+}
+
+struct OpInfo {
+  InferVarTypeFN infer_var_type_;
+  InferVarTypeFN GetInferVarType() const {
+    if (infer_var_type_) {
+      return infer_var_type_;
+    } else {
+      return DefaultInferVarType;
+    }
+  }
+};
+```
+
+## Register InferVarType
+
+We provide a thin base class for registering an `InferVarTypeFN`. Using a base class eases the implementation of the registry, since we can detect whether a registry entry is an `InferVarTypeFN` or not.
+
+```cpp
+class VarTypeInferer {
+public:
+  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0;
+}
+```
+
+Operator developers can write a specialized `VarTypeInferer` as follows.
+
+```cpp
+class SpecialVarTypeInferer : public VarTypeInferer {
+public:
+  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const {
+    // .. own logic
+  }
+}
+```
+
+Then users can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`.
+
+```
+REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...);
+```
diff --git a/doc/fluid/design/modules/net_op_design.md b/doc/fluid/design/modules/net_op_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..e64ac2fb1c6898bfeb883250347da3d9a4757b97
--- /dev/null
+++ b/doc/fluid/design/modules/net_op_design.md
@@ -0,0 +1,250 @@
+# Network Design
+
+`Network` is the container and controller of a set of operators.
+Users can build a real network from a `NetDesc`, which is a protobuf message,
+and use `Network.Run()` to run all the operators in the network.
+
+A network object knows all Operators belonging to this network. Variables,
+which are inputs and outputs of these operators,
+are created and managed by a hierarchy of Scope objects.
+
+## API
+
+### Net
+To make the `Network` extendable, a base class is defined like this
+
+```c++
+// operator's index stored in a network.
+typedef int OpIndex;
+
+// The minimum a network should be implemented.
+class Net {
+ public:
+  // run all the operators and return success(true) or not, with all the
+  // variables are located in `scope`. `context` describes the detail execution
+  // environment for ops. `begin` and `end` specify the scope of `ops_` to run,
+  // If no positive indexes are provided, all operators in `ops_` will run.
+  virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1,
+                   OpIndex end = -1) const = 0;
+
+  // Add an Operator according to `def`.
+  virtual OpIndex AddOp(const proto::OpDef &def) = 0;
+
+  // Add optimizer operators acctording to `attrs`.
+  virtual Error AddOptimizerOps(const OptAttrs &attrs) = 0;
+
+  // Add backward operators.
+  virtual Error AddBackwardOps() = 0;
+
+  // Infer the shapes of variables required by operators in the network. The
+  // `scope` will be mutated according to the inferred shapes.
+
+  static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
+};
+```
+
+All network implementations should build networks from a protobuf message which
+describes the structure of a real network; `Run` method should be implemented by
+all implementations to offer a universal method to forward or backward compute a network.
+
+`Net::Create` is a method of factory pattern and can be implemented like
+
+```c++
+std::unique<Net> Net::Create(const NetDesc& def) {
+  switch (def.model_type()) {
+    case NN:
+      return new Network(def);
+    case Recursive:
+      return new RecursiveNet(def);
+    case Recurrent:
+      return new RecurrentNet(def);
+  }
+  return nullptr;
+}
+```
+
+Network is designed as the container of operators. To make it more extendable,
+we decouple it from the related variable resources.
+
+`Run(Scope* scope)` takes the scope as an argument so that it can run in different scopes.
+
+Finally, `Net` can be used as follows
+
+```c++
+Scope default_scope;
+OpContext default_context;
+auto net = Net::CreateNet(def);
+
+if (net) {
+  net.Run(&default_scope, &default_context);
+}
+```
+
+### `PlainNet` as a simple implementation of `BaseNet`
+
+A very basic implementation is as follows. All it does is run every operator in sequence.
+
+```c++
+class PlainNet : public Net {
+ public:
+  // Create a network describe by `def`.  NetDesc is the definition of a network.
+  PlainNet(const NetDesc &def);
+
+  // Infer all the operators' input and output variables' shapes, will be called before every
+  // mini-batch training.
+  virtual Error InferShape(Scope *scope) override;
+
+  // Run all the operators with the `scope`, if no scope is provided, default
+  // scope will be used instead. If no OpContext is provicded, default context will be used.
+  virtual Error Run(Scope *scope = nullptr, OpContext *context=nullptr, OpIndex begin = -1,
+                   OpIndex end = -1) const override;
+
+  virtual OpIndex AddOp(const proto::OpDef &def) override;
+
+  virtual Error AddOptimizerOps(const OptAttrs &attrs) override;
+
+  virtual Error AddBackwardOps() override;
+
+ protected:
+  // Create operators accordding to `def`, will be called by the constructor.
+  Error BuildNet(const NetDesc &def);
+
+  // Add a operator which is identified as `type` and has attributes described
+  // in `attrs`, the `inputs` are the keys of readonly input variables,
+  // `outputs` are keys of mutable output variables. An `OpIndex` will be
+  // returned to indicate the offset of the new operator in `ops_`.
+  OpIndex AddOp(const std::string &type, const std::vector<string> &inputs,
+                const std::vector<string> &outputs,
+                const OprAttr &attrs = OprAttr());
+
+ private:
+  // the operators owned by `Network`.
+  std::vector<Operator> ops_;
+};
+```
+
+`PlainNet` will create operators so that a private member `ops_` is defined,
+the operators are created by `CreateNet`, and each operator is created by `AddOp`.
+
+
+## PlainNet Usage
+`PlainNet` can be used to define and run a network as follows
+
+```c++
+// create an empty scope located on CPU device.
+Scope scope(CPUPlace());
+
+// create and init variables described in `net_desc`.
+scope.CreateVariables(net_desc);
+scope.InitVariables(net_desc);
+
+// create a network according to `net_desc`
+auto net = Net::CreateNet(net_desc);
+// Add more operators if needed.
+net->AddOp(add...);
+net->AddOp(fc...);
+
+net->AddBackwardOps();
+net->AddOptimizerOps();
+
+// run the network providing the `scope`.
+net.Run(&scope);
+```
+
+## `NetBuilder` as a C++ syntax wrapper
+This is a detailed description of the user-related C++ network API, and may not be needed in the prototype development stage.
+
+The `NetBuilder` will give users a much simpler syntax as follows to create a network, and demonstrates how to use the `BaseNet`'s raw interfaces.
+
+```c++
+Variable* fc_out = builder.AddOp("fc", input=image, size=100, activation="Sigmoid");
+Variable* prediction = builder.AddOp("fc", input=fc_out, size=10, activation="Sigmoid");
+Variable* loss = builder.AddOp("cross_entropy", input=prediction, label=label);
+Variable* avg_loss = builder.AddOp("mean", loss);
+
+builder.BackwardFrom(avg_loss)
+builder.AddOptimization(1e-4, "adam");
+builder.Run();
+```
+
+`NetBuilder` will call `Net` 's virtual functions to change the real network structure, here is a sample definition
+
+```c++
+class NetBuilder final {
+ public:
+  NetBuilder(Net* net) : net_(net) {}
+
+  Variable* AddOp(const string& type, const vector<Variable>& inputs,
+                  size_t size, Activation act) {
+    // much code here.
+    // ...
+    net_->AddOp(def);
+    need_rebuild_net_ = true;
+    net_->InferShape();
+    // ...
+  }
+
+  Error BackwardFrom(const Variable& cost);
+
+  Error Run(Scope* scope, OpContext* context, bool need_backward = true) {
+    // backward.
+    if (need_backward) {
+      if (need_rebuild_net_) {
+        AddBackwardOps();
+        AddOptimizerOps();
+      }
+      net_->Run(scope, context);
+      return;
+    }
+    // just forward.
+    net_->Run(scope, context, 0, last_forward_op_);
+  }
+
+ protected:
+  Error AddBackwardOps();
+  Error AddOptimizerOps();
+
+ private:
+  Net* net_;
+  OpIndex last_forward_op_{-1};
+  bool need_rebuild_net_{true};
+}
+```
+
+### Compatibility with RNN
+
+Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design,
+for example we can implement a simple recurrent neural network as follows
+
+```c++
+// copy some `vars` form `source` to `target`
+void Copy(const Scope &source, Scope &target,
+          const std::vector<std::string> &vars);
+
+Scope default_scope;
+// some initial mutations on `default_scope` here.
+
+auto rnn_step_net = PlainNet(rnn_step_net_def);
+
+// Create rnn's states, the last scope is used to store rnn outputs.
+Scope *rnn_states = new Scope[num_states + 1];
+
+for (int i = 0; i < num_states + 1; i++) {
+  // Initialize all rnn state scopes, copy parameters and so on.
+  rnn_states[i].CreateVars(rnn_step_net_def);
+  Copy(default_scope, rnn_states[i], rnn_related_vars);
+  // Prepare rnn's inlinks, just copy inlink variables to each state.
+  Copy(default_scope, rnn_states[i], inlink_vars);
+}
+
+// Run the rnn.
+for (int i = 0; i < num_states; i++) {
+  rnn_step_net.Run(rnn_states[i]);
+  // Copy current state's state variables to next state, the related variables
+  // are named like "previous_state_xxx".
+  Copy(rnn_states[i], rnn_states[i + 1], pre_state_vars)
+}
+
+// Copy rnn's final outputs to `default_scope`.
+Copy(rnn_states[num_states], default_scope, outlink_vars);
+```
diff --git a/doc/fluid/design/modules/optimizer.md b/doc/fluid/design/modules/optimizer.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c25fde9cafb322f789662077d3fc6cc1d64ce38
--- /dev/null
+++ b/doc/fluid/design/modules/optimizer.md
@@ -0,0 +1,91 @@
+# Optimizer Design
+
+## The Problem
+
+A PaddlePaddle program, or a block, is a sequence of operators operating on variables.  A training program needs to do three kinds of work:
+
+1. the forward pass, which computes intermediate results and the cost(s),
+1. the backward pass, which derives gradients from intermediate results and costs, and
+1. the optimization pass, which updates model parameters to optimize the cost(s).
+
+These kinds of work rely on three kinds of operators:
+
+1. forward operators,
+1. gradient operators, and
+1. optimization operators.
+
+It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically.
+
+In this design, we propose a high-level API that automatically derives the optimization pass and operators from the forward pass.
+
+
+## High-level Python API to describe the training process
+
+1. Users write code to describe the network:
+
+	```python
+	images = layer.data("images")
+	labels = layer.data("labels")
+	w1 = pd.var("w1")
+	b1 = pd.var("b1")
+	hidden = layer.fc(images, w=w1, b=b1)
+	cost = layer.mse(hidden, labels)
+	```
+
+	The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+
+
+2. Users create a certain kind of Optimizer with some arguments.
+
+	```python
+	optimizer = AdagradOptimizer(learing_rate=0.001)
+	```
+
+3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list.
+
+	```python
+	opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])
+	```
+	The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is a list of optimization operators that will be run by the session.
+
+4. Users use Session/Executor to run this opt_op_list as target to do training.
+
+	```python
+	sess.run(target= opt_op_list, ...)
+	```
+
+### Optimizer Python interface:
+
+```python
+class Optimizer(object):
+    """Optimizer Base class.
+
+    """
+
+    def __init__(self):
+        pass
+
+    def create_optimization_pass(self, parameters_and_grads):
+        """Add optimization operators to update gradients to variables.
+
+        Args:
+          parameters_and_grads: a list of (variable, gradient) pair to update.
+
+        Returns:
+          optmization_op_list: a list of optimization operator that will update parameter using gradient.
+        """
+        return None
+
+    def minimize(self, loss, parameter_list):
+        """Add operations to minimize `loss` by updating `parameter_list`.
+
+        This method combines interface `append_backward()` and
+        `create_optimization_pass()` into one.
+        """
+        params_grads = self.create_backward_pass(loss, parameter_list)
+        update_ops = self.create_optimization_pass(params_grads)
+        return update_ops
+
+```
+
+Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer.
diff --git a/doc/fluid/design/modules/prune.md b/doc/fluid/design/modules/prune.md
new file mode 100644
index 0000000000000000000000000000000000000000..4a5cf10c79a554779137f0cce5494fdd96ef6b7a
--- /dev/null
+++ b/doc/fluid/design/modules/prune.md
@@ -0,0 +1,63 @@
+# Prune
+
+## Motivation
+
+We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement 
+`void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc`
+and generates a pruned `ProgramDesc`.
+
+## Challenge
+
+Pruning needs to support both variables and operators being evaluation targets. Consider the following
+different situations.
+
+```python
+# Case 1: run foward pass.
+cost_np = session.run(target=cost)
+# Case 2: run backward passing.
+opts_np, _ = session.run(target=[cost, opt])
+# Case 3: run checkpointing
+_ = session.run(target=checkpoint)
+```
+
+## Solution
+
+To support evaluation of operators, we add `is_target` field in the `OpDesc`.
+
+```c++
+message OpDesc {
+  required string type = 3;
+  repeated Var inputs = 1;
+  repeated Var outputs = 2;
+  repeated Attr attrs = 4;
+  optional bool is_target = 5 [ default = false ];
+};
+```
+
+To support evaluation of variables, we add [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599).
+For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with `variable` being
+`fetch_op`'s input. Then we also mark the `fetch_op` as a target.
+
+### Algorithm
+
+If an operator needs to be run, it must fall into one of the following cases:
+
+1. It is the target.
+2. Some other ops depend on it, meaning its output is some other op's input.
+
+The first case can be checked by `op_desc.is_target()`. The second case can be implemented as
+
+```c++
+bool HasDependentVar(const OpDesc& op_desc, const std::set<string>& dependent_vars) {
+  for (auto& var : op_desc.outputs()) {
+    for (auto& argu : var.arguments()) {
+      if (dependent_vars.count(argu) != 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+```
+
+Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc).
diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..83af4e55485c079265d3f2b1e15070825b532c02
--- /dev/null
+++ b/doc/fluid/design/modules/python_api.md
@@ -0,0 +1,325 @@
+# Design Doc: Python API
+
+Due to the refactoring of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
+
+<table>
+<thead>
+<tr>
+<th>Python classes</th>
+<th>Protobuf messages</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Program </td>
+<td>ProgramDesc </td>
+</tr>
+<tr>
+<td>Block  </td>
+<td>BlockDesc </td>
+</tr>
+<tr>
+<td>Operator </td>
+<td>OpDesc </td>
+</tr>
+<tr>
+<td>Variable </td>
+<td>VarDesc </td>
+</tr>
+</tbody>
+</table>
+
+
+Please be aware that these Python classes need to maintain some construction-time information, which is not part of the protobuf messages.
+
+## Core Concepts
+
+### Program
+
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` only stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
+
+Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
+
+```python
+class Program(objects):
+    def __init__(self):
+        self.desc = core.NewProgram() # a C++ ProgramDesc pointer.
+        self.blocks = vector<Block>()
+        self.blocks.append(Block(self, -1)) # the global block
+        self.current_block = 0          # initialized to the global block
+
+    def global_block():
+        return self.blocks[0]
+
+    def current_block():
+        return self.get_block(self.current_block)
+
+    def rollback():
+        self.current_block = self.current_block().parent_idx
+
+    def create_block():
+        new_block_idx = len(self.block)
+        self.blocks.append(Block(self, self.current_block))
+        self.current_block = new_block_idx
+        return current_block()
+```
+
+`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
+
+`Program` creates the first block as the global block in its constructor.  All parameters and their initializer operators are in the global block.
+
+### Block
+
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md) includes
+
+1. a map from variable names to an instance of the Python `Variable` class, and
+1. a list of `Operator` instances.
+
+```python
+class Block(objects):
+    def __init__(self, program, parent_idx):
+        self.desc = core.NewBlock(program.desc)
+        self.program = program
+        self.vars = map<string, Variable>()
+        self.ops = vector<Operator>()
+        self.parent_idx = parent_idx
+
+    def create_var(self, ...):
+        return Variable(self, ...)
+
+    def _create_global_var(self, ...):
+        program.global_block().create_var(...)
+
+    def create_parameter(self, name, ...):
+        # Parameter is a subclass of variable. See Parameter section for details.
+        self.vars[name] = Parameter(self._create_global_var(...), ...)
+        return self.vars[name]
+
+    def append_operator(self, ...):
+        self.ops.append(Operator(self, ...))
+
+    def _prepend_operator(self, ...): # Parameter's ctor prepands initialize operators.
+       self.ops.prepend(Operator(self, ...))
+```
+
+`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
+
+`_prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+
+### Operator
+
+The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes.
+
+```python
+class Operator(object):
+    def __init__(self,
+                 block,  # Block
+                 type,   # string
+                 inputs, # dict<string, Variable>
+                 outputs,# dict<stirng, Variable>
+                 attrs   # dict<string, Any>
+                 ):
+        self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs)
+        core.infer_shape(self.desc, inputs, outputs)
+
+    def type(self):
+        return self.desc.type()
+```
+
+`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
+
+### Variable
+
+Operators take Variables as their inputs and outputs.
+
+```python
+class Variable(object):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 ):
+        if name is None:
+            name = unique_name_generator()
+        self.name = name
+        self.block = block
+        self.desc = core.NewVarDesc(block.desc, name, shape, lod_level)
+        self.writer = None
+```
+
+Please be aware of `self.writer`, which tracks the operator that creates the variable.  It is possible that more than one operator writes a variable, but in Python space, each write to a variable is represented by a `Variable` object.  This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
+
+### Parameter
+
+A parameter is a global variable with an initializer (or load) operator.
+
+```python
+class Parameter(Variable):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 trainable,       # bool
+                 initialize_op_attrs,
+                 optimize_op_attrs):
+        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
+        self.trainable = trainable
+        self.optimize_op_attrs = optimize_op_attrs
+        block.prepend(Operator(block,  # Block
+                               initialize_op_attrs['type'],   # string
+                               None,   # no inputs
+                               self,   # output is the parameter
+                               initialize_op_attrs)
+```
+
+When users create a parameter, they can call
+
+```python
+program.create_parameter(
+  ...,
+  init_attr={
+    type: "uniform_random",
+    min: -1.0,
+    max: 1.0,
+  })
+)
+```
+
+In the above example, `init_attr.type` names an initialize operator.  It can also name the load operator
+
+```python
+init_attr={
+ type: "load",
+ filename: "something.numpy",
+}
+```
+
+`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
+
+## Layer Function
+
+A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers.
+
+Layer functions take `Variable`s and configuration parameters as their input and return the output variable(s).
+
+For example, `FullyConnected` takes one or more variables as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable.
+
+
+### Necessity for reusing code between layer functions
+
+There is a lot of code that can be reused, such as:
+
+* Give default values for configuration, e.g., the default initialization strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`, and the default initialization strategy for bias is to fill with zeros.
+* Append the activation operator.
+* Create a temporary variable.
+* Create parameter.
+* Generate a unique name.
+* Add a bias.
+* ...
+
+A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions.
+
+
+
+### Comparison between global functions and helper class
+
+The `FullyConnected` layer will be as follows when we provide global functions:
+
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+  if name is None:
+    name = unique_name("fc")
+  input = multiple_input(input)
+  param_attr = default_param_attr(param_attr)
+  param_attr = multiple_param_attr(param_attr, len(input))
+
+  # mul
+  mul_results = []
+  for ipt, attr in zip(input, param_attr):
+    shape = ipt.shape[1:] + [size]
+    w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr)
+    tmp = create_tmp_var(name)
+    g_program.current_block().append_op("mul", {ipt, w}, {tmp})
+  mul_results.append(tmp)
+
+  # add sum
+  ...
+  # add bias
+  ...
+  # add activation
+  ...
+  return out
+```
+
+We can provide many helper functions for layer developers. However, there are several disadvantages of global helper functions:
+
+1. We need a namespace for these methods, then layer developers can quickly figure out what method they can use.
+2. Global functions will force layer developers to pass the same parameters over and over again.
+
+So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` layer will be as follows.
+
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+  helper = LayerHelper(locals())  # pass all parameter to LayerHelper
+
+  mul_results = []
+  for ipt, param in helper.iter_multiple_input_and_param():
+    w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype)
+    tmp = helper.create_tmp_variable()
+    helper.append_op('mul', {ipt, w}, {tmp})
+    mul_results.append(tmp)
+
+  pre_bias = helper.add_sum(mul_results)
+  pre_activation = helper.add_bias(pre_bias)
+  return helper.add_activation(pre_activation)
+```
+
+We not only use fewer lines of code to write `fc_layer` but also make the code clearer to understand. At the same time, layer developers can figure out what function they can invoke by typing `helper.` in a Python editor.
+
+
+### Implementation of layer helper
+
+We just keep all parameters of a layer function as a dictionary in layer helper as a private data member. Every method of layer helper will look up the dictionary after it is invoked. In that way, we can implement a layer helper for all layer functions even if some layers do not use some operators. For example, the `activation` is used by the FullyConnected layer or convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` is:
+
+```python
+class LayerHelper(object):
+  def __init__(self, **kwargs):  # kwargs is short for `keyword arguments`
+    self.kwargs = kwargs
+
+  def add_activation(self, input_var):
+    act = self.kwargs.get("act", None)  # default value is None
+    if act is None:  # do nothing if no act
+      return input_var
+
+    tmp = self.create_tmp_var(self)
+    self.append_op(type=act, input=input_var, output=tmp)
+    return tmp
+```
+
+### Return value of layer functions
+
+The layer will return a Variable, which is also the output of an operator.  However, outputs of a layer function have more attributes than an operator. There are parameter variables and their gradient variables, which also need to be returned. Returning them is useful. For example,
+
+1. Users can debug the network by printing parameter gradients.
+2. Users can append attributes to a parameter; for example, `param.stop_gradient=True` will make a parameter stop generating gradients. We can fix the parameter value during training by using this attribute.
+
+However, it is good to return a Variable for layers, since all layers and operators use Variables as their parameters. We can just append a `param` field and a `grad` field to the Variable returned by a layer function, since Python is dynamically typed.
+
+The sample usage is
+
+```python
+data = fluid.layers.data(...)
+hidden = fluid.layers.fc(data, ...)
+...
+
+executor.run(fetch_list=[hidden.param, hidden.param.grad], ...)
+```
+
+
+## Optimizer
+
+[Optimizer Design Doc](./optimizer.md)
diff --git a/doc/fluid/design/modules/register_grad_op.md b/doc/fluid/design/modules/register_grad_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..8d973eb53178c3e889c845144553a453e11f067c
--- /dev/null
+++ b/doc/fluid/design/modules/register_grad_op.md
@@ -0,0 +1,92 @@
+# Design Doc: Gradient Operators Registration
+
+
+## The Problem Posed
+
+Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance.
+
+However, we noticed two problems with the current design:
+
+1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and insert corresponding `OpDesc` messages into the `ProgramDesc` message.
+
+1. For some operators, the gradient computation can be written in terms of existing operators.  For example, the gradient of *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator.  Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation.
+
+## The Current Implementation
+
+Instances of the C++ class `OpInfo` are stored in an associative map whose key is the operator type. The `grad_op_type_` indicates the associated gradient operator type. An operator can create the gradient operator by invoking `OpInfo::creator_` of the gradient operator. The pseudo code is as follows:
+
+```cpp
+struct OpInfo {
+  std::function<OperatorBase*(...)> creator_;
+  std::string grad_op_type_;
+  ...
+};
+
+map<string, OpInfo> OpInfoMap;
+
+OperatorBase* CreateGradientOperator(const OperatorBase& op) {
+  return OpInfoMap.at(op.Type()).creator_(...);
+}
+```
+
+## Proposed Solution
+
+The mapping relationship between an operator and its gradient operators is a function. The interface of this function is:
+
+```cpp
+// (OpDesc) --> vector<OpDesc>
+std::function<std::vector<OpDescBind>(const OpDescBind&)>;
+```
+
+The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for  the protobuf message `OpDesc` for rapid manipulation of `OpDesc`.
+
+The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like 
+
+```cpp
+struct OpInfo {
+  std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>  grad_op_maker_;
+  ...
+};
+```
+
+The `grad_op_maker_ ` is a `nullptr` if the operator does not have any associated gradient operators.
+
+We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
+
+```cpp
+class GradOpDescMakerBase {
+public:
+  GradOpDescMakerBase(const OpDescBind& );
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()()const = 0;
+};
+```
+
+We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
+
+```cpp
+using GradOpMaker = ...;
+std::function<std::vector<OpDescBind>(const OpDescBind&)> func;
+func = [] (const OpDescBind& fwd_op) {
+  GradOpMaker maker(fwd_op);
+  return maker();
+};
+```
+
+We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator.
+
+We should change register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just registers one operator. If the `REGISTER_OPERATOR` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro that contains `__VA_ARGS__`.
+
+The user interface should be
+
+```cpp
+vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
+REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, SumOpGradMaker);
+// Developers can still manually implement gradient operator.
+REGISTER_OPERATOR(minus_grad, MinusGradOp);
+```
+
+The interface of current `REGISTER_OP` macro could not be changed. In `REGISTER_OP`, it will invoke `REGISTER_OPERATOR` two times and generate GradOpDescMaker inside.
+
+```cpp
+REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
+```
diff --git a/doc/fluid/design/modules/regularization.md b/doc/fluid/design/modules/regularization.md
new file mode 100644
index 0000000000000000000000000000000000000000..519a9143033386678351ff78a465e5ba6e220c52
--- /dev/null
+++ b/doc/fluid/design/modules/regularization.md
@@ -0,0 +1,66 @@
+# Regularization in PaddlePaddle
+
+## Introduction to Regularization
+A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. A frequently faced problem is the problem of **overfitting**, where the model does not make reliable predictions on new unseen data. **Regularization** is the process of introducing additional information in order to prevent overfitting. This is usually done by adding extra penalties to the loss function that restricts the parameter spaces that an optimization algorithm can explore.
+
+### Parameter Norm Penalties
+Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/loss_equation.png" align="center"/><br/>
+
+The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.
+
+The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
+
+##### L2 Regularization:
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/l2_regularization.png" align="center"/><br/>
+
+##### L1 Regularization
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/l1_regularization.png" align="center"/><br/>
+
+A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
+
+## Regularization Survey
+
+A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey).
+
+## Proposal for Regularization in PaddlePaddle
+
+### Low-Level implementation
+
+In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations:
+- L2_regularization_op
+- L1_regularization_op
+
+These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
+
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
+
+### Computation Graph
+
+Below is an example of a really simple feed forward neural network.
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/feed_forward.png" align="center"/><br/>
+
+The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/feed_forward_regularized.png" align="center"/><br/>
+   
+### Python API implementation for Regularization
+
+Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
+
+#### Creation of Regularization ops
+There are two possibilities for creating the regularization ops:
+1. We create these ops immediately while building the computation graph.
+2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added.
+
+The proposal is to add these ops in a lazy manner just before the backward pass.
+
+#### Storage of Regularization attributes
+
+Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters.
+
+#### High-level API
+
+In the PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
diff --git a/doc/fluid/design/modules/selected_rows.md b/doc/fluid/design/modules/selected_rows.md
new file mode 100644
index 0000000000000000000000000000000000000000..1a98839a957612b91b2276b58818623ecc62d1d5
--- /dev/null
+++ b/doc/fluid/design/modules/selected_rows.md
@@ -0,0 +1,74 @@
+# Design Doc: Selected Rows
+
+`SelectedRows` is a type of sparse tensor data type, which is designed to support `embedding` operators. The gradient of embedding table is a sparse tensor. Only a few rows are non-zero values in this tensor. It is straight-forward to represent a sparse tensor by the following sparse tensor data structure:
+
+```cpp
+class SelectedRows {
+ private:
+  vector<int> rows_;
+  Tensor value_;
+  int height_;
+};
+```
+
+The field `height_` is the first dimension of `SelectedRows`. The `rows_` are the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows_.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The dimension of `SelectedRows` satisfies `[height_] + value_.shape[1:]`.
+
+Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`, the `SelectedRows` representation would be:
+
+```
+x = SelectedRow {
+  rows = [73, 84],
+  value = [[1, 2], [3,4]]
+}
+```
+
+
+## SelectedRows in Protobuf
+
+`SelectedRows` is a type of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described in compile-time because the `rows_` and `value_` are dependent on the training data. 
+So we use `TensorDesc` to unify `data_type` and `dims`. A LodTensorDesc contains a `TensorDesc` and `lod_level`. The description of `SelectedRows` is a Tensor description.
+
+```proto
+message TensorDesc {
+  required DataType data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+
+message LodTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int lod_level = 2;
+}
+
+message VarDesc {
+  required string name = 1;
+  enum VarType { 
+    LOD_TENSOR = 0;
+    SELECTED_ROWS = 1;
+  }
+  required VarType type = 2;
+  optional LodTensorDesc lod_desc = 3;
+  optional TensorDesc selected_rows_desc = 4;
+  optional bool persistable = 5 [ default = false ];
+}
+```
+
+## InferShape for Selected Rows
+
+Just like `LoD` information, `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or `Dense` tensor.
+
+For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should look like the following:
+
+```cpp
+void TableLookupGrad::InferShape(context) {
+  ...
+  context.SetDataType("Embedding.Grad", kSelectedRows);
+}
+```
+
+
+## Sparse Operators
+
+There are several operators that need to be written to support `SelectedRows`. These are:
+
+1. Operators which generate `SelectedRows` gradient. e.g. Gradient of `TableLookupOp`.
+2. Optimize operators which support `SelectedRows` gradient. e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator. `OpWithKernel::Run` should select a suitable kernel for either a `dense` tensor or `SelectedRows`.
diff --git a/doc/fluid/design/motivation/api.md b/doc/fluid/design/motivation/api.md
new file mode 100644
index 0000000000000000000000000000000000000000..bc222564e3ec28e306ca0572b6a23104f6e9cbc5
--- /dev/null
+++ b/doc/fluid/design/motivation/api.md
@@ -0,0 +1,261 @@
+# PaddlePaddle Design Doc
+
+## Ingredients
+
+Our design principle is to start from the essence: how can we
+allow users to express and solve their problems as neural networks?
+Some essential concepts that our API has to provide include:
+
+1. A *topology* is an expression of *layers*.
+
+1. A layer could be any kind of computation, including *cost*.
+
+1. Some layers have parameters, some don't. Most costs don't have
+   parameters.
+
+1. In some topologies, layers share parameters.  For
+   example,
+   [the network for training a ranking model](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850).
+
+1. At programming time, users specify topologies and possible sharing
+   of parameters.  PaddlePaddle can figure out and create parameters
+   required (and possibly shared) by one or more topologies.
+
+
+## Starting from Examples
+
+As a summarization
+of
+[our discussion](https://github.com/PaddlePaddle/Paddle/issues/1315),
+let us present two examples here:
+
+
+### Example 1. Sharing Parameters between Layers
+
+We use
+the
+[3-branch ranking](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850) model
+in this example.  For your convenience, I copy-and-paste the model's
+topology as follows:
+
+```
+A -> f -\
+Q -> f --> cost
+B -> f -/
+```
+
+The following program trains the topology including the cost, and then
+uses the sub-network in the trained topology in inference:
+
+```python
+def f(in):
+    e = paddle.layer.embedding(in, parameter_name="embedding")
+    o = paddle.layer.softmax(e, parameter_name="semantic")
+    return o
+
+# Create 3 topologies (subnets), they share parameters because all
+# corresponding layers have the same parameter names.
+fA = f(paddle.layer.data(input_name="A"))
+fB = f(paddle.layer.data(input_name="B"))
+fQ = f(paddle.layer.data(input_name="Q"))
+
+topology = paddle.layer.less_than(
+               paddle.layer.cross_entropy(fA, fQ),
+               paddle.layer.corss_entropy(fB, fQ))
+
+# Derive parameters required in topology and create them in model.
+parameters = paddle.parameters.create(topology)
+
+# Estimate parameters used in topology from data.
+paddle.train(topology, parameters, reader=read_ranking_model_data)
+
+# Inference using fA (or fB or fQ, as they share their parameters).
+[testA, testB, testQ] = read_ranking_model_data()
+print "The sematic-vector of testA: ", paddle.infer(fA, parameters, testA)
+```
+
+
+### Example 2. Sharing Parameters between "Models"
+
+We use GAN in this example.  In the following example program, `d0` and `d1`
+correspond to the two networks in the following figure:
+
+<img src="https://github.com/wangyang59/book/raw/00036f4b0da5225041a6824587c1a01cf20159b1/gan/image/gan_ig.png" width=400 />
+
+```python
+def G(in):
+    # over-simplified example as G has only one layer:
+    return paddle.layer.fc(in, parameter_name="G")
+
+def D(in);
+    # again, over-simplified:
+    return paddle.layer.fc(in, parameter_name="D")
+
+# Construct the first topology, which contains both D and G.
+# By learning this topology, we update parameters of G.
+d0 = paddle.layer.should_be_false(D(G(paddle.layer.data())))
+
+# Construct a second topology d1, which contains only D. By
+# training this topology, we update parameters of D.  Note
+# that d1 shares parameters with d0.
+d1 = paddle.layer.should_be_true(D(paddle.layer.data()))
+
+# Create parameters from a list of multiple topologies (models) for
+# the chance to share parameters between these topologies.
+parameters = paddle.parameters.create([d0, d1])
+
+# Iterative training of GAN.
+for ...:
+    train(d0, parameters, reader=read_from_rng, immutable_parameters={"D"})
+    train(d1, parameters, reader=read_from_realistic_images)
+
+# Use d1 for inference:
+print "D thinks a batch of images are realistic ", infer(d1, parameters, read_mnist_images)
+```
+
+
+### Summarization
+
+
+The above two programs reveal some important design concerns:
+
+1. Users describe a topology as an expression of layers.  Every layer
+   has a *parameter name*.  If the users don't specify it explicitly, it's automatically generated as a unique name.  By
+   specifying the parameter name, users can specify the sharing of
+   parameters between layers and even between topologies.
+
+1. `paddle.parameters.create` figures out parameters required by one
+   or more topologies from parameter names of layers.  It creates these
+   parameters and returns a `ParameterSet` object, which is in essence
+   a map from *parameter names* to *parameters*.
+
+1. At training and inference time, `paddle.train` and `paddle.infer`
+   require both a topology and the parameter set that holds the parameters of that topology.  There are some reasons:
+
+   1. This prevents users from forgetting to call
+      `paddle.parameters.create`.
+   1. `paddle.train` needs to know which parameter set to update.
+   1. Users could load another (pre-trained) parameter set and use it
+      with a topology in `paddle.infer`.
+
+1. By specifying the `immutable_parameters` parameter of
+   `paddle.train`, we can forbid the update of these parameters.
+
+
+## Reader
+
+Not all programming frameworks allow users to define I/O functions.
+An example is Google MapReduce, which can only read from text,
+SSTable, and RecordIO files.  Hadoop MapReduce allows users to define
+readers and writers by deriving from base classes `Reader` and
+`Writer`.  The former is less flexible but also less error-prone.  We
+decide to provide the flexibility to users to define their readers.
+
+
+There are some open questions here:
+
+1. **Should a reader return a Python dictionary?**
+
+1. **How to map multiple outputs from a reader to multiple data layers?**
+
+1. **How to easily compose some existing readers to read more data and
+   feed a topology with more data layers?**
+
+
+## Training
+
+The recommended way to train a model is to call `paddle.train`,
+which simply calls `paddle.trainer.Default`, a global variable of
+type `paddle.trainer.SGD`.  Equivalently, we can do
+
+```python
+opt = paddle.trainer.SGD(..., paddle.updater.Adam(...))
+opt.train(topology, parameters, reader=read, ...)
+```
+
+### Updater
+
+Please be aware that a trainer can accept an updater as its data
+member, where an updater is a class derived from
+`paddle.trainer.Updater`.  This is to make it easier to customize
+trainers, as discussed
+[here](https://github.com/PaddlePaddle/Paddle/issues/1319).
+
+### Event Handler
+
+`paddle.train` and `paddle.trainer.XXX.train` take an optional
+parameter `event_handler`, which should be either `None` or a function
+that handles some events:
+
+1. BeginTraining
+1. EndTraining
+1. BeginIteration
+1. EndIteration
+1. BeginPass
+1. EndPass
+
+where EndPass is sent if and only if the reader yields
+`end_pass=True`.
+
+An example as follows:
+
+```python
+def event_handler(event):
+    if ininstance(event, paddle.event.EndIteration):
+        print paddle.test(...)
+
+paddle.train(topology, parameters, reader, event_handler)
+```
+
+If we are writing a PaddlePaddle program in and for IPython/Jupyter,
+we can use matplotlib in the event handler to plot a curve of
+cost/error versus iterations, as shown
+[here](https://blog.dominodatalab.com/interactive-dashboards-in-jupyter/).
+
+### Distributed Training
+
+If a user wants to do distributed training on a cluster, s/he should
+call `paddle.dist_train` and provide access tokens to the cluster as
+a parameter.
+
+For example, if the user has a TLS certificate that allows him to
+access a Kubernetes cluster, s/he should be able to call
+
+```python
+paddle.dist_train(model,
+                  trainer=paddle.trainer.SGD(...,
+                                             paddle.updater.Adam(...)),
+                  reader=read,
+                  k8s_user="yi",
+                  k8s_token="kube_cluster_tls.pem",
+                  k8s_job="hello",
+                  num_parameter_servers=15)
+```
+
+The pseudo code of `paddle.dist_train` is as follows:
+
+```python
+def dist_train(topology, parameters, trainer, reader, ...):
+    if os.getenv("KUBERNETES_SERVICE_HOST") == None:
+        image_name = k8s_user + '/' + k8s_job
+        docker_build(image_name)
+        docker_push()
+        kube_ctrl_start_job(image_name, k8s_user, k8s_token)
+    else:
+        rank = kube_list_containers_in_job_and_return_current_containers_rank()
+        if rank == 0:
+            master()
+        elif rank < 15:
+            parameter_server()
+        else:
+            trainer.train(model, reader=read)
+```
+
+Please be aware that if a process is running on the Kubernetes
+cluster, it will have some environment variables pre-defined.
+
+If `dist_train` doesn't see these environment variables, it knows
+that it's running on users' personal computer, and it should work as a
+*launcher*.  Otherwise, it knows that it's running on the cluster and
+needs to figure out its role as either the master, or a trainer, or a
+parameter server.
diff --git a/doc/fluid/design/motivation/fluid-compiler.graffle b/doc/fluid/design/motivation/fluid-compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..c933df2cb855462c52b2d25f7f9a99b95652961d
Binary files /dev/null and b/doc/fluid/design/motivation/fluid-compiler.graffle differ
diff --git a/doc/fluid/design/motivation/fluid-compiler.png b/doc/fluid/design/motivation/fluid-compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b0ffed2039c91a3a00bbb719da08c91c3acf7bb
Binary files /dev/null and b/doc/fluid/design/motivation/fluid-compiler.png differ
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b7696cc1bbf57ace72c4d31ffc2bfe6c1071939
--- /dev/null
+++ b/doc/fluid/design/motivation/fluid.md
@@ -0,0 +1,140 @@
+# Design Doc: PaddlePaddle Fluid
+
+## Why Fluid
+
+When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe.  However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework?
+
+Fluid is the answer.  Fluid is similar to PyTorch and TensorFlow Eager Execution, which describe the "process" of training or inference instead of using the concept of a model.  In fact, in PyTorch, TensorFlow Eager Execution and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid currently takes this idea further than PyTorch and Eager Execution, and we are trying to push Fluid towards the direction of a compiler and a new programming language for deep learning.
+
+## The Evolution of Deep Learning Systems
+
+Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
+
+<table>
+<thead>
+<tr>
+<th>Existed since</th>
+<th>model as sequence of layers</th>
+<th>model as graph of operators</th>
+<th>No model</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>2013 </td>
+<td>Caffe, Theano, Torch, PaddlePaddle </td>
+<td> </td>
+<td> </td>
+</tr>
+<tr>
+<td>2015 </td>
+<td> </td>
+<td>TensorFlow, MxNet, Caffe2, ONNX, n-graph </td>
+<td> </td>
+</tr>
+<tr>
+<td>2016 </td>
+<td> </td>
+<td>   </td>
+<td> PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid</td>
+</tr>
+</tbody>
+</table>
+
+
+From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model.  To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
+
+## Deep Learning Programming Paradigms
+
+With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
+
+```python
+x = layer.data("image")
+l = layer.data("label")
+f = layer.fc(x, W)
+s = layer.softmax(f)
+c = layer.mse(l, s)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    forward({input=x, data=m}, minimize=c)
+    backward(...)
+
+print W # print the trained model parameters.
+```
+
+The above program includes two parts:
+
+1. The first part describes the model, and
+2. The second part describes the training process (or inference process) for the model.
+
+This paradigm has a well-known problem that limits the productivity of programmers. If the programmer made a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is located blocks away from the actual error prompt.
+
+This problem of being hard to debug and re-iterate fast on a program is the primary reason that programmers, in general,  prefer PyTorch over the older systems.  Using PyTorch, we would write the above program as following:
+
+```python
+W = tensor(...)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    x = m["image"]
+    l = m["label"]
+    f = layer.fc(x, W)
+    s = layer.softmax(f)
+    c = layer.mse(l, s)
+    backward()
+
+print W # print the trained model parameters.
+```
+
+We can see that the main difference is moving the model configuration part (the first step) into the training loop.  This change would allow the mistakes in model configuration to be reported where they actually appear in the programming block.  This change also represents the model better, or its forward pass, by keeping the configuration process in the training loop.
+
+## Describe Arbitrary Models for the Future
+
+Describing the process instead of the model also brings Fluid the flexibility to define different non-standard models that haven't been invented yet.
+
+As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator.  A PyTorch example would look like the following:
+
+```python
+for i in xrange(1000):
+    m = read_minibatch()
+    x = m["sentence"]
+    for t in xrange x.len():
+        h[t] = the_step(x[t])
+```        
+
+With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following:
+
+```python
+train_loop = layers.While(cond)
+with train_loop.block():
+  m = read_minibatch()
+  x = m["sentence"]
+  rnn = layers.While(...)
+  with rnn.block():
+    h[t] = the_step(input[t])
+```    
+
+An actual Fluid example is described  [here](https://github.com/PaddlePaddle/Paddle/blob/bde090a97564b9c61a6aaa38b72ccc4889d102d9/python/paddle/fluid/tests/unittests/test_while_op.py#L50-L58).
+
+From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
+
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.
+
+## Turing Completeness
+
+In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine.  For a programming language, if it provides if-then-else and loop, it is Turing complete.  From the above examples, Fluid seems to be Turing complete; however, it is worth noting that there is a slight difference between the `if-then-else` of Fluid and that of a programming language. The difference is that the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. Whether this is equivalent to the `if-then-else` in programming languages that makes them Turing-complete has not yet been researched in depth.  Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case, but this needs to be looked into in depth.
+
+## The Execution of a Fluid Program
+
+There are two ways to execute a Fluid program.  When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
+
+Fluid is moving towards the direction of a compiler, which is explained in [fluid_compiler.md](fluid_compiler.md).
+
+## Backward Compatibility of Fluid
+
+Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference.  For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph).  Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators.  The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators.
+
+For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
diff --git a/doc/fluid/design/motivation/fluid_compiler.md b/doc/fluid/design/motivation/fluid_compiler.md
new file mode 100644
index 0000000000000000000000000000000000000000..6dd3840a0734e8593890dcf8044746197350c6f5
--- /dev/null
+++ b/doc/fluid/design/motivation/fluid_compiler.md
@@ -0,0 +1,110 @@
+# PaddlePaddle Fluid: Towards a Compiled Programming Language
+
+As described in [fluid.md](fluid.md), when a Fluid application program
+runs, it generates a `ProgramDesc` protobuf message as an intermediate
+representation of itself.  The C++ class `Executor` can run this
+protobuf message as an interpreter.  This article describes the Fluid
+compiler.
+
+![](fluid-compiler.png)
+
+## ProgramDesc
+
+Before we go deeper into the idea of compiled language, let us take a
+look at a simple example Fluid application.
+
+```python
+import "fluid"
+
+func paddlepaddle() {
+  X = fluid.read(...)
+  W = fluid.Tensor(...)
+  Y = fluid.mult(X, W)
+}
+```
+
+This program consists of a [block](../concepts/block.md) of three operators --
+`read`, `assign`, and `mult`.  Its `ProgramDesc` message looks like
+the following
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, W, Y],
+    ops = [
+      read(output = X)
+      assign(input = ..., output = W)
+      mult(input = {X, W}, output = Y)
+    ],
+  }
+}
+```
+
+## Transpilers
+
+We can write a transpiler program that takes a `ProgramDesc`, e.g.,
+the above one, and outputs another `ProgramDesc`.  Let us take some
+examples:
+
+1. *Memory optimization transpiler*: We can write a transpiler that
+   inserts some `FreeMemoryOp`s in the above example `ProgramDesc` so
+   as to free memory early, before the end of an iteration, and thus
+   keep a small memory footprint.
+
+1. *Distributed training transpiler*: We can write a transpiler that
+   converts a `ProgramDesc` into its distributed version of two
+   `ProgramDesc`s -- one for running by the trainer processes and the
+   other for the parameter server.
+
+In the rest of this article, we talk about a special kind of
+transpiler, *Native code generator*, which takes a `ProgramDesc` and
+generates a `.cu` (or `.cc`) file, which could be built by C++
+compilers (gcc, nvcc, icc) into binaries.
+
+## Native Code Generator
+
+For the above example, the native code generator transpiler, say, the
+CUDA code generator, should generate a `main` function:
+
+```c++
+void main() {
+  auto X = fluid_cuda_read(...);
+  auto W = fluid_cuda_create_tensor(...);
+  auto Y = fluid_cuda_mult(X, W);
+}
+```
+
+and the definitions of functions `fluid_cuda_read`,
+`fluid_cuda_create_tensor`, and `fluid_cuda_mult`.  Please be aware
+that each function could just define a C++ instance of an operator and
+run it.  For example
+
+```c++
+paddle::Tensor fluid_cuda_read(...) {
+  paddle::Tensor t;
+  paddle::operator::Read r(&t, ...);
+  r.Run();
+  return t;
+}
+```
+
+For computational operators that have multiple *kernels*, each for a
+specific hardware platform, for example, the `mult` operator, the
+generated code should call its CUDA kernel:
+
+```c++
+paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a,
+                               const paddle::Tensor& b) {
+  paddle::Tensor t;
+  paddle::operator::Mult m(a, b, ...);
+  Mult.Run(cuda_context);
+}
+```
+
+where `cuda_context` could be a global variable of type
+`paddle::CUDADeviceContext`.
+
+## Multi-Block Code Generation
+
+Most Fluid application programs may have more than one block.  To
+execute them, we need to trace [scopes](../concepts/scope.md).
diff --git a/doc/fluid/design/motivation/index_cn.rst b/doc/fluid/design/motivation/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7706e73eca644ed6db772fd77da947395313237f
--- /dev/null
+++ b/doc/fluid/design/motivation/index_cn.rst
@@ -0,0 +1,10 @@
+设计动机和目标
+-------------
+
+.. toctree::
+  :maxdepth: 1
+
+  api.md
+  refactorization.md
+  fluid.md
+  fluid_compiler.md
diff --git a/doc/fluid/design/motivation/index_en.rst b/doc/fluid/design/motivation/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..10b64b257c604ced6b957d6d6018e8a363f00fac
--- /dev/null
+++ b/doc/fluid/design/motivation/index_en.rst
@@ -0,0 +1,10 @@
+Design Motivations and Goals
+--------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  api.md
+  refactorization.md
+  fluid.md
+  fluid_compiler.md
diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad9d0f6d3f3ad9884f108826e8410871fffd51bf
--- /dev/null
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -0,0 +1,275 @@
+# Design Doc: Refactorization Overview
+
+The goals of refactoring include:
+
+1. Making it easy for external contributors to write new elementary computation operations.
+1. Making the codebase clean and readable.
+1. Designing a new computation representation -- a computation graph of operators and variables.
+1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs.
+
+## Computation Graphs
+
+1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs.
+
+   1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/others/graph.md) for a concrete example.
+
+1. Users write Python programs to describe the graphs and run them (locally or remotely).
+
+1. A graph is composed of *variables* and *operators*.
+
+1. The description of graphs must be serializable/deserializable, so that:
+
+   1. It can be sent to the cloud for distributed execution, and
+   1. It can be sent to clients for mobile or enterprise deployment.
+
+1. The Python program does two things
+
+   1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to
+      1. the C++ library `libpaddle.so` for local execution,
+      1. the master process of a distributed training job for training, or
+      1. the server process of a Kubernetes serving job for distributed serving.
+   1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L70), according to the protobuf message.
+
+## Description and Realization of Computation Graph
+
+At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph.
+
+At runtime, the C++ program realizes the graph and runs it.
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>Representation (protobuf messages)</th>
+<th>Realization (C++ class objects) </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data</td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107">VarDesc</a></td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24">Variable</a></td>
+</tr>
+<tr>
+<td>Operation </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35">OpDesc</a></td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64">Operator</a></td>
+</tr>
+<tr>
+<td>Block </td>
+<td>BlockDesc </td>
+<td>Block </td>
+</tr>
+</tbody>
+</table>
+
+
+The word *graph* is interchangeable with *block* in this document.  A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of curly braces (`{` and `}`).
+
+## Compilation and Execution
+
+1. Run a Python program to describe the graph.  In particular, the Python application program does the following:
+
+   1. Create `VarDesc` to represent local/intermediate variables,
+   1. Create operators and set attributes,
+   1. Validate attribute values,
+   1. Infer the type and the shape of variables,
+   1. Plan memory-reuse for variables,
+   1. Generate the backward graph
+   1. Add optimization operators to the computation graph.
+   1. Optionally, split the graph for distributed training.
+
+1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following:
+
+   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md) for each run of a block,
+      1. realize local variables defined in the BlockDesc message in the new scope,
+      1. a scope is similar to the stack frame in programming languages,
+
+   1. Create an instance of class `Block`, in which,
+      1. realize operators in the BlockDesc message,
+
+   1. Run the Block by calling
+      1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
+      1. `Block::Eval(vector<Operator>* targets)` for optimization.
+
+
+## Intermediate Representation (IR)
+
+```text
+Compile Time -> IR -> Runtime
+```
+
+### Benefits of IR
+
+- Optimization
+  ```text
+  Compile Time -> IR -> Optimized IR -> Runtime
+  ```
+- Automatically send partitioned IR to different nodes.
+  - Automatic Data Parallelism
+    ```text
+    Compile Time
+    |-> Single GPU IR
+        |-> [trainer-IR-0, trainer-IR-1, pserver-IR]
+            |-> Node-0 (runs trainer-IR-0)
+            |-> Node-1 (runs trainer-IR-1)
+            |-> Node-2 (runs pserver-IR)
+    ```
+  - Automatic Model Parallelism (planned for future)
+
+---
+
+## Operator/OpWithKernel/OpKernel
+
+![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_op_with_kern_class_diagram.dot)
+
+---
+
+## Operator
+![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op.dot)
+
+* `Operator` is the fundamental building block of the user interface.
+    * Operator stores input/output variable names and attributes.
+    * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables.
+    * Use `Run` to compute the `output` variables from the `input` variables.
+
+---
+
+## OpWithKernel/Kernel
+
+![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_with_kernel.dot)
+
+* `OpWithKernel` inherits `Operator`.
+* `OpWithKernel` contains a Kernel map.
+    * `OpWithKernel::Run` gets the device's kernel and invokes `OpKernel::Compute`.
+    * `OpKernelKey` is the map key. Only device place now, but may be data type later.
+
+---
+
+## Why separate Kernel and Operator
+
+* Separate GPU and CPU code.
+    * Make Paddle capable of running without GPU.
+* Make one operator (which is a user interface) and create many implementations.
+    * For example, the same multiplication op can have different kernel implementations, such as an FP16 kernel, an FP32 kernel, an MKL kernel, and an Eigen kernel.
+---
+
+## Libraries for Kernel development
+
+* `Eigen::Tensor` contains basic math and element-wise functions.
+    * Note that `Eigen::Tensor` has broadcast implementation.
+    * Limit the number of `tensor.device(dev) = ` in your code.
+* `thrust::transform` and `std::transform`.
+    * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
+    * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
+* Hand-writing `GPUKernel` and `CPU` code
+    * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
+---
+## Operator Registration
+
+### Why is registration necessary?
+We need a method to build mappings between Op type names and Op classes.
+
+### How is registration implemented?
+Maintaining a map, whose key is the type name and the value is the corresponding Op constructor.
+
+---
+## The Registry Map
+
+### `OpInfoMap`
+
+`op_type(string)` -> `OpInfo`
+
+`OpInfo`:
+
+- **`creator`**: The Op constructor.
+- **`grad_op_type`**: The type of the gradient Op.
+- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes.
+- **`checker`**: Used to check attributes.
+
+---
+## Related Concepts
+
+### Op_Maker
+Its constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L37))
+
+### Register Macros
+```cpp
+REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
+REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
+```
+
+---
+## Registration Process
+1. Write an Op class and its gradient Op class, if required.
+2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
+3. Invoke the macro `REGISTER_OP`. This macro will
+	1. Call maker class to complete `proto` and `checker`
+	2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
+
+---
+## Backward Module (1/2)
+### Create Backward Operator
+- Mapping from forward Op to backward Op
+![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
+
+---
+## Backward Module (2/2)
+### Build Backward Network
+- **Input**: a graph of forward operators
+- **Output**: a graph of backward operators
+- **Corner cases in construction**
+	- Shared Variables => insert an `Add` operator to combine gradients
+	- No Gradient => insert a `fill_zero_grad` operator
+	- Recursive NetOp => call `Backward` recursively
+	- RNN Op => recursively call `Backward` on stepnet
+	- RNN Op => recursively call `Backward` on stepnet
+
+
+---
+## Scope, Variable, Tensor
+
+* `Tensor` is an n-dimension array with type.
+	* Only dims and data pointers are stored in `Tensor`.
+	* All operations on `Tensor` are written in `Operator` or global functions.
+	* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md)
+* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`.
+	* `step_scopes` in RNN is a variable and not a tensor.
+* `Scope` is where variables are stored.
+	* map<string `var name`, Variable>
+	* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
+
+---
+## Block (in design)
+### the difference between original RNNOp and Block
+- Block as an operator is more intuitive than `RNNOp`,
+- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
+- Fits the compile-time/ runtime separation design paradigm.
+  - During the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serializes them to a `BlockDesc`
+  - When the graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`.
+
+---
+## Milestone
+- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
+- Model migration
+  - Framework development gives **priority support** to model migration, for example,
+    - the MNIST demo needs a Python interface,
+    - the RNN models require the framework to support `LoDTensor`.
+  - Determine some timelines,
+  - Frequently used Ops need to be migrated first,
+  - Different models can be migrated in parallel.
+- Improve the framework at the same time
+- Accept imperfection, concentrate on solving the specific problem at the right price.
+
+---
+## Control the migration quality
+- Compare the performance of migrated models with old ones.
+- Follow the Google C++ Style Guide.
+- Build the automatic workflow of generating Python/C++ documentation.
+  - The documentation of layers and ops should be written inside the code.
+  - Take the documentation quality into account when submitting pull requests.
+  - Preview the documentation, then read and improve it from a user's perspective.
diff --git a/doc/fluid/design/multi_devices/index_cn.rst b/doc/fluid/design/multi_devices/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1f8439e8623e1c1ae9a12c24d08079f0ec3d761f
--- /dev/null
+++ b/doc/fluid/design/multi_devices/index_cn.rst
@@ -0,0 +1,9 @@
+多设备支持
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  operator_kernel_type.md
+  kernel_selection.md
+  kernel_hint_design.md
diff --git a/doc/fluid/design/multi_devices/index_en.rst b/doc/fluid/design/multi_devices/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..819e9c5d77b2abf8da0e2ce6f494ea5174c1d0a2
--- /dev/null
+++ b/doc/fluid/design/multi_devices/index_en.rst
@@ -0,0 +1,9 @@
+Multi-Device Support
+----------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  operator_kernel_type.md
+  kernel_selection.md
+  kernel_hint_design.md
diff --git a/doc/fluid/design/multi_devices/kernel_hint_design.md b/doc/fluid/design/multi_devices/kernel_hint_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..6edc14ca73b1abf824981b59511a9aca4e0f3b47
--- /dev/null
+++ b/doc/fluid/design/multi_devices/kernel_hint_design.md
@@ -0,0 +1,59 @@
+# Kernel Hint Design
+
+## Problem
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md), one Operator may have multiple kernels. Users may prefer a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel. We need to provide a way for users to express this preference.
+
+In the current design, we use KernelType to describe one kernel.
+
+```cpp
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+ `place_`, `data_type_`, and `layout_` can be obtained from the input tensors of the operator. `GetActualKernelType(inputs)` uses the inputs to infer the proper kernel key that fits the incoming data, but users cannot configure it directly.
+
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md) also provides a virtual method `GetExpectedKernelType` that users can override to choose the KernelType they want to use.
+
+So we should pass the information that the user defines in the proto to `GetExpectedKernelType` for choosing a kernel.
+
+The problem is, how should we define and send the information for `GetExpectedKernelType` to use?
+
+## Solution
+
+### Potential choice
+1. Do nothing: let users add the information they want to the operator's attributes and read it inside `GetExpectedKernelType`. This works, but there is a small problem: users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose a CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose a CUDNN kernel.
+
+2. Pre-define all the needed options and use a single attr key such as `kernel_hint` for the user. This is not very flexible if the user wants to define other kinds of hints.
+
+### Final choice
+To provide enough flexibility while avoiding confusing definitions, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, `use_mkldnn`, for users to choose from.
+
+In C++
+
+```cpp
+const std::string kForceCPU = "force_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
+KernelType GetExpectedKernelType() {
+  if (Attr<bool>(kForceCPU)) {
+    return KernelType(CPUPlace, ...)
+  } else {
+    ...
+  }
+}
+```
+
+In Python code
+
+```python
+FORCE_CPU = core.kForceCPU()
+
+def xx_layer(..., force_cpu=false):
+  layer_helper = LayerHelper(...)
+  layer_helper.append_op(
+    type="xx",
+    attr={FORCE_CPU: force_cpu})
+```
diff --git a/doc/fluid/design/multi_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md
new file mode 100644
index 0000000000000000000000000000000000000000..4d2aab87b8cf30d03075e96cc4c67070efaf963a
--- /dev/null
+++ b/doc/fluid/design/multi_devices/kernel_selection.md
@@ -0,0 +1,101 @@
+# Kernel Selection
+
+## Background
+Every operator has many kernels because there are multiple data types, places, data layouts, and library types that Fluid supports. We use `OpKernelType` to describe the kernel types that operators can hold.
+
+The `OpKernelType` is as follows:
+
+```cpp
+struct OpKernelType {
+  Place place_;
+  DataType data_type_;
+  DataLayout data_layout_;
+  LibraryType library_type_;
+};
+```
+
+- The `place_` is a descriptor of the device, e.g., CPUPlace, CUDAPlace.
+
+- The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, there will be a major `data_type`. For example, the `cross_entropy` takes `int64` as its label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float` or `double`.
+
+- The `data_layout_` is useful for some computational libraries. One example is that MKLDNN uses many kinds of layouts, such as `nChw8c`. Each kind of layout will invoke a different kernel.
+
+- The `library_type_` describes the computational library, e.g., `MKLDNN`, `CUDNN`.
+
+## Problem
+
+Ideally, we register a kernel for every operator and every kernel type. However, this is impractical in the following situations.
+
+1. Some operators, like CRF, are complicated and inefficient to be implemented on GPU. The CRF operator will only have a CPU kernel.
+2. Some operators will take too much memory. It is better to force them onto the CPU. However, the rest of the operators in this neural network will be performed on GPU, i.e., a model-parallel problem.
+3. Some layouts and places are particular. One example is that MKLDNN uses `nChw8c`, and no other library uses `nChw8c`.
+
+Let us take one situation as a detailed example. Suppose we have two operators, OP1 and OP2: OP1 has one output `op1_2_op2`, and `op1_2_op2` is the input of OP2.
+
+If OP1 and OP2 run on the same place(for example CPUPlace), then `op1_2_op2` can be used directly by OP2.
+
+```
+OP1(CPUPlace)
+     |
+ op1_2_op2
+     |
+OP2(CPUPlace)
+```
+
+If OP1 and OP2 run on different places, then OP2 cannot use `op1_2_op2` directly.
+
+Problems under these situations are similar. We can formalize this problem as follows.
+
+We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, where $kt_{?} \notin KT$. The question is how to cast the input of this operator from $kt_{?}$ to any of the kernel types in $KT$.
+
+## Solution: data transform
+
+It is clear that transforming inputs of an operator to adapt another kernel type is not related to the particular operator. So we should register these transformation methods as global methods.
+
+We can infer a kernel type for each input of an operator. We call this kernel type the `actual kernel type for var`, which means it is the kernel type that can process this input variable.
+
+We can get a kernel type from 1) the configuration of the operator description (e.g., users may want to force the use of `MKL` for the `conv` operator), and 2) the place of the current executor (e.g., the executor is running on GPU). This kernel type is what we expect the operator to be performed on. We call this kernel type the `expect kernel type`.
+
+We transform the input data from `actual` to `expect` if the actual kernel type is not the same as the expect kernel type.
+
+The algorithm is described as follows:
+
+```cpp
+void OperatorWithKernel::Run(
+        const Scope& scope,
+        const platform::Place& place) const {
+  ExecutionContext ctx(...);
+  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
+
+  Scope& new_scope = scope.NewScope();
+
+  for (auto& var_name : this->Inputs()) {
+    auto* tensor_in = GetTensor(var_name);
+    auto kernel_type_for_var = this->GetKernelTypeForVar(...);
+    if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
+      auto* trans_var = new_scope.Var(var_name);
+      auto* out = TransformData(expected_kernel_key,
+                                kernel_type_for_var,
+                                *tensor_in);
+      SetTensorToVariable(...);
+    }
+  }
+
+  auto kernel = kernels.find(expected_kernel_key);
+  kernel->Compute(ExecutionContext(...));
+}
+```
+
+then the actual process for the multi-device above will be:
+
+```
+OP1(CPUPlace)
+     |
+op1_2_op2(on CPU)
+     |
+[transform](from CPU to GPU)
+     |
+op1_2_op2(on GPU)
+     |
+OP2(CUDAPlace)
+```
diff --git a/doc/fluid/design/multi_devices/operator_kernel_type.md b/doc/fluid/design/multi_devices/operator_kernel_type.md
new file mode 100644
index 0000000000000000000000000000000000000000..5e391bd62b4f4e123a9a6f35b7adf5726f205635
--- /dev/null
+++ b/doc/fluid/design/multi_devices/operator_kernel_type.md
@@ -0,0 +1,91 @@
+# Design Doc: The Keys of Operator Kernel Type
+## Problem
+An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  proto::DataType data_type_;
+};
+```
+For more details, please refer to the [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L348-L374) on GitHub.
+
+It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`.
+
+We often implement a kernel of an operator with some computing library on a certain device (place). Please note that computing library and device do not have a one-to-one correspondence. A device can have a lot of computing libraries and a computing library can also support different devices.
+
+For example, Eigen library supports Nvidia GPU/AMD GPU/CPU and MKLDNN library supports Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`.
+
+Different DataTypes, such as fp64/fp32/int8, will obviously have different kernels. But different data layout of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration.
+
+## Solution
+
+There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  platform::Library library_;
+  proto::DataType data_type_;
+  framework::Layout layout_;
+};
+```
+
+The details are as follows:
+
+### Place
+
+`Place` is defined as:
+
+```cpp
+typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
+```
+
+`Place` represents the device memory where data is located.
+
+
+### Library
+
+One operator kernel is usually implemented based on one library. `Library` is defined as an enum variable:
+
+```cpp
+enum Library { Plain, MKLDNN, CUDNN };
+```
+
+We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take `Eigen` library as the `Plain` enumerator.
+A library usually has a corresponding `DeviceContext` which contains some handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle.
+
+If we want to support a new library, a new enumerator needs to be added to `Library` and a corresponding new `LibraryDeviceContext` needs to be created.
+
+
+### DataType
+
+
+`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
+
+### Layout
+
+Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout.
+
+Different layouts lead to different implementations of the operator kernel. There are mainly 4 principles we have to follow to support layout in our Fluid framework.
+
+- We take layout as a data member of Tensor. Layout is actually an enum variable. If Fluid is built with MKLDNN, then the memory format in MKLDNN will also be added into this enum variable.
+
+- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout for generating data. Of course, we can have some default layout, like NCHW.
+
+- The inference of Layout is at run-time, not at compile-time.
+
+- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://intel.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to  register kernels for MKLDNN operators.
+
+`Layout` is also defined as an enum variable:
+
+```cpp
+enum Layout {
+  kNCHW,
+  kNHWC,
+#ifdef PADDLE_WITH_MKLDNN
+  knChw8c
+  ...
+#endif
+};
+```
diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md
new file mode 100644
index 0000000000000000000000000000000000000000..f32a5b7e8a4d820319a666dab4c3129360e2c924
--- /dev/null
+++ b/doc/fluid/design/network/deep_speech_2.md
@@ -0,0 +1,235 @@
+# DeepSpeech2 on PaddlePaddle: Design Doc
+
+We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine,  on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
+
+- Release a basic distributed implementation of DS2 on PaddlePaddle.
+- Contribute a chapter of Deep Speech to PaddlePaddle Book.
+
+Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan.
+
+## Table of Contents
+
+- [Tasks](#tasks)
+- [Task Dependency](#task-dependency)
+- [Design Details](#design-details)
+    - [Overview](#overview)
+    - [Row Convolution](#row-convolution)
+    - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm)
+- [Future Work](#future-work)
+- [References](#references)
+
+## Tasks
+
+We roughly break down the project into 14 tasks:
+
+1. Develop an **audio data provider**:
+	- Json filelist generator.
+	- Audio file format transformer.
+	- Spectrogram feature extraction, power normalization etc.
+	- Batch data reader with SortaGrad.
+	- Data augmentation (optional).
+	- Prepare (one or more) public English data sets & baseline.
+2. Create a **simplified DS2 model configuration**:
+   - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*).
+	- With only bidirectional-GRU (otherwise need *Task 4*).
+	- With only greedy decoder (otherwise need *Task 5, 6*).
+3. Develop to support **variable-shaped** dense-vector (image) batches of input data.
+   - Update `DenseScanner` in `dataprovider_converter.py`, etc.
+4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details):
+   - Lookahead convolution windows.
+   - Within-row convolution, without kernels shared across rows.
+5. Build KenLM **language model** (5-gram) for beam search decoder:
+   - Use KenLM toolkit.
+   - Prepare the corpus & train the model.
+   - Create inference interfaces (for Task 6).
+6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT:
+   - Beam search with CTC.
+   - Beam search with external custom scorer (e.g. LM).
+   - Try to design a more general beam search interface.
+7. Develop a **Word Error Rate evaluator**:
+   - update `ctc_error_evaluator`(CER) to support WER.
+8. Prepare internal dataset for Mandarin (optional):
+    - Dataset, baseline, evaluation details.
+    - Particular data preprocessing for Mandarin.
+    - Might need cooperating with the Speech Department.
+9. Create **standard DS2 model configuration**:
+   - With variable-length audio sequences (need *Task 3*).
+	- With unidirectional-GRU + row-convolution (need *Task 4*).
+	- With CTC-LM beam search decoder (need *Task 5, 6*).
+10. Make it run perfectly on **clusters**.
+11. Experiments and **benchmarking** (for accuracy, not efficiency):
+    - With public English dataset.
+    - With internal (Baidu) Mandarin dataset (optional).
+12. Time **profiling** and optimization.
+13. Prepare **docs**.
+14. Prepare PaddlePaddle **Book** chapter with a simplified version.
+
+## Task Dependency
+
+Tasks parallelizable within phases:
+
+<table>
+<thead>
+<tr>
+<th>Roadmap</th>
+<th>Description</th>
+<th> Parallelizable Tasks</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Phase I </td>
+<td>Simplified model & components </td>
+<td>Task 1 ~ Task 8</td>
+</tr>
+<tr>
+<td>Phase II </td>
+<td> Standard model & benchmarking & profiling</td>
+<td>Task 9 ~ Task 12 </td>
+</tr>
+<tr>
+<td>Phase III </td>
+<td> Documentations</td>
+<td> Task 13 ~ Task 14 </td>
+</tr>
+</tbody>
+</table>
+
+
+Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
+
+## Design Details
+
+### Overview
+
+Traditional **ASR** (Automatic Speech Recognition) pipelines require great human efforts devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, acoustic model, pronunciation model and language model etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with only a single deep network architecture. With scaling up both the data and model sizes, DS2 achieves a very significant performance boost.
+
+Please read Deep Speech 2 \[[1](#references),[2](#references)\] paper for more background knowledge.
+
+The classical DS2 network contains 15 layers (from bottom to top):
+
+- **Two** data layers (audio spectrogram, transcription text)
+- **Three** 2D convolution layers
+- **Seven** uni-directional simple-RNN layers
+- **One** lookahead row convolution layer
+- **One** fully-connected layer
+- **One** CTC-loss layer
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ds2_network.png" width=350><br/>
+Figure 1. Architecture of Deep Speech 2 Network.
+</div>
+
+We don't have to stick to this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar networks with different depths might also work well. As in \[[1](#references)\], the authors use a different depth (e.g. 2-2-3-1-1-1) for final experiments.
+
+Key ingredients about the layers:
+
+- **Data Layers**:
+   - Frame sequences data of audio **spectrogram** (with FFT).
+   - Token sequences data of **transcription** text (labels).
+   - These two types of sequences do not have the same lengths, thus a CTC-loss layer is required.
+- **2D Convolution Layers**:
+   - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
+   - With striding for only the first convolution layer.
+   - No pooling for all convolution layers.
+- **Uni-directional RNNs**
+	- Uni-directional + row convolution: for low-latency inference.
+	- Bi-directional + without row convolution: if we don't care about the inference latency.
+- **Row convolution**:
+	- For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs.
+	- Not necessary if with bi-directional RNNs.
+	- "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across.
+- **Batch Normalization Layers**:
+   - Added to all above layers (except for data and loss layer).
+   - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
+
+<table>
+<thead>
+<tr>
+<th>Required Components</th>
+<th> PaddlePaddle Support</th>
+<th> Need to Develop</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data Layer I (Spectrogram) </td>
+<td>Not supported yet.</td>
+<td>TBD (Task 3)</td>
+</tr>
+<tr>
+<td>Data Layer II (Transcription)  </td>
+<td> paddle.data_type.integer_value_sequence</td>
+<td> - </td>
+</tr>
+<tr>
+<td>2D Convolution Layer </td>
+<td> paddle.layer.image_conv_layer</td>
+<td> - </td>
+</tr>
+<tr>
+<td>DataType Converter (vec2seq)</td>
+<td> paddle.layer.block_expand</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Bi-/Uni-directional RNNs </td>
+<td>paddle.layer.recurrent_group</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Row Convolution Layer </td>
+<td>Not supported yet.</td>
+<td>TBD (Task 4)</td>
+</tr>
+<tr>
+<td>CTC-loss Layer </td>
+<td>paddle.layer.warp_ctc</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Batch Normalization Layer </td>
+<td>paddle.layer.batch_norm</td>
+<td> - </td>
+</tr>
+<tr>
+<td>CTC-Beam search </td>
+<td>Not supported yet.</td>
+<td> TBD (Task 6) </td>
+</tr>
+</tbody>
+</table>
+
+
+### Row Convolution
+
+TODO by Assignees
+
+### Beam Search with CTC and LM
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/beam_search.png" width=600><br/>
+Figure 2. Algorithm for CTC Beam Search Decoder.
+</div>
+
+- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
+   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, because one prefix may come from different paths;
+   - 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
+- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding, whenever a white space is appended in English decoding or any character is appended in Mandarin decoding.
+- Such external scorer consists of language model, word count or any other custom scorers.
+- The **language model** is built from Task 5, and its parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7).
+- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
+
+
+## Future Work
+
+- Efficiency Improvement
+- Accuracy Improvement
+- Low-latency Inference Library
+- Large-scale benchmarking
+
+## References
+
+1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
+2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). 	arXiv:1512.02595.
+3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
diff --git a/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455
Binary files /dev/null and b/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg differ
diff --git a/doc/fluid/design/network/images/beam_search.png b/doc/fluid/design/network/images/beam_search.png
new file mode 100644
index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae
Binary files /dev/null and b/doc/fluid/design/network/images/beam_search.png differ
diff --git a/doc/fluid/design/network/images/ds2_network.png b/doc/fluid/design/network/images/ds2_network.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11
Binary files /dev/null and b/doc/fluid/design/network/images/ds2_network.png differ
diff --git a/doc/fluid/design/network/index_cn.rst b/doc/fluid/design/network/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3557d55fe4dbae1f712e0760ca15111ec6f6792d
--- /dev/null
+++ b/doc/fluid/design/network/index_cn.rst
@@ -0,0 +1,7 @@
+复杂网络设计
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  sequence_decoder.md
diff --git a/doc/fluid/design/network/index_en.rst b/doc/fluid/design/network/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..73a7137236bdf0548d35721609351d6deca3013b
--- /dev/null
+++ b/doc/fluid/design/network/index_en.rst
@@ -0,0 +1,7 @@
+Complex Network Design
+------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  sequence_decoder.md
diff --git a/doc/fluid/design/network/sequence_decoder.md b/doc/fluid/design/network/sequence_decoder.md
new file mode 100644
index 0000000000000000000000000000000000000000..b95773c50ca0dcbd1b93529332e035d4de90faa8
--- /dev/null
+++ b/doc/fluid/design/network/sequence_decoder.md
@@ -0,0 +1,229 @@
+# Design: Sequence Decoder Generating LoDTensors
+In tasks such as machine translation and visual captioning,
+a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time.
+
+This documentation describes how to implement the sequence decoder as an operator.
+
+## Beam Search based Decoder
+The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
+
+In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search. Due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard for users to customize.
+
+There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users.
+
+During the refactoring of PaddlePaddle, some new concepts are proposed such as:  [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** .
+
+For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`;
+the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.
+
+## Changing LoD's absolute offset to relative offsets
+The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level.
+
+The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
+let's call this format the **absolute-offset LoD** for clarity.
+
+The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
+```python
+[[0, 3, 9]
+ [0, 2, 3, 3, 3, 9]]
+```
+The first level tells that there are two sequences:
+- the first's offset is `[0, 3)`
+- the second's offset is `[3, 9)`
+
+while on the second level, there are several empty sequences that both begin and end at `3`.
+It is impossible to tell how many empty second-level sequences exist in the first-level sequences.
+
+There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix.
+
+So let's introduce another format of LoD,
+it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD.
+
+For example, to represent the same sequences of the above data
+
+```python
+[[0, 3, 6]
+ [0, 2, 3, 3, 3, 9]]
+```
+
+the first level represents that there are two sequences,
+their offsets in the second-level LoD are `[0, 3)` and `[3, 5)`.
+
+The second level is the same as in the absolute-offset example because the lower level is a tensor.
+It is easy to find out the second sequence in the first-level LoD has two empty sequences.
+
+The following examples are based on relative-offset LoD.
+
+## Usage in a simple machine translation model
+Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it.
+
+The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences.
+
+**Encoder**
+```python
+import paddle as pd
+
+dict_size = 8000
+source_dict_size = dict_size
+target_dict_size = dict_size
+word_vector_dim = 128
+encoder_dim = 128
+decoder_dim = 128
+beam_size = 5
+max_length = 120
+
+# encoder
+src_word_id = pd.data(
+    name='source_language_word',
+    type=pd.data.integer_value_sequence(source_dict_dim))
+src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim)
+
+src_word_vec = pd.lookup(src_embedding, src_word_id)
+
+encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
+
+encoder_ctx = pd.last_seq(encoder_out_seq)
+# encoder_ctx_proj is the learned semantic vector
+encoder_ctx_proj = pd.fc(
+    encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
+```
+
+**Decoder**
+
+```python
+def generate():
+    decoder = pd.while_loop()
+    with decoder.step():
+        decoder_mem = decoder.memory(init=encoder_ctx)  # mark the memory
+        generated_ids = decoder.memory() # TODO init to batch_size <s>s
+        generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s
+
+        target_word = pd.lookup(trg_embedding, gendrated_ids)
+        # expand encoder_ctx's batch to fit target_word's lod
+        # for example
+        # decoder_mem.lod is
+        # [[0 1 3],
+        #  [0 1 3 6]]
+        # its tensor content is [a1 a2 a3 a4 a5]
+        # which means there are 2 sentences to translate
+        #   - the first sentence has 1 translation prefixes, the offsets are [0, 1)
+        #   - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
+        # the target_word.lod is
+        # [[0, 1, 6]
+        #  [0, 2, 4, 7, 9 12]]
+        # which means 2 sentences to translate, each has 1 and 5 prefixes
+        # the first prefix has 2 candidates
+        # the following has 2, 3, 2, 3 candidates
+        # the encoder_ctx_expanded's content will be
+        # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
+        encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
+        decoder_input = pd.fc(
+            act=pd.activation.Linear(),
+            input=[target_word, encoder_ctx_expanded],
+            size=3 * decoder_dim)
+        gru_out, cur_mem = pd.gru_step(
+            decoder_input, mem=decoder_mem, size=decoder_dim)
+        scores = pd.fc(
+            gru_out,
+            size=trg_dic_size,
+            bias=None,
+            act=pd.activation.Softmax())
+        # K is an config
+        topk_scores, topk_ids = pd.top_k(scores, K)
+        topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
+
+        selected_ids, selected_generation_scores = decoder.beam_search(
+            topk_ids, topk_generated_scores)
+
+        # update the states
+        decoder_mem.update(cur_mem)  # tells how to update state
+        generated_ids.update(selected_ids)
+        generated_scores.update(selected_generation_scores)
+
+        decoder.output(selected_ids)
+        decoder.output(selected_generation_scores)
+
+translation_ids, translation_scores = decoder()
+```
+The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates,
+returns the result of the beam search algorithm.
+
+In this way, users can customize anything on the input or output of beam search, for example:
+
+1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate.
+2. Remove some specific candidate in `selected_ids`.
+3. Get the final `translation_ids`, remove the translation sequence in it.
+
+The implementation of sequence decoder can reuse the C++ class:  [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
+so the python syntax is quite similar to that of an  [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
+
+Both of them are two-level `LoDTensors`:
+
+- The first level represents `batch_size` of (source) sentences.
+- The second level represents the candidate ID sets for translation prefix.
+
+For example, there are 3 source sentences to translate, and they have 2, 3, and 1 candidates respectively.
+
+Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
+
+For example, the previous state:
+
+* LoD is `[0, 1, 3][0, 2, 5, 6]`
+* content of tensor is `a1 a2 b1 b2 b3 c1`
+
+the current state is stored in `encoder_ctx_expanded`:
+
+* LoD is `[0, 2, 7][0 3 5 8 9 11 11]`
+* the content is
+  - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times, once for each candidate)
+  - a2 a2
+  - b1 b1 b1
+  - b2
+  - b3 b3
+  - None (c1 has 0 candidates, so c1 is dropped)
+
+The benefit from the relative offset LoD is that the empty candidate set can be represented naturally.
+
+The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is:
+
+```python
+decoder.output(selected_ids)
+decoder.output(selected_generation_scores)
+```
+
+The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences.
+
+Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate.
+
+Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
+
+## LoD and shape changes during decoding
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg"/>
+</p>
+
+According to the image above, the only phase that changes the LoD is beam search.
+
+## Beam search design
+The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs:
+
+1. `topk_ids`, the top K candidate ids for each prefix.
+2. `topk_scores`, the corresponding scores for `topk_ids`
+3. `generated_scores`, the score of the prefixes.
+
+All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
+
+It will return three variables:
+
+1. `selected_ids`, the final candidate beam search function selected for the next step.
+2. `selected_scores`, the scores for the candidates.
+3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
+
+## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
+The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step,
+so it is natural to store them in arrays.
+
+Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`.
+
+The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors.
+It needs some extensions to support the packing or unpacking an array of `LoDTensors`.
diff --git a/doc/fluid/design/onnx/images/project_structure.png b/doc/fluid/design/onnx/images/project_structure.png
new file mode 100644
index 0000000000000000000000000000000000000000..ab1c2ff23cfff586516876684348bb15bd2084fc
Binary files /dev/null and b/doc/fluid/design/onnx/images/project_structure.png differ
diff --git a/doc/fluid/design/onnx/onnx_convertor.md b/doc/fluid/design/onnx/onnx_convertor.md
new file mode 100644
index 0000000000000000000000000000000000000000..bc1665d7c33eb54cb63e5306a439c1ca67016d1e
--- /dev/null
+++ b/doc/fluid/design/onnx/onnx_convertor.md
@@ -0,0 +1,131 @@
+# Background
+
+[ONNX (Open Neural Network Exchange)](https://github.com/onnx/onnx) bridges different deep learning frameworks by providing an open source graph format for models. The models trained in other frameworks can be converted into the ONNX format to execute inference by utilizing the built-in operators in ONNX - this is called a **frontend**. With the inverse conversion (called a **backend**), different frameworks can share any models supported by ONNX in principle. Now most mainstream frameworks have joined the ONNX community, e.g. Caffe2, PyTorch, and MXNet etc. And there is a momentum driving more and more vendors to begin supporting ONNX or even choose ONNX as the only machine learning runtime in their devices.
+
+Therefore, it is necessary to enable the conversion between PaddlePaddle and ONNX. This design doc is aimed at implementing a convertor, mainly for converting between **Fluid** models and ONNX (it is very likely that we may support older v2 models in the future). A complete convertor should be bidirectional - with a frontend AND a backend, but considering the importance, we will start with the frontend, i.e. Fluid models to ONNX models.
+
+
+# How it works
+
+ONNX has a [working list of operators](https://github.com/onnx/onnx/blob/master/docs/Operators.md) which is versioned.
+
+When prioritizing implementation of a frontend over a backend, choice of coverage of Fluid -> ONNX operators comes down to choices of models to be supported (see section `Supported models`). Eventually, this will allow us to reach a really-wide coverage of all operators.
+
+Here are a few major considerations when it comes to converting models:
+
+- **Op-level conversion**: How to map the inputs, attributes, and outputs of each Paddle operator to those of the ONNX operator. In several cases, these require transformations. For each direction (frontend vs. backend), a different conversion mapping is needed.
+- **Parameters (weights) initialization**: Setting initial parameters on different nodes.
+- **Tensor data type mapping** (Note: Some ONNX data types are not supported in Fluid)
+- **Network representation adaption**: Fluid `ProgramDesc` include nested blocks. Since ONNX is free of nesting, the `ProgramDesc` ops need to be traversed to only include ops from the global scope in the root block. The variables used as inputs and outputs should also be in this scope.
+- **Model validation**: There are two kinds of validations that are necessary:
+   1. We need to ensure that the inference outputs of the ops run inside a model are the same as those when running the ONNX converted ops through an alternative ONNX backend.
+   2. Checking to see if the generated nodes on the graph are validated by the internal ONNX checkers.
+- **Versioning**: ONNX versions its op listing over versions. In fact, it has versioning on 3 different levels: ops, graphs, and ONNX models. This requires that we are conscious about versioning the convertor and updating tests and op convertor logic for each release. It also implies that we release pre-trained ONNX models upon each version release.
+
+One thing that makes this conversion more feasible in Fluid's case is the use of a static IR - the `ProgramDesc` - as opposed to a dynamic graph, as created in the cases of frameworks like PyTorch.
+
+
+# Project structure
+
+<p align="center">
+<img src="./images/project_structure.png"/>
+</p>
+
+The project contains four important parts:
+
+* **fluid**: The directory that contains wrappers for fluid related APIs. Fluid has provided some low-level APIs to parse or generate the inference model. However, directly using these low-level APIs makes the code tediously long. This module wraps low-level APIs to provide simplified interfaces.
+
+* **onnx**: This is a Python package provided by ONNX containing helpers for creating nodes, graphs, and eventually binary protobuf models with initializer parameters.
+
+* **onnx_fluid**: Contains two-way mapping (Fluid -> ONNX ops and ONNX -> Fluid ops). Called from `convert.py`, the program uses this mapping along with modifier functions to construct ONNX nodes with the help of ONNX's `make_node` helper. It also contains mapping between datatypes and tensor deprecation / amplification logic.
+
+* **convert.py**: The interface exposed to users. This will traverse the global program blocks/variables and construct the write-able model.
+
+
+# Usage
+The converter should be designed to be very easy to use. Bidirectional conversion between a Fluid inference model and an ONNX binary model will be supported. Model validation will also be provided to verify the correctness of the converted model.
+
+* Convert Fluid inference model to ONNX binary model
+
+    ```
+    python convert.py --fluid_model <fluid inference model> --onnx_model <ONNX model> validate True
+    ```
+
+* Validate the converted model
+
+    ```
+    python validate.py --fluid_model <fluid inference model> --onnx_model <ONNX model>
+    ```
+
+The conversion and model validation will be completed consecutively, finally output a readable model structure description. And for the converse conversion, users only need to exchange the input and output.
+
+
+# Challenges and mitigation
+
+## Cycles
+
+Cycles are unsupported in ONNX. In Paddle, the `while` op is the most prominent example of a cycle.
+
+*Resolution*: We won't support models with `while`s which can't be substituted until ONNX adds support for such ops.
+
+## Sequences
+
+Sequence processing operators like `sequence_expand`, `sequence_reshape`, `sequence_concat`, and `sequence_pool` are not supported by ONNX as well, because they do not support non-padded datatypes like LoDTensors.
+
+*Resolution*: Since the runtimes using our ONNX exported graphs won't be using LoDTensors in the first place, such sequence operators should be mapped to ONNX ops that will do the necessary transposing ops with the knowledge of the padding and shape of the Tensors.
+
+## Ops that can't easily be mapped
+
+There are ops that just aren't possible to map today:
+
+**Control flow operators**
+
+Paddle supports control flow ops like `If/Else` and `Switch` (if we ignore the CSP operations like `select` for now). ONNX has `If` support in the experimental phase.
+
+*Resolution*: Map Paddle's `If/Else` to ONNX's `If`, but ignore other control flow operators until ONNX brings support for them.
+
+
+**Non-existent in Fluid**
+
+There are several ONNX operators that are not available in Fluid today, e.g. `InstanceNormalization`, `RandomUniform`, `Unsqueeze`, etc.
+
+*Resolution*: For the initial phase, we can choose to not support ops that our models don't care for and are subsequently not available in Fluid. However, for ops that we think might be necessary for Fluid users also, we must implement them on our side and support the ONNX conversion to them. This list is TBD.
+
+
+**Concurrency**
+
+ONNX does not have any considerations for concurrency right now.
+
+*Resolution*: There are two ways to approach this:
+
+a. We choose to not support concurrent models.
+b. We only support `go_op`s (basically threads) shallowly. This could mean that we enqueue `go_op` ops prior to gradient calculations OR even prior to the entire graph, and that's it - since `go_op`s do not have support for backprop anyways. One of the core target use cases of `go_op`: batch reading - can be handled through this approach.
+
+
+**Overloaded in Fluid**
+
+There are ops in ONNX whose job can't be accomplished by a single corresponding Paddle operator, but only by a collection of operators.
+
+*Resolution*: Chain multiple Paddle operators.
+
+
+## Lack of LoDTensors
+
+As stated above, ONNX only supports simple Tensor values.
+
+*Resolution*: Deprecate to plain old numpy-able tensors.
+
+
+## Reconstruction from deprecated ONNX ops
+
+For higher-level Fluid ops, such as a few offered by the `nn` layer that do not have direct corresponding mappings but can be converted to ONNX by chaining a series of ops without cycles, it would be useful to map them back to the higher-level Fluid ops once converted back from the deprecated ONNX graphs.
+
+*Resolution*: Graphs that have the deprecation from Paddle -> ONNX. When converting back from ONNX, if we encounter the identical graphs by doing a forward search, we can replace the subgraphs with the matching ONNX op.
+
+
+# Supported models
+
+As mentioned above, potential risks may come from the conversion of sequence-related models, including the LoDTensor, ```if/else``` and ```while``` operators. So a good choice is to focus on some important feedforward models first, then implement some simple recurrent models.
+
+- Feedforward models: common models selected in PaddleBook, e.g. VGG, ResNet and some other models proposed by application teams.
+- Recurrent models: language model, stacked LSTMs etc.
diff --git a/doc/fluid/design/others/auto_gradient_check.md b/doc/fluid/design/others/auto_gradient_check.md
new file mode 100644
index 0000000000000000000000000000000000000000..773b7b6a767541f28c27f247c1ad8c9a8a2d0ccf
--- /dev/null
+++ b/doc/fluid/design/others/auto_gradient_check.md
@@ -0,0 +1,150 @@
+## Auto Gradient Check Design
+
+## Background:
+- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges:
+  1. The formula for backpropagation should be correct according to the forward computation.
+  2. The implementation of the above should be correct in C++.
+  3. It is difficult to prepare unbiased test data.
+
+- Auto gradient checking gets a numerical gradient using forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
+  1. Numerical gradient checker only needs the forward operator.
+  2. The user only needs to prepare the input data for forward Operator and not worry about the backward Operator.
+
+## Mathematical Theory
+The following documents from Stanford have a detailed explanation of how to compute the numerical gradient and why it is useful.
+
+- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
+- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
+
+
+## Numerical Gradient Implementation
+### Python Interface
+```python
+def get_numerical_gradient(op,
+                         input_values,
+                         output_name,
+                         input_to_check,
+                         delta=0.005,
+                         local_scope=None):
+    """
+    Get Numerical Gradient for the input of an operator.
+
+    :param op: C++ operator instance, could be an network.
+    :param input_values: The input variables. Should be an dictionary, whose key is
+    variable name, and value is a numpy array.
+    :param output_name: The final output variable name.
+    :param input_to_check: The input variable with respect to which the gradient has to be computed.
+    :param delta: The perturbation value for numerical gradient method. The
+    smaller the delta, the more accurate the result. But if the delta is too
+    small, it will suffer from the numerical stability problem.
+    :param local_scope: The local scope used for get_numeric_gradient.
+    :return: The gradient array in numpy format.
+    """
+```
+
+### Explanation:
+
+- Why do we need an `output_name`
+  - An Operator may have multiple Outputs, one can compute an independent gradient from each Output. So the caller should specify the name of the output variable.
+
+- Why do we need `input_to_check`
+  - One operator can have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numerical Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times each with a different input.
+
+
+### Core Algorithm Implementation
+
+
+```python
+    # we only compute the gradient of one element a time.
+    # we use a for loop to compute the gradient of each element.
+    for i in xrange(tensor_size):
+        # get one input element using the index i.
+        original = tensor_to_check.get_float_element(i)
+
+        # add delta to it, run the forward op and then
+        # get the new value of the result tensor.
+        x_pos = original + delta
+        tensor_to_check.set_float_element(i, x_pos)
+        y_pos = get_output()
+
+        # Subtract delta from this element, run the op again
+        # and get the new value of the result tensor.
+        x_neg = original - delta
+        tensor_to_check.set_float_element(i, x_neg)
+        y_neg = get_output()
+
+        # restore old value
+        tensor_to_check.set_float_element(i, original)
+
+        # compute the gradient of this element and store
+        # it into a numpy array.
+        gradient_flat[i] = (y_pos - y_neg) / delta / 2
+
+    # reshape the gradient result to the shape of the source tensor.
+    return gradient_flat.reshape(tensor_to_check.get_dims())
+```
+
+## Auto Gradient Check Framework
+
+Each Operator Kernel has three kinds of Gradient:
+
+1. Numerical gradient
+2. CPU kernel gradient
+3. GPU kernel gradient (if supported by the device)
+
+The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps:
+
+1. Calculate the numerical gradient
+2. Calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient.
+3. Calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient. (if supported)
+
+#### Python Interface
+
+```python
+    def check_grad(self,
+                   forward_op,
+                   input_vars,
+                   inputs_to_check,
+                   output_name,
+                   no_grad_set=None,
+                   only_cpu=False,
+                   max_relative_error=0.005):
+        """
+        :param forward_op: used to create backward_op
+        :param input_vars: numpy value of input variable. The following
+          computation will use these variables.
+        :param inputs_to_check: the input variable with respect to which the
+          gradient will be computed.
+        :param output_name: The final output variable name.
+        :param max_relative_error: The relative tolerance parameter.
+        :param no_grad_set: used to create backward ops
+        :param only_cpu: only compute and check gradient on cpu kernel.
+        :return:
+        """
+```
+
+### How to check if two numpy arrays are close enough?
+If `abs_numerical_grad` is nearly zero, then use the absolute error instead of the relative error for `numerical_grad`.
+
+```python
+numerical_grad = ...
+operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
+
+abs_numerical_grad = numpy.abs(numerical_grad)
+# if abs_numerical_grad is nearly zero, then use abs error for
+# numeric_grad, instead of relative error.
+abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
+
+diff_mat = numpy.abs(numerical_grad - operator_grad) / abs_numerical_grad
+max_diff = numpy.max(diff_mat)
+```
+
+
+#### Notes:
+The input data for the auto gradient checker should be reasonable to avoid numerical stability problems.
+
+
+#### References:
+
+- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
+- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
diff --git a/doc/fluid/design/others/dcgan.png b/doc/fluid/design/others/dcgan.png
new file mode 100644
index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28
Binary files /dev/null and b/doc/fluid/design/others/dcgan.png differ
diff --git a/doc/fluid/design/others/gan_api.md b/doc/fluid/design/others/gan_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..7167470088766985fa5ad31657410309330fd725
--- /dev/null
+++ b/doc/fluid/design/others/gan_api.md
@@ -0,0 +1,253 @@
+# Design for GAN
+
+GAN (Generative Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas.
+
+It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth.
+
+In our GAN design, we wrap it as a user-friendly, easily customized Python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/test.dot.png" width = "35%" align="center"/><br/>
+Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
+</p>
+
+The operators, layers and functions required/optional to build a GAN demo are summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/dcgan.png" width = "90%" align="center"/><br/>
+Figure 2. Photo borrowed from the original DC-GAN paper.
+</p>
+
+## The Conditional-GAN might be a class.
+In this design, we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains the following data structure:
+
+- DCGAN(object): which contains everything required to build a GAN model. It provides the following member functions as its API:
+
+- __init__(...): Initialize hyper-parameters (like conv dimension and so forth), and declare model parameters of discriminator and generator as well.
+
+- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen.
+Returns a generated image.
+
+- discriminator(image):
+Given an image, decide if it is from a real source or a fake one.
+Returns a 0/1 binary label.
+
+- build_model(self):
+build the whole GAN model, define training loss for both generator and discriminator.
+
+## Discussion on Engine Functions required to build GAN
+- Trace the tensor and variable dependency in the engine executor. (Very critical, otherwise GAN can't be trained correctly)
+- Different optimizers responsible for optimizing different losses.
+
+To be more detailed, we introduce our design of DCGAN as follows:
+
+### Class member Function: Initializer
+- Set up hyper-parameters, including conditional dimension, noise dimension, batch size and so forth.
+- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G.
+```python
+class DCGAN(object):
+  def __init__(self, y_dim=None):
+
+    # hyper parameters  
+    self.y_dim = y_dim # conditional gan or not
+    self.batch_size = 100
+    self.z_dim = z_dim # input noise dimension
+
+    # define parameters of discriminators
+    self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.D_W2 = pd.Variable(np.random.rand(128, 1))
+    self.D_b2 = pd.Variable(np.zeros(128))
+    self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2]
+
+    # define parameters of generators
+    self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.G_W2 = pd.Variable(np.random.rand(128, 1))
+    self.G_b2 = pd.Variable(np.zeros(128))
+    self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2]
+```
+
+### Class member Function: Generator
+- Given a noisy input z, returns a fake image.
+- Concatenation, batch-norm, FC operations required;
+- Deconv layer required, which is missing now...
+```python
+class DCGAN(object):
+  def generator(self, z, y = None):
+    # input z: the random noise
+    # input y: input data label (optional)
+    # output G_im: generated fake images
+
+    if not self.y_dim:
+      z = pd.layer.concat(1, [z, y])
+
+    G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0)
+    G_h0_bn = pd.layer.batch_norm(G_h0)
+    G_h0_relu = pd.layer.relu(G_h0_bn)
+
+    G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1)
+    G_h1_bn = pd.layer.batch_norm(G_h1)
+    G_h1_relu = pd.layer.relu(G_h1_bn)
+
+    G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2))
+    G_im = pd.layer.tanh(G_im)
+    return G_im
+```
+
+### Class member function: Discriminator
+- Given an image (real or generated), returns a logit indicating whether it is real or fake.
+- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required;
+```python
+class DCGAN(object):
+  def discriminator(self, image):
+    # input image: either generated images or real ones
+    # output D_h2: binary logit of the label
+
+    D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0)
+    D_h0_bn = pd.layer.batchnorm(h0)
+    D_h0_relu = pd.layer.lrelu(h0_bn)
+
+    D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1)
+    D_h1_bn = pd.layer.batchnorm(D_h1)
+    D_h1_relu = pd.layer.lrelu(D_h1_bn)
+
+    D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2)
+    return D_h2
+```
+
+### Class member function: Build the model
+- Define data readers as placeholders to hold the data;
+- Build generator and discriminators;
+- Define two training losses for discriminator and generator, respectively.
+If we have an execution dependency engine to back-trace all tensors, the module building our GAN model will be like this:
+```python
+class DCGAN(object):
+  def build_model(self):
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(tf.float32, [None, self.z_size])
+
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_t = self.discriminator(self.images)
+        # generated fake images
+        self.sampled = self.sampler(self.z, self.y)
+        self.D_f = self.discriminator(self.G)
+    else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_t = self.discriminator(self.images)
+        # generate fake images
+        self.sampled = self.sampler(self.z)
+        self.D_f = self.discriminator(self.images)
+
+    # step 2: define the two losses
+    self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
+    self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
+    self.d_loss = self.d_loss_real + self.d_loss_fake
+
+    self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie))
+```
+
+If we do not have a dependency engine but blocks, the module building our GAN model will be like this:
+```python
+class DCGAN(object):
+  def build_model(self, default_block):
+    # input data in the default block
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(tf.float32, [None, self.z_size])
+
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    with pd.default_block().g_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_g = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_g = self.discriminator(self.G, self.y)
+      self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie))
+
+    with pd.default_block().d_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.D_t = self.discriminator(self.images, self.y)
+        self.D_f = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.D_t = self.discriminator(self.images)
+        self.D_f = self.discriminator(self.G)
+
+      # step 2: define the two losses
+      self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
+      self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
+      self.d_loss = self.d_loss_real + self.d_loss_fake
+```
+Some small confusions and problems with this design:
+- D\_g and D\_f are actually the same thing, but have to be written twice; i.e., if we want to run two sub-graphs conceptually, the same code has to be written twice if it is shared by the graph.
+- Requires the ability to create a block anytime, rather than in if-else or rnn only;
+
+## Main function for the demo:
+Generally, the user of GAN just needs to do the following things:
+- Define an object of the DCGAN class;
+- Build the DCGAN model;
+- Specify two optimizers for two different losses with respect to different parameters.
+```python
+# pd for short, should be more concise.
+import paddle.v2 as pd
+import numpy as np
+import logging
+
+if __name__ == "__main__":
+    # dcgan class in the default graph/block
+    # if we use dependency engine as tensorflow
+    # the codes, will be slightly different like:
+    # dcgan = DCGAN()
+    # dcgan.build_model()
+    with pd.block() as def_block:
+      dcgan = DCGAN()
+      dcgan.build_model(def_block)
+
+    # load mnist data
+    data_X, data_y = self.load_mnist()
+
+    # Two subgraphs required!!!
+    with pd.block().d_block():
+      d_optim = pd.train.Adam(lr = .001, beta= .1)
+      d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D)
+    with pd.block.g_block():
+      g_optim = pd.train.Adam(lr = .001, beta= .1)
+      g_step = pd.minimize(dcgan.g_loss, dcgan.theta_G)
+
+    # executor
+    sess = pd.executor()
+
+    # training
+    for epoch in xrange(10000):
+      for batch_id in range(N / batch_size):
+        idx = ...
+        # sample a batch
+        batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size]
+        # sample z
+        batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
+
+        if batch_id % 2 == 0:
+          sess.run(d_step,
+                   feed_dict = {dcgan.images: batch_im,
+                                dcgan.y: batch_label,
+                                dcgan.z: batch_z})
+        else:
+          sess.run(g_step,
+                   feed_dict = {dcgan.z: batch_z})
+```
+
+# More thinking about dependency engine vs. block design:
+- What if we just want to run an intermediate result? Do we need to run the whole block/graph?
+- Should we call eval() to get the fake images in the first stage? And then train the discriminator in the second stage?
diff --git a/doc/fluid/design/others/graph.md b/doc/fluid/design/others/graph.md
new file mode 100644
index 0000000000000000000000000000000000000000..7519a65df835a39fe14f6ef45530afff170191ff
--- /dev/null
+++ b/doc/fluid/design/others/graph.md
@@ -0,0 +1,70 @@
+# Design Doc: Computations as a Graph
+
+A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before.
+
+This document explains the construction of a graph in three steps:
+
+- construct the forward part
+- construct the backward part
+- construct the optimization part
+
+## The Construction of a Graph
+
+Let us take the problem of image classification as a simple example.  The application program that trains the model looks like:
+
+```python
+x = layer.data("images")
+l = layer.data("label")
+y = layer.fc(x)
+cost = layer.mse(y, l)
+optimize(cost)
+train(cost, reader=mnist.train())
+```
+
+### Forward Part
+
+The first four lines of the above program build the forward part of the graph.
+
+![](images/graph_construction_example_forward_only.png)
+
+In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x.  `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators.
+
+Initialization operators are kind of "run-once" operators -- the `Run` method increments a class data member counter so as to run at most once.  By doing so, a parameter wouldn't be initialized repeatedly, say, in every minibatch.
+
+In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`.  These protobuf messages are saved in a `BlockDesc` protobuf message.
+
+### Backward Part
+
+The fifth line `optimize(cost)` calls two functions, `ConstructBackwardGraph` and `ConstructOptimizationGraph`.
+
+`ConstructBackwardGraph` traverses the forward graph in the `BlockDesc` protobuf message and builds the backward part.
+
+![](images/graph_construction_example_forward_backward.png)
+
+According to the chain rule of gradient computation, `ConstructBackwardGraph` would
+
+1. create a gradient operator G for each operator F,
+1. make all inputs, outputs, and outputs' gradient of F as inputs of G,
+1. create gradients for all inputs of F, except for those who don't have gradients, like x and l, and
+1. make all these gradients as outputs of G.
+
+### Optimization Part
+
+For each parameter, like W and b created by `layer.fc`, marked as double circles in the above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient.  This results in the complete graph:
+
+![](images/graph_construction_example_all.png)
+
+## Block and Graph
+
+The words block and graph are interchangeable in the design of PaddlePaddle.  A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphor for the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions.  A graph of operators and variables is a representation of the block.
+
+A Block keeps operators in an array `BlockDesc::ops`
+
+```protobuf
+message BlockDesc {
+  repeated OpDesc ops = 1;
+  repeated VarDesc vars = 2;
+}
+```
+
+in the order that they appear in user programs, like the Python program at the beginning of this article.  We can imagine that in `ops`,  we have some forward operators, followed by some gradient operators, and then some optimization operators.
diff --git a/doc/fluid/design/others/graph_survey.md b/doc/fluid/design/others/graph_survey.md
new file mode 100644
index 0000000000000000000000000000000000000000..97f395133b48a1d0ed5136f0ebc8720b8ca87ded
--- /dev/null
+++ b/doc/fluid/design/others/graph_survey.md
@@ -0,0 +1,232 @@
+## Survey on Graph
+
+Neural network frameworks often provide a symbolic API for users to write network topology conveniently. This doc mainly focuses on the symbolic APIs of the most popular neural network frameworks, and tries to find out how to parse a symbolic configuration into a portable file, such as protobuf or json.
+
+### Mxnet
+
+The core concept of symbolic API is `Symbol`. Mxnet implements the `Symbol` class in C++, and exports it to Python using the C-API. Please refer to the comments in Mxnet:
+
+
+`Symbol` is help class used to represent the operator node in Graph.
+`Symbol` acts as an interface for building graphs from different components like Variable, Functor and Group. `Symbol` is also exported to python front-end (while Graph is not) to enable quick test and deployment. Conceptually, symbol is the final operation of a graph and thus including all the information required (the graph) to evaluate its output value.
+
+
+A simple network topology written with Symbol is as follows:
+
+```python
+def get_symbol(num_classes=10, **kwargs):
+    data = mx.symbol.Variable('data')
+    data = mx.symbol.Flatten(data=data)
+    fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+    act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+    fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
+    act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+    fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes)
+    mlp  = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax')
+    return mlp
+```
+
+
+
+Variable here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own AnyAttr. There is an op field in the AnyAttr class; when a Symbol represents a Variable (often input data), the op field is null.
+
+Symbol contains a data member, std::vector<NodeEntry> outputs, and NodeEntry contains a pointer to Node. We can follow the Node pointer to get all the Graph.
+
+And Symbol can be saved to a Json file.
+
+Here is a detailed example:
+
+```
+>>> import mxnet as mx
+>>> data = mx.symbol.Variable('data')
+>>> print data.debug_str()
+Variable:data
+
+>>> data = mx.symbol.Flatten(data=data)
+>>> print data.debug_str()
+Symbol Outputs:
+	output[0]=flatten0(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+	arg[0]=data(0) version=0
+
+>>> fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+>>> print fc1.debug_str()
+Symbol Outputs:
+	output[0]=fc1(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+	arg[0]=data(0) version=0
+Variable:fc1_weight
+Variable:fc1_bias
+--------------------
+Op:FullyConnected, Name=fc1
+Inputs:
+	arg[0]=flatten0(0)
+	arg[1]=fc1_weight(0) version=0
+	arg[2]=fc1_bias(0) version=0
+Attrs:
+	num_hidden=128
+
+```
+
+
+### TensorFlow
+
+
+The core concept of symbolic API is `Tensor`. Tensorflow defines `Tensor` in Python. Please refer to the comments in TensorFlow:
+
+A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. It does not hold the values of that operation's output, but instead provides a means of computing those values in a TensorFlow [Session](https://www.tensorflow.org/api_docs/python/tf/Session).
+
+A simple example is as follows:
+
+```python
+  # Build a dataflow graph.
+  c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
+  d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
+  e = tf.matmul(c, d)
+
+  # Construct a `Session` to execute the graph.
+  sess = tf.Session()
+
+  # Execute the graph and store the value that `e` represents in `result`.
+  result = sess.run(e)
+```
+
+  
+The main methods of `Tensor` are as follows:
+ 
+ 
+```python
+@property
+def op(self):
+  """The `Operation` that produces this tensor as an output."""
+  return self._op
+
+@property
+def dtype(self):
+   """The `DType` of elements in this tensor."""
+  return self._dtype
+
+@property
+def graph(self):
+  """The `Graph` that contains this tensor."""
+  return self._op.graph
+
+@property
+def name(self):
+  """The string name of this tensor."""
+  if not self._op.name:
+    raise ValueError("Operation was not named: %s" % self._op)
+  return "%s:%d" % (self._op.name, self._value_index)
+
+@property
+def device(self):
+  """The name of the device on which this tensor will be produced, or None."""
+  return self._op.device
+```
+
+
+Tensor can be taken as target to run by session. Tensor contains all the information of Graph, and tracks data dependency.
+
+
+Here is a detailed example:
+
+
+```
+>>> import tensorflow as tf
+>>> c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
+>>> print c.graph
+<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
+>>> d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
+>>> print d.graph
+<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
+>>> e = tf.matmul(c, d)
+>>> print e.graph
+<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
+```
+
+### Dynet
+
+
+The core concept of symbolic API is `Expression`, and Dynet defines `Expression` class in C++.
+
+
+A simple example is as follows:
+
+```cpp
+ComputationGraph cg;
+Expression W = parameter(cg, pW);
+
+Expression in = input(cg, xs[i]);
+Expression label = input(cg, ys[i]);
+Expression pred = W * in;
+Expression loss = square(pred - label);
+```
+
+The input data and parameters are also represented by Expression. Every basic Expression corresponds to a Node, and the input data is also a Node.
+
+Expression has a data member ComputationGraph, and ComputationGraph will be modified in users' configuring process. Expression can be a running target, because Expression contains all dependencies.
+
+
+Here is a detailed example:
+
+write topology in C++
+
+```
+ComputationGraph cg;
+Expression W = parameter(cg, pW);
+cg.print_graphviz();
+
+Expression pred = W * xs[i];
+cg.print_graphviz();
+
+Expression loss = square(pred - ys[i]);
+cg.print_graphviz();
+```
+
+compile and print
+
+```
+# first print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+}
+# second print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+  N1 [label="v1 = v0 * -0.98"];
+  N0 -> N1;
+}
+# third print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+  N1 [label="v1 = v0 * -0.98"];
+  N0 -> N1;
+  N2 [label="v2 = -1.88387 - v1"];
+  N1 -> N2;
+  N3 [label="v3 = -v2"];
+  N2 -> N3;
+  N4 [label="v4 = square(v3)"];
+  N3 -> N4;
+}
+```
+
+### Conclusion
+
+
+Actually, Symbol/Tensor/Expression in Mxnet/TensorFlow/Dynet are concepts at the same level. We use the unified name Expression here; a concept at this level has the following features:
+
+- Users write topology with the symbolic API, and every return value is an Expression, including input data and parameters.
+- Expression corresponds with a global Graph, and Expression can also be composed.
+- Expression tracks all dependency and can be taken as a run target
diff --git a/doc/fluid/design/others/images/graph_construction_example.bash b/doc/fluid/design/others/images/graph_construction_example.bash
new file mode 100755
index 0000000000000000000000000000000000000000..35e6997abd17588e17a82d448918fc1b3bd7220e
--- /dev/null
+++ b/doc/fluid/design/others/images/graph_construction_example.bash
@@ -0,0 +1,11 @@
+cat ./graph_construction_example.dot | \
+    sed 's/color=red/color=red, style=invis/g' | \
+    sed 's/color=green/color=green, style=invis/g' | \
+    dot -Tpng > graph_construction_example_forward_only.png
+
+cat ./graph_construction_example.dot | \
+    sed 's/color=green/color=green, style=invis/g' | \
+    dot -Tpng > graph_construction_example_forward_backward.png
+
+cat ./graph_construction_example.dot | \
+    dot -Tpng > graph_construction_example_all.png
diff --git a/doc/fluid/design/others/images/graph_construction_example.dot b/doc/fluid/design/others/images/graph_construction_example.dot
new file mode 100644
index 0000000000000000000000000000000000000000..e115f9844bae6ad24f638c8ed4749cea8aff06a9
--- /dev/null
+++ b/doc/fluid/design/others/images/graph_construction_example.dot
@@ -0,0 +1,68 @@
+digraph ImageClassificationGraph {
+        ///////// The forward part /////////
+        FeedX [label="Feed", color=blue, shape=box];
+        FeedY [label="Feed", color=blue, shape=box];
+        InitW [label="Init", color=blue, shape=diamond];
+        Initb [label="Init", color=blue, shape=diamond];
+        FC [label="FC", color=blue, shape=box];
+        MSE [label="MSE", color=blue, shape=box];
+
+        x [label="x", color=blue, shape=oval];
+        l [label="l", color=blue, shape=oval];
+        y [label="y", color=blue, shape=oval];
+        W [label="W", color=blue, shape=doublecircle];
+        b [label="b", color=blue, shape=doublecircle];
+        cost [label="cost", color=blue, shape=oval];
+
+        FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
+        FeedY -> l [color=blue];
+        InitW -> W [color=blue];
+        Initb -> b [color=blue];
+        W -> FC [color=blue];
+        b -> FC [color=blue];
+        l -> MSE [color=blue];
+
+        ////////// The backward part /////////
+        MSE_Grad [label="MSE_grad", color=red, shape=box];
+        FC_Grad [label="FC_grad", color=red, shape=box];
+
+        d_cost [label="d cost", color=red, shape=oval];
+        d_y [label="d y", color=red, shape=oval];
+        d_b [label="d b", color=red, shape=oval];
+        d_W [label="d W", color=red, shape=oval];
+
+        cost -> MSE_Grad [color=red];
+        d_cost -> MSE_Grad [color=red];
+        l -> MSE_Grad [color=red];
+        y -> MSE_Grad -> d_y [color=red];
+
+        x -> FC_Grad [color=red];
+        y -> FC_Grad [color=red];
+        d_y -> FC_Grad [color=red];
+        W -> FC_Grad -> d_W [color=red];
+        b -> FC_Grad -> d_b [color=red];
+
+        ////////// The optimization part //////////
+
+        OPT_W [label="SGD", color=green, shape=box];
+        OPT_b [label="SGD", color=green, shape=box];
+
+        W -> OPT_W [color=green];
+        b -> OPT_b [color=green];
+        d_W -> OPT_W -> W [color=green];
+        d_b -> OPT_b -> b [color=green];
+
+        ////////// Groupings //////////
+
+        subgraph clusterMSE {
+                style=invis;
+                MSE;
+                MSE_Grad;
+        }
+
+        subgraph clusterFC {
+                style=invis;
+                FC;
+                FC_Grad;
+        }
+}
diff --git a/doc/fluid/design/others/images/graph_construction_example_all.png b/doc/fluid/design/others/images/graph_construction_example_all.png
new file mode 100644
index 0000000000000000000000000000000000000000..261611a5721f9aa97874f7e6d897fe48cf667db2
Binary files /dev/null and b/doc/fluid/design/others/images/graph_construction_example_all.png differ
diff --git a/doc/fluid/design/others/images/graph_construction_example_forward_backward.png b/doc/fluid/design/others/images/graph_construction_example_forward_backward.png
new file mode 100644
index 0000000000000000000000000000000000000000..4c69687f4a6a181138f3df72ce5e8aa48487b5be
Binary files /dev/null and b/doc/fluid/design/others/images/graph_construction_example_forward_backward.png differ
diff --git a/doc/fluid/design/others/images/graph_construction_example_forward_only.png b/doc/fluid/design/others/images/graph_construction_example_forward_only.png
new file mode 100644
index 0000000000000000000000000000000000000000..e668c16e0cac73acb4e5dc2b1827557ae77126b4
Binary files /dev/null and b/doc/fluid/design/others/images/graph_construction_example_forward_only.png differ
diff --git a/doc/fluid/design/others/parameters_in_cpp.md b/doc/fluid/design/others/parameters_in_cpp.md
new file mode 100644
index 0000000000000000000000000000000000000000..a7ac3f17c44ca94a669a8f1e283b291bceb42317
--- /dev/null
+++ b/doc/fluid/design/others/parameters_in_cpp.md
@@ -0,0 +1,41 @@
+# Design Doc: The C++ Class `Parameters`
+
+`Parameters` is a concept we designed in PaddlePaddle V2 API. `Parameters` is a container of parameters, which makes PaddlePaddle capable of sharing parameters between topologies. We described usages of `Parameter` in [api.md](./api.md).
+
+We used Python to implement Parameters when designing V2 API before. There are several defects for the current implementation:
+* We just use `memcpy` to share Parameters between topologies, but this is very inefficient. 
+* We did not support sharing Parameters while training. We just trigger `memcpy` when training starts.
+
+It is necessary that we implement Parameters on the C++ side. However, it could result in a code refactoring for PaddlePaddle, because PaddlePaddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current PaddlePaddle implementation, there are three concepts associated with `Parameters`:
+
+1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`.
+It is evident that we should use `paddle::Parameter` when developing `Parameters`.
+However, the `Parameter` class contains many functions and does not have a clear interface.
+It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`.
+When developing `Parameters`, we only use the `create/store Parameter` functionality.
+We should extract functionalities of Parameter into many classes to clean PaddlePaddle CPP implementation.
+
+2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`.
+We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies.
+Also, we should handle multi-GPU/CPU training, because `forward` and `backward` would perform on multi-GPUs and multi-CPUs.
+`Parameters` should dispatch the parameter value to each device, and gather the parameter gradient from each device.
+
+3. `paddle::ParameterUpdater`. The ParameterUpdater is used to update parameters in Paddle. 
+So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD).
+
+
+The step-by-step approach for implementing Parameters in PaddlePaddle C++ core is listed below. Each step should be a PR and could be merged into PaddlePaddle one by one.
+
+1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters.
+
+2. Implement a `Parameters` class. It just stores the `paddle::Parameter` inside. Make `GradientMachine` use `Parameters` as a class member.
+
+3. Make `Parameters` support Multi-CPU and Multi-GPU training to prepare for sharing `Parameter` between topologies.
+Because we need to share `Parameters` between topologies, it is the responsibility of `Parameters` to exchange Parameters between GPUs.
+`GradientMachine` should not handle how to exchange Parameters because `GradientMachine` is only used to train one topology and we need to support training many topologies in Paddle, i.e., there could be many GradientMachines using one `Parameters`.
+   * We should use a global function to exchange Parameters between GPUs, not a member function in `Parameters`. The `MultiGradientMachine` invokes this function, which takes `Parameters` as its input.
+   * The MultiGradientMachine contains many functionalities. Extracting the Parameters exchanging logic could make MultiGradientMachine clearer and simpler.
+
+4. Make `Parameters` as an argument for `forward/backward` function, not a data member for `GradientMachine`. For example, `forward` could be `forward(const Parameters& params, ...)` and `backward` could be `backward(Parameters* params, ...)`. After this step, Paddle could share `Parameters` between topologies.
+
+5. `ParameterUpdater` is invoked by `GradientMachine` and `Trainer`, but it updates `Parameters`. At the end of this code refactoring, we could change `ParameterUpdater` to use `Parameters` directly to make `ParameterUpdater`'s implementation clear.
diff --git a/doc/fluid/design/others/simple_op_design.md b/doc/fluid/design/others/simple_op_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..c7aeed7f9b4637e1c29d530f37b42d12500af82f
--- /dev/null
+++ b/doc/fluid/design/others/simple_op_design.md
@@ -0,0 +1,202 @@
+## Interaction between C++ and Python
+
+Users employ the Python API to describe their own network; however, the network construction actually happens in C++, so Protobuf is introduced to send messages between Python and C++.
+
+The Interaction between Python and C++ can be simplified as two steps:
+
+1. C++ tells Python how many Ops there are, and what parameters users need to provide to initialize a new Op. Python then builds an API for each Op at compile time.
+
+2. Users invoke APIs built by Python and provide necessary parameters. These parameters will be sent to C++ for finishing the Op construction task.
+
+### Message from C++ to Python
+
+We define a Protobuf message class `OpProto` to hold the information needed in the first step. What should an `OpProto` contain? This question is equivalent to “What information do we need to offer in order to build a Python API that is legal, user-oriented, and can be used to describe a whole Op?”
+
+The following information is necessary:
+
+1. Op's name, and its simple comment.
+2. Input and output variable number; each variable's name, type, and comment.
+3. Op's attributes; each attribute includes name, type, comment, **default value** and **value range**.
+
+So `OpProto` can be defined as follows:
+
+```proto
+enum AttrType {
+	INT = 1;
+	FLOAT = 2;
+	STRING = 3;
+	INTS = 4;
+	FLOATS = 5;
+	STRINGS = 6;
+};
+
+message AttrValue {
+	AttrType type = 1;
+	optional int iv = 2;
+	optional float fv = 3;
+	optional string sv = 4;
+	repeated int ivs = 5;
+	repeated float fvs = 6;
+	repeated string svs = 7;
+};
+
+message AttrProto {
+	required string name = 1;
+	required string comment = 2;
+	required AttrType type = 3;
+};
+
+message VarProto {
+	required string name = 1;
+	required string comment = 2;
+	required bool is_tensor = 3;
+};
+
+message OpProto {
+	repeated VarProto inputs = 1;
+	repeated VarProto outputs = 2;
+	repeated AttrProto attrs = 3;
+	required string type = 4;
+	required string comment = 5;
+};
+```
+
+To generate Python code automatically:
+
+```python 
+def create_python_ops_creatation_functions():
+	op_protos = paddle.framework.OpRegistry.get_all_op_proto()
+	for type_name in op_protos:
+		op_proto = op_protos[type_name]
+		def __impl__(**kwargs):  # User must use key word args in Paddle API
+			inputs = [kwargs.get(ipt.name, "") for ipt in op_proto.inputs]
+			outputs = [kwargs.get(opt.name, "") for opt in op_proto.outputs]
+			attrs = [cast_to_op_attr(attr, kwargs.get(attr.name, None)) for attr in op_proto.attrs]
+			opdesc = (inputs, outputs, type_name, attrs)
+			return paddle.framework.OpRegistry.CreateOp(opdesc)
+		__impl__.__doc__ = create_doc_string(op_proto)
+		globals()[type_name] = __impl__
+
+create_python_ops_creatation_functions()
+```
+
+### Message from Python to C++
+
+To hold the information needed in the second step above, we define the Protobuf message class `OpDesc`. It is used to hold the user-specified parameters that describe an Op.
+
+```proto
+message OpDesc {
+	required string type = 1;	
+	repeated string inputs = 2;
+	repeated string outputs = 3;
+	map<string, AttrValue> attrs = 4;
+};
+```
+
+## OpProto Register
+
+Every Op has its own `OpProto`. For convenience, we need to register them and record all their information. For each `Op` class, we define a corresponding `OpMaker` class, in whose constructor we implement the `OpProto`'s building process. `OpMaker`'s constructor will be invoked by another function `OpRegistry::RegisterOp()`.
+
+```cpp
+class OpProtoMaker {
+public:
+	OpProtoMaker(OpProto* proto): proto_(proto) {}
+protected:
+	OpProto* proto_;
+	void AddInput(const std::string& name, const std::string& desc) {...}
+	void AddAttr(const std::string& name, const std::string& desc, TypeId type) {...}
+	void AddComment(const std::string& comment) { ... }
+};
+
+class OpRegistry {
+public:
+	using OpCreator = std::function<OperatorBase* (OpDesc& desc)>;
+	
+	template <typename OpType, typename OpMaker>
+	static void RegisterOp(const std::string& name) {
+		gCreators_[name] = [](const OpDesc& desc) {
+			return new OpType(desc);
+		};
+		OpProto& opProto = gProtos_[name];
+		OpMaker()(&opProto);
+	}
+
+	static map<string, OpCreator> gCreators_;
+	static map<string, OpProto> gProtos_;
+};
+
+template <typename OpType, typename OpMaker>
+class OpRegister {
+  public:
+    OpRegister(std::string type) {
+        OpRegistry::RegisterOp<OpType, OpMaker>(type);
+    }
+};
+
+#define REGISTER_OP(op_class, op_maker_class, type_name)         \
+    class op_class##Register {                                   \
+      private:                                                   \
+        const static OpRegister<#op_class, #op_maker_class> reg; \
+    };                                                           \
+    const Register op_class##Register::reg(#type_name);
+    
+class CosineOp {
+// ...
+}
+
+struct CosineOpProtoMaker : public OpProtoMaker {
+	CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) {
+		AddInput("input", "input of cosine op");
+		AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0);
+		AddType("cos");
+		AddComment("This is cos op");
+	}
+}
+
+REGISTER_OP(CosineOp, CosineOpProtoMaker, cos);
+```
+
+In `REGISTER_OP(CosineOp, CosineOpProtoMaker, cos)`, we register not only `CosineOp` but also `CosineOpProto`. As fields of `CosineOpProto`, the default value and value range of `scale` are also registered here. 
+
+## Python API
+
+Python APIs are divided into two types: high-level API and low-level API.
+
+### High-Level API
+
+High-level API is called by users directly, so it should keep its style consistent with existing V2 APIs.
+
+Here is an example of how to define an fc layer:
+
+```python
+hd = fc_layer(input=data, size=56, with_bias=True, activation="sigmoid");
+```
+
+`hd` is the output of `fc_layer` and it's a `variable`. It can be further sent into other layers as input.
+
+The definition of `fc_layer()`:
+
+```python
+def fc_layer(input, size, with_bias, activation):
+	attr_map = {"size":size}
+	check_attrs(attr_map)
+	w = make_variable('w')
+	if with_bias:
+		b = make_variable('b')
+	else:
+		b = None
+	fc_output = make_variable('fc_output');
+	fc_op(input, w, b, fc_output, attr_map)
+	act_output = make_variable('sigmod_output');
+	if activation == "sigmod":
+		sigmod_op(fc_output, act_output);
+	elif:
+		# ...
+	return act_output;
+```
+
+### Low-Level API
+
+In the above sample, `fc_op` and `sigmod_op` are low-level APIs. They build `OpDesc` and invoke the corresponding C++ code.
+
+*TODO*
diff --git a/doc/fluid/design/others/test.dot b/doc/fluid/design/others/test.dot
new file mode 100644
index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a
--- /dev/null
+++ b/doc/fluid/design/others/test.dot
@@ -0,0 +1,35 @@
+
+digraph Test {
+    z -> generator -> G_img;
+    G_img -> discriminator -> D_f -> d_loss_f;
+    label0 -> d_loss_f -> d_loss;
+
+    img -> discriminator -> D_t -> d_loss_t;
+    label1 -> d_loss_t -> d_loss;
+
+    d_loss -> d_loss_t[color=red, style=dashed];
+    d_loss -> d_loss_f[color=red, style=dashed];
+    d_loss_t -> D_t[color=red, style=dashed];
+    d_loss_f -> D_f[color=red, style=dashed];
+    D_t -> discriminator[color=red, style=dashed];
+    D_f -> discriminator[color=red, style=dashed];
+
+    D_f -> g_loss;
+    label2 -> g_loss;
+
+    g_loss -> D_f[color=green, style=dashed];
+    D_f -> discriminator[color=green, style=dashed];
+    discriminator -> G_img[color=green, style=dashed];
+    G_img -> generator[color=green, style=dashed];
+
+    discriminator [color=red, shape=box];
+    generator [color=green, shape=box];
+    z [shape=diamond];
+    img [shape=diamond];
+    label0 [shape=diamond];
+    label1 [shape=diamond];
+    label2 [shape=diamond];
+
+    d_loss [color=red];
+    g_loss [color=green];
+}
diff --git a/doc/fluid/design/others/test.dot.png b/doc/fluid/design/others/test.dot.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55
Binary files /dev/null and b/doc/fluid/design/others/test.dot.png differ
diff --git a/doc/fluid/design/quantization/fixed_point_quantization.md b/doc/fluid/design/quantization/fixed_point_quantization.md
new file mode 100644
index 0000000000000000000000000000000000000000..085352fc5614d693e63a2f7241e868a9649456af
--- /dev/null
+++ b/doc/fluid/design/quantization/fixed_point_quantization.md
@@ -0,0 +1,110 @@
+Fixed-point quantization uses fewer bits, for example 2-bit, 3-bit or 8-bit fixed point, to represent weights and activations, which are usually stored in 32-bit single-precision floating point. The fixed-point representation has advantages in reducing memory bandwidth, lowering power consumption and computational resources as well as the model storage requirements. It is especially important for inference in embedded-device deployment.
+
+According to some experiments, the approach of directly quantizing a model trained in floating point works effectively on large models, like the VGG model, which has many parameters. But the accuracy drops a lot for small models. In order to improve the tradeoff between accuracy and latency, many quantized training approaches have been proposed.
+
+This document designs a quantized training framework on Fluid. The first part will introduce how to quantize. The second part will describe the quantized training framework. The last part will illustrate how to calculate the quantization scale.
+
+
+### How to quantize
+
+There are many ways to quantize the float value to fixed-point value. For example:
+
+$$ r = min(max(x, a), b)$$
+$$ s = \frac{b - a}{n - 1} $$
+$$ q = \left \lfloor \frac{r - a}{s} \right \rceil $$
+
+where, $x$ is the float value to be quantized, $[a, b]$ is the quantization range, $a$ is the minimum value and $b$ is the maximal value. $\left \lfloor \right \rceil$  denotes rounding to the nearest integer. If the quantization level is $k$, $n$ is $2^k$, for example, $k$ is 8 and $n$ is 256. $q$ is the quantized integer. 
+
+
+The quantization we applied is parameterized by the number of quantization levels and maximum absolute value:
+
+$$ M  = max(abs(x))  $$
+$$ q = \left \lfloor \frac{x}{M} * (n - 1) \right \rceil $$
+
+where, $x$ is the float value to be quantized, $M$ is maximum absolute value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer.  For 8 bit quantization, $n=2^{8}=256$. $q$ is the quantized integer. 
+
+
+Whether *min-max* quantization or *max-abs* quantization is used, both can be represented as:
+
+$q = scale * r + b$
+
+We call the *min-max* and *max-abs* values the quantization arguments; they are also called the quantization scale or quantization range.
+
+
+How to calculate the quantization scale (or maximum absolute value) for inference will be described in the last part.
+
+
+### Training Framework
+
+#### Forward pass
+
+The forward pass is simulated quantization, see Figure 1.
+
+The training framework is shown in the following figure.
+
+<p align="center"> 
+<img src="quantization_forward.png" width="300" height="340"><br/>
+Figure 1. Forward in training with simulated quantization.
+</p>
+
+- Firstly, both input and weight will be quantized to 8-bit integers. 
+- Second, do the multiplication (or convolution) operation with integers.
+- Third, dequantize the multiplication (or convolution) results to 32-bit float point.
+- Finally, do bias-addition in float type of 32 bit. Here, the bias is not quantized.
+
+For general matrix multiplication (GEMM), quantize for $X$ and $W$:
+
+$$ X_q = \left \lfloor \frac{X}{X_m} * (n - 1) \right \rceil  $$
+$$ W_q = \left \lfloor \frac{W}{W_m} * (n - 1) \right \rceil $$
+
+Do GEMM:
+
+$$ Y = X_q * W_q $$
+
+
+Dequantize $Y$:
+
+$$
+\begin{align}
+Y_{dq} &=\frac{Y}{(n - 1) * (n - 1)} * X_m * W_m \\\
+       &=\frac{X_q * W_q}{(n - 1) * (n - 1)} * X_m * W_m \\\
+       &=(\frac{X_q}{n - 1} * X_m) * (\frac{W_q}{n - 1} * W_m) 
+\end{align}
+$$
+
+From these formulas, dequantization can also be moved before GEMM: first dequantize $X_q$ and $W_q$, then do GEMM. The forward workflow in training is equivalent to the following framework.
+
+<p align="center"> 
+<img src="quantization_equivalent_forward.png"  width="300" height="330"><br/>
+Figure 2. Equivalent forward in training with simulated quantization.
+</p>
+
+We use this equivalent workflow in the training. In our design, there is a quantization transpiler to insert the quantization operator and the de-quantization operator in the Fluid `ProgramDesc`. Since the outputs of the quantization and de-quantization operators are still in floating point, they are called fake quantization and de-quantization operators. And the training framework is called simulated quantization.
+
+#### Backward pass
+
+See Figure 3. The gradients are calculated using the dequantized weights and activations. All inputs and outputs are 32-bit floating point. In the weight updating process, the gradients will be added to the original weights, not the quantized or dequantized weights.
+
+<p align="center"> 
+<img src="quantization_backward_and_optimization.png"><br/>
+Figure 3. Backward and weight updating in training with simulated quantization.
+</p>
+
+So the quantization transpiler will change some inputs of the corresponding backward operators.
+
+### How to calculate quantization scale
+
+There are two strategies to calculate the quantization scale; we call them the dynamic and static strategies. The dynamic strategy calculates the quantization scale value each iteration. The static strategy keeps the same quantization scale for different inputs.
+
+For weights, we apply the dynamic strategy in the training, that is to say, the quantization scale will be recalculated during each iteration until the training is finished.
+
+For activations, the quantization scales are estimated during training, then used in inference. There are several different ways to estimate them:
+
+
+1. Calculate the mean of the maximum absolute values during a window.
+2. Calculate the max of the maximum absolute values during a window.
+3. Calculate the running mean of the maximum absolute values during a window, as follows:
+
+    $$ V_t = (1 - k) * V +  k * V_{t-1}  $$
+    
+    where $V$ is the maximum absolute value of the current batch, $V_t$ is the running mean value, and $k$ is a factor, such as 0.9.
diff --git a/doc/fluid/design/quantization/quantization_backward_and_optimization.png b/doc/fluid/design/quantization/quantization_backward_and_optimization.png
new file mode 100644
index 0000000000000000000000000000000000000000..84f8235ab87cb631992b691f8e05b9c0b6c93da2
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_backward_and_optimization.png differ
diff --git a/doc/fluid/design/quantization/quantization_equivalent_forward.png b/doc/fluid/design/quantization/quantization_equivalent_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..df49c864537c047c785da12d24893e54ce0a5341
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_equivalent_forward.png differ
diff --git a/doc/fluid/design/quantization/quantization_forward.png b/doc/fluid/design/quantization/quantization_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..0913f61621bb6533bcb10bd1d18120ccaaa96cff
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_forward.png differ
diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..7d39b8de1e6dc502ffea5f7882bd6a42b1ed6549
--- /dev/null
+++ b/doc/fluid/dev/api_doc_std_cn.md
@@ -0,0 +1,221 @@
+# API注释撰写标准
+
+- [API注释撰写标准](#api)
+    - [API注释模块](#api)
+    - [格式及示例](#)
+    - [完整示例](#)
+
+
+## API注释模块
+
+API文档须包含以下几个模块（排列顺序为文档撰写顺序）：
+
+- Python API Definition
+
+  API的代码定义。
+
+- Function Description
+
+  API的功能描述。描述该API的含义、作用或对输入所做的操作，及参考文献和对应链接（如果有），必要时给出公式，并解释公式中关键变量的含义。
+
+- Args Description
+
+  API参数介绍。按代码定义中的参数顺序逐个介绍，介绍内容包含数据类型、默认值（如果有）、含义等。
+
+- Returns
+
+  API返回值介绍。介绍返回值含义，必要时给出对应的形状。若返回值为包含多个参数的tuple，则按顺序逐个介绍各参数。
+
+- Raises（如果有）
+
+  可能抛出的异常或错误及可能的产生原因，当可能抛出多种异常或错误时应分条列出。
+
+- Note（如果有）
+
+  注意事项。当有多条注意事项时，应分条列出。
+
+- Examples
+
+  API的使用示例。
+
+
+## 格式及示例
+
+API文档须使用reStructuredText格式撰写，该格式详情请参考[链接](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html)。API文档各模块的内容格式及示例如下（以下以fc为例进行说明）：
+
+- Python API Definition
+
+  - 格式：
+
+      [Python API Definition]
+
+  - 示例
+
+      ```
+      fc(input,
+         size,
+         num_flatten_dims=1,
+         param_attr=None,
+         bias_attr=None,
+         act=None,
+         name=None,
+         main_program=None,
+         startup_program=None)
+      ```
+
+- Function Description
+
+  - 格式
+
+      本模块应包含以下内容（排列顺序为文档撰写顺序）：
+
+      [Function Description]
+
+      [Formula]
+
+      [Symbols' Descriptions if necessary]
+
+      [References if necessary]
+
+  - 示例
+
+      [Function Description]
+
+       ```
+       **Fully Connected Layer**
+
+       The fully connected layer can take multiple tensors as its inputs. It
+       creates a variable called weights for each input tensor, which represents
+       a fully connected weight matrix from each input unit to each output unit.
+       The fully connected layer multiplies each input tensor with its corresponding
+       weight to produce an output Tensor. If multiple input tensors are given,
+       the results of multiple multiplications will be summed up. If bias_attr is
+       not None, a bias variable will be created and added to the output. Finally,
+       if activation is not None, it will be applied to the output as well.
+       ```
+
+      [Formula]
+
+      ```
+      This process can be formulated as follows:
+
+      .. math::
+
+           Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+      ```
+
+      [Symbols' Descriptions if necessary]
+
+      ```
+      In the above equation:
+
+      * :math:`N`: Number of the input.
+      * :math:`X_i`: The input tensor.
+      * :math:`W`: The weights created by this layer.
+      * :math:`b`: The bias parameter created by this layer (if needed).
+      * :math:`Act`: The activation function.
+      * :math:`Out`: The output tensor.
+      ```
+
+      [References if necessary]
+
+      因fc没有必要列出的参考文献，故该内容省略。其他情况下需明确给出对应的参考文献和对应链接，以 layer_norm 为例：
+
+      ```
+      Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_ for more details.
+      ```
+
+
+- Args Description
+
+  - 格式
+
+      \[Arg's Name\][(Data Type, Default Value)][Description]
+
+  - 示例
+
+      fc的部分参数注释如下：
+
+      ```
+      Args:
+          input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+              the input tensor(s) is at least 2.
+          param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+              parameters/weights of this layer.
+          name (str, default None): The name of this layer.
+      ```
+
+- Returns
+
+  - 格式
+
+      [Name][Shape]
+
+  - 示例
+
+      ```
+      Returns:
+          A tensor variable storing the transformation result.
+      ```
+
+      当返回值为包含多个参数的tuple时，应按顺序逐个介绍各参数，以dynamic_lstm为例：
+
+      ```
+      Returns:
+          A tuple containing:
+            The hidden state of LSTM whose shape is (T X D).
+            The cell state of LSTM whose shape is (T X D).
+      ```
+
+- Raises
+
+  - 格式
+
+      [Exception Type][Condition]
+
+  - 示例
+
+      ```
+      Raises:
+          ValueError: If the rank of the input is less than 2.
+      ```
+
+- Note
+
+  - 格式
+
+     [Note]
+
+  - 示例
+
+      fc没有注意事项，故该模块省略不写。如有注意事项应明确给出，当有多条注意事项，须分条列出，以scaled\_dot\_product\_attention为例：
+
+      ```
+      Note:
+          1. When num_heads > 1, three linear projections are learned respectively
+             to map input queries, keys and values into queries', keys' and values'.
+             queries', keys' and values' have the same shapes with queries, keys
+             and values.
+          2. When num_heads == 1, scaled_dot_product_attention has no learnable
+             parameters.
+      ```
+
+- Examples
+
+  - 格式
+
+      \[Python Code Snippet]
+
+  - 示例
+
+      ```
+      Examples:
+          .. code-block:: python
+
+            data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+            fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+      ```
+
+## 完整示例
+
+fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。
diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..f175b219750d1c765a6a111c2ec3aa732fa46175
--- /dev/null
+++ b/doc/fluid/dev/api_doc_std_en.md
@@ -0,0 +1,227 @@
+# API Doc Standard
+
+- [API Doc Standard](#api-doc-standard)
+    - [API Doc Structure](#api-doc-structure)
+    - [Format and Examples](#format-and-examples)
+    - [Complete Example](#complete-example)
+
+
+## API Doc Structure
+
+API Doc should contain the following parts(please write them in order):
+
+- Python API Definition
+
+  The definition of API
+
+- Function Description
+
+  Description of API's function. 
+  The description includes: meaning, purpose and operation on input of API, reference and corresponding link(if any), formula(if necessary) and explanations of key variables in the formula.
+
+- Args Description
+
+  Description of API parameters.
+  Introduce parameters one by one according to the order in API definition.
+  The introduction includes: data type, default value(if any), meaning, etc.
+
+- Returns
+
+  Introduction of API returned value.
+  Introduce the meaning of the returned value, and provide the corresponding format if necessary.
+  If returned value is a tuple containing multiple parameters, then introduce parameters one by one in order.
+
+- Raises (if any)
+
+  Exceptions or errors that may occur, and their possible causes. If there is more than one possible exception or error, they should be listed in order.
+
+- Note (if any)
+
+  Matters needing attention. If there is more than one, they should be listed in order.
+
+- Examples
+
+  Examples of how to use API.
+
+
+## Format and Examples
+
+API documentation must obey reStructuredText format, please refer to [here](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html).
+The format and examples of each part of the API documentation are as follows (taking fc as an example):
+
+- Python API Definition
+
+  - Format
+
+      [Python API Definition]
+
+  - Example
+
+      ```
+      fc(input,
+         size,
+         num_flatten_dims=1,
+         param_attr=None,
+         bias_attr=None,
+         act=None,
+         name=None,
+         main_program=None,
+         startup_program=None)
+      ```
+
+- Function Description
+
+  - Format
+
+      This part contains (please write them in order):
+
+      [Function Description]
+
+      [Formula]
+
+      [Symbols' Descriptions if necessary]
+
+      [References if necessary]
+
+  - Example
+
+      [Function Description]
+
+       ```
+       **Fully Connected Layer**
+
+       The fully connected layer can take multiple tensors as its inputs. It
+       creates a variable called weights for each input tensor, which represents
+       a fully connected weight matrix from each input unit to each output unit.
+       The fully connected layer multiplies each input tensor with its corresponding
+       weight to produce an output Tensor. If multiple input tensors are given,
+       the results of multiple multiplications will be summed up. If bias_attr is
+       not None, a bias variable will be created and added to the output. Finally,
+       if activation is not None, it will be applied to the output as well.
+       ```
+
+      [Formula]
+
+      ```
+      This process can be formulated as follows:
+
+      .. math::
+
+           Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+      ```
+
+      [Symbols' Descriptions if necessary]
+
+      ```
+      In the above equation:
+
+      * :math:`N`: Number of the input.
+      * :math:`X_i`: The input tensor.
+      * :math:`W`: The weights created by this layer.
+      * :math:`b`: The bias parameter created by this layer (if needed).
+      * :math:`Act`: The activation function.
+      * :math:`Out`: The output tensor.
+      ```
+
+      [References if necessary]
+
+      Since fc needs no references, we omit this part here. In other cases, please provide an explicit reference and link. Take layer_norm for example:
+
+      ```
+      Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_ for more details.
+      ```
+
+
+- Args Description
+
+  - Format
+
+      \[Arg's Name\][(Data Type, Default Value)][Description]
+
+  - Example
+
+      Some of the fc parameters are as follows:
+
+      ```
+      Args:
+          input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+              the input tensor(s) is at least 2.
+          param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+              parameters/weights of this layer.
+          name (str, default None): The name of this layer.
+      ```
+
+- Returns
+
+  - Format
+
+      [Name][Shape]
+
+  - Example
+
+      ```
+      Returns:
+          A tensor variable storing the transformation result.
+      ```
+
+      When the returned value is a tuple containing multiple parameters, please introduce every parameter in order. Take dynamic_lstm for example:
+
+      ```
+      Returns:
+          A tuple containing:
+            The hidden state of LSTM whose shape is (T X D).
+            The cell state of LSTM whose shape is (T X D).
+      ```
+
+- Raises
+
+  - Format
+
+      [Exception Type][Condition]
+
+  - Example
+
+      ```
+      Raises:
+          ValueError: If the rank of the input is less than 2.
+      ```
+
+- Note
+
+  - Format
+
+     [Note]
+
+  - Example
+
+      There is no Note for fc, so we omit this part. If there are any notes, please state them clearly. If there is more than one note, please list them in order. Take scaled\_dot\_product\_attention for example:
+
+      ```
+      Note:
+          1. When num_heads > 1, three linear projections are learned respectively
+             to map input queries, keys and values into queries', keys' and values'.
+             queries', keys' and values' have the same shapes with queries, keys
+             and values.
+          2. When num_heads == 1, scaled_dot_product_attention has no learnable
+             parameters.
+      ```
+
+- Examples
+
+  - Format
+
+      \[Python Code Snippet]
+
+  - Example
+
+      ```
+      Examples:
+          .. code-block:: python
+
+            data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+            fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+      ```
+
+## Complete Example
+
+For the complete example of fc, please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py).
diff --git a/doc/fluid/dev/ci_build_whl.png b/doc/fluid/dev/ci_build_whl.png
new file mode 100644
index 0000000000000000000000000000000000000000..232762b82a9ae3e979a1f38a7beb715c87438f40
Binary files /dev/null and b/doc/fluid/dev/ci_build_whl.png differ
diff --git a/doc/fluid/dev/contribute_to_paddle_cn.md b/doc/fluid/dev/contribute_to_paddle_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..3244eedf918b93f9351258f1218dfb2d507c1a9c
--- /dev/null
+++ b/doc/fluid/dev/contribute_to_paddle_cn.md
@@ -0,0 +1,243 @@
+# 如何贡献代码
+
+我们真诚地感谢您的贡献，欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
+
+## 代码要求
+- 代码注释请遵守 [Doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
+- 确保编译器选项 `WITH_STYLE_CHECK` 已打开，并且编译能通过代码样式检查。
+- 所有代码必须具有单元测试。
+- 通过所有单元测试。
+- 请遵守[提交代码的一些约定](#提交代码的一些约定)。
+
+以下教程将指导您提交代码。
+## [Fork](https://help.github.com/articles/fork-a-repo/)
+
+跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页，然后单击 `Fork` 按钮，生成自己目录下的仓库，比如 <https://github.com/USERNAME/Paddle>。
+
+## 克隆（Clone）
+
+将远程仓库 clone 到本地：
+
+```bash
+➜  git clone https://github.com/USERNAME/Paddle
+➜  cd Paddle
+```
+
+
+## 创建本地分支
+
+Paddle 目前使用[Git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发，测试，发行和维护，具体请参考 [Paddle 分支规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范)。
+
+所有的 feature 和 bug fix 的开发工作都应该在一个新的分支上完成，一般从 `develop` 分支上创建新分支。
+
+使用 `git checkout -b` 创建并切换到新分支。
+
+```bash
+➜  git checkout -b my-cool-stuff
+```
+
+值得注意的是，在 checkout 之前，需要保持当前分支目录 clean，否则会把 untracked 的文件也带到新分支上，这可以通过 `git status` 查看。
+
+## 使用 `pre-commit` 钩子
+
+Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码（C++，Python），在提交（commit）前自动检查一些基本事宜（如每个文件只有一个 EOL，Git 中不要添加大文件等）。
+
+`pre-commit`测试是 Travis-CI 中单元测试的一部分，不满足钩子的 PR 不能被提交到 Paddle，首先安装并在当前目录运行它：
+
+```bash
+➜  pip install pre-commit
+➜  pre-commit install
+```
+
+Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式，请确保 `clang-format` 版本在 3.8 以上。
+
+注：通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同，Paddle 开发人员使用的是`pip install pre-commit`。
+
+## 开始开发
+
+在本例中，我删除了 README.md 中的一行，并创建了一个新文件。
+
+通过 `git status` 查看当前状态，这会提示当前目录的一些变化，同时也可以通过 `git diff` 查看文件具体被修改的内容。
+
+```bash
+➜  git status
+On branch test
+Changes not staged for commit:
+  (use "git add <file>..." to update what will be committed)
+  (use "git checkout -- <file>..." to discard changes in working directory)
+
+	modified:   README.md
+
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+	test
+
+no changes added to commit (use "git add" and/or "git commit -a")
+```
+
+## 构建和测试
+
+编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家，我们的标准开发流程是把这些工具都装进一个Docker image，称为*开发镜像*，通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方（比如IDE配置里）都用 `docker run paddle:latest-dev`来代替。
+
+如要build这个开发镜像，在源码目录树的根目录中运行：
+
+```bash
+➜  docker build -t paddle:latest-dev .
+```
+
+随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU，但是支持AVX指令集，并且包括unit tests的PaddlePaddle，可以：
+
+```bash
+➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev
+```
+
+这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`，并且输出一个 `./build/paddle.deb`文件之外，还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*（`paddle:prod`）：
+
+```bash
+➜  docker build -t paddle:prod -f build/Dockerfile .
+```
+
+如果要运行所有的单元测试，可以用如下命令：
+
+```bash
+➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
+```
+
+关于构建和测试的更多信息，请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。
+
+## 提交（commit）
+
+接下来我们取消对 README.md 文件的改变，然后提交新添加的 test 文件。
+
+```bash
+➜  git checkout -- README.md
+➜  git status
+On branch test
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+	test
+
+nothing added to commit but untracked files present (use "git add" to track)
+➜  git add test
+```
+
+Git 每次提交代码，都需要写提交说明，这可以让其他人知道这次提交做了哪些改变，这可以通过`git commit` 完成。
+
+```bash
+➜  git commit
+CRLF end-lines remover...............................(no files to check)Skipped
+yapf.................................................(no files to check)Skipped
+Check for added large files..............................................Passed
+Check for merge conflicts................................................Passed
+Check for broken symlinks................................................Passed
+Detect Private Key...................................(no files to check)Skipped
+Fix End of Files.....................................(no files to check)Skipped
+clang-formater.......................................(no files to check)Skipped
+[my-cool-stuff c703c041] add test file
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 233
+```
+
+## 保持本地仓库最新
+
+在准备发起 Pull Request 之前，需要同步原仓库（<https://github.com/PaddlePaddle/Paddle>）最新的代码。
+
+首先通过 `git remote` 查看当前远程仓库的名字。
+
+```bash
+➜  git remote
+origin
+➜  git remote -v
+origin	https://github.com/USERNAME/Paddle (fetch)
+origin	https://github.com/USERNAME/Paddle (push)
+```
+
+这里 origin 是我们 clone 的远程仓库的名字，也就是自己用户名下的 Paddle，接下来我们创建一个原始 Paddle 仓库的远程主机，命名为 upstream。
+
+```bash
+➜  git remote add upstream https://github.com/PaddlePaddle/Paddle
+➜  git remote
+origin
+upstream
+```
+
+获取 upstream 的最新代码并更新当前分支。
+
+```bash
+➜  git fetch upstream
+➜  git pull upstream develop
+```
+
+## Push 到远程仓库
+
+将本地的修改推送到 GitHub 上，也就是 https://github.com/USERNAME/Paddle。
+
+```bash
+# 推送到远程仓库 origin 的 my-cool-stuff 分支上
+➜  git push origin my-cool-stuff
+```
+
+## 建立 Issue 并完成 Pull Request
+
+建立一个 Issue 描述问题，并记录它的编号。
+
+切换到所建分支，然后点击 `New pull request`。
+
+<img width="295" alt="screen shot 2017-04-26 at 9 09 28 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436054/a6d98c66-2ac4-11e7-9cb1-18dd13150230.png">
+
+选择目标分支：
+
+<img width="750" alt="screen shot 2017-04-26 at 9 11 52 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436139/f83b1e6c-2ac4-11e7-8c0e-add499023c46.png">
+
+在 PR 的描述说明中，填写 `resolve #Issue编号` 可以在这个 PR 被 merge 后，自动关闭对应的 Issue，具体请见 <https://help.github.com/articles/closing-issues-via-commit-messages/>。
+
+接下来等待 review，如果有需要修改的地方，参照上述步骤更新 origin 中的对应分支即可。
+
+## 删除远程分支
+
+在 PR 被 merge 进主仓库后，我们可以在 PR 的页面删除远程仓库的分支。
+
+<img width="775" alt="screen shot 2017-04-26 at 9 18 24 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436457/e4cdd472-2ac5-11e7-9272-badc76c4a23e.png">
+
+也可以使用 `git push origin :分支名` 删除远程分支，如：
+
+```bash
+➜  git push origin :my-cool-stuff
+```
+
+## 删除本地分支
+
+最后，删除本地分支。
+
+```bash
+# 切换到 develop 分支
+➜  git checkout develop 
+
+# 删除 my-cool-stuff 分支
+➜  git branch -D my-cool-stuff
+```
+
+至此，我们就完成了一次代码贡献的过程。
+
+## 提交代码的一些约定
+
+为了使评审人在评审代码时更好地专注于代码本身，请您每次提交代码时，遵守以下约定：
+
+1. 请保证Travis-CI 中单元测试能顺利通过。如果没过，说明提交的代码存在问题，评审人一般不做评审。
+2. 提交Pull Request前：
+   - 请注意commit的数量：
+     - 原因：如果仅仅修改一个文件但提交了十几个commit，每个commit只做了少量的修改，这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改，且不排除commit之间的修改存在相互覆盖的情况。
+     - 建议：每次提交时，保持尽量少的commit，可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit，可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。
+   - 请注意每个commit的名称：应能反映当前commit的内容，不能太随意。
+3. 如果解决了某个Issue的问题，请在该Pull Request的**第一个**评论框中加上：`fix #issue_number`，这样当该Pull Request被合并后，会自动关闭对应的Issue。关键词包括：close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved，请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
+
+此外，在回复评审人意见时，请您遵守以下约定：
+
+1. 评审人的每个意见都必须回复（这是开源社区的基本礼貌，别人帮了忙，应该说谢谢）：
+   - 对评审意见同意且按其修改完的，给个简单的`Done`即可；
+   - 对评审意见不同意的，请给出您自己的反驳理由。
+2. 如果评审意见比较多：
+   - 请给出总体的修改情况。
+   - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复，而非直接回复的方式。原因是每个回复都会发送一封邮件，会造成邮件灾难。
diff --git a/doc/fluid/dev/contribute_to_paddle_en.md b/doc/fluid/dev/contribute_to_paddle_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..b878f37a5b8e807e5aa346e0074a741f2f8b6cc5
--- /dev/null
+++ b/doc/fluid/dev/contribute_to_paddle_en.md
@@ -0,0 +1,162 @@
+# Contribute Code
+
+You are welcome to contribute to project PaddlePaddle. To contribute to PaddlePaddle, you have to agree with the 
+[PaddlePaddle Contributor License Agreement](https://gist.github.com/wangkuiyi/0c22c7b1bd3bb7eb27d76f85c3a3e329).
+
+We sincerely appreciate your contribution.  This document explains our workflow and work style.
+
+## Workflow
+
+PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/).  The following steps guide usual contributions.
+
+1. Fork
+
+   Our development community has been growing rapidly; it doesn't make sense for everyone to write into the official repo.  So, please file Pull Requests from your fork.  To make a fork,  just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/).
+
+1. Clone
+
+   To make a copy of your fork to your local computers, please run
+
+   ```bash
+   git clone https://github.com/your-github-account/paddle
+   cd paddle
+   ```
+
+1. Create the local feature branch
+
+   For daily works like adding a new feature or fixing a bug, please open your feature branch before coding:
+
+   ```bash
+   git checkout -b my-cool-stuff
+   ```
+
+1. Commit
+
+   Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands:
+
+   ```bash
+   pip install pre-commit
+   pre-commit install
+   ```
+
+   Our pre-commit configuration requires clang-format 3.8 for auto-formatting C/C++ code and yapf for Python.
+
+   Once installed, `pre-commit` checks the style of code and documentation in every commit.  We will see something like the following when you run `git commit`:
+
+   ```
+   ➜  git commit
+   CRLF end-lines remover...............................(no files to check)Skipped
+   yapf.................................................(no files to check)Skipped
+   Check for added large files..............................................Passed
+   Check for merge conflicts................................................Passed
+   Check for broken symlinks................................................Passed
+   Detect Private Key...................................(no files to check)Skipped
+   Fix End of Files.....................................(no files to check)Skipped
+   clang-formater.......................................(no files to check)Skipped
+   [my-cool-stuff c703c041] add test file
+    1 file changed, 0 insertions(+), 0 deletions(-)
+    create mode 100644 233
+   ```
+
+	NOTE: The `yapf` installed by `pip install pre-commit` and `conda install -c conda-forge pre-commit` is slightly different. Paddle developers use `pip install pre-commit`.
+
+1. Build and test
+
+   Users can build PaddlePaddle natively on Linux and Mac OS X.  But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).
+
+1. Keep pulling
+
+   An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others' work early, and it's easier to resolve smaller conflicts.
+
+   ```bash
+   git remote add upstream https://github.com/PaddlePaddle/Paddle
+   git pull upstream develop
+   ```
+
+1. Push and file a pull request
+
+   You can "push" your local work into your forked repo:
+
+   ```bash
+   git push origin my-cool-stuff
+   ```
+
+   The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one.
+
+   To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/).
+
+   If your change is for fixing an issue, please write ["Fixes <issue-URL>"](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request.  Github would close the issue when the owners merge your pull request.
+
+   Please remember to specify some reviewers for your pull request.  If you don't know who are the right ones, please follow Github's recommendation.
+
+
+1. Delete local and remote branches
+
+   To keep your local workspace and your fork clean, you might want to remove merged branches:
+
+   ```bash
+   git push origin :my-cool-stuff
+   git checkout develop
+   git pull upstream develop
+   git branch -d my-cool-stuff
+   ```
+
+### Code Review
+
+-  Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email.  Please do this after your pull request passes the CI.
+
+- Please answer reviewers' every comment.  If you are to follow the comment, please write "Done"; please give a reason otherwise.
+
+- If you don't want your reviewers to get overwhelmed by email notifications, you might reply to their comments [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/).
+
+- Reduce the unnecessary commits.  Some developers commit often.  It is recommended to append a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`.
+
+
+## Coding Standard
+
+### Code Style
+
+Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html).
+
+Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/).
+
+Our build process helps to check the code style.  In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default.  When this flag is on, the build must pass the code style check.
+
+Please install pre-commit, which automatically reformats the changes to C/C++ and Python code whenever we run `git commit`.  To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43).
+
+### Unit Tests
+
+Please remember to add related unit tests.
+
+- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md).
+
+- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/).
+
+
+### Writing Logs
+
+We use [glog](https://github.com/google/glog) for logging in our C/C++ code.
+
+For general information, please use `LOG`.  For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose).  The reason is at [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ).
+
+`VLOG` requires a *verbose level* parameter.  For example:
+
+```c++
+VLOG(3) << "Operator FC is taking " << num_inputs << "inputs."
+```
+
+When we run a PaddlePaddle application or test, we can specify a verbose threshold.  For example:
+
+```bash
+GLOG_vmodule=buddy_allocator=2 \
+GLOG_v=10 \
+python \
+../python/paddle/v2/framework/tests/test_recurrent_op.py
+```
+
+This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3.  This suggests that we output overall messages in lower verbose levels, so they display with higher probability.  When coding C++, please follow the verbose level convention as follows:
+
+- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework)
+- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)
+- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform)
+- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math)
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..37e608160db0ad5a92297987937bbbfa8f842ea8
--- /dev/null
+++ b/doc/fluid/dev/index_cn.rst
@@ -0,0 +1,16 @@
+开发标准
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  contribute_to_paddle_cn.md
+  write_docs_cn.md
+  api_doc_std_cn.md
+  new_op_cn.md
+  new_op_kernel.md
+  use_eigen_cn.md
+  name_convention.md
+  support_new_device.md
+  releasing_process_cn.md
+  op_markdown_format.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d7f83035010f13c30514673ecbee301f194dc175
--- /dev/null
+++ b/doc/fluid/dev/index_en.rst
@@ -0,0 +1,16 @@
+Development
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  contribute_to_paddle_en.md
+  write_docs_en.md
+  api_doc_std_en.md
+  new_op_en.md
+  new_op_kernel.md
+  use_eigen_en.md
+  name_convention.md
+  support_new_device.md
+  releasing_process_en.md
+  op_markdown_format.md
diff --git a/doc/fluid/dev/name_convention.md b/doc/fluid/dev/name_convention.md
new file mode 100644
index 0000000000000000000000000000000000000000..6b4244d0f506c8cd6c08739141eabad27c581ca7
--- /dev/null
+++ b/doc/fluid/dev/name_convention.md
@@ -0,0 +1,65 @@
+# Operator's Parameter Name Convention
+
+To make operator documentation clearer, we recommend that operator names obey the conventions listed below.
+
+## OpProtoMaker names
+
+When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L170) (TODO: OpProtoMaker Doc) needs to be defined. All the Inputs/Outputs and Attributes will be written into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L61), and will be used by the client language to create the operator.
+
+- Input/Output.
+  - Input/Output names follow **CamelCase**, e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Since Inputs/Outputs are much like Variables, we prefer meaningful English words.
+  - If an operator's Inputs/Outputs are mathematical tensors that do not correspond to any meaningful word, input names should start from `X`, e.g. `X`, `Y`, and output names should start from `Out`, e.g. `Out`. This rule is intended to unify operators that have few inputs/outputs.
+
+- Attribute.
+  - Attribute names follow **snake_case**, e.g. `x`, `y`, `axis`, `rowwise_matrix`. Attribute names should also be meaningful English words.
+
+- Comments.
+  - Input/Output/Attr comments follow the format **(type, default value) usage**, stating which type the item can be and how it is used in the operator, e.g. the attribute `"gamma"` in Accumulator: `(float, default 1.0) Accumulation multiplier`.
+  - Operator comments follow the format `R"DOC(your comment here)DOC"`. You should explain the input/output of the operator first. If there is a math calculation in this operator, you should write the equation in the comment, e.g. `Out = X + Y`.
+
+- Order.
+  - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice.
+
+## Best Practice
+
+Here we give some examples to show how these rules will be used.
+
+- The operator has one input, one output. e.g.`relu`, inputs: `X`, outputs: `Out`.
+
+- The operator has two inputs, one output. e.g. `rowwise_add`, inputs : `X`, `Y`, outputs : `Out`.
+
+- The operator contains an attribute. e.g. `cosine`, inputs : `X`, `axis`, outputs : `Out`.
+
+  We give a full example of Accumulator Operator.
+
+```c++
+class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  AccumulateOpMaker(OpProto *proto,
+                    OpAttrChecker *op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor.
+    If the output size is not the same as input size,
+    the output tensor is first reshaped and initialized to zero, and only then, accumulation is done.");
+    AddOutput("Out", "(Tensor) Accumulated output tensor");
+    AddAttr<float>("gamma", "(float, default 1.0) Accumulation multiplier").SetDefault(1.0f);
+    AddComment(R"DOC(
+Accumulate Operator.
+
+This operator accumulates the input tensor to the output tensor. If the
+output tensor already has the right size, we add to it; otherwise, we first
+initialize the output tensor to all zeros, and then do accumulation. Any
+further calls to the operator, given that no one else fiddles with the output
+in the interim, will do simple accumulations.
+
+Accumulation is done as follows:
+
+Out = 1*X + gamma*Out
+
+where X is the input tensor, Out is the output tensor and gamma is the multiplier
+argument.
+
+)DOC");
+  }
+};
+```
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..ff7408111fa20a7a6a3a2fe9f9ba20835918f399
--- /dev/null
+++ b/doc/fluid/dev/new_op_cn.md
@@ -0,0 +1,435 @@
+# 如何写新的Operator
+
+ - [概念简介](#概念简介)
+ - [实现C++类](#实现c类)
+   - [定义ProtoMaker类](#定义protomaker类)
+   - [定义Operator类](#定义operator类)
+   - [定义OpKernel类](#定义opkernel类)
+   - [注册Operator](#注册operator)
+   - [编译](#编译)
+ - [绑定Python](#绑定python)
+ - [实现单元测试](#实现单元测试)
+   - [前向Operator单测](#前向operator单测)
+   - [反向Operator单测](#反向operator单测)
+   - [编译和执行](#编译和执行)
+ - [注意事项](#注意事项)
+
+
+## 概念简介
+
+简单介绍需要用到基类，详细介绍请参考设计文档。
+
+- `framework::OperatorBase`: Operator(简写，Op)基类。
+- `framework::OpKernel`: Op计算函数的基类，称作Kernel。
+- `framework::OperatorWithKernel`：继承自OperatorBase，Op有计算函数，称作有Kernel。
+- `class OpProtoAndCheckerMaker`：描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
+
+依据是否包含kernel，可以将Op分为两种：包含Kernel的Op和不包含kernel的Op，前者Op的定义继承自`OperatorWithKernel`，后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写，简单总结Op需要包含的内容如下：
+
+<table>
+<thead>
+<tr>
+<th>内容</th>
+<th>定义位置</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>OpProtoMake定义 </td>
+<td>.cc 文件，Backward Op不需要定义OpProtoMake </td>
+</tr>
+<tr>
+<td>Op定义 </td>
+<td> .cc 文件</td>
+</tr>
+<tr>
+<td>Kernel实现 </td>
+<td> CPU、CUDA共享Kernel实现在.h 文件中，否则，CPU 实现在.cc 文件中，CUDA 实现在.cu 文件中。</td>
+</tr>
+<tr>
+<td>注册Op </td>
+<td> Op注册实现在.cc 文件；Kernel注册CPU实现在.cc 文件中，CUDA实现在.cu 文件中</td>
+</tr>
+</tbody>
+</table>
+
+
+实现新的op都添加至目录[paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
+
+
+下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
+
+
+## 实现C++类
+
+
+### 定义ProtoMaker类
+
+矩阵乘法的公式：$Out = X * Y$, 可见该计算由两个输入，一个输出组成。
+
+首先定义`ProtoMaker`来描述该Op的输入、输出，并添加注释：
+
+```cpp
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
+    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
+    AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+```
+
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)继承自`framework::OpProtoAndCheckerMaker`，构造函数含有2个参数：
+
+   - `framework::OpProto` ： 前者存储Op的输入输出和参数属性，将用于Python API接口的生成。
+   - `framework::OpAttrChecker` ：后者用于检查参数属性的合法性。
+
+构造函数里通过`AddInput`添加输入参数，通过`AddOutput`添加输出参数，通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。
+
+上面的代码在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md)。
+
+
+再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55)为例：
+
+```cpp
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input tensor of scale operator.");
+    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
+    AddComment(R"DOC(
+Scale operator
+$$Out = scale*X$$
+)DOC");
+    AddAttr<AttrType>("scale",
+                      "(float, default 1.0)"
+                      "The scaling factor of the scale operator.")
+        .SetDefault(1.0);
+  }
+};
+```
+
+这个例子有`AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
+
+### 定义GradProtoMaker类
+每个Op必须有一个对应的GradProtoMaker，若未定制对应前向Op的GradProtoMaker，fluid提供了DefaultGradProtoMaker，默认注册会使用全部输入输出，包括Input, Output, Output@Grad等，使用不需要的变量会造成显存浪费。
+下面示例定义了ScaleOp的GradProtoMaker。
+
+```cpp
+class ScaleGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("scale");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("scale", GetAttr("scale"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+```
+
+### 定义Operator类
+
+下面实现了MulOp的定义：
+
+```cpp
+class MulOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto dim0 = ctx.Input<Tensor>("X")->dims();
+    auto dim1 = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+                      "input X(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("X"));
+    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("Y"));
+    PADDLE_ENFORCE_EQ(
+        dim0[1], dim1[0],
+        "First matrix's width must be equal with second matrix's height.");
+    ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
+  }
+};
+```
+
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员：
+
+```cpp
+using framework::OperatorWithKernel::OperatorWithKernel;
+```
+
+这句表示使用基类`OperatorWithKernel`的构造函数，也可写成：
+
+```cpp
+MulOp(const std::string &type, const framework::VariableNameMap &inputs,
+      const framework::VariableNameMap &outputs,
+      const framework::AttributeMap &attrs)
+  : OperatorWithKernel(type, inputs, outputs, attrs) {}
+```
+
+还需要重写`InferShape`接口。`InferShape`为const函数，不能修改Op的成员变量，参数为`const framework::InferShapeContext &ctx`，通过该参数可获取到输入输出以及属性。它的功能是：
+
+  - 1). 做检查， 尽早报错：检查输入数据维度、类型等是否合法。
+  - 2). 设置输出Tensor的形状。
+
+通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中，和下面将要介绍的注册函数一起放在`.cc`中
+
+### 定义OpKernel类
+
+`MulKernel`继承自`framework::OpKernel`，带有下面两个模板参数:
+
+- `typename DeviceContext`: 表示设备类型，不同设备(CPU、CUDA)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
+
+- `typename T` : 表示数据类型，如`float`, `double`等。
+
+需要为`MulKernel`类重写`Compute`接口。
+- `Compute`接受一个输入参数：`const framework::ExecutionContext& context`。
+- 与`InferShapeContext`相比，`ExecutionContext`增加了设备类型，同样可获取到输入输出和属性参数。
+- `Compute`函数里实现`OpKernel`的具体计算逻辑。
+
+下面是 `MulKernel` `Compute`的实现：
+
+  ```cpp
+  template <typename DeviceContext, typename T>
+  class MulKernel : public framework::OpKernel {
+  public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Input<Tensor>("Y");
+    auto* Z = context.Output<Tensor>("Out");
+    Z->mutable_data<T>(context.GetPlace());
+    auto& device_context = context.template device_context<DeviceContext>();
+    math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+  }
+  };
+  ```
+
+需要注意：**不同设备(CPU、CUDA)共享一个Op定义，是否共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。**
+
+`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
+
+为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。
+
+到此，前向Op实现完成。接下来，需要在`.cc`文件中注册该op和kernel。
+反向Op类的定义，反向OpKernel的定义与前向Op类似，这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。
+
+### 注册Operator
+
+- 在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
+
+    ```cpp
+    namespace ops = paddle::operators;
+    REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>)
+    REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
+    REGISTER_OP_CPU_KERNEL(mul_grad,
+                  ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ```
+
+   在上面的代码中：
+
+    - `REGISTER_OPERATOR` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulGradOp`，类型名为`mul_grad`。
+    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUDeviceContext`和`float`类型，同理，注册`ops::MulGradKernel`类。
+
+
+- 在 `.cu`文件中注册CUDA Kernel。
+    - 请注意，如果CUDA Kernel的实现基于Eigen unsupported模块，那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`，代码示例如下：
+
+    ```cpp
+    // if use Eigen unsupported module before include head files
+    #define EIGEN_USE_GPU
+
+    namespace ops = paddle::operators;
+    REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
+    REGISTER_OP_CUDA_KERNEL(mul_grad,
+                           ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ```
+
+### 编译
+
+运行下面命令可以进行编译：
+
+```
+make mul_op
+```
+
+## 绑定Python
+
+系统会对新增的op自动绑定Python，并链接到生成的lib库中。
+
+## 实现单元测试
+
+单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py)。
+
+### 前向Operator单测
+
+Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator，需要：
+
+1. 在`setUp`函数定义输入、输出，以及相关的属性参数。
+2. 生成随机的输入数据。
+3. 在Python脚本中实现与前向operator相同的计算逻辑，得到输出值，与operator前向计算的输出进行对比。
+4. 反向计算已经自动集成进测试框架，直接调用相应接口即可。
+
+
+  ```python
+  import unittest
+  import numpy as np
+  from op_test import OpTest
+
+
+  class TestMulOp(OpTest):
+      def setUp(self):
+          self.op_type = "mul"
+          self.inputs = {
+              'X': np.random.random((32, 84)).astype("float32"),
+              'Y': np.random.random((84, 100)).astype("float32")
+          }
+          self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+
+      def test_check_output(self):
+          self.check_output()
+
+      def test_check_grad_normal(self):
+          self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+
+      def test_check_grad_ingore_x(self):
+          self.check_grad(
+              ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+      def test_check_grad_ingore_y(self):
+          self.check_grad(
+              ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+  ```
+
+上面的代码首先导入依赖的包，下面是对`setUp`函数中操作的重要变量的详细解释：
+
+- `self.op_type = "mul" ` : 定义类型，与operator注册时注册的类型一致。
+- `self.inputs` : 定义输入，类型为`numpy.array`，并初始化。
+- `self.outputs` : 定义输出，并在Python脚本中完成与operator同样的计算逻辑，返回Python端的计算结果。
+
+### 反向operator单测
+
+而反向测试中：
+- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
+  - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
+  - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
+  - 第三个参数`max_relative_error`：指定检测梯度时能容忍的最大错误值。
+- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
+
+
+### 编译和执行
+
+`python/paddle/fluid/tests/unittests/` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
+
+请注意，**不同于Op的编译测试，运行单元测试时需要编译整个工程**，并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后，执行下面的命令来运行单元测试：
+
+```bash
+make test ARGS="-R test_mul_op -V"
+```
+
+或者:
+
+```bash
+ctest -R test_mul_op
+```
+
+## 注意事项
+
+- 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OPERATOR(B, ...)`等，这将会导致单元测试出错。
+- 如果Op没有实现CUDA Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
+- 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
+
+### PADDLE_ENFORCE使用注意
+
+实现Op时检查数据的合法性需要使用PADDLE_ENFORCE以及PADDLE_ENFORCE_EQ等宏定义，基本格式如下：
+
+```
+PADDLE_ENFORCE(表达式, 错误提示信息)
+PADDLE_ENFORCE_EQ(比较对象A, 比较对象B, 错误提示信息)
+```
+
+如果表达式为真，或者比较对象A=B，则检查通过，否则会终止程序运行，向用户反馈相应的错误提示信息。
+为了确保提示友好易懂，开发者需要注意其使用方法。
+
+#### 总体原则
+
+任何使用了`PADDLE_ENFORCE`与`PADDLE_ENFORCE_*`检查的地方，必须有详略得当的备注解释！**错误提示信息**不能为空！
+
+#### 提示信息书写标准
+
+1. [required] 哪里错了？为什么错了？
+    - 例如：`ValueError: Mismatched label shape`
+2. [optional] 期望的输入是什么样的？实际的输入是怎样的？
+    - 例如：`Expected labels dimension=1. Received 4.`
+3. [optional] 能否给出修改意见？
+    - 例如：`Suggested Fix: If your classifier expects one-hot encoding label, check your n_classes argument to the estimator and/or the shape of your label. Otherwise, check the shape of your label.`
+
+如果并非必要或者简洁的描述即可表达清楚以上要点，根据情况书写亦可。
+
+##### FAQ 典型问题
+
+1. 无报错信息或报错信息过于简单，不能给用户提供有效的提示！
+
+问题示例1 ：未写提示信息
+```
+PADDLE_ENFORCE(ctx->HasInput("X"), "");
+```
+问题示例2 ：提示信息过于简单
+```
+PADDLE_ENFORCE(i != nullptr, "i must be set"); // i是什么？
+```
+
+2. 在报错信息中使用开发人员定义的变量缩写，不易理解！
+
+问题示例：
+```
+PADDLE_ENFORCE(forward_pd != nullptr,
+                    "Fail to find eltwise_fwd_pd in device context");  //eltwise_fwd_pd用户可能看不懂
+```
+
+3. OP内部调用非法接口：Op内部如果出现Output = ShareDataWith(Input) 
+问题示例：
+```cpp
+auto *out = ctx.Output<framework::LoDTensor>("Out");
+auto *in = ctx.Input<framework::LoDTensor>("X");
+out->ShareDataWith(*in);
+```
+Op内部如果出现Output = ShareDataWith(Input)，相当于operator图中有一条隐藏边，连接了Input和Output，这条边无法在图分析中表达，引发基于图优化的错误。
+
+4. OP实现的性能实践
+调用了eigen的broadcast, chop等操作，性能会比手写cuda kernel差几倍以上。此时cpu的实现可以复用eigen，gpu实现可以实现cuda kernel.
+
+
+#### OP InferShape检查提示信息特别说明
+
+- 检查输入输出变量，请统一遵循以下格式
+`Input(变量名) of OP名 operator should not be null.`  
+
+正确示例：
+```
+PADDLE_ENFORCE(ctx->HasInput("Input"),
+                        "Input(Input) of LSTMP operator should not be null.");
+```
+
+- 反向Op的输入输出检查，要写明反向Op的名字
+
+正确示例：
+```
+PADDLE_ENFORCE(ctx->HasInput("X"),
+                        "Input(X) of LoDResetGrad opreator should not be null.");
+```
diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..f8de271ed4e5e0fb4018478bffd4b525d4319738
--- /dev/null
+++ b/doc/fluid/dev/new_op_en.md
@@ -0,0 +1,352 @@
+# How to write a new operator
+
+ - [Background](#background)
+ - [Implementing C++ Types](#implementing-c-types)
+   - [Defining ProtoMaker](#defining-protomaker)
+   - [Defining Operator](#defining-operator)
+   - [Defining OpKernel](#defining-opkernel)
+   - [Registering Operator and OpKernel](#registering-operator-and-opkernel)
+   - [Compilation](#compilation)
+ - [Python Binding](#python-binding)
+ - [Unit Tests](#unit-tests)
+   - [Testing Forward Operators](#testing-forward-operators)
+   - [Testing Backward Operators](#testing-backward-operators)
+   - [Compiling and Running](#compiling-and-running)
+ - [Remarks](#remarks)
+## Background
+
+Here are the base types needed. For details, please refer to the design docs.
+
+- `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API.
+- `framework::OperatorBase`: Operator (Op) base class.
+- `framework::OpKernel`: Base class for Op computation kernel.
+- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation kernels.
+
+
+Operators can be categorized into two groups: operator with kernel(s) and operator without kernel(s). An operator with kernel(s) inherits from `OperatorWithKernel` while the one without kernel(s) inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
+
+
+<table>
+<thead>
+<tr>
+<th>Information</th>
+<th> Where is it defined</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>OpProtoMake definition </td>
+<td> `.cc`files, Backward Op does not need an OpProtoMake interface. </td>
+</tr>
+<tr>
+<td>Op definition  </td>
+<td> `.cc` files</td>
+</tr>
+<tr>
+<td>Kernel implementation  </td>
+<td> The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.</td>
+</tr>
+<tr>
+<td>Registering the Op  </td>
+<td> Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.</td>
+</tr>
+</tbody>
+</table>
+
+
+New Operator implementations are added to the directory [paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable). **The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
+
+
+Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
+
+
+## Implementing C++ Types
+
+
+### Defining ProtoMaker
+
+Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and one output.
+
+First, define `ProtoMaker` to describe the Operator's input, output, and additional comments:
+
+```cpp
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
+    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
+    AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+```
+
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127) is inherited from `framework::OpProtoAndCheckerMaker`, and its constructor takes 2 parameters:
+
+   - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces.
+   - `framework::OpAttrChecker` is used to validate variable attributes.
+
+The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`.
+
+The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md).
+
+
+An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55) is implemented as follows:
+
+```cpp
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of scale operator.").NotInGradient();
+    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddComment(R"DOC(Scale operator
+The equation is: Out = scale*X
+)DOC");
+    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+  }
+};
+```
+
+Note that `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` adds the `scale` constant as an attribute, and sets its default value to 1.0.
+
+
+### Defining Operator
+
+The following code defines the interface for MulOp:
+
+```cpp
+class MulOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto dim0 = ctx.Input<Tensor>("X")->dims();
+    auto dim1 = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+                      "input X(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("X"));
+    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("Y"));
+    PADDLE_ENFORCE_EQ(
+        dim0[1], dim1[0],
+        "First matrix's width must be equal with second matrix's height.");
+    ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
+  }
+};
+```
+
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L24) is inherited from `OperatorWithKernel`. Its `public` member
+
+```cpp
+using framework::OperatorWithKernel::OperatorWithKernel;
+```
+
+expresses an operator constructor using base class `OperatorWithKernel`, alternatively written as
+
+```cpp
+MulOp(const std::string &type, const framework::VariableNameMap &inputs,
+      const framework::VariableNameMap &outputs,
+      const framework::AttributeMap &attrs)
+  : OperatorWithKernel(type, inputs, outputs, attrs) {}
+```
+
+The `InferShape` interface needs to be overridden. `InferShape` is a const method and cannot modify the Op's member variables; its parameter `const framework::InferShapeContext &ctx` can be used to extract input, output, and attributes. Its functions are to
+
+  - 1). validate and error out early: it checks input data dimensions and types.
+  - 2). configures the tensor shape in the output.
+
+Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later.
+
+### Defining OpKernel
+
+`MulKernel` inherits `framework::OpKernel` and takes the following two template parameters:
+
+- `typename  DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43).
+
+- `typename T` denotes data type, such as `float` or `double`.
+
+`MulKernel` needs to override the `Compute` interface.
+
+- `Compute` takes one input parameter: `const framework::ExecutionContext& context`.
+- Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables.
+- `Compute` implements the computation logics of an `OpKernel`.
+
+`MulKernel`'s implementation of `Compute` is as follows:
+
+  ```cpp
+  template <typename DeviceContext, typename T>
+  class MulKernel : public framework::OpKernel {
+  public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Input<Tensor>("Y");
+    auto* Z = context.Output<Tensor>("Out");
+    Z->mutable_data<T>(context.GetPlace());
+    auto& device_context = context.template device_context<DeviceContext>();
+    math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+  }
+  };
+  ```
+
+Note that **different devices (CPU, CUDA) share one Op definition; whether or not they share the same `OpKernel` depends on whether the functions called by `Compute` support both devices.**
+
+`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.cc).
+
+To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_en.md).
+
+
+This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file.
+
+The definition of its corresponding backward operator, if applicable, is similar to that of a forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
+
+### Registering Operator and OpKernel
+
+- In `.cc` files, register forward and backward operator classes and the CPU kernel.
+
+    ```cpp
+    namespace ops = paddle::operators;
+    REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>)
+    REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
+
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
+    REGISTER_OP_CPU_KERNEL(mul_grad,
+                  ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ```
+
+   In that code block,
+
+    - `REGISTER_OPERATOR` registers the `ops::MulOp` class under the type name `mul`, with `ops::MulOpMaker` as its `ProtoMaker`, and registers `ops::MulGradOp` under the type name `mul_grad`.
+    - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
+    - `REGISTER_OP_CPU_KERNEL` registers the `ops::MulKernel` class, specializing its template parameters to `paddle::platform::CPUDeviceContext` and `float`; the `ops::MulGradKernel` class is registered in the same way.
+
+
+- Registering CUDA Kernel in `.cu` files
+    - Note that if CUDA Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as
+
+    ```cpp
+    // if use Eigen unsupported module before include head files
+    #define EIGEN_USE_GPU
+
+    namespace ops = paddle::operators;
+    REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
+    REGISTER_OP_CUDA_KERNEL(mul_grad,
+                           ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ```
+
+### Compilation
+
+Run the following commands to compile.
+
+```
+# maybe you need to rerun cmake
+make mul_op
+```
+
+## Python Binding
+
+The system will automatically bind to Python and link it to a generated library.
+
+## Unit Tests
+
+Unit tests for an operator include
+
+1. comparing a forward operator's implementations on different devices,
+
+2. comparing a backward operator's implementation on different devices, and
+
+3. a gradient check for the backward operator.
+
+Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py).
+
+### Testing Forward Operators
+
+A forward operator unit test inherits from `OpTest`. More concrete tests are performed in `TestMulOp`. Testing a forward operator requires the following:
+
+1. Defining input, output and relevant attributes in `setUp` method.
+
+2. Generating random input data.
+
+3. Implementing the same computation logic in a Python script.
+
+4. Call check gradient function to check the backward operator.
+
+  ```python
+  import unittest
+  import numpy as np
+  from op_test import OpTest
+
+
+  class TestMulOp(OpTest):
+      def setUp(self):
+          self.op_type = "mul"
+          self.inputs = {
+              'X': np.random.random((32, 84)).astype("float32"),
+              'Y': np.random.random((84, 100)).astype("float32")
+          }
+          self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+
+      def test_check_output(self):
+          self.check_output()
+
+      def test_check_grad_normal(self):
+          self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+
+      def test_check_grad_ingore_x(self):
+          self.check_grad(
+              ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+      def test_check_grad_ingore_y(self):
+          self.check_grad(
+              ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+  ```
+Get its output, and compare it with the forward operator's own output.
+
+The code above first loads required packages. In addition, we have
+
+- `self.op_type = "mul" ` defines the type, which must be identical to the operator's registered type.
+- `self.inputs` defines input, with type `numpy.array` and initializes it.
+- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script.
+
+### Testing Backward Operators
+
+Some key points in checking gradient above include:
+
+- `test_check_grad_normal` calls `check_grad` to validate the gradients' correctness and stability through numeric methods.
+  - The first variable `["X", "Y"]` specifies that the gradients of inputs `X` and `Y` are checked.
+  - The second variable `"Out"` points to the network's final output target `Out`.
+  - The third variable `max_relative_error` specifies the maximum relative error tolerated during the gradient check.
+- `test_check_grad_ingore_x` and `test_check_grad_ingore_y` test the cases where the gradient of only one input needs to be computed.
+
+### Compiling and Running
+
+
+Any new unit testing file of the format `test_*.py` added to the directory `python/paddle/fluid/tests/unittests/` is automatically added to the project to compile.
+
+Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with flag `WITH_TESTING` on i.e. `cmake paddle_dir -DWITH_TESTING=ON`.
+
+After successfully compiling the project, run the following command to run unit tests:
+
+```bash
+make test ARGS="-R test_mul_op -V"
+```
+
+Or,
+
+```bash
+ctest -R test_mul_op
+```
+
+## Remarks
+
+- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OPERATOR(B, ...)` in `A_op.cc` will cause unit testing failures.
+- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
+- If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`.
diff --git a/doc/fluid/dev/new_op_kernel.md b/doc/fluid/dev/new_op_kernel.md
new file mode 100644
index 0000000000000000000000000000000000000000..87e617d44041bde9c9051151878ffb4304689b3c
--- /dev/null
+++ b/doc/fluid/dev/new_op_kernel.md
@@ -0,0 +1,121 @@
+# Add Kernels for a New Device
+
+## Background
+
+PaddlePaddle Fluid has hundreds of operators.  Each operator could have one or more kernels.  A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
+
+[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) explains how to add an operator, and its kernels.  The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md).  An operator chooses the right kernel at runtime.  This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md).
+
+## Write Kernels for A New Device
+
+### Add A New Device
+
+  For some historical reasons, we misuse the word *library* for *device*.  For example, we call the device type by *library type*.  An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/library_type.h#L24).  We will correct this ASAP.
+
+To register a new device, we need to add an enum value to `LibraryType`:
+
+```
+enum class LibraryType {
+  kPlain = 0,
+  kMKLDNN = 1,
+  kCUDNN = 2,
+};
+```
+
+
+### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53)
+
+If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53). For example `CUDAPlace`:
+
+```cpp
+struct CUDAPlace {
+  CUDAPlace() : CUDAPlace(0) {}
+  explicit CUDAPlace(int d) : device(d) {}
+
+  inline int GetDeviceId() const { return device; }
+  // needed for variant equality comparison
+  inline bool operator==(const CUDAPlace &o) const {
+    return device == o.device;
+  }
+  inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
+
+  int device;
+};
+
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+```
+
+### Add [device context](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37)
+After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37) for it.
+
+```cpp
+class DeviceContext {
+ public:
+  virtual ~DeviceContext() {}
+  virtual Place GetPlace() const = 0;
+
+  virtual void Wait() const {}
+};
+```
+
+### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L351) for your Device.
+
+Detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md).
+
+```cpp
+class OpKernelBase {
+ public:
+  /**
+   * ExecutionContext is the only parameter of Kernel Run function.
+   * Run will get input/output variables, state such as momentum and
+   * device resource such as CUDA stream, cublas handle, etc. from
+   * ExecutionContext. User should construct it before run the Operator.
+   */
+
+  virtual void Compute(const ExecutionContext& context) const = 0;
+
+  virtual ~OpKernelBase() = default;
+};
+
+template <typename T>
+class OpKernel : public OpKernelBase {
+ public:
+  using ELEMENT_TYPE = T;
+};
+```
+
+
+### Register the OpKernel to framework
+
+After writing the components described above, we should register the kernel to the framework.
+
+We use `REGISTER_OP_KERNEL` to do the registration.
+
+```cpp
+REGISTER_OP_KERNEL(
+	op_type,
+	library_type,
+	place_type,
+	kernel0, kernel1, ...)
+```
+
+kernel0, kernel1 are kernels that have the same `op_type`, `library_type`, `place_type` but different `data_types`.
+
+Take [`conv2d`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/conv_cudnn_op.cu.cc#L318) as an example:
+
+```cpp
+REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
+    paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
+    paddle::operators::CUDNNConvOpKernel<float>,
+    paddle::operators::CUDNNConvOpKernel<double>);
+```
+
+In the code above:
+
+ - `conv2d` is the type/name of the operator
+ - `CUDNN/CPU` is `library`
+ - `paddle::platform::CUDAPlace/CPUPlace` is `place`
+ - template parameter `float/double` on `CUDNNConvOpKernel<T>` is `data_type`.
diff --git a/doc/fluid/dev/op_markdown_format.md b/doc/fluid/dev/op_markdown_format.md
new file mode 100644
index 0000000000000000000000000000000000000000..4e539d7992e5f67ee7b07193b59b6b425b73c9e5
--- /dev/null
+++ b/doc/fluid/dev/op_markdown_format.md
@@ -0,0 +1,64 @@
+# Standard Markdown Format for Operators
+The following should be the standard format for documentation for all the operators that will get rendered in the `html`:
+
+```
+Operator Name (In PaddlePaddle)
+
+Operator Name (Standard)
+
+Operator description.
+
+LaTeX equation of how the operator performs an update.
+
+The signature of the operator.
+```
+
+Each section mentioned above has been covered in further detail in the rest of the document.
+
+## PaddlePaddle Operator Name
+This should be in all lowercase letters. In case of multiple words, we separate them with an underscore. For example:
+`array to lod tensor` should be written as `array_to_lod_tensor`.
+
+This naming convention should be standard across all PaddlePaddle operators.
+
+## Standard Operator Name
+This is the standard name of the operator as used in the community. The general standard is usually:
+- Standard abbreviations like `SGD` are written in all capital letters.
+- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word).
+- Keep numbers inside a word as is, with no boundary delimiters.
+- Follow the name of the operator with the keyword: `Activation Operator.`
+
+## Operator description
+This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section.
+
+## LaTeX equation
+This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Multiple words in the same variable name should be separated by an underscore (`_`).
+
+## The signature
+This section describes the signature of the operator. It is a list of Inputs and Outputs, each of which has a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is:
+`Section :
+VariableName : (VariableType) VariableDescription
+...
+...
+`
+
+
+The following example for an `sgd` operator covers the above mentioned sections as they would ideally look like in the `html`:
+
+```
+sgd
+
+SGD operator
+
+This operator implements one step of the stochastic gradient descent algorithm.
+
+param_out = param - learning_rate * grad
+
+Inputs:
+Param : (Tensor) Input parameter
+LearningRate : (Tensor) Learning rate of SGD
+Grad : (Tensor) Input gradient
+
+Outputs:
+ParamOut : (Tensor) Output parameter
+```
diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..4c6728fba7150b0f1e180e57590f18a5b677c70d
--- /dev/null
+++ b/doc/fluid/dev/releasing_process_cn.md
@@ -0,0 +1,199 @@
+# PaddlePaddle发行规范
+
+PaddlePaddle使用git-flow branching model做分支管理，使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。
+
+PaddlePaddle每次发新的版本，遵循以下流程:
+
+1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
+1. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
+1. 对这个版本的提交，做如下几个操作:
+  * 使用Regression Test List作为检查列表，测试本次release的正确性。
+	  * 如果失败，记录下所有失败的例子，在这个`release/版本号`分支中，修复所有bug后，Patch号加一，到第二步
+	* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
+	* 将这个版本的python wheel包发布到pypi。
+	* 更新Docker镜像（参考后面的操作细节）。
+1. 第三步完成后，将`release/版本号`分支合入master分支，将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。
+1. 协同完成Release Note的书写。
+
+需要注意的是:
+
+* `release/版本号`分支一旦建立，一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭，方便测试人员测试PaddlePaddle的行为。
+* 在`release/版本号`分支存在的时候，如果有bugfix的行为，需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
+
+## 发布wheel包到pypi
+
+1. 使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+完成自动化二进制编译，参考下图，选择需要发布的版本（通常包含一个CPU版本和一个GPU版本），点击"run"右侧的"..."按钮，可以
+弹出下面的选择框，在第二个tab (Changes)里选择需要发布的分支，这里选择0.11.0，然后点击"Run Build"按钮。
+	<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
+1. 等待编译完成后可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件，分别对应CAPI，`cp27m`和`cp27mu`的版本。
+1. 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513)，在使用twine上传之前，需要重命名wheel包中platform相关的后缀，比如将`linux_x86_64`修改成`manylinux1_x86_64`。
+1. 上传：
+```
+cd build/python
+pip install twine
+twine upload dist/[package to upload]
+```
+
+* 注：CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
+  发行版，如果需要手动编译，也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
+* pypi不支持覆盖上传，所以一个版本号的wheel包发布之后，不可以更改。下一个wheel包需要更新版本号才可以上传。
+
+## 发布Docker镜像
+
+上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub，所以，发布Docker镜像只需要对自动push的镜像打上
+版本号对应的tag即可：
+
+```
+docker pull [镜像]:latest
+docker tag [镜像]:latest [镜像]:[version]
+docker push [镜像]:[version]
+```
+
+需要更新的镜像tag包括：
+
+* `[version]`: CPU版本
+* `[version]-openblas`: openblas版本
+* `[version]-gpu`: GPU版本（CUDA 8.0 cudnn 5）
+* `[version]-gpu-[cudaver]-[cudnnver]`: 不同cuda, cudnn版本的镜像
+
+之后可进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看是否发布成功。
+
+## PaddlePaddle 分支规范
+
+PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，并适应github的特性做了一些区别。
+
+* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
+	* `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。
+	* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试，但并没有经过回归测试。
+	* `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。
+
+* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，但所有fork的版本库的所有分支都相当于特性分支。
+	* 建议，开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
+	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的功能分支。
+	* 当功能分支开发完毕后，向PaddlePaddle的主版本库提交`Pull Request`，进而进行代码评审。
+		* 在评审过程中，开发者修改自己的代码，可以继续在自己的功能分支提交代码。
+
+* BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。
+
+## PaddlePaddle回归测试列表
+
+本列表说明PaddlePaddle发版之前需要测试的功能点。
+
+### PaddlePaddle Book中所有章节
+
+PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练（V2和Fluid）模型正确性。
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>新手入门章节 </th>
+<th> 识别数字</th>
+<th> 图像分类</th>
+<th>词向量</th>
+<th> 情感分析</th>
+<th>语意角色标注</th>
+<th> 机器翻译</th>
+<th>个性化推荐</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td>API.V2 + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>API.V2 + Ubuntu + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + CPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+</tbody>
+</table>
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..2c1c30c1eddfde6d9a8e2637be86537c43cc1b00
--- /dev/null
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -0,0 +1,237 @@
+# PaddlePaddle Releasing Process
+
+PaddlePaddle manages its branches using "git-flow branching model", and [Semantic Versioning](http://semver.org/) as its version number semantics.
+
+Each time we release a new PaddlePaddle version, we should follow the below steps:
+
+1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
+1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The
+   first tag should be `0.10.0rc1`, and the second should be `0.10.0rc2` and so on.
+1. After that, we should do:
+  * Run all regression tests on the Regression Test List (see PaddlePaddle TeamCity CI), to confirm
+      that this release has no major bugs.
+        * If regression test fails, we must fix those bugs and create a new `release/[version]`
+          branch from previous release branch.
+    * Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
+    * Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail).
+    * Update the Docker images (see below instructions for detail).
+1. After above step, merge `release/[version]` branch to master and push a tag on the master commit,
+   then merge `master` to `develop`.
+1. Update the Release Note.          
+
+***NOTE:***
+
+* Do ***NOT*** merge commits from the develop branch into release branches, so that each release branch contains
+  features only for the current release and we can test on that version.
+* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch.
+
+## Publish Wheel Packages to pypi
+
+1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+   to build all wheel packages needed to publish. As shown in the following picture, choose a build
+     version, click "..." button on the right side of "Run" button, and switch to the second tab in the
+pop-up box, choose the current release branch and click "Run Build" button. You may repeat this
+     step to start different versions of builds.
+    <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
+1. After the build succeeds, download the outputs under "Artifacts" including capi, `cp27m` and `cp27mu`.
+1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we
+     upload the package using `twine`, we need to rename the package from `linux_x86_64` to
+     `manylinux1_x86_64`.
+1. Start the upload:
+     ```
+     cd build/python
+     pip install twine
+     twine upload dist/[package to upload]
+     ```
+
+* NOTE: We use a special Docker image to build our releases to support more Linux distributions, you can
+  download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/, or build it using
+    scripts under `tools/manylinux1`.
+* pypi does not allow overwriting an already uploaded version of a wheel package, even if you delete the
+  old version. You must change the version number before uploading a new one.
+
+### Publish wheel Packages for MacOS
+
+You need to build the binary wheel package for MacOS before publishing, to
+make sure that the package can be used by many versions of MacOS
+(10.11, 10.12, 10.13) and different python installs (python.org, homebrew, etc.),
+you must build the package ***exactly*** following below steps:
+
+Build steps:
+
+1. install python from python.org downloads, and make sure it's currently in use
+   in your system.
+1. `export MACOSX_DEPLOYMENT_TARGET=10.11`; using `10.11` is enough for recent versions.
+1. `git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle && mkdir build && cd build`
+1. `cmake -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_SYSTEM_BLAS=OFF  ..`, make sure the output of `cmake` command is using the correct python interpreter installed from python.org
+1. `make -j`
+1. `pip install delocate`
+1. `mkdir fixed_wheel && delocate-wheel -w fixed_wheel python/dist/*.whl`
+
+Then the whl under `fixed_wheel` is ready to upload.
+
+Install steps:
+
+1. run `pip install paddlepaddle...whl`
+1. find the `libpython.dylib` that is currently in use:
+    - for python.org package installs, do nothing.
+    - for other python installs, find the path of `libpython*.dylib` and `export LD_LIBRARY_PATH=[your path] && export DYLD_LIBRARY_PATH=[your path]`
+
+## Publish Docker Images
+
+Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:
+
+```
+docker pull [image]:latest
+docker tag [image]:latest [image]:[version]
+docker push [image]:[version]
+```
+
+Tags that need to be updated are:
+* `[version]`: CPU only version image
+* `[version]-openblas`: openblas version image
+* `[version]-gpu`: GPU version（using CUDA 8.0 cudnn 5）
+* `[version]-gpu-[cudaver]-[cudnnver]`: tag for different cuda, cudnn versions
+
+You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
+
+## Branching Model
+
+We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
+with some modifications:
+
+* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
+* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no
+  regression tests are run.
+* `release/[version]` branch is used to publish each release. Latest release version branches have
+  bugfix only for that version, but no feature updates.
+* Developer forks are not required to follow
+  [git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
+  branching model, all forks are like feature branches.
+    * Advice: a developer fork's develop branch is used to sync up with the main repo's develop branch.
+    * Advice: developers create new branches from their fork's develop branch to start developing.
+  * Use that branch on developer's fork to create pull requests and start reviews.
+      * developer can push new commits to that branch when the pull request is open.
+* Bug fixes also start from the developer's forked repo, and a bug-fix branch can be merged to
+  `master`, `develop` and `releases`.
+
+## PaddlePaddle Regression Test List
+
+### All Chapters of PaddlePaddle Book
+
+We need to guarantee that all the chapters of PaddlePaddle Book can run correctly, including
+V1 (`paddle_trainer` training), V2 training and Fluid training.
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>Linear Regression</th>
+<th>Recognize Digits</th>
+<th>Image Classification</th>
+<th>Word2Vec</th>
+<th>Personalized Recommendation</th>
+<th>Sentiment Analysis</th>
+<th>Semantic Role Labeling</th>
+<th>Machine Translation</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td>API.V2 + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>API.V2 + Ubuntu + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + CPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+</tbody>
+</table>
diff --git a/doc/fluid/dev/src/fc.py b/doc/fluid/dev/src/fc.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b074821cc2276a29b2a8639e82199fcf4d72020
--- /dev/null
+++ b/doc/fluid/dev/src/fc.py
@@ -0,0 +1,81 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def fc(input,
+       size,
+       num_flatten_dims=1,
+       param_attr=None,
+       bias_attr=None,
+       act=None,
+       name=None):
+    """
+    **Fully Connected Layer**
+
+    The fully connected layer can take multiple tensors as its inputs. It
+    creates a variable called weights for each input tensor, which represents
+    a fully connected weight matrix from each input unit to each output unit.
+    The fully connected layer multiplies each input tensor with its corresponding
+    weight to produce an output Tensor. If multiple input tensors are given,
+    the results of multiple multiplications will be summed up. If bias_attr is
+    not None, a bias variable will be created and added to the output. Finally,
+    if activation is not None, it will be applied to the output as well.
+
+    This process can be formulated as follows:
+
+    .. math::
+
+        Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+
+    In the above equation:
+
+    * :math:`N`: Number of the input.
+    * :math:`X_i`: The input tensor.
+    * :math:`W`: The weights created by this layer.
+    * :math:`b`: The bias parameter created by this layer (if needed).
+    * :math:`Act`: The activation function.
+    * :math:`Out`: The output tensor.
+
+    Args:
+        input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+            the input tensor(s) is at least 2.
+        size(int): The number of output units in this layer.
+        num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
+            two dimensions. If this happens, the multidimensional tensor will first be flattened
+            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
+            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
+            dimensions will be flattened to form the first dimension of the final matrix (height of
+            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
+            form the second dimension of the final matrix (width of the matrix). For example, suppose
+            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
+            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
+        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+            parameters/weights of this layer.
+        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
+            of this layer. If it is set to None, no bias will be added to the output units.
+        act (str, default None): Activation to be applied to the output of this layer.
+        name (str, default None): The name of this layer.
+
+    Returns:
+        A tensor variable storing the transformation result.
+
+    Raises:
+        ValueError: If rank of the input tensor is less than 2.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+          fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+    """
diff --git a/doc/fluid/dev/support_new_device.md b/doc/fluid/dev/support_new_device.md
new file mode 100644
index 0000000000000000000000000000000000000000..051a463cfcf97df2e2d5b6a880923ca70fefbd6e
--- /dev/null
+++ b/doc/fluid/dev/support_new_device.md
@@ -0,0 +1,240 @@
+# Design Doc: Supporting new Device/Library
+
+## Background
+
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.
+
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
+
+On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
+
+So, how to support a new Device/Library in Fluid becomes a challenge.
+
+
+## Basic: Integrate A New Device/Library
+
+For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/read_source.md).
+
+There are mainly three parts that we have to consider while integrating a new device/library:
+
+- Place and DeviceContext: indicate the device id and manage hardware resources
+
+- Memory and Tensor: malloc/free data on certain device
+
+- Math Functor and OpKernel: implement computing unit on certain devices/libraries
+
+### Place and DeviceContext
+
+Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
+
+#### Place
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
+
+```
+        |   CPUPlace
+Place --|   CUDAPlace
+        |   FPGAPlace
+```
+
+And `Place` is defined as follows:
+
+```
+typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
+```
+
+#### DeviceContext
+
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CUDADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+
+
+```
+                /->  CPUDeviceContext   
+DeviceContext ---->  CUDADeviceContext  
+                \->  FPGADeviceContext
+```
+
+An example of Nvidia GPU is as follows:
+
+- DeviceContext
+
+
+```
+class DeviceContext {
+  virtual Place GetPlace() const = 0;
+};  
+```
+
+
+- CUDADeviceContext
+
+
+```
+class CUDADeviceContext : public DeviceContext {
+  Place GetPlace() const override { return place_; }
+private:
+  CUDAPlace place_;
+  cudaStream_t stream_;
+  cublasHandle_t cublas_handle_;
+  std::unique_ptr<Eigen::GpuDevice> eigen_device_;  // binds with stream_
+};
+```
+
+### Memory and Tensor
+
+
+#### memory module
+
+Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/memory/memory.h#L36):
+
+```
+template <typename Place>
+void* Alloc(Place place, size_t size);
+
+template <typename Place>
+void Free(Place place, void* ptr);
+
+template <typename Place>
+size_t Used(Place place);
+```
+
+To implement these interfaces, we have to implement MemoryAllocator for different Devices.
+
+
+#### Tensor
+
+[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h#L36) holds data with some shape in a specific Place.
+
+```cpp
+class Tensor {
+ public:
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+ private:
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /*! points to dimensions of memory block. */
+  DDim dim_;
+};
+```
+
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
+
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data(place);
+```
+
+
+
+### Math Functor and OpKernel
+
+Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors.
+
+Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/math/maxouting.h#L27) as an example:
+
+The interface is defined in the header file.
+
+```
+template <typename DeviceContext, typename T>
+class MaxOutFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* output, int groups);
+};
+```
+
+CPU implementation is in .cc file
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CPUDeviceContext, T> {
+  public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};
+```
+
+CUDA implementation is in .cu file
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};                  
+```
+
+
+We first obtain the computing handle from a concrete DeviceContext and then compute on tensors.
+
+The implementation of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
+
+Fluid provides different register interfaces in op_registry.h
+
+
+Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/crop_op.cc#L134) operator as an example:
+
+In .cc file:
+
+```
+REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CPU_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
+```
+
+In .cu file:
+
+```
+REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CUDA_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
+```
+
+
+## Advanced topics: How to switch between different Device/Library
+
+Generally, we will implement OpKernels for all Devices/Libraries of an Operator. We can easily train a Convolutional Neural Network on a GPU. However, some OpKernels are not suitable for a specific Device. For example, the crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstances, we have to switch between different Devices/Libraries.
+
+
+For more details, please refer to the following docs:
+
+- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md)
+- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md)
diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..56203d6fad444f61ef1be187ad0d149b2aa99ba4
--- /dev/null
+++ b/doc/fluid/dev/use_eigen_cn.md
@@ -0,0 +1,146 @@
+# 在Paddle中如何使用Eigen
+
+神经网络本质上是一个计算图，计算需要的数据存放在`Tensor`中，而计算过程是由`Operator`来描述的。在执行时，`Operator`调用对应`OpKernel`中的`Compute`接口，实现对`Tensor`的操作。
+
+
+## Eigen Tensor模块
+
+Eigen Tensor模块对element-wise计算提供了强大的支持，并且书写一份代码，可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块，因此可能测试不够完备，文档较少。
+
+关于Eigen Tensor模块的详细介绍请参考[Eigen文档](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md)
+
+
+## paddle::framework::Tensor
+
+Paddle Tensor定义在framework目录下，其主要接口如下：
+
+```cpp
+class Tensor {
+ public:
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+ private:  
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /*! points to dimensions of memory block. */
+  DDim dim_;
+};
+```
+
+`Placeholder`的作用是延迟分配内存，即我们可以先定义一个Tensor，然后使用Resize接口设置Tensor的大小，最后再调用mutable_data接口分配实际的内存。
+
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data(place);
+```
+
+### paddle::framework::Tensor使用样例
+下面以AddOp为例说明Tensor的使用过程：
+
+- InferShape
+
+在运行神经网络计算图时，我们先调用每个`Operator`的`InferShape`接口，根据输入Tensor的大小来设置输出Tensor的大小，`Resize`接口会被调用。
+
+```cpp
+void InferShape(const framework::InferShapeContext &ctx) const override {
+  PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
+                    ctx.Input<Tensor>("Y")->dims(),
+                    "Two input of Add Op's dimension must be same.");
+  ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("X")->dims());
+}
+```
+
+
+- Run
+
+`Operator`的`Run`接口最终会调用对应`OpKernel`的`Compute`接口，在这时真正的分配内存，`mutable_data`接口会被调用。
+
+```cpp
+void Compute(const framework::ExecutionContext& context) const override {
+  auto* input0 = context.Input<Tensor>("X");
+  auto* input1 = context.Input<Tensor>("Y");
+  auto* output = context.Output<Tensor>("Out");
+
+  output->mutable_data<T>(context.GetPlace());
+
+  auto x = EigenVector<T>::Flatten(*input0);
+  auto y = EigenVector<T>::Flatten(*input1);
+  auto z = EigenVector<T>::Flatten(*output);
+
+  auto place = context.GetEigenDevice<Place>();
+
+  z.device(place) = x + y;
+}
+```
+
+
+### paddle::framework::Tensor到EigenTensor的转换
+
+如上一小节所示，在具体的计算中，我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。
+
+以EigenTensor为例，做一个介绍
+
+```cpp
+Tensor t;
+float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
+for (int i = 0; i < 1 * 2 * 3; i++) {
+  p[i] = static_cast<float>(i);
+}
+
+EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
+```
+
+From是EigenTensor模板提供的一个接口，可以实现从paddle::framework::Tensor到EigenTensor的转换。由于Tensor的rank是模板参数，因此在转换时需要显式地指定。
+
+在Eigen中，不同rank的Tensor是不同类型，Vector是rank为1的Tensor。需要额外注意的是，EigenVector<T>::From方法是把paddle中的一维Tensor转为Eigen的一维Tensor，在这里用EigenVector来表示；而EigenVector<T>::Flatten方法是把paddle中的一个Tensor进行reshape操作，压扁成为Eigen的一维Tensor，类型仍然为EigenVector。
+
+更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc)。
+
+
+
+## 实现计算
+
+当需要完成计算时，我们需要等式左边的EigenTensor调用device接口。在这里需要注意的是，这里的EigenTensor之间的运算只是改变了原有Tensor中的数据，而不会改变原有Tensor的shape信息。
+
+```cpp
+auto x = EigenVector<T>::Flatten(*input0);
+auto y = EigenVector<T>::Flatten(*input1);
+auto z = EigenVector<T>::Flatten(*output);
+auto place = context.GetEigenDevice<Place>();
+z.device(place) = x + y;
+```
+
+在这段代码中，input0/input1/output可以是任意维度的Tensor。我们调用了EigenVector的Flatten接口，把任意维度的Tensor转为了一维的EigenVector。而在计算结束之后，input0/input1/output的原有shape信息不变。如果想改变原有Tensor的shape信息，可以调用Resize接口进行改变。
+
+由于Eigen Tensor模块的文档较少，我们可以参考TensorFlow的[kernels](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels)模块下的相关`OpKernel`的计算代码。
diff --git a/doc/fluid/dev/use_eigen_en.md b/doc/fluid/dev/use_eigen_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..3313d097cb21e40c23aa13187b6a50562f12403a
--- /dev/null
+++ b/doc/fluid/dev/use_eigen_en.md
@@ -0,0 +1,146 @@
+# How to use Eigen in Paddle
+
+Essentially, a neural network is a compute graph. The data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`.
+
+
+## Eigen Tensor Module
+
+The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU.
+
+Note that Eigen Tensor is still being actively developed, so its tests are not completely covered and its documentation may be sparse.
+
+For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
+
+
+## paddle::framework::Tensor
+
+Paddle Tensor is defined in the framework directory with the following interface:
+
+```cpp
+class Tensor {
+ public:
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+ private:
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /*! points to dimensions of memory block. */
+  DDim dim_;
+};
+```
+
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
+
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data(place);
+```
+
+### paddle::framework::Tensor Usage
+`AddOp` demonstrates Tensor's usage.
+
+- InferShape
+
+When computing a neural network's compute graph, first call every `Operator`'s `InferShape` method, and use `Resize` to configure the size of the output tensor.
+
+```cpp
+void InferShape(const framework::InferShapeContext &ctx) const override {
+  PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
+                    ctx.Input<Tensor>("Y")->dims(),
+                    "Two input of Add Op's dimension must be same.");
+  ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("X")->dims());
+}
+```
+
+
+- Run
+
+```cpp
+void Compute(const framework::ExecutionContext& context) const override {
+  auto* input0 = context.Input<Tensor>("X");
+  auto* input1 = context.Input<Tensor>("Y");
+  auto* output = context.Output<Tensor>("Out");
+
+  output->mutable_data<T>(context.GetPlace());
+
+  auto x = EigenVector<T>::Flatten(*input0);
+  auto y = EigenVector<T>::Flatten(*input1);
+  auto z = EigenVector<T>::Flatten(*output);
+
+  auto place = context.GetEigenDevice<Place>();
+
+  z.device(place) = x + y;
+}
+```
+
+
+## Transformation from paddle::framework::Tensor to EigenTensor
+
+As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
+
+Using EigenTensor as an example:
+
+```cpp
+Tensor t;
+float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
+for (int i = 0; i < 1 * 2 * 3; i++) {
+  p[i] = static_cast<float>(i);
+}
+
+EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
+```
+
+`From` is an interfacing method provided by the EigenTensor template, which implements the transformation from a `paddle::framework::Tensor` object to an EigenTensor. Since `rank` is a template parameter, it needs to be explicitly specified at the time of the transformation.
+
+In Eigen, tensors with different ranks are different types, with `Vector` being a rank-1 instance. Note that `EigenVector<T>::From` uses a transformation from a 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor while `EigenVector<T>::Flatten` reshapes a paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector.
+
+For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc) in the `eigen_test.cc` file.
+
+
+
+## Implementing Computation
+
+While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data originally in the Tensor and does not change any of the shape information associated with the Tensor.
+
+```cpp
+auto x = EigenVector<T>::Flatten(*input0);
+auto y = EigenVector<T>::Flatten(*input1);
+auto z = EigenVector<T>::Flatten(*output);
+auto place = context.GetEigenDevice<Place>();
+z.device(place) = x + y;
+```
+
+In this code segment, input0/input1/output can be Tensors of arbitrary dimension. We are calling Flatten from EigenVector, transforming a tensor of any dimension into a 1-dimensional EigenVector. After completing computation, input0/input1/output will retain the same shape information, and they can be resized using the `Resize` interface.
+
+Because the Eigen Tensor module is under-documented, please refer to `OpKernel`'s computation code in TensorFlow's [kernel module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels).
diff --git a/doc/fluid/dev/write_docs_cn.rst b/doc/fluid/dev/write_docs_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4231f2bb5cd800c0cd86835b5d07e491fcde4989
--- /dev/null
+++ b/doc/fluid/dev/write_docs_cn.rst
@@ -0,0 +1,136 @@
+#############
+如何贡献文档
+#############
+
+PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成的，PaddlePaddle.org工具可以帮助我们实现这一编译过程，并提供更好的预览效果。
+
+如何构建文档
+============
+
+PaddlePaddle的文档构建有两种方式，分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具，两种方式都有各自的优点，前者方便预览，后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。
+
+我们建议使用PaddlePaddle.org工具来构建文档。
+
+使用PaddlePaddle.org工具
+------------------------
+这个是目前推荐的使用方法。除了可以自动编译文档，还可以直接在网页中预览文档，需要注意的是，采用后续说明的其它方式虽然也可以预览文档，但是文档的样式与官网文档是不一致的，使用PaddlePaddle.org工具进行编译才能产生与官网文档样式一致的预览效果。
+
+PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后即可用以下命令启动工具
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令。
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+
+如果不想使用Docker，你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+
+工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请将PaddlePaddle工作目录指定给环境变量 CONTENT_DIR。
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+
+想了解更多PaddlePaddle.org工具的详细信息，可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。
+
+不使用PaddlePaddle.org工具
+--------------------------
+
+使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。该方法与 `从源码编译PaddlePaddle <http://paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html>`_ 相似，通过从源码中构建可用于编译PaddlePaddle文档的Docker镜像并运行，在进入Docker容器后使用源码中的脚本构建PaddlePaddle文档，具体步骤如下：
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+
+   # 从源码中构建可用于编译PaddlePaddle文档的Docker镜像
+   docker build -t paddle:dev .
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+   # 进入Docker容器后使用build.sh脚本构建PaddlePaddle文档
+   bash -x /paddle/paddle/scripts/docker/build.sh
+
+注：上述命令把当前目录（源码根目录）映射为 container 里的 :code:`/paddle` 目录。
+
+编译完成后，会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录，在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 、 ``api/en/html`` 共三个子目录，分别进入这些目录下，执行以下命令：
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。
+
+如果不想使用Docker，也可以使用以下命令直接构建PaddlePaddle文档，即
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   mkdir -p build
+   cd build
+   cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+
+   # 如果只需要构建使用文档，则执行以下命令
+   make -j $processors paddle_docs
+
+   # 如果只需要构建API，则执行以下命令
+   make -j $processors paddle_apis
+
+其中$processors代表启动和CPU核一样多的进程来并行编译，可以根据本机的CPU核数设置相应的值。
+
+编译完成后，同样会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录，如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录，选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录，分别进入这些子目录下，执行以下命令：
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。下图为生成的 ``v2`` 英文文档首页示例。注意，示例中由于使用了sphinx的原始主题，所以页面的风格与官网并不一致，但这并不影响开发者进行调试。
+
+..  image:: src/doc_en.png
+    :align: center
+    :scale: 60 %
+
+如何书写文档
+============
+
+PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程进行书写。
+
+如何更新www.paddlepaddle.org
+============================
+
+更新的文档以PR的形式提交到github中，提交方式参见 `如何贡献文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/dev/write_docs_cn.html>`_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ 和
+`英文文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。
+
+
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/fluid/dev/write_docs_en.rst b/doc/fluid/dev/write_docs_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6105455e202e4704aa25f0fd9916b9b61a569702
--- /dev/null
+++ b/doc/fluid/dev/write_docs_en.rst
@@ -0,0 +1,139 @@
+########################
+Contribute Documentation
+########################
+
+PaddlePaddle's documentation includes both Chinese and English versions. The documentation is built using the ``cmake`` command to drive the ``sphinx`` compiler. The PaddlePaddle.org tool helps us to implement this compilation process and provides better preview results.
+
+How to build Documentation
+===========================
+
+PaddlePaddle's documentation is built in two ways: using the PaddlePaddle.org tool and without using it. Both methods have their own advantages. The former facilitates previewing, while the latter facilitates debugging by the developer. We could choose to build the documentation with Docker or without it in each of the above ways.
+
+We recommend using PaddlePaddle.org tool to build documentation.
+
+Using PaddlePaddle.org tool
+-----------------------------
+This is the recommended method to build documentation, because it can automatically compile the documentation and preview the documentation directly in a web page. Note that, although you can preview the documentation in other ways, its style may not be consistent with the official website. Compiling with the PaddlePaddle.org tool produces a preview that will be consistent with the official website documentation style.
+
+The PaddlePaddle.org tool can be used with Docker and Docker needs to be installed first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After installing Docker, you may use the following commands to activate the tool
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories. You may only clone the contents you need
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command.
+Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation.
+The compiled documentation will be stored in <paddlepaddle working directory>/.ppo_workspace/content.
+
+
+If you don't wish to use Docker, you can also activate the tool through Django. Use the following commands to set it up
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+
+Specify the PaddlePaddle working directory for the environment variable CONTENT_DIR so that the tool could find where the working directory is.
+
+Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation.
+The compiled documentation will be stored in <paddlepaddle working directory>/.ppo_workspace/content.
+
+Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ for more information about the PaddlePaddle.org tool.
+
+
+Manually Building the Documentation
+-------------------------------------
+
+To build PaddlePaddle's documentation with Docker, you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. This method is quite similar to `Build From Sources <http://paddlepaddle.org/docs/develop/documentation/en/build_and_install/build_from_source_en.html>`_ , by constructing, from source code, a docker image that can be used to build PaddlePaddle documentation. Enter the Docker container and use the script ``build.sh`` in the source directory to build the PaddlePaddle documentation. The specific steps are as follows:
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+
+   # Construct a docker image from source code
+   docker build -t paddle:dev .
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+   # Use build.sh to build PaddlePaddle documentation
+   bash -x /paddle/paddle/scripts/docker/build.sh
+
+Note: The above commands map the current directory (source root directory) to the :code:`/paddle` directory in the container.
+
+After compiling, there should be two generated directories: ``doc/v2`` and ``doc/fluid``, where three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following commands:
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+Use a web browser and navigate to http://localhost:8088 to see the compiled ``v2`` and ``fluid`` Chinese/English documentation pages and English API pages.
+
+If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
+
+.. code-block:: bash
+
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   mkdir -p build
+   cd build
+   cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+
+   # If you only need to build documents, use the following commands
+   make -j $processors paddle_docs
+
+   # If you only need to build APIs, use the following commands
+   make -j $processors paddle_apis
+
+$processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine.
+
+After compiling, there also should be two generated directories: ``doc/v2`` and ``doc/fluid`` . If you chose to build documents, two subdirectories ``cn/html/`` and ``en/html`` will be generated in both directories. If you chose to build APIs, a subdirectory ``api/en/html`` will be generated. Please enter these directories respectively and execute the following commands:
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+Use a web browser and navigate to http://localhost:8088 to see the compiled ``v2`` and ``fluid`` Chinese/English documentation pages and English API pages. The following figure is an example of the built ``v2`` English documentation home page. Note that because the example uses sphinx's original theme, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
+
+..  image:: src/doc_en.png
+    :align: center
+    :scale: 60 %
+
+How to write Documentation
+===========================
+
+PaddlePaddle uses `sphinx`_ to compile documentation. Please check the sphinx official website for more details.
+
+How to update www.paddlepaddle.org
+===================================
+
+Please create PRs and submit them to GitHub; for details, please check `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_.
+The PaddlePaddle develop branch will update the documentation once the PR is merged. Users may check the latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
+`English Docs <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_.
+
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/source/faq/faq.rst b/doc/fluid/faq/faq.rst
similarity index 100%
rename from source/faq/faq.rst
rename to doc/fluid/faq/faq.rst
diff --git a/source/faq/index_cn.rst b/doc/fluid/faq/index_cn.rst
similarity index 100%
rename from source/faq/index_cn.rst
rename to doc/fluid/faq/index_cn.rst
diff --git a/doc/fluid/faq/index_en.rst b/doc/fluid/faq/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..395c1109891b5a00eab6f0b44d855658def7fdd6
--- /dev/null
+++ b/doc/fluid/faq/index_en.rst
@@ -0,0 +1,2 @@
+FAQ
+------------
diff --git a/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
new file mode 100644
index 0000000000000000000000000000000000000000..79df6c59578e2acf495a3453ab61f069c3f09a49
--- /dev/null
+++ b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
@@ -0,0 +1,1819 @@
+
+# Paddle Fluid 开发者指南
+
+---
+
+### ==1==. 为什么需要 PaddlePaddle Fluid？
+
+---
+
+### 两个基础问题
+
+<font size=6>
+
+1. 如何描述机器学习模型和优化过程？
+    - 完备自洽，表达能力足以支持潜在出现的各种计算需求
+1. 如何充分利用资源高效计算？
+    - 支持异步设备、多卡、分布式计算
+    - 降低计算/计算优化的开发成本
+    - ……
+
+</font>
+
+---
+
+### 如何描述模型和优化过程？
+
+<font size=6>
+
+<table>
+<thead>
+<tr>
+<th> </th>
+<th>一组连续执行的layers</th>
+<th>variable和operator构成的计算图 </th>
+<th>不再有模型的概念 </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> 2013</td>
+<td> Caffe，Theano, Torch, PaddlePaddle </td>
+<td> </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> 2015 </td>
+<td> </td>
+<td> TensorFlow, MxNet, Caffe2, ONNX, n-graph </td>
+<td> </td>
+</tr>
+<tr>
+<td>2016 </td>
+<td> </td>
+<td> </td>
+<td> PyTorch, TensorFlow Eager Execution, <font color=#483D8B>**==PaddlePaddle Fluid==**</font> </td>
+</tr>
+
+</tbody>
+</table>
+
+---
+
+
+### <p align="center">目标 </p>
+
+<font size=6>
+
+- 提高对各类机器学习任务的描述能力：能够描述潜在出现的任意机器学习模型。
+- 代码结构逻辑清晰，各模块充分解耦：内外部贡献者能够专注于自己所需的功能模块，基于框架进行再次开发。
+- 从设计上，留下技术优化的空间和潜力。
+- 代码解耦后降低多设备支持、计算优化等的开发成本。
+- 在统一的设计理念下，实现自动可伸缩，自动容错的分布式计算。
+
+</font>
+
+---
+
+## ==2.== Design Overview
+
+---
+
+# Fluid: 系统形态
+
+- <span style="background-color:#ACD6FF;">[编译器式的执行流程，区分编译时和运行时](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)</span>
+<br>
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid-compiler.png" width=100%>
+</p>
+
+---
+
+#### 让我们在Fluid程序实例中，区分编译时和运行时
+
+---
+### Fluid 编译时
+
+<font size=5>
+
+- ==**定义前向计算**==
+
+  ```python
+  x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+  y_predict = fluid.layers.fc(input=x, size=1, act=None)
+  y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+  cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+  avg_cost = fluid.layers.mean(x=cost)
+  ```
+
+- ==**添加反向、正则、优化**==
+  ```python
+  learning_rate = 0.01
+  sgd_optimizer = fluid.optimizer.SGD(learning_rate)
+  sgd_optimizer.minimize(avg_cost)
+  ```
+</font>
+
+---
+
+### `Program` vs. 计算图
+
+<font size=5>
+
+- 在科学计算领域，计算图是一种描述计算的经典方式。下图展示了从前向计算图（蓝色）开始，通过添加反向（红色）和优化算法相关（绿色）操作，构建出整个计算图的过程：
+-
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/graph_construction_example_all.png" width=60%>
+</p>
+
+
+- Fluid ==使用`Program`而不是计算图==来描述模型和优化过程。`Program`由`Block`、`Operator`和`Variable`构成，相关概念会在后文详细展开。
+- 编译时 Fluid 接受前向计算（这里可以先简单地理解为是一段有序的计算流）`Program`，为这段前向计算按照：前向 -> 反向 -> 梯度 clip -> 正则 -> 优化 的顺序，添加相关 `Operator`和`Variable`到`Program`，得到完整的计算。
+
+</font>
+
+---
+
+### Fluid 运行时
+
+<font size=5>
+
+- ==**读入数据**==
+
+  ```python
+  train_reader = paddle.batch(
+      paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500),
+      batch_size=20)
+  feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+  ```
+- ==**定义执行程序的设备**==
+  ```python
+  place = fluid.CPUPlace()
+  feeder = fluid.DataFeeder(place=place,feed_list=[x, y])
+  ```
+
+- ==创建执行器（Executor），执行初始化 `Program`和训练`Program`==
+
+  ```python
+  exe = fluid.Executor(place)
+  exe.run(fluid.default_startup_program())
+  PASS_NUM = 100
+  for pass_id in range(PASS_NUM):
+      for data in train_reader():
+          avg_loss_value, = exe.run(fluid.default_main_program(),
+                                    feed=feeder.feed(data),
+                                    fetch_list=[avg_cost])
+          print(avg_loss_value)
+  ```
+</font>
+
+---
+
+### 总结：框架做什么？用户做什么？
+<br>
+
+<font size=5>
+<table>
+<thead>
+<tr>
+<th>构建训练</th>
+<th>执行训练</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+<span style="background-color:#B3D9D9">用户</span>：描述前向运算<br><span style="background-color:#DAB1D5;">框架</span>：添加反向运算<br><span style="background-color:#DAB1D5;">框架</span>：添加优化运算<br><span style="background-color:#DAB1D5;">框架</span>：添加内存优化<br><span style="background-color:#DAB1D5;">框架</span>：添加并行/多设备/分布式相关的计算单元
+</td>
+
+<td>
+<span style="background-color:#DAB1D5;">框架</span>：创建Operator（计算）+ Variable（数据）<br><span style="background-color:#DAB1D5;">框架</span>：创建`Block`<br><span style="background-color:#DAB1D5;">框架</span>：内存管理/设备管理<br><span style="background-color:#DAB1D5;">框架</span>：执行计算
+</td>
+</tr>
+</tbody>
+</table>
+</font>
+
+---
+
+### <p align="center">总结：编译时</p>
+<font size=5>
+
+<span style="background-color:#A3D1D1;">**用户编写一段Python程序，描述模型的前向计算**</span>
+1. 创建变量描述 `VarDesc`
+1. 创建operators的描述 `OpDesc`
+1. 创建operators的属性
+1. 推断变量的类型和形状，进行静态检查：`inferShape`
+1. 规划变量的内存复用
+1. 创建反向计算
+1. 添加优化相关的Operators
+1. （可选）添加多卡/多机相关的Operator，生成在多卡/多机上运行的程序
+
+</font>
+
+---
+
+### <p align="center">总结：运行时</p>
+<font size=5>
+
+<span style="background-color:#C7C7E2;">**执行规划好的计算**</span>
+1. 创建`Executor`
+1. 为将要执行的一段计算，在层级式的`Scope`空间中创建`Scope`
+1. 创建`Block`，依次执行`Block`
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/compile_run_time.png" width=50%><br>
+<font size=3> Figure. 编译时运行时概览</font>
+</p>
+
+</font>
+
+---
+<!-- *template: invert -->
+## ==3==. 用户如何描述计算？
+---
+
+### Fluid：==像写程序一样==定义计算
+<font size=5>
+
+- 顺序执行
+    ```python
+    x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    ```
+
+- 条件分支: [switch](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md)、[ifelse](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md)
+
+   ```python
+   a = fluid.Var(10)
+   b = fluid.Var(0)
+
+   switch = fluid.switch()
+   with switch.block():
+      with switch.case(fluid.less_equal(a, 10)):
+          fluid.print("Case 1")
+      with switch.case(fluid.larger(a, 0)):
+          fluid.print("Case 2")
+      with switch.default():
+          fluid.print("Case 3")
+   ```
+
+>[A Lisp cond form may be compared to a continued if-then-else as found in many algebraic programming languages](https://www.cs.cmu.edu/Groups/AI/html/cltl/clm/node84.html).
+
+</font>
+
+---
+
+### Fluid: ==像写程序一样==定义计算
+
+<font size=5>
+
+- 循环：[while](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+  ```python
+  d0 = layers.data("d0", shape=[10], dtype='float32')
+  data_array = layers.array_write(x=d0, i=i)
+  array_len = layers.fill_constant(shape=[1],dtype='int64', value=3)
+
+  cond = layers.less_than(x=i, y=array_len)
+  while_op = layers.While(cond=cond)
+  with while_op.block():
+      d = layers.array_read(array=data_array, i=i)
+      i = layers.increment(x=i, in_place=True)
+      layers.array_write(result, i=i, array=d)
+      layers.less_than(x=i, y=array_len, cond=cond)
+  ```
+
+- 完整实例请点击查看 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_while_op.py#L36-L44)
+- beam search  [->]( https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+</font>
+
+---
+
+#### <p align="center">总结</p>
+
+<font size=5>
+
+1. 用户层提供的描述语法具有完备性、自洽性，有能力支持对复杂计算过程描述
+1. 使用方式和核心概念可以类比编程语言，认知能够直接迁移
+1. 能够支持：定义问题，逐步求解
+
+</font>
+
+---
+
+## ==3.== 核心概念
+
+---
+### 编译时概念 ：==变量和计算的描述==
+
+<font size=5>
+
+- `VarDesc` + `TensorDesc` + `OpDesc` -> `BlockDesc` -> `ProgramDesc`
+    - https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto
+
+- <span style="background-color:#DAB1D5;">什么是 Fluid Program</span>
+
+  - 在Fluid中，一个神经网络任务（训练/预测）被描述为一段`Program`
+  - `Program`包含对`Variable`（数据）和 `Operator`（对数据的操作）的描述
+  - `Variable` 和 `Operator` 被组织为多个可以嵌套的`Block`，构成一段完整的`Fluid Program`
+
+
+>编译阶段最终，经过 Transpiler 的执行规划，变换处理，生成使用`protobuf`序列化后的`ProgramDesc`。可以发送给多卡或者网络中的其它计算节点执行
+
+</font>
+
+---
+
+### 编译时概念 ：==**[Transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)**==
+<font size=5>
+
+1. 接受一段`ProgramDesc`作为输入，生成一段新的`ProgramDesc`
+
+    - *Memory optimization transpiler*：向原始`ProgramDesc` 中插入 `FreeMemoryOps`，在一次迭代优化结束前提前释放内存，使得能够维持较小的 memory footprint
+
+    - *Distributed training transpiler*：将原始的`ProgramDesc`转化为对应的分布式版本，生成两段新的`ProgramDesc`:
+        1. trainer进程执行的`ProgramDesc`
+        1. parameter server执行的`ProgramDesc`
+
+1. ==**WIP**==: 接受一段`ProgramDesc`，生成可直接被`gcc`, `nvcc`, `icc`等编译的代码，编译后得到可执行文件
+
+</font>
+
+---
+### Transpiler
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/transpiler.png" width=70%>
+</p>
+
+---
+
+### 打印 `ProgramDesc`
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/print_fluid_program.png" width=70%>
+</p>
+
+<font size=5>
+
+- `default_startup_program`：创建可学习参数，对参数进行初始化
+- `default_main_program`：由用户定义的模型，包括了前向、反向、优化及所有必要的计算
+
+- 打印可读的 `Program`
+  ```python
+  from paddle.v2.fluid import debuger
+  print debuger.pprint_program_codes(framework.default_main_program().desc)
+  ```
+</font>
+
+---
+### 输出效果
+
+<font size=5>
+
+<table>
+<thead>
+<th>variable in block 0</th>
+<th>variable in block 0</th>
+</thead>
+<tbody>
+<tr>
+<td><img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/program_desc1.png" width=70%></td>
+<td><img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/program_desc2.png" width=70%></td>
+</tr>
+</tbody>
+</table>
+</font>
+
+---
+
+### 运行时概念
+
+<font size=5>
+
+- 数据相关
+  - `Tensor` / `LoDTensor` / `Variable`
+  - `Scope`
+
+- 计算相关
+  - `Block`
+  - `Kernel`、`OpWithKernel`、`OpWithoutKernel`
+
+<table>
+<thead>
+<th></th>
+<th>protobuf messages</th>
+<th>C++ class objects</th>
+</thead>
+<tbody>
+<tr>
+<td>Data</td>
+<td>[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107)
+</td>
+<td>[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24)
+</td>
+</tr>
+
+<tr>
+<td>Operation</td>
+<td>[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35)
+</td>
+<td>[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64)
+</td>
+</tr>
+<tr>
+<td>Block</td>
+<td>BlockDesc
+</td>
+<td>Block
+</td>
+</tr>
+
+
+</tbody>
+</table>
+
+- 执行相关 ：`Executor`
+
+</font>
+
+---
+#### Tensor 和 LoD(Level-of-Detail) Tensor
+<font size=5>
+
+- Tensor 是$n$-dimensional array的推广，LoDTensor是在Tensor基础上附加了序列信息
+- Fluid中输入、输出，网络中的可学习参数全部统一使用LoDTensor（n-dimension array）表示
+- 一个mini-batch输入数据是一个LoDTensor
+  - 在Fluid中，RNN 处理变长序列无需padding，得益于 `LoDTensor`表示
+  - 可以简单将 LoD 理解为：`std::vector<std::vector<int>>`
+  - 对非序列数据，LoD 信息为空
+
+<table>
+<thead>
+<th></th>
+<th>TensorFlow</th>
+<th>PaddlePaddle</th>
+</thead>
+<tbody>
+<tr>
+<td>RNN</td>
+<td>Support
+</td>
+<td>Support
+</td>
+</tr>
+
+<tr>
+<td>recursive RNN</td>
+<td>Support
+</td>
+<td>Support
+</td>
+</tr>
+<tr>
+<td>padding zeros</td>
+<td>Must
+</td>
+<td>No need
+</td>
+</tr>
+<tr>
+<td>blob data type</td>
+<td>Tensor
+</td>
+<td>LoDTensor
+</td>
+
+</tr>
+</tbody>
+</table>
+
+</font>
+
+---
+#### LoD 信息实例
+
+<font size=4>
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/LoDTensor.png" width=43%>
+</p>
+
+- 图(a)的LoD 信息
+  ```cpp
+  [0, 5, 8, 10, 14]
+  ```
+- 图(b)的 LoD 信息
+  ```cpp
+  [[0, 5, 8, 10, 14] /*level=1*/, [0, 2, 3, 5, 7, 8, 10, 13, 14] /*level=2*/]
+  ```
+</font>
+
+---
+#### Tensor, Variable, Scope 之间的关系
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/scope_variable_tensor.png" width=40%>
+</p>
+<font size=5>
+
+1. `Block` 是一个实现层的概念，不在应用层暴露给用户。目前用户无法自行创建并利用`Block`，用户能够感知的只有`Program`这个概念。
+1. 逻辑上，可以将 `Block` 类比为编程语言中的大括号：定义了一段作用域，其中运行一段代码
+1. `Executor`会为每一个`Block`创建一个`Scope`，`Block`是可嵌套的，因此`Scope`也是可嵌套的
+
+</font>
+
+---
+### Executor
+
+<font size=5>
+
+<table>
+<thead>
+<th>接口</th>
+<th>说明</th>
+</thead>
+<tbody>
+<tr>
+<td><p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/executor.png" width=60%>
+</p></td>
+<td><span style="background-color:#B3D9D9;">输入</span><br>1. `ProgramDesc`<br>2. `Scope`<br> 3.`block_id`<br><br><span style="background-color:#B3D9D9;">解释执行步骤</span><br>1. 创建所有 Variables<br> 2. 逐一创建 Operator 并运行
+</td>
+</tr>
+</tbody>
+</table>
+
+</font>
+
+---
+### Operator/OpWithKernel/Kernel
+<font size=5>
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/operator1.png" width=50%>
+</p>
+
+- operator 无状态，Operator的核心是==Run==方法
+- 一个operator可以注册多个kernel
+- operator 可以无 kernel：while_op 、ifelse op
+
+</font>
+
+---
+#### Fluid Operator vs. PaddlePaddle layers
+<font size=5>
+
+<table>
+<thead>
+<th>Layer</th>
+<th>Operator</th>
+</thead>
+<tbody>
+<tr>
+<td><p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/layer.png" width=70%>
+</p></td>
+<td><p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/operator2.png" width=73%>
+</p></td>
+</tr>
+
+<tr>
+<td>1. 内部维护状态<br>2. 包含forward和backward方法</td>
+<td>1. 内部无状态<br>2. 只有Run方法</td>
+</tr>
+</tbody>
+</table>
+
+</font>
+
+---
+
+### ==4.== 内存管理
+
+---
+### 目标
+
+- 为异构设备提供统一的内存分配、回收接口
+- 最小化管理内存所需的时间，最小化管理开销
+- 减少内存碎片
+- 将内存管理与计算（Operators/Kernels）完全剥离
+- 统一内存管理是内存优化的基础
+
+---
+
+<font size=5>
+
+### Memory 接口
+
+- 内存管理模块向上层应用逻辑提供三个基础接口：
+  ```cpp
+  template <typename Place>
+  void* Alloc(Place place, size_t size);
+
+  template <typename Place>
+  void Free(Place place, void* ptr);
+
+  template <typename Place>
+  size_t Used(Place place);
+
+  struct Usage : public boost::static_visitor<size_t> {
+    size_t operator()(const platform::CPUPlace& cpu) const;
+    size_t operator()(const platform::CUDAPlace& gpu) const;
+  };
+  ```
+- 模板参数 `Place` 指示内存分配发生的设备
+- 实现时，需特化支持的 `Place`， 提供以上三个接口的实现
+
+</font>
+
+---
+### 代码结构
+
+<font size=5>
+
+内存管理模块可以理解为由以下两部分构成：
+
+1. SystemAllocator：实际从物理设备上分配、释放内存的接口
+1. BuddyAllocator：内存管理算法
+
+</font>
+
+---
+### System Allocator
+
+<font size=5>
+
+- SystemAllocator 是实现物理内存分配、回收的基类
+    - 不同设备上的内存分配和回收终将转化为标准接口调用
+    - 为不同设备实现MemoryAllocator，继承自SystemAllocator
+
+  ```cpp
+  class SystemAllocator {
+   public:
+    virtual ~SystemAllocator() {}
+    virtual void* Alloc(size_t& index, size_t size) = 0;
+    virtual void Free(void* p, size_t size, size_t index) = 0;
+    virtual bool UseGpu() const = 0;
+  };
+  ```
+</font>
+
+---
+
+### CPU/GPU Allocator
+
+<font size=5>
+
+```cpp
+class CPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+};
+
+#ifdef PADDLE_WITH_CUDA
+class GPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+ private:
+  size_t gpu_alloc_size_ = 0;
+  size_t fallback_alloc_size_ = 0;
+};
+#endif
+```
+- CPUAllocator和GPUAllocator分别继承自SystemAllocator，分别调用相应的标准库函数实现物理内存的分配和释放。
+- 一旦大块、连续的物理内存分配之后，将通过内存管理算法实现内存的按块分配、回收、重用等。
+
+</font>
+
+---
+### CPU Allocator
+
+<font size=5>
+
+- CPU 内存的分配提供两种选项：
+    1. non-pinned memory：可分页内存
+    2. pinned memory：页锁定内存
+        - 分配过大的页锁定内存有可能因为系统可使用的分页内存减少，影响系统性能，默认CPU下分配的是可分页内存
+
+- 通过gflags进行设置一次性分配内存的大小以及是否使用页锁定内存。
+
+   ```cpp
+   DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+   DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+                 "Default use 100% of CPU memory for PaddlePaddle,"
+                 "reserve the rest for page tables, etc");
+   ```
+
+</font>
+
+---
+### GPU Allocator
+
+<font size=5>
+
+- 通过 cudaMalloc 分配GPU显存
+- GPUAllocator::Alloc 首先会计算指定GPU device上的可用显存
+    - 如果可用显存不小于请求分配大小，调用cudaMalloc进行分配
+    - 如果可用显存不足，目前会报错退出。
+- 通过gflags控制GPU下一次性分配显存的大小：
+
+  ```cpp
+  DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
+                "Default use 92% of GPU memory for PaddlePaddle,"
+                "reserve the rest for page tables, etc");
+  ```
+
+</font>
+
+---
+#### 内存管理算法:  [Buddy Memory Allocation](https://en.wikipedia.org/wiki/Buddy_memory_allocation)
+
+<font size=5>
+
+- Memory Arena：一次性分配大块连续内存，之后会基于这块内存进行内存管理：动态分配、释放、重用内存块。
+- 伙伴内存分配：
+    - 将内存划分为 2 的幂次方个分区，使用 best-fit 方法来分配内存请求。
+    - 当释放内存时，检查 buddy 块，查看相邻的内存块是否也已被释放。如果是，将内存块合并，以最小化内存碎片。
+    - 分配的内存在物理内存的自然边界对齐，提高内存访问效率。
+    - 算法的时间效率高，但由于使用 best-fit 方法的缘故，会产生一定的内存浪费
+
+</font>
+
+---
+
+### Buddy Allocator
+
+<font size=5>
+
+- BuddyAllocator 是一个单例，每个设备（如： CPU/GPU(0)/GPU(1)） 拥有一个BuddyAllocator
+- BuddyAllocator 内部拥有一个私有成员变量 SystemAllocator
+- 当请求的内存超过BuddyAllocator管理的空余内存时，将会调用SystemAllocator去指定的设备上分配物理内存
+
+</font>
+
+---
+### 实例：CPU 下内存管理接口的实现
+
+<font size=5>
+
+- 对上层应用，统一通过BuddyAllocator来实现内存的分配、释放以及用量查询
+    ```cpp
+    template <>
+    void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+      VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+      void* p = GetCPUBuddyAllocator()->Alloc(size);
+      VLOG(10) << "  pointer=" << p;
+      return p;
+    }
+
+    template <>
+    void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+      VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+      GetCPUBuddyAllocator()->Free(p);
+    }
+
+    template <>
+    size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
+      return GetCPUBuddyAllocator()->Used();
+    }
+    ```
+</font>
+
+---
+### ==5.== 多设备支持
+
+---
+### 多设备支持（一）
+
+<font size=5>
+
+- step 1：添加Place类型，<span style="background-color:#DAB1D5;">由用户实现添加到框架</span>
+   - 可以将Place类型理解为一个整数加上一个枚举型，包括：设备号 + 设备类型
+
+    <p align="center">
+    <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/place.png" width=40%>
+    </p>
+- DeviceContext
+    - 不同的Place会对应一个相应的DeviceContext，用于组织管理与设备相关的信息
+      - 例如，GpuDeviceContext中会管理Cuda stream
+    - 目前实现中一些特殊的库也会对应有自己的DeviceContext：例如：
+      ```cpp
+      class MKLDNNDeviceContext : public CPUDeviceContext {……}
+      ```
+    - 每种设备对应的DeviceContext需要管理的内容不尽相同，视具体需求来实现
+
+</font>
+
+---
+
+### 多设备支持（二）
+
+<font size=5>
+
+- step 2: 增加KernelType，为相应的KernelType注册Kernel对象，<span style="background-color:#DAB1D5;">由用户实现注册给框架</span> 可以按照：
+    1. Place 执行设备
+    1. DataType 执行数据类型 FP32/FP64/INT32/INT64
+    1. Memory layout： 运行时 Tensor 在内存中的排布格式 NCHW、 NHWC
+    1. 使用的库
+
+    来区分Kernel，为同一个operator注册多个 Kernel。
+
+    ```cpp
+    struct OpKernelType {
+      proto::DataType data_type_;
+      DataLayout data_layout_;
+      platform::Place place_;
+      LibraryType library_type_;
+    }
+    ```
+
+</font>
+
+---
+
+### 多设备支持（三）
+
+<font size=5>
+
+step 3: 运行时的 KernelType 推断和Kernel切换，<span style="background-color:#DAB1D5;">按需要修改Kernel推断和Kernel切换规则</span>
+- Expected Kernel：期待调用的Kernel：由（1）`Place`和计算精度决定；或（2）用户在配置中显式指定使用的计算库，如`cudnn`、`mkldnn`等。
+- Actual Kernel：运行时从`Operator`的输入（`Variable`）可以推断出实际需要的`KernelType`
+- 当Expected Kernel和Actual Kernel不一致的时候，框架会插入`data_transformer`或者`data_layout_transform`等，保证Expected Kernel可以执行，包括：
+   - CPUPlace -> GPUPlace ：跨设备内存复制
+   - NCHW -> nChw8c ：Layout转换
+   - FP32 -> FP16 ：精度转换 _**尚未支持**_
+   - ……
+- 以上过程实现在OperatorWithKernel类的Run方法中 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.cc#L497)
+
+</font>
+
+---
+## ==6.== while_op
+
+---
+### while_op
+
+<font size=5>
+
+- 循环执行一段`Program`，直到条件operator判断循环条件不满足时终止循环
+- while_op 的特殊之处：
+  1. while_op 没有 kernel
+  1. while_op 拥有自己的`Block`，会形成一段嵌套的`Block`
+  1. ==while_op 内部创建了一个 Executor，来循环执行`Block`==
+
+- while_op 输入输出 ： LoDTensorArray
+    ```cpp
+    namespace paddle {
+    namespace framework {
+    using LoDTensorArray = std::vector<LoDTensor>;
+    }
+    }
+    ```
+    - 每一次循环，从原始输入中“切出”一个片段
+    - LoDTensorArray 在Python端暴露，是Fluid支持的基础数据结构之一，用户可以直接创建并使用
+
+</font>
+
+---
+### while_op [Run](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/while_op.cc#L42) 方法概览
+
+<font size=5>
+
+```cpp
+
+void Run(const framework::Scope &scope,
+         const platform::Place &dev_place) const override {
+  PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
+  auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
+  PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
+
+  framework::Executor executor(dev_place);
+  auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
+  auto *program = block->Program();
+  auto step_scopes =
+      scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
+
+  while (cond.data<bool>()[0]) {
+    auto &current_scope = scope.NewScope();
+    step_scopes->push_back(&current_scope);
+    executor.Run(*program, &current_scope, block->ID(),
+                   false /*create_local_scope*/);
+  }
+}
+
+```
+
+</font>
+
+---
+### while_op 的重要应用：Dynamic RNN
+
+---
+
+### 什么是 `dynamicRNN` ?
+
+<font size=5>
+<br>
+
+1. 用户可以自定义在一个时间步之内的计算, 框架接受序列输入数据，在其上循环调用用户定义的单步计算
+1. 可学习参数在多个时间步之间共享
+1. `dynamicRNN` 由 `while_op` 实现
+1. 如果`dynamicRNN`中定义了`memory`，将会构成一个循环神经网络，否则其行为就等于在输入序列上循环调用预定义的单步计算
+
+</font>
+
+---
+
+#### `dynamic RNN` 用户接口
+<font size=5>
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/user_interface.png" width=75%>
+</p>
+
+- `dynamicRNN` 中的重要元素
+  1. **step input**: `dynamicRNN` 每个时间步的输入
+  1. **step function**: 用户定义的单步计算
+  1. **memory**: 用于形成循环连接
+  1. **external/static memory**：单步计算的每一步都可以全部读取到的外部输入
+
+</font>
+
+---
+
+#### dynamicRNN 中的 Memory
+
+<font size=5>
+
+`dynamicRNN`中`memory`的行为非常类似于 C++ 中的引用变量
+  - `memory` “指向” 一个operator的输出变量，记作： A
+  - `memory` 可以被 LoDTensor 初始化（当LoD信息为空时，为非序列，否则为序列）,默认`memory`被初始化为零
+  - `memory` 在 operator A 前向计算之后，进行前向计算
+  - `memory` 的前向计算会 "指向" A 的输出 LoDTensor
+  - `memory` 的输出可以是另一个 operator 的输入，于是形成了“循环”连接
+
+</font>
+
+---
+
+### DynamicRNN 实现细节
+
+<font size=5>
+
+- `while_op` <span style="background-color:#DAB1D5;">无法独立构成dynamicRNN</span>，必须和一组相关的 operator 及数据结构配合
+    - 依赖的 operators (这里仅列出最重要的，并非全部):
+        - `lod_rank_table` operator
+        - `lod_tensor_to_array` operator
+        - `array_to_lod_tensor` operator
+        - `shrink_memory` operator
+    - 依赖的数据结构
+        - `TensorArray`
+        - `LoDRankTable`
+
+- 在Fluid中，RNN接受变长序列输入，无需填充，以上数据结构和相关的operator配合工作，实现了对变长输入以batch计算
+
+</font>
+
+---
+
+### `dynamicRNN` 如何实现 batch 计算 ?
+
+<font size=5>
+
+- 问题：
+  - RNN 可以看作是一个展开的前向网络，前向网络的深度是最长序列的长度
+  - 如果不对变长序列进行填充（即将它们填充到一样长度），每个mini-batch输入将会不等长，每个样本展开长度不一致，导致前向和反向计算实现困难
+
+</font>
+
+----
+##### 实例 ：RNN encoder-decoder with attention
+
+<font size=5>
+
+- 以机器翻译的RNN encoder-decoder 模型（涉及了`dynamicRNN`的所有设计要素）为例，下图是 RNN encoder-decoder 的原始输入：
+  <p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/raw_input.png" width=100%><br><font size=3> Figure. RNN encoder-decoder 原始batch 输入数据</font>
+  </p>
+
+- source word sequences 是encoder RNN的输入，是一个LoDTensor
+- target word sequences 是lookup_table的输入，是一个LoDTensor
+- 上图中一个矩形方块是CPU/GPU内存中一片连续的内存空间，表示一个dense vector
+
+</font>
+
+---
+
+### `dynamicRNN` 如何实现 batch 计算 ?
+
+<font size=5>
+
+1. 对一个mini batch中不等长样本进行排序，最长样本变成batch中的第一个，最短样本是batch中最后一个
+      - `LoDTensor` -> `LoDRankTable` :heavy_plus_sign: `lod_rank_table operator`
+          - 可以将`LoDRankTable`理解为对LoDTensor中的多个序列按照长度排序，LoDRankTable 存储了排序之后的index
+
+2. 构建每个时间步的batch输入：随着时间步增加，每个时间步的batch输入可能会逐渐缩小
+    - `TensorArray` :heavy_plus_sign: `lod_tensor_to_array` -> `LoDTensor` (without LoD)
+3. 每个时间步输出写入一个输出 `LoDTensorArray`
+3. `dynamicRNN`循环结束后, 按照`LoDRankTable`中记录的信息对输出`LoDTensorArray`重排序，还原回原始输入顺序
+    - `TensorArray` :heavy_plus_sign: `array_to_lod_tensor` -> `LoDTensor`
+
+</font>
+
+---
+
+### 运行实例
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/sorted_input.png" width=100%>
+</p>
+
+---
+### 运行实例
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/1.png" width=100%>
+</p>
+
+<font size=5>
+
+- 执行到第5~7个batch时，batch size将会缩小
+
+</font>
+
+---
+### 运行实例
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/1.png" width=80%>
+</p>
+
+<font size=5>
+
+- 第5 ~ 7个batch时RNN的`memory`会发生什么？
+    - `memory` 指向某个operator的输出Tensor，在该operator前向计算之后，“取回”其计算结果
+    - 5 ~ 7时，遇到了序列的结束，==下一个时间步计算不再需要在已经结束的序列上展开==
+    - 在`dynamicRNN`中`shrink_memory` operator 用来缩小`memory`的batch输入
+
+</font>
+
+---
+### 运行实例：batch 1 ~ 2
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/2.png" width=70%><br><font size=4>Figure. 第1、2个batch输入dynamicRNN的batch输入</font>
+</p>
+
+---
+### 运行实例：batch 3 ~ 4
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/3.png" width=70%><br><font size=4>Figure. 第3、4个batch输入dynamicRNN的batch输入</font>
+</p>
+
+---
+
+### 运行实例：batch 5 ~ 7
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/4.png" width=70%><br><font size=4>Figure. 第5、6、7个batch输入dynamicRNN的batch输入</font>
+</p>
+
+---
+### ==7.== Fluid 代码结构
+
+---
+### Fluid 代码结构
+
+<table>
+<thead>
+<tr>
+<th>代码结构</th>
+<th>模块结构</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_1.png" width=60%>
+</p>
+</td>
+<td>
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_2.png" width=60%>
+</p>
+</td>
+</tr>
+
+</tbody>
+</table>
+
+---
+
+### ==8.== 文档总结
+
+---
+<font size=5>
+
+- 设计概览
+  - 重构概览 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/refactorization.md)
+  - fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md)
+  - fluid_compiler [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
+- 核心概念
+  - variable 描述 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/var_desc.md)
+  - Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md)
+  - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+  - TensorArray [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md)
+  - Program [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md)
+  - Block [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md)
+  - Scope [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)
+
+---
+
+- 重要功能模块
+  - backward [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/backward.md)
+  - 内存优化 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/memory_optimization.md)
+  - evaluator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/executor.md)
+  - python API [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md)
+  - regularization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/regularization.md)
+
+- 开发指南
+  - 支持新的硬件设备库 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md)
+  - 添加新的Operator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_cn.md)
+  - 添加新的Kernel [->](
+https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md)
+
+</font>
+
+---
+
+### ==9.== 开发指南
+
+---
+
+#### 建议开发环境：使用 Docker 编译和测试
+
+<font size=5>
+
+Docker编译PaddlePaddle源码: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)
+
+PaddlePaddle 在 Dockerhub 地址：[->](
+    https://hub.docker.com/r/paddlepaddle/paddle/tags/)
+
+1. 获取PaddlePaddle的Docker镜像
+    ```bash
+    docker pull paddlepaddle/paddle:latest-dev
+    ```
+
+1. 启动 docker container
+
+    ```bash
+    docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+    ```
+
+1. 进入docker container后，从源码编译，请参考文档 [->]( http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/build_from_source_cn.html)
+
+</font>
+
+---
+
+### 一些说明
+
+<font size=5>
+
+1. PaddlePaddle的Docker镜像为了减小体积，默认没有安装vim，可以在容器中执行`apt-get install -y vim`来安装vim。
+1. 开发推荐使用tag为`latest-dev`的镜像，其中打包了所有编译依赖。`latest`及`latest-gpu`是production镜像，主要用于运行PaddlePaddle程序。
+2. 在Docker中运行GPU程序，推荐使用nvidia-docker，[否则需要将CUDA库和设备挂载到Docker容器内](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)。
+   <font size=4>
+
+   ```bash
+   nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+   ```
+   </font>
+
+
+</font>
+
+---
+
+### [如何贡献](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+
+<font size=5>
+
+- ==提交PullRequest前请务必阅读==： [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+- 代码要求
+    1. 代码注释遵守 Doxygen 的样式
+    1. 确保编译器选项 WITH_STYLE_CHECK 已打开，并且编译能通过代码样式检查
+    1. 所有代码必须具有单元测试，且能够通过所有单元测试
+- 使用 `pre-commit` 钩子提交Pull Request
+    1. 帮助格式化源代码（C++，Python）
+    1. 在提交前自动检查一些基本事宜：如每个文件只有一个 EOL，Git 中不要添加大文件等
+    1. 安装pre-commit，并在PaddlePaddle根目录运行：
+    ```bash
+      ➜  pip install pre-commit
+      ➜  pre-commit install
+    ```
+</font>
+
+---
+
+### 如何贡献
+
+<font size=5>
+
+1. 开始开发之前请先建立issue。
+    - 让其它同学知道某项工作已经有人在进行，以避免多人开发同一功能的情况。
+1. 提交PR必须关联相关的issue。做法请参考：[->](https://help.github.com/articles/closing-issues-using-keywords/)
+    - 目的：为了在提交的版本中留有记录描述这个PR是为了开发什么样的功能，为了解决什么样的问题。
+    - 当PR被merge后，关联的issue会被自动关闭。
+1. PR review 中，reviewer的每条comment都必须回复。
+    - 如修改完可直接回复：Done。
+    - 目的：review comment 中可能会有（1）询问类型的问题；（2）可以在下一个PR修改的问题；（3）comment意见不合理等。需要明确回复，以便reviewer和其他人有历史可查，便于区分是否已经进行修改，或者准备下一个PR修改，或者意见不合理可以不用进行修改。
+
+</font>
+
+---
+
+### ==10.== 添加新的 Operator
+
+---
+
+### 概念简介
+
+<font size=5>
+
+添加一个新的operator，会涉及实现以下C++类的派生类：
+
+1. `framework::OperatorBase`: Operator(简写，Op)基类。
+1. `framework::OpKernel`: Op计算函数的基类，称作Kernel。
+1. `framework::OperatorWithKernel`：继承自OperatorBase，Op有计算函数，称作有Kernel。
+1. `class OpProtoAndCheckerMaker`：描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
+
+依据是否包含kernel，可以将Op分为两种：
+1. 包含Kernel的Op：继承自OperatorWithKernel，==绝大多数operator都属于这一类==
+1. 不包含kernel的Op，继承自OperatorBase，只有少量Op属于这一类，例如while_op，ifelse_op
+
+<span style="background-color:#DAB1D5;">这里主要介绍带Kernel的Op如何编写。</span>
+
+</font>
+
+---
+
+#### 添加新的Operator需要修改/添加哪些文件？
+
+<font size=5>
+
+<table>
+<thead>
+<tr>
+<th>内容</th>
+<th>定义位置</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+OpProtoMaker定义
+</td>
+<td>
+`.cc`文件，<span style="background-color:#DAB1D5;">Backward Op不需要OpProtoMaker</span>
+</td>
+</tr>
+<tr>
+<td>
+Op定义
+</td>
+<td>
+`.cc`文件
+</td>
+</tr>
+<tr>
+<td>
+Kernel实现
+</td>
+<td>
+<span style="background-color:#DAB1D5;">CPU、CUDA共享Kernel实现在`.h`文件中</span>，否则，CPU 实现在`.cc`文件中，CUDA 实现在`.cu`文件中。
+</td>
+</tr>
+
+<tr>
+<td>
+注册Op
+</td>
+<td>
+Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，CUDA实现在`.cu`文件中
+</td>
+</tr>
+
+</tbody>
+</table>
+
+- 添加 Operator 之前请阅读：[Operator 命名规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/name_convention.md)及[Operator Markdown注释规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/op_markdown_format.md)。
+- 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。
+- 根据文件名自动构建op和Python端绑定，<span style="background-color:#DAB1D5;">请务必遵守以上命名，否则需要进一步修改PyBind相关文件及CMakeLists.txt</span>。
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step1</span>: 定义ProtoMaker类
+
+<font size=5>
+
+下面均以[clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h)为例进行介绍
+
+- clip_op计算公式：$Out = \min(\max(X, min), max)$
+- 首先定义`ProtoMaker`来描述该Op的输入、输出，并添加注释（<font size=4>*下面代码段中的注释进行了简化，实现时需按照规范添加注释*</font>）：
+
+    ```cpp
+    template <typename AttrType>
+    class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
+     public:
+      ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+          : OpProtoAndCheckerMaker(proto, op_checker) {
+        AddInput("X","(Tensor)The input of clip op.");
+        AddOutput("Out", "(Tensor),The output of clip op.");
+        AddAttr<AttrType>(
+            "min", "(float),Minimum value.");
+        AddAttr<AttrType>(
+            "max", "(float),Maximum value.");
+        AddComment(R"DOC(
+        ……
+    )DOC");
+      }
+    };
+    ```
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step2</span>: 定义Operator类
+
+<font size=5>
+
+下面的代码段实现了`clip_op`的定义：
+
+```cpp
+class ClipOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ClipOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ClipOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto max = ctx->Attrs().Get<float>("max");
+    auto min = ctx->Attrs().Get<float>("min");
+    PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+```
+</font>
+
+---
+
+### Operator 类中需要完成的工作
+
+<font size=5>
+
+1. clip_op 继承自`OperatorWithKernel`，
+
+    ```cpp
+    using framework::OperatorWithKernel::OperatorWithKernel;
+    ```
+    表示使用基类`OperatorWithKernel`的构造函数。
+
+1. 重写`InferShape`接口。
+    - `InferShape` 为const函数，不能修改Op的成员变量
+    - `InferShape` 的参数为 `framework::InferShapeContext* ctx`，从中可获取到输入输出以及属性
+    - `InferShape` 会被调用两次，一次是编译时（创建op），一次是运行时（调用op的`Run`方法时），需要完成以下功能：
+        1. 做检查， 尽早报错：检查输入数据维度、类型等是否合法
+        2. 设置输出Tensor的形状
+
+<span style="background-color:#DAB1D5;">通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中。</span>
+
+</font>
+
+---
+
+### 补充说明
+
+<font size=5>
+
+1. `InferShape`目前支持两种实现方式，<span style="background-color:#DAB1D5;">二者最后都会生成一个functor注册给OpInfo结构体。</span>
+    1. 继承framework::InferShapeBase，实现为一个functor（参考 [mul_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)）
+    2. override InferShape函数（参考 [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.cc#L24)）
+
+1. 什么是`functor` ?
+
+   - 类或结构体仅重载了`()`，一般是可被多个kernel复用的计算函数。
+
+        <font size=4>
+
+        ```cpp
+        template <typename T>
+        class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
+         public:
+          void operator()(const platform::CPUDeviceContext& ctx,
+                          framework::Tensor* out,
+                          const framework::Tensor* prob,
+                          const framework::Tensor* labels, const bool softLabel) {
+               ……
+          }
+        };
+        ```
+        </font>
+
+    - 在 clip_op 内也会看到将一段计算函数抽象为functor的用法： [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h#L27)。
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step3</span>: 定义OpKernel类
+
+<font size=5>
+
+- `ClipKernel`继承自`framework::OpKernel`，带有下面两个模板参数:
+    1. `typename DeviceContext`: 表示设备类型，不同设备共享同一个Kernel时，需添加该模板参数。不共享时，需要提供针对不同设备的特化实现。
+    1. `typename T` : 表示支持的数据类型，如`float`, `double`等
+
+- 在`ClipKernel`类中重写`Compute`方法
+    1. `Compute`接受输入参数：`const framework::ExecutionContext& context`
+        - `ExecutionContext` 是从 `Scope`中将运行时Op的输入、输出`Variable`组织在一起，使得Op在调用`Compute`方法时，能够简单地通过名字拿到需要的输入输出`Variable`
+        - 与`InferShapeContext`相比，`ExecutionContext` 中增加了设备类型
+    1. 在`Compute`函数里实现`OpKernel`的具体计算逻辑
+
+</font>
+
+---
+#### ClipKernel 代码概览
+
+<font size=5>
+
+```cpp
+template <typename DeviceContext, typename T>
+class ClipKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max = context.Attr<T>("max");
+    auto min = context.Attr<T>("min");
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const T* x_data = x->data<T>();
+    int64_t numel = x->numel();
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x_data,
+          x_data + numel, out_data, ClipFunctor<T>(min, max));
+  }
+};
+```
+
+- 为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用， Fluid 使用 Eigen 作为基础的矩阵运算库
+- Fluid对Eigen unsupported Tensor提供了一些基本的封装，可以在`Compute`接口中直接调用
+    - 关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。
+
+</font>
+
+---
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step4</span>: 实现反向Op
+
+<font size=5>
+
+- ==**反向Op没有`ProtoMaker`**==，除此之外定义与实现方式与前向Op完全一致，不再赘述
+- 这里仅对反向Op的输入输出进行说明：
+    1. 反向Op的输入
+        - 前向Op的输出
+        - 反向传播过程中传递给当前Op的梯度
+            - 需要注意，<span style="background-color:#e1c4c4;">Fluid中，不区分Cost Op和中间层Op，所有Op都必须正确处理接收到的梯度</span>
+    2. 反向Op的输出
+        - 对可学习参数的求导结果
+        - 对所有输入的求导结果
+
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step5</span>: 注册Op及Kernel
+
+<font size=5>
+
+至此Op和Op kernel都已经实现完毕，接下来，需要在`.cc`和`.cu`文件中注册op和kernel
+
+1. 在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
+
+    <font size=4>
+
+    ```cpp
+    namespace ops = paddle::operators;
+    REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
+                ops::ClipOpGrad);
+    REGISTER_OP_CPU_KERNEL(
+        clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
+    REGISTER_OP_CPU_KERNEL(
+        clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ```
+
+   - 在上面的代码片段中：
+
+     1. `REGISTER_OP` ： 注册`ops::ClipOp`类，类型名为`clip`，该类的`ProtoMaker`为`ops::ClipOpMaker`，注册`ops::ClipOpGrad`，类型名为`clip_grad`
+     1. `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op，例如：优化算法相关的Op
+     1. `REGISTER_OP_CPU_KERNEL` ：注册`ops::ClipKernel`类，并特化模板参数为`paddle::platform::CPUDeviceContext`和`float`类型，同理，注册`ops::ClipGradKernel`类
+
+    </font>
+1. 按照同样方法，在`.cu`文件中注册GPU Kernel
+   -  <span style="background-color:#e1c4c4;">如果CUDA Kernel的实现基于Eigen，需在 `.cu`的开始加上宏定义 `#define EIGEN_USE_GPU` </span>
+
+</font>
+
+---
+
+##### 编译和Python端绑定
+
+<font size=5>
+
+- 运行下面命令可以仅编译新添加的Op：
+
+  ```
+  make mul_op
+  ```
+  - <span style="background-color:#e1c4c4;">需注意，运行单元测试需要编译整个工程</span>
+
+- 如果遵循前文的文件命名规则，构建过程中，会自动为新增的op添加Python端绑定，并链接到生成的lib库中
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step6</span>: 添加前向单测及梯度检测
+
+<font size=5>
+
+- 新增Op的单元测试统一添加至：[python/paddle/fluid/tests/unittests](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/unittests)目录
+- 前向Operator单测
+
+    1. Op单元测试继承自`OpTest`，各项具体的单元测试在`TestClipOp`里完成，所有单测case都以`TestXX`命名
+    1. 单元测试Operator，需要：
+        1. 在`setUp`函数定义输入、输出，以及相关的属性参数
+        1. 生成随机的输入数据
+        1. 在Python脚本中实现与前向operator相同的计算逻辑，得到输出值，与operator前向计算的输出进行对比
+        1. 反向梯度检测流程测试框架已经实现，直接调用相应接口`check_grad`即可
+
+- `clip_op` 单测代码请参考 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_clip_op.py)，这里不再展开
+
+</font>
+
+---
+#### 编译执行单测
+
+<font size=5>
+
+- `python/paddle/fluid/tests/unittests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译
+
+    - <span style="background-color:#e1c4c4;">运行单元测试时需要编译整个工程，并且编译时需要打开`WITH_TESTING`</span>, 即`cmake paddle_dir -DWITH_TESTING=ON`
+- 编译成功后，执行下面的命令来运行单元测试：
+
+  ```bash
+  make test ARGS="-R test_mul_op -V"
+  ```
+
+  或者:
+
+  ```
+  ctest -R test_mul_op
+  ```
+</font>
+
+---
+
+### 添加Op的一些注意事项
+
+<font size=5>
+
+- 为每个Op创建单独的`*_op.h`（如有）、`*_op.cc`和`*_op.cu`（如有）。<span style="background-color:#e1c4c4;">不允许一个文件中包含多个Op</span>，将会导致编译出错。
+- 注册Op时的类型名，需要和该Op的名字一样。<span style="background-color:#e1c4c4;">不允许在`A_op.cc`里面，注册`REGISTER_OP(B, ...)`</span>，会导致单元测试出错。
+- 如果Op<span style="background-color:#e1c4c4;">没有实现CUDA Kernel，不要创建空的`*_op.cu`</span>，会导致单元测试出错。
+- 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
+
+</font>
+
+---
+
+### ==10.== 使用相关问题
+
+---
+
+### 定义前向计算
+
+<font size=5>
+
+- 当在python端执行时：
+    ```python
+    import paddle.fluid as fluid
+    ```
+    [`framework.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#L1040)定义了两个全局`Program`:
+    ```python
+    # program is a global instance.
+    _main_program_ = Program()
+    _startup_program_ = Program()
+    ```
+
+- 前向定义的过程就是不断往`main_program`中添加Op和Variable
+- 如果需要执行一个新的`main_program`时，可以调用：
+    ```python
+    def switch_main_program(program):
+        """
+        Switch the main program to a new program.
+        This function returns the previous main program.
+        """
+        ……
+    ```
+</font>
+
+---
+
+### 自定义参数的初始化
+
+<font size=5>
+
+- 调用`fluid.ParamAttr(……)`接口，自定义参数的初始化
+
+  ```python
+  w_param_attrs = ParamAttr(name=None,
+      initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+      learning_rate=1.0,
+      regularizer=L1Decay(1.0),
+      trainable=True,
+      clip=GradientClipByValue(-1.0, 1.0),
+  )
+  y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
+  ```
+
+- 补充问题：如何创建 `Variable`
+  ```python
+  cur_program = Program()
+  cur_block = cur_program.current_block()
+  new_var = cur_block.create_var(name="X", shape=[-1, 16, 16], dtype="float32")
+  ```
+
+</font>
+
+---
+
+### 添加反向Op
+
+<font size=5>
+
+- 调用`fluid.backward.append_backward(X)`（`X`是一个Variable），来为一段前向`ProgramDesc`添加反向Op
+
+    ```python
+    data = fluid.layers.data(name="data", shape=(2,3,4))
+    out = fluid.layers.fc(input=data,size=128,act=None)
+    loss = fluid.layers.reduce_sum(out)
+    fluid.backward.append_backward(loss=loss)
+    ```
+
+- 添加优化相关的Op
+    ```python
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(loss)
+    ```
+
+- 可以随时调用`print(fluid.default_main_program())`来输出当前的`main_program`
+
+- 当构建完成整个`Program`后，调用下面的接口执行内存优化：
+  ```python
+  fluid.memory_optimize(fluid.default_main_program())
+  ```
+  - _<span style="background-color:#e1c4c4;">注：内存优化目前仍在持续开发中，有可能不够稳定。</span>_
+
+</font>
+
+---
+
+### 总结：编译时执行流程
+
+<font size=5>
+
+- 用户定义前向计算
+- 添加反向Op到`default_main_program`
+- 添加 gradient clipping Op 到`default_main_program`
+- 添加 regularization Op 到`default_main_program`
+- 为指定的优化算法，添加相关的状态 variable of optimizer 到`default_startup_program`
+    - 状态相关 variable是指如学习率, 历史 momentum, 二阶momentum等
+- 添加初始化 variable 的Op 到 `default_startup_program`
+- 为整个网络最后一个op，添加设置其接收到的梯度的Op到`default_main_program`
+- 进行内存优化规划
+
+</font>
+
+---
+
+### Feed 数据 (一)：通过 feed 字典
+
+<font size=5>
+
+- 执行executor的run方法时，指定feed字典，feed op 会将指定的数据放到`x`和`y`两个Variable中
+  ```python
+  y_data = np.random.randint(0, 8, [1]).astype("int32")
+  y_tensor = core.Tensor()
+  y_tensor.set(y_data, place)
+
+  x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32")
+  x_tensor = core.Tensor()
+  x_tensor.set(x_data, place)
+  ……
+  cost = exe.run(
+      fluid.default_main_program(),
+      feed={'x': x_tensor,
+            'y': y_tensor},
+      fetch_list=[avg_cost])
+  ```
+
+- 这种方法较为底层，一般用于单测中
+
+</font>
+
+---
+
+### Feed 数据 (二)：使用 DataFeeder接口
+
+<font size=5>
+
+- 编写一个data_reader函数，data_reader是一个Python generator
+
+  ```python
+  def demo_reader():
+      def random_generator():
+          yield np.random.uniform(0.1, 1, [4]), np.random.randint(0, 1, [1])
+      return random_generator
+  ```
+- 在训练任务中使用 DataFeeder 接口
+  ```python
+  cost = exe.run(
+      fluid.default_main_program(),
+      feed={'x': x_tensor,
+            'y': y_tensor},
+      fetch_list=[avg_cost])
+
+  train_reader = paddle.batch(
+      paddle.reader.shuffle(demo_reader(), buf_size=500), batch_size=4)
+  feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+  for data in train_reader():
+      cost = exe.run(
+          fluid.default_main_program(),
+          feed=feeder.feed(data),
+          fetch_list=[cost])
+  ```
+
+</font>
+
+---
+
+### 常见问题
+
+<font size=5>
+
+- 如何使用 evaluator ? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py#L168)
+
+    ```python
+    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    for pass_id in range(PASS_NUM):
+        accuracy.reset()
+        for data in train_reader():
+            loss, acc = exe.run(fluid.default_main_program(),
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+            pass_acc = accuracy.eval(exe)
+            # acc 当前一个batch 的 accuracy
+            # pass_acc 当前batch 的 accuracy
+        pass_total_acc = accuracy.eval(exe)  # 整个pass的accuracy
+    ```
+
+- 如何在训练中测试？[->](https://github.com/dzhwinter/benchmark/blob/master/fluid/vgg16.py#L144)
+- 如何保存训练好的模型？[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L143)
+- 如何加载训练好的模型进行预测？[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L154)
+- 如何在同一个训练任务中定义多个Program，并交替运行？ [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/demo/fc_gan.py)
+- 如何profile？Fluid 实现了profile 工具，可以直接调用。请参考示例 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_profiler.py)
+
+
+</font>
+
+---
diff --git a/doc/fluid/getstarted/concepts/index_cn.rst b/doc/fluid/getstarted/concepts/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2e7f70fc4cb871a80ffaffec6c06797973cd2f85
--- /dev/null
+++ b/doc/fluid/getstarted/concepts/index_cn.rst
@@ -0,0 +1,4 @@
+基本使用概念
+============
+
+TBD
diff --git a/doc/fluid/getstarted/concepts/index_en.rst b/doc/fluid/getstarted/concepts/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..78cca1e2a3443c2949ca0655190b0f05502f519a
--- /dev/null
+++ b/doc/fluid/getstarted/concepts/index_en.rst
@@ -0,0 +1,4 @@
+Concepts
+============
+
+TBD
diff --git a/doc/fluid/getstarted/concepts/reader/README.md b/doc/fluid/getstarted/concepts/reader/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2cd4b6225b61cf374458e40afabad7745f61ba71
--- /dev/null
+++ b/doc/fluid/getstarted/concepts/reader/README.md
@@ -0,0 +1,206 @@
+# Python Data Reader Design Doc
+
+During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following:
+
+- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items.
+- A *reader creator*: A function that returns a reader function.
+- A *reader decorator*: A function, which takes in one or more readers, and returns a reader.
+- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
+
+and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators.
+
+## Data Reader Interface
+
+*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows:
+
+```
+iterable = data_reader()
+```
+
+The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.)
+
+An example implementation for single item data reader creator is as follows:
+
+```python
+def reader_creator_random_image(width, height):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height)
+    return reader
+```
+
+An example implementation for multiple item data reader creator is as follows:
+```python
+def reader_creator_random_image_and_label(width, height, label):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height), label
+    return reader
+```
+
+## Batch Reader Interface
+
+*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple.
+
+Here are some valid outputs:
+
+```python
+# a mini batch of three data items. Each data item consist three columns of data, each of which is 1.
+[(1, 1, 1),
+(2, 2, 2),
+(3, 3, 3)]
+
+# a mini batch of three data items, each data item is a list (single column).
+[([1,1,1],),
+([2,2,2],),
+([3,3,3],)]
+```
+
+Please note that each item inside the list must be a tuple, below is an invalid output:
+```python
+ # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],).
+ # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
+ # or three columns of data, each of which is 1.
+[[1,1,1],
+[2,2,2],
+[3,3,3]]
+```
+
+It is easy to convert from a reader to a batch reader:
+
+```python
+mnist_train = paddle.dataset.mnist.train()
+mnist_train_batch_reader = paddle.batch(mnist_train, 128)
+```
+
+It is also straightforward to create a custom batch reader:
+
+```python
+def custom_batch_reader():
+    while True:
+        batch = []
+        for i in xrange(128):
+            batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended.
+        yield batch
+
+mnist_random_image_batch_reader = custom_batch_reader
+```
+
+## Usage
+
+Following is how we can use the reader with PaddlePaddle:
+The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows:
+
+```python
+# two data layers are created:
+image_layer = paddle.layer.data("image", ...)
+label_layer = paddle.layer.data("label", ...)
+
+# ...
+batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128)
+paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
+```
+
+## Data Reader Decorator
+
+The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax.
+
+Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples:
+
+### Prefetch Data
+
+Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data.
+
+Use `paddle.reader.buffered` to prefetch data:
+
+```python
+buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
+```
+
+`buffered_reader` will try to buffer (prefetch) `100` data entries.
+
+### Compose Multiple Data Readers
+
+For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+
+We can do the following :
+
+```python
+def reader_creator_random_image(width, height):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height)
+    return reader
+
+def reader_creator_bool(t):
+    def reader():
+        while True:
+            yield t
+    return reader
+
+true_reader = reader_creator_bool(True)
+false_reader = reader_creator_bool(False)
+
+reader = paddle.reader.compose(paddle.dataset.mnist.train(), reader_creator_random_image(20, 20), true_reader, false_reader)
+# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
+# And we don't care about the second item at this time.
+paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
+```
+
+### Shuffle
+
+Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read.
+
+Example:
+```python
+reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
+```
+
+## Q & A
+
+### Why does a reader return only a single entry, and not a mini batch?
+
+Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead of a single entry, the training code will be more complicated because it needs to handle cases like a batch size of 2).
+
+We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader.
+
+### Why do we need a batch reader, isn't it sufficient to give the reader and batch_size as arguments during training?
+
+In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful.
+
+### Why use a dictionary instead of a list to provide mapping?
+
+Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`).
+
+### How to create a custom data reader creator ?
+
+```python
+def image_reader_creator(image_path, label_path, n):
+    def reader():
+        f = open(image_path)
+        l = open(label_path)
+        images = numpy.fromfile(
+            f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
+        images = images / 255.0 * 2.0 - 1.0
+        labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
+        for i in xrange(n):
+            yield images[i, :], labels[i] # a single entry of data is created each time
+        f.close()
+        l.close()
+    return reader
+
+# image_reader_creator creates a reader
+reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024)
+paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
+```
+
+### How is `paddle.train` implemented
+
+An example implementation of paddle.train is:
+
+```python
+def train(batch_reader, mapping, batch_size, total_pass):
+    for pass_idx in range(total_pass):
+        for mini_batch in batch_reader(): # this loop will never end in online learning.
+            do_forward_backward(mini_batch, mapping)
+```
diff --git a/doc/fluid/getstarted/concepts/save_model/model_format.md b/doc/fluid/getstarted/concepts/save_model/model_format.md
new file mode 100644
index 0000000000000000000000000000000000000000..1f12ba0497369eacc6a2db7984781b5672f45ea1
--- /dev/null
+++ b/doc/fluid/getstarted/concepts/save_model/model_format.md
@@ -0,0 +1,76 @@
+# Design Doc: Model Format
+
+## Motivation
+
+A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code.
+
+As a result, in PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters.
+
+## Implementation
+
+The topology is saved as plain text in a detailed, self-contained protobuf file.
+
+The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task.
+
+As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. The resulting binary format of one tensor is as follows.
+
+The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format.
+
+<table>
+<thead>
+<tr>
+<th>field name</th>
+<th>type </th>
+<th>description </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> version</td>
+<td> uint32_t </td>
+<td> Version of saved file. Always 0 now.</td>
+</tr>
+
+<tr>
+<td> tensor desc length  </td>
+<td> uint32_t </td>
+<td> TensorDesc(Protobuf message) length in bytes. </td>
+</tr>
+<tr>
+<td>tensor desc </td>
+<td> void*</td>
+<td> TensorDesc protobuf binary message </td>
+</tr>
+<tr>
+<td> tensor data </td>
+<td> void* </td>
+<td> Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` </td>
+</tr>
+<tr>
+<td> lod_level</td>
+<td> uint64_t </td>
+<td> Level of LoD </td>
+</tr>
+<tr>
+<td> length of lod[0] </td>
+<td> uint64_t </td>
+<td> [Optional] length of lod[0] in bytes. </td>
+</tr>
+<tr>
+<td> data of lod[0] </td>
+<td> uint64_t*   </td>
+<td> [Optional] lod[0].data() </td>
+</tr>
+<tr>
+<td>... </td>
+<td> ... </td>
+<td> ... </td>
+</tr>
+</tbody>
+</table>
+
+## Summary
+
+- We introduce a model format.
+- The model represented by its forward-pass computation procedure is saved in a **ProgramDesc** protobuf message.
+- A bunch of specified format binary tensors describe the **parameters**.
diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3daea71d0933a2774227ff2b5e744392ca6b1765
--- /dev/null
+++ b/doc/fluid/getstarted/index_cn.rst
@@ -0,0 +1,20 @@
+新手入门
+============
+
+
+如果需要快速了解PaddlePaddle的使用，可以参考以下指南。
+
+..  toctree::
+  :maxdepth: 1
+
+  quickstart_cn.rst
+
+
+在使用PaddlePaddle构建应用时，需要了解一些基本概念。
+这里以一个线性回归为例子，详细介绍了PaddlePaddle的使用流程，包括数据格式，模型配置与训练等。
+
+..  toctree::
+  :maxdepth: 1
+
+  concepts/use_concepts_cn.rst
+  developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fb20bb4f245281c3acf67c417979dc63c144fef3
--- /dev/null
+++ b/doc/fluid/getstarted/index_en.rst
@@ -0,0 +1,19 @@
+GET STARTED
+============
+
+If you want to quickly know how to use PaddlePaddle, please refer to the following guide:
+
+..  toctree::
+  :maxdepth: 1
+
+  quickstart_en.rst
+
+While using PaddlePaddle to build applications, please understand some basic concepts.
+
+Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc.
+
+..  toctree::
+  :maxdepth: 1
+
+  concepts/index_en.rst
+  developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6a964d4f8561f30aa10936d2399698c51583442c
--- /dev/null
+++ b/doc/fluid/getstarted/quickstart_cn.rst
@@ -0,0 +1,45 @@
+快速开始
+========
+
+快速安装
+--------
+
+PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
+执行下面的命令完成快速安装，版本为cpu_avx_openblas：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+如果需要安装支持GPU的版本（cuda8.0_cudnn5_avx_openblas），需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+更详细的安装和编译方法参考： :ref:`install_steps` 。
+
+快速使用
+--------
+
+创建一个 housing.py 并粘贴此Python代码：
+
+  .. code-block:: python
+
+     import paddle.dataset.uci_housing as uci_housing
+     import paddle.fluid as fluid
+
+     with fluid.scope_guard(fluid.core.Scope()):
+         # initialize executor with cpu
+         exe = fluid.Executor(place=fluid.CPUPlace())
+         # load inference model
+         [inference_program, feed_target_names,fetch_targets] =  \
+             fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
+         # run inference
+         result = exe.run(inference_program,
+                          feed={feed_target_names[0]: uci_housing.predict_reader()},
+                          fetch_list=fetch_targets)
+         # print predicted price is $12,273.97
+         print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
+
+执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..680122f25893a5a48fac103266bda4788f891f6d
--- /dev/null
+++ b/doc/fluid/getstarted/quickstart_en.rst
@@ -0,0 +1,49 @@
+Quick Start
+============
+
+Quick Install
+-------------
+
+You can use pip to install PaddlePaddle with a single command. It supports
+CentOS 6 and above, Ubuntu 14.04 and above, or MacOS 10.12, with Python 2.7 installed.
+Simply run the following command to install the cpu_avx_openblas version:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+If you need to install GPU version (cuda8.0_cudnn5_avx_openblas), run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+For more details about installation and build: :ref:`install_steps` .
+
+Quick Use
+---------
+
+Create a new file called housing.py, and paste this Python
+code:
+
+
+  .. code-block:: python
+
+     import paddle.dataset.uci_housing as uci_housing
+     import paddle.fluid as fluid
+
+     with fluid.scope_guard(fluid.core.Scope()):
+         # initialize executor with cpu
+         exe = fluid.Executor(place=fluid.CPUPlace())
+         # load inference model
+         [inference_program, feed_target_names,fetch_targets] =  \
+             fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
+         # run inference
+         result = exe.run(inference_program,
+                          feed={feed_target_names[0]: uci_housing.predict_reader()},
+                          fetch_list=fetch_targets)
+         # print predicted price is $12,273.97
+         print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..55326940ce7c7dbaa5bf19f1950f470527ddf4f0
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -0,0 +1,181 @@
+# Fluid 分布式版本使用指南
+本篇文章将说明如何在PaddlePaddle Fluid版本下进行分布式训练的配置和执行，以及将单机训练脚本改造成支持集群训练的版本
+
+## 准备工作
+* 可用的集群
+
+    包含一个或多个计算节点的集群，每一个节点都能够执行PaddlePaddle的训练任务且拥有唯一的IP地址，集群内的所有计算节点可以通过网络相互通信。
+* 安装PaddlePaddle Fluid with Distribution版本
+
+    所有的计算节点上均需要安装分布式版本的PaddlePaddle, 在拥有GPU等设备的机器上还需要额外安装好相应的驱动程序和CUDA的库。
+
+    **注意：**当前对外提供的PaddlePaddle版本并不支持分布式，需要通过源码重新编译。编译和安装方法参见[编译和安装指南](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html)。
+    cmake编译命令中需要将WITH_DISTRIBUTE设置为ON，下面是一个cmake编译指令示例：
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+
+## 更新训练脚本
+这里，我们以[Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)课程中的第一章 fit a line 为例，描述如何将单机训练脚本改造成支持集群训练的版本。
+### 单机训练脚本示例
+```python
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
+    for data in train_reader():
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+
+        if avg_loss_value[0] < 10.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)
+```
+
+我们创建了一个简单的全连接神经网络程序，并且通过Fluid的Executor执行了100次迭代,现在我们需要将该单机版本的程序更新为分布式版本的程序。
+### 介绍Parameter Server
+在非分布式版本的训练脚本中，只存在Trainer一种角色，它不仅处理常规的计算任务，也处理参数相关的计算、保存和优化任务。在分布式版本的训练过程中，由于存在多个Trainer节点进行同样的数据计算任务，因此需要有一个中心化的节点来统一处理参数相关的保存和分配。在PaddlePaddle中，我们称这样的节点为[Parameter Server](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/dist_train/parameter_server.md)
+
+**因此，在分布式的Fluid环境中，我们有两个角色需要创建，分别是Parameter Server和Trainer。**
+
+### 分布式训练
+Fluid专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数，将他们分隔为两部分，通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。
+```python
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+```
+将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下：
+```python
+... #define the program, cost, and create sgd optimizer
+
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters
+
+t = fluid.DistributeTranspiler() # create the transpiler instance
+# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+... #create executor
+
+# in pserver, run this
+#current_endpoint here means current pserver IP:PORT you wish to run on
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
+
+# in trainer, run this
+... # define data reader
+exe.run(fluid.default_startup_program())
+for pass_id in range(100):
+    for data in train_reader():
+        exe.run(t.get_trainer_program())
+```
+### 分布式训练脚本运行说明
+分布式任务的运行需要将表格中说明的多个参数进行赋值:
+
+<table>
+<thead>
+<tr>
+<th>参数名</th>
+<th> 值类型</th>
+<th>说明</th>
+<th> 示例</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>trainer_id </td>
+<td> int</td>
+<td> 当前训练节点的ID，训练节点ID编号为0 - n-1， n为trainers的值 </td>
+<td> 0/1/2/3  </td>
+</tr>
+<tr>
+<td>pservers </td>
+<td> str</td>
+<td> parameter server 列表 </td>
+<td> 127.0.0.1:6710,127.0.0.1:6711 </td>
+</tr>
+<tr>
+<td>trainers </td>
+<td>int </td>
+<td> 训练节点的总个数，>0的数字 </td>
+<td> 4 </td>
+</tr>
+<tr>
+<td> server_endpoint</td>
+<td> str </td>
+<td> 当前所起的服务节点的IP:PORT </td>
+<td> 127.0.0.1:8789 </td>
+</tr>
+<tr>
+<td> training_role</td>
+<td>str </td>
+<td> 节点角色， TRAINER/PSERVER </td>
+<td> PSERVER </td>
+</tr>
+</tbody>
+</table>
+
+
+**注意：** ```training_role```是用来区分当前所起服务的角色的，用于训练程序中，用户可根据需要自行定义，其他参数为fluid.DistributeTranspiler的transpile函数所需要，需要在调用函数前进行定义，样例如下：
+
+```python
+t = fluid.DistributeTranspiler()
+t.transpile(
+    optimize_ops,
+    params_grads,
+    trainer_id,
+    pservers=pserver,
+    trainers=trainers)
+if training_role == "PSERVER":
+    pserver_prog = t.get_pserver_program(server_endpoint)
+    pserver_startup = t.get_startup_program(server_endpoint, pserver_prog)
+```
+
+### Demo
+完整的demo代码位于Fluid的test目录下的[book](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py)中。
+
+第一步，进入demo代码所在目录：
+```bash
+cd /paddle/python/paddle/fluid/tests/book
+```
+
+第二步，启动Parameter Server：
+```bash
+PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py
+```
+执行命令后请等待出现提示： ```Server listening on 192.168.1.2:6174 ```, 表示Parameter Server已经正常启动。
+
+第三步，启动Trainer：
+```bash
+PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py
+```
+由于我们定义的Trainer的数量是2个，因此需要在另外一个计算节点上再启动一个Trainer。
+
+现在我们就启动了一个包含一个Parameter Server和两个Trainer的分布式训练任务。
diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_en.md b/doc/fluid/howto/cluster/fluid_cluster_train_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..b4465e8269c2e1603c02404ea33f8c4572e76442
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_en.md
@@ -0,0 +1,153 @@
+# Fluid Distributed Training
+
+## Introduction
+
+In this article, we'll explain how to configure and run distributed training jobs with PaddlePaddle Fluid in a bare metal cluster.
+
+## Preparations
+
+### Getting the cluster ready
+
+Prepare the compute nodes in the cluster. Each node can be of any specification that runs PaddlePaddle, and must have a unique IP address assigned to it. Make sure the nodes can communicate with each other.
+
+### Have PaddlePaddle installed
+
+PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes, be sure to properly install drivers and CUDA libraries.
+
+The PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html).
+
+In addition to the above, the `cmake` command should be run with the option `WITH_DISTRIBUTE` set to on. An example bare minimum `cmake` command would look as follows:
+
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+
+### Update the training script
+
+#### Non-cluster training script
+
+Let's take [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)'s first chapter: "fit a line" as an example.
+
+The non-cluster version of this demo with fluid API is as follows:
+
+``` python
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
+    for data in train_reader():
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+
+        if avg_loss_value[0] < 10.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)
+```
+
+We created a simple fully-connected neural network training program and handed it to the fluid executor to run for 100 passes.
+
+Now let's try to convert it to a distributed version to run on a cluster.
+
+#### Introducing parameter server
+
+As we can see from the non-cluster version of the training script, there is only one role in the script: the trainer, which performs the computation and also holds the parameters. In cluster training, since multiple trainers are working on the same task, they need one centralized place to hold and distribute parameters. This centralized place is called the Parameter Server in PaddlePaddle.
+
+![parameter server architecture](src/trainer.png)
+
+Parameter Server in fluid not only holds the parameters but is also assigned with a part of the program. Trainers communicate with parameter servers via send/receive OPs. For more technical details, please refer to  [this document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/dist_refactor/distributed_architecture.md).
+
+Now we need to create programs for both trainers and parameter servers. The question is how?
+
+#### Slice the program
+
+Fluid provides a tool called "Distributed Transpiler" that automatically converts the non-cluster program into cluster program.
+
+The idea behind this tool is to find the optimize OPs and gradient parameters, slice the program into 2 pieces and connect them with send/receive OP.
+
+Optimize OPs and gradient parameters can be found from the return values of optimizer's minimize function.
+
+To put them together:
+
+``` python
+... #define the program, cost, and create sgd optimizer
+
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters
+
+t = fluid.DistributeTranspiler() # create the transpiler instance
+# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+... #create executor
+
+# in pserver, run this
+#current_endpoint here means current pserver IP:PORT you wish to run on
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
+
+# in trainer, run this
+... # define data reader
+exe.run(fluid.default_startup_program())
+for pass_id in range(100):
+    for data in train_reader():
+        exe.run(t.get_trainer_program())
+
+
+```
+
+### E2E demo
+
+Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
+First `cd` into the folder that contains the `python` files. In this case:
+
+```bash
+cd /paddle/python/paddle/fluid/tests/book_distribute
+```
+
+In parameter server node run the following in the command line:
+
+``` bash
+PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py
+```
+
+*please note we assume that your parameter server runs at 192.168.1.2:6174*
+
+Wait until the prompt `Server listening on 192.168.1.2:6174` appears.
+
+Then in 2 of your trainer nodes run this:
+
+``` bash
+PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=TRAINER python notest_dist_fit_a_line.py
+```
+
+*You need to run this command once on each of 2 nodes because the script sets the trainer count to 2. You can change this setting on line 50.*
+
+Now you have 2 trainers and 1 parameter server up and running.
diff --git a/doc/fluid/howto/cluster/fluid_recordio.md b/doc/fluid/howto/cluster/fluid_recordio.md
new file mode 100644
index 0000000000000000000000000000000000000000..92859e8f622d0c155128821c54252113c5016989
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_recordio.md
@@ -0,0 +1,127 @@
+# How to use RecordIO in Fluid
+
+If you want to use RecordIO as your training data format, you need to convert your training data
+to RecordIO files and read them during training. PaddlePaddle Fluid provides some
+interfaces to deal with RecordIO files.
+
+## Generate RecordIO File
+
+Before you start training with RecordIO files, you need to convert your training data
+to RecordIO format with `fluid.recordio_writer.convert_reader_to_recordio_file`. The sample code
+is as follows:
+
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+            name='image', shape=[784]),
+            fluid.layers.data(
+            name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder)
+```
+
+The above code snippet would generate a RecordIO file `./mnist.recordio` on your host.
+
+**NOTE**: we recommend users to set `batch_size=1` when generating the recordio files so that users can
+adjust it flexibly while reading it.
+
+## Use the RecordIO file in a Local Training Job
+
+PaddlePaddle Fluid provides an interface `fluid.layers.io.open_recordio_file` to load your RecordIO file,
+which you can then use as a layer in your network configuration. The sample code is as follows:
+
+```python
+    data_file = fluid.layers.io.open_recordio_file(
+        filename="./mnist.recordio",
+        shapes=[(-1, 784),(-1, 1)],
+        lod_levels=[0, 0],
+        dtypes=["float32", "int32"])
+    data_file = fluid.layers.io.batch(data_file, batch_size=4)
+
+    img, label = fluid.layers.io.read_file(data_file)
+    hidden = fluid.layers.fc(input=img, size=100, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+
+    fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)
+
+    place = fluid.CPUPlace()
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    avg_loss_np = []
+
+    # train a pass
+    batch_id = 0
+    while True:
+        tmp, = exe.run(fetch_list=[avg_loss])
+
+        avg_loss_np.append(tmp)
+        print(batch_id)
+        batch_id += 1
+```
+
+## Use the RecordIO files in Distributed Training
+
+1. generate multiple RecordIO files
+
+A distributed training job may have multiple trainer nodes,
+with one or more RecordIO files for each trainer node. You can use the interface
+`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data
+into multiple RecordIO files. The sample code is as follows:
+
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+            name='image', shape=[784]),
+            fluid.layers.data(
+            name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    # positional arguments: filename suffix, batches per file, reader, feeder
+    fluid.recordio_writer.convert_reader_to_recordio_files(
+          './mnist.recordio', 100, reader, feeder)
+```
+
+The above code would generate multiple RecordIO files on your host like:
+
+```bash
+.
+ \_mnist-00000.recordio
+ |-mnist-00001.recordio
+ |-mnist-00002.recordio
+ |-mnist-00003.recordio
+ |-mnist-00004.recordio
+```
+
+2. open multiple RecordIO files by `fluid.layers.io.open_files`
+
+For a distributed training job, the distributed operating system schedules trainer processes on multiple nodes,
+and each trainer process reads part of the whole training data. We usually take the following approach to make the training
+data allocated to each trainer process as uniform as possible:
+
+```python
+def gen_train_list(file_pattern, trainers, trainer_id):
+   file_list = glob.glob(file_pattern)
+   ret_list = []
+   for idx, f in enumerate(file_list):
+       if (idx + trainers) % trainers == trainer_id:
+           ret_list.append(f)
+   return ret_list
+
+trainers = int(os.getenv("PADDLE_TRAINERS"))
+trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+data_file = fluid.layers.io.open_files(
+    filenames=gen_train_list("./mnist-[0-9]*.recordio", trainers, trainer_id),
+    thread_num=1,
+    shapes=[(-1, 784),(-1, 1)],
+    lod_levels=[0, 0],
+    dtypes=["float32", "int32"])
+img, label = fluid.layers.io.read_file(data_file)
+...
+```
diff --git a/doc/fluid/howto/cluster/nccl2_rdma_training.md b/doc/fluid/howto/cluster/nccl2_rdma_training.md
new file mode 100644
index 0000000000000000000000000000000000000000..8adaf324fccb4cda7af16b9bace559c0642ae444
--- /dev/null
+++ b/doc/fluid/howto/cluster/nccl2_rdma_training.md
@@ -0,0 +1,110 @@
+# Distributed Training with NCCL2 and RDMA
+
+When doing distributed multi-GPU training, network bandwidth often becomes the
+bottleneck. We introduce a way to use NCCL2 to do such training job to
+achieve best performance.
+
+## Prepare Hardware with RDMA and Multiple GPUs
+
+I'm using two Linux servers, each with 8 GPUs and
+one 100Gb RDMA card installed.
+Base environment is:
+
+* OS: CentOS 7.4
+* RDMA device: "Mellanox Technologies MT27700 Family [ConnectX-4]"
+* Kernel version: `4.4.88-1.el7.elrepo.x86_64`
+* Docker version: `1.12.6`
+* Docker storage driver: `overlay2`
+* IP addresses: 192.168.16.30,192.168.16.34
+
+In general, the steps include:
+
+1. Install GPU drivers
+1. Install RDMA drivers
+1. Install "InfiniBand Support"
+1. Use docker to run tests and make sure GPUs and RDMA can work inside
+   the container.
+
+I'll omit the section "Install GPU drivers" because we can find it easily
+somewhere else.
+
+### Install RDMA drivers
+
+For my case, I've got two machines with device
+"Mellanox Technologies MT27700 Family [ConnectX-4]" installed. The OS was
+"CentOS 7.4" and I updated the kernel to version 4.4 so that docker can
+work with the latest overlay2 filesystem.
+
+***NOTE: before you start, make sure you have a way to get a console
+of the server other than ssh because we may need to re-configure the
+network device.***
+
+1. Go to http://www.mellanox.com/page/products_dyn?product_family=26,
+   download `MLNX_OFED` software in the bottom of the page, and upload it
+   onto the server.
+1. Run `./mlnxofedinstall --add-kernel-support` in the software package.
+1. Run `/etc/init.d/openibd restart` to make everything work. Note that
+   this operation may cause the network to go down if you are using this
+   RDMA device as the default network device and use ssh to log in to the server.
+1. Re-configure the network interface, for example:
+   `ifconfig eth2 192.168.16.30/20 up`, then add routes if needed:
+   `ip route add default via 192.168.16.1 dev eth2`.
+1. Do the same thing on the other node.
+1. Use `ping` to test if the two nodes have typical ICMP connection.
+1. Use either `udaddy` or `ib_write_bw` to test that the network connection is
+   ready and has the desired bandwidth.
+
+### Prepare Docker Image to Run RDMA Programs
+
+1. Build a docker image using cuda base image like: `nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04` and install paddlepaddle whl
+   package in it.
+1. Start a docker container and mount GPU driver libs into it (you can
+   skip this step if you are using nvidia-docker).
+1. Mount RDMA drivers and libs into the docker image (see below section),
+   also `udaddy` and `ib_write_bw` if needed.
+1. Mount GPU devices and RDMA devices into the container using `--device`
+   or just use privileged mode `--privileged`.
+1. Start the container using host network mode: `--net=host`
+
+### RDMA Library Files Needed
+
+Usually, `MLNX_OFED` installs the latest supported libs under
+`/usr/lib64/mlnx_ofed/valgrind`. Other libs that are also needed to run RDMA programs
+are listed below. These libs must be mounted into the docker container.
+
+* Libs under `/usr/lib64/mlnx_ofed/valgrind`
+  * libibcm.so
+  * libibverbs.so
+  * libmlx4.so
+  * libmlx5.so
+  * libmlx5-rdmav2.so
+  * librdmacm.so
+* Other libs:
+  * libnl-3.so.200
+  * libnl-route-3.so.200
+  * libnuma.so.1
+
+## Start to Run the Training Job
+
+Setting NCCL environment variables to turn NCCL switches on and off:
+
+
+| Env Name | Description |
+| --- | --- |
+| NCCL_SOCKET_IFNAME | The RDMA device, e.g. eth2 |
+| NCCL_P2P_DISABLE | Set to 1 to disable P2P transfer between GPUs |
+| NCCL_IB_DISABLE | Set to 1 to disable using RDMA |
+| NCCL_IB_CUDA_SUPPORT | Set to 1 to enable GPU Direct if supported |
+| NCCL_DEBUG | Set debug level: VERSION, WARN, INFO |
+
+My two servers are: `192.168.16.30,192.168.16.34`. On node 1, Run:
+
+```bash
+PADDLE_TRAINER_ID=0 PADDLE_PORT=48372 PADDLE_WORKERS=192.168.16.30,192.168.16.34 POD_IP=192.168.16.30 stdbuf -oL python vgg16.py
+```
+
+On node 2, Run:
+
+```bash
+PADDLE_TRAINER_ID=1 PADDLE_PORT=48372 PADDLE_WORKERS=192.168.16.30,192.168.16.34 POD_IP=192.168.16.34 stdbuf -oL python vgg16.py
+```
diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b57af64f44da82926c4862578f3072960ca5aa92
--- /dev/null
+++ b/doc/fluid/howto/index_cn.rst
@@ -0,0 +1,8 @@
+进阶使用
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  inference/index_cn.rst
+  optimization/index_cn.rst
diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fd21e167ce3a46da167db1e9d7013804f730e047
--- /dev/null
+++ b/doc/fluid/howto/index_en.rst
@@ -0,0 +1,7 @@
+HOW TO
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  optimization/index_en.rst
diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..91357dd8c8da19f2f33c6f285ed7eb234428b1ab
--- /dev/null
+++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
@@ -0,0 +1,97 @@
+安装与编译C++预测库
+===========================
+
+直接下载安装
+-------------
+
+======================   ========================================
+版本说明                            C++预测库   
+======================   ========================================
+cpu_avx_mkl              `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_ 
+cpu_avx_openblas         `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
+cpu_noavx_openblas       `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
+cuda7.5_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
+cuda8.0_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
+cuda8.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
+cuda9.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz/?branch=0.14.0>`_
+======================   ========================================
+
+从源码编译
+----------
+用户也可以从 PaddlePaddle 核心代码编译C++预测库，只需在编译时配置下面这些编译选项：
+
+=================   =========
+选项                 值   
+=================   =========
+CMAKE_BUILD_TYPE    Release
+FLUID_INSTALL_DIR   安装路径    
+WITH_FLUID_ONLY     ON（推荐）
+WITH_SWIG_PY        OFF（推荐）
+WITH_PYTHON         OFF（推荐）
+WITH_GPU            ON/OFF
+WITH_MKL            ON/OFF
+=================   =========
+
+建议按照推荐值设置，以避免链接不必要的库。其它可选编译选项按需进行设定。
+
+下面的代码片段从github拉取最新代码，配置编译选项（需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径）：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+     PADDLE_ROOT=/path/of/capi
+     git clone https://github.com/PaddlePaddle/Paddle.git
+     cd Paddle
+     mkdir build
+     cd build
+     cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
+           -DCMAKE_BUILD_TYPE=Release \
+           -DWITH_FLUID_ONLY=ON \
+           -DWITH_SWIG_PY=OFF \
+           -DWITH_PYTHON=OFF \
+           -DWITH_MKL=OFF \
+           -DWITH_GPU=OFF  \
+           ..
+     make
+     make inference_lib_dist
+
+成功编译后，使用C++预测库所需的依赖（包括：（1）编译出的PaddlePaddle预测库和头文件；（2）第三方链接库和头文件；（3）版本信息与编译选项信息）
+均会存放于PADDLE_ROOT目录中。目录结构如下：
+
+  .. code-block:: text
+
+     PaddleRoot/
+     ├── CMakeCache.txt
+     ├── paddle
+     │   └── fluid
+     │       ├── framework
+     │       ├── inference
+     │       ├── memory
+     │       ├── platform
+     │       ├── pybind
+     │       └── string
+     ├── third_party
+     │   ├── boost
+     │   │   └── boost
+     │   ├── eigen3
+     │   │   ├── Eigen
+     │   │   └── unsupported
+     │   └── install
+     │       ├── gflags
+     │       ├── glog
+     │       ├── mklml
+     │       ├── protobuf
+     │       ├── snappy
+     │       ├── snappystream
+     │       └── zlib
+     └── version.txt
+     
+version.txt 中记录了该预测库的版本信息，包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号，如：
+
+  .. code-block:: text
+
+     GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
+     WITH_MKL: ON
+     WITH_GPU: ON
+     CUDA version: 8.0
+     CUDNN version: v5
diff --git a/doc/fluid/howto/inference/index_cn.rst b/doc/fluid/howto/inference/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a903423548decd0992bf19772fb2cb143f6a12b5
--- /dev/null
+++ b/doc/fluid/howto/inference/index_cn.rst
@@ -0,0 +1,8 @@
+预测库
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  build_and_install_lib_cn.rst
+  inference_support_in_fluid_cn.md
diff --git a/doc/fluid/howto/inference/inference_support_in_fluid_cn.md b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..309b17fccd5c461c9c22beb64eb4c6792b7e4a7a
--- /dev/null
+++ b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
@@ -0,0 +1,304 @@
+# 使用指南
+
+## 目录：
+
+- Python Inference API
+- Inference C++ API
+- Inference实例
+- Inference计算优化
+
+## Python Inference API **[改进中]**
+- 保存Inference模型 ([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L295))
+
+  ```python
+  def save_inference_model(dirname,
+                           feeded_var_names,
+                           target_vars,
+                           executor,
+                           main_program=None,
+                           model_filename=None,
+                           params_filename=None):
+  ```
+  Inference模型和参数将会保存到`dirname`目录下：
+  - 序列化的模型
+    - `model_filename`为`None`，保存到`dirname/__model__`
+    - `model_filename`非`None`，保存到`dirname/model_filename`
+  - 参数
+    - `params_filename`为`None`，单独保存到各个独立的文件，各文件以参数变量的名字命名
+    - `params_filename`非`None`，保存到`dirname/params_filename`
+
+- 两种存储格式
+  - 参数保存到各个独立的文件
+    - 如，设置`model_filename`为`None`、`params_filename`为`None`
+
+    ```bash
+    $ cd recognize_digits_conv.inference.model
+    $ ls
+    $ __model__ batch_norm_1.w_0 batch_norm_1.w_2 conv2d_2.w_0 conv2d_3.w_0 fc_1.w_0 batch_norm_1.b_0 batch_norm_1.w_1 conv2d_2.b_0 conv2d_3.b_0 fc_1.b_0
+    ```
+  - 参数保存到同一个文件
+    - 如，设置`model_filename`为`None`、`params_filename`为`__params__`
+
+    ```bash
+    $ cd recognize_digits_conv.inference.model
+    $ ls
+    $ __model__ __params__
+    ```
+- 加载Inference模型([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L380))
+  ```python
+  def load_inference_model(dirname,
+                           executor,
+                           model_filename=None,
+                           params_filename=None):
+    ...
+    return [program, feed_target_names, fetch_targets]
+  ```
+
+## 链接Fluid Inference库
+- 示例项目([链接](https://github.com/luotao1/fluid_inference_example.git))
+
+  - GCC配置
+    ```bash
+    $ g++ -o a.out -std=c++11 main.cc \
+          -I${PADDLE_ROOT}/ \
+          -I${PADDLE_ROOT}/third_party/install/gflags/include \
+          -I${PADDLE_ROOT}/third_party/install/glog/include \
+          -I${PADDLE_ROOT}/third_party/install/protobuf/include \
+          -I${PADDLE_ROOT}/third_party/eigen3 \
+          -L${PADDLE_ROOT}/paddle/fluid/inference -lpaddle_fluid \
+          -lrt -ldl -lpthread
+    ```
+
+  - CMake配置
+    ```cmake
+    include_directories(${PADDLE_ROOT}/)
+    include_directories(${PADDLE_ROOT}/third_party/install/gflags/include)
+    include_directories(${PADDLE_ROOT}/third_party/install/glog/include)
+    include_directories(${PADDLE_ROOT}/third_party/install/protobuf/include)
+    include_directories(${PADDLE_ROOT}/third_party/eigen3)
+    target_link_libraries(${TARGET_NAME}
+                          ${PADDLE_ROOT}/paddle/fluid/inference/libpaddle_fluid.so
+                          -lrt -ldl -lpthread)
+    ```
+
+  - 设置环境变量：
+  `export LD_LIBRARY_PATH=${PADDLE_ROOT}/paddle/fluid/inference:$LD_LIBRARY_PATH`
+
+
+
+## C++ Inference API
+
+- 推断流程([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_helper.h#L91))
+
+  - 1、 初始化设备
+    ```cpp
+    #include "paddle/fluid/framework/init.h"
+    paddle::framework::InitDevices(false);
+    ```
+
+  - 2、 定义place，executor，scope
+    ```cpp
+    auto place = paddle::platform::CPUPlace();
+    auto executor = paddle::framework::Executor(place);
+    auto* scope = new paddle::framework::Scope();
+    ```
+
+  - 3、 加载模型
+    ```cpp
+    #include "paddle/fluid/inference/io.h"
+    auto inference_program = paddle::inference::Load(executor, *scope, dirname);
+    // or
+    auto inference_program = paddle::inference::Load(executor,
+                                                     *scope,
+                                                     dirname + "/" + model_filename,
+                                                     dirname + "/" + params_filename);
+    ```
+
+  - 4、 获取`feed_target_names`和`fetch_target_names`
+    ```cpp
+    const std::vector<std::string>& feed_target_names = inference_program->GetFeedTargetNames();
+    const std::vector<std::string>& fetch_target_names = inference_program->GetFetchTargetNames();
+    ```
+
+  - 5、 准备`feed`数据
+    ```cpp
+    #include "paddle/fluid/framework/lod_tensor.h"
+    std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+    ...
+    std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+    for (size_t i = 0; i < feed_target_names.size(); ++i) {
+      // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+      feed_targets[feed_target_names[i]] = cpu_feeds[i];
+    }
+    ```
+
+  - 6、 定义`Tensor`来`fetch`结果
+    ```cpp
+    std::vector<paddle::framework::LoDTensor*> cpu_fetchs;
+    std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+    for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+      fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+    }
+    ```
+
+  - 7、 执行`inference_program`
+    ```cpp
+    executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    ```
+
+  - 8、 使用`fetch`数据
+    ```cpp
+    for (size_t i = 0; i < cpu_fetchs.size(); ++i) {
+      std::cout << "lod_i: " << cpu_fetchs[i]->lod();
+      std::cout << "dims_i: " << cpu_fetchs[i]->dims();
+      std::cout << "result:";
+      float* output_ptr = cpu_fetchs[i]->data<float>();
+      for (int j = 0; j < cpu_fetchs[i]->numel(); ++j) {
+        std::cout << " " << output_ptr[j];
+      }
+      std::cout << std::endl;
+    }
+    ```
+    针对不同的数据，4. - 8.可执行多次。
+
+  - 9、 释放内存
+    ```cpp
+    delete scope;
+    ```
+
+
+- 接口说明
+
+  ```cpp
+  void Run(const ProgramDesc& program, Scope* scope,
+           std::map<std::string, const LoDTensor*>& feed_targets,
+           std::map<std::string, LoDTensor*>& fetch_targets,
+           bool create_vars = true,
+           const std::string& feed_holder_name = "feed",
+           const std::string& fetch_holder_name = "fetch");
+  ```
+  - 使用Python API `save_inference_model`保存的`program`里面包含了`feed_op`和`fetch_op`，用户提供的`feed_targets`、`fetch_targets`必须和`inference_program`中的`feed_op`、`fetch_op`保持一致。
+  - 用户提供的`feed_holder_name`和`fetch_holder_name`也必须和`inference_program`中`feed_op`、`fetch_op`保持一致，可使用`SetFeedHolderName`和`SetFetchHolderName`接口重新设置`inference_program`。
+  - 默认情况下，除了`persistable`属性设置为`True`的`Variable`之外，每次执行`executor.Run`会创建一个局部`Scope`，并且在这个局部`Scope`中创建和销毁所有的`Variable`，以最小化空闲时的内存占用。
+  - `persistable`属性为`True`的`Variable`有：
+    - Operators的参数`w`、`b`等
+    - `feed_op`的输入变量
+    - `fetch_op`的输出变量
+
+
+- **不在每次执行时创建和销毁变量
+ ([PR](https://github.com/PaddlePaddle/Paddle/pull/9301))**
+  - 执行`inference_program`
+    ```cpp
+    // Call once
+    executor.CreateVariables(*inference_program, scope, 0);
+    // Call as many times as you like
+    executor.Run(
+        *inference_program, scope, feed_targets, fetch_targets, false);
+    ```
+  - **优点**
+    - 节省了频繁创建、销毁变量的时间（约占每次`Run`总时间的1% ~ 12%）
+    - 执行结束后可获取所有Operators的计算结果
+  - **缺点**
+    - 空闲时也会占用大量的内存
+    - 在同一个`Scope`中，相同的变量名是共用同一块内存的，容易引起意想不到的错误
+
+
+- **不在每次执行时创建Op([PR](https://github.com/PaddlePaddle/Paddle/pull/9630))**
+  - 执行`inference_program`
+    ```cpp
+    // Call once
+    auto ctx = executor.Prepare(*inference_program, 0);
+    // Call as many times as you like if you have no need to change the inference_program
+    executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets);
+    ```
+  - **优点**
+    - 节省了频繁创建、销毁Op的时间
+  - **缺点**
+    - 一旦修改了`inference_program`，则需要重新创建`ctx`
+
+
+- **多线程共享Parameters([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_multi_thread_helper.h))**
+  - 主线程
+    - 1、 初始化设备
+    - 2、 定义`place`，`executor`，`scope`
+    - 3、 加载模型，得到`inference_program`
+  - 从线程
+    - **复制`inference_program`得到`copy_program`，修改`copy_program`的`feed_holder_name`和`fetch_holder_name`**
+      ```cpp
+      auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
+                 new paddle::framework::ProgramDesc(*inference_program));
+      std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id);
+      std::string fetch_holder_name = "fetch_" + paddle::string::to_string(thread_id);
+      copy_program->SetFeedHolderName(feed_holder_name);
+      copy_program->SetFetchHolderName(fetch_holder_name);
+      ```
+    - 4、 获取`copy_program`的`feed_target_names`和`fetch_target_names`
+    - 5、 准备feed数据，定义Tensor来fetch结果
+    - 6、 执行`copy_program`
+      ```cpp
+      executor->Run(*copy_program, scope, feed_targets, fetch_targets, true, feed_holder_name, fetch_holder_name);
+      ```
+    - 7、 使用fetch数据
+  - 主线程
+    - 8、 释放资源
+
+
+- 基本概念
+  - 数据相关：
+    - [Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor.md)，一个N维数组，数据可以是任意类型（int，float，double等）
+    - [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md)，带LoD(Level-of-Detail)即序列信息的Tensor
+    - [Scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)，记录了变量Variable
+  - 执行相关：
+    - [Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md)，无状态执行器，只跟设备相关
+    - Place
+      - CPUPlace，CPU设备
+      - CUDAPlace，CUDA GPU设备
+  - 神经网络表示：
+    - [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md).
+
+    详细介绍请参考[**Paddle Fluid开发者指南**](https://github.com/lcy-seso/learning_notes/blob/master/Fluid/developer's_guid_for_Fluid/Developer's_Guide_to_Paddle_Fluid.md)
+
+
+
+## Inference实例
+
+  1. fit a line: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc)
+  1. image classification: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_image_classification.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_image_classification.cc)
+  1. label semantic roles: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc)
+  1. recognize digits: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc)
+  1. recommender system: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recommender_system.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc)
+  1. understand sentiment: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_understand_sentiment.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc)
+  1. word2vec: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_word2vec.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_word2vec.cc)
+
+
+## Inference计算优化
+- 使用Python推理优化工具([inference_transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/inference_transpiler.py))
+  ```python
+  class InferenceTranspiler:
+    def transpile(self, program, place, scope=None):
+        ...
+        if scope is None:
+            scope = global_scope()
+        ...
+  ```
+  - 使用`InferenceTranspiler`将会直接修改`program`。
+  - 使用`InferenceTranspiler`会修改参数的值，请确保`program`的参数在`scope`内。
+- 支持的优化
+  - 融合batch_norm op的计算
+- 使用示例([链接](https://github.com/Xreki/Xreki.github.io/blob/master/fluid/inference/inference_transpiler.py))
+  ```python
+  import paddle.fluid as fluid
+  # NOTE: Applying the inference transpiler will change the inference_program.
+  t = fluid.InferenceTranspiler()
+  t.transpile(inference_program, place, inference_scope)
+  ```
+
+
+
+
+## 内存使用优化
+- 使用Python内存优化工具([memory_optimization_transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/memory_optimization_transpiler.py))
+  ```python
+  fluid.memory_optimize(inference_program)
+  ```
diff --git a/doc/fluid/howto/optimization/benchmark/index_cn.rst b/doc/fluid/howto/optimization/benchmark/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9404800eb86ca6d27886258b67393028c76954dc
--- /dev/null
+++ b/doc/fluid/howto/optimization/benchmark/index_cn.rst
@@ -0,0 +1,8 @@
+基准
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  vgg16/README.md
+  README.md
diff --git a/doc/fluid/howto/optimization/benchmark/index_en.rst b/doc/fluid/howto/optimization/benchmark/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1e200b660cc7f6aeaf8b3d94fd7a14999a52bccd
--- /dev/null
+++ b/doc/fluid/howto/optimization/benchmark/index_en.rst
@@ -0,0 +1,8 @@
+Benchmark
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  vgg16/README.md
+  README.md
diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..198a05a79e19227e90eaafe116217a164cd51a7d
--- /dev/null
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
@@ -0,0 +1,183 @@
+# CPU性能调优
+
+此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优（performance tuning）。
+
+Profiling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。
+
+PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分:
+
+* Python 代码的性能分析
+* Python 与 C++ 混合代码的性能分析
+
+
+## Python代码的性能分析
+
+### 生成性能分析文件
+
+Python标准库中提供了性能分析的工具包，[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
+
+```bash
+python -m cProfile -o profile.out main.py
+```
+
+其中 `main.py` 是我们要分析的程序，`-o`标识了一个输出的文件名，用来存储本次性能分析的结果。如果不指定这个文件，`cProfile`会打印到标准输出。
+
+### 查看性能分析文件
+
+`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务，将性能分析结果以网页的形式展示出来：
+
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+
+其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
+
+用Web浏览器访问对应网址，即可显示性能分析的结果：
+
+```
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+
+每一列的含义是:
+
+<table>
+<thead>
+<tr>
+<th>列名</th>
+<th>含义 </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> ncalls</td>
+<td> 函数的调用次数</td>
+</tr>
+<tr>
+<td>tottime</td>
+<td> 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td> percall </td>
+<td> tottime的每次调用平均时间</td>
+</tr>
+<tr>
+<td> cumtime</td>
+<td> 函数总时间。包含这个函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td> percall</td>
+<td> cumtime的每次调用平均时间</td>
+</tr>
+<tr>
+<td> filename:lineno(function) </td>
+<td> 文件名, 行号，函数名 </td>
+</tr>
+</tbody>
+</table>
+
+
+### 寻找性能瓶颈
+
+通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
+
+将性能分析结果按照tottime排序，效果如下:
+
+```text
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1(<module>)
+```
+
+可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
+
+```text
+Called By:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+
+Function                                                                                                 was called by...
+                                                                                                             ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward)
+
+
+Called:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+
+通常观察热点函数间的调用关系，和对应行的代码，就可以了解到问题代码在哪里。当我们做出性能修正后，再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
+
+
+
+## Python与C++混合代码的性能分析
+
+### 生成性能分析文件
+
+C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析。
+
+使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为
+
+```bash
+apt update
+apt install libgoogle-perftools-dev
+pip install yep
+```
+
+安装完毕后，我们可以通过
+
+```bash
+python -m yep -v main.py
+```
+
+生成性能分析文件。生成的性能分析文件为`main.py.prof`。
+
+命令行中的`-v`指定在生成性能分析文件之后，在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同，编译时可能会去掉调试信息，运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果，可以采取下面几点措施:
+
+1. 编译时指定`-g`生成调试信息。使用cmake的话，可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
+2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
+3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+
+### 查看性能分析文件
+
+在运行完性能分析后，会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意，这里使用了用`Go`语言重构后的`pprof`，因为这个工具具有web服务界面，且展示效果更好。
+
+安装`pprof`的命令和一般的`Go`程序是一样的，其命令如下:
+
+```bash
+go get github.com/google/pprof
+```
+
+进而我们可以使用如下命令开启一个HTTP服务:
+
+```bash
+pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
+```
+
+这行命令中，`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径，进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
+
+访问对应的网址，我们可以查看性能分析的结果。结果如下图所示:
+
+![result](./pprof_1.png)
+
+
+### 寻找性能瓶颈
+
+与寻找Python代码的性能瓶颈类似，寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
+
+例如下图中，
+
+![kernel_perf](./pprof_2.png)
+
+在一次训练中，乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然，`MomentumOp`的性能有问题。
+
+在`pprof`中，对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题，再检查其他部分的性能问题，可以更有次序的完成性能的优化。
diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..216694965b3c878a8a5f3ccd2a0cba8d21d9ce05
--- /dev/null
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
@@ -0,0 +1,224 @@
+# Tune CPU performance
+
+This tutorial introduces techniques we use to profile and tune the
+CPU performance of PaddlePaddle.  We will use Python packages
+`cProfile` and `yep`, and Google's `perftools`.
+
+Profiling is the process that reveals performance bottlenecks,
+which could be very different from what's in the developers' mind.
+Performance tuning is done to fix these bottlenecks. Performance optimization
+repeats the steps of profiling and tuning alternately.
+
+PaddlePaddle users program AI applications by calling the Python API, which calls
+into `libpaddle.so`, which is written in C++.  In this tutorial, we focus on
+the profiling and tuning of
+
+1. the Python code and
+1. the mixture of Python and C++ code.
+
+## Profiling the Python Code
+
+### Generate the Performance Profiling File
+
+We can use Python standard
+package, [`cProfile`](https://docs.python.org/2/library/profile.html),
+to generate Python profiling file.  For example:
+
+```bash
+python -m cProfile -o profile.out main.py
+```
+
+where `main.py` is the program we are going to profile, `-o` specifies
+the output file.  Without `-o`, `cProfile` would output to standard
+output.
+
+### Look into the Profiling File
+
+`cProfile` generates `profile.out` after `main.py` completes. We can
+use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into
+the details:
+
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+
+where `-a` specifies the HTTP IP, `-p` specifies the port, `-f`
+specifies the profiling file, and `main.py` is the source file.
+
+Open a Web browser and point it to the local IP and the specified
+port; we will see output like the following:
+
+```
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+
+where each line corresponds to a Python function, and the meaning of
+each column is as follows:
+
+<table>
+<thead>
+<tr>
+<th>column</th>
+<th>meaning </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> ncalls</td>
+<td> the number of calls into a function</td>
+</tr>
+<tr>
+<td>tottime</td>
+<td> the total execution time of the function, not including the execution time of other functions called by the function</td>
+</tr>
+<tr>
+<td> percall </td>
+<td> tottime divided by ncalls</td>
+</tr>
+<tr>
+<td> cumtime</td>
+<td> the total execution time of the function, including the execution time of other functions being called</td>
+</tr>
+<tr>
+<td> percall</td>
+<td> cumtime divided by ncalls</td>
+</tr>
+<tr>
+<td> filename:lineno(function) </td>
+<td> where the function is defined </td>
+</tr>
+</tbody>
+</table>
+
+### Identify Performance Bottlenecks
+
+Usually, `tottime` and the related `percall` time are what we want to
+focus on. We can sort above profiling file by tottime:
+
+```text
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1(<module>)
+```
+
+We can see that the most time-consuming function is the `built-in
+method run`, which is a C++ function in `libpaddle.so`.  We will
+explain how to profile C++ code in the next section.  At this
+moment, let's look into the fourth function `sync_with_cpp`, which is a
+Python function.  We can click it to understand more about it:
+
+```
+Called By:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+
+Function                                                                                                 was called by...
+                                                                                                             ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward)
+
+
+Called:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+
+The lists of the callers of `sync_with_cpp` might help us understand
+how to improve the function definition.
+
+## Profiling Python and C++ Code
+
+### Generate the Profiling File
+
+To profile a mixture of Python and C++ code, we can use a Python
+package, `yep`, that can work with Google's `perftools`, which is a
+commonly-used profiler for C/C++ code.
+
+In Ubuntu systems, we can install `yep` and `perftools` by running the
+following commands:
+
+```bash
+apt update
+apt install libgoogle-perftools-dev
+pip install yep
+```
+
+Then we can run the following command
+
+```bash
+python -m yep -v main.py
+```
+
+to generate the profiling file.  The default filename is
+`main.py.prof`.
+
+Please be aware of the `-v` command line option, which prints the
+analysis results after generating the profiling file.  By examining
+the printed result, we can tell whether debug information was stripped
+from `libpaddle.so` at build time.  The following hints
+help make sure that the analysis results are readable:
+
+1. Use GCC command line option `-g` when building `libpaddle.so` so as to
+   include the debug information.  The standard building system of
+   PaddlePaddle is CMake, so you might want to set
+   `CMAKE_BUILD_TYPE=RelWithDebInfo`.
+
+1. Use GCC command line option `-O2` or `-O3` to generate optimized
+   binary code. It doesn't make sense to profile `libpaddle.so`
+   without optimization, because it would anyway run slowly.
+
+1. Profile the single-threaded binary file before the
+   multi-threaded version, because the latter often generates tangled
+   profiling analysis results.  You might want to set the environment
+   variable `OMP_NUM_THREADS=1` to prevent OpenMP from automatically
+   starting multiple threads.
+
+### Examining the Profiling File
+
+The tool we used to examine the profiling file generated by
+`perftools` is [`pprof`](https://github.com/google/pprof), which
+provides a Web-based GUI like `cprofilev`.
+
+We can rely on the standard Go toolchain to retrieve the source code
+of `pprof` and build it:
+
+```bash
+go get github.com/google/pprof
+```
+
+Then we can use it to profile `main.py.prof` generated in the previous
+section:
+
+```bash
+pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
+```
+
+Where `-http` specifies the IP and port of the HTTP service.
+Directing our Web browser to the service, we would see something like
+the following:
+
+![result](./pprof_1.png)
+
+### Identifying the Performance Bottlenecks
+
+Similar to how we work with `cprofilev`, we'd focus on `tottime` and
+`cumtime`.
+
+![kernel_perf](./pprof_2.png)
+
+We can see that the execution time of multiplication and the computing
+of the gradient of multiplication takes 2% to 4% of the total running
+time, and `MomentumOp` takes about 17%. Obviously, we'd want to
+optimize `MomentumOp`.
+
+`pprof` would mark performance critical parts of the program in
+red. It's a good idea to follow the hints.
diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..7fb0883dd937465d15479b29df95078edb50e069
--- /dev/null
+++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
@@ -0,0 +1,89 @@
+# 堆内存分析和优化
+
+计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放，随着程序的运行占用的内存越来越大，一方面会影响程序的稳定性，可能让运行速度越来越慢，或者造成oom，甚至会影响运行程序的机器的稳定性，造成宕机。
+
+
+目前有很多内存泄漏分析工具，比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro), [gperftools](https://gperftools.github.io/gperftools/)。
+
+因为Fluid是用Python驱动C++ core来运行，valgrind直接分析非常困难，需要自己编译debug版本的、带valgrind支持的专用Python版本，而且输出的信息中大部分是Python自己的符号和调用信息，分析起来很困难，另外使用valgrind会让程序运行速度变得非常慢，所以不建议使用。
+
+本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。
+
+gperftools主要支持以下四个功能：
+
+- thread-caching malloc
+- heap-checking using tcmalloc
+- heap-profiling using tcmalloc
+- CPU profiler
+
+Paddle也提供了基于gperftools的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。
+
+对于堆内存的分析，主要用到thread-caching malloc和heap-profiling using tcmalloc。
+
+## 环境
+
+本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev，基于Ubuntu 16.04.4 LTS环境。
+
+## 使用流程
+
+- 安装google-perftools
+
+```
+apt-get install libunwind-dev 
+apt-get install google-perftools
+```
+
+- 安装pprof
+
+```
+go get -u github.com/google/pprof
+```
+
+- 设置运行环境
+
+```
+export PPROF_PATH=/root/gopath/bin/pprof
+export PPROF_BINARY_PATH=/root/gopath/bin/pprof
+export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
+```
+
+- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。
+
+```
+# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀
+# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少存储dump一次，默认1GB
+env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py
+```
+
+随着程序的运行，会在perf_log这个文件夹下生成很多文件，如下：
+
+```
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0001.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0002.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0003.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0004.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0005.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0006.heap
+```
+
+- 使用pprof对heap文件进行分析。分析有两种模式：
+	- 完整模式。会对当前heap做一个分析，显示目前分配内存一些调用路径。
+
+	```
+	pprof --pdf python test.log.0012.heap
+	```
+	上述命令会生成一个profile00x.pdf的文件，可以直接打开，例如：[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出，在CPU版本fluid的运行过程中，分配存储最多的模块是CPUAllocator。而别的模块相对而言分配内存较少，所以被忽略了，这对于分析内存泄漏是很不方便的，因为泄漏是一个缓慢的过程，在这种图中是无法看到的。
+	
+	![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png)
+	
+	- Diff模式。可以对两个时刻的heap做diff，把一些内存分配没有发生变化的模块去掉，而把增量部分显示出来。
+	```
+	pprof --pdf --base test.log.0010.heap python test.log.1045.heap
+	```
+	生成的结果为：[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf)
+	
+	从图中可以看出：ProgramDesc这个结构，在两个版本之间增长了200MB+，所以这里有很大的内存泄漏的可能性，最终结果也确实证明是这里造成了泄漏。
+	
+	![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png)
+	![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png)
+	
diff --git a/doc/fluid/howto/optimization/index_cn.rst b/doc/fluid/howto/optimization/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..27cc96702356703b339db845dc81913bdcc9f23b
--- /dev/null
+++ b/doc/fluid/howto/optimization/index_cn.rst
@@ -0,0 +1,9 @@
+性能优化
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  timeline_cn.md
+  cpu_profiling_cn.md
+  benchmark/index_cn.rst
diff --git a/doc/fluid/howto/optimization/index_en.rst b/doc/fluid/howto/optimization/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4ce624fe8f108a6afc7cd08a1542332755d22e04
--- /dev/null
+++ b/doc/fluid/howto/optimization/index_en.rst
@@ -0,0 +1,9 @@
+Performance Optimization
+---------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  timeline_en.md
+  cpu_profiling_en.md
+  benchmark/index_en.rst
diff --git a/doc/fluid/howto/optimization/pprof_1.png b/doc/fluid/howto/optimization/pprof_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb
Binary files /dev/null and b/doc/fluid/howto/optimization/pprof_1.png differ
diff --git a/doc/fluid/howto/optimization/pprof_2.png b/doc/fluid/howto/optimization/pprof_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b
Binary files /dev/null and b/doc/fluid/howto/optimization/pprof_2.png differ
diff --git a/doc/fluid/howto/optimization/timeline.jpeg b/doc/fluid/howto/optimization/timeline.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..38ec3f80c982857531f30a8bb0fa26ea5bf05385
Binary files /dev/null and b/doc/fluid/howto/optimization/timeline.jpeg differ
diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..faf39f276dbddcd4961407ba2d082c9826051cbe
--- /dev/null
+++ b/doc/fluid/howto/optimization/timeline_cn.md
@@ -0,0 +1,32 @@
+# 如何使用timeline工具做性能分析
+
+1. 在训练的主循环外加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后，代码会在`/tmp/profile`目录下生成一个profile的记录文件。
+
+	**提示：**
+	请不要在timeline记录信息时运行太多次迭代，因为timeline中的记录数量和迭代次数是成正比的。
+
+	```python
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
+	            ...
+	```
+
+1. 运行`python Paddle/tools/timeline.py`来处理`/tmp/profile`，这个程序默认会生成一个`/tmp/timeline`文件，你也可以用命令行参数来修改这个路径，请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。
+```bash
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
+1. 打开chrome浏览器，访问<chrome://tracing/>，用`load`按钮来加载生成的`timeline`文件。
+
+	![chrome tracing](./tracing.jpeg)
+
+1. 结果如下图所示，可以放大来查看timeline的细节信息。
+
+	![chrome timeline](./timeline.jpeg)
diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..6f963c6b4da6967fb2f493ada917a4b08917fa4c
--- /dev/null
+++ b/doc/fluid/howto/optimization/timeline_en.md
@@ -0,0 +1,33 @@
+# How to use the timeline tool for profiling
+
+1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After the run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, because the profile record grows with the number of batches.
+
+	```python
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
+	            ...
+	```
+
+1. Run `python Paddle/tools/timeline.py` to process `/tmp/profile`; it will generate another
+file `/tmp/timeline` by default. You can change the path with a command-line parameter; please take a look at
+[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
+
+```bash
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
+1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
+
+	![chrome tracing](./tracing.jpeg)
+
+1. The resulting timeline should be like:
+
+
+	![chrome timeline](./timeline.jpeg)
diff --git a/doc/fluid/howto/optimization/tracing.jpeg b/doc/fluid/howto/optimization/tracing.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..3a49fc4f8a401a9463b0157e2f38c164ca02dcc5
Binary files /dev/null and b/doc/fluid/howto/optimization/tracing.jpeg differ
diff --git a/doc/fluid/howto/performance/error_clip.md b/doc/fluid/howto/performance/error_clip.md
new file mode 100644
index 0000000000000000000000000000000000000000..749cf7693c75696feb17f8556224ed03649baa80
--- /dev/null
+++ b/doc/fluid/howto/performance/error_clip.md
@@ -0,0 +1,92 @@
+# Error Clip
+
+## Overview
+
+Error clip is widely used in model training to prevent gradient exploding. It takes some specific rules to adjust variables' gradients and prevent them from being too large. With it, values of a gradient will be checked before they are taken by the next `grad_op` and be shrunk if necessary.
+## Usage
+
+Users are allowed to assign different error clip methods or attributes to different `Variable`s. Users can specify it as a parameter of `Variable`'s constructor:
+
+```python
+var = framework.Variable(..., error_clip=myErrorClip, ...)
+```
+
+The default value of `error_clip` is `None`, which means no error clip is employed. When it's not `None`, it should take an object of `BaseErrorClipAttr`'s derived class. So far, `BaseErrorClipAttr` has only one derived class: `ErrorClipByValue`, whose constructor is:
+
+```python
+ErrorClipByValue(max, min=None)
+```
+
+`max` and `min` represent the maximal and minimal clip threshold respectively. In backward pass, all values of `var`'s gradient greater than `max` or less than `min` will be clipped to `max` and `min` respectively. When the `min` is None, the minimal threshold will be assigned with `-max` automatically.
+
+So we can enable the error clip with threshold `[-5.0, 5.0]` for variable `var` by:
+
+```python
+var = framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
+```
+
+## Implementation
+
+The `BaseErrorClipAttr` and its derived class `ErrorClipByValue` are defined in *clip.py*.
+
+```python
+class BaseErrorClipAttr(object):
+    def append_clip_op(self, block, grad_name):
+        raise NotImplementedError()
+
+
+class ErrorClipByValue(BaseErrorClipAttr):
+    def __init__(self, max, min=None):
+        max = float(max)
+        if min is None:
+            min = -max
+        else:
+            min = float(min)
+        self.max = max
+        self.min = min
+
+    def append_clip_op(self, block, grad_name):
+        clip_op_desc = block.desc.append_op()
+        clip_op_desc.set_type("clip")
+        clip_op_desc.set_input("X", [grad_name])
+        clip_op_desc.set_output("Out", [grad_name])
+        clip_op_desc.set_attr("min", self.min)
+        clip_op_desc.set_attr("max", self.max)
+```
+
+The `BaseErrorClipAttr` has one main member function: `append_clip_op(self, block, grad_name)`.
+
+This function is used to create a `clip_op` and append it to the end of the given `block`. Because different error clip algorithms require different `clip_op`s, the function is defined as virtual in the base class. All derived classes must implement their own versions of this function.
+
+These `clip_op`s should be inserted after `grad_op`s whose output gradients need to be clipped. It is equivalent to appending some `clip_op`s to the end of the target block every time a new `grad_op` is added.
+
+```python
+for op_desc in grad_op_descs:
+        new_op_desc = target_block.desc.append_op()
+        new_op_desc.copy_from(op_desc)
+        callback(block=target_block, context=grad_to_var)
+```
+
+Here we employ a callback function to complete this kind of job. In the `_append_backward_ops_` function, each time after a `grad_op` is added to the `target_block`, a callback function is invoked. The logic of `clip_op` appending can be implemented inside the callback function.
+
+The callback function for `clip_op` appending is defined in *clip.py*:
+
+```python
+def error_clip_callback(block, context):
+    # the context is a grad_to_var map
+    grad_to_var = context
+    op_desc = block.desc.op(block.desc.op_size() - 1)
+    for grad_n in filter(lambda n: grad_to_var.has_key(n),
+                         op_desc.output_arg_names()):
+        fwd_var = block.__var_recursive(grad_to_var[grad_n])
+        error_clip = getattr(fwd_var, "error_clip", None)
+        if not (error_clip is None or isinstance(error_clip,
+                                                 BaseErrorClipAttr)):
+            raise TypeError(
+                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
+            )
+        if error_clip is not None:
+            error_clip.append_clip_op(block, grad_n)
+```
+
+This function takes a `block` and a `context`(which is actually a grad\_to\_var map) as inputs. It checks each output of the last `OpDesc` in the `block`. Notice that the last `OpDesc` of the `block` must be a `grad_op` and its outputs must be some forward variables' gradients. If an output gradient's corresponding forward variable has an attribute of `error_clip`, `error_clip_callback` will call the `error_clip`'s `append_clip_op` function to append the required `clip_op` into the `block`.
diff --git a/doc/fluid/howto/performance/images/profiler.png b/doc/fluid/howto/performance/images/profiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1
Binary files /dev/null and b/doc/fluid/howto/performance/images/profiler.png differ
diff --git a/doc/fluid/howto/performance/profiler.md b/doc/fluid/howto/performance/profiler.md
new file mode 100644
index 0000000000000000000000000000000000000000..ee96e7c74ce317caddb387cbb1d4998937bd5c81
--- /dev/null
+++ b/doc/fluid/howto/performance/profiler.md
@@ -0,0 +1,97 @@
+## Introduction
+
+There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). Most popular deep learning frameworks use several programming languages and adapt to heterogeneous platforms. Similar to most of the deep learning frameworks, PaddlePaddle also uses C++, CUDA and Python as the basic programming languages to adapt to run on CPU and GPU devices.  The [`nvprof` tool](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse CUDA programs.  We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) on profiling CPU and Python programs with [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof), but these can profile only the CPU and Python parts of a program. For [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), however, the operator is the basic computing unit, and developers usually want to collect the time of each operator and locate bottlenecks.  `nvprof` usually collects the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory set and CUDA API calls, as well as events or metrics for CUDA kernels. `yep` and `Google's perftools` can't collect the timeline for CUDA programs. None of these tools can collect time at the operator level. So we design this profiling tool.
+
+## Architecture
+
+The work flow for most tasks is shown below. Each operator runs many times across all iterations, so the profiler must collect the total time of each operator over the iterations. Moreover, developers may sometimes want to collect a more detailed time span inside an operator or record a time span elsewhere; this requires the profiler to support recording nested time spans. In order to speed up training, all deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs, so the profiler must be able to collect the timeline for each thread. In addition, the profiler itself occupies certain resources, so it must be easy for developers to enable or disable. Finally, the profiler should present a human-readable report.  
+
+```python
+for i in xrange(M):  # M is  the iteration number
+  for op in operator_lists: # The `operator_lists` contains all the operators in the network.
+    op.run();
+```
+
+In summary, the profiler should have the following features:
+
+- records time span in loop.
+- supports nested time span.
+- supports multiple threads/multiple GPUs.
+- supports to be enabled and disabled by users.
+
+But how to record the time for a mixed C++ and CUDA program?  There are many C++ APIs to get the current calendar time in the host program. But on GPU, CUDA kernels may be executed concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams), and CUDA kernels run asynchronously with respect to the host program unless there is a synchronization after the kernel launch. CUDA provides [events](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA events, we also design and apply events to record the timeline, then summarize and present statistics based on these events.  
+
+The overall flow is shown as the following figure.
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/profiler.png" align="center"/><br/>
+
+### Event
+
+In the above work flow, a pair of events is needed before and after the piece of code whose time is collected. So the event has a flag to mark whether it is a starting event or an ending event. Besides these two kinds of events, sometimes only a marker with a text message is needed, for example, a marker to specify the start or end of profiling. There are three kinds of events:
+
+```c++
+enum EventKind {
+  kMark,
+  kPushRange,
+  kPopRange};
+```
+- kMark: only a marker without time range.
+- kPushRange: mark the starting event for time range.
+- kPopRange: mark the ending event for time range.
+
+For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used.  For many pieces of code, an event list is used to record each piece.
+
+```c++
+class Event {
+ public:
+  // The DeviceContext is used to get current  CUDA stream.
+  Event(EventKind kind, std::string name, uint32_t thread_id,
+        const platform::DeviceContext* dev_ctx = nullptr);
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;
+
+ private:
+  EventKind kind_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;
+#endif
+};
+
+struct EventList {
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+```
+
+As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler.
+
+```c++
+enum ProfilerState {
+  kDisabled,
+  kCPU,
+  kCUDA
+};
+ProfilerState g_state;
+```
+- kDisabled: the disabled state.
+- kCPU: CPU profiling state.
+- kCUDA: GPU profiling state.
+
+A pair of starting and ending events are pushed to event lists in constructor and destructor of `RecordEvent`. So the timeline is recorded for the code in the lifecycle of an object of `RecordEvent`.
+
+```c++
+struct RecordEvent {
+  explicit RecordEvent(const std::string name,
+                       platform::DeviceContext* dev_ctx = nullptr) {
+    if (g_state == ProfilerState::kDisabled) return;
+    // profiler is enabled: push the starting event to the event lists.
+  }
+  ~RecordEvent() {
+    if (g_state == ProfilerState::kDisabled) return;
+    // profiler is enabled: push the ending event to the event lists.
+  }
+};
+```
diff --git a/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle b/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..cb5bc420ceafe8ba4c87694d44ee4e5e4ad06779
Binary files /dev/null and b/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle differ
diff --git a/doc/fluid/howto/third_party/images/multigpu_allreduce.png b/doc/fluid/howto/third_party/images/multigpu_allreduce.png
new file mode 100644
index 0000000000000000000000000000000000000000..87a1b3e8f6dd4a713ec9df9f0037d1da04e9178a
Binary files /dev/null and b/doc/fluid/howto/third_party/images/multigpu_allreduce.png differ
diff --git a/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle b/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..6c35ab1b21fb76ceae82d3693ed0d085b5bc0855
Binary files /dev/null and b/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle differ
diff --git a/doc/fluid/howto/third_party/images/multigpu_before_convert.png b/doc/fluid/howto/third_party/images/multigpu_before_convert.png
new file mode 100644
index 0000000000000000000000000000000000000000..9c8f7711165d80a2fa3911280fdee91855a401b1
Binary files /dev/null and b/doc/fluid/howto/third_party/images/multigpu_before_convert.png differ
diff --git a/doc/fluid/howto/third_party/mkldnn_fluid.md b/doc/fluid/howto/third_party/mkldnn_fluid.md
new file mode 100644
index 0000000000000000000000000000000000000000..bef126f3f0577b69f646dfe5d10539b372c6a8a5
--- /dev/null
+++ b/doc/fluid/howto/third_party/mkldnn_fluid.md
@@ -0,0 +1,149 @@
+# Design Doc: Add MKLDNN Kernel in Fluid Operator
+
+## Principles
+
+First of all, we should follow some basic principles like:
+1.  [How to write a new operator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md). We are trying to add a new kind of kernel into operators, so basically we should follow this doc.
+2.  [Supporting new Device/Library](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md). Since MKLDNN is a new library to fluid, we should add `MKLDNNDeviceContext` and maybe `mkldnn_helper.h`, just like [cudnn_helper.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/cudnn_helper.h).
+3.  [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). Another important point is that we should ensure the data synchronization between different kernel types, which is this [topic](https://github.com/PaddlePaddle/Paddle/issues/6549). So basically we should override `GetExpectedKernelType` and `trans` functions to support switching kernels.
+4.  [The Keys of Operator Kernel Type](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). Kernel Type is a pivotal concept which can record the `Place`, `Library`, `DataType` and `Layout`.
+
+## Solution
+
+In general, there are four parts we should follow to run a MKL-DNN primitive.
+-  Create a primitive descriptor that describes this operator
+-  Create the primitive itself from the primitive descriptor and the engine
+-  Create all memory buffers that the primitive needs
+-  Launch a stream to execute the created primitive
+More details can be found [here](http://01org.github.io/mkl-dnn).
+
+It's better to avoid reinitialization of primitives and memory handles in the first three stages in every iteration. \
+So we plan to create a map to record all the `primitive` and `memory`, which should not take too much memory as discussed [here](https://github.com/PaddlePaddle/Paddle/issues/6822).
+
+It's assumed that the following three conditions are satisfied.
+1. there is a unique key for each operator instance. It may be the actual name of the `Output Tensor`.
+2. the `Input Tensor` inside the `Compute` function is the one after conversion.
+3. we can get the phase (e.g. `is_test`) inside the `Compute` function; otherwise we need to expose this attribute to the user.
+
+### Compute
+The algorithm of `Compute` is described as follows; let's take conv as an example.
+
+```c++
+
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace.");
+  PADDLE_ENFORCE(platform::is_mkldnn_library(ctx.GetLibrary()), "It must use MKLDNN Library.");
+
+  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
+
+  // find the primitive by its unique key in the mkldnn context
+  // the op_key should be a unique name of this op instance
+  auto& p = dev_ctx.findPrimitive(op_key + "_fwd");
+
+  // assuming the input tensor inside this compute function is the already-converted one
+  // this should be guaranteed by another mechanism
+  auto& i = dev_ctx.findMemory(op_key + "_input");
+  
+  if (p == nullptr || i == nullptr || inputSizeChanged(p, i))  {
+    auto fwd_primitive_desc = createPrimitiveDesc(ctx);
+    auto* input = ctx.Input<Tensor>("Input");
+    auto* filter = ctx.Input<Tensor>("Filter");
+    auto* output = ctx.Output<Tensor>("Output");
+    shared_ptr<mkldnn::memory> in(new mkldnn::memory(fwd_primitive_desc->src_primitive_desc(), input->data<T>()));
+    shared_ptr<mkldnn::memory> wgt(new mkldnn::memory(fwd_primitive_desc->weights_primitive_desc(), filter->data<T>()));
+    shared_ptr<mkldnn::memory> out(new mkldnn::memory(fwd_primitive_desc->dst_primitive_desc(), output->mutable_data<T>(ctx.GetPlace())));
+    shared_ptr<mkldnn::conv_fwd> fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out));
+
+    dev_ctx.addMemory(op_key+"_input", in);
+    dev_ctx.addMemory(op_key+"_output", out);
+    dev_ctx.addMemory(op_key+"_filter", wgt);
+    dev_ctx.addPrimitive(op_key+"_fwd", fwd_primitive);
+    dev_ctx.addPrimitiveDesc(op_key+"_fwd_PD", fwd_primitive_desc);
+  }
+
+  p = dev_ctx.findPrimitive(op_key + "_fwd");
+
+  PADDLE_ENFORCE(p, "Should have forward Primitive");
+  PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_input"), "Should have input memory");
+  PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_output"), "Should have output memory");
+  PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_filter"), "Should have filter memory");
+  PADDLE_ENFORCE(dev_ctx.findPrimitiveDesc(op_key+"_fwd_PD"), "Should have forward PrimitiveDesc");
+  dev_ctx.submit(p);
+  dev_ctx.execute();  // the convert primitive should already be contained.
+
+```
+
+The `createPrimitiveDesc` returns the primitive descriptor of this operator, and would be like this:
+```c++
+  auto* input = ctx.Input<Tensor>("Input");
+  auto* filter = ctx.Input<Tensor>("Filter");
+  auto* output = ctx.Output<Tensor>("Output");
+  std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+  std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+  int groups = ctx.Attr<int>("groups");
+  algorithm algo = static_cast<algorithm>(ctx.Attr<int>("convolution_algorithm_option"));
+  prop_kind pk = ctx.Attr<bool>("is_test") ? prop_kind::forward_inference : prop_kind::forward_training;
+    
+  auto fwd_desc = mkldnn::conv_fwd::desc(/* all the setting above*/);
+  shared_ptr<mkldnn::conv_fwd::primitive_desc> fwd_primitive_desc(new mkldnn::conv_fwd::primitive_desc(fwd_desc, ctx.getEngine()));
+
+  return fwd_primitive_desc;
+  }
+```
+
+### MKLDNNDeviceContext
+`MKLDNNDeviceContext`, which is very straightforward, should contain some base information like: `stream`, `engine` and the map needed.
+
+
+### mkldnn_helper
+Some functions would be put in `paddle/platform/mkldnn_helper.h`.
+- create MKLDNN memories
+- create MKLDNN primitives
+- error check function
+- etc
+
+
+### Kernel Switch
+We should `reorder` the different Layout from other device or to other device. `GetExpectedKernelType` and `trans` functions can help us to implement it.
+
+`GetExpectedKernelType` should get the context, and this operator can return the best `KernelType`. 
+`trans` would be like this:
+
+```c++
+void trans(inputs, ctx) override {
+  if (NoNeedTrans()) {
+    return;
+  }
+  // find the reorder primitive by op_key from the context
+  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
+  auto& p = dev_ctx.findPrimitive(op_key + "_reorder_input");
+  auto& i = dev_ctx.findMemory(op_key + "_src_input");
+
+  if (p == nullptr || i == nullptr || changeSized(i, input)) {
+    auto prim = createPrimitiveDesc(ctx);
+    auto src = createMemory(memoryDesc(input->dims(), actual_layout), input->data);
+    auto newbuffer = paddle::memory::Alloc(ctx.GetPlace(), input->size_in_bytes());
+    auto dst = createMemory(prim->expected_desc(), newbuffer->data);
+    auto reorder_primitive(new mkldnn::reorder(src, dst));
+
+    dev_ctx.addMemory(op_key+"_src_input", src);
+    dev_ctx.addMemory(op_key+"_input", dst);
+    dev_ctx.addPrimitive(op_key+"_reorder_input", reorder_primitive);
+  }
+
+  p = dev_ctx.findPrimitive(op_key + "_reorder_input");
+  PADDLE_ENFORCE(p, "Should have Reorder Primitive");
+  dev_ctx.submit(p);
+  if (! this->isMKLDNNKernel()) {
+    // execute immediately only if this is not an mkldnn kernel function.
+    // otherwise, it can be executed with the operator primitive in Compute
+    dev_ctx.execute();
+  }
+  // after submit, the input tensor in ExecutionContext should be replaced by the converted one
+  // there should be another mechanism to ensure this
+}
+```
+
+### Unit Test
+All the functions should be tested correspondingly.
+TBD
diff --git a/doc/fluid/howto/third_party/paddle_nccl.md b/doc/fluid/howto/third_party/paddle_nccl.md
new file mode 100644
index 0000000000000000000000000000000000000000..c7dac70998a6cfec3a6d2fc72b698ff9722e6805
--- /dev/null
+++ b/doc/fluid/howto/third_party/paddle_nccl.md
@@ -0,0 +1,65 @@
+# Design Doc: NCCL support in Paddle Fluid
+
+## Abstract
+
+This Design Doc refers to the NCCL feature in  paddle.  We propose an approach to support NCCL library both on a single machine and multiple machines. We wrap the NCCL primitives `Broadcast`, `Allreduce`, `Reduce` as operators to utilize Multi-GPU powers in one script.
+
+
+## Motivation
+
+[NCCL](https://developer.nvidia.com/nccl) is an NVIDIA library that supports Multi-GPU communication and is optimized for NVIDIA GPUs. It provides routines such as all-gather, all-reduce, broadcast, reduce and reduce-scatter that can achieve high bandwidth over PCIe and NVLink high-speed interconnects. With the NCCL library, we can easily accelerate training in parallel. 
+
+- Pros
+1. easily plug-in with [NCCL2](https://developer.nvidia.com/nccl) library.
+1. high performance in NVIDIA GPUs.
+1. MPI like primitives, which have low learning cost for users.
+
+- Cons
+1. Only designed for NVIDIA GPUs, not a general multi-device solution.
+1. Although NCCL1 is open-sourced under the BSD license, NCCL2 is no longer open source.
+
+At the beginning of training, the framework needs to distribute the same parameters to every GPU, and merge the gradients whenever the user requires it.
+
+As a result, during training, we need the operations of peer to peer copy between different GPUs, aggregating gradients/parameters from GPUs, and broadcasting parameters to GPUs. Every GPU only need to run the operator with correct place information.
+
+Besides, it needs interfaces to synchronize model update with each different GPU Cards. 
+
+## Implementation
+
+As mentioned above, we wrap the NCCL routines as several kinds of operators. Note that NCCL needs to create a Communicator between GPUs at the beginning, so an NCCLInit operator is created.
+
+### Transpiler
+
+To be compatible with [parameter server design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md), the transpiler compiles the user defined operation graph into sub-graphs to be executed on different devices.
+
+1. The user-defined model will be a single device program
+
+2. Broadcast/Reduce operators between GPUs will be inserted into the program; for multi-node training, `Send` and `Recv` operators may be inserted as well.
+
+   *Broadcast, AllReduce in a single machine. And Broadcast, AllReduce, [Send, Recv](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md#graph-converter) in multiple machines*
+
+   <img src="images/multigpu_before_convert.png" width="300"/>
+
+After compiling, the graph is as shown below:
+
+<img src="images/multigpu_allreduce.png" width="1000"/>
+
+Operators are added to the sub-graphs. Every GPU is assigned a role such as `rank0`, `rank1`, etc. 
+
+- **Broadcast**. The Broadcast operator distributes the initialized parameters to all the GPUs from the GPU that owns them, e.g. from the `rank0` GPU.
+- **AllReduce**. The AllReduce operator synchronizes parameters/gradients between GPUs. AllReduce is implemented with the ring-based communication method, which avoids a bottleneck on a single GPU.
+
+Note that the AllReduce operator forces the GPUs to synchronize at that point. Whether the whole training process runs in asynchronous or synchronous mode depends on where the AllReduce point is in the graph.
+
+As shown in the picture, each GPU computes the gradient of `W`, followed by an `AllReduce` operator that accumulates `dW` over the full batch of data; each GPU then runs the optimization process individually and applies the gradient to its own `W`.
+
+- **AllReduce**
+  Note that our AllReduce operator is a ring-based AllReduce implementation. If we used the NCCL2 AllReduce primitive, every GPU would optimize the full batch of data, wasting (n-1) GPUs' compute resources. In addition, the NCCL2 built-in AllReduce only utilizes the communication resources during synchronization, and updating the gradient is then a subsequent phase. In fact, we can amortize the gradient update time cost into the communication phase. The process is:
+1. Every parameter has its root card. That card will be responsible for aggregating the gradients from the GPUs.
+2. The whole model's parameters will be hashed to different root cards, ensuring load balance between GPUs.
+3. Each card will start sending the parameter to its logical neighbor. After one round, the parameter's main card will have aggregated the full gradients.
+4. Then the root card will optimize the parameter.
+5. This parameter's card will send its optimized result to its neighbor, then the neighbor will send the parameter to its next one.
+6. Finish the synchronization round.
+
+The total time cost will be 2 * (n-1) * per-parameter-send-time, so we reach the goal of amortizing the update time into the communication phase.
diff --git a/doc/fluid/images/1.png b/doc/fluid/images/1.png
new file mode 100644
index 0000000000000000000000000000000000000000..67daf566f91aab570e60971c4ea8e2be876e214d
Binary files /dev/null and b/doc/fluid/images/1.png differ
diff --git a/doc/fluid/images/2.png b/doc/fluid/images/2.png
new file mode 100644
index 0000000000000000000000000000000000000000..43367777f41449a666e7a3b571f09ac5d5dfb1ae
Binary files /dev/null and b/doc/fluid/images/2.png differ
diff --git a/doc/fluid/images/2_level_rnn.dot b/doc/fluid/images/2_level_rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..5d77865061ca7bbbfcf254dd938f09aef5553505
--- /dev/null
+++ b/doc/fluid/images/2_level_rnn.dot
@@ -0,0 +1,56 @@
+digraph G {
+
+  rnn [label="1st level RNN" shape=box]
+
+  subgraph cluster0 {
+    label = "time step 0"
+
+    sent0 [label="sentence"]
+    sent1 [label="sentence"]
+
+    rnn1 [label="2nd level RNN" shape=box]
+
+    sent0 -> rnn1
+    sent1 -> rnn1
+  }
+
+  subgraph cluster1 {
+    label = "time step 1"
+
+    sent2 [label="sentence"]
+    sent3 [label="sentence"]
+
+    rnn2 [label="2nd level RNN" shape=box]
+
+    sent2 -> rnn2
+    sent3 -> rnn2
+  }
+
+  subgraph cluster2 {
+    label = "time step 2"
+
+    sent4 [label="sentence"]
+    sent5 [label="sentence"]
+
+    rnn3 [label="2nd level RNN" shape=box]
+
+    sent4 -> rnn3
+    sent5 -> rnn3
+  }
+
+
+  para0 [label="paragraph info 0"]
+  para1 [label="paragraph info 1"]
+  para2 [label="paragraph info 2"]
+
+  rnn1 -> para0
+  rnn2 -> para1
+  rnn3 -> para2
+
+  para0 -> rnn
+  para1 -> rnn
+  para2 -> rnn
+
+  chapter [label="chapter info"]
+  rnn -> chapter
+}
diff --git a/doc/fluid/images/2_level_rnn.png b/doc/fluid/images/2_level_rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..0537a75beb175c0c284717421f7aa908da2a5038
Binary files /dev/null and b/doc/fluid/images/2_level_rnn.png differ
diff --git a/doc/fluid/images/3.png b/doc/fluid/images/3.png
new file mode 100644
index 0000000000000000000000000000000000000000..481021ef306e2596818aab7fe17a570754f63635
Binary files /dev/null and b/doc/fluid/images/3.png differ
diff --git a/doc/fluid/images/4.png b/doc/fluid/images/4.png
new file mode 100644
index 0000000000000000000000000000000000000000..4279f41e06de459f18b9a622539511d555e9a0af
Binary files /dev/null and b/doc/fluid/images/4.png differ
diff --git a/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455
Binary files /dev/null and b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg differ
diff --git a/doc/fluid/images/LoDTensor.png b/doc/fluid/images/LoDTensor.png
new file mode 100644
index 0000000000000000000000000000000000000000..75369f5378309e0f304b83f6bb69bdb195eac079
Binary files /dev/null and b/doc/fluid/images/LoDTensor.png differ
diff --git a/doc/fluid/images/asgd.gif b/doc/fluid/images/asgd.gif
new file mode 100644
index 0000000000000000000000000000000000000000..4a0da7bf6df9326a2aab1638b77c5455c18b8c4e
Binary files /dev/null and b/doc/fluid/images/asgd.gif differ
diff --git a/doc/fluid/images/batch_norm_fork.dot b/doc/fluid/images/batch_norm_fork.dot
new file mode 100644
index 0000000000000000000000000000000000000000..4bc47713cba2cb23f1b34fffe6426ef10ac3a9df
--- /dev/null
+++ b/doc/fluid/images/batch_norm_fork.dot
@@ -0,0 +1,25 @@
+digraph ImageBatchNormForkGragh {
+  subgraph cluster_before {
+    Prev [label="...", shape=plaintext];
+    Rnn [label="rnn_op", shape=box];
+    BatchNorm [label="batch_norm_op", shape=box];
+    Fc [label="fc_op", shape=box];
+    After [label="...", shape=plaintext];
+    Prev -> Rnn -> BatchNorm -> Fc -> After;
+    label="original";
+  }
+
+  subgraph cluster_after {
+    Prev2 [label="...", shape=plaintext];
+    Rnn2 [label="rnn_op", shape=box];
+    BatchNorm2_1 [label="train_batch_norm_op", shape=box];
+    BatchNorm2_2 [label="infer_batch_norm_op", shape=box];
+    Fc2_1 [label="fc_op", shape=box];
+    Fc2_2 [label="fc_op", shape=box];
+    After2_1 [label="...", shape=plaintext];
+    After2_2 [label="...", shape=plaintext];
+    Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1;
+    Rnn2 -> BatchNorm2_2 ->Fc2_2 ->After2_2
+    label="forked";
+  }
+}
diff --git a/doc/fluid/images/batch_norm_fork.png b/doc/fluid/images/batch_norm_fork.png
new file mode 100644
index 0000000000000000000000000000000000000000..aded62bce5bc268b7a3ef4dc96c89fe21d6ea955
Binary files /dev/null and b/doc/fluid/images/batch_norm_fork.png differ
diff --git a/doc/fluid/images/batch_norm_op_kernel.png b/doc/fluid/images/batch_norm_op_kernel.png
new file mode 100644
index 0000000000000000000000000000000000000000..a99ce81ff3bf42880ebbd6a1297de3bf038e09b2
Binary files /dev/null and b/doc/fluid/images/batch_norm_op_kernel.png differ
diff --git a/doc/fluid/images/beam_search.png b/doc/fluid/images/beam_search.png
new file mode 100644
index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae
Binary files /dev/null and b/doc/fluid/images/beam_search.png differ
diff --git a/doc/fluid/images/ci_build_whl.png b/doc/fluid/images/ci_build_whl.png
new file mode 100644
index 0000000000000000000000000000000000000000..232762b82a9ae3e979a1f38a7beb715c87438f40
Binary files /dev/null and b/doc/fluid/images/ci_build_whl.png differ
diff --git a/doc/fluid/images/compile_run_time.png b/doc/fluid/images/compile_run_time.png
new file mode 100644
index 0000000000000000000000000000000000000000..0bc9b2fd0e81b4851e6d96171ccb9a05d0f42a48
Binary files /dev/null and b/doc/fluid/images/compile_run_time.png differ
diff --git a/doc/fluid/images/compiler.graffle b/doc/fluid/images/compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..8cc678fea3c820103e7ce81f7a5d625d6c1d92de
Binary files /dev/null and b/doc/fluid/images/compiler.graffle differ
diff --git a/doc/fluid/images/compiler.png b/doc/fluid/images/compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..65d34f841afce9756def07dd8ecb9ca44e658bfe
Binary files /dev/null and b/doc/fluid/images/compiler.png differ
diff --git a/doc/fluid/images/control_flow_graph.png b/doc/fluid/images/control_flow_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3579998e58d07abc50bd3332128d4733a391cb3b
Binary files /dev/null and b/doc/fluid/images/control_flow_graph.png differ
diff --git a/doc/fluid/images/dataflow_equations.png b/doc/fluid/images/dataflow_equations.png
new file mode 100644
index 0000000000000000000000000000000000000000..c10f7f69f4007952e5b0394edaa04efa1cfbb658
Binary files /dev/null and b/doc/fluid/images/dataflow_equations.png differ
diff --git a/doc/fluid/images/dcgan.png b/doc/fluid/images/dcgan.png
new file mode 100644
index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28
Binary files /dev/null and b/doc/fluid/images/dcgan.png differ
diff --git a/doc/fluid/images/deep_learning.png b/doc/fluid/images/deep_learning.png
new file mode 100644
index 0000000000000000000000000000000000000000..026becc4d94e01e407dacb2a5314a0e5723334ff
Binary files /dev/null and b/doc/fluid/images/deep_learning.png differ
diff --git a/doc/fluid/images/dist-graph.graffle b/doc/fluid/images/dist-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..941399c6ced8d5f65b6c595522b770c88259df4b
Binary files /dev/null and b/doc/fluid/images/dist-graph.graffle differ
diff --git a/doc/fluid/images/dist-graph.png b/doc/fluid/images/dist-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3546b09f1c2ee3e4f60f519d5e47f823f08051a7
Binary files /dev/null and b/doc/fluid/images/dist-graph.png differ
diff --git a/doc/fluid/images/distributed_architecture.graffle b/doc/fluid/images/distributed_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d1b60141342232e06227c2d430ebc60ec349a907
Binary files /dev/null and b/doc/fluid/images/distributed_architecture.graffle differ
diff --git a/doc/fluid/images/distributed_architecture.png b/doc/fluid/images/distributed_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356
Binary files /dev/null and b/doc/fluid/images/distributed_architecture.png differ
diff --git a/doc/fluid/images/ds2_network.png b/doc/fluid/images/ds2_network.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11
Binary files /dev/null and b/doc/fluid/images/ds2_network.png differ
diff --git a/doc/fluid/images/executor.png b/doc/fluid/images/executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..b29c0d779e3d46b779b5baeabe3176adaeb00a6d
Binary files /dev/null and b/doc/fluid/images/executor.png differ
diff --git a/doc/fluid/images/feed_forward.png b/doc/fluid/images/feed_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..d312371a04c26aa6cd196e0bd1f51becb425180b
Binary files /dev/null and b/doc/fluid/images/feed_forward.png differ
diff --git a/doc/fluid/images/feed_forward_regularized.png b/doc/fluid/images/feed_forward_regularized.png
new file mode 100644
index 0000000000000000000000000000000000000000..677e99bfd9f8e72ed9fe4b27127af2ced202f447
Binary files /dev/null and b/doc/fluid/images/feed_forward_regularized.png differ
diff --git a/doc/fluid/images/fluid-compiler.graffle b/doc/fluid/images/fluid-compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..c933df2cb855462c52b2d25f7f9a99b95652961d
Binary files /dev/null and b/doc/fluid/images/fluid-compiler.graffle differ
diff --git a/doc/fluid/images/fluid-compiler.png b/doc/fluid/images/fluid-compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b0ffed2039c91a3a00bbb719da08c91c3acf7bb
Binary files /dev/null and b/doc/fluid/images/fluid-compiler.png differ
diff --git a/doc/fluid/images/fluid_examples.png b/doc/fluid/images/fluid_examples.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa99472c0f914cde128fd7b3bd8dc29ac24f94b6
Binary files /dev/null and b/doc/fluid/images/fluid_examples.png differ
diff --git a/doc/fluid/images/fluid_module_1.png b/doc/fluid/images/fluid_module_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..554782ba54e43efc3d6babbb94e3cac3530ac649
Binary files /dev/null and b/doc/fluid/images/fluid_module_1.png differ
diff --git a/doc/fluid/images/fluid_module_2.png b/doc/fluid/images/fluid_module_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..4219efccbb1e87839adf6b5720fe46808b7d2fcf
Binary files /dev/null and b/doc/fluid/images/fluid_module_2.png differ
diff --git a/doc/fluid/images/graph_construction_example.bash b/doc/fluid/images/graph_construction_example.bash
new file mode 100755
index 0000000000000000000000000000000000000000..35e6997abd17588e17a82d448918fc1b3bd7220e
--- /dev/null
+++ b/doc/fluid/images/graph_construction_example.bash
@@ -0,0 +1,11 @@
+# Forward part only: hide everything drawn in red and in green.
+sed -e 's/color=red/color=red, style=invis/g' \
+    -e 's/color=green/color=green, style=invis/g' ./graph_construction_example.dot | \
+    dot -Tpng > graph_construction_example_forward_only.png
+
+# Forward and backward parts: hide only what is drawn in green.
+sed 's/color=green/color=green, style=invis/g' ./graph_construction_example.dot | \
+    dot -Tpng > graph_construction_example_forward_backward.png
+
+# Whole graph, nothing hidden.
+dot -Tpng ./graph_construction_example.dot > graph_construction_example_all.png
diff --git a/doc/fluid/images/graph_construction_example.dot b/doc/fluid/images/graph_construction_example.dot
new file mode 100644
index 0000000000000000000000000000000000000000..e115f9844bae6ad24f638c8ed4749cea8aff06a9
--- /dev/null
+++ b/doc/fluid/images/graph_construction_example.dot
@@ -0,0 +1,68 @@
+digraph ImageClassificationGraph { // one graph in three color-coded parts: blue, red, green
+        ///////// The forward part /////////
+        FeedX [label="Feed", color=blue, shape=box];
+        FeedY [label="Feed", color=blue, shape=box];
+        InitW [label="Init", color=blue, shape=diamond];
+        Initb [label="Init", color=blue, shape=diamond];
+        FC [label="FC", color=blue, shape=box];
+        MSE [label="MSE", color=blue, shape=box];
+
+        x [label="x", color=blue, shape=oval];
+        l [label="l", color=blue, shape=oval];
+        y [label="y", color=blue, shape=oval];
+        W [label="W", color=blue, shape=doublecircle]; // W and b are the only double-circle nodes
+        b [label="b", color=blue, shape=doublecircle];
+        cost [label="cost", color=blue, shape=oval];
+
+        FeedX -> x -> FC -> y -> MSE -> cost [color=blue]; // main forward chain, from the x feed to cost
+        FeedY -> l [color=blue];
+        InitW -> W [color=blue];
+        Initb -> b [color=blue];
+        W -> FC [color=blue];
+        b -> FC [color=blue];
+        l -> MSE [color=blue];
+
+        ////////// The backward part /////////
+        MSE_Grad [label="MSE_grad", color=red, shape=box];
+        FC_Grad [label="FC_grad", color=red, shape=box];
+
+        d_cost [label="d cost", color=red, shape=oval];
+        d_y [label="d y", color=red, shape=oval];
+        d_b [label="d b", color=red, shape=oval];
+        d_W [label="d W", color=red, shape=oval];
+
+        cost -> MSE_Grad [color=red];
+        d_cost -> MSE_Grad [color=red];
+        l -> MSE_Grad [color=red];
+        y -> MSE_Grad -> d_y [color=red]; // MSE_Grad emits d_y, which then feeds FC_Grad below
+
+        x -> FC_Grad [color=red];
+        y -> FC_Grad [color=red];
+        d_y -> FC_Grad [color=red];
+        W -> FC_Grad -> d_W [color=red];
+        b -> FC_Grad -> d_b [color=red];
+
+        ////////// The optimization part //////////
+
+        OPT_W [label="SGD", color=green, shape=box];
+        OPT_b [label="SGD", color=green, shape=box];
+
+        W -> OPT_W [color=green];
+        b -> OPT_b [color=green];
+        d_W -> OPT_W -> W [color=green]; // each SGD node points back at the node it reads (W / b)
+        d_b -> OPT_b -> b [color=green];
+
+        ////////// Groupings //////////
+
+        subgraph clusterMSE { // invisible cluster keeping MSE and MSE_Grad together
+                style=invis;
+                MSE;
+                MSE_Grad;
+        }
+
+        subgraph clusterFC { // invisible cluster keeping FC and FC_Grad together
+                style=invis;
+                FC;
+                FC_Grad;
+        }
+}
diff --git a/doc/fluid/images/graph_construction_example_all.png b/doc/fluid/images/graph_construction_example_all.png
new file mode 100644
index 0000000000000000000000000000000000000000..261611a5721f9aa97874f7e6d897fe48cf667db2
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_all.png differ
diff --git a/doc/fluid/images/graph_construction_example_forward_backward.png b/doc/fluid/images/graph_construction_example_forward_backward.png
new file mode 100644
index 0000000000000000000000000000000000000000..4c69687f4a6a181138f3df72ce5e8aa48487b5be
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_forward_backward.png differ
diff --git a/doc/fluid/images/graph_construction_example_forward_only.png b/doc/fluid/images/graph_construction_example_forward_only.png
new file mode 100644
index 0000000000000000000000000000000000000000..e668c16e0cac73acb4e5dc2b1827557ae77126b4
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_forward_only.png differ
diff --git a/doc/fluid/images/l1_regularization.png b/doc/fluid/images/l1_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..e1b9c7a44f94dc027598a98da93ddb8133190972
Binary files /dev/null and b/doc/fluid/images/l1_regularization.png differ
diff --git a/doc/fluid/images/l2_regularization.png b/doc/fluid/images/l2_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..d5c2fcbc2ccae75ad083162e5a2dceb0210be298
Binary files /dev/null and b/doc/fluid/images/l2_regularization.png differ
diff --git a/doc/fluid/images/layer.png b/doc/fluid/images/layer.png
new file mode 100644
index 0000000000000000000000000000000000000000..e46db4c9c6f5b65ff274b498b716b11de343a8b0
Binary files /dev/null and b/doc/fluid/images/layer.png differ
diff --git a/doc/fluid/images/local-graph.graffle b/doc/fluid/images/local-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..19e509bd9af3c1e9a3f5e0f16ddd281457a339c5
Binary files /dev/null and b/doc/fluid/images/local-graph.graffle differ
diff --git a/doc/fluid/images/local-graph.png b/doc/fluid/images/local-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..ada51200f793a9bb18911e7d63cfdb3244b967d7
Binary files /dev/null and b/doc/fluid/images/local-graph.png differ
diff --git a/doc/fluid/images/local_architecture.graffle b/doc/fluid/images/local_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..49fcc663ebe3824aa234e3a67aadf285cb417877
Binary files /dev/null and b/doc/fluid/images/local_architecture.graffle differ
diff --git a/doc/fluid/images/local_architecture.png b/doc/fluid/images/local_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122
Binary files /dev/null and b/doc/fluid/images/local_architecture.png differ
diff --git a/doc/fluid/images/lookup_table.png b/doc/fluid/images/lookup_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..72dfe3547f731d0d090338afb206b0549dff472e
Binary files /dev/null and b/doc/fluid/images/lookup_table.png differ
diff --git a/doc/fluid/images/lookup_table_training.png b/doc/fluid/images/lookup_table_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e
Binary files /dev/null and b/doc/fluid/images/lookup_table_training.png differ
diff --git a/doc/fluid/images/loss_equation.png b/doc/fluid/images/loss_equation.png
new file mode 100644
index 0000000000000000000000000000000000000000..14212ec8d36c803de96bde8a9a4b5591bd20434e
Binary files /dev/null and b/doc/fluid/images/loss_equation.png differ
diff --git a/doc/fluid/images/multi-threads.graffle b/doc/fluid/images/multi-threads.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6
Binary files /dev/null and b/doc/fluid/images/multi-threads.graffle differ
diff --git a/doc/fluid/images/multi-threads@3x.png b/doc/fluid/images/multi-threads@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9
Binary files /dev/null and b/doc/fluid/images/multi-threads@3x.png differ
diff --git a/doc/fluid/images/multigpu_allreduce.graffle b/doc/fluid/images/multigpu_allreduce.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..cb5bc420ceafe8ba4c87694d44ee4e5e4ad06779
Binary files /dev/null and b/doc/fluid/images/multigpu_allreduce.graffle differ
diff --git a/doc/fluid/images/multigpu_allreduce.png b/doc/fluid/images/multigpu_allreduce.png
new file mode 100644
index 0000000000000000000000000000000000000000..87a1b3e8f6dd4a713ec9df9f0037d1da04e9178a
Binary files /dev/null and b/doc/fluid/images/multigpu_allreduce.png differ
diff --git a/doc/fluid/images/multigpu_before_convert.graffle b/doc/fluid/images/multigpu_before_convert.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..6c35ab1b21fb76ceae82d3693ed0d085b5bc0855
Binary files /dev/null and b/doc/fluid/images/multigpu_before_convert.graffle differ
diff --git a/doc/fluid/images/multigpu_before_convert.png b/doc/fluid/images/multigpu_before_convert.png
new file mode 100644
index 0000000000000000000000000000000000000000..9c8f7711165d80a2fa3911280fdee91855a401b1
Binary files /dev/null and b/doc/fluid/images/multigpu_before_convert.png differ
diff --git a/doc/fluid/images/multiple_reader.png b/doc/fluid/images/multiple_reader.png
new file mode 100644
index 0000000000000000000000000000000000000000..b22126b31db4982c13fc3a0827805e6aaf955046
Binary files /dev/null and b/doc/fluid/images/multiple_reader.png differ
diff --git a/doc/fluid/images/op.dot b/doc/fluid/images/op.dot
new file mode 100644
index 0000000000000000000000000000000000000000..c8ad839cb88788e9b5906402257cc7bbc3ddcb54
--- /dev/null
+++ b/doc/fluid/images/op.dot
@@ -0,0 +1,4 @@
+digraph sample { // single-node diagram: one record box for Operator
+  graph [rankdir=TD]; node [shape=record]; // NOTE(review): Graphviz documents rankdir as TB/LR/BT/RL; "TD" presumably means top-down (TB) - confirm
+  op [label="{Operator| InferShape()=0\lRun()=0\l | map&#60;string, string[]&#62; inputs_\lmap&#60;string, string[]&#62; outputs_ \l AttributeMap attrs_\l}"]; // record compartments: name | methods | members
+}
\ No newline at end of file
diff --git a/doc/fluid/images/op_op_with_kern_class_diagram.dot b/doc/fluid/images/op_op_with_kern_class_diagram.dot
new file mode 100644
index 0000000000000000000000000000000000000000..8f24e9ea83acf879c7008f2d97113c0a4cc111c3
--- /dev/null
+++ b/doc/fluid/images/op_op_with_kern_class_diagram.dot
@@ -0,0 +1,38 @@
+digraph sample { // class-box diagram: Operator, OpWithKernel, OpKernel, OpKernelKey plus the MulOp / MulOpKernel example
+  graph [rankdir=TD]; node [shape=record]; // NOTE(review): Graphviz documents rankdir as TB/LR/BT/RL; "TD" presumably means top-down (TB) - confirm
+  op [label="{Operator| InferShape()=0\lRun()=0\l | map&#60;string, string[]&#62; inputs_\lmap&#60;string, string[]&#62; outputs_ \l AttributeMap attrs_\l}"]; // record compartments: name | methods | members
+  op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map&#60;OpKernelKey,OpKernel&#62;kernels_ }"]
+  op_kernel [label="{OpKernel | Compute()=0}"]
+  op_kernel_key [label="{OpKernelKey| Place place\n...}"]
+
+  op -> op_with_kern [dir=back, arrowtail=onormal] // dir=back puts the hollow arrow at the op end; presumably UML-style "OpWithKernel derives from Operator"
+  op_with_kern -> op_kernel [arrowhead=vee, label="contains many"]
+
+  { // keep op_with_kern and op_kernel on the same rank
+    rank=same;
+    op_with_kern
+    op_kernel
+  }
+
+  op_kernel -> op_kernel_key [style=invis] // invisible edge: nothing is drawn, it only affects layout
+
+  { // keep op_kernel and op_kernel_key on the same rank
+    rank=same;
+    op_kernel
+    op_kernel_key
+  }
+
+  op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"]
+
+  mul_op [label="MulOp"]
+  op_with_kern -> mul_op [dir=back, arrowtail=onormal] // same hollow-arrow style as op -> op_with_kern, now at the op_with_kern end
+  mul_kernel [label="template &#60;typename Place&#62;\lclass MulOpKernel\l"]
+  op_kernel -> mul_kernel [dir=back, arrowtail=onormal]
+  mul_op -> mul_kernel [arrowhead=vee, label="register many"]
+  
+  { // keep mul_op and mul_kernel on the same rank
+    rank=same;
+    mul_op;
+    mul_kernel;
+  }
+}
\ No newline at end of file
diff --git a/doc/fluid/images/op_with_kernel.dot b/doc/fluid/images/op_with_kernel.dot
new file mode 100644
index 0000000000000000000000000000000000000000..4f5af4f7b5f5a69693a058c99eb658900136077a
--- /dev/null
+++ b/doc/fluid/images/op_with_kernel.dot
@@ -0,0 +1,26 @@
+digraph sample { // class-box diagram: Operator, OpWithKernel, OpKernel and OpKernelKey
+  graph [rankdir=TD]; node [shape=record]; // NOTE(review): Graphviz documents rankdir as TB/LR/BT/RL; "TD" presumably means top-down (TB) - confirm
+  op [label="{Operator}"]; // Operator is shown here as a name-only box
+  op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map&#60;OpKernelKey,OpKernel&#62;kernels_ }"]
+  op_kernel [label="{OpKernel | Compute()=0}"]
+  op_kernel_key [label="{OpKernelKey| Place place\n...}"]
+
+  op -> op_with_kern [dir=back, arrowtail=onormal] // dir=back puts the hollow arrow at the op end; presumably UML-style "OpWithKernel derives from Operator"
+  op_with_kern -> op_kernel [arrowhead=vee, label="contains many"]
+
+  { // keep op_with_kern and op_kernel on the same rank
+    rank=same;
+    op_with_kern
+    op_kernel
+  }
+
+  op_kernel -> op_kernel_key [style=invis] // invisible edge: nothing is drawn, it only affects layout
+
+  { // keep op_kernel and op_kernel_key on the same rank
+    rank=same;
+    op_kernel
+    op_kernel_key
+  }
+
+  op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"]
+}
\ No newline at end of file
diff --git a/doc/fluid/images/operator1.png b/doc/fluid/images/operator1.png
new file mode 100644
index 0000000000000000000000000000000000000000..3975b06f615b7a88dfc11e71b6451fdf4ce42d60
Binary files /dev/null and b/doc/fluid/images/operator1.png differ
diff --git a/doc/fluid/images/operator2.png b/doc/fluid/images/operator2.png
new file mode 100644
index 0000000000000000000000000000000000000000..b7bb1fae2050d3a70797517bc20dbbdef3dfcb7c
Binary files /dev/null and b/doc/fluid/images/operator2.png differ
diff --git a/doc/fluid/images/paddle-compile.graffle b/doc/fluid/images/paddle-compile.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..a6348cc3dbcaca923c6e794681b2edb85cb9f8f6
Binary files /dev/null and b/doc/fluid/images/paddle-compile.graffle differ
diff --git a/doc/fluid/images/paddle-compile.png b/doc/fluid/images/paddle-compile.png
new file mode 100644
index 0000000000000000000000000000000000000000..e0f13d551ac41afaec627a57dea79356464bf0bf
Binary files /dev/null and b/doc/fluid/images/paddle-compile.png differ
diff --git a/doc/fluid/images/place.png b/doc/fluid/images/place.png
new file mode 100644
index 0000000000000000000000000000000000000000..14e77511d639af155e5a3725cde05323e0cc94f2
Binary files /dev/null and b/doc/fluid/images/place.png differ
diff --git a/doc/fluid/images/pprof_1.png b/doc/fluid/images/pprof_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb
Binary files /dev/null and b/doc/fluid/images/pprof_1.png differ
diff --git a/doc/fluid/images/pprof_2.png b/doc/fluid/images/pprof_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b
Binary files /dev/null and b/doc/fluid/images/pprof_2.png differ
diff --git a/doc/fluid/images/print_fluid_program.png b/doc/fluid/images/print_fluid_program.png
new file mode 100644
index 0000000000000000000000000000000000000000..e8e459e1b3d5c8706b3caa05dc371db8d46df4a5
Binary files /dev/null and b/doc/fluid/images/print_fluid_program.png differ
diff --git a/doc/fluid/images/profiler.png b/doc/fluid/images/profiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1
Binary files /dev/null and b/doc/fluid/images/profiler.png differ
diff --git a/doc/fluid/images/program_desc1.png b/doc/fluid/images/program_desc1.png
new file mode 100644
index 0000000000000000000000000000000000000000..0656336914ece957f2e5bb4d70ad337a63e31d88
Binary files /dev/null and b/doc/fluid/images/program_desc1.png differ
diff --git a/doc/fluid/images/program_desc2.png b/doc/fluid/images/program_desc2.png
new file mode 100644
index 0000000000000000000000000000000000000000..db5bfa1231345add8661b4f8ef0fc9d861f40d24
Binary files /dev/null and b/doc/fluid/images/program_desc2.png differ
diff --git a/doc/fluid/images/raw_input.png b/doc/fluid/images/raw_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..0725f92d2b169c2b59ec7c68b402859c2a2dd1d8
Binary files /dev/null and b/doc/fluid/images/raw_input.png differ
diff --git a/doc/fluid/images/readers.png b/doc/fluid/images/readers.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd59168ce16c9e2a0ef45303c28c997cfd7740be
Binary files /dev/null and b/doc/fluid/images/readers.png differ
diff --git a/doc/fluid/images/remote_executor.graffle b/doc/fluid/images/remote_executor.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..41b2067311694b56d211a4f32d1b76884eeffd2d
Binary files /dev/null and b/doc/fluid/images/remote_executor.graffle differ
diff --git a/doc/fluid/images/remote_executor.png b/doc/fluid/images/remote_executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..744e2fb2e0f1bbe058e991ba7b2a09000965ee79
Binary files /dev/null and b/doc/fluid/images/remote_executor.png differ
diff --git a/doc/fluid/images/rnn.dot b/doc/fluid/images/rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5
--- /dev/null
+++ b/doc/fluid/images/rnn.dot
@@ -0,0 +1,87 @@
+digraph G { // RNN sketch: one global-scope cluster, three step-scope clusters, and the step nets connecting them
+  label = "simple RNN implementation" // graph title
+
+  ranksep=2;
+
+  //graph [nodesep=1, ranksep=1];
+
+  node[nodesep=1] // NOTE(review): Graphviz lists nodesep as a graph attribute; as a node default it presumably has no effect - confirm
+
+  subgraph cluster0 {
+    label = "global scope"
+    rankdir = TB
+    W
+    boot_memory
+    input
+    output
+  }
+
+  subgraph cluster1 { // the three step-scope clusters hold the same four nodes, suffixed 0 / 1 / 2
+    label = "step-scope 0"
+    rankdir = TB
+    memory0[label="memory"]
+    prememory0[label="pre-memory"]
+    step_input0[label="step input"]
+    step_output0[label="step output"]
+  }
+
+  subgraph cluster2 {
+    label = "step-scope 1"
+    rankdir = TB
+    memory1[label="memory"]
+    prememory1[label="pre-memory"]
+    step_input1[label="step input"]
+    step_output1[label="step output"]
+  }
+
+  subgraph cluster3 {
+    label = "step-scope 2"
+    rankdir = TB
+    memory2[label="memory"]
+    prememory2[label="pre-memory"]
+    step_input2[label="step input"]
+    step_output2[label="step output"]
+  }
+
+  stepnet [shape=box]
+  stepnet0 [shape=box, style=dashed]
+  stepnet1 [shape=box, style=dashed]
+  stepnet2 [shape=box, style=dashed]
+
+
+  edge[color=blue] // default edge color from here until the next edge[...] statement
+  boot_memory -> prememory0 [label="init" color="blue"]
+  memory0 -> prememory1  [label="copy/reference" color="blue"] // memory of step k feeds pre-memory of step k+1
+  memory1 -> prememory2 [label="copy/reference" color="blue"]
+
+  edge[color=black] // back to black for all remaining edges
+  W -> stepnet0[constraint=false, style=dashed] // constraint=false: these edges are not used for rank assignment
+  W -> stepnet1[constraint=false, style=dashed]
+  W -> stepnet2[constraint=false, style=dashed]
+
+  memory0 -> stepnet0[style=dashed]
+  prememory0 -> stepnet0 -> step_output0[style=dashed]
+
+  memory1 -> stepnet1[style=dashed]
+  prememory1 -> stepnet1 -> step_output1[style=dashed]
+
+  memory2 -> stepnet2[style=dashed]
+  prememory2 -> stepnet2 -> step_output2[style=dashed]
+
+  input -> step_input0
+  input -> step_input1
+  input -> step_input2
+
+  step_input0 -> stepnet0 [style=dashed]
+  step_input1 -> stepnet1[style=dashed]
+  step_input2 -> stepnet2[style=dashed]
+
+  step_output0 -> output
+  step_output1 -> output
+  step_output2 -> output
+
+  stepnet0 -> stepnet[style=dashed] // every dashed per-step net points at the single solid stepnet box
+  stepnet1 -> stepnet[style=dashed]
+  stepnet2 -> stepnet[style=dashed]
+
+}
diff --git a/doc/fluid/images/rnn.jpg b/doc/fluid/images/rnn.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9867e404cf959df0dce6ded5222b466c788fb840
Binary files /dev/null and b/doc/fluid/images/rnn.jpg differ
diff --git a/doc/fluid/images/rnn.png b/doc/fluid/images/rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..e139e373fe8396782044cfd936fdde624f8c66fe
Binary files /dev/null and b/doc/fluid/images/rnn.png differ
diff --git a/doc/fluid/images/rnn_2level_data.dot b/doc/fluid/images/rnn_2level_data.dot
new file mode 100644
index 0000000000000000000000000000000000000000..1d85ae2617a915ad0ad8288d848b607cc37ad297
--- /dev/null
+++ b/doc/fluid/images/rnn_2level_data.dot
@@ -0,0 +1,75 @@
+digraph G { // two-level tree: chapter -> top rnn steps -> paragraphs -> low rnn steps -> sentences
+  chapter [label="chapter"]
+
+  subgraph cluster0 {
+    label = "paragraph 0"
+
+    top_rnn0[label="top rnn step 0" shape=box]
+
+    p0 [label="paragraph 0"]
+    p1 [label="paragraph 1"]
+  }
+
+  subgraph cluster1{
+    label = "paragraph 1"
+
+    top_rnn1[label="top rnn step 1" shape=box]
+
+    p2 [label="paragraph 0"]
+    p3 [label="paragraph 1"]
+  }
+
+  subgraph cluster_p0 { // one cluster per paragraph node p0..p3, each with a low rnn box and two sentence nodes
+    label = "sentence 0"
+
+    low_rnn0 [label="low rnn step 0" shape=box]
+    s00 [label="sentence 0"]
+    s01 [label="sentence 1"]
+
+    low_rnn0 -> s00
+    low_rnn0 -> s01
+  }
+
+  subgraph cluster_p1 {
+    label = "sentence 1"
+    low_rnn1 [label="low rnn step 1" shape=box]
+    s10 [label="sentence 0"]
+    s11 [label="sentence 1"]
+    low_rnn1 -> s10
+    low_rnn1 -> s11
+  }
+
+  subgraph cluster_p2 {
+    label = "sentence 1" // NOTE(review): low_rnn2 below is "low rnn step 0"; cluster_p0 pairs step 0 with "sentence 0" - confirm this label
+    low_rnn2 [label="low rnn step 0" shape=box]
+    s20 [label="sentence 0"]
+    s21 [label="sentence 1"]
+    low_rnn2 -> s20
+    low_rnn2 -> s21
+  }
+
+  subgraph cluster_p3 {
+    label = "sentence 1"
+    low_rnn3 [label="low rnn step 1" shape=box]
+    s30 [label="sentence 0"]
+    s31 [label="sentence 1"]
+    low_rnn3 -> s30
+    low_rnn3 -> s31
+  }
+
+
+  chapter -> top_rnn0
+  chapter -> top_rnn1
+
+  top_rnn0 -> p0
+  top_rnn0 -> p1
+  top_rnn1 -> p2
+  top_rnn1 -> p3
+
+
+  p0 -> low_rnn0 // each paragraph node feeds the low rnn box of its own cluster
+  p1 -> low_rnn1
+  p2 -> low_rnn2
+  p3 -> low_rnn3
+
+}
diff --git a/doc/fluid/images/rnn_2level_data.png b/doc/fluid/images/rnn_2level_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..4be81b2430717a6a506342a09fc26899568574c6
Binary files /dev/null and b/doc/fluid/images/rnn_2level_data.png differ
diff --git a/doc/fluid/images/scope_variable_tensor.png b/doc/fluid/images/scope_variable_tensor.png
new file mode 100644
index 0000000000000000000000000000000000000000..59b0de6fb36f9f6b469227c05760a7612bb30b4d
Binary files /dev/null and b/doc/fluid/images/scope_variable_tensor.png differ
diff --git a/doc/fluid/images/single-thread@3x.png b/doc/fluid/images/single-thread@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a
Binary files /dev/null and b/doc/fluid/images/single-thread@3x.png differ
diff --git a/doc/fluid/images/sorted_input.png b/doc/fluid/images/sorted_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..ff601128368ee179e3fd33e5e295a9ddd3dcbaeb
Binary files /dev/null and b/doc/fluid/images/sorted_input.png differ
diff --git a/doc/fluid/images/sparse_update.graffle b/doc/fluid/images/sparse_update.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..08d689a58f83698d8c1158ee3990ed8abf3a7a9a
Binary files /dev/null and b/doc/fluid/images/sparse_update.graffle differ
diff --git a/doc/fluid/images/sparse_update.png b/doc/fluid/images/sparse_update.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c872e6ac479f7d1b818a4a207956c43155d0ad7
Binary files /dev/null and b/doc/fluid/images/sparse_update.png differ
diff --git a/doc/fluid/images/test.dot b/doc/fluid/images/test.dot
new file mode 100644
index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a
--- /dev/null
+++ b/doc/fluid/images/test.dot
@@ -0,0 +1,35 @@
+
+digraph Test { // generator / discriminator graph: solid edges lead to d_loss and g_loss, dashed colored edges run back from them
+    z -> generator -> G_img;
+    G_img -> discriminator -> D_f -> d_loss_f;
+    label0 -> d_loss_f -> d_loss;
+
+    img -> discriminator -> D_t -> d_loss_t;
+    label1 -> d_loss_t -> d_loss;
+
+    d_loss -> d_loss_t[color=red, style=dashed]; // dashed red edges: from d_loss back to discriminator
+    d_loss -> d_loss_f[color=red, style=dashed];
+    d_loss_t -> D_t[color=red, style=dashed];
+    d_loss_f -> D_f[color=red, style=dashed];
+    D_t -> discriminator[color=red, style=dashed];
+    D_f -> discriminator[color=red, style=dashed];
+
+    D_f -> g_loss;
+    label2 -> g_loss;
+
+    g_loss -> D_f[color=green, style=dashed]; // dashed green edges: from g_loss back through discriminator to generator
+    D_f -> discriminator[color=green, style=dashed];
+    discriminator -> G_img[color=green, style=dashed];
+    G_img -> generator[color=green, style=dashed];
+
+    discriminator [color=red, shape=box]; // node styling; colors match the dashed edges that end at each node
+    generator [color=green, shape=box];
+    z [shape=diamond]; // diamonds: the nodes with no incoming solid edge
+    img [shape=diamond];
+    label0 [shape=diamond];
+    label1 [shape=diamond];
+    label2 [shape=diamond];
+
+    d_loss [color=red];
+    g_loss [color=green];
+}
diff --git a/doc/fluid/images/test.dot.png b/doc/fluid/images/test.dot.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55
Binary files /dev/null and b/doc/fluid/images/test.dot.png differ
diff --git a/doc/fluid/images/theta_star.gif b/doc/fluid/images/theta_star.gif
new file mode 100644
index 0000000000000000000000000000000000000000..dd24d33e124396be3fc410c9b12f33148f64efe2
Binary files /dev/null and b/doc/fluid/images/theta_star.gif differ
diff --git a/doc/fluid/images/timeline.jpeg b/doc/fluid/images/timeline.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..38ec3f80c982857531f30a8bb0fa26ea5bf05385
Binary files /dev/null and b/doc/fluid/images/timeline.jpeg differ
diff --git a/doc/fluid/images/tracing.jpeg b/doc/fluid/images/tracing.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..3a49fc4f8a401a9463b0157e2f38c164ca02dcc5
Binary files /dev/null and b/doc/fluid/images/tracing.jpeg differ
diff --git a/doc/fluid/images/transpiler.png b/doc/fluid/images/transpiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..422973c0dc7aa2b544d2fc86a97ace706388cb9e
Binary files /dev/null and b/doc/fluid/images/transpiler.png differ
diff --git a/doc/fluid/images/user_interface.png b/doc/fluid/images/user_interface.png
new file mode 100644
index 0000000000000000000000000000000000000000..ffc94e3d8945ec6291460afd90e8fcc600828390
Binary files /dev/null and b/doc/fluid/images/user_interface.png differ
diff --git a/doc/fluid/index_cn.rst b/doc/fluid/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3321fa8eed3ac0bbb60cd56542baa39ee2931f50
--- /dev/null
+++ b/doc/fluid/index_cn.rst
@@ -0,0 +1,16 @@
+.. PaddlePaddle Fluid documentation master file, created by
+   sphinx-quickstart on Thu Jun  7 17:04:53 2018.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+##############
+欢迎使用 Fluid
+##############
+
+..  toctree::
+    :maxdepth: 1
+
+    beginners_guide/index.rst
+    user_guides/index.rst
+    advanced_usage/index.rst
+    faq/index_cn.rst
diff --git a/doc/fluid/index_en.rst b/doc/fluid/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2bc76b58982cf50e637d15cca0c5d78166aa73a9
--- /dev/null
+++ b/doc/fluid/index_en.rst
@@ -0,0 +1,12 @@
+PaddlePaddle Fluid
+==========================
+
+..  toctree::
+  :maxdepth: 1
+
+  getstarted/index_en.rst
+  build_and_install/index_en.rst
+  design/index_en.rst
+  howto/index_en.rst
+  dev/index_en.rst
+  faq/index_en.rst
diff --git a/doc/fluid/read_source.md b/doc/fluid/read_source.md
new file mode 100644
index 0000000000000000000000000000000000000000..bb6d4563f5617fb98af055bca2f6f0479bdb4393
--- /dev/null
+++ b/doc/fluid/read_source.md
@@ -0,0 +1,67 @@
+# PaddlePaddle Fluid Source Code Overview
+
+Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book
+
+Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework
+
+Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators
+
+Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory
+
+Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform
+
+# Compile Time
+
+The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto).
+
+```python
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+```
+
+- Variables: `x`,  `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#)
+- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers)
+  - Every Layer has one or more operators and variables/parameters
+    - All the operators are defined at [`paddle/fluid/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators). Other files worth looking at:
+      - Base class: [`paddle/fluid/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h)
+      - Operator Registration: [`paddle/fluid/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_registry.h)
+      - Operator Lookup: [`paddle/fluid/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_info.h)
+- Optimizer: `fluid.optimizer.SGD`. It does the following
+  - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)]
+  - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)]
+
+# Run Time
+
+The following **evaluates** the NN. It instantiates all the variables and operators.
+
+```python
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+# Allocate memory. Initialize Parameter.
+exe.run(fluid.default_startup_program())
+
+# Allocate memory. Do computation.
+exe.run(fluid.default_main_program(),
+        feed=feeder.feed(data),
+        fetch_list=[avg_cost])
+```
+
+- Place: `place`. One of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h)
+  - The device handles are at [paddle/fluid/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h)
+- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.cc)]
+  - Feeds the data: `feed=feeder.feed(data)`
+  - Evaluates all the operators
+  - Fetches the result: `fetch_list=[avg_cost]`
+- Other files worth looking at:
+  - Scope: [paddle/fluid/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/scope.h). Where all the variables live
+    - Variable: [paddle/fluid/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h). Where all the data (most likely tensors) live
+      - Tensor: [paddle/fluid/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h). Where we allocate memory through [`paddle/fluid/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory)
diff --git a/source/user_guides/howto/basic_concept/fluid_basic_concept.rst b/doc/fluid/user_guides/howto/basic_concept/fluid_basic_concept.rst
similarity index 100%
rename from source/user_guides/howto/basic_concept/fluid_basic_concept.rst
rename to doc/fluid/user_guides/howto/basic_concept/fluid_basic_concept.rst
diff --git a/source/user_guides/howto/basic_concept/fluid_local_train.jpeg b/doc/fluid/user_guides/howto/basic_concept/fluid_local_train.jpeg
similarity index 100%
rename from source/user_guides/howto/basic_concept/fluid_local_train.jpeg
rename to doc/fluid/user_guides/howto/basic_concept/fluid_local_train.jpeg
diff --git a/source/user_guides/howto/basic_concept/fluid_mnist.png b/doc/fluid/user_guides/howto/basic_concept/fluid_mnist.png
similarity index 100%
rename from source/user_guides/howto/basic_concept/fluid_mnist.png
rename to doc/fluid/user_guides/howto/basic_concept/fluid_mnist.png
diff --git a/doc/fluid/user_guides/howto/configure_simple_model/index.rst b/doc/fluid/user_guides/howto/configure_simple_model/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5946a2ccb7e43004eae39ec4b3c6112c66c1fd04
--- /dev/null
+++ b/doc/fluid/user_guides/howto/configure_simple_model/index.rst
@@ -0,0 +1,88 @@
+..  _user_guide_configure_simple_model:
+
+##############
+配置简单的网络
+##############
+
+在解决实际问题时，可以先从逻辑层面对问题进行建模，明确模型所需要的 **输入数据类型**、**计算逻辑**、**求解目标** 以及 **优化算法**。PaddlePaddle提供了丰富的算子来实现模型逻辑。下面以一个简单回归任务举例说明如何使用PaddlePaddle构建模型。该例子完整代码参见 `fit_a_line <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py>`_。
+
+问题描述及定义
+##############
+
+问题描述: 给定一组数据 :math:`<X, Y>`，求解出函数 :math:`f`，使得 :math:`y=f(x)`，其中 :math:`x\in X` 表示一条样本的特征，为 :math:`13` 维的实数向量；:math:`y \in Y` 为一实数表示该样本对应的值。
+
+我们可以尝试用回归模型来对问题建模，回归问题的损失函数有很多，这里选择常用的均方误差。为简化问题，这里假定 :math:`f` 为简单的线性变换函数，同时选用随机梯度下降算法来求解模型。
+
++----------------+----------------------------------------------+
+| 输入数据类型   |  样本特征: 13 维 实数                        |
++                +----------------------------------------------+
+|                |  样本标签: 1 维 实数                         |
++----------------+----------------------------------------------+
+| 计算逻辑       | 使用线性模型，产生 1维实数作为模型的预测输出 |
++----------------+----------------------------------------------+
+| 求解目标       | 最小化模型预测输出与样本标签间的均方误差     |
++----------------+----------------------------------------------+
+| 优化算法       | 随机梯度下降                                 |
++----------------+----------------------------------------------+
+
+使用PaddlePaddle建模
+####################
+
+从逻辑层面明确了输入数据格式、模型结构、损失函数以及优化算法后，需要使用PaddlePaddle提供的API及算子来实现模型逻辑。一个典型的模型主要包含4个部分，分别是：输入数据格式定义，模型前向计算逻辑，损失函数以及优化算法。
+
+数据层
+------
+
+PaddlePaddle提供了 :code:`fluid.layers.data()` 算子来描述输入数据的格式。
+
+:code:`fluid.layers.data()` 算子的输出是一个Variable。这个Variable的实际类型是Tensor。Tensor具有强大的表征能力，可以表示多维数据。为了精确描述数据结构，通常需要指定数据shape以及数值类型type。其中shape为一个整数向量，type可以是一个字符串类型。目前支持的数据类型参考    :ref:`user_guide_paddle_support_data_types` 。 模型训练一般会使用batch的方式读取数据，而batch的size在训练过程中可能不固定。data算子会依据实际数据来推断batch size，所以这里提供shape时不用关心batch size，只需关心一条样本的shape即可，更高级用法请参考 :ref:`user_guide_customize_batch_size_rank`。从上知，:math:`x` 为 :math:`13` 维的实数向量，:math:`y` 为实数，可使用下面代码定义数据层：
+
+.. code-block:: python
+
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+该模型使用的数据比较简单，事实上data算子还可以描述变长的、嵌套的序列数据。也可以使用 :code:`open_files` 打开文件进行训练。更详细的文档可参照 :ref:`user_guide_prepare_data`。
+
+前向计算逻辑
+------------
+
+实现一个模型最重要的部分是实现计算逻辑，PaddlePaddle提供了丰富的算子。这些算子的封装粒度不同，通常对应一种或一组变换逻辑。算子输出即为对输入数据执行变换后的结果。用户可以灵活使用算子来完成复杂的模型逻辑。比如图像相关任务中会使用较多的卷积算子、序列任务中会使用LSTM/GRU等算子。复杂模型通常会组合多种算子，以完成复杂的变换。PaddlePaddle提供了非常自然的方式来组合算子，一般地可以使用下面的方式：
+
+.. code-block:: python
+
+    op_1_out = fluid.layers.op_1(input=op_1_in, ...)
+    op_2_out = fluid.layers.op_2(input=op_1_out, ...)
+    ...
+
+其中op_1和op_2表示算子类型，可以是fc来执行线性变换(全连接)，也可以是conv来执行卷积变换等。通过算子的输入输出的连接来定义算子的计算顺序以及数据流方向。上面的例子中，op_1的输出是op_2的输入，那么在执行计算时，会先计算op_1，然后计算op_2。更复杂的模型可能需要使用控制流算子，依据输入数据来动态执行，针对这种情况，PaddlePaddle提供了IfElseOp和WhileOp等。算子的文档可参考 :code:`fluid.layers`。具体到这个任务, 我们使用一个fc算子：
+
+.. code-block:: python
+
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+损失函数
+--------
+
+损失函数对应求解目标，我们可以通过最小化损失来求解模型。大多数模型使用的损失函数，输出是一个实数值。但是PaddlePaddle提供的损失算子一般是针对一条样本计算。当输入一个batch的数据时，损失算子的输出有多个值，每个值对应一条样本的损失，所以通常会在损失算子后面使用mean等算子，来对损失做归约。模型在一次前向迭代后会得到一个损失值，PaddlePaddle会自动执行链式求导法则计算模型里面每个参数和变量对应的梯度值。这里使用均方误差损失：
+
+.. code-block:: python
+
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+优化方法
+--------
+
+确定损失函数后，可以通过前向计算得到损失值，然后通过链式求导法则得到参数的梯度值。获取梯度值后需要更新参数，最简单的算法是随机梯度下降法：:math:`w=w - \eta \cdot g`。但是普通的随机梯度下降算法存在一些问题: 比如收敛不稳定等。为了改善模型的训练速度以及效果，学术界先后提出了很多优化算法，包括： :code:`Momentum`、:code:`RMSProp`、:code:`Adam` 等。这些优化算法采用不同的策略来更新模型参数，一般可以针对具体任务和具体模型来选择优化算法。不管使用何种优化算法，学习率一般是一个需要指定的比较重要的超参数，需要通过实验仔细调整。这里采用随机梯度下降算法：
+
+.. code-block:: python
+
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+
+更多优化算子可以参考 :code:`fluid.optimizer` 。
+
+下一步做什么？
+##############
+
+使用PaddlePaddle实现模型时需要关注 **数据层**、**前向计算逻辑**、**损失函数** 和 **优化方法**。不同的任务需要的数据格式不同，涉及的计算逻辑不同，损失函数不同，优化方法也不同。PaddlePaddle提供了丰富的模型示例，可以以这些示例为参考来构建自己的模型结构。用户可以访问 `模型库 <https://github.com/PaddlePaddle/models/tree/develop/fluid>`_ 查看官方提供的示例。
diff --git a/source/user_guides/howto/debug/index.rst b/doc/fluid/user_guides/howto/debug/index.rst
similarity index 100%
rename from source/user_guides/howto/debug/index.rst
rename to doc/fluid/user_guides/howto/debug/index.rst
diff --git a/source/user_guides/howto/debug/visualdl.md b/doc/fluid/user_guides/howto/debug/visualdl.md
similarity index 100%
rename from source/user_guides/howto/debug/visualdl.md
rename to doc/fluid/user_guides/howto/debug/visualdl.md
diff --git a/source/user_guides/howto/evaluation/index.rst b/doc/fluid/user_guides/howto/evaluation/index.rst
similarity index 100%
rename from source/user_guides/howto/evaluation/index.rst
rename to doc/fluid/user_guides/howto/evaluation/index.rst
diff --git a/source/user_guides/howto/evaluation/metrics.rst b/doc/fluid/user_guides/howto/evaluation/metrics.rst
similarity index 100%
rename from source/user_guides/howto/evaluation/metrics.rst
rename to doc/fluid/user_guides/howto/evaluation/metrics.rst
diff --git a/source/appendix/foo.rst b/doc/fluid/user_guides/howto/modification/foo.rst
similarity index 100%
rename from source/appendix/foo.rst
rename to doc/fluid/user_guides/howto/modification/foo.rst
diff --git a/doc/fluid/user_guides/howto/prepare_data/feeding_data.rst b/doc/fluid/user_guides/howto/prepare_data/feeding_data.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c3bf033bb8316eeb4901c0cdc61e0556c8816dac
--- /dev/null
+++ b/doc/fluid/user_guides/howto/prepare_data/feeding_data.rst
@@ -0,0 +1,169 @@
+.. _user_guide_use_numpy_array_as_train_data:
+
+###########################
+使用Numpy Array作为训练数据
+###########################
+
+PaddlePaddle Fluid支持使用 :code:`fluid.layers.data()` 配置数据层；
+再使用 Numpy Array 或者直接使用Python创建C++的
+:code:`fluid.LoDTensor` , 通过 :code:`Executor.run(feed=...)` 传给
+:code:`fluid.Executor` 或 :code:`fluid.ParallelExecutor` 。
+
+数据层配置
+##########
+
+通过 :code:`fluid.layers.data()` 可以配置神经网络中需要的数据层。具体方法为:
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+
+   image = fluid.layers.data(name="image", shape=[3, 224, 224])
+   label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+   # use image/label as layer input
+   prediction = fluid.layers.fc(input=image, size=1000, act="softmax")
+   loss = fluid.layers.cross_entropy(input=prediction, label=label)
+   ...
+
+上段代码中，:code:`image` 和 :code:`label` 是通过 :code:`fluid.layers.data`
+创建的两个输入数据层。其中 :code:`image` 是 :code:`[3, 224, 224]` 维度的浮点数据;
+:code:`label` 是 :code:`[1]` 维度的整数数据。这里需要注意的是:
+
+1. Fluid中默认使用 :code:`-1` 表示 batch size 维度，默认情况下会在 :code:`shape`
+   的第一个维度添加 :code:`-1` 。 所以 上段代码中， 我们可以接受将一个
+   :code:`[32, 3, 224, 224]` 的numpy array传给 :code:`image` 。 如果想自定义batch size
+   维度的位置的话，请设置 :code:`fluid.layers.data(append_batch_size=False)` 。
+   请参考进阶使用中的 :ref:`user_guide_customize_batch_size_rank` 。
+
+
+2. Fluid中用来做类别标签的数据类型是 :code:`int64`，并且标签从0开始。可用数据类型请参考 :ref:`user_guide_paddle_support_data_types`。
+
+.. _user_guide_feed_data_to_executor:
+
+传递训练数据给执行器
+####################
+
+:code:`Executor.run` 和 :code:`ParallelExecutor.run` 都接受一个 :code:`feed` 参数。
+这个参数是一个Python的字典。它的键是数据层的名字，例如上文代码中的 :code:`image`。
+它的值是对应的numpy array。
+
+例如:
+
+.. code-block:: python
+
+   exe = fluid.Executor(fluid.CPUPlace())
+   exe.run(feed={
+      "image": numpy.random.random(size=(32, 3, 224, 224)).astype('float32'),
+      "label": numpy.random.random(size=(32, 1)).astype('int64')
+   })
+
+进阶使用
+########
+
+如何传入序列数据
+----------------
+
+序列数据是PaddlePaddle Fluid支持的特殊数据类型，可以使用 :code:`LoDTensor` 作为
+输入数据类型。它需要用户: 1. 传入一个mini-batch需要被训练的所有数据;
+2.每个序列的长度信息。
+用户可以使用 :code:`fluid.create_lod_tensor` 来创建 :code:`LoDTensor`。
+
+传入序列信息的时候，需要设置序列嵌套深度，:code:`lod_level`。
+例如训练数据是词汇组成的句子，:code:`lod_level=1`；训练数据是 词汇先组成了句子，
+句子再组成了段落，那么 :code:`lod_level=2`。
+
+例如:
+
+.. code-block:: python
+
+   sentence = fluid.layers.data(name="sentence", dtype="int64", shape=[1], lod_level=1)
+
+   ...
+
+   exe.run(feed={
+     "sentence": fluid.create_lod_tensor(
+       data=numpy.array([1, 3, 4, 5, 3, 6, 8], dtype='int64').reshape(-1, 1),
+       lod=[4, 1, 2],
+       place=fluid.CPUPlace()
+     )
+   })
+
+训练数据 :code:`sentence` 包含三个样本，他们的长度分别是 :code:`4, 1, 2`。
+他们分别是 :code:`data[0:4]`， :code:`data[4:5]` 和 :code:`data[5:7]`。
+
+如何分别设置ParallelExecutor中每个设备的训练数据
+------------------------------------------------
+
+用户将数据传递给使用 :code:`ParallelExecutor.run(feed=...)` 时，
+可以显式指定每一个训练设备(例如GPU)上的数据。
+用户需要将一个列表传递给 :code:`feed` 参数，列表中的每一个元素都是一个字典。
+这个字典的键是数据层的名字，值是数据层的值。
+
+例如:
+
+.. code-block:: python
+
+   parallel_executor = fluid.ParallelExecutor()
+   parallel_executor.run(
+     feed=[
+        {
+          "image": numpy.random.random(size=(32, 3, 224, 224)).astype('float32'),
+          "label": numpy.random.random(size=(32, 1)).astype('int64')
+        },
+        {
+          "image": numpy.random.random(size=(16, 3, 224, 224)).astype('float32'),
+          "label": numpy.random.random(size=(16, 1)).astype('int64')
+        },
+     ]
+   )
+
+上述代码中，GPU0会训练 32 个样本，而 GPU1训练 16 个样本。
+
+
+.. _user_guide_customize_batch_size_rank:
+
+自定义BatchSize维度
+-------------------
+
+PaddlePaddle Fluid默认batch size是数据的第一维度，以 :code:`-1` 表示。但是在高级
+使用中，batch_size 可以固定，也可以是其他维度或者多个维度来表示。这都需要设置
+:code:`fluid.layers.data(append_batch_size=False)` 来完成。
+
+1. 固定batch size维度
+
+  .. code-block:: python
+
+     image = fluid.layers.data(name="image", shape=[32, 784], append_batch_size=False)
+
+  这里，:code:`image` 永远是一个 :code:`[32, 784]` 大小的矩阵。
+
+2. 使用其他维度表示batch size
+
+  .. code-block:: python
+
+     sentence = fluid.layers.data(name="sentence",
+                                  shape=[80, -1, 1],
+                                  append_batch_size=False,
+                                  dtype="int64")
+
+  这里 :code:`sentence` 的中间维度是batch size。这种数据排布会用在定长的循环神经
+  网络中。
+
+
+.. _user_guide_paddle_support_data_types:
+
+Fluid目前支持的数据类型
+-----------------------
+
+PaddlePaddle Fluid目前支持的数据类型包括:
+
+   * float16： 部分操作支持
+   * float32:  主要实数类型
+   * float64:  次要实数类型，支持大部分操作
+   * int32:  次要标签类型
+   * int64: 主要标签类型
+   * uint64: 次要标签类型
+   * bool: 控制流数据类型
+   * int16: 次要标签类型
+   * uint8: 输入数据类型，可用于图像像素
\ No newline at end of file
diff --git a/doc/fluid/user_guides/howto/prepare_data/index.rst b/doc/fluid/user_guides/howto/prepare_data/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..56fa928029903f1e3bd3e8064c146797f01b2b85
--- /dev/null
+++ b/doc/fluid/user_guides/howto/prepare_data/index.rst
@@ -0,0 +1,52 @@
+..  _user_guide_prepare_data:
+
+########
+准备数据
+########
+
+PaddlePaddle Fluid支持两种传入数据的方式:
+
+1. 用户需要使用 :code:`fluid.layers.data`
+   配置数据输入层，并在 :code:`fluid.Executor` 或 :code:`fluid.ParallelExecutor`
+   中，使用 :code:`executor.run(feed=...)` 传入训练数据。
+
+2. 用户需要先将训练数据
+   转换成 Paddle 识别的 :code:`fluid.recordio_writer` ， 再使用
+   :code:`fluid.layers.open_files` 以及 :code:`fluid.layers.reader` 配置数据读取。
+
+这两种准备数据方法的比较如下:
+
+.. _user_guide_prepare_data_comparision:
+
++------------+----------------------------------+---------------------------------------+
+|            |        Feed数据                  |         使用Reader                    |
++============+==================================+=======================================+
+| API接口    | :code:`executor.run(feed=...)`   |      :code:`fluid.layers.reader`      |
++------------+----------------------------------+---------------------------------------+
+| 数据格式   |           Numpy Array            | :code:`fluid.recordio_writer`         |
++------------+----------------------------------+---------------------------------------+
+| 数据增强   | Python端使用其他库完成           | 使用Fluid中的Operator 完成            |
++------------+----------------------------------+---------------------------------------+
+|   速度     |                 慢               |                 快                    |
++------------+----------------------------------+---------------------------------------+
+| 推荐用途   |   调试模型                       |   工业训练                            |
++------------+----------------------------------+---------------------------------------+
+
+这些准备数据的详细使用方法，请参考:
+
+.. toctree::
+   :maxdepth: 2
+
+   feeding_data
+   use_recordio_reader
+
+Python Reader
+#############
+
+为了方便用户在Python中定义数据处理流程，PaddlePaddle Fluid支持 Python Reader，
+具体请参考:
+
+.. toctree::
+   :maxdepth: 2
+
+   reader.md
diff --git a/source/user_guides/howto/prepare_data/reader.md b/doc/fluid/user_guides/howto/prepare_data/reader.md
similarity index 100%
rename from source/user_guides/howto/prepare_data/reader.md
rename to doc/fluid/user_guides/howto/prepare_data/reader.md
diff --git a/doc/fluid/user_guides/howto/prepare_data/use_recordio_reader.rst b/doc/fluid/user_guides/howto/prepare_data/use_recordio_reader.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dfda33f1b03516fe2c704f55d095955282b19109
--- /dev/null
+++ b/doc/fluid/user_guides/howto/prepare_data/use_recordio_reader.rst
@@ -0,0 +1,167 @@
+.. _user_guide_use_recordio_as_train_data:
+
+############################
+使用RecordIO文件作为训练数据
+############################
+
+相比于 :ref:`user_guide_use_numpy_array_as_train_data`，
+:ref:`user_guide_use_recordio_as_train_data` 的性能更好；
+但是用户需要先将训练数据集转换成RecordIO文件格式，再使用
+:code:`fluid.layers.open_files()` 层在神经网络配置中导入 RecordIO 文件。
+用户还可以使用 :code:`fluid.layers.double_buffer()` 加速数据从内存到显存的拷贝，
+使用 :code:`fluid.layers.Preprocessor` 工具进行数据增强。
+
+将训练数据转换成RecordIO文件格式
+################################
+
+:code:`fluid.recordio_writer` 中，每个记录都是一个
+:code:`vector<LoDTensor>`, 即一个支持序列信息的Tensor数组。这个数组包括训练所需
+的所有特征。例如对于图像分类来说，这个数组可以包含图片和分类标签。
+
+用户可以使用 :code:`fluid.recordio_writer.convert_reader_to_recordio_file()` 将
+:ref:`user_guide_reader` 转换成一个RecordIO文件。或者可以使用
+:code:`fluid.recordio_writer.convert_reader_to_recordio_files()` 将一个
+:ref:`user_guide_reader` 转换成多个RecordIO文件。
+
+具体使用方法为:
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+   import numpy
+
+   def reader_creator():
+       def __impl__():
+           for i in range(1000):
+               yield [
+                        numpy.random.random(size=[3,224,224]).astype("float32"),
+                        numpy.random.random(size=[1]).astype("int64")
+                     ]
+       return __impl__
+
+   img = fluid.layers.data(name="image", shape=[3, 224, 224])
+   label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+   feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
+
+   BATCH_SIZE = 32
+   reader = paddle.batch(reader_creator(), batch_size=BATCH_SIZE)
+   fluid.recordio_writer.convert_reader_to_recordio_file(
+      "train.recordio", feeder=feeder, reader_creator=reader)
+
+其中 :code:`reader_creator` 创建了一个 :code:`Reader`。
+:ref:`api_fluid_data_feeder_DataFeeder`
+是将 :code:`Reader` 转换成 :code:`LoDTensor` 的工具。详细请参考
+:ref:`user_guide_reader` 。
+
+上述程序将 :code:`reader_creator` 的数据转换成了 :code:`train.recordio` 文件，
+其中每一个record 含有 32 条样本。如果batch size会在训练过程中调整，
+用户可以将每一个Record的样本数设置成1。并参考
+:ref:`user_guide_use_recordio_as_train_data_use_op_create_batch`。
+
+
+配置神经网络, 打开RecordIO文件
+##############################
+
+RecordIO文件转换好之后，用户可以使用 :code:`fluid.layers.open_files()`
+打开文件，并使用 :code:`fluid.layers.read_file` 读取文件内容。
+简单使用方法如下:
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+
+   file_obj = fluid.layers.open_files(
+     filenames=["train.recordio"],
+     shape=[[3, 224, 224], [1]],
+     lod_levels=[0, 0],
+     dtypes=["float32", "int64"],
+     pass_num=100
+   )
+
+   image, label = fluid.layers.read_file(file_obj)
+
+其中如果设置了 :code:`pass_num` ，那么当所有数据读完后，会重新读取数据，
+直到读取了 :code:`pass_num` 遍。
+
+
+
+进阶使用
+########
+
+
+使用 :code:`fluid.layers.double_buffer()`
+------------------------------------------
+
+:code:`Double buffer` 使用双缓冲技术，将训练数据从内存中复制到显存中。配置双缓冲
+需要使用 :code:`fluid.layers.double_buffer()` 修饰文件对象。 例如:
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+   file_obj = fluid.layers.open_files(...)
+   file_obj = fluid.layers.double_buffer(file_obj)
+
+   image, label = fluid.layers.read_file(file_obj)
+
+双缓冲技术可以参考
+`Multiple buffering <https://en.wikipedia.org/wiki/Multiple_buffering>`_ 。
+
+配置数据增强
+------------
+
+使用 :code:`fluid.layers.Preprocessor` 可以配置文件的数据增强方法。例如
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+   file_obj = fluid.layers.open_files(...)
+   preprocessor = fluid.layers.Preprocessor(reader=file_obj)
+   with preprocessor.block():
+       image, label = preprocessor.inputs()
+       image = image / 2
+       label = label + 1
+       preprocessor.outputs(image, label)
+
+如上代码所示，使用 :code:`Preprocessor` 定义了一个数据增强模块，并在
+:code:`with preprocessor.block()` 中定义了数据增强的具体操作。 用户通过配置
+:code:`preprocessor.inputs()` 获得数据文件中的各个字段。 并用
+:code:`preprocessor.outputs()` 标记预处理后的输出。
+
+.. _user_guide_use_recordio_as_train_data_use_op_create_batch:
+
+使用Op组batch
+-------------
+
+使用 :code:`fluid.layers.batch()` 可以在训练的过程中动态的组batch。例如
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+   file_obj = fluid.layers.open_files(...)
+   file_obj = fluid.layers.batch(file_obj, batch_size=32)
+
+   img, label = fluid.layers.read_file(file_obj)
+
+需要注意的是，如果数据集中的最后几个样本不能组成 :code:`batch_size` 大小的批量数据，
+那么这几个样本直接组成一个批量数据进行训练。
+
+读入数据的shuffle
+-----------------
+
+使用 :code:`fluid.layers.shuffle()` 可以在训练过程中动态重排训练数据。例如
+
+.. code-block:: python
+
+   import paddle.fluid as fluid
+   file_obj = fluid.layers.open_files(...)
+   file_obj = fluid.layers.shuffle(file_obj, buffer_size=8192)
+
+   img, label = fluid.layers.read_file(file_obj)
+
+需要注意的是:
+
+1. :code:`shuffle` 实现方法是:
+先读入 :code:`buffer_size` 条样本，再随机的选出样本进行训练。
+
+2. :code:`shuffle` 中 :code:`buffer_size` 会占用训练内存，需要确定训练过程中内存
+足够支持缓存 :code:`buffer_size` 条数据。
diff --git a/source/user_guides/howto/training/checkpoint_doc_cn.md b/doc/fluid/user_guides/howto/training/checkpoint_doc_cn.md
similarity index 100%
rename from source/user_guides/howto/training/checkpoint_doc_cn.md
rename to doc/fluid/user_guides/howto/training/checkpoint_doc_cn.md
diff --git a/source/user_guides/howto/training/checkpoint_doc_en.md b/doc/fluid/user_guides/howto/training/checkpoint_doc_en.md
similarity index 100%
rename from source/user_guides/howto/training/checkpoint_doc_en.md
rename to doc/fluid/user_guides/howto/training/checkpoint_doc_en.md
diff --git a/source/user_guides/howto/training/cluster_howto.rst b/doc/fluid/user_guides/howto/training/cluster_howto.rst
similarity index 100%
rename from source/user_guides/howto/training/cluster_howto.rst
rename to doc/fluid/user_guides/howto/training/cluster_howto.rst
diff --git a/source/user_guides/howto/training/cluster_quick_start.rst b/doc/fluid/user_guides/howto/training/cluster_quick_start.rst
similarity index 100%
rename from source/user_guides/howto/training/cluster_quick_start.rst
rename to doc/fluid/user_guides/howto/training/cluster_quick_start.rst
diff --git a/source/user_guides/howto/training/index.rst b/doc/fluid/user_guides/howto/training/index.rst
similarity index 100%
rename from source/user_guides/howto/training/index.rst
rename to doc/fluid/user_guides/howto/training/index.rst
diff --git a/source/user_guides/howto/training/multi_node.rst b/doc/fluid/user_guides/howto/training/multi_node.rst
similarity index 100%
rename from source/user_guides/howto/training/multi_node.rst
rename to doc/fluid/user_guides/howto/training/multi_node.rst
diff --git a/source/user_guides/howto/training/save_load_variables.rst b/doc/fluid/user_guides/howto/training/save_load_variables.rst
similarity index 100%
rename from source/user_guides/howto/training/save_load_variables.rst
rename to doc/fluid/user_guides/howto/training/save_load_variables.rst
diff --git a/source/user_guides/howto/training/single_node.rst b/doc/fluid/user_guides/howto/training/single_node.rst
similarity index 100%
rename from source/user_guides/howto/training/single_node.rst
rename to doc/fluid/user_guides/howto/training/single_node.rst
diff --git a/source/user_guides/howto/training/src/dist_train_nccl2.graffle b/doc/fluid/user_guides/howto/training/src/dist_train_nccl2.graffle
similarity index 100%
rename from source/user_guides/howto/training/src/dist_train_nccl2.graffle
rename to doc/fluid/user_guides/howto/training/src/dist_train_nccl2.graffle
diff --git a/source/user_guides/howto/training/src/dist_train_nccl2.png b/doc/fluid/user_guides/howto/training/src/dist_train_nccl2.png
similarity index 100%
rename from source/user_guides/howto/training/src/dist_train_nccl2.png
rename to doc/fluid/user_guides/howto/training/src/dist_train_nccl2.png
diff --git a/source/user_guides/howto/training/src/dist_train_pserver.graffle b/doc/fluid/user_guides/howto/training/src/dist_train_pserver.graffle
similarity index 100%
rename from source/user_guides/howto/training/src/dist_train_pserver.graffle
rename to doc/fluid/user_guides/howto/training/src/dist_train_pserver.graffle
diff --git a/source/user_guides/howto/training/src/dist_train_pserver.png b/doc/fluid/user_guides/howto/training/src/dist_train_pserver.png
similarity index 100%
rename from source/user_guides/howto/training/src/dist_train_pserver.png
rename to doc/fluid/user_guides/howto/training/src/dist_train_pserver.png
diff --git a/source/user_guides/howto/training/src/parallelism.png b/doc/fluid/user_guides/howto/training/src/parallelism.png
similarity index 100%
rename from source/user_guides/howto/training/src/parallelism.png
rename to doc/fluid/user_guides/howto/training/src/parallelism.png
diff --git a/source/user_guides/howto/training/test_while_training.rst b/doc/fluid/user_guides/howto/training/test_while_training.rst
similarity index 100%
rename from source/user_guides/howto/training/test_while_training.rst
rename to doc/fluid/user_guides/howto/training/test_while_training.rst
diff --git a/source/user_guides/index.rst b/doc/fluid/user_guides/index.rst
similarity index 100%
rename from source/user_guides/index.rst
rename to doc/fluid/user_guides/index.rst
diff --git a/doc/fluid/user_guides/models/index.rst b/doc/fluid/user_guides/models/index.rst
new file mode 120000
index 0000000000000000000000000000000000000000..5ac5e2d94aca9f6a9abc84e0ec1137fda576d435
--- /dev/null
+++ b/doc/fluid/user_guides/models/index.rst
@@ -0,0 +1 @@
+../../../../external/models/fluid/README.cn.rst
\ No newline at end of file
diff --git a/doc/mobile/CMakeLists.txt b/doc/mobile/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7b34ba8d0768427802b11614c6962f3c3f6ef4e3
--- /dev/null
+++ b/doc/mobile/CMakeLists.txt
@@ -0,0 +1,52 @@
+if(NOT DEFINED SPHINX_THEME)
+    set(SPHINX_THEME default)
+endif()
+
+if(NOT DEFINED SPHINX_THEME_DIR)
+    set(SPHINX_THEME_DIR)
+endif()
+
+# English docs: directory that receives the configured conf.py and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# English docs: Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# English docs: HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
+configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+        "${BINARY_BUILD_DIR_EN}/conf.py"
+        @ONLY)
+
+sphinx_add_target(paddle_mobile_docs
+        html
+        ${BINARY_BUILD_DIR_EN}
+        ${SPHINX_CACHE_DIR_EN}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+        ${SPHINX_HTML_DIR_EN})
+
+# Chinese docs: directory that receives the configured conf.py and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Chinese docs: Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# Chinese docs: HTML output directory
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
+        "${BINARY_BUILD_DIR_CN}/conf.py"
+        @ONLY)
+
+sphinx_add_target(paddle_mobile_docs_cn
+        html
+        ${BINARY_BUILD_DIR_CN}
+        ${SPHINX_CACHE_DIR_CN}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+        ${SPHINX_HTML_DIR_CN})
diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..0607748b751e9f2d606236d9e98868335379b05c
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -0,0 +1,187 @@
+# Android平台编译指南
+
+用户可通过如下两种方式，交叉编译Android平台上适用的PaddlePaddle库：
+
+- [基于Docker容器的编译方式](#基于docker容器的编译方式)
+- [基于Linux交叉编译环境的编译方式](#基于linux交叉编译环境的编译方式)
+
+## 基于Docker容器的编译方式
+Docker能在所有主要操作系统（包括Linux，Mac OS X和Windows）上运行，因此，使用基于Docker容器的编译方式，用户可在自己熟悉的开发平台上编译Android平台上适用的PaddlePaddle库。
+
+### 构建PaddlePaddle的Android开发镜像
+我们把PaddlePaddle的交叉编译环境打包成一个镜像，称为开发镜像，里面涵盖了交叉编译Android版PaddlePaddle库需要的所有编译工具。
+
+```bash
+$ git clone https://github.com/PaddlePaddle/Paddle.git
+$ cd Paddle
+$ docker build -t username/paddle-android:dev . -f Dockerfile.android
+```
+
+用户也可以使用PaddlePaddle提供的官方开发镜像：
+
+```bash
+$ docker pull paddlepaddle/paddle:latest-dev-android
+```
+
+对于国内用户，我们提供了加速访问的镜像源：
+
+```bash
+$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
+```
+
+### 编译PaddlePaddle C-API库
+构建好开发镜像后，即可使用开发镜像来编译Android版PaddlePaddle C-API库。
+Android的Docker开发镜像向用户提供两个可配置的参数：
+
+<table class="docutils">
+<colgroup>
+  <col width="25%" />
+  <col width="50%" />
+  <col width="25%" />
+</colgroup>
+<thead valign="bottom">
+  <tr class="row-odd">
+  <th class="head">Argument</th>
+  <th class="head">Optional Values</th>
+  <th class="head">Default</th>
+</tr>
+</thead>
+<tbody valign="top">
+  <tr class="row-even">
+  <td>ANDROID_ABI</td>
+  <td>armeabi-v7a, arm64-v8a</td>
+  <td>armeabi-v7a</td>
+</tr>
+<tr class="row-odd">
+  <td>ANDROID_API</td>
+  <td>>= 16</td>
+  <td>21</td>
+</tr>
+</tbody>
+</table>
+
+- 编译`armeabi-v7a`，`Android API 21`的PaddlePaddle库
+
+```bash
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
+```
+
+- 编译`arm64-v8a`，`Android API 21`的PaddlePaddle库
+
+```bash
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
+```
+
+执行上述`docker run`命令时，容器执行[paddle/scripts/paddle_build.sh build_android](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置，并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`，`ANDROID_API<21`时，Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节，根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后，PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录，所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
+
+## 基于Linux交叉编译环境的编译方式
+本文档将以Linux x86-64平台为例，介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。
+
+### 准备交叉编译环境
+
+从源码交叉编译PaddlePaddle，用户需要提前准备好交叉编译环境。Android平台上使用的C/C++交叉编译工具链为[Android NDK](https://developer.android.com/ndk/downloads/index.html?hl=zh-cn)，用户可自行前往下载预编译好的版本，也可通过以下命令获取：
+
+```bash
+wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
+unzip -q android-ndk-r14b-linux-x86_64.zip
+```
+
+Android NDK中包含了所有Android API级别、所有架构（arm/arm64/x86/mips）需要用到的编译工具和系统库。用户可根据自己的编译目标架构、所需支持的最低Android API级别，构建[独立工具链](https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn)。
+
+- 构建`armeabi-v7a`、 `Android API 21`的独立工具链：
+
+```bash
+your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+        --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
+```
+
+此命令将在`your/path/to/arm_standalone_toolchain`目录生成一套独立编译工具链，面向架构为32位ARM架构，支持的最小的Android API级别为21，支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。
+
+- 构建`arm64-v8a`、 `Android API 21`的独立工具链：
+
+```bash
+your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+        --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
+```
+
+此命令将在`your/path/to/arm64_standalone_toolchain`目录生成一套独立编译工具链，面向架构为64位ARM架构，支持的最小Android API级别为21，支持编译器`aarch64-linux-android-gcc (GCC) 4.9`和`clang 3.8`。
+
+### 配置交叉编译参数
+
+CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置，PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake)，以提供一些默认的编译器和编译参数相关配置。注意，从CMake 3.7版本开始，CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时，将会将用户传进来的配置参数传递CMake系统，交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。
+
+交叉编译Android版本的PaddlePaddle库时，有一些必须配置的参数：
+- `CMAKE_SYSTEM_NAME`，CMake编译的目标平台，必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后，PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本，并自动编译PaddlePaddle所需的所有第三方库。此外，还会强制设置一些PaddlePaddle参数的值（`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`、`WITH_MKL=OFF`、`WITH_GOLANG=OFF`）。
+- `WITH_C_API`，必须设置为`ON`。在Android平台上只支持使用C-API来预测。
+- `WITH_SWIG_PY`，必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。
+
+Android平台可选配置参数：
+
+- `ANDROID_STANDALONE_TOOLCHAIN`，独立工具链所在的绝对路径，或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动推导和设置需要使用的交叉编译器、sysroot、以及Android API级别；否则，用户需要在cmake时手动设置这些值。无默认值。
+- `ANDROID_TOOLCHAIN`，目标工具链。可设置`gcc/clang`，默认值为`clang`。
+	- CMake 3.7以上，将会始终使用`clang`工具链；CMake 3.7以下，可设置`ANDROID_TOOLCHAIN=gcc`以使用`gcc`工具链。
+	- Android官方提供的`clang`编译器要求系统支持`GLIBC 2.15`以上。
+- `ANDROID_ABI`，目标架构ABI。目前支持`armeabi-v7a`和`arm64-v8a`，默认值为`armeabi-v7a`。
+- `ANDROID_NATIVE_API_LEVEL`，工具链的Android API级别。若没有显式设置，PaddlePaddle将根据`ANDROID_STANDALONE_TOOLCHAIN`的值自动推导得到。
+- `ANDROID_ARM_MODE`，是否使用ARM模式。
+	- `ANDROID_ABI=armeabi-v7a`时，可设置`ON/OFF`，默认值为`ON`；
+	- `ANDROID_ABI=arm64-v8a`时，不需要设置。
+- `ANDROID_ARM_NEON`，是否使用NEON指令。
+	- `ANDROID_ABI=armeabi-v7a`时，可设置`ON/OFF`，默认值为`ON`；
+	- `ANDROID_ABI=arm64-v8a`时，不需要设置。
+
+其他配置参数：
+
+- `USE_EIGEN_FOR_BLAS`，是否使用Eigen库进行矩阵计算。可设置`ON/OFF`，默认值为`OFF`。
+- `HOST_C/CXX_COMPILER`，宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC/CXX`的值；若环境变量`CC/CXX`没有设置，则设置成`cc/c++`编译器。
+
+常用的cmake配置如下：
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \
+      -DANDROID_ABI=armeabi-v7a \
+      -DANDROID_ARM_NEON=ON \
+      -DANDROID_ARM_MODE=ON \
+      -DUSE_EIGEN_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
+      -DANDROID_ABI=arm64-v8a \
+      -DUSE_EIGEN_FOR_BLAS=OFF \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+用户还可根据自己的需求设置其他编译参数。
+
+- 设置`CMAKE_BUILD_TYPE`为`MinSizeRel`，最小化生成的库的大小。
+- 设置`CMAKE_BUILD_TYPE`为`Release`，获得最快的执行速度。
+- 用户亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。
+
+**性能TIPS**，为了达到最快的计算速度，在CMake参数配置上，有以下建议：
+
+- 设置`CMAKE_BUILD_TYPE`为`Release`
+- 使用`clang`编译工具链
+- `armeabi-v7a`时，设置`USE_EIGEN_FOR_BLAS=ON`，使用Eigen进行矩阵计算；`arm64-v8a`时，设置`USE_EIGEN_FOR_BLAS=OFF`，使用OpenBLAS进行矩阵计算
+
+### 编译和安装
+
+CMake配置完成后，执行以下命令，PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。
+
+```bash
+make
+make install
+```
+
+注意：如果你曾经在源码目录下编译过其他平台的PaddlePaddle库，请先使用`rm -rf`命令删除`third_party`目录和`build`目录，以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。
+
+执行完安装命令后，`your/path/to/install`目录中会包含`include`、`lib`和`third_party`目录，其中`include`中包含C-API的头文件，`lib`中包含若干个不同Android ABI的PaddlePaddle库，`third_party`中包含所依赖的所有第三方库。至此，PaddlePaddle库已经安装完成，用户可将`your/path/to/install`目录下的生成文件用于深度学习相关Android App中，调用方法见C-API文档。
diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..572063e8012efee2d2e142eb57e459e0e8c6382c
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_android_en.md
@@ -0,0 +1,189 @@
+# Build PaddlePaddle for Android
+
+There are two approaches to build PaddlePaddle for Android: 
+
+- [Cross-Compiling Using Docker](#cross-compiling-using-docker)
+- [Cross-Compiling on Linux](#cross-compiling-on-linux) 
+
+## Cross-Compiling Using Docker
+
+Docker-based cross-compiling is the recommended approach because Docker runs on all major operating systems, including Linux, Mac OS X, and Windows.
+
+### Build the Docker Image
+
+The following steps pack all the tools that we need to build PaddlePaddle into a Docker image.
+
+```bash
+$ git clone https://github.com/PaddlePaddle/Paddle.git
+$ cd Paddle
+$ docker build -t paddle:dev-android . -f Dockerfile.android
+```
+
+Users can directly use the published Docker image.
+
+```bash
+$ docker pull paddlepaddle/paddle:latest-dev-android
+```
+
+For users in China, we provide a faster mirror.
+
+```bash
+$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
+```
+
+### Build the Inference Library
+
+We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
+
+```bash
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android ./paddle/scripts/paddle_build.sh build_android
+```
+
+The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
+
+<table class="docutils">
+<colgroup>
+  <col width="25%" />
+  <col width="50%" />
+  <col width="25%" />
+</colgroup>
+<thead valign="bottom">
+  <tr class="row-odd">
+  <th class="head">Argument</th>
+  <th class="head">Optional Values</th>
+  <th class="head">Default</th>
+</tr>
+</thead>
+<tbody valign="top">
+  <tr class="row-even">
+  <td>ANDROID_ABI</td>
+  <td>armeabi-v7a, arm64-v8a</td>
+  <td>armeabi-v7a</td>
+</tr>
+<tr class="row-odd">
+  <td>ANDROID_API</td>
+  <td>>= 16</td>
+  <td>21</td>
+</tr>
+</tbody>
+</table>
+
+The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
+
+The build command, [`paddle/scripts/paddle_build.sh build_android`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh), generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the arguments `ANDROID_ABI` and `ANDROID_API`.  For information about other configuration arguments, please continue reading.
+
+The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`.
+
+## Cross-Compiling on Linux
+
+The Linux-based approach to cross-compiling is to run the steps in `Dockerfile.android` manually on a Linux x64 computer.
+
+### Setup the Environment
+
+To build for Android, we need the [Android NDK](
+https://developer.android.com/ndk/downloads/index.html):
+
+```bash
+wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
+unzip -q android-ndk-r14b-linux-x86_64.zip
+```
+
+Android NDK includes everything we need to build the [*standalone toolchain*](https://developer.android.com/ndk/guides/standalone_toolchain.html), which is then used to build PaddlePaddle for Android.  (We plan to remove the intermediate stage of building the standalone toolchain in the near future.)
+
+- To build the standalone toolchain for `armeabi-v7a` and Android API level 21:
+
+```bash
+your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+        --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
+```
+  
+  The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`.
+
+- To build the standalone toolchain for `arm64-v8a` and Android API level 21:
+
+```bash
+your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+        --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
+```
+
+  The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.
+
+### Cross-Compiling Arguments
+
+CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling).  PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake.  `android.cmake` is not required for CMake >= 3.7, which supports Android cross-compiling natively. PaddlePaddle detects the CMake version; for version 3.7 and newer, it uses [the official support](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling).
+
+Some other CMake arguments you need to know:
+
+- `CMAKE_SYSTEM_NAME` must be `Android`.  This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF` and `WITH_GOLANG=OFF`.
+- `WITH_C_API` must be `ON`, to build the C-based inference library for Android.
+- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API.
+
+Some Android-specific arguments:
+
+- `ANDROID_STANDALONE_TOOLCHAIN`: the absolute path of the Android standalone toolchain, or the path relative to the CMake build directory.  PaddlePaddle's CMake extensions would derive the cross-compiler, sysroot and Android API level from this argument.
+- `ANDROID_TOOLCHAIN`: could be `gcc` or `clang`.  The default value is `clang`.
+  - For CMake >= 3.7, it should anyway be `clang`.  For older versions, it could be `gcc`.
+  - Android's official `clang` requires `glibc` >= 2.15.
+- `ANDROID_ABI`: could be `armeabi-v7a` or `arm64-v8a`.  The default value is `armeabi-v7a`.
+- `ANDROID_NATIVE_API_LEVEL`: could be derived from the value of `ANDROID_STANDALONE_TOOLCHAIN`.
+- `ANDROID_ARM_MODE`:
+  - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`;
+  - no need to specify when `ANDROID_ABI=arm64-v8a`.
+- `ANDROID_ARM_NEON`: indicates whether to use NEON instructions.
+  - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`;
+  - no need to specify when `ANDROID_ABI=arm64-v8a`.
+
+Other useful arguments:
+
+- `USE_EIGEN_FOR_BLAS`: indicates whether to use Eigen for matrix computation.  Could be `ON` or `OFF`, defaults to `OFF`.
+- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS.  It defaults to the value of the environment variable `CC/CXX`, or to `cc/c++` if `CC/CXX` is not set.
+
+Some frequent configurations for your reference:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \
+      -DANDROID_ABI=armeabi-v7a \
+      -DANDROID_ARM_NEON=ON \
+      -DANDROID_ARM_MODE=ON \
+      -DUSE_EIGEN_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+```
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
+      -DANDROID_ABI=arm64-v8a \
+      -DUSE_EIGEN_FOR_BLAS=OFF \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+
+There are some other arguments you might want to configure.
+
+- `CMAKE_BUILD_TYPE=MinSizeRel` minimizes the size of library.
+- `CMAKE_BUILD_TYPE=Release` optimizes the runtime performance.
+
+Our own tip for performance optimization is to use clang together with Eigen or OpenBLAS:
+
+- `CMAKE_BUILD_TYPE=Release`
+- `ANDROID_TOOLCHAIN=clang`
+- `USE_EIGEN_FOR_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`.
+
+### Build and Install
+
+After running `cmake`, we can run `make; make install` to build and install.
+
+Before building, you might want to remove the `third_party` and `build` directories including pre-built libraries for other architectures.
+
+After building, in the directory `CMAKE_INSTALL_PREFIX`, you will find three sub-directories:
+
+- `include`: the header file of the inference library,
+- `lib`: the inference library built for various Android ABIs,
+- `third_party`: dependent third-party libraries built for Android.
diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..d5196d9a4c93c7692d2a624ec7d0650e32806338
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
@@ -0,0 +1,117 @@
+# iOS平台编译指南
+交叉编译iOS平台上适用的PaddlePaddle库，需要在MacOS系统上进行。本文将介绍在MacOS上，从源码交叉编译iOS平台上适用的PaddlePaddle库。
+
+## 准备交叉编译环境
+Apple官方为iOS开发提供了完整的交叉编译工具和集成开发环境，用户从App Store下载安装Xcode即可。也可自行前往官网下载，[Xcode](https://developer.apple.com/cn/xcode/)。安装完成之后，可在命令行执行`xcodebuild -version`，判断是否安装成功。
+
+```bash
+$ xcodebuild -version
+Xcode 9.0
+Build version 9A235
+```
+
+## 配置交叉编译参数
+
+PaddlePaddle为交叉编译提供了工具链配置文件[cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake)，以提供一些默认的编译器和编译参数配置。
+
+交叉编译iOS版本的PaddlePaddle库时，有一些必须配置的参数：
+
+- `CMAKE_SYSTEM_NAME`，CMake编译的目标平台，必须设置为`iOS`。在设置`CMAKE_SYSTEM_NAME=iOS`后，PaddlePaddle的CMake系统会自动编译所有的第三方依赖库，并且强制设置一些PaddlePaddle参数的值（`WITH_C_API=ON`、`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`）。
+- `WITH_C_API`，是否编译C-API预测库，必须设置为ON。在iOS平台上只支持使用C-API来预测。
+- `WITH_SWIG_PY`，必须设置为`OFF`。在iOS平台上不支持通过swig调用来训练或者预测。
+
+iOS平台可选配置参数：
+
+- `IOS_PLATFORM`，可设置为`OS`（默认值）或`SIMULATOR`。
+  - `OS`，构建目标为`arm`架构的iPhone或者iPad等物理设备。
+  - `SIMULATOR`，构建目标为`x86`架构的模拟器平台。
+- `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示，默认编译所有架构：
+
+    <table class="docutils">
+    <colgroup>
+      <col width="35%" />
+      <col width="65%" />
+    </colgroup>
+    <thead valign="bottom">
+      <tr class="row-odd">
+      <th class="head">IOS_PLATFORM</th>
+      <th class="head">IOS_ARCH</th>
+    </tr>
+    </thead>
+    <tbody valign="top">
+      <tr class="row-even">
+      <td>OS</td>
+      <td>armv7, armv7s, arm64 </td>
+    </tr>
+    <tr class="row-odd">
+      <td>SIMULATOR</td>
+      <td>i386, x86_64 </td>
+    </tr>
+    </tbody>
+    </table>
+
+- `IOS_DEPLOYMENT_TARGET`，最小的iOS部署版本，默认值为`7.0`。
+- `IOS_ENABLE_BITCODE`，是否使能[Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3)，可设置`ON/OFF`，默认值为`ON`。
+- `IOS_USE_VECLIB_FOR_BLAS`，是否使用[vecLib](https://developer.apple.com/documentation/accelerate/veclib)框架进行BLAS矩阵计算，可设置`ON/OFF`，默认值为`OFF`。
+- `IOS_DEVELOPMENT_ROOT`，`Developer`目录，可显式指定为`/path/to/platform/Developer`。若未显式指定，PaddlePaddle将会根据`IOS_PLATFORM`自动选择`Xcode`对应`platform`的`Developer`目录。
+- `IOS_SDK_ROOT`，所使用`SDK`的根目录，可显式指定为`/path/to/platform/Developer/SDKs/SDK`。若未显式指定，PaddlePaddle将会自动选择`IOS_DEVELOPMENT_ROOT`目录下最新的`SDK`版本。
+
+其他配置参数：
+
+- `USE_EIGEN_FOR_BLAS`，是否使用Eigen库进行矩阵计算，在`IOS_USE_VECLIB_FOR_BLAS=OFF`时有效。可设置`ON/OFF`，默认值为`OFF`。
+- `HOST_C/CXX_COMPILER`，宿主机的C/C++编译器。默认值为环境变量`CC/CXX`的值；若环境变量`CC/CXX`未设置，则使用`cc/c++`编译器。
+
+常用的cmake配置如下：
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=OS \
+      -DIOS_ARCH="armv7;arm64" \
+      -DIOS_ENABLE_BITCODE=ON \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=SIMULATOR \
+      -DIOS_ARCH="x86_64" \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+用户还可根据自己的需求设置其他编译参数。比如希望最小化生成库的大小，可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`；若希望得到最快的执行速度，则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。
+
+**性能TIPS**，为了达到最快的计算速度，在CMake参数配置上，有以下建议：
+
+- 设置`CMAKE_BUILD_TYPE`为`Release`
+- 设置`IOS_USE_VECLIB_FOR_BLAS=ON`，调用`vecLib`框架提供的BLAS函数进行矩阵计算。
+
+## 编译和安装
+
+CMake配置完成后，执行以下命令，PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。
+
+```
+$ make
+$ make install
+```
+
+注意：如果你曾在源码目录下编译过其他平台的PaddlePaddle库，请先使用`rm -rf`命令删除`third_party`目录和`build`目录，以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。
+
+执行完安装命令后，`your/path/to/install`目录中会包含以下内容：
+
+- `include`目录，其中包含所有C-API的头文件
+- `lib`目录，其中包含PaddlePaddle的C-API静态库
+- `third_party`目录，其中包含所依赖的所有第三方库
+
+注意，如果PaddlePaddle库需要同时支持真机和模拟器，则需要分别编译真机和模拟器版本，然后使用`lipo`工具合并fat库。
+
+至此，PaddlePaddle库已经安装完成，用户可将合成的fat库用于深度学习相关的iOS App中，调用方法见C-API文档。
diff --git a/doc/mobile/cross_compiling_for_ios_en.md b/doc/mobile/cross_compiling_for_ios_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..19bfe86c511c7e43b462f94c8cabba420b3007f1
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_ios_en.md
@@ -0,0 +1,120 @@
+# Build PaddlePaddle for iOS
+
+This tutorial will walk you through cross compiling the PaddlePaddle library for iOS from the source in MacOS.
+
+## Preparation
+
+Apple provides Xcode, a complete cross-compiling toolchain and IDE for iOS development. Download it from the App Store or from [here](https://developer.apple.com/cn/xcode/). To verify your installation, run the following command:
+
+```bash
+$ xcodebuild -version
+Xcode 9.0
+Build version 9A235
+```
+
+## Cross-compiling configurations
+
+PaddlePaddle provides a cross-compiling toolchain configuration file, [cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake), which has some default settings for frequently used compilers.
+
+There are some mandatory CMake parameters that need to be set when cross-compiling PaddlePaddle for iOS:
+
+- `CMAKE_SYSTEM_NAME`, CMake compiling target platform name, has to be `iOS`. PaddlePaddle CMake will compile all the third party dependencies and enforce some parameters (`WITH_C_API=ON`, `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`,`WITH_RDMA=OFF`) when this variable is set with value `iOS`.
+
+- `WITH_C_API`, Whether to compile inference C-API library, has to be `ON`, since C-API is the only supported interface for inferencing in iOS.
+- `WITH_SWIG_PY`, has to be `OFF`. It's not supported to inference or train via swig in iOS.
+
+Optional CMake parameters for iOS are:
+
+- `IOS_PLATFORM`, either `OS` (default) or `SIMULATOR`.
+  - `OS`, build targets ARM-based physical devices like iPhone or iPad.
+  - `SIMULATOR`, build targets x86 architecture simulators.
+- `IOS_ARCH`, target architecture. By default, all architecture types will be compiled. If you need to specify the architecture to compile for, please find valid values for different `IOS_PLATFORM` settings from the table below:
+
+    <table class="docutils">
+    <colgroup>
+      <col width="35%" />
+      <col width="65%" />
+    </colgroup>
+    <thead valign="bottom">
+      <tr class="row-odd">
+      <th class="head">IOS_PLATFORM</th>
+      <th class="head">IOS_ARCH</th>
+    </tr>
+    </thead>
+    <tbody valign="top">
+      <tr class="row-even">
+      <td>OS</td>
+      <td>armv7, armv7s, arm64 </td>
+    </tr>
+    <tr class="row-odd">
+      <td>SIMULATOR</td>
+      <td>i386, x86_64 </td>
+    </tr>
+    </tbody>
+    </table>
+
+- `IOS_DEPLOYMENT_TARGET`, minimum iOS deployment version, `7.0` by default.
+- `IOS_ENABLE_BITCODE`, whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3), values can be `ON/OFF`, `ON` by default.
+- `IOS_USE_VECLIB_FOR_BLAS`, whether to use [vecLib](https://developer.apple.com/documentation/accelerate/veclib) framework for BLAS computing. values can be `ON/OFF`, `OFF` by default.
+- `IOS_DEVELOPMENT_ROOT`, the path to `Developer` directory, can be explicitly set with your `/path/to/platform/Developer`. If left blank, PaddlePaddle will automatically pick the Xcode corresponding `platform`'s `Developer` directory based on your `IOS_PLATFORM` value.
+- `IOS_SDK_ROOT`, the path to the `SDK` root, can be explicitly set with your `/path/to/platform/Developer/SDKs/SDK`. If left blank, PaddlePaddle will pick the latest SDK in the directory of `IOS_DEVELOPMENT_ROOT`.
+
+Other settings:
+
+- `USE_EIGEN_FOR_BLAS`, whether to use Eigen for matrix computing. effective when `IOS_USE_VECLIB_FOR_BLAS=OFF`. Values can be `ON/OFF`, `OFF` by default.
+- `HOST_C/CXX_COMPILER`, host C/C++ compiler. Uses value from environment variable `CC/CXX` by default or `cc/c++` if `CC/CXX` doesn't exist.
+
+Some typical CMake configurations:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=OS \
+      -DIOS_ARCH="armv7;arm64" \
+      -DIOS_ENABLE_BITCODE=ON \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=SIMULATOR \
+      -DIOS_ARCH="x86_64" \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+You can set other compiling parameters according to your own needs. For example, if you are trying to minimize the library size, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; or if performance is your concern, set `CMAKE_BUILD_TYPE` to `Release`. You can even influence the PaddlePaddle compiling procedure by manually setting the `CMAKE_C/CXX_FLAGS` values.
+
+**TIPS for a better performance**:
+
+- set `CMAKE_BUILD_TYPE` with `Release`
+- set `IOS_USE_VECLIB_FOR_BLAS` with `ON`
+
+## Build and install
+
+After running CMake, run the following commands. PaddlePaddle will download and compile the 3rd party dependencies, then compile and install the PaddlePaddle inference library.
+
+```
+$ make
+$ make install
+```
+
+Please note: if you have compiled PaddlePaddle in the source directory for other platforms, remove the `third_party` and `build` directories within the source tree with `rm -rf` to ensure that all 3rd party dependencies and PaddlePaddle itself are rebuilt with the current CMake configuration.
+
+`your/path/to/install` directory will have following directories after `make install`:
+
+- `include`, contains all the C-API header files.
+- `lib`, contains PaddlePaddle C-API static library.
+- `third_party` contains all the 3rd party libraries.
+
+Please note: if the PaddlePaddle library needs to support both physical devices and simulators, you will need to compile it separately for each, then merge the results into a fat library with `lipo`.
+
+Now you will have PaddlePaddle library compiled and installed, the fat library can be used in deep learning related iOS APPs. Please refer to C-API documentation for usage guides.
diff --git a/doc/mobile/cross_compiling_for_raspberry_cn.md b/doc/mobile/cross_compiling_for_raspberry_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..f8ef9dc8031613831437745995268f3abc392f5b
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_raspberry_cn.md
@@ -0,0 +1,62 @@
+# Raspberry Pi平台编译指南
+
+通常有两个方法来构建基于 Raspberry Pi 的版本：
+
+1. 通过ssh等方式登录到Raspberry Pi系统上来构建。所需的开发工具和第三方库可以参考 [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile)。
+
+1. 另一个方法是交叉编译。这篇文档介绍在 Linux/x64 上交叉编译Raspberry Pi平台上适用的PaddlePaddle的方法和步骤。
+
+## 安装交叉编译器
+
+克隆下面 Github repo
+
+```bash
+git clone https://github.com/raspberrypi/tools.git
+```
+
+即可在 `./tools/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64` 目录里找到交叉编译器 arm-linux-gnueabihf-gcc 4.8.3。运行该编译工具链需要一台 Linux x64 机器以及 2.14版本以上的 glibc。
+
+## 配置交叉编译参数
+
+CMake[支持交叉编译](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。PaddlePaddle for Raspberry Pi的配置信息在[cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake)。
+
+交叉编译Raspberry Pi版本PaddlePaddle库时，有一些必须配置的参数：
+
+- `CMAKE_SYSTEM_NAME`：CMake编译的目标平台，必须配置为`RPi`。在设置`CMAKE_SYSTEM_NAME=RPi`后，PaddlePaddle的CMake系统才认为是在交叉编译Raspberry Pi系统的版本，并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及目标机版OpenBLAS库。
+
+- `RPI_TOOLCHAIN`：编译工具链所在的绝对路径，或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动设置需要使用的交叉编译器；否则，用户需要在cmake时手动设置这些值。无默认值。
+
+- `RPI_ARM_NEON`：是否使用NEON指令。目前必须设置成`ON`，默认值为`ON`。
+
+- `HOST_C/CXX_COMPILER`，宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值；若环境变量`CC`没有设置，则设置成`cc`编译器。
+
+一个常用的CMake配置如下：
+
+```
+cmake -DCMAKE_SYSTEM_NAME=RPi \
+      -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \
+      -DRPI_ARM_NEON=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_GPU=OFF \
+      -DWITH_C_API=ON \
+      -DWITH_PYTHON=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+其中`WITH_C_API=ON`表示需要构建推理库。
+
+用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小，可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`；若希望最快的执行速度，则可设置`CMAKE_BUILD_TYPE`为`Release`。
+
+## 编译和安装
+
+CMake配置完成后，执行以下命令，PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle。
+
+```bash
+make
+make install
+```
+
+注意：如果你曾经在源码目录下编译过其他平台的PaddlePaddle库，请先使用`rm -rf`命令删除`third_party`目录和`build`目录，以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。
+
+执行完安装命令后，`your/path/to/install`目录中会包含`include`和`lib`目录，其中`include`中包含C-API的头文件，`lib`中包含一个Raspberry Pi版本的库。
diff --git a/doc/mobile/cross_compiling_for_raspberry_en.md b/doc/mobile/cross_compiling_for_raspberry_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..3c1a5950ff9553bb725d5a96e3fdf2e5e9f6f95c
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_raspberry_en.md
@@ -0,0 +1,62 @@
+# Build PaddlePaddle for Raspberry Pi
+
+You may use any of the following two approaches to build the inference library of PaddlePaddle for Raspberry Pi:
+
+1. Build using SSH: Log in to a Raspberry Pi using SSH and build the library. The required development tools and third-party dependencies are listed in here: [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile).
+
+1. Cross-compile: We talk about how to cross-compile PaddlePaddle for Raspberry Pi on a Linux/x64 machine, in more detail in this article.
+
+## The Cross-Compiling Toolchain
+
+Step 1. Clone the Github repo by running the following command.
+
+```bash
+git clone https://github.com/raspberrypi/tools.git
+```
+
+Step 2. Use the pre-built cross-compiler found in `./tools/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`.  To run it on a Linux x64 computer, glibc version >= 2.14 is needed.
+
+## CMake Arguments
+
+CMake supports [cross-compiling](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling).  All CMake configuration arguments required for the cross-compilation for Raspberry Pi can be found in [`cmake/cross_compiling/raspberry_pi.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake).
+
+Some important arguments that need to be set:
+
+- `CMAKE_SYSTEM_NAME`: The target platform.  Must be `RPi`.
+
+- `RPI_TOOLCHAIN`: The absolute path of the cross-compiling toolchain.
+
+- `RPI_ARM_NEON`: Whether to use ARM NEON intrinsics. Currently it must be set to `ON`, which is also the default.
+
+- `HOST_C/CXX_COMPILER`: The C/C++ compiler for the host.  It is used to build tools that run on the host, for example, protoc.
+
+A commonly-used CMake configuration is as follows:
+
+```
+cmake -DCMAKE_SYSTEM_NAME=RPi \
+      -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \
+      -DRPI_ARM_NEON=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_GPU=OFF \
+      -DWITH_C_API=ON \
+      -DWITH_PYTHON=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+To build the inference library, please set the argument WITH\_C\_API to ON: `WITH_C_API=ON`.
+
+You can add more arguments. For example, to minimize the size of the generated inference library, you may use `CMAKE_BUILD_TYPE=MinSizeRel`. For performance optimization, you may use `CMAKE_BUILD_TYPE=Release`.
+
+## Build and Install
+
+The following commands build the inference library of PaddlePaddle for Raspberry Pi and third-party dependencies.
+
+```bash
+make
+make install
+```
+
+The intermediate files will be stored in `build`. Third-party libraries will be located in `build/third_party`. If you have already built it for other platforms like Android or iOS, you may want to clear these directories by running the command: `rm -rf build`.
+
+The inference library will be in `your/path/to/install/lib`, with related header files in `your/path/to/install/include`.
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..56d1515005f6e40b084c6b2184c6a0b3e3a00496
--- /dev/null
+++ b/doc/mobile/index_cn.rst
@@ -0,0 +1,9 @@
+移动端
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_cn.md
+  cross_compiling_for_ios_cn.md
+  cross_compiling_for_raspberry_cn.md
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e0acdff0284e3bc84b2cc4a34a142ee01754f940
--- /dev/null
+++ b/doc/mobile/index_en.rst
@@ -0,0 +1,9 @@
+Mobile
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_en.md
+  cross_compiling_for_ios_en.md
+  cross_compiling_for_raspberry_en.md
diff --git a/doc/survey/cluster_bootstrapping_tools.md b/doc/survey/cluster_bootstrapping_tools.md
new file mode 100644
index 0000000000000000000000000000000000000000..1cd9962700bb49866f1ed6987abc28b27888a23f
--- /dev/null
+++ b/doc/survey/cluster_bootstrapping_tools.md
@@ -0,0 +1,71 @@
+# Cluster bootstrapping tool survey
+## Abstract
+In order to bring up a cluster from bare metal machines to a fully functional Kubernetes cluster for PaddlePaddle to run on, we need to utilize some tools. Here we are going to compare [Sextant](https://github.com/k8sp/sextant) and [Tectonic installer](https://github.com/coreos/tectonic-installer).
+
+## Basic assumptions
+Here are some basic assumptions before we move on to  details
+1. You are an administrator of a bare metal machine cluster, which means:
+  * you have full control to each of the machines.
+  * you have full control to the network which machines are connected to.
+2. Machines can be booted from network with PXE or iPXE
+3. You understand the [general procedure to bring up a cluster](#appendix-general-procedure-to-bring-up-a-cluster)
+
+If your cluster satisfies all of the items above, then keep reading.
+
+## Comparing Sextant and Tectonic installer
+### Sextant
+Sextant is an end-to-end solution to bring up a bare metal cluster to a fully functional k8s cluster. It integrates DHCP, name service, PXE, cloud-config-service, and docker registry services altogether.
+
+#### Pros
+1. End2End: basically all admin need to do is to config the cluster.yaml and power on the cluster.
+2. Offline cluster configuration: working with Sextant has 2 phases, config time and deploy time. At config time, the admin's machine needs internet connectivity in order to download some images, etc. At deploy time, it's completely OK to go offline, since all dependencies have been prepared during config time.
+3. docker registry integrated.
+4. GPU machines are taken care of.
+
+#### Cons
+1. The k8s API server is not deployed with high availability by default.
+2. No grouping support.
+3. No API interface, a one-off service.
+
+
+### Tectonic installer
+First of all, Tectonic is not free, it requires coreos.com account as a step of installation, and free user can only create less than 10 nodes.
+
+Tectonic is a suite of software which wraps around k8s and provides more utilities for DevOps.
+Tectonic installer, as its name suggests, installs Tectonic onto a bare metal cluster, which means it's not exactly an equivalent of Sextant. For the "booting a cluster" part, it mostly utilizes [Matchbox](https://github.com/coreos/matchbox), which is a general cluster bootstrapper.
+
+Matchbox's approach is similar to Sextant's.
+
+#### Pros
+1. supports grouping machines.
+2. supports running provisioning service in rkt. (not a big deal though).
+3. supports http/gRPC API interface.
+4. supports multi-template.
+
+#### Cons
+1. Not an e2e solution to bring up a cluster, need a lot of extra work and other software.
+2. [Not fully supporting](https://github.com/coreos/matchbox/issues/550) centOS deployment yet.
+
+## Conclusion
+Sextant is a better solution overall for deploying paddle cloud to a bare metal cluster. It would be great if Sextant could also 1) deploy the k8s API server with high availability by default; 2) not be designed as a one-off service.
+
+
+
+## Appendix: General procedure to bring up a cluster
+It's physically impossible for a cluster admin to manually install OS and applications into cluster nodes one by one, here is what an admin would do in cloud industry:
+1. setup a bootstrap machine with static IP in the cluster, which has following services:
+  * DHCP: assigns ip address for rest of the nodes.
+  * name service: to map node name to an IP
+  * PXE related services: the booting related info will be delivered to newly booted machines as their IP is assigned via DHCP service, PXE service will provide further booting and installing info and image with TFTP and http protocol. 
+  * cluster config service: this is for providing cluster node with OS config via http
+  * optional docker registry: a built-in docker registry makes the whole cluster independent from connecting internet, and speeds up software distribution.
+2. New node powers on, it will
+  * broadcast the request for an IP address
+  * DHCP server assigns the IP address, and deliver the PXE booting related info to the node.
+  * cluster node will request config files with booting info delivered with DHCP via the TFTP service, and in most of the cases, the config file will point to a http service for the booting image.
+  * Since PXE is configured with initrd, it will utilize the cloud config service and do further installations like coreOS or K8s installations.
+  * then restart the node.
+
+For further understanding, following 2 links from Matchbox are some good readings:
+* [Machine lifecycle](https://github.com/coreos/matchbox/blob/master/Documentation/machine-lifecycle.md)
+* [PXE booting](https://github.com/coreos/matchbox/blob/master/Documentation/network-booting.md)
diff --git a/doc/survey/dynamic_graph.md b/doc/survey/dynamic_graph.md
new file mode 100644
index 0000000000000000000000000000000000000000..6b80b014b1b1dc50f425e1296f70984c9e9b1cbd
--- /dev/null
+++ b/doc/survey/dynamic_graph.md
@@ -0,0 +1,378 @@
+# Automatic Differentiation with the Tape
+
+## Automatic Differentiation
+
+A key challenge in the field of deep learning is to automatically derive the backward pass from the forward pass described algorithmically by researchers.  Such a derivation, or a transformation of the forward pass program, has been long studied before the recent prosperity of deep learning in the field known as [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf).
+
+## The Tape
+
+Given the forward pass program (usually in Python in practice), there are two strategies to derive the backward pass:
+
+1. from the forward pass program itself, or
+1. from the execution trace of the forward pass program, which is often known as the *tape*.
+
+This article surveys systems that follow the latter strategy.
+
+## Dynamic Network
+
+When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration.  This is known as *dynamic network*.
+
+Deep learning systems that utilize the idea of dynamic networks have gained popularity in recent years.  This article surveys two representative systems: [PyTorch](https://pytorch.org/) and [DyNet](https://dynet.readthedocs.io/en/latest/).
+
+## An Overview
+
+Both frameworks record a ‘tape’ of the computation and then interpret (or run-time compile) a transformation of the tape played back in reverse. This tape is a different kind of entity than the original program. [[link]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf)
+
+Consider the following code for a feedforward model.
+
+```python
+x = Variable(randn(20, 1))
+label = Variable(randint(1))
+W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20))
+h = matmul(W_1, x)
+pred = matmul(W_2, h)
+loss = softmax(pred, label)
+loss.backward()
+```
+
+### 1) Dynet uses List to encode the Tape
+
+During the forward execution, a list of operators, in this case `matmul`, `matmul` and `softmax`, are recorded in the tape, along with the necessary information needed to do the backward such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`.
+
+<details> 
+<summary></summary>
+digraph g {
+    graph [
+        rankdir = "LR"
+    ];
+    node [
+        fontsize = "16"
+        shape = "ellipse"
+    ];
+    edge [];
+    "node0" [
+        label = "<f0> type: matmul | <f1> input: W_1, x | <f2> output: h"
+        shape = "record"
+    ];
+    "node1" [
+        label = "<f0> type: matmul | <f1> input: W_2, h | <f2> output: pred"
+        shape = "record"
+    ];
+    "node2" [
+        label = "<f0> type: softmax | <f1> input: pred, label | <f2> output: loss"
+        shape = "record"
+    ];
+    "node0":f0 -> "node1":f0 [];
+    "node1":f0 -> "node2":f0 [];
+}
+</details>
+
+![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20})
+
+### 2) Pytorch uses Node Graph to encode the Tape
+
+The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. A `Function` records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad ops are performed in the sorted order.
+
+<details> 
+<summary></summary>
+digraph g {
+    graph [
+        rankdir = "LR"
+    ];
+    
+    subgraph function {
+        node [
+            fontsize = "16"
+            style = filled
+            shape = "record"
+        ];
+        "matmul0" [ label = "<f0> type: matmul | prev_func: None" ];
+        "matmul1" [ label = "<f0> type: matmul | prev_func: matmul" ];
+        "softmax" [ label = "<f0> type: softmax | prev_func: matmul" ];
+    }
+    
+    subgraph variable {
+        node [
+            fontsize = "16"
+            shape = "Mrecord"
+            style = filled
+            fillcolor = white
+        ];
+        "x" [ label = "<f0> x | <f1> creator: None" ];
+        "label" [ label = "<f0> label | <f1> creator: None" ];
+        "W_1" [ label = "<f0> W_1 | <f1> creator: None" ];
+        "W_2" [ label = "<f0> W_2 | <f1> creator: None" ];
+        "h" [ label = "<f0> h | <f1> creator: None" ];
+        "pred" [ label = "<f0> pred | <f1> creator: matmul" ];
+        "loss" [ label = "<f0> loss | <f1> creator: softmax" ];
+    }
+    
+    subgraph data_flow {
+        "x":f0 -> "matmul0":f0;
+        "W_1":f0 -> "matmul0":f0;
+        "matmul0":f0 -> "h":f0;
+    
+        "h":f0 -> "matmul1":f0;
+        "W_2":f0 -> "matmul1":f0;
+        "matmul1":f0 -> "pred":f0;
+    
+        "pred":f0 -> "softmax":f0;
+        "label":f0 -> "softmax":f0;
+        "softmax":f0 -> "loss":f0;
+    }
+
+    subgraph prev_func {
+        edge [color="red", arrowsize="0.6", penwidth="1", constraint=false];
+        "matmul1":f1 -> "matmul0":f0;
+        "softmax":f1 -> "matmul1":f0;
+        label = "prev_func";
+    }
+}
+</details>
+
+![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20})
+
+Chainer and Autograd use similar techniques to record the forward pass. For details please refer to the appendix.
+
+## Design choices
+
+### 1) Dynet's List vs Pytorch's Node Graph
+
+What's good about List:
+1. It avoids a topological sort. One only needs to traverse the list of operators in reverse and call the corresponding backward operator.
+1. It promises efficient data parallelism implementations. One could count the number of times a certain variable is used during the construction of the list. Then, during playback, one knows when the calculation of a variable has completed. This enables communication and computation overlapping.
+
+What's good about Node Graph:
+1. More flexibility. PyTorch users can mix and match independent graphs however they like, in whatever threads they like (without explicit synchronization). An added benefit of structuring graphs this way is that when a portion of the graph becomes dead, it is automatically freed. [[2]](https://openreview.net/pdf?id=BJJsrmfCZ) Consider the following example: Pytorch only does backward on SmallNet, while Dynet does backward on both BigNet and SmallNet.
+```python
+result = BigNet(data)
+loss = SmallNet(data)
+loss.backward()
+```
+
+### 2) Dynet's Lazy evaluation vs Pytorch's Immediate evaluation
+
+Dynet builds the list in a symbolic manner. Consider the following example:
+```python
+for epoch in range(num_epochs):
+    for in_words, out_label in training_data:
+        dy.renew_cg()
+        W = dy.parameter(W_p)
+        b = dy.parameter(b_p)
+        score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b)
+        loss_sym = dy.pickneglogsoftmax(score_sym, out_label)
+        loss_val = loss_sym.value()
+        loss_sym.backward()
+```
+The computation of `lookup`, `concat`, `matmul` and `softmax` doesn't happen until the call of `loss_sym.value()`. This deferred execution is useful because it makes some graph-like optimizations possible, e.g. kernel fusion.
+
+Pytorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
+
+
+## What can fluid learn from them?
+
+Please refer to `paddle/contrib/dynamic/`.
+
+# Appendix
+
+### Overview
+
+| Framework | Has Tape | Core in C++ | First Release Date |
+|-----------|----------|-------------|--------------------|
+| Autograd  | No       | No          | Mar 5, 2015        |
+| Chainer   | No       | No          | Jun 5, 2015        |
+| Pytorch   | No       | Yes         | Aug 31, 2016       |
+| Dynet     | Yes      | Yes         | Oct 12, 2016       |
+
+### Source Code
+#### Autograd
+[Backward code](https://github.com/HIPS/autograd/blob/442205dfefe407beffb33550846434baa90c4de7/autograd/core.py#L8-L40). In the forward pass, a graph of VJPNode is constructed.
+```python
+# User API
+def make_grad(fun, x):
+    start_node = VJPNode.new_root()
+    end_value, end_node =  trace(start_node, fun, x)
+    return backward_pass(g, end_node), end_value
+
+# trace the forward pass by creating VJPNodes
+def trace(start_node, fun, x):
+    with trace_stack.new_trace() as t:
+        start_box = new_box(x, t, start_node)
+        end_box = fun(start_box)
+        return end_box._value, end_box._node
+
+def backward_pass(g, end_node):
+    outgrads = {end_node : (g, False)}
+    for node in toposort(end_node):
+        outgrad = outgrads.pop(node)
+        ingrads = node.vjp(outgrad[0])
+        for parent, ingrad in zip(node.parents, ingrads):
+            outgrads[parent] = add_outgrads(outgrads.get(parent), ingrad)
+    return outgrad[0]
+
+# Every VJPNode corresponds to an op_grad
+class VJPNode(Node):
+    __slots__ = ['parents', 'vjp']
+    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
+        self.parents = parents
+        vjpmaker = primitive_vjps[fun]
+        self.vjp = vjpmaker(parent_argnums, value, args, kwargs)
+```
+#### Chainer
+Example Code
+```python
+# (1) Function Set definition, creates FunctionNode
+model = FunctionSet(
+    l1=F.Linear(784, 100),
+    l2=F.Linear(100, 100),
+    l3=F.Linear(100, 10)).to_gpu()
+
+# (2) Optimizer Setup
+opt = optimizers.SGD()
+opt.setup(model)
+
+# (3) Forward computation
+def forward(x, t):
+    h1 = F.relu(model.l1(x))
+    h2 = F.relu(model.l2(h1))
+    y = model.l3(h2)
+    return F.softmax_cross_entropy(y, t)
+
+# (4) Training loop
+for epoch in xrange(n_epoch):
+    for i in xrange(0, N, b_size):
+        x = Variable(to_gpu(...))
+        t = Variable(to_gpu(...))
+        opt.zero_grads()
+        loss = forward(x, t)
+        loss.backward()
+        opt.update()
+```
+In `forward(x, t)`, a graph of [`VariableNode`](https://github.com/chainer/chainer/blob/master/chainer/variable.py#L110) and [`FunctionNode`](https://github.com/chainer/chainer/blob/a69103a4aa59d5b318f39b01dbcb858d465b89cf/chainer/function_node.py#L19) is constructed. Every output's `VariableNode.creator` is pointed to the `FunctionNode`.
+```python
+class FunctionNode(object):
+    ...
+    def apply(self, inputs):
+        outputs = self.forward(inputs)
+        ret = tuple([variable.Variable(y, requires_grad=requires_grad)
+                     for y in outputs])
+        # Topological ordering
+        self.rank = max([x.rank for x in inputs]) if input_vars else 0
+        # Add backward edges
+        for y in ret:
+            y.creator_node = self
+        self.inputs = tuple([x.node for x in input_vars])
+        self.outputs = tuple([y.node for y in ret])
+
+        return ret
+```
+`loss.backward()` will calculate the accumulated gradient of all variables. All the backward of `FunctionNode`s will be called based on the topological order.
+```python
+class VariableNode(object):
+    ...
+    def backward(self, retain_grad, loss_scale):
+        if self.creator_node is None:
+            return
+
+        cand_funcs = []
+        seen_set = set()
+        grads = {}
+
+        # Initialize error by 1, if this is a loss variable
+        if self.data.size == 1 and self._grad_var is None:
+            self.grad = numpy.ones_like(self.data)
+        grads[self._node] = self._grad_var
+
+        def add_cand(cand):
+            if cand not in seen_set:
+                # Negate since heapq is min-heap. This is a global variable
+                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
+                seen_set.add(cand)
+
+        add_cand(self.creator_node)
+
+        while cand_funcs:
+            _, _, func = heapq.heappop(cand_funcs)
+            gxs = func.backward_accumulate(func.inputs, func.outputs, func.outputs.grad)
+
+            for x, gx in enumerate(gxs):
+                if x in grads:
+                    grads[x] += gx
+                else:
+                    grads[x] = gx
+
+                if x.creator_node is not None:
+                    add_cand(x.creator_node)
+```
+
+#### PyTorch
+Example Code
+```python
+x = Variable(torch.ones(5, 5))
+y = Variable(torch.ones(5, 5) * 4)
+z = x ** 2 + x * 2 + x * y + y
+z.backward(torch.ones(5, 5))
+```
+The trace is done by `Variable.creator` and `Function.previous_functions`.
+```python
+class Variable(object):
+    def __init__(self, tensor, creator=None, requires_grad=True):
+        if creator is None:
+            creator = Leaf(self, requires_grad)
+        self.data = tensor
+        self.creator = creator
+        self._grad = None
+
+    def backward(self, gradient=None):
+        if gradient is None:
+            if self.data.numel() != 1:
+                raise RuntimeError('backward should be called only on a scalar (i.e. 1-element tensor) or with gradient w.r.t. the variable')
+            gradient = self.data.new(1).fill_(1)
+        self._execution_engine.run_backward(self, gradient)
+
+class Function(object):
+    # ...
+    def _do_forward(self, *input):
+        unpacked_input = tuple(arg.data for arg in input)
+        raw_output = self.forward(*unpacked_input)
+
+        # mark output.creator = self for backward trace
+        output = tuple(Variable(tensor, self) for tensor in raw_output)
+
+        self.previous_functions = [(arg.creator, id(arg)) for arg in input]
+        self.output_ids = {id(var): i for i, var in enumerate(output)}
+        return output
+
+    def _do_backward(self, grad_output):
+        return self.backward(grad_output)
+```
+The [backward](https://github.com/pytorch/pytorch/blob/v0.1.1/torch/autograd/engine.py) is similar to Autograd.
+
+#### DyNet
+Example code
+```python
+model = dy.model()
+W_p = model.add_parameters((20, 100))
+b_p = model.add_parameters(20)
+E = model.add_lookup_parameters((20000, 50))
+for epoch in range(num_epochs):
+    for in_words, out_label in training_data:
+        dy.renew_cg() # init tape
+        W = dy.parameter(W_p)
+        b = dy.parameter(b_p)
+        score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b)
+        loss_sym = dy.pickneglogsoftmax(score_sym, out_label)
+        loss_val = loss_sym.value()
+        loss_sym.backward()
+```
+[forward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L84-L158), [backward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L166-L284). The trace is done by creating a tape of expressions in every iteration. Backward is done by traversing the tape in reverse order.
+```c++
+void SimpleExecutionEngine::backward(VariableIndex from_where, bool full) {
+  ...  
+  for (int i = num_nodes - 1; i >= 0; --i) {
+    // each node corresponds to an op
+    node->backward(xs, node_fx, node_dEdfx, ai, node_dEdxai);
+  }
+  ...
+}
+```
diff --git a/doc/survey/op_fusion_design.md b/doc/survey/op_fusion_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..d6e48f4f58269b67450cb012f6dcc59e1083abba
--- /dev/null
+++ b/doc/survey/op_fusion_design.md
@@ -0,0 +1,20 @@
+# Operator fusion  
+Fusing multiple operators together is an important method to optimize the program execution, particularly for GPU or other specialized accelerators. An obvious benefit is to avoid the overhead of saving the intermediate result back into global memory.   
+
+There are generally two ways to fuse operators: fusing directly connected operators and fusing operators that are not directly connected. The first method is mainly used by [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by Dynet and TensorFlow Fold to do auto-batching. The principle of operator fusion is to combine multiple operations into one according to some rules. For example, `Y = X * W` and `Z = Y + B` can be fused to `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused to `[Y1;Y2] = [X1;X2] * W`. In order to get a short-term benefit, we decided to try to manually specify these rules.
+
+## Challenge
+The challenge of fusing operators is:
+  - how to make the rules.
+  - how to implement these rules efficiently.
+
+### How to make the rules?
+
+The problem of determining the best single location for a fusion operator is an NP-hard combinatorial problem. After analyzing the operators of DL models, we found two groups of operators that can be fused explicitly: one is simple, adjacent operations, for example, `tmp = x + y` and `z = Relu(tmp)`; the other is operators that have the same function, for example, a series of `SGD` or `Momentum` operators. They usually appear in a model in large numbers, so we should first think about how to fuse each group separately.
+
+### How to implement these rules efficiently?
+#### How to fuse the adjacent operations efficiently?
+Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient, and the cons are that it is not easy to expand, and it can only be used to express some simple operations. So taking into account our current needs, the template function is more appropriate.
+
+#### How to fuse the operators that have the same function efficiently?
+Take the SGD operator as an example: a training model may have hundreds of parameters and, correspondingly, the same number of SGD operators. The expression (`w = w - lr*w_g`) of those operators is the same, so during training the executor will execute this expression hundreds of times on the CPU or other specialized accelerators. If we can fuse them and make the addresses of all `w` and all `w_g` contiguous respectively, we only need to execute it once. For some accelerators, the time to launch a kernel is not negligible, so launching and executing a kernel hundreds of times may take longer than launching and executing it only once. There are usually many operators similar to `SGD` in a DL model, such as `AllReduce` and `FC`.
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
new file mode 100644
index 0000000000000000000000000000000000000000..890f70615538af23cd05b9ffd685e870a5644cdb
--- /dev/null
+++ b/doc/templates/conf.py.cn.in
@@ -0,0 +1,151 @@
+# -*- coding: utf-8 -*-
+#
+# documentation build configuration file, created by
+# sphinx-quickstart on Thu Jul 23 19:40:08 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+import sys
+import os, subprocess
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
+import shlex
+from recommonmark import parser, transform
+@IMPORT_PADDLE_STRING@
+@IMPORT_PADDLEV2_STRING@
+
+MarkdownParser = parser.CommonMarkParser
+AutoStructify = transform.AutoStructify
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+templates_path = ["@PADDLE_SOURCE_DIR@/doc/templates"]
+
+# -- General configuration ------------------------------------------------
+
+# General information about the project.
+project = u'PaddlePaddle'
+author = u'%s developers' % project
+copyright = u'2016, %s' % author
+github_doc_root = ''
+
+# add markdown parser
+MarkdownParser.github_doc_root = github_doc_root
+source_parsers = {
+    '.md': MarkdownParser,
+    '.Rmd': MarkdownParser,
+}
+os.environ['PADDLE_BUILD_DOC'] = '1'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.autosummary',
+    'sphinx.ext.mathjax',
+    'sphinx.ext.napoleon',
+    'sphinx.ext.graphviz'
+]
+mathjax_path="https://cdn.bootcss.com/mathjax/2.7.0/MathJax.js"
+table_styling_embed_css = True
+
+autodoc_member_order = 'bysource'
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+# source_suffix = ['.rst', '.md']
+source_suffix = ['.rst', '.md', '.Rmd']
+
+# The encoding of source files.
+source_encoding = 'utf-8'
+
+# The master toctree document.
+master_doc = 'index_cn'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = 'zh_CN'
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build', '**/*_en*', '*_en*', 'api/*']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'sphinx_rtd_theme'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+#html_static_path = []
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = project + 'doc'
+
+# -- Options for LaTeX output ---------------------------------------------
+latex_elements = {
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  (master_doc, '%s.tex' % project, project,
+   author, 'manual'),
+]
+
+# Use the .. admonition:: directive for Notes sections.
+# False to use the .. rubric:: directive instead.
+napoleon_use_admonition_for_notes = True
+
+def setup(app):
+    # Register the recommonmark config (url_resolver prefixes links with
+    # github_doc_root) and the AutoStructify transform; no C++ API hook for now.
+    app.add_config_value('recommonmark_config', {
+            'url_resolver': lambda url: github_doc_root + url,
+            }, True)
+    app.add_transform(AutoStructify)
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
new file mode 100644
index 0000000000000000000000000000000000000000..5b09464cb991f96127edec40f7dbbc97a8d82582
--- /dev/null
+++ b/doc/templates/conf.py.en.in
@@ -0,0 +1,152 @@
+# -*- coding: utf-8 -*-
+#
+# documentation build configuration file, created by
+# sphinx-quickstart on Thu Jul 23 19:40:08 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+import sys
+import os, subprocess
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
+import shlex
+from recommonmark import parser, transform
+@IMPORT_PADDLE_STRING@
+@IMPORT_PADDLEV2_STRING@
+
+
+MarkdownParser = parser.CommonMarkParser
+AutoStructify = transform.AutoStructify
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+templates_path = ["@PADDLE_SOURCE_DIR@/doc/templates"]
+
+# -- General configuration ------------------------------------------------
+
+# General information about the project.
+project = u'PaddlePaddle'
+author = u'%s developers' % project
+copyright = u'2016, %s' % author
+github_doc_root = ''
+
+# add markdown parser
+MarkdownParser.github_doc_root = github_doc_root
+source_parsers = {
+    '.md': MarkdownParser,
+    '.Rmd': MarkdownParser,
+}
+os.environ['PADDLE_BUILD_DOC'] = '1'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.autosummary',
+    'sphinx.ext.mathjax',
+    'sphinx.ext.napoleon',
+]
+
+
+autodoc_member_order = 'bysource'
+
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+# source_suffix = ['.rst', '.md']
+source_suffix = ['.rst', '.md', '.Rmd']
+
+# The encoding of source files.
+source_encoding = 'utf-8'
+
+# The master toctree document.
+master_doc = 'index_en'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build', '**/*_cn*', '*_cn*', 'api/*']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'sphinx_rtd_theme'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+#html_static_path = []
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = project + 'doc'
+
+# -- Options for LaTeX output ---------------------------------------------
+latex_elements = {
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  (master_doc, '%s.tex' % project, project,
+   author, 'manual'),
+]
+
+# Use the .. admonition:: directive for Notes sections.
+# False to use the .. rubric:: directive instead.
+napoleon_use_admonition_for_notes = True
+
+def setup(app):
+    # Register the recommonmark config (url_resolver prefixes links with
+    # github_doc_root, eval_rst enabled) and AutoStructify; no C++ API hook for now.
+    app.add_config_value('recommonmark_config', {
+            'url_resolver': lambda url: github_doc_root + url,
+        'enable_eval_rst': True,
+            }, True)
+    app.add_transform(AutoStructify)
diff --git a/doc/templates/layout.html b/doc/templates/layout.html
new file mode 100644
index 0000000000000000000000000000000000000000..5091eb32eaeff77bd40f5d348e887b99b6eff4ea
--- /dev/null
+++ b/doc/templates/layout.html
@@ -0,0 +1,23 @@
+{# layout.html #}
+{# Import the theme's layout. #}
+{% extends "!layout.html" %}
+
+{# SIDE NAV, TOGGLES ON MOBILE #}		
+{% block menu %}
+<nav class="doc-menu-vertical" role="navigation">
+{% set toctree = toctree(maxdepth=-1, collapse=False,titles_only=True, includehidden=True) %}
+{{ toctree }}
+</nav>
+{% endblock %}
+
+{%- block extrahead %} 
+<script>
+var _hmt = _hmt || [];
+(function() {
+  var hm = document.createElement("script");
+  hm.src = "//hm.baidu.com/hm.js?b9a314ab40d04d805655aab1deee08ba";
+  var s = document.getElementsByTagName("script")[0]; 
+  s.parentNode.insertBefore(hm, s);
+})();
+</script>
+{% endblock %}
diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d230a1b9217eea6740419822f350096e361a4435
--- /dev/null
+++ b/doc/v2/CMakeLists.txt
@@ -0,0 +1,54 @@
+if(NOT DEFINED SPHINX_THEME)
+    set(SPHINX_THEME default)
+endif()
+
+if(NOT DEFINED SPHINX_THEME_DIR)
+    set(SPHINX_THEME_DIR)
+endif()
+
+# English docs: configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_v2_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
+
+# Chinese docs: configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
+    "${BINARY_BUILD_DIR_CN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_v2_docs_cn
+                  html
+                  ${BINARY_BUILD_DIR_CN}
+                  ${SPHINX_CACHE_DIR_CN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_CN})
+
+add_subdirectory(api)
diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0c74522cb089b17c8419e9058f76631b0fe0df93
--- /dev/null
+++ b/doc/v2/api/CMakeLists.txt
@@ -0,0 +1,25 @@
+# English API docs: configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+set(IMPORT_PADDLE_STRING "import paddle")
+set(IMPORT_PADDLEV2_STRING "import paddle.v2")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_v2_apis
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_v2_apis  gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
diff --git a/doc/v2/api/config/activation.rst b/doc/v2/api/config/activation.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5317e66b64bbd85c61f19700a9d2c1d239dee573
--- /dev/null
+++ b/doc/v2/api/config/activation.rst
@@ -0,0 +1,108 @@
+===========
+Activation
+===========
+
+Abs
+===
+
+..  automodule:: paddle.v2.activation
+    :members: Abs
+    :noindex:
+    
+Exp
+===
+
+..  automodule:: paddle.v2.activation
+    :members: Exp
+    :noindex:
+    
+Identity
+========
+
+..  automodule:: paddle.v2.activation
+    :members: Identity
+    :noindex:
+    
+Linear
+======
+
+..  automodule:: paddle.v2.activation
+    :members: Linear
+    :noindex:
+
+Log
+===
+
+..  automodule:: paddle.v2.activation
+    :members: Log
+    :noindex:
+    
+Square
+======
+
+..  automodule:: paddle.v2.activation
+    :members: Square
+    :noindex:
+    
+Sigmoid
+=======
+
+..  automodule:: paddle.v2.activation
+    :members: Sigmoid
+    :noindex:
+    
+Softmax
+=======
+
+..  automodule:: paddle.v2.activation
+    :members: Softmax
+    :noindex:
+    
+SequenceSoftmax
+===============
+
+..  automodule:: paddle.v2.activation
+    :members: SequenceSoftmax
+    :noindex:
+    
+Relu
+====
+
+..  automodule:: paddle.v2.activation
+    :members: Relu
+    :noindex:
+    
+BRelu
+=====
+
+..  automodule:: paddle.v2.activation
+    :members: BRelu
+    :noindex:
+    
+SoftRelu
+========
+
+..  automodule:: paddle.v2.activation
+    :members: SoftRelu
+    :noindex:
+    
+Tanh
+====
+
+..  automodule:: paddle.v2.activation
+    :members: Tanh
+    :noindex:
+    
+STanh
+=====
+
+..  automodule:: paddle.v2.activation
+    :members: STanh
+    :noindex:
+    
+SoftSign
+========
+
+..  automodule:: paddle.v2.activation
+    :members: SoftSign
+    :noindex:
diff --git a/doc/v2/api/config/attr.rst b/doc/v2/api/config/attr.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a93f41b86779200d8bac651614f4d61f4895875f
--- /dev/null
+++ b/doc/v2/api/config/attr.rst
@@ -0,0 +1,6 @@
+Parameter Attribute
+===================
+
+..  automodule:: paddle.v2.attr
+    :members:
+    :noindex:
diff --git a/doc/v2/api/config/evaluators.rst b/doc/v2/api/config/evaluators.rst
new file mode 100644
index 0000000000000000000000000000000000000000..458d892e825a7a9bbe7843ad5c508bd5a31f5f0f
--- /dev/null
+++ b/doc/v2/api/config/evaluators.rst
@@ -0,0 +1,110 @@
+..  _api_v2:
+
+==========
+Evaluators
+==========
+
+Classification 
+==============
+
+classification_error
+--------------------
+..  automodule:: paddle.v2.evaluator
+    :members: classification_error
+    :noindex:
+
+auc
+---
+..  automodule:: paddle.v2.evaluator
+    :members: auc
+    :noindex:
+
+ctc_error
+---------
+..  automodule:: paddle.v2.evaluator
+    :members: ctc_error
+    :noindex:
+
+chunk
+-----
+..  automodule:: paddle.v2.evaluator
+    :members: chunk
+    :noindex:
+
+precision_recall
+----------------
+..  automodule:: paddle.v2.evaluator
+    :members:  precision_recall
+    :noindex:
+
+Rank
+====
+
+pnpair
+------
+..  automodule:: paddle.v2.evaluator
+    :members:  pnpair
+    :noindex:
+
+Utils
+=====
+
+sum
+---
+..  automodule:: paddle.v2.evaluator
+    :members: sum
+    :noindex:
+
+column_sum
+----------
+..  automodule:: paddle.v2.evaluator
+    :members: column_sum
+    :noindex:
+
+Print
+=====
+
+classification_error_printer
+----------------------------
+..  automodule:: paddle.v2.evaluator
+    :members:  classification_error_printer
+    :noindex:
+
+gradient_printer
+----------------
+..  automodule:: paddle.v2.evaluator
+    :members:  gradient_printer
+    :noindex:
+
+maxid_printer
+-------------
+..  automodule:: paddle.v2.evaluator
+    :members:  maxid_printer
+    :noindex:
+
+maxframe_printer
+----------------
+..  automodule:: paddle.v2.evaluator
+    :members:  maxframe_printer
+    :noindex:
+
+seqtext_printer
+---------------
+..  automodule:: paddle.v2.evaluator
+    :members:  seqtext_printer
+    :noindex:
+
+value_printer
+-------------
+..  automodule:: paddle.v2.evaluator
+    :members:  value_printer
+    :noindex:
+
+Detection
+==========
+
+detection_map
+-------------
+..  automodule:: paddle.v2.evaluator
+    :members:  detection_map
+    :noindex:
diff --git a/doc/v2/api/config/layer.rst b/doc/v2/api/config/layer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5a0cfadfce84df41defdf518b7c3a6222d5b30a1
--- /dev/null
+++ b/doc/v2/api/config/layer.rst
@@ -0,0 +1,552 @@
+..  _api_v2.layer:
+
+======
+Layers
+======
+
+Data layer
+===========
+
+..  _api_v2.layer_data:
+
+data
+----
+..  autofunction:: paddle.v2.layer.data
+    :noindex:
+
+Fully Connected Layers
+======================
+
+..  _api_v2.layer_fc:
+
+fc
+--
+..  autofunction:: paddle.v2.layer.fc
+    :noindex:
+
+selective_fc
+------------
+..  autofunction:: paddle.v2.layer.selective_fc
+    :noindex:
+
+Conv Layers
+===========
+
+conv_operator
+-------------
+..  autofunction:: paddle.v2.layer.conv_operator
+    :noindex:
+
+conv_projection
+---------------
+..  autofunction:: paddle.v2.layer.conv_projection
+    :noindex:
+
+conv_shift
+----------
+..  autofunction:: paddle.v2.layer.conv_shift
+    :noindex:
+
+img_conv
+--------
+..  autofunction:: paddle.v2.layer.img_conv
+    :noindex:
+
+..  _api_v2.layer_context_projection:
+
+context_projection
+------------------
+..  autofunction:: paddle.v2.layer.context_projection
+    :noindex:
+
+row_conv
+--------
+..  autofunction:: paddle.v2.layer.row_conv
+    :noindex:
+
+Image Pooling Layer
+===================
+
+img_pool
+--------
+..  autofunction:: paddle.v2.layer.img_pool
+    :noindex:
+
+spp
+---
+..  autofunction:: paddle.v2.layer.spp
+    :noindex:
+
+maxout
+------
+..  autofunction:: paddle.v2.layer.maxout
+    :noindex:
+
+roi_pool
+--------
+..  autofunction:: paddle.v2.layer.roi_pool
+    :noindex:
+
+pad
+----
+..  autofunction:: paddle.v2.layer.pad
+    :noindex:
+
+Norm Layer
+==========
+
+img_cmrnorm
+-----------
+..  autofunction:: paddle.v2.layer.img_cmrnorm
+    :noindex:
+
+batch_norm
+----------
+..  autofunction:: paddle.v2.layer.batch_norm
+    :noindex:
+
+sum_to_one_norm
+---------------
+..  autofunction:: paddle.v2.layer.sum_to_one_norm
+    :noindex:
+
+cross_channel_norm
+------------------
+..  autofunction:: paddle.v2.layer.cross_channel_norm
+    :noindex:
+
+row_l2_norm
+-----------
+..  autofunction:: paddle.v2.layer.row_l2_norm
+    :noindex:
+
+Recurrent Layers
+================
+
+recurrent
+---------
+..  autofunction:: paddle.v2.layer.recurrent
+    :noindex:
+
+lstmemory
+---------
+..  autofunction:: paddle.v2.layer.lstmemory
+    :noindex:
+
+grumemory
+---------
+..  autofunction:: paddle.v2.layer.grumemory
+    :noindex:
+
+gated_unit
+-----------
+..  autofunction:: paddle.v2.layer.gated_unit
+    :noindex:
+
+Recurrent Layer Group
+=====================
+
+memory
+------
+..  autofunction:: paddle.v2.layer.memory
+    :noindex:
+
+recurrent_group
+---------------
+..  autofunction:: paddle.v2.layer.recurrent_group
+    :noindex:
+
+lstm_step
+---------
+..  autofunction:: paddle.v2.layer.lstm_step
+    :noindex:
+
+gru_step
+--------
+..  autofunction:: paddle.v2.layer.gru_step
+    :noindex:
+
+beam_search
+------------
+..  autofunction:: paddle.v2.layer.beam_search
+    :noindex:
+
+get_output
+----------
+..  autofunction:: paddle.v2.layer.get_output
+    :noindex:
+
+Mixed Layer
+===========
+
+..  _api_v2.layer_mixed:
+
+mixed
+-----
+..  autofunction:: paddle.v2.layer.mixed
+    :noindex:
+
+..  _api_v2.layer_embedding:
+
+embedding
+---------
+..  autofunction:: paddle.v2.layer.embedding
+    :noindex:
+
+scaling_projection
+------------------
+..  autofunction:: paddle.v2.layer.scaling_projection
+    :noindex:
+
+dotmul_projection
+-----------------
+..  autofunction:: paddle.v2.layer.dotmul_projection
+    :noindex:
+
+dotmul_operator
+---------------
+..  autofunction:: paddle.v2.layer.dotmul_operator
+    :noindex:
+
+full_matrix_projection
+----------------------
+..  autofunction:: paddle.v2.layer.full_matrix_projection
+    :noindex:
+
+identity_projection
+-------------------
+..  autofunction:: paddle.v2.layer.identity_projection
+    :noindex:
+
+slice_projection
+-------------------
+..  autofunction:: paddle.v2.layer.slice_projection
+    :noindex:
+
+table_projection
+----------------
+..  autofunction:: paddle.v2.layer.table_projection
+    :noindex:
+
+trans_full_matrix_projection
+----------------------------
+..  autofunction:: paddle.v2.layer.trans_full_matrix_projection
+    :noindex:
+
+Aggregate Layers
+================
+
+AggregateLevel
+--------------
+..  autoclass:: paddle.v2.layer.AggregateLevel
+    :noindex:
+
+..  _api_v2.layer_pooling:
+
+pooling
+-------
+..  autofunction:: paddle.v2.layer.pooling
+    :noindex:
+
+..  _api_v2.layer_last_seq:
+
+last_seq
+--------
+..  autofunction:: paddle.v2.layer.last_seq
+    :noindex:
+
+..  _api_v2.layer_first_seq:
+
+first_seq
+---------
+..  autofunction:: paddle.v2.layer.first_seq
+    :noindex:
+
+sub_seq
+---------
+..  autofunction:: paddle.v2.layer.sub_seq
+    :noindex:
+
+concat
+------
+..  autofunction:: paddle.v2.layer.concat
+    :noindex:
+
+seq_concat
+----------
+..  autofunction:: paddle.v2.layer.seq_concat
+    :noindex:
+
+seq_slice
+---------
+..  autofunction:: paddle.v2.layer.seq_slice
+    :noindex:
+
+sub_nested_seq
+--------------
+..  autofunction:: paddle.v2.layer.sub_nested_seq
+    :noindex:
+
+Reshaping Layers
+================
+
+block_expand
+------------
+..  autofunction:: paddle.v2.layer.block_expand
+    :noindex:
+
+..  _api_v2.layer_expand:
+
+ExpandLevel
+-----------
+..  autoclass:: paddle.v2.layer.ExpandLevel
+    :noindex:
+
+expand
+------
+..  autofunction:: paddle.v2.layer.expand
+    :noindex:
+
+repeat
+------
+..  autofunction:: paddle.v2.layer.repeat
+    :noindex:
+
+rotate
+------
+..  autofunction:: paddle.v2.layer.rotate
+    :noindex:
+
+seq_reshape
+-----------
+..  autofunction:: paddle.v2.layer.seq_reshape
+    :noindex:
+
+Math Layers
+===========
+
+addto
+-----
+..  autofunction:: paddle.v2.layer.addto
+    :noindex:
+
+linear_comb
+-----------
+..  autofunction:: paddle.v2.layer.linear_comb
+    :noindex:
+
+interpolation
+-------------
+..  autofunction:: paddle.v2.layer.interpolation
+    :noindex:
+
+bilinear_interp
+---------------
+..  autofunction:: paddle.v2.layer.bilinear_interp
+    :noindex:
+
+dropout
+--------
+..  autofunction:: paddle.v2.layer.dropout
+    :noindex:
+
+dot_prod
+---------
+.. autofunction:: paddle.v2.layer.dot_prod
+    :noindex:
+
+out_prod
+--------
+.. autofunction:: paddle.v2.layer.out_prod
+    :noindex:
+
+power
+-----
+..  autofunction:: paddle.v2.layer.power
+    :noindex:
+
+scaling
+-------
+..  autofunction:: paddle.v2.layer.scaling
+    :noindex:
+
+clip
+----
+..  autofunction:: paddle.v2.layer.clip
+    :noindex:
+
+resize
+------
+..  autofunction:: paddle.v2.layer.resize
+    :noindex:
+
+slope_intercept
+---------------
+..  autofunction:: paddle.v2.layer.slope_intercept
+    :noindex:
+
+tensor
+------
+..  autofunction:: paddle.v2.layer.tensor
+    :noindex:
+
+..  _api_v2.layer_cos_sim:
+
+cos_sim
+-------
+..  autofunction:: paddle.v2.layer.cos_sim
+    :noindex:
+
+l2_distance
+-----------
+..  autofunction:: paddle.v2.layer.l2_distance
+    :noindex:
+
+trans
+-----
+..  autofunction:: paddle.v2.layer.trans
+    :noindex:
+
+scale_shift
+-----------
+..  autofunction:: paddle.v2.layer.scale_shift
+    :noindex:
+
+factorization_machine
+---------------------
+..  autofunction:: paddle.v2.layer.factorization_machine
+    :noindex:
+
+Sampling Layers
+===============
+
+maxid
+-----
+..  autofunction:: paddle.v2.layer.max_id
+    :noindex:
+
+sampling_id
+-----------
+..  autofunction:: paddle.v2.layer.sampling_id
+    :noindex:
+
+multiplex
+---------
+..  autofunction:: paddle.v2.layer.multiplex
+    :noindex:
+
+..  _api_v2.layer_costs:
+
+Cost Layers
+===========
+
+cross_entropy_cost
+------------------
+..  autofunction:: paddle.v2.layer.cross_entropy_cost
+    :noindex:
+
+cross_entropy_with_selfnorm_cost
+--------------------------------
+..  autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
+    :noindex:
+
+multi_binary_label_cross_entropy_cost
+-------------------------------------
+..  autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
+    :noindex:
+
+classification_cost
+-------------------
+.. autofunction:: paddle.v2.layer.classification_cost
+   :noindex:
+
+huber_regression_cost
+-------------------------
+..  autofunction:: paddle.v2.layer.huber_regression_cost
+    :noindex:
+
+huber_classification_cost
+-------------------------
+..  autofunction:: paddle.v2.layer.huber_classification_cost
+    :noindex:
+
+lambda_cost
+-----------
+..  autofunction:: paddle.v2.layer.lambda_cost
+    :noindex:
+
+square_error_cost
+-----------------
+..  autofunction:: paddle.v2.layer.square_error_cost
+    :noindex:
+
+rank_cost
+---------
+..  autofunction:: paddle.v2.layer.rank_cost
+    :noindex:
+
+sum_cost
+---------
+..  autofunction:: paddle.v2.layer.sum_cost
+    :noindex:
+
+crf
+---
+..  autofunction:: paddle.v2.layer.crf
+    :noindex:
+
+crf_decoding
+------------
+..  autofunction:: paddle.v2.layer.crf_decoding
+    :noindex:
+
+ctc
+---
+..  autofunction:: paddle.v2.layer.ctc
+    :noindex:
+
+warp_ctc
+--------
+..  autofunction:: paddle.v2.layer.warp_ctc
+    :noindex:
+
+nce
+---
+..  autofunction:: paddle.v2.layer.nce
+    :noindex:
+
+hsigmoid
+---------
+..  autofunction:: paddle.v2.layer.hsigmoid
+    :noindex:
+
+smooth_l1_cost
+--------------
+..  autofunction:: paddle.v2.layer.smooth_l1_cost
+    :noindex:
+
+multibox_loss
+--------------
+..  autofunction:: paddle.v2.layer.multibox_loss
+    :noindex:
+
+detection_output
+----------------
+..  autofunction:: paddle.v2.layer.detection_output
+    :noindex:
+
+Check Layer
+============
+
+eos
+---
+..  autofunction:: paddle.v2.layer.eos
+    :noindex:
+
+Activation
+==========
+
+prelu
+--------
+..  autofunction:: paddle.v2.layer.prelu
+    :noindex:
diff --git a/doc/v2/api/config/networks.rst b/doc/v2/api/config/networks.rst
new file mode 100644
index 0000000000000000000000000000000000000000..048379cf01f4aec5e73e2fe3ddfa728f3c17a5d1
--- /dev/null
+++ b/doc/v2/api/config/networks.rst
@@ -0,0 +1,132 @@
+========
+Networks
+========
+
+The v2.networks module contains pieces of neural network that combine multiple layers.
+
+NLP
+===
+
+sequence_conv_pool
+------------------
+..  automodule:: paddle.v2.networks
+    :members: sequence_conv_pool
+    :noindex:
+
+..  _api_trainer_config_helpers_network_text_conv_pool:
+
+text_conv_pool
+--------------
+..  automodule:: paddle.v2.networks
+    :members: text_conv_pool
+    :noindex:
+
+Images
+======
+
+img_conv_bn_pool
+----------------
+..  automodule:: paddle.v2.networks
+    :members: img_conv_bn_pool
+    :noindex:
+
+img_conv_group
+--------------
+..  automodule:: paddle.v2.networks
+    :members: img_conv_group
+    :noindex:
+
+..  _api_trainer_config_helpers_network_simple_img_conv_pool:
+
+simple_img_conv_pool
+--------------------
+..  automodule:: paddle.v2.networks
+    :members: simple_img_conv_pool
+    :noindex:
+
+small_vgg
+---------
+..  automodule:: paddle.v2.networks
+    :members: small_vgg
+    :noindex:
+
+vgg_16_network
+---------------
+..  automodule:: paddle.v2.networks
+    :members: vgg_16_network
+    :noindex:
+
+Recurrent
+=========
+
+LSTM
+----
+
+lstmemory_unit
+``````````````
+..  automodule:: paddle.v2.networks
+    :members: lstmemory_unit
+    :noindex:
+
+lstmemory_group
+```````````````
+..  automodule:: paddle.v2.networks
+    :members: lstmemory_group
+    :noindex:
+
+simple_lstm
+```````````
+..  automodule:: paddle.v2.networks
+    :members: simple_lstm
+    :noindex:
+
+bidirectional_lstm
+``````````````````
+..  automodule:: paddle.v2.networks
+    :members: bidirectional_lstm
+    :noindex:
+
+GRU
+---
+
+gru_unit
+````````
+..  automodule:: paddle.v2.networks
+    :members: gru_unit
+    :noindex:
+
+gru_group
+`````````
+..  automodule:: paddle.v2.networks
+    :members: gru_group
+    :noindex:
+
+simple_gru
+``````````
+..  automodule:: paddle.v2.networks
+    :members: simple_gru
+    :noindex:
+
+simple_gru2
+```````````
+..  automodule:: paddle.v2.networks
+    :members: simple_gru2
+    :noindex:
+
+bidirectional_gru
+``````````````````
+..  automodule:: paddle.v2.networks
+    :members: bidirectional_gru
+    :noindex:
+
+simple_attention
+----------------
+..  automodule:: paddle.v2.networks
+    :members: simple_attention
+    :noindex:
+
+dot_product_attention
+---------------------
+..  automodule:: paddle.v2.networks
+    :members: dot_product_attention
+    :noindex:
diff --git a/doc/v2/api/config/optimizer.rst b/doc/v2/api/config/optimizer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b32373fdef52a7aa9d64b12cda3f76cb2abf351b
--- /dev/null
+++ b/doc/v2/api/config/optimizer.rst
@@ -0,0 +1,45 @@
+==========
+Optimizer
+==========
+
+Momentum
+========
+..  automodule:: paddle.v2.optimizer
+    :members: Momentum
+    :noindex:
+
+Adam
+====
+..  automodule:: paddle.v2.optimizer
+    :members: Adam
+    :noindex:
+
+Adamax
+======
+..  automodule:: paddle.v2.optimizer
+    :members: Adamax
+    :noindex:
+
+AdaGrad
+=======
+..  automodule:: paddle.v2.optimizer
+    :members: AdaGrad
+    :noindex:
+
+DecayedAdaGrad
+==============
+..  automodule:: paddle.v2.optimizer
+    :members: DecayedAdaGrad
+    :noindex:
+
+AdaDelta
+========
+..  automodule:: paddle.v2.optimizer
+    :members: AdaDelta
+    :noindex:
+
+RMSProp
+=======
+..  automodule:: paddle.v2.optimizer
+    :members: RMSProp
+    :noindex:
diff --git a/doc/v2/api/config/pooling.rst b/doc/v2/api/config/pooling.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d26b365c9284632210a1532853e39feedc70758b
--- /dev/null
+++ b/doc/v2/api/config/pooling.rst
@@ -0,0 +1,46 @@
+=======
+Pooling
+=======
+
+BasePool
+========
+..  automodule:: paddle.v2.pooling
+    :members: BasePool
+    :noindex:
+
+Avg
+===
+..  automodule:: paddle.v2.pooling
+    :members: Avg
+    :noindex:
+
+Max
+===
+..  automodule:: paddle.v2.pooling
+    :members: Max
+    :noindex:
+
+Sum
+===
+..  automodule:: paddle.v2.pooling
+    :members: Sum
+    :noindex:
+
+SquareRootN
+===========
+..  automodule:: paddle.v2.pooling
+    :members: SquareRootN
+    :noindex:
+
+CudnnAvg
+========
+..  automodule:: paddle.v2.pooling
+    :members: CudnnAvg
+    :noindex:
+
+CudnnMax
+========
+..  automodule:: paddle.v2.pooling
+    :members: CudnnMax
+    :noindex:
+
diff --git a/doc/v2/api/data.rst b/doc/v2/api/data.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b56c7332cc284649c7e04328e51a7faa78593a39
--- /dev/null
+++ b/doc/v2/api/data.rst
@@ -0,0 +1,10 @@
+==================================
+Data Reader Interface and DataSets
+==================================
+
+..  toctree::
+    :maxdepth: 1
+
+    data/data_reader.rst
+    data/image.rst
+    data/dataset.rst
diff --git a/doc/v2/api/data/data_reader.rst b/doc/v2/api/data/data_reader.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1a35d0bbc8f9d751f49c7e1fc26feb1bcb3ae7f0
--- /dev/null
+++ b/doc/v2/api/data/data_reader.rst
@@ -0,0 +1,72 @@
+=====================
+Data Reader Interface
+=====================
+
+
+DataTypes
+=========
+
+..  autofunction:: paddle.v2.data_type.dense_array
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_non_value_slot
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_value_slot
+    :noindex:
+
+..  autoclass:: paddle.v2.data_type.InputType
+    :members:
+    :noindex:
+
+DataFeeder
+==========
+
+..  automodule:: paddle.v2.data_feeder
+    :members:
+    :noindex:
+
+Reader
+======
+
+..  automodule:: paddle.reader
+    :members:
+    :noindex:
+
+..  automodule:: paddle.reader.creator
+    :members:
+    :noindex:
+
+minibatch
+=========
+
+..  automodule:: paddle.v2.minibatch
+    :members:
+    :noindex:
diff --git a/doc/v2/api/data/dataset.rst b/doc/v2/api/data/dataset.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e7c8be4452bf55e0967d750c2e624e8e316e9330
--- /dev/null
+++ b/doc/v2/api/data/dataset.rst
@@ -0,0 +1,82 @@
+Dataset
+=======
+
+..  automodule:: paddle.dataset
+    :members:
+    :noindex:
+
+mnist
++++++
+
+..  automodule:: paddle.dataset.mnist
+    :members:
+    :noindex:
+
+cifar
++++++
+
+..  automodule:: paddle.dataset.cifar
+    :members:
+    :noindex:
+
+conll05
++++++++
+
+..  automodule:: paddle.dataset.conll05
+    :members: get_dict,get_embedding,test
+    :noindex:
+
+imdb
+++++
+
+..  automodule:: paddle.dataset.imdb
+    :members:
+    :noindex:
+
+imikolov
+++++++++
+
+..  automodule:: paddle.dataset.imikolov
+    :members:
+    :noindex:
+
+movielens
++++++++++
+
+..  automodule:: paddle.dataset.movielens
+    :members:
+    :noindex:
+
+..  autoclass:: paddle.dataset.movielens.MovieInfo
+    :noindex:
+
+..  autoclass:: paddle.dataset.movielens.UserInfo
+    :noindex:
+
+sentiment
++++++++++
+
+..  automodule:: paddle.dataset.sentiment
+    :members:
+    :noindex:
+
+uci_housing
++++++++++++
+
+..  automodule:: paddle.dataset.uci_housing
+    :members:
+    :noindex:
+
+wmt14
++++++
+
+..  automodule:: paddle.dataset.wmt14
+    :members:
+    :noindex:
+
+wmt16
++++++
+
+..  automodule:: paddle.dataset.wmt16
+    :members:
+    :noindex:
diff --git a/doc/v2/api/data/image.rst b/doc/v2/api/data/image.rst
new file mode 100644
index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49
--- /dev/null
+++ b/doc/v2/api/data/image.rst
@@ -0,0 +1,5 @@
+Image Interface
+===============
+
+..  automodule:: paddle.v2.image
+    :members:
diff --git a/doc/v2/api/index_en.rst b/doc/v2/api/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5813509dce46677444f0234db8e0eaa4f113e3a0
--- /dev/null
+++ b/doc/v2/api/index_en.rst
@@ -0,0 +1,9 @@
+API
+===
+
+..  toctree::
+    :maxdepth: 1
+
+    model_configs.rst
+    data.rst
+    run_logic.rst
diff --git a/doc/v2/api/model_configs.rst b/doc/v2/api/model_configs.rst
new file mode 100644
index 0000000000000000000000000000000000000000..992b559cbd87244612521d4c96f84f997d6c4196
--- /dev/null
+++ b/doc/v2/api/model_configs.rst
@@ -0,0 +1,13 @@
+Model Configuration
+===================
+
+..  toctree::
+    :maxdepth: 1
+
+    config/activation.rst
+    config/layer.rst
+    config/evaluators.rst
+    config/optimizer.rst
+    config/pooling.rst
+    config/networks.rst
+    config/attr.rst
diff --git a/doc/v2/api/overview.rst b/doc/v2/api/overview.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a6f21428de1e4906e4af9433bc1c994f2b2c8b8e
--- /dev/null
+++ b/doc/v2/api/overview.rst
@@ -0,0 +1,12 @@
+V2 API Overview
+================
+
+The PaddlePaddle V2 API is designed to provide a modern user interface for PaddlePaddle V1 (the original layer-based platform of PaddlePaddle).
+It introduces some high-level concepts such as `Layers <http://www.paddlepaddle.org/docs/develop/api/en/v2/config/layer.html>`_ , `Optimizer <http://www.paddlepaddle.org/docs/develop/api/en/v2/config/optimizer.html>`_ , `Evaluator <http://www.paddlepaddle.org/docs/develop/api/en/v2/config/evaluators.html>`_  and `Data Reader <http://www.paddlepaddle.org/docs/develop/api/en/v2/data/data_reader.html>`_ to make the model configuration more familiar to users.
+
+A model is composed of the computation described by a group of `Layers`, with `Evaluator` to define the error, `Optimizer` to update the parameters and `Data Reader` to feed in the data.
+
+We also provide the `interface for Training and Inference <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html>`_ to help control the training and inference phases.
+It has several easy-to-use methods that better expose the internal running details, and different `events <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html#event>`_ are available to users through callbacks.
+
+All in all, the V2 API provides a higher level of abstraction, so PaddlePaddle programs require fewer lines of code.
diff --git a/doc/v2/api/run_logic.rst b/doc/v2/api/run_logic.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5c97651f6536d89d2b5926d4b2907a547aa86b55
--- /dev/null
+++ b/doc/v2/api/run_logic.rst
@@ -0,0 +1,31 @@
+======================
+Training and Inference
+======================
+
+Parameters
+==========
+
+..  automodule:: paddle.v2.parameters
+    :members: Parameters
+    :noindex:
+
+Trainer
+=======
+
+..  automodule:: paddle.v2.trainer
+    :members: SGD
+    :noindex:
+
+Event
+=====
+
+..  automodule:: paddle.v2.event
+    :members:
+    :noindex:
+
+Inference
+=========
+
+..  autofunction:: paddle.v2.infer
+    :noindex:
+    
\ No newline at end of file
diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d0dacb104f148c2aeb323365cbd6f014ae00ed5a
--- /dev/null
+++ b/doc/v2/build_and_install/build_from_source_cn.rst
@@ -0,0 +1,225 @@
+从源码编译
+======================
+
+.. _requirements:
+
+需要的软硬件
+----------------
+
+为了编译PaddlePaddle，我们需要
+
+1. 一台电脑，可以装的是 Linux, Windows 或者 MacOS 操作系统
+2. Docker
+
+不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker 镜像里。
+
+.. _build_step:
+
+编译方法
+----------------
+
+PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`__ 找到，您也可以
+在 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__ 找到 paddle_manylinux_devel
+镜像的编译以及使用方法。或者参考下述可选步骤，从源码中构建用于编译PaddlePaddle的Docker镜像。
+
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 :ref:`编译依赖 <compile_deps>` 之后才能开始编译的步骤。
+
+编译PaddlePaddle，需要执行：
+
+.. code-block:: bash
+
+   # 1. 获取源码
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 2. 可选步骤：源码中构建用于编译PaddlePaddle的Docker镜像
+   docker build -t paddle:dev .
+   # 3. 执行下面的命令编译CPU-Only的二进制
+   docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
+   # 4. 或者也可以使用为上述可选步骤构建的镜像（必须先执行第2步）
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
+
+注：
+
+- 上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。
+
+- 如果您使用的是 manylinux 的镜像进行编译, 那么您需要通过环境变量 :code:`PYTHON_ABI` 来指定一个 `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__.
+  PaddlePaddle目前支持的 Python ABI 有 :code:`cp27-cp27m` 和 :code:`cp27-cp27mu`.
+
+编译完成后会在build/python/dist目录下生成输出的whl包，可以选择在当前机器安装也可以拷贝到目标机器安装：
+
+.. code-block:: bash
+
+   pip install build/python/dist/*.whl
+
+如果机器中已经安装过PaddlePaddle，有两种方法：
+
+.. code-block:: bash
+
+   # 1. 先卸载之前的版本，再重新安装
+   pip uninstall paddlepaddle
+   pip install build/python/dist/*.whl
+
+   # 2. 直接升级到更新的版本
+   pip install build/python/dist/*.whl -U
+
+.. _run_test:
+
+执行单元测试
+----------------
+
+如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：
+
+设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
+开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
+
+如果期望执行其中一个单元测试，（比如 :code:`test_sum_op` ）：
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   ./paddle/scripts/paddle_build.sh build
+   cd build
+   ctest -R test_sum_op -V
+
+.. _faq_docker:
+
+常见问题
+----------------
+
+- 什么是 Docker?
+
+  如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。
+
+- Docker 还是虚拟机？
+
+  有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
+
+- 为什么用 Docker?
+
+  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。
+
+  另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。
+
+- 我可以选择不用Docker吗？
+
+  当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程，是因为这个流程比其他方法都更简便。
+
+- 学习 Docker 有多难？
+
+  理解 Docker 并不难，大概花十分钟看一下 `如何使用Docker <https://zhuanlan.zhihu.com/p/19902938>`_ 。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+
+- 我可以用 IDE 吗？
+
+  当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
+
+  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
+
+  .. code-block:: emacs
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+
+  就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
+
+- 可以并行编译吗？
+
+  是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+
+- Docker 需要 sudo
+
+  如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 Docker。此外，PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术，比如 rkt。
+
+- 在 Windows/MacOS 上编译很慢
+
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。
+
+- 磁盘不够
+
+  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `如何删除Docker Container <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
+
+
+.. _compile_deps:
+
+附录：编译依赖
+----------------
+
+PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。
+
+.. csv-table:: PaddlePaddle编译依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.2", ""
+   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "可选"
+
+
+.. _build_options:
+
+附录：编译选项
+----------------
+
+PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。
+用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考
+`官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+
+在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如：
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: 编译选项说明
+    :header: "选项", "说明", "默认值"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "是否支持GPU", "ON"
+    "WITH_C_API", "是否仅编译CAPI", "OFF"
+    "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
+    "WITH_DSO", "是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。", "ON"
+    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
+    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
+    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
+    "WITH_TESTING", "是否开启单元测试", "OFF"
+    "WITH_DOC", "是否编译中英文文档", "OFF"
+    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
+    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF"
+    "WITH_MKL", "是否使用MKL数学库，如果为否则是用OpenBLAS", "ON"
+
+BLAS
++++++
+
+PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
+`OpenBLAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
+还会下载MKL-DNN数学库，详细参考 `mkldnn设计文档 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+
+如果关闭MKL，则会使用OpenBLAS作为BLAS库。
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
+使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构，加速编译。
+
+PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cuDNN是同一个版本。
+我们推荐使用最新版本的cuDNN。
+
+编译选项的设置
+++++++++++++++
+
+PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（** :code:`rm -rf` **）后，再指定。**
diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..664b68da8b7dd3e005ebf3ec34de77729e5ab355
--- /dev/null
+++ b/doc/v2/build_and_install/build_from_source_en.rst
@@ -0,0 +1,237 @@
+Build from Sources
+==========================
+
+.. _requirements:
+
+Requirements
+----------------
+
+To build PaddlePaddle, you need
+
+1. A computer -- Linux, Windows, MacOS.
+2. Docker.
+
+Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image.
+We run all the tools by running this image.
+
+.. _build_step:
+
+How To Build
+----------------
+
+You need to use Docker to build PaddlePaddle
+to avoid installing dependencies by yourself. We have several pre-built
+Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ ,
+you can also find how to build and use paddle_manylinux_devel Docker image from
+`here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__ .
+Or you can build your own image from source, as shown in the optional step below.
+
+If you don't wish to use Docker, you need to manually install the compile dependencies listed in :ref:`Compile Dependencies <compile_deps>` before starting the compilation.
+
+.. code-block:: bash
+
+   # 1. clone the source code
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 2. Optional: build development docker image from source
+   docker build -t paddle:dev .
+   # 3. Run the following command to build a CPU-Only binaries
+   docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
+   # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
+
+NOTE: 
+
+- The above commands mount the current working directory (root directory of source code)
+  into the :code:`/paddle` directory inside the Docker container.
+
+- You need to pass in the required environment variable :code:`PYTHON_ABI` to specify a `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__.
+  Currently the Python ABIs supported by PaddlePaddle are :code:`cp27-cp27m` and :code:`cp27-cp27mu` .
+
+When the compile finishes, you can get the output whl package under
+build/python/dist, then you can choose to install the whl on local
+machine or copy it to the target machine.
+
+.. code-block:: bash
+
+   pip install build/python/dist/*.whl
+
+If the machine has installed PaddlePaddle before, there are two methods:
+
+.. code-block:: bash
+
+   # 1. uninstall and reinstall
+   pip uninstall paddlepaddle
+   pip install build/python/dist/*.whl
+
+   # 2. upgrade directly
+   pip install build/python/dist/*.whl -U
+
+.. _run_test:
+
+Run Tests
+----------------
+
+If you wish to run the tests, you may follow the below steps:
+
+When using Docker, setting :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run the tests immediately after the build.
+Setting :code:`WITH_GPU=ON` will also run the tests on GPU.
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
+
+If you wish to run only one unit test, like :code:`test_sum_op`:
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   ./paddle/scripts/paddle_build.sh build
+   cd build
+   ctest -R test_sum_op -V
+
+.. _faq_docker:
+
+Frequently Asked Questions
+---------------------------
+
+- What is Docker?
+
+  If you haven't heard of it, consider it something like Python's virtualenv.
+
+- Docker or virtual machine?
+
+  Some people compare Docker with VMs, but Docker neither virtualizes any hardware nor runs a guest OS, which means there is no compromise on the performance.
+
+- Why Docker?
+
+  Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
+
+  Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
+
+- Can I choose not to use Docker?
+
+  Sure, you don't have to install build tools into a Docker image; instead, you can install them on your local computer.  This document exists because Docker would make the development way easier.
+
+- How difficult is it to learn Docker?
+
+  It takes about ten minutes to read `an introductory article <https://docs.docker.com/get-started>`_ , which saves you more than an hour of installing and configuring all the required build tools, especially when new versions of PaddlePaddle require new tools.  Not to mention the time saved when other people try to reproduce an issue you have.
+
+- Can I use my favorite IDE?
+
+  Yes, of course.  The source code resides on your local computer, and you can edit it using whatever editor you like.
+
+  Many PaddlePaddle developers are using Emacs.  They add the following few lines into their `~/.emacs` configuration file:
+
+  .. code-block:: emacs
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+
+  so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
+
+- Does Docker do parallel building?
+
+  Our build Docker image runs a `Bash script <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ , which calls `make -j$(nproc)` to start as many processes as the number of your CPU cores.
+
+- Docker requires sudo
+
+  An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly.  If you use a shared computer for development, please ask the administrator to install and configure Docker.  We will do our best to support rkt, another container technology that doesn't require sudo.
+
+- Docker on Windows/MacOS builds slowly
+
+  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so to make the building efficient.  Please refer to `this issue  <https://github.com/PaddlePaddle/Paddle/issues/627>`_ for details.
+
+- Not enough disk space
+
+  Examples in this article use option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to `this article <https://zaiste.net/posts/removing_docker_containers/>`_ .
+
+.. _compile_deps:
+
+Appendix: Compile Dependencies
+-------------------------------
+
+PaddlePaddle needs the following dependencies when compiling; other dependencies
+will be downloaded automatically.
+
+.. csv-table:: PaddlePaddle Compile Dependencies
+   :header: "Dependency", "Version", "Description"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.2", ""
+   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "Optional"
+
+
+.. _build_options:
+
+Appendix: Build Options
+-------------------------
+
+Build options include whether to build binaries for CPU or GPU, which BLAS
+library to use, etc. You may pass these settings when running cmake.
+For a detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`__ .
+
+
+You can add :code:`-D` argument to pass such options, like:
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: Bool Type Options
+    :header: "Option", "Description", "Default"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "Build with GPU support", "ON"
+    "WITH_C_API", "Build only CAPI", "OFF"
+    "WITH_DOUBLE", "Build with double precision", "OFF"
+    "WITH_DSO", "Dynamically load CUDA libraries", "ON"
+    "WITH_AVX", "Build with AVX support", "ON"
+    "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
+    "WITH_STYLE_CHECK", "Check code style when building", "ON"
+    "WITH_TESTING", "Build unit tests", "OFF"
+    "WITH_DOC", "Build documentations", "OFF"
+    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
+    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "OFF"
+    "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
+
+
+BLAS
++++++
+
+PaddlePaddle supports `MKL <https://software.intel.com/en-us/intel-mkl>`_ and
+`OpenBLAS <http://www.openblas.net/>`_ as BLAS libraries. By default it uses MKL.
+If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded
+and used, for more `details <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ .
+
+If you choose not to use MKL, then OpenBLAS will be used.
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle will automatically find CUDA and cuDNN when compiling and running.
+The parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture
+automatically in order to speed up the build.
+
+PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to
+keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN
+you built.
+
+Pass Compile Options
+++++++++++++++++++++++
+
+You can pass compile options to use intended BLAS/CUDA/cuDNN libraries.
+When running the cmake command, it will search system paths like
+:code:`/usr/lib:/usr/local/lib` and then search paths that you
+passed to cmake, e.g.
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.**
diff --git a/doc/v2/build_and_install/docker_install_cn.rst b/doc/v2/build_and_install/docker_install_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..106c86bace075764c84bc2a7f7cb09d466fa8794
--- /dev/null
+++ b/doc/v2/build_and_install/docker_install_cn.rst
@@ -0,0 +1,146 @@
+使用Docker安装运行
+================================
+
+使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。
+您可以在 `Docker官网 <https://docs.docker.com/get-started/>`_ 获得基本的Docker安装和使用方法。
+
+如果您在使用Windows，可以参考
+`这篇 <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+教程，完成在Windows上安装和使用Docker。
+
+在了解Docker的基本使用方法之后，即可开始下面的步骤：
+
+.. _docker_pull:
+
+获取PaddlePaddle的Docker镜像
+------------------------------
+
+执行下面的命令获取最新的PaddlePaddle Docker镜像，版本为cpu_avx_mkl：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle
+
+对于国内用户，我们提供了加速访问的镜像源：
+
+  .. code-block:: bash
+
+     docker pull docker.paddlepaddlehub.com/paddle
+
+下载GPU版本（cuda8.0_cudnn5_avx_mkl）的Docker镜像：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
+
+选择下载使用不同的BLAS库的Docker镜像：
+
+  .. code-block:: bash
+
+     # 默认是使用MKL的镜像
+     docker pull paddlepaddle/paddle
+     # 使用OpenBLAS的镜像
+     docker pull paddlepaddle/paddle:latest-openblas
+
+下载指定版本的Docker镜像，可以从 `DockerHub网站 <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 获取可选的tag，并执行下面的命令：
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:[tag]
+     # 比如：
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
+
+.. _docker_run:
+
+在Docker中执行PaddlePaddle训练程序
+----------------------------------
+
+假设您已经在当前目录（比如在/home/work）编写了一个PaddlePaddle的程序 :code:`train.py` （可以参考
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+编写），就可以使用下面的命令开始执行训练：
+
+  .. code-block:: bash
+
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+ 
+上述命令中， :code:`-it` 参数说明容器以交互式运行； :code:`-v $PWD:/work`
+指定将当前路径（Linux中$PWD变量会展开为当前路径的绝对路径）挂载到容器内部的 :code:`/work`
+目录； :code:`paddlepaddle/paddle` 指定需要使用的容器； 最后 :code:`/work/train.py`
+为容器内执行的命令，即运行训练程序。
+
+当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：
+
+  .. code-block:: bash
+
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
+
+**注：PaddlePaddle Docker镜像为了减小体积，默认没有安装vim，您可以在容器中执行** :code:`apt-get install -y vim` **安装后，在容器中编辑代码。**
+
+.. _docker_run_book:
+
+使用Docker启动PaddlePaddle Book教程
+-----------------------------------
+
+使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook，可以通过网页浏览。
+PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
+如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
+大家可以通过它阅读教程，或者制作和分享带有代码、公式、图表、文字的交互式文档。
+
+我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
+
+  .. code-block:: bash
+
+     docker run -p 8888:8888 paddlepaddle/book
+
+国内用户可以使用下面的镜像源来加速访问：
+
+  .. code-block:: bash
+
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
+然后在浏览器中输入以下网址：
+
+  .. code-block:: text
+
+     http://localhost:8888/
+
+就这么简单，享受您的旅程！
+
+.. _docker_run_gpu:
+
+使用Docker执行GPU训练
+------------------------------
+
+为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ 来运行镜像。
+请不要忘记提前在物理机上安装GPU最新驱动。
+
+  .. code-block:: bash
+
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
+
+**注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**
+
+  .. code-block:: bash
+
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+**关于AVX：**
+
+AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
+是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
+`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。
+
+以下指令能检查Linux电脑是否支持AVX：
+
+   .. code-block:: bash
+
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+
+如果输出是No，就需要选择使用no-AVX的镜像
diff --git a/doc/v2/build_and_install/docker_install_en.rst b/doc/v2/build_and_install/docker_install_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..25aecb8d0da9feb00006da6259b529b7011d91cb
--- /dev/null
+++ b/doc/v2/build_and_install/docker_install_en.rst
@@ -0,0 +1,153 @@
+Run in Docker Containers
+=================================
+
+Run PaddlePaddle in Docker container so that you don't need to care about
+runtime dependencies, also you can run under Windows system. You can get
+tutorials at `here <https://docs.docker.com/get-started/>`_ .
+
+If you are using Windows, please refer to
+`this <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+tutorial to start running docker under windows.
+
+After you've read the above tutorials you may proceed with the following steps.
+
+.. _docker_pull:
+
+Pull PaddlePaddle Docker Image
+------------------------------
+
+Run the following command to download the latest Docker images, the version is cpu_avx_mkl:
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle
+
+For users in China, we provide a faster mirror:
+
+  .. code-block:: bash
+
+     docker pull docker.paddlepaddlehub.com/paddle
+
+Download GPU version (cuda8.0_cudnn5_avx_mkl) images:
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
+
+Choose between different BLAS versions:
+
+  .. code-block:: bash
+
+     # image using MKL by default
+     docker pull paddlepaddle/paddle
+     # image using OpenBLAS
+     docker pull paddlepaddle/paddle:latest-openblas
+
+
+If you want to use legacy versions, choose a tag from
+`DockerHub <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_
+and run:
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:[tag]
+     # e.g.
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
+
+.. _docker_run:
+
+Launch your training program in Docker
+--------------------------------------
+
+Assume that you have already written a PaddlePaddle program
+named :code:`train.py` under directory :code:`/home/work` (refer to 
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+for more samples), then run the following command:
+
+  .. code-block:: bash
+
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+
+In the above command, :code:`-it` means run the container interactively;
+:code:`-v $PWD:/work` means mount the current directory ($PWD will expand
+to current absolute path in Linux) under :code:`/work` in the container.
+:code:`paddlepaddle/paddle` specifies the image to use; finally
+:code:`/work/train.py` is the command to run inside the container.
+
+Also, you can go into the container shell, run or debug your code
+interactively:
+
+  .. code-block:: bash
+
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
+
+**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.**
+
+.. _docker_run_book:
+
+PaddlePaddle Book
+------------------
+
+You can create a container serving PaddlePaddle Book using Jupyter Notebook in
+one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook
+for users and developers. If you want to
+dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
+
+We provide a packaged book image, simply issue the command:
+
+  .. code-block:: bash
+
+     docker run -p 8888:8888 paddlepaddle/book
+
+For users in China, we provide a faster mirror:
+
+  .. code-block:: bash
+
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
+Then, open the following address in your local browser:
+
+  .. code-block:: text
+
+     http://localhost:8888/
+
+That's all. Enjoy your journey!
+
+.. _docker_run_gpu:
+
+Train with Docker with GPU
+------------------------------
+
+We recommend using
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_
+to run GPU training jobs. Please ensure you have the latest
+GPU driver installed before moving on.
+
+  .. code-block:: bash
+
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
+
+**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
+
+  .. code-block:: bash
+
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+**About AVX:**
+
+AVX is a CPU instruction set that can accelerate PaddlePaddle's calculations.
+The latest PaddlePaddle Docker image turns AVX on by default, so, if your
+computer doesn't support AVX, you'll probably need to
+`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`.
+
+The following command will tell you whether your computer supports AVX.
+
+   .. code-block:: bash
+
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
diff --git a/doc/v2/build_and_install/index_cn.rst b/doc/v2/build_and_install/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1a9305ac4b6578c14a962f223c647a71e3b8a72b
--- /dev/null
+++ b/doc/v2/build_and_install/index_cn.rst
@@ -0,0 +1,56 @@
+安装与编译
+==========
+
+.. _install_steps:
+
+PaddlePaddle针对不同的用户群体提供了多种安装方式。
+
+专注深度学习模型开发
+--------------------
+
+PaddlePaddle提供了多种python wheel包，可通过pip一键安装：
+
+.. toctree::
+	:maxdepth: 1
+
+	pip_install_cn.rst
+
+这是最便捷的安装方式，请根据机器配置和系统选择对应的安装包。
+
+关注底层框架
+-------------
+
+PaddlePaddle提供了基于Docker的安装方式，请参照以下教程：
+
+.. toctree::
+	:maxdepth: 1
+
+	docker_install_cn.rst
+
+我们推荐在Docker中运行PaddlePaddle，该方式具有以下优势：
+
+- 无需单独安装第三方依赖
+- 方便分享运行时环境，易于问题的复现
+
+对于有定制化二进制文件需求的用户，我们同样提供了从源码编译安装PaddlePaddle的方法：
+
+.. toctree::
+    :maxdepth: 1
+
+    build_from_source_cn.rst
+
+.. warning::
+
+	需要提醒的是，这种安装方式会涉及到一些第三方库的下载、编译及安装，整个安装过程耗时较长。
+
+
+常见问题汇总
+--------------
+
+如果在安装过程中遇到了问题，请先尝试在下面的页面寻找答案：
+
+:ref:`常见问题解答 <install_faq>`
+
+如果问题没有得到解决，欢迎向PaddlePaddle社区反馈问题：
+
+`创建issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_
diff --git a/doc/v2/build_and_install/index_en.rst b/doc/v2/build_and_install/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7990bacbd6966e88e8763e9c5709e410f7e9fed4
--- /dev/null
+++ b/doc/v2/build_and_install/index_en.rst
@@ -0,0 +1,56 @@
+Install and Compile
+======================
+
+.. _install_steps:
+
+PaddlePaddle provides various installation methods for different groups of users.
+
+Focus on Deep Learning Model Development
+----------------------------------------
+
+PaddlePaddle provides a variety of Python wheel packages that can be installed with pip:
+
+.. toctree::
+	:maxdepth: 1
+
+	pip_install_en.rst
+
+This is the most convenient way of installation. Please choose the right installation package according to your machine configuration and operating system.
+
+Focus on the Underlying Framework
+----------------------------------
+
+PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
+
+.. toctree::
+	:maxdepth: 1
+
+	docker_install_en.rst
+
+We recommend running PaddlePaddle in Docker. This method has the following advantages:
+
+- Does not require installation of third-party dependencies. 
+- Easy to share runtime environment. 
+
+Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below:
+
+.. toctree::
+    :maxdepth: 1
+
+    build_from_source_en.rst
+
+.. warning::
+
+	One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus this process of installation is more time consuming.
+
+
+FAQ
+-----------
+
+For any problems during installation, please refer to the page below for answers:
+
+:ref:`Frequently Asked Questions <install_faq>`
+
+If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community:
+
+`Create an issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_
diff --git a/doc/v2/build_and_install/paddleci.png b/doc/v2/build_and_install/paddleci.png
new file mode 100644
index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825
Binary files /dev/null and b/doc/v2/build_and_install/paddleci.png differ
diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..095da19cd41d29bfa72ab23abd24bec45f925a86
--- /dev/null
+++ b/doc/v2/build_and_install/pip_install_cn.rst
@@ -0,0 +1,105 @@
+使用pip安装
+================================
+
+PaddlePaddle可以使用常用的Python包管理工具
+`pip <https://pip.pypa.io/en/stable/installing/>`_
+完成安装，并可以在大多数主流的Linux操作系统以及MacOS上执行。
+
+.. _pip_install:
+
+使用pip安装
+------------------------------
+
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+当前的默认版本为0.12.0，cpu_avx_openblas，您可以通过指定版本号来安装其它版本，例如:
+
+  .. code-block:: bash
+
+      pip install paddlepaddle==0.11.0
+
+
+如果需要安装支持GPU的版本（cuda8.0_cudnn5_avx_openblas），需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+当前的默认版本也是0.12.0，PaddlePaddle针对不同需求提供了更多版本的安装包，部分列表如下：
+
+=================================   ========================================
+版本号                               版本说明
+=================================   ========================================
+paddlepaddle-gpu==0.12.0            使用CUDA 8.0和cuDNN 5编译的0.12.0版本
+paddlepaddle-gpu==0.11.0.post87     使用CUDA 8.0和cuDNN 7编译的0.11.0版本
+paddlepaddle-gpu==0.11.0.post8      使用CUDA 8.0和cuDNN 5编译的0.11.0版本
+paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版本
+=================================   ========================================
+
+您可以在 `Release History <https://pypi.org/project/paddlepaddle-gpu/#history>`_ 中找到paddlepaddle-gpu的各个发行版本。
+
+如果需要获取并安装最新的（开发分支）PaddlePaddle，可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装，
+您可以从下面的表格中找到需要的版本：
+
+如果在点击下面链接时出现如下登陆界面，点击“Log in as guest”即可开始下载：
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: 各个版本最新的whl包
+    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m"
+    :widths: 1, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+
+.. _pip_dependency:
+
+运行环境依赖
+------------------------------
+
+PaddlePaddle安装包由于不仅仅包含.py程序，而且包含了C++编写的部分，所以我们确保发布的二进制包可以支持主流的Linux操作系统，比如CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上。
+
+PaddlePaddle发布的安装包会尽量对齐 `manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_ 标准，通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上，而且CentOS 5即将停止维护，所以我们默认使用CentOS 6作为标准编译环境。
+
+.. csv-table:: PaddlePaddle环境依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "操作系统", "Linux, MacOS", "CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上"
+   "Python", "2.7.x", "暂时不支持Python3"
+   "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号"
+   "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号"
+
+.. _pip_faq:
+
+安装常见问题和解决方法
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+
+  出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip：
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  如果仍然存在问题，可以执行：
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  获取当前系统支持的安装包格式，并检查和需安装的包是否匹配。pypi安装包可以在 `这个 <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 链接中找到。
+
+  如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新； 如果系统支持 manylinux1_x86_64 而安装包（本地）是 linux_x86_64 ，可以重命名这个whl包为 manylinux1_x86_64 再安装。
diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8406e4aa1fbb953c3b615b10d1bcb2c45974dde0
--- /dev/null
+++ b/doc/v2/build_and_install/pip_install_en.rst
@@ -0,0 +1,123 @@
+Install using pip
+================================
+
+You can use the widely used Python package management
+tool `pip <https://pip.pypa.io/en/stable/installing/>`_
+to install PaddlePaddle. This method can be used on
+most current Linux systems and on MacOS.
+
+.. _pip_install:
+
+Install using pip
+------------------------------
+
+Run the following command to install PaddlePaddle on the current
+machine, it will also download requirements.
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+The default version is 0.12.0 (cpu_avx_openblas). You can specify a version to suit your needs, for example:
+
+  .. code-block:: bash
+
+      pip install paddlepaddle==0.11.0
+
+If you need to install a GPU-enabled version (cuda8.0_cudnn5_avx_openblas), you need to run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+The default version is also 0.12.0, PaddlePaddle provides several versions of packages for different needs, as shown in the table:
+
+=================================   ========================================
+Version                             Description
+=================================   ========================================
+paddlepaddle-gpu==0.12.0            0.12.0 built with CUDA 8.0 and cuDNN 5
+paddlepaddle-gpu==0.11.0.post87     0.11.0 built with CUDA 8.0 and cuDNN 7
+paddlepaddle-gpu==0.11.0.post8      0.11.0 built with CUDA 8.0 and cuDNN 5
+paddlepaddle-gpu==0.11.0            0.11.0 built with CUDA 7.5 and cuDNN 5
+=================================   ========================================
+
+You can find all versions released of paddlepaddle-gpu in `Release History <https://pypi.org/project/paddlepaddle-gpu/#history>`_ .
+
+If you wish to install the latest develop branch PaddlePaddle,
+you can download the latest whl package from our CI system. Access
+the below links, log in as guest, then click at the "Artifact"
+tab, you'll find the download link of whl packages.
+
+If the links below show a login form, just click "Log in as guest" to start the download:
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: whl package of each version
+    :header: "version", "cp27-cp27mu", "cp27-cp27m"
+    :widths: 1, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+
+.. _pip_dependency:
+
+Runtime Dependency
+------------------------------
+
+PaddlePaddle installation packages (whl) do not only contain .py files,
+but also binaries built from C++ code. We ensure that PaddlePaddle can
+run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04
+and MacOS 10.12.
+
+PaddlePaddle whl packages try to satisfy the
+`manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_
+standard, which uses CentOS 5 as the default build environment. However, CUDA libraries
+generally require CentOS 6 or later, and CentOS 5 is about to reach end of life,
+so we use CentOS 6 as the default build environment.
+
+.. csv-table:: PaddlePaddle Runtime Deps
+   :header: "Dependency", "version", "description"
+   :widths: 10, 15, 30
+
+   "OS", "Linux, MacOS", "CentOS 6 or later, Ubuntu 14.04 or later, MacOS 10.12 or later"
+   "Python", "2.7.x", "Currently Python3 is not supported"
+   "libc.so", "GLIBC_2.7", "glibc at least include GLIBC_2.7 symbols"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols"
+   "libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols"
+
+.. _pip_faq:
+
+FAQ
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+
+  The main cause of this issue is that your current platform is
+  not supported. Please check that you are using Python 2.7 series.
+  Besides, pypi only supports manylinux1 standard, you'll need to
+  upgrade your pip to >9.0.0. Then run the below command:
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  If the problem still exists, run the following command:
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  This prints the package suffixes your system supports; check whether they match
+  the file name of the whl package. You can find the default whl package
+  `here <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_.
+
+  If your system supports linux_x86_64 but the whl package is manylinux1_x86_64,
+  you'll need to update pip to the latest version; If your system supports
+  manylinux1_x86_64 but the whl package is linux_x86_64 you can rename the
+  file to manylinux1_x86_64 suffix and then install.
diff --git a/doc/v2/design/cluster_train/README.md b/doc/v2/design/cluster_train/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..177a5f5d54bd924fab34795219ce1f7b270c8e25
--- /dev/null
+++ b/doc/v2/design/cluster_train/README.md
@@ -0,0 +1,182 @@
+# Design Doc: Distributed Training
+
+## Objective
+
+In [these slides](https://www.slideshare.net/cxwangyi/paddlepaddle-a-complete-solution-for-businesses), we explained that we'd like PaddlePaddle to run on general-purpose clusters like those managed by Kubernetes, so as to address demands for AI from both Internet and non-Internet industries.
+
+This poses technical challenges to PaddlePaddle:
+
+1. Support fault-recovery.
+1. Support both offline and online training.
+1. [Serverless computing](https://en.wikipedia.org/wiki/Serverless_computing) of distributed training.
+
+
+## Training Job
+
+A training job will be created once user asks Paddle cloud to train a model. The training job is made up of different processes that collaboratively consume data and produce a trained model. There are three kinds of processes:
+
+1. the *master server process*, which dispatches tasks to
+1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via
+1. one or more *parameter server processes*, where each holds a shard of the global model, and receive the uploaded gradients from every *trainer process*, so they can run the optimize functions to update their parameters.
+
+Their relation is illustrated in the following graph:
+
+<img src="src/paddle-model-sharding.png"/>
+
+By coordinating these processes, PaddlePaddle supports both Synchronous Stochastic Gradient Descent (sync SGD) and Asynchronous Stochastic Gradient Descent (async SGD) to train user-defined neural network topologies.
+
+When training with sync SGD, parameter servers wait for all trainers to finish their gradient updates and then send the updated parameters to the trainers; training cannot proceed until the trainers have received the updated parameters. This creates a synchronization point between trainers. When training with async SGD, each trainer uploads gradients and downloads new parameters individually, without synchronizing with other trainers. Using async SGD will be faster in terms of time per pass, but will have more noise in the gradients since trainers are likely to have a stale model.
+
+### Master Server Process
+
+The master server process will:
+
+- Partition a dataset into [tasks](#task) and dispatch tasks to trainers.
+- Keep track of training progress on the dataset with [task queue](#task-queue). A training job will iterate on the dataset for a full pass until it goes into next pass.
+
+
+#### Task
+
+A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size.
+
+#### Task Queue
+
+The master server has three task queues to track training progress. As illustrated in the graph below, Job A and Job B both have one master server. Each master server process has three task queues.
+
+<img src="src/paddle-task-queues.png"/>
+
+- The todo queue holds tasks to be dispatched. When a job starts, the master server fills in the todo queue with all tasks.
+- The pending queue holds tasks that are currently being trained by trainers.
+- The done queue holds tasks that are already trained.
+
+The life cycle of a single task is illustrated below:
+
+<img src="src/paddle-task-states.png"/>
+
+1. When a new pass of training starts, all tasks will be placed in the todo queue.
+1. Upon trainer requests for new task, the master server will dispatch a task from todo queue to it, put the task in the pending queue and wait for completion.
+1. The trainer will work on its task and tell the master server once the task is completed and ask for new task. The master server will dispatch a new task to that trainer.
+1. If a task fails for any reason in trainer, or takes longer than a specific period of time,  the master server will move the task back to the todo queue. The timeout count for that task will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, then it will be discarded.
+1. The master server will move completed task to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to todo queue and reset the timeout counter of all tasks to zero.
+
+### Trainer Process
+
+The trainer process will:
+
+- Request tasks from the master.
+- Work on the tasks
+- Upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
+
+### Parameter Server Process
+
+Parameter server processes hold the parameters collaboratively. The parameters are partitioned on different parameter servers.
+
+The parameter server will:
+
+- Receive gradient from the trainers, update its parameters, and give the trainers the latest parameters.
+- Periodically save its parameters to distributed file system by overriding the previous save.
+
+### Optimization Algorithms
+
+The communication pattern between the trainers and the parameter servers depends on the category of optimization algorithm:
+
+- Synchronous Stochastic Gradient Descent (sync-SGD)
+
+	Parameter server will wait for all trainers to finish the n-th mini-batch calculation and send their gradients before broadcasting new parameters to every trainer. Every trainer will wait for the new parameters before starting the (n+1)-th mini-batch.
+
+- Asynchronous Stochastic Gradient Descent (async-SGD)
+
+	There will be no synchronization between different trainers, and the parameter server updates its parameters as soon as it receives a new gradient:
+
+	- Each trainer uploads its accumulated gradient every n mini-batches.
+	- Every m mini-batches, the trainer downloads new parameters from parameter server.
+	- n and m do not have to be equal.
+
+## Fault Tolerant
+
+The training job will pause if the master server process is dead, or any of the parameter server processes is dead. They will be restarted by [Kubernetes](https://kubernetes.io/) and recover in a few minutes. Please refer to [fault recovery](#fault-recovery).
+
+The training job will continue to make progress if there is at least one training process running. The strategy depends on the type of optimization algorithm:
+
+- sync-SGD
+
+	TODO
+
+- async-SGD
+
+	Since async-SGD does not require synchronization between mini-batches, the system will by definition make progress if at least one trainer is running.
+
+## Fault Recovery
+
+PaddlePaddle uses [etcd](https://github.com/coreos/etcd) to keep track of the states of processes. Because etcd is a distributed reliable key-value store, the restarted process can recover its states from etcd. The model parameters are periodically saved into distributed file system, so a restarted parameter server can recover its parameters from the saved file.
+
+Now we will introduce how each process recovers from a failure, the graph below shows how etcd is used:
+
+<img src="src/paddle-etcd.png"/>
+
+### Master Server Process
+
+When the master is started by the Kubernetes, it executes the following steps at startup:
+
+1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations.
+1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them.
+1. Write its ip address to */master/addr* so that trainers can discover it.
+1. Listens to trainers' request of task, dispatch one upon request, and updates task queue using an etcd transaction to ensure lock is held during the update.
+
+When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes.
+
+### Trainer Process
+
+When the trainer is started by the Kubernetes, it executes the following steps at startup:
+
+1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count */ps_desired*.
+1. Finds and watches */master/addr* to get master's address.
+1. Requests for tasks from the master to start training.
+
+When a trainer fails, Kubernetes would try to restart it. The recovered trainer would fetch tasks from master and go on training.
+
+### Parameter Server Process
+
+When the parameter server is started by Kubernetes, it executes the following steps at startup:
+
+1. Read desired total number of parameter servers from etcd `/ps_desired`
+1. Search through etcd keys `/ps/<index>` (`/ps/0`, `/ps/1`, ...) to find the first non-existent key whose index is smaller than the total number of parameter servers. Set the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name.
+
+	The desired number of parameter servers is 3:
+
+	<img src="src/paddle-ps-0.png"/>
+
+	The third parameter server joined:
+
+	<img src="src/paddle-ps-1.png"/>
+
+1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index).
+1. Now the parameter server is ready for the trainers' requests.
+
+If the parameter server's etcd lease expires, the parameter server will kill itself.
+
+
+## Parameter Server Checkpointing
+See [here](./checkpointing.md)
+
+## Storing and dispatching training data
+See [here](./data_dispatch.md)
+
+
+## Dynamic Scaling
+
+### Trainer Scaling
+
+TODO
+
+### Parameter Server Scaling
+
+Not planned for v1.
+
+## Training Dataset Format
+
+TODO
+
+## User Interface
+
+TODO
diff --git a/doc/v2/design/cluster_train/checkpointing.md b/doc/v2/design/cluster_train/checkpointing.md
new file mode 100644
index 0000000000000000000000000000000000000000..c87ef2c7d2636208866d05456d5d44316d0bb200
--- /dev/null
+++ b/doc/v2/design/cluster_train/checkpointing.md
@@ -0,0 +1,44 @@
+## 模型参数检查点（Checkpointing）
+模型数据检查点的实现，可以有效的避免parameter server的单点或多点同时故障。模型参数检查点通过定期向磁盘上保存一份存储在parameter server内存中的模型数据的完整镜像，来保证训练过程可以从中间状态重新启动。在一个不可中断并缺少备份的训练任务中，可以通过阶段性的保存每个parameter server的数据快照（snapshot）到 ***分布式存储服务*** 达到容灾的目的，比如每隔10分钟最新的快照，并删除更早的快照。在出现单点故障时，只需要恢复这台节点，或者将这台节点迁移到另一个节点并启动即可恢复训练任务。
+
+<img src="src/checkpointing.png" width="500"/>
+
+### 快照保存的设计如下：
+
+说明：
+
+* parameter server在集群中启动后，自动挂载分布式存储目录，并把快照保存到这个目录下。
+* ***注：每个parameter server的检查点各自独立保存，暂时不考虑多个parameter server同步的保存一个特定时间点的全局检查点，因为这样做也没法保证消除随机性。***
+
+检查点保存程序流程：
+
+1. 如果满足条件"每隔10分钟"时，parameter server会获取parameters内存的`read_lock`，启动一个新的线程开始保存检查点。如果已经正在执行保存检查点的线程，则忽略。由于对parameters的更新需要获取parameters内存的`write_lock`，所以在写入快照的过程中，parameter server会暂停参数更新并等待。
+2. parameter server生成一个UUID，向指定的目录中一个新的文件（文件名为此UUID）写入快照数据。在快照写入完成后，计算这个文件的MD5 sum。然后在etcd的`/checkpoints/[pserver_id]`中写入json内容：`{"uuid": [UUID], "md5": "MD5 sum", "timestamp": xxxx}`。
+3. 删除磁盘目录中不是当前uuid的快照文件。
+4. 释放对parameters内存的锁定，停止保存检查点的线程。
+
+这里需要用户额外注意，在您的实际环境中，训练任务的运行可能会占满trainer和parameter server之间的网络带宽，如果parameter server此时还需要通过网络访问分布式存储以保存快照，可能会造成网络拥塞，而出现阶段性的运行停滞。
+
+### 从快照恢复
+
+在parameter server第一次启动或任意时间parameter server故障后被Kubernetes重新启动，则需要回滚到上一个检查点：
+
+  1. 从etcd中读取节点：`/checkpoints/[pserver_id]`获取最新的检查点的文件uuid
+  1. 从磁盘文件中加载uuid文件名的检查点快照文件，并加载其中的参数
+  1. 如果上面两步出现错误，则使用启动参数定义的初始化方法初始化参数
+  1. 开始提供服务
+
+## TODO List
+### 推测执行/加速执行（TODO）
+在异构集群中，如果存在某些trainer执行速度过慢会影响整体集群的速度（如图中Trainer 1），此时master将负责启动一个新的Trainer（Accelerate Trainer 2），使用同样的训练数据block。哪个trainer先完成block的训练，则把另一个慢速的kill掉。
+
+### 动态扩容/缩容
+目前只考虑动态扩容trainer数量，可以减小系统复杂性。
+
+## 术语
+* model: 指深度学习训练之后得到的所有参数，使用这个神经网络可以完成对新数据的预测
+* parameters: 神经网络中的参数，包括权重w和偏置b。一个神经网络的模型由大量的参数组成
+* shard: 分片，通常指将一个整体拆分成多份的其中的一份。
+* model shard: 将一个神经网络参数拆分成多份，每个shard分别存储在其中一台parameter server之上
+* parameter block: 多个parameter block构成一个model shard
+* 单点故障: 任意时刻只可能同时有一台服务器故障。由于集群中同时存在两台机器故障的概率极低（（平均故障率*平均故障修复时间）^2）只对特殊在线系统考虑两台以上同时故障的容灾。
diff --git a/doc/v2/design/cluster_train/data_dispatch.md b/doc/v2/design/cluster_train/data_dispatch.md
new file mode 100644
index 0000000000000000000000000000000000000000..1f5d22ff5e6abcb576d16cbe7391da1967a1ab8e
--- /dev/null
+++ b/doc/v2/design/cluster_train/data_dispatch.md
@@ -0,0 +1,160 @@
+## 训练数据的存储和分发
+
+### 概念解释
+
+### 流程介绍
+生产环境中的训练数据集通常体积很大，并被存储在诸如Hadoop HDFS，Ceph，AWS S3之类的分布式存储之上。这些分布式存储服务通常会把数据切割成多个分片分布式的存储在多个节点之上。这样就可以在云端执行多种数据类计算任务，包括：
+
+* 数据预处理任务
+* Paddle训练任务
+* 在线模型预测服务
+<div style="align: center">
+<img src="src/paddle-cloud-in-data-center.png" width="800"/>
+</div>
+
+在上图中显示了在一个实际生产环境中的应用（人脸识别）的数据流图。生产环境的日志数据会通过实时流的方式（Kafka）和离线数据的方式（HDFS）存储，并在集群中运行多个分布式数据处理任务，比如流式数据处理（online data process），离线批处理（offline data process）完成数据的预处理，提供给paddle作为训练数据。用户也可以上传labeled data到分布式存储补充训练数据。在paddle之上运行的深度学习训练输出的模型会提供给在线人脸识别的应用使用。
+
+### 训练数据存储
+我们选择[CephFS](http://docs.ceph.com/docs/master/cephfs/)作为存储系统。
+
+- 无论是从[PFSClient](../file_manager/README.md)的角度，还是从[Pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/)中运行任务的角度，统一用`/pfs/$DATACENTER/home/$USER`来访问用户自己的数据。  
+- `/pfs/$DATACENTER/common`下存放公共数据集合
+	- 做只读挂载 
+
+<div style="align: center">
+<img src="src/file_storage.png" width="700" align=center/>
+</div>
+
+### 文件预处理
+
+
+在开始训练之前, 数据集需要预先被转换成PaddlePaddle分布式训练使用的存储格式[RecordIO](https://github.com/PaddlePaddle/Paddle/issues/1947)。我们提供两个转换方式：
+
+1. 用户在本地转换好再上传
+1. 用户上传数据后，在机群上运行转换程序
+
+转换生成的文件名会是以下格式：
+
+```text
+name_prefix-aaaaa-of-bbbbb
+```
+
+"aaaaa"和"bbbbb"都是五位的数字，每一个文件是数据集的一个shard，"aaaaa"代表shard的index，"bbbbb"代表这个shard的最大index。
+
+比如ImageNet这个数据集可能被分成1000个shard，它们的文件名是：
+```text
+imagenet-00000-of-00999
+imagenet-00001-of-00999
+...
+imagenet-00999-of-00999
+```
+
+#### 转换库
+
+无论是在本地或是云端转换，我们都提供Python的转换库，接口是：
+```python
+def convert(output_path, reader, num_shards, name_prefix)
+```
+
+- `output_path`: directory in which output files will be saved.
+- `reader`: a [data reader](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#data-reader-interface), from which the convert program will read data instances.
+- `num_shards`: the number of shards that the dataset will be partitioned into.
+- `name_prefix`: the name prefix of generated files.
+
+`reader`每次输出一个data instance，这个instance可以是单个值，或者用tuple表示的多个值：
+
+```python
+yield 1 # 单个值
+yield numpy.random.uniform(-1, 1, size=28*28) # 单个值
+yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值
+```
+
+每个值的类型可以是整形、浮点型数据、字符串，或者由它们组成的list，以及numpy.ndarray。如果是其它类型，会被Pickle序列化成字符串。
+
+### 示例程序
+
+#### 使用转换库
+
+以下`reader_creator`生成的`reader`每次输出一个data instance，每个data instance包含两个值：numpy.ndarray类型的值和整型的值：
+```python
+def reader_creator():
+	def reader():
+		for i in range(1000):
+			yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值
+	return reader
+```
+
+把`reader_creator`生成的`reader`传入`convert`函数即可完成转换：
+```python
+convert("./", reader_creator(), 100, random_images)
+```
+
+以上命令会在当前目录下生成100个文件：
+```text
+random_images-00000-of-00099
+random_images-00001-of-00099
+...
+random_images-00099-of-00099
+```
+
+#### 进行训练
+
+
+PaddlePaddle提供专用的[data reader creator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#python-data-reader-design-doc)，生成给定`RecordIO`文件对应的data reader。**无论在本地还是在云端，reader的使用方式都是一致的**：
+
+```python
+# ...
+reader = paddle.reader.creator.RecordIO("/pfs/datacenter_name/home/user_name/random_images-*-of-*")
+batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128)
+trainer.train(batch_reader, ...)
+```
+
+以上代码的reader输出的data instance与生成数据集时，reader输出的data instance是一模一样的。
+
+### 上传训练文件
+
+使用下面命令，可以把本地的数据上传到存储集群中。
+
+```bash  
+paddle pfs cp filename /pfs/$DATACENTER/home/$USER/folder/
+```
+
+比如，把之前示例中转换完毕的random_images数据集上传到云端的`/home/`可以用以下指令：
+
+```bash  
+paddle pfs cp random_images-*-of-* /pfs/$DATACENTER/home/$USER/folder/
+```
+
+需要`$DATACENTER`的配置写到配置文件中，例如
+
+```
+# config file
+[datacenter_1]
+username=user
+usercert=user.pem
+userkey=user-key.pem
+endpoint=datacenter1.paddlepaddle.org
+
+[datacenter_2]
+username=user
+usercert=user.pem
+userkey=user-key.pem
+endpoint=datacenter2.paddlepaddle.org
+```
+## TODO
+### 文件访问的权限
+控制用户权限  
+
+- 用户可以把自己的数据分享给别人
+
+### 文件访问方式
+不用mount的方式来访问数据，而是直接用API的接口远程访问
+
+例如：  
+
+```
+f = open('/pfs/datacenter_name/home/user_name/test1.dat')
+```
+
+
+### 支持用户自定义的数据预处理job
diff --git a/doc/v2/design/cluster_train/large_model_dist_train.md b/doc/v2/design/cluster_train/large_model_dist_train.md
new file mode 100644
index 0000000000000000000000000000000000000000..edb0245ea083e791b7f32ac57a330698299fceda
--- /dev/null
+++ b/doc/v2/design/cluster_train/large_model_dist_train.md
@@ -0,0 +1,101 @@
+# Analysis of large model distributed training in Paddle
+
+***NOTE: These are only some notes on how we implemented this scheme in V1, not a new design.***
+
+## What is it
+
+We often encounter cases where the embedding layer parameters (sparse) are so large that we cannot store them in the trainer's memory when training. So we need to put them on several servers, and fetch them row by row instead of fetching all of the parameters.
+
+## How to use
+
+Specify command-line argument like  `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1  --use_old_updater=1` when starting the paddle trainer. And also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting pserver processes.
+
+Accordingly, configure your embedding layers like:
+
+```python
+SPARSE_REMOTE=True
+
+w1 = data_layer(name="w1", size=dict_size)
+emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
+w2 = data_layer(name="w2", size=dict_size)
+emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
+...
+```
+
+## Implementation details
+
+```c++
+enum MatType {
+  MAT_NORMAL,
+  MAT_NORMAL_SHARED,
+  MAT_VALUE_SHARED,
+  MAT_SPARSE_ROW_IDS,
+  MAT_SPARSE_ROW_AUTO_GROW,
+  MAT_CACHE_ROW,
+  MAT_SPARSE_ROW,
+  MAT_SPARSE_ROW_PREFETCH,
+  MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
+};
+```
+
+`MAT_SPARSE_ROW_PREFETCH` is what we use when configured to fetch only row of matrix when training.
+
+In `trainer_internal.cpp:L93 trainOneBatch`:
+
+```c++
+  if (config_->getOptConfig().use_sparse_remote_updater()) {
+    REGISTER_TIMER("prefetch");
+    gradientMachine_->prefetch(inArgs);
+    parameterUpdater_->getParametersRemote();
+  }
+```
+
+When doing actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from pserver.
+
+In `legacy/trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
+
+```c++
+if (fullSize) {
+    ...
+} else {
+getParams = [&] {
+    parameterClient_->getParameterSparse(
+        /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
+};
+applyL1 = [](Parameter& para, real decayRate) {
+    para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
+};
+}
+```
+
+Calling `parameterClient_->getParameterSparse` will do remote call to pserver's `getParameterSparse`:
+
+```c++
+void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
+                                          std::vector<Buffer>& inputBuffers,
+                                          SendParameterResponse* response,
+                                          std::vector<Buffer>* outputBuffers) {
+  (void)inputBuffers;
+  auto& buffer = *readWriteBuffer_;
+  size_t numReals = 0;
+  for (const auto& block : request.blocks()) {
+    numReals += getParameterConfig(block).dims(1);
+  }
+  buffer.resize(numReals);
+
+  VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
+
+  ReadLockGuard guard(parameterMutex_);
+  size_t offset = 0;
+  for (const auto& block : request.blocks()) {
+    size_t width = getParameterConfig(block).dims(1);
+    Buffer buf = {buffer.data() + offset, width};
+    int type = request.send_back_parameter_type();
+    sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
+    offset += width;
+  }
+}
+```
+
+`getParameterConfig(block).dims(1)` returns the width of the current "parameter block"(a shard of parameter object),
+then `getParameterSparse` remote call returns only one row of data to the client.
diff --git a/doc/v2/design/cluster_train/master_server.md b/doc/v2/design/cluster_train/master_server.md
new file mode 100644
index 0000000000000000000000000000000000000000..4bf3c506f101361875043f8bfd97972b8c981a22
--- /dev/null
+++ b/doc/v2/design/cluster_train/master_server.md
@@ -0,0 +1,91 @@
+# Design Doc: Master Server
+
+For an overview of master server's role, please refer to [distributed training design doc](./README.md). In this design doc we will discuss the master server in more details. The master will be implemented in [Go](https://golang.org/).
+
+## Dataset
+
+<img src="src/dataset.png"/>
+
+A dataset is a list of files in *RecordIO* format. A RecordIO file consists of chunks, and each chunk consists of some records.
+
+## Task Queue
+
+As mentioned in [distributed training design doc](./README.md), a *task* is a data shard that the master server assigns to the trainer process to train on. A task consists of one or multiple *chunks* from one or multiple files. The master server maintains *task queues* to track the training progress.
+
+### Task Queue Creation
+
+1. Each trainer will make an RPC call (using Go's [rpc](https://golang.org/pkg/net/rpc/) package) to the master server, telling it the RecordIO files representing the dataset specified by the user. Since every trainer will tell the master server the same dataset, only the first RPC call will be honored.
+
+	The RPC interface is:
+	```go
+	func (m *RPCServer) ReportDataset(Paths []string, dummy *int) error {
+	}
+	```
+1. The master server will scan through each RecordIO file to generate the *chunk index* and know how many chunks each file has. A chunk can be referenced by the file path and the index of the chunk within the file. The chunk index is an in-memory data structure that enables fast access to each chunk, and the index of the chunk within the file is an integer starting from 0, representing the n-th chunk within the file.
+
+	The definition of the chunk is:
+	```go
+	type Chunk struct {
+		Idx   int // index of the chunk within the file
+		Path  string
+		Index recordio.Index // chunk index
+	}
+	```
+1. Chunks are grouped into tasks, and tasks are filled into the todo queue. The pending queue and the done queue are initialized with no element.
+
+	The definition of the task is:
+	```go
+	type Task struct {
+		Index  int
+		Chunks []Chunk
+	}
+	```
+
+	The elements in the task queues are of type `TaskEntry`, containing a timeout counter (described in [task retry logic](#task-retry-logic)), and a task:
+	```go
+	type TaskEntry struct {
+		NumTimeout int
+		Task       Task
+	}
+	```
+
+	The definition of task queues is:
+	```go
+	type TaskQueues struct {
+		Todo    []TaskEntry
+		Pending map[int]TaskEntry // map from task index to task entry
+		Done    []TaskEntry
+	}
+	```
+
+### Task Queue Persistence
+
+The task queues need to be persisted on [etcd](https://github.com/coreos/etcd) for fault recovery. Since the task queues only change once a task is completed or timed out, which is not very frequent, we can afford to synchronize with etcd every time the task queues change.
+
+We will serialize the task queues data structure with [gob encoding](https://golang.org/pkg/encoding/gob/), compress with gzip, and save into etcd synchronously under key `/task_queues`.
+
+### Task Dispatch
+
+The trainer will make an RPC call to master to get a new task when:
+
+- the trainer first started, or
+- the trainer finishes a task.
+
+The RPC interface is:
+```go
+func (m *RPCServer) GetTask(finished *Task, result *Task) error {
+}
+```
+Argument `finished` will be `nil` when the trainer is just started.
+
+During the RPC call the master will do the following:
+
+- Make a copy of the task queues, and update the copy reflecting the finished tasks and the new pending tasks.
+- Synchronize the copy of task queues with etcd using a transaction conditioned on holding the master lock.
+- Replace the task queues with the copy and report to the trainer with the new tasks if succeeded, or discard the copy and report the error to the trainer if failed.
+
+### Task Retry Logic
+
+When a task is dispatched to the trainer, the master will schedule a function for execution after the timeout duration (based on the moving average of task completion time). If the task entry is still in the pending queue, its timeout counter will increase by one, and the task will be moved to the todo queue. If the timeout counter is above the threshold, the master will log the error and discard the task.
+
+Please note that since a timed out task could be completed after it has been dispatched for retry, it is possible for a task to be processed multiple times. We do not try to prevent it from happening since it's fine to train on the same task multiple times due to the stochastic nature of the stochastic gradient descent algorithm.
diff --git a/doc/v2/design/cluster_train/pserver_client.md b/doc/v2/design/cluster_train/pserver_client.md
new file mode 100644
index 0000000000000000000000000000000000000000..474b8c572cd92fc87e9f7f3f2b19d12cccd158de
--- /dev/null
+++ b/doc/v2/design/cluster_train/pserver_client.md
@@ -0,0 +1,171 @@
+# Design Doc: The Client Library of Parameter Server
+
+For an overview of trainer's role, please refer to [distributed training design doc](README.md). In this design doc, we will discuss the parameter server's client library, which will manage communication with parameter servers. The library will be implemented in [Go](https://golang.org/) and made available as a static or dynamic library with a C header file.
+
+## Parameter Partition
+
+Each parameter will be partitioned into parameter blocks to make the parameters evenly distributed on parameter servers. The partition is done automatically by the client library. The *sparse parameter* requires slightly different treatment:
+
+### Sparse Parameter
+
+The sparse parameter is a parameter that is updated sparsely. The name is somewhat misleading: it does not have a sparse representation; it has the same representation as a dense vector.
+
+Because a sparse parameter is updated sparsely, the trainer will have to partition the sparse parameter. Because the parameter server will merge all sparse parameter shards into the same file when saving the parameter, the shards need a special naming convention:
+
+If a sparse parameter is partitioned into n shards, they should be named as:
+
+```text
+name:sparse-0
+name:sparse-1
+...
+name:sparse-n-1
+```
+
+The library is unaware of the partition, and treats each parameter independently. Only when saving parameters will the parameter servers merge the sparse parameters according to the naming convention.
+
+## Model Optimization Using Gradients
+
+There are two ways to perform model optimization using gradients:
+
+- On Client
+
+  The client does multiple steps of forward and backward update. In each step, the gradients are calculated and a new model is generated. After some steps, the client will calculate the difference between the newest model and the old model at step 0. The difference will be updated to parameter servers. Parameter servers will just update parameters using the difference without any optimization using gradients (such as Adam and L1 regularization).
+
+- On Parameter Server
+
+  The client will send accumulated gradients to parameter servers, the parameter server will do the optimization using gradients.
+
+## L1 and L2 Regularization
+
+PaddlePaddle allows L1 or L2 regularizations to be specified per parameter, so when the trainer initializes the parameter it needs to include a parameter configuration when L1 or L2 regularization is necessary.
+
+## Parameter Initialization
+
+The parameters on parameter servers need to be initialized. To provide maximum flexibility, the trainer will initialize the parameters. Only one trainer will do the initialization, the other trainers will wait for the completion of initialization and get the parameters from the parameter servers.
+
+### Trainer Selection
+
+To select the trainer for initialization, every trainer will try to get a distributed lock, whoever owns the lock will do the initialization. As illustrated below:
+
+<img src="./src/init_lock.png">
+
+### Trainer Selection Process
+
+The trainer selection process is encapsulated in the C API function:
+```c
+int paddle_begin_init_params(paddle_pserver_client* client, const char* config_proto);
+```
+The selected trainer's call to `paddle_begin_init_params` will return with 1, and the other trainers' call to `paddle_begin_init_params` will return 0. `paddle_get_params` will be blocked until initialization is completed. As illustrated below:
+
+<img src="./src/pserver_init.png">
+
+## C Interface
+
+```c
+typedef enum {
+  PADDLE_ELEMENT_TYPE_INT32   = 0,
+  PADDLE_ELEMENT_TYPE_UINT32  = 1,
+  PADDLE_ELEMENT_TYPE_INT64   = 2,
+  PADDLE_ELEMENT_TYPE_UINT64  = 3,
+  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
+  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
+} paddle_element_type;
+
+typedef struct {
+  char*               name;
+  paddle_element_type element_type;
+  unsigned char*      content;
+  int                 content_len;
+} paddle_parameter, paddle_gradient;
+
+typedef int paddle_pserver_client;
+
+/**
+ * @brief creates a pserver client that talks to etcd for coordination.
+ */
+paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr);
+
+/**
+ * @brief creates a pserver client given pserver addresses.
+ *
+ * @param pserver_addrs comma-separated pserver addresses.
+ * @param selected if current pserver client is selected to initialize all parameter servers.
+ */
+paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected);
+void paddle_pserver_client_release(paddle_pserver_client c);
+
+/**
+ * @brief paddle_begin_init_params begins to initialize parameters on
+ * parameter servers.
+ *
+ * paddle_begin_init_params will be called from multiple trainers,
+ * only one trainer will be selected to initialize the parameters on
+ * parameter servers. Other trainers need to get the initialized
+ * parameters from parameter servers using @paddle_get_params.
+ *
+ * @return 1 if the trainer is selected to initialize parameter
+ * servers, otherwise 0.
+ */
+int paddle_begin_init_params(paddle_pserver_client client);
+
+/**
+ * @brief paddle_init_param initializes the parameter on parameter
+ * servers.
+ *
+ * @param param the parameter to initialize.
+ * @param param_config_proto the configuration for the parameter.
+ * @param config_len the length of param_config_proto
+ * @return 0 if successful, otherwise -1. On failure, the trainer
+ * needs to restart the entire initialization process (starting from
+ * @paddle_begin_init_params). Or simply exit the program and wait for
+ * the cluster management system to restart the trainer.
+ */
+int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
+
+/**
+ * @brief paddle_finish_init_params tells parameter servers client has
+ * sent all parameters to parameter servers as initialization.
+ *
+ * @return 0 if successful, otherwise -1. On failure, the trainer
+ * needs to restart the entire initialization process (starting from
+ * @paddle_begin_init_params). Or simply exit the program and wait for
+ * the cluster management system to restart the trainer.
+ */
+int paddle_finish_init_params(paddle_pserver_client client);
+
+/**
+ * @brief paddle_send_grads sends gradients to parameter servers for
+ * updating parameters.
+ *
+ * @param grads the array of gradients to send.
+ * @param len the length of the gradient array.
+ * @param learning_rate the learning rate for the gradients.
+ * @return 0 if successful, otherwise -1.
+ */
+int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len);
+
+/**
+ * @brief paddle_get_params gets parameters from parameter servers.
+ *
+ * paddle_get_params will block until parameters are initialized on
+ * the parameter servers.
+ *
+ * @param dst the destination array of parameter pointers to save to.
+ * The parameter pointer must be pre-populated with required parameter name,
+ * and the content of parameter must be pre-allocated of the size of required
+ * parameter on pserver.
+ * @param len the length of the names array and the paddle_parameter
+ * array.
+ * @return 0 if successful, otherwise -1.
+ */
+int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len);
+
+/**
+ * @brief paddle_save_model instructs the parameter servers to save the
+ * parameters to the given path.
+ *
+ * @param path the path to save parameters.
+ * @return 0 if successful, otherwise -1.
+ */
+int paddle_save_model(paddle_pserver_client client, const char* path);
+```
diff --git a/doc/v2/design/cluster_train/remote_parameter_updater.md b/doc/v2/design/cluster_train/remote_parameter_updater.md
new file mode 100644
index 0000000000000000000000000000000000000000..6e8e5938455b869e0f3367794c41250340b37f77
--- /dev/null
+++ b/doc/v2/design/cluster_train/remote_parameter_updater.md
@@ -0,0 +1,21 @@
+# Design Doc: Remote Parameter Updater for Cluster Train
+
+For an overview of distributed training, please refer to [distributed training design doc](README.md). In this design doc, we will discuss the parameter updater that will use parameter server cclient [The Client Library of Parameter Server Design Doc](pserver_client.md) to manage and update parameters.
+
+## Parameter Updater
+
+Parameter Updater is used by the trainer to manage and update parameters. There are mainly two kinds of parameter updater: local and remote. Since this design is for cluster train, we will only discuss the remote parameter updater here.
+
+### Remote Parameter Updater
+
+Remote Parameter Updater manages parameters through remote parameter servers with the client that communicates with pserver ([The Client Library of Parameter Server Design Doc](pserver_client.md)).
+
+In PaddlePaddle Python V2 API, trainer is implemented in python, and the trainer will hold an instance of parameter updater and call its functions directly. In this design, we will also expose the api of RemoteParameterUpdater to python with swig.
+
+#### Sparse Remote Parameter Updater
+
+Since we will only implement dense parameter management now, the mechanism for sparse parameter will be discussed in next stage.
+
+### Interface Design
+
+TBD
diff --git a/doc/v2/design/cluster_train/save_model.md b/doc/v2/design/cluster_train/save_model.md
new file mode 100644
index 0000000000000000000000000000000000000000..b755185c81ad617b9c85c47de0f5f65d2201c658
--- /dev/null
+++ b/doc/v2/design/cluster_train/save_model.md
@@ -0,0 +1,111 @@
+# Design Doc: Save Model
+
+## Overview
+
+The model is the output of the training process. There are two
+ways from which user can obtain a model:
+
+- Save model triggered by user code: user code asks PaddlePaddle to
+  save a model.
+- Convert model from the checkpoint: model being converted from
+  pservers' periodic checkpoint. In this way, the user can cancel a
+  job at any time, and still have a relatively fresh model (we
+  checkpoint around every 5 minutes).
+
+### Trainer Saving Model vs. Pservers Saving Model
+
+Both trainers and pservers have access to the model. So the model can
+be saved from a trainer or pservers. We need to decide where the model
+is saved from.
+
+#### Dense Update vs. Sparse Update
+
+There are two types of model update methods: dense update and sparse
+update (when the model parameter is configured to be sparse).
+
+- Dense update
+
+  Every trainer has its own full copy of the model. Every model
+  update will update the entire model.
+
+- Sparse update
+
+  The training input is sparse, and the trainer does not have the
+  entire model. It will only download the sub-model necessary related
+  to the input. When updating the model, only the sub-model related to
+  the training input is updated.
+
+
+#### Pservers Saving Model
+
+The benefit of letting pservers save model is they have the entire
+model all the time. However, since pservers are on different nodes, it
+requires a merging process to merge model shards into the same
+model. This requires the pservers to write models to a distributed
+filesystem, making the checkpoint shards visible to the merge program.
+
+#### Trainer Saving Model
+
+The benefit of letting one trainer save the model is it does not
+require a distributed filesystem. And it's reusing the same save model
+logic when training locally - except when doing sparse update, the
+trainer needs to download the entire model during the saving process.
+
+#### Conclusion
+
+Given trainer saving model does not require a distributed filesystem,
+and is an intuitive extension to trainer saving model when training
+locally, we decide to let the trainer save the model when doing
+distributed training.
+
+
+### Convert Model from Checkpoint
+
+TODO
+
+
+## Timeline
+
+We first implement trainer save the model. Converting the latest
+snapshot to a model will be a TODO for future.
+
+
+## Trainer Save Model
+
+### Trainer Election
+
+One trainer will be elected as the one to save the model. When using
+etcd, trainer ID is a randomly generated UUID, the trainer will
+contact the master server requesting to save the model, and find out
+if itself is elected. When the master server is not used, unique
+trainer IDs will be given by the administrator, the trainer whose ID
+is "0" is elected to save the model.
+
+### Model Save Path
+
+Each trainer will be given the directory to save the model. The
+elected trainer will save the model to
+`given-directory/trainerID`. Since the trainer ID is unique, this
+would prevent concurrent save to the same file when multiple trainers
+are elected to save the model when split-brain problem happens.
+
+### What Happens When Model Is Saving
+
+It takes some time to save model, we need to define what will happen
+when save model is taking place.
+
+When doing dense update, the trainer uses the local model. Pservers
+do not need to pause model update.
+
+When doing sparse update, the trainer needs to download the entire
+model while saving. To get the most accurate model, the model update
+needs to be paused before the download starts and resumed after the
+download finishes. Otherwise, the trainer gets a model that is
+"polluted": some part of the model is old, some part of the model is
+new.
+
+It's unclear that the "polluted" model will be inferior due to the
+stochastic nature of deep learning, and pausing the model update will
+add more complexity to the system. Since supporting sparse update is a
+TODO item, we defer the evaluation of whether to pause the model update
+during saving model to the future.
diff --git a/doc/v2/design/cluster_train/src/checkpointing.png b/doc/v2/design/cluster_train/src/checkpointing.png
new file mode 100644
index 0000000000000000000000000000000000000000..c221e8474f90f37e31416cbb19c9452207a0d14c
Binary files /dev/null and b/doc/v2/design/cluster_train/src/checkpointing.png differ
diff --git a/doc/v2/design/cluster_train/src/data_dispatch.png b/doc/v2/design/cluster_train/src/data_dispatch.png
new file mode 100644
index 0000000000000000000000000000000000000000..5bdcc24d6a6d193cb014f8c38b362451fded5e54
Binary files /dev/null and b/doc/v2/design/cluster_train/src/data_dispatch.png differ
diff --git a/doc/v2/design/cluster_train/src/dataset.graffle b/doc/v2/design/cluster_train/src/dataset.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..c10a423ed16a23229a9ee33d11bfc82bb59646c8
Binary files /dev/null and b/doc/v2/design/cluster_train/src/dataset.graffle differ
diff --git a/doc/v2/design/cluster_train/src/dataset.png b/doc/v2/design/cluster_train/src/dataset.png
new file mode 100644
index 0000000000000000000000000000000000000000..2fb7f1cce3b6dd21489392557826e95a9f207c34
Binary files /dev/null and b/doc/v2/design/cluster_train/src/dataset.png differ
diff --git a/doc/v2/design/cluster_train/src/file_storage.graffle b/doc/v2/design/cluster_train/src/file_storage.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..50a17e70fa255495337c529a3bf12a5c0024a5be
Binary files /dev/null and b/doc/v2/design/cluster_train/src/file_storage.graffle differ
diff --git a/doc/v2/design/cluster_train/src/file_storage.png b/doc/v2/design/cluster_train/src/file_storage.png
new file mode 100644
index 0000000000000000000000000000000000000000..fccb4e3e7e738224c7f1584326bd5f351ce799aa
Binary files /dev/null and b/doc/v2/design/cluster_train/src/file_storage.png differ
diff --git a/doc/v2/design/cluster_train/src/init_lock.graffle b/doc/v2/design/cluster_train/src/init_lock.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..fa9149f21b1311eed48ef72ec55e556559d0fc94
Binary files /dev/null and b/doc/v2/design/cluster_train/src/init_lock.graffle differ
diff --git a/doc/v2/design/cluster_train/src/init_lock.png b/doc/v2/design/cluster_train/src/init_lock.png
new file mode 100644
index 0000000000000000000000000000000000000000..92404ee6d6c0f9a7727952bae3c869ba338ecd7f
Binary files /dev/null and b/doc/v2/design/cluster_train/src/init_lock.png differ
diff --git a/doc/v2/design/cluster_train/src/paddle-cloud-in-data-center.png b/doc/v2/design/cluster_train/src/paddle-cloud-in-data-center.png
new file mode 100644
index 0000000000000000000000000000000000000000..da5d1a77562480ad1d886f5f21dbd84001d3d508
Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-cloud-in-data-center.png differ
diff --git a/doc/v2/design/cluster_train/src/paddle-etcd.graffle b/doc/v2/design/cluster_train/src/paddle-etcd.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..f973dc9b9dbf72e9bc31e2d32822916cd281f8d9
Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-etcd.graffle differ
diff --git a/doc/v2/design/cluster_train/src/paddle-etcd.png b/doc/v2/design/cluster_train/src/paddle-etcd.png
new file mode 100644
index 0000000000000000000000000000000000000000..57981ceb4b94f0f7d6dfa63f3d28c0402bf9cc31
Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-etcd.png differ
diff --git a/doc/v2/design/cluster_train/src/paddle-model-sharding.graffle b/doc/v2/design/cluster_train/src/paddle-model-sharding.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..fba30f0ca2b47f0d202a432821d95e55aac37ec8
Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-model-sharding.graffle differ
diff --git a/doc/v2/design/cluster_train/src/paddle-model-sharding.png b/doc/v2/design/cluster_train/src/paddle-model-sharding.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c3f6724ef46c6527e63a4cd8cb0b50fe0167124
Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-model-sharding.png differ
diff --git a/doc/v2/design/cluster_train/src/paddle-ps-0.png b/doc/v2/design/cluster_train/src/paddle-ps-0.png
new file mode 100644
index 0000000000000000000000000000000000000000..47ef32806f182cab003da77f1556823b3f6d1721
Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-ps-0.png differ
diff --git a/doc/v2/design/cluster_train/src/paddle-ps-1.png b/doc/v2/design/cluster_train/src/paddle-ps-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..f3125db73096c52bac6e7c60e1675552857c0774
Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-ps-1.png differ
diff --git a/doc/v2/design/cluster_train/src/paddle-ps.graffle b/doc/v2/design/cluster_train/src/paddle-ps.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..0e536ffdd91cd696008b4c01bad3cb53edebdc16
Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-ps.graffle differ
diff --git a/doc/v2/design/cluster_train/src/paddle-task-queues.graffle b/doc/v2/design/cluster_train/src/paddle-task-queues.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..4263ed8bfd2ef0e55058828bf23f2fac3595e5fd
Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-task-queues.graffle differ
diff --git a/doc/v2/design/cluster_train/src/paddle-task-queues.png b/doc/v2/design/cluster_train/src/paddle-task-queues.png
new file mode 100644
index 0000000000000000000000000000000000000000..5f980266795776752cebd0c346b85c4a75a47780
Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-task-queues.png differ
diff --git a/doc/v2/design/cluster_train/src/paddle-task-states.graffle b/doc/v2/design/cluster_train/src/paddle-task-states.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..cf1a0b9246d9386a949d2dbb8c32fe84f72eea83
Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-task-states.graffle differ
diff --git a/doc/v2/design/cluster_train/src/paddle-task-states.png b/doc/v2/design/cluster_train/src/paddle-task-states.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ae43cb66c071aee9eb90d875e2373b29af9c3e0
Binary files /dev/null and b/doc/v2/design/cluster_train/src/paddle-task-states.png differ
diff --git a/doc/v2/design/cluster_train/src/pserver_init.graffle b/doc/v2/design/cluster_train/src/pserver_init.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..5f3f1f52be8aa7f9049a8fcd6b7c93c8560c1676
Binary files /dev/null and b/doc/v2/design/cluster_train/src/pserver_init.graffle differ
diff --git a/doc/v2/design/cluster_train/src/pserver_init.png b/doc/v2/design/cluster_train/src/pserver_init.png
new file mode 100644
index 0000000000000000000000000000000000000000..dfe491ff98dd7db1c336093c80964a260df2cd90
Binary files /dev/null and b/doc/v2/design/cluster_train/src/pserver_init.png differ
diff --git a/doc/v2/design/cluster_train/src/submit-job.graffle b/doc/v2/design/cluster_train/src/submit-job.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..677cdfb6d9a32168bf71729eb841fa1ca0dd31d6
Binary files /dev/null and b/doc/v2/design/cluster_train/src/submit-job.graffle differ
diff --git a/doc/v2/design/cluster_train/src/submit-job.png b/doc/v2/design/cluster_train/src/submit-job.png
new file mode 100644
index 0000000000000000000000000000000000000000..3046a460a7ba708079e88a560debaa215a694680
Binary files /dev/null and b/doc/v2/design/cluster_train/src/submit-job.png differ
diff --git a/doc/v2/design/cluster_train/src/trainer.graffle b/doc/v2/design/cluster_train/src/trainer.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..43415ed8cf61a5acfa34f8e56b9577f338dbf254
Binary files /dev/null and b/doc/v2/design/cluster_train/src/trainer.graffle differ
diff --git a/doc/v2/design/cluster_train/src/trainer.png b/doc/v2/design/cluster_train/src/trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/v2/design/cluster_train/src/trainer.png differ
diff --git a/doc/v2/design/cluster_train/submit-job.md b/doc/v2/design/cluster_train/submit-job.md
new file mode 100644
index 0000000000000000000000000000000000000000..8377d5489dc64bd2fdc5bb4f7bc737e7b489000d
--- /dev/null
+++ b/doc/v2/design/cluster_train/submit-job.md
@@ -0,0 +1,127 @@
+# Submit a Distributed Training Job
+
+The user can submit a distributed training job with Python code, rather than with a command-line interface.
+
+## Runtime Environment On Kubernetes
+
+For a distributed training job, there are two Docker images, called the *runtime Docker image* and the *base Docker image*. The runtime Docker image is the Docker image that gets scheduled by Kubernetes to run during training. The base Docker image is for building the runtime Docker image.
+
+### Base Docker Image
+
+Usually, the base Docker image is the PaddlePaddle product Docker image, including paddle binary files and python package. And of course, users can specify any image name hosted on any docker registry to which they have access rights.
+
+### Runtime Docker Image
+
+The trainer package which the user uploads and some Python dependencies are packaged into a runtime Docker image based on the base Docker image.
+
+- Handle Python Dependencies
+
+  You need to provide requirements.txt file in your `trainer-package` folder. Example:
+
+  ```txt
+  pillow
+  protobuf==3.1.0
+  ```
+  More [details](https://pip.readthedocs.io/en/1.1/requirements.html) about requirements, an example project looks like:
+  ```bash
+    paddle_example
+      |-quick_start
+        |-trainer.py
+        |-dataset.py
+        |-requirements.txt
+  ```
+
+## Submit Distributed Training Job With Python Code
+<img src="./src/submit-job.png" width="800">
+
+- `paddle.job.dist_train()` will call the Job Server API `/v1/packages` to upload the trainer package and save them on CephFS, and then call `/v1/trainer/job` to submit the PaddlePaddle distributed job.
+- `/v1/trainer/job` will start a building job for preparing the runtime Docker image. When the building job is finished, Job Server will submit the PaddlePaddle distributed job to Kubernetes.
+- *NOTE*: For the first version, we will not prepare the runtime Docker image, instead, the package is uploaded to Paddle Cloud, and Paddle Cloud will mount the package in a temporary folder into the base Docker image. We will not support custom Python dependencies in the first version as well.
+
+You can call `paddle.job.dist_train` and provide distributed training configuration as the parameters:
+```python
+paddle.job.dist_train(
+  trainer=dist_trainer(),
+  paddle_job=PaddleJob(
+    job_name = "paddle-cloud",
+    entry_point = "python %s"%__file__,
+    trainer_package = "/example/word2vec",
+    image = "yancey1989/paddle-job",
+    trainers = 10,
+    pservers = 3,
+    trainer_cpu = 1,
+    trainer_gpu = 1,
+    trainer_mem = "10G",
+    pserver_cpu = 1,
+    pserver_mem = "2G"
+  ))
+```
+
+The parameter `trainer` of `paddle.job.dist_train` is a function and you can implement it as follows:
+```python
+def dist_trainer():
+  def trainer_creator():
+    trainer = paddle.v2.trainer.SGD(...)
+    trainer.train(...)
+  return trainer_creator
+```
+
+The pseudo code of `paddle.job.dist_train` is as follows:
+```python
+def dist_train(trainer, paddle_job):
+  # if the code is running on cloud, set RUNNING_ON_CLOUD=YES
+  if os.getenv("RUNNING_ON_CLOUD", "NO") == "NO":
+    #submit the paddle job
+    paddle_job.submit()
+  else:
+    #start the training
+    trainer()
+```
+### PaddleJob Parameters
+parameter | type | explanation
+ --- | --- | ---
+job_name | str | the unique name for the training job
+entry_point | str | entry point for startup trainer process
+trainer_package | str | trainer package file path to which the user has access rights
+image|str|the [base image](#base-docker-image) for building the [runtime image](#runtime-docker-image)
+pservers|int| Parameter Server process count
+trainers|int| Trainer process count
+pserver_cpu|int| CPU count for each Parameter Server process
+pserver_mem|str| memory allocated for each Parameter Server process, a plain integer using one of these suffixes: E, P, T, G, M, K
+trainer_cpu|int| CPU count for each Trainer process
+trainer_mem|str| memory allocated for each Trainer process, a plain integer using one of these suffixes: E, P, T, G, M, K
+trainer_gpu|int| GPU count for each Trainer process, if you only want CPU, do not set this parameter
+
+### Deploy Parameter Server, Trainer and Master Process
+  - Deploy PaddlePaddle Parameter Server processes, it's a Kubernetes ReplicaSet.
+  - Deploy PaddlePaddle Trainer processes, it's a Kubernetes Job.
+  - Deploy PaddlePaddle Master processes, it's a Kubernetes ReplicaSet.
+
+## Job Server
+
+- RESTful API
+
+  Job server provides RESTful HTTP API for receiving the trainer package and displaying
+  PaddlePaddle job related information.
+  - `POST   /v1/package` receive the trainer package and save them on CephFS
+  - `POST   /v1/trainer/job` submit a trainer job
+  - `GET    /v1/jobs/` list all jobs
+  - `GET    /v1/jobs/<job-name>` the status of a job
+  - `DELETE /v1/jobs/<job-name>` delete a job
+  - `GET    /v1/version` job server version
+
+- Build Runtime Docker Image on Kubernetes
+
+  `paddle.job.dist_train` will upload the trainer package to Job Server, save them on the distributed filesystem, and then start up a job for building the runtime Docker image that gets scheduled by Kubernetes to run during training.
+
+  There are some benefits for building runtime Docker image on JobServer:
+  - On Paddle Cloud, users will run the trainer code in a Jupyter Notebook which is a Kubernetes Pod, if we want to execute `docker build` in the Pod, we should mount the host's `docker.sock` to the Pod, user's code will connect the host's Docker Engine directly, it's not safe.
+  - Users only need to upload the training package files, and do not need to install docker engine, docker registry as dependencies.
+  - If we want to change to another image type, such as RKT, users do not need to care about it.
+
+- Deploy Parameter Server, Trainer and Master Processes
+
+  `POST /v1/trainer/job` receives the distributed training parameters, and deploys the job as follows:
+  - Deploy PaddlePaddle Parameter Server processes, it's a Kubernetes ReplicaSet.
+  - Deploy PaddlePaddle Trainer processes, it's a Kubernetes Job.
+  - Deploy PaddlePaddle Master processes, it's a Kubernetes ReplicaSet.
diff --git a/doc/v2/design/interface/00.why_plain_c.md b/doc/v2/design/interface/00.why_plain_c.md
new file mode 100644
index 0000000000000000000000000000000000000000..826ff3141bc2512b525cb44ac0f18b376ce57e92
--- /dev/null
+++ b/doc/v2/design/interface/00.why_plain_c.md
@@ -0,0 +1,118 @@
+# Paddle多语言接口实现
+## 背景
+
+Paddle需要一个多语言接口，这个接口需要做到:
+
+* 有标准的，良好的文档
+    * 例如Python可以使用[Sphinx](http://www.sphinx-doc.org/en/stable/)生成API文档，golang可以使用[GoDoc](https://godoc.org/golang.org/x/tools/cmd/godoc)生成文档。这都需要这个接口按照约定俗成的规则来注释完备。
+* 不同语言的接口适应不同语言的特性
+    * 例如Java与Python的错误处理是直接扔出来Exception，而对于golang错误处理应该使用返回值。
+
+## 基本要求
+
+Paddle的多语言接口实现包括以下几个方面:
+
+* 我们使用动态库来分发Paddle。在这个动态库中不嵌入任何其他语言的解释器，也不使用其他动态库。
+* 这个动态库使用C99标准的头文件导出一些函数，不使用/导出C++符号。
+* 不导出Paddle内部的结构体、类，仅仅使用`void*`指针作为类型的句柄(handler)。
+* 不使用SWIG这种代码生成器，而是手写多语言绑定。
+
+
+## 原因
+
+### 使用动态库来分发Paddle
+
+* Paddle的链接方式比较复杂
+    * 如果用户要把Paddle的静态库（libpaddle.a）链接到自己的程序里，得使用 `--whole-archive` (for GCC) 或者 `--force_load` (for Clang) 参数，来确保把 libpaddle.a 里所有的符号都写入自己的程序的二进制文件里。这是因为 Paddle 的源码里使用了[object factory design pattern](http://stackoverflow.com/a/1310326/724872)。
+* 编译型语言，例如C/C++使用静态库和动态库难度差不多。但是解释性语言，例如[Python](http://stackoverflow.com/questions/19560594/how-to-import-static-library-in-python)或者[Java](http://stackoverflow.com/questions/24493337/linking-static-library-with-jni)，只能调用Paddle的动态库，否则得把Paddle静态库链接到解释器里。
+    * 解释性语言实际运行的二进制是解释器本身，如果调用静态库只能将静态库与解释器链接。例如对于Java来说，便是将静态库加入JVM中。这对于通常的Java的开发者来说，是不常见的做法。
+
+### 动态库中不嵌入任何其他语言的解释器
+
+* 目前Paddle的进程模型是C++内部驱动Python解释器进行模型配置解析和数据读取
+* 我们最终的动态库中不嵌入Python或者其他任何语言的解释器。模型配置解析，数据读取均交由其他语言完成
+
+现阶段Paddle有一个问题是，Paddle内嵌的Python解释器和外部使用的Python如果版本不同，会直接报错退出。
+
+### Paddle动态库中，不引用其他动态库
+
+* 即这个动态库是不依赖于其他任何文件的，可以在任何机器上执行的。
+
+###  这个动态库使用C99标准的头文件导出一些函数，不使用/导出C++符号
+
+* 由于C++编译器没有[名字修饰](https://en.wikipedia.org/wiki/Name_mangling#C.2B.2B)的规范，不同版本的编译器之间，对于同一段C++代码生成的符号可能不一致。而多语言接口需要直接读取生成的二进制(动态库)，需要有稳定的导出符号。
+* C语言是有导出符号的标准的，并且在常见的平台上，都是ABI调用标准的。
+* 大多数语言都支持使用C语言API
+* 使用C99而不使用C89，是因为C99支持[Fixed-width integer types](https://en.wikipedia.org/wiki/C_data_types#Fixed-width_integer_types)和[Boolean type](https://en.wikipedia.org/wiki/C_data_types#Boolean_type)。
+* 使用C99而不使用C11的原因是，[C11](https://en.wikipedia.org/wiki/C11_(C_standard_revision))并没有Paddle特别需要的特性，且C99相对于C11使用更加广泛。
+
+### 不导出Paddle内部的结构体、类，仅仅使用`void*`指针作为类型的句柄(handler)
+
+* Paddle内部的类为C++书写，直接导出到C的接口比较困难。
+* 在C-API中使用`void*`来表示Paddle内部类。再在每一个API中自己检查类型。
+
+在C的头文件 `paddle_matrix.h` 中:
+
+```C
+typedef void* paddle_matrix;
+typedef int paddle_error;
+
+extern "C"
+paddle_error paddle_matrix_get_shape(paddle_matrix matrix,
+                                     uint64_t* width,
+                                     uint64_t* height);
+```
+而在CPP里面实现这个C的接口，文件 `paddle_matrix.cpp`
+
+```cpp
+#include "paddle/legacy/math/matrix.h"
+extern "C"
+paddle_error paddle_matrix_shape(paddle_matrix matrix,
+                                 uint64_t *width,
+                                 uint64_t *height) {
+  auto m = (paddle::capi::CMatrix*)(matrix);
+  *width = m->width();
+  *height = m->height();
+}
+```
+
+其中`paddle/capi/CMatrix.hpp`文件内容为:
+
+```cpp
+namespace paddle {
+namespace math {  
+
+class CMatrix {
+  std::shared_ptr<paddle::Matrix> mat;
+};
+
+}  // namespace math
+}  // namespace paddle
+```
+
+### 不使用SWIG这种代码生成器，而是手写多语言绑定
+
+* [SWIG](http://www.swig.org/)是一个多语言接口的代码生成器。他的目标是使用C/C++写代码，SWIG直接读取C/C++的头文件，生成各种语言的绑定代码。
+    * 对于多语言接口，SWIG需要写一个interface文件。这个文件具有独特的语法，学习成本高。且增加一个第三方语言，就需要对这个第三方语言增加一些定义。有的时候，interface文件的写法非常[tricky](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/api/Paddle.swig#L36)。社区贡献代码学习成本高。
+    * SWIG暴露的接口保留了C++的接口样式，很难保证多语言代码风格的一致性。(函数命名，错误处理)
+        * 因为SWIG在第三方语言中暴露的函数名，类名和C++中完全一致。C++的命名风格并不能适应其他第三方语言。如果使用SWIG我们需要将在interface文件里，将大量的`SomeCppClass`重命名成`some_python_class`，或者`SomeGoTypes`。
+        * 对于不同语言，错误处理的方式也不尽相同。例如对于Java或者Python，最常见的错误处理方式是Exception，而对于Golang，错误处理方式是返回值。而SWIG只能简单的暴露C++接口，无法做到对于各种语言错误处理方式的适配。
+    * 对于大多数语言，直接使用C语言的.h并不困难。例如Python的[cffi](https://cffi.readthedocs.io/en/latest/overview.html#simple-example-abi-level-in-line)或者[Cython](http://cython.org/), golang的[cgo](https://golang.org/cmd/cgo/)。
+    * SWIG支持的语言或者解释器有局限。例如对于Python，使用SWIG只支持CPython解释器，而不支持PyPy解释器。
+
+
+## 原因列表
+
+| 结论 | 对比 | 原因 |
+|---| --- | --- |
+| 使用动态库 | 不使用静态库 | 解释型语言只能调用动态库，Paddle静态库链接复杂 |
+| 不嵌入其他语言解释器 | 不嵌入Python解释器 | Paddle C++目前嵌入Python解释器，会导致不同版本Python在一个进程里的bug |
+| 不引用其他动态库 | | Paddle一个动态库可以在任何Linux系统上运行 |
+| 使用C99做接口 | 不使用C++做接口 | C有标准的ABI，C99是目前C最广泛的使用标准，且C99支持bool类型和定长整数(uint64_t等)类型 |
+| 使用void*作为类句柄 | 不显式地写每个类具体包含什么| 实现简单，并且让接口脱离实现细节 |
+| 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置，社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 |
+
+
+## 实现
+
+参考[Inference implementation](01.inference_implementation.md)
diff --git a/doc/v2/design/interface/01.inference_implementation.md b/doc/v2/design/interface/01.inference_implementation.md
new file mode 100644
index 0000000000000000000000000000000000000000..9820284523246a062581f322616d196f575c9d29
--- /dev/null
+++ b/doc/v2/design/interface/01.inference_implementation.md
@@ -0,0 +1,131 @@
+# C-API 模型推断实现文档
+
+本文档描述Paddle C-API的实现细节。Paddle C-API是多语言API的基础部分。Paddle需要暴露的API很多。先实现模型推断的API，通过模型推断API的实现作为一个样例，来进行讨论。至于为什么需要C-API，请参考[Why Plain C](./00.why_plain_c.md)。
+
+## Table of Contents
+   * [C-API 模型推断实现文档](#c-api-模型推断实现文档)
+      * [暴露接口原则](#暴露接口原则)
+      * [目录结构](#目录结构)
+      * [实现方式](#实现方式)
+         * [capi.h](#capih)
+         * [具体某种类型的头文件](#具体某种类型的头文件)
+         * [capi_private.h](#capi_privateh)
+         * [具体某种类型的实现文件](#具体某种类型的实现文件)
+         * [libpaddle_capi_shared.{so, dylib}](#libpaddle_capi_sharedso-dylib)
+         * [libpaddle_capi_whole.a](#libpaddle_capi_wholea)
+         * [examples](#examples)
+      * [编译选项](#编译选项)
+
+
+## 暴露接口原则
+
+1. 所有的接口均为C接口。即使用`extern "C"`
+2. 除构造某种类型的函数(`paddle_matrix_create`等)，其他函数均返回`paddle_error`。且调用时不能抛出异常或出现运行时错误。
+3. 所有类型名为`paddle_类型名`，所有与类型相关的函数，函数名为`paddle_类型名_函数名`
+4. 如果某一个Paddle Core概念(GradientMachine/Matrix)需要被暴露到其他语言，那么
+	* 为了暴露的接口尽量简单。只暴露概念的接口，而不暴露概念的实现。即暴露`GradientMachine`或者`Matrix`但不暴露`RecurrentGradientMachine`和`CpuSparseMatrix`。
+	* 暴露这个概念的必要函数。`必要`是指完成某一个任务所需的最少函数。
+5. 不在`capi`接口层做过多封装。
+	* 如果某一个Paddle概念必须要暴露，但是又过于琐碎。不在`capi`这一层进行封装，而是直接修改Paddle Core。让Paddle核心中，这一概念不再琐碎。
+
+
+## 目录结构
+
+```text
+Paddle
+  `-- paddle
+        `-- capi
+              `-- examples  # The example project for C-API.
+              `-- tests  # unittests for C-API
+              `-- capi.h  # C-API header file.
+              `-- capi_private.h  # The shared header file between implementation sources.
+              `-- matrix.{h, cpp}
+              `-- gradient_machine.{h, cpp}
+              `-- ...
+```
+
+
+Paddle的C-API目录结构如上图表所示。这个目录中除了`capi_private.h`之外的所有头文件，均会被安装到include/paddle路径下。C-API生成的二进制文件会被安装到`lib`目录下。即，安装后的目录结构为
+
+```text
+`-- include
+      `-- paddle
+             `-- capi.h
+             `-- matrix.h
+             `-- gradient_machine.h
+             `-- ...
+`-- lib
+     `-- libpaddle_capi_shared.{so, dylib}  # On Mac, the dynamic library's file name extension is `dylib`
+     `-- libpaddle_capi_whole.a  # static library for all symbols of Paddle.
+```
+
+## 实现方式
+
+下面分别介绍某一类文件的实现方式。
+
+### capi.h
+
+`capi.h`是用户使用C-API时所唯一需要引入的头文件。在`capi.h`中，引入了类型的头文件，`matrix.h`, `gradient_machine.h`。在引入其他类型的头文件时，使用相对路径的引用方式。即`#include "matrix.h"`
+
+### 具体某种类型的头文件
+
+具体某种类型的头文件，即例如`matrix.h`，`gradient_machine.h`等。在这些头文件中，包含了某种类型的类型定义和暴露的全部函数。
+
+这个头文件不假设其他文件的引用顺序，即使用户直接引用某种类型的头文件，也不应该报错(虽然不鼓励这样)。如果某一个类型需要引用另一个类型，例如`gradient_machine`需要引用`matrix`，则直接引入另一种类型的头文件，即`#include "matrix.h"`。
+
+### capi_private.h
+
+`capi_private.h`是各个实现中共享的头文件，它主要包含了实际暴露的类型结构。在用户使用C-API时，Paddle的类型全部退化成`void *`，即`typedef void* paddle_matrix`。但，对于每种C-API暴露的类型，均是在`capi_private.h`中实现的结构体。
+
+```cpp
+struct CMatrix {
+   int type = MatrixType;
+   std::shared_ptr<paddle::Matrix> mat;
+};
+```
+
+通常，这个结构体包含两个项目。
+
+* `type`是一个类型的标志。对于每种类型，type字段均不尽相同。这样，即使C-API接受的类型全是`void *`，我们也可以确定每一个参数的类型。
+
+  ```cpp
+  void some_c_api_function(void* some_instance) {
+     int* type = (int *) some_instance;
+     switch (*type) {
+       case MatrixType:
+         CMatrix* mat = (CMatrix *) some_instance;
+         ...
+       ...
+     }
+  }
+  ```
+* 这个结构体中的另一个项目是，Paddle Core中这一类型接口的智能指针(shared_ptr)。
+	* 使用智能指针的原因是: 用户可以安全的释放某个C-API的实例，而不必在意Paddle Core是否还在使用这个实例。
+	* 例如，用户通过C-API获得了神经网络的参数实例。当用户使用完这个参数后，直接删除这个参数即可。即便Paddle Core中的模型还在使用这个参数，这个参数也不会一并删除。
+
+### 具体某种类型的实现文件
+
+具体某种类型的实现文件，即`matrix.cpp`, `gradient_machine.cpp`等文件。在这些文件中，使用C++ 11实现了C-API的接口，并且使用`extern "C"`导出这些接口。在实现过程中，对输入参数的安全性进行了必要的判断，并将C-API接口的参数转发给`Paddle Core`。
+
+### libpaddle\_capi_shared.{so, dylib}
+
+`libpaddle_capi_shared`是C-API导出的动态库。这个动态库的连接参数与Paddle的其他二进制(例如`paddle_trainer`)类似。用户可以直接使用这个动态库来引入Paddle C-API。具体使用方法为`-lpaddle_capi_shared`。
+
+### libpaddle\_capi_whole.a
+
+`libpaddle_capi_whole`是C-API导出的静态库。这个静态库包含了Paddle的全部符号。他是将`libpaddle_gserver.a`, `libpaddle_math.a`, `libpaddle_capi.a`等全部静态库中的目标文件全部打包后产生的文件。具体使用方法为`--whole-archive -lpaddle_capi_whole --no-whole-archive`。
+
+
+### examples
+
+在样例中，使用`C99`开发了模型预测的样例代码。具体请参考[example/README.md](../../../paddle/capi/examples/README.md)。
+
+## 编译选项
+
+C-API的编译选项默认关闭，打开这个编译选项，需要在cmake的时候，设置
+
+```bash
+cmake ${YOUR_SOURCE_ROOT} -DWITH_C_API=ON -DWITH_PYTHON=OFF -DWITH_SWIG_PY=OFF
+```
+
+编译C-API的时候推荐Paddle不嵌入Python解释器，也不生成`SWIG`接口，具体原因参考[Why Plain C](./00.why_plain_c.md)。
diff --git a/doc/v2/design/interface/index_cn.rst b/doc/v2/design/interface/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2509a5c5f4182d8ce3a16a3b7bd92c0d7bf5b056
--- /dev/null
+++ b/doc/v2/design/interface/index_cn.rst
@@ -0,0 +1,7 @@
+多语言接口
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  00.why_plain_c.md
diff --git a/doc/v2/design/interface/index_en.rst b/doc/v2/design/interface/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..356e58c39c5ef6ee5ee50ab999b85f88628bfb85
--- /dev/null
+++ b/doc/v2/design/interface/index_en.rst
@@ -0,0 +1,7 @@
+Multilingual Interface
+-----------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  00.why_plain_c.md
diff --git a/doc/v2/design/mkl/image/engine.png b/doc/v2/design/mkl/image/engine.png
new file mode 100644
index 0000000000000000000000000000000000000000..1f5f65c2cc765a514a3ba9e7b7f468e1dc4b0c3b
Binary files /dev/null and b/doc/v2/design/mkl/image/engine.png differ
diff --git a/doc/v2/design/mkl/image/gradients.png b/doc/v2/design/mkl/image/gradients.png
new file mode 100644
index 0000000000000000000000000000000000000000..f031bcf8e4cec14e63075b8b9d2c7bbd9f1b1a3c
Binary files /dev/null and b/doc/v2/design/mkl/image/gradients.png differ
diff --git a/doc/v2/design/mkl/image/layers.png b/doc/v2/design/mkl/image/layers.png
new file mode 100644
index 0000000000000000000000000000000000000000..306f79b7a844610915eb8944128f57d2b7a3065a
Binary files /dev/null and b/doc/v2/design/mkl/image/layers.png differ
diff --git a/doc/v2/design/mkl/image/matrix.png b/doc/v2/design/mkl/image/matrix.png
new file mode 100644
index 0000000000000000000000000000000000000000..c33ce9cf0335e47cc8c1253304d0fe179186e6f2
Binary files /dev/null and b/doc/v2/design/mkl/image/matrix.png differ
diff --git a/doc/v2/design/mkl/image/overview.png b/doc/v2/design/mkl/image/overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..8fb7bbb9dd654bf363d701d0c8cd4a557043d188
Binary files /dev/null and b/doc/v2/design/mkl/image/overview.png differ
diff --git a/doc/v2/design/mkl/mkl_packed.md b/doc/v2/design/mkl/mkl_packed.md
new file mode 100644
index 0000000000000000000000000000000000000000..0123315ad4368e68b377f66119949bfd6c1c7860
--- /dev/null
+++ b/doc/v2/design/mkl/mkl_packed.md
@@ -0,0 +1,108 @@
+# Intel® MKL Packed on PaddlePaddle: Design Doc
+
+
+## Contents
+
+- [Overview](#overview)
+- [Key Points](#key-points) 
+   - [Background](#background)
+   - [Solution](#solution)
+- [Actions](#actions)
+    - [CMake](#cmake)
+	- [Layers](#layers)
+	- [Unit Tests](#unit-tests)
+	- [Python API](#python-api)
+	- [Benchmarking](#benchmarking)
+
+
+## Overview
+我们计划将 Intel® MKL 中引入的 GEMM Packed APIs\[[1](#references)\] 集成到 PaddlePaddle 中，充分发挥英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
+现阶段的优化主要针对 Recurrent Neural Network（以下简称RNN）相关层（包括`RecurrentLayer`, `GatedRecurrentLayer`和`LstmLayer`）， 以及 PaddlePaddle V1 API。
+
+## Key Points
+
+### Background
+目前PaddlePaddle采用了 Intel® MKL库的[cblas_?gemm](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm)函数，这个函数本身会在计算前将原数据转换为更适合英特尔平台的内部格式。
+
+1. 转换耗时 \
+这一数据格式的转换操作（Packing），在问题本身的计算量比较小的时候，显得相对来说较为耗时。例如在DeepSpeech2 \[[2](#references)\] 的Vanilla RNN部分中，矩阵大小是`batch_size * 2048`。
+2. 转换冗余 \
+由于在现有的某些情况下（例如RNN），多次调用 cblas_?gemm 会使用相同的原数据，因此，每次调用时对原数据的重复Packing便成为了冗余。
+
+为了最大程度减少多次调用 cblas_?gemm 在Packing上的耗时，Intel® MKL 引入了以下四个API:
+   * [cblas_?gemm_alloc](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-alloc)
+   * [cblas_?gemm_pack](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-pack)
+   * [cblas_?gemm_compute](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-compute)
+   * [cblas_?gemm_free](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-free)
+
+通过使用这些API，我们可以先完成对原数据的Packing操作，再把已转换为Packed格式的数据传递给那些复用同一数据的gemm_compute函数，从而避免了Packing冗余。
+
+### Solution
+在RNN的情况下，同一次前向、后向（forward/backward）过程中所有时间步（time step）共享同一个权重（weight）。当只做推断（inference）时，各次前向之间也都使用了相同的权重，没有必要在每次前向中每个时间步的计算时对权重进行重复的Packing操作。
+
+我们通过使用新引入的GEMM Packed APIs，在层初始化的时候，先完成对权重的Packing操作，然后在前向，后向时复用已经转换过的权重，并在每次权重更新后，对新的权重进行转换用于下次迭代。
+
+* 优化前，对于序列长度（sequence length）为`T`的网络模型（model）, `N`次迭代执行的转换次数为：
+  - `inference`： `N * T`  
+  - `training`： `2 * N * T`
+* 优化后，对于同样设置的网络模型，其转换次数减少至：
+  - `inference`： `1`    
+  - `training`： `2 * N`
+
+## Actions
+
+添加的相关文件和目录结构如下：
+
+```txt
+PaddlePaddle/Paddle
+├── ...
+└── paddle/
+    ├── ...
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   ├── MKLPackedRecurrentLayer.*
+        │   ├── MKLPackedGatedRecurrentLayer.*
+        │   ├── MKLPackedLstmLayer.*
+        │   └── MKLPackedGemm.h
+        └── tests/
+            ├── ...
+            └── test_MKLPacked.cpp
+```
+
+### CMake
+在对应的`CMakeLists.txt`中根据`WITH_MKL`是否打开，来决定是否开启MKL Packed相关功能。
+
+### Layers
+所有的`MKLPacked*Layer`都继承于PaddlePaddle的基类`Layer`, 并添加头文件 `MKLPackedGemm.h`，该文件对相关GEMM Packed APIs做了封装。
+
+### Unit Tests
+我们会添加`test_MKLPacked.cpp`用于MKL Packed优化后layer的测试。
+对于每一个新加的RNN layer，我们会对比如下2个方面：
+1. 对比优化后layer自身，sequence mode（`rnn_use_batch=false`）与batch mode(`rnn_use_batch=true`)的结果。
+2. 对比优化后layer与相对应的PaddlePaddle原有layer, 在batch mode下的结果。
+
+### Python API
+计划在`paddle/utils.Flags`中添加`use_mkl_packed`的flag，用于选择是否使用相关功能，并且当编译时`WITH_MKL=ON`的情况下，默认设置为`true`。
+
+同时，在`python/paddle/trainer/config_parser.py`中对应的layer处，添加`use_mkl_packed`这个选择，方便用户在Python端选择是否启用这个功能。
+
+具体实现方式比如：
+
+```python
+use_mkl_packed = bool(int(g_command_config_args.get("use_mkl_packed", 0)))
+if use_mkl_packed:
+    self.layer_type = mkl_packed_*
+```
+
+所有相关的`layer_type`会以*mkl_packed_*开头，这些会在`MKLPacked*Layer`注册layer的时候保证，以示区分。 
+
+
+### Benchmarking
+会添加相应的脚本用于测试和对比在使用MKL Packed recurrent layers 前后的网络性能。
+
+## References 
+1. [Introducing the new Packed APIs for GEMM](https://software.intel.com/en-us/articles/introducing-the-new-packed-apis-for-gemm)
+2. [DeepSpeech2 on PaddlePaddle](https://github.com/PaddlePaddle/DeepSpeech#deepspeech2-on-paddlepaddle)
+
diff --git a/doc/v2/design/mkl/mkldnn.md b/doc/v2/design/mkl/mkldnn.md
new file mode 100644
index 0000000000000000000000000000000000000000..4876de0045979be20fa45bdc84d2594516f71c03
--- /dev/null
+++ b/doc/v2/design/mkl/mkldnn.md
@@ -0,0 +1,237 @@
+# Intel® MKL-DNN on PaddlePaddle: Design Doc
+
+我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn)
+(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle，
+充分展现英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/overview.png"><br/>
+Figure 1. PaddlePaddle on IA
+</div>
+
+近期目标
+
+- 完成常用Layer的MKL-DNN实现。
+- 完成常见深度神经网络VGG，GoogLeNet 和 ResNet的MKL-DNN实现。
+
+目前的优化，主要针对PaddlePaddle在重构之前的代码框架以及V1的API。
+具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。
+
+## Contents
+
+- [Overview](#overview)
+- [Actions](#actions)
+ 	- [CMake](#cmake)
+ 	- [Matrix](#matrix)
+	- [Layers](#layers)
+	- [Activations](#activations)
+	- [Parameters](#parameters)
+	- [Gradients](#gradients)
+	- [Unit Tests](#unit-tests)
+	- [Python API](#python-api)
+	- [Benchmarking](#benchmarking)
+	- [Others](#others)
+- [Design Concerns](#design-concerns)
+
+## Overview
+
+我们会把MKL-DNN作为第三方库集成进PaddlePaddle，与其他第三方库一样，会在编译PaddlePaddle的时候下载并编译MKL-DNN。
+
+同时，为了进一步提升PaddlePaddle在基本数学运算的计算速度，我们也将MKLML即(MKL small library\[[1](#references)\])
+作为另一个第三方库集成进PaddlePaddle，它只会包括生成好的动态库和头文件。
+
+MKL，MKLML以及MKL-DNN三者关系如下表：
+
+<table>
+<thead>
+<tr>
+<th>Name</th>
+<th>Open Source</th>
+<th>License</th>
+<th>Descriptions</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>MKL</td>
+<td>No</td>
+<td>Proprietary</td>
+<td>Accelerate math processing routines</td>
+</tr>
+<tr>
+<td>MKLML</td>
+<td>No</td>
+<td>Proprietary</td>
+<td>Small package of MKL, especially for Machine Learning</td>
+</tr>
+
+<tr>
+<td>MKL-DNN</td>
+<td>Yes</td>
+<td>Apache 2.0</td>
+<td>Accelerate primitives processing routines especially for Deep Neural Networks</td>
+</tr>
+
+</tbody>
+</table>
+
+MKLML可以与MKL-DNN共同使用，以此达到最好的性能。
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/engine.png"><br/>
+Figure 2. PaddlePaddle with MKL Engines
+</div>
+
+## Actions
+
+添加的相关文件和目录结构如下：
+
+```txt
+PaddlePaddle/Paddle
+├── ...
+├── cmake/
+│   ├── external/
+│   │   ├── ...
+│   │   ├── mkldnn.cmake
+│   │   └── mklml.cmake
+└── paddle/
+    ├── ...
+    ├── math/
+    │   ├── ...
+    │   └── MKLDNNMatrix.*
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   └── MKLDNN*Layer.*
+        ├── activations/
+        │   ├── ...
+        │   └── MKLDNNActivations.*
+        └── tests/
+            ├── ...
+            ├── MKLDNNTester.*
+            └── test_MKLDNN.cpp
+```
+
+### CMake
+在`CMakeLists.txt`中提供一个与MKL有关的总开关：`WITH_MKL`，它负责决定编译时是否使用MKLML和MKL-DNN
+
+- `WITH_MKLML` 控制是否使用MKLML库。
+当打开`WITH_MKL`时，会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。
+编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。
+MKLML的库目前都是动态库，主要包括`libiomp5.so`和`libmklml_intel.so`。
+- `WITH_MKLDNN` 控制是否使用MKL-DNN。
+当开启`WITH_MKL`时，会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。
+编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。
+MKL-DNN的库目前只有动态库`libmkldnn.so`。
+
+### Matrix
+目前在PaddlePaddle中数据都是以`NCHW`的格式存储，但是在MKL-DNN中的排列方式不止这一种。
+所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/matrix.png"><br/>
+Figure 3. MKLDNNMatrix
+</div>
+
+### Layers
+所有MKL-DNN的Layers都会继承于`MKLDNNLayer`，该类继承于PaddlePaddle的基类`Layer`。
+在`MKLDNNLayer`中会提供一些必要的接口和函数，并且会写好`forward`和`backward`的基本逻辑，
+子类只需要使用定义好的接口，实现具体的函数功能即可。
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/layers.png"><br/>
+Figure 4. MKLDNNLayer
+</div>
+
+每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix：
+
+- 内部存储（internal memory）：`inVal_`,`inGrad_`,`outVal_`和`outGrad_`，分别代表输入数据，输入梯度，输出数据和输出梯度。
+- 外部存储（external memory）：都是以ext开头，比如`extInVal_`和`extInGrad_`，它们主要是用于，
+当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时，转换内存的工作。
+需要注意的是，PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`，
+所以`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存，
+如果不需要外部存储用于转换，那么对应的内部存储也会与它们共享内存。
+- 转换函数（resetXXX）： 包括`resetInValue`，`resetInGrad`，`resetOutValue`和`resetOutGrad`，
+表示对输入数据，输入梯度，输出数据和输出梯度的转换。
+这些函数会根据输入参数重新设置内部和外部存储，当然这两者也可以相等，即表示不需要转换。
+
+注意：每个`MKLDNNLayer`的子类只需要使用内部存储就可以了，所有外部的转换工作都会在reset系列函数中准备好。
+
+### Activations
+在重构前的PaddlePaddle中，激活函数是独立于`Layer`的概念，并且输入输出都是共用一块内存，
+所以添加了对应的`MKLDNNActivation`来实现，方式类似于`MKLDNNLayer`。
+
+### Parameters
+对于有参数的层，我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。
+如果存在数据排列格式不一样的情况时，我们会在网络训练之前把格式转换为MKL-DNN希望的格式，
+在训练结束的时候再保存为PaddlePaddle的格式，但是整个训练过程中不需要任何转换。
+这样既使得最终保存的参数格式与PaddlePaddle一致，又可以避免不必要的转换。
+
+### Gradients
+由于MKL-DNN的操作都是直接覆盖的形式，也就是说输出的结果不会在原来的数据上累加，
+这样带来的好处就是不需要一直清空memory，节省了不必要的操作。
+但是需要注意的是，当网络出现分支且在`backward`的时候，需要累加不同Layer传过来的梯度。
+所以在`MKLDNNLayer`中实现了一个merge的方法，此时每个小分支的`Input Gradient`
+会先临时保存在`MKLDNNMatrix`中，由分支处的Layer负责求和，并把结果放到当前层的`output_.grad`中。
+所以整体上，在实现每个子类的时候就不需要关心分支的事情了。
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/gradients.png"><br/>
+Figure 5. Merge Gradients
+</div>
+
+### Unit Tests
+我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。
+测试分为每个Layer（或Activation）的单元测试和简单网络的整体测试。
+每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果，小于某个比较小的阈值认为通过。
+
+### Python API
+目前只考虑**v1 API**。
+
+计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择，方便用户选择使用MKL-DNN的layers。
+
+具体实现方式比如：
+
+```python
+use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+if use_mkldnn:
+    self.layer_type = mkldnn_*
+```
+
+所有MKL-DNN的`layer_type`会以*mkldnn_*开头，这些会在`MKLDNN*Layer`注册layer的时候保证，以示区分。
+
+同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
+
+### Benchmarking
+会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image)，用于测试和对比在使用MKL-DNN前后的CNN网络性能。
+测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md)中给出。
+
+### Others
+1. 如果在使用MKL-DNN的情况下，会把CPU的Buffer对齐为4096，具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。
+2. 深入PaddlePaddle，寻找有没有其他可以优化的可能，进一步优化。比如可能会用OpenMP改进SGD的更新性能。
+
+## Design Concerns
+
+为了更好地符合PaddlePaddle的代码风格\[[3](#references)\]，同时又尽可能少地牺牲MKL-DNN的性能\[[4](#references)\]，
+我们总结出一些特别需要注意的点：
+
+1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，
+我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MKLDNNLayer`特有的设备ID。
+2. 重写父类Layer的**init**函数，修改`deviceId_`为`-2`，代表这个layer是用于跑在MKL-DNN的环境下。
+3. 创建`MKLDNNBase`，定义一些除了layer和memory相关的类和函数。
+包括MKL-DNN会用到的`MKLDNNStream`和`CPUEngine`，以及未来可能还会用到的`FPGAEngine`等。
+4. 如果MKL-DNN layer的后面接有cpu device，那么就会使`output_.value`与`extOutVal_`共享内存，
+同时数据格式就是`NCHW`，这样下一个cpu device就能拿到正确的数据。
+在有普通的CPU layer时， `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。
+
+## References
+1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。
+主要包括了深度学习相关的数学原语与操作，一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。
+2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。
+目前在PaddlePaddle中，仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。
+3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。
+但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
+4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`，所以不存在这个问题)。
+所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
diff --git a/doc/v2/dev/contribute_to_paddle_cn.md b/doc/v2/dev/contribute_to_paddle_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..3244eedf918b93f9351258f1218dfb2d507c1a9c
--- /dev/null
+++ b/doc/v2/dev/contribute_to_paddle_cn.md
@@ -0,0 +1,243 @@
+# 如何贡献代码
+
+我们真诚地感谢您的贡献，欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
+
+## 代码要求
+- 代码注释请遵守 [Doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
+- 确保编译器选项 `WITH_STYLE_CHECK` 已打开，并且编译能通过代码样式检查。
+- 所有代码必须具有单元测试。
+- 通过所有单元测试。
+- 请遵守[提交代码的一些约定](#提交代码的一些约定)。
+
+以下教程将指导您提交代码。
+## [Fork](https://help.github.com/articles/fork-a-repo/)
+
+跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页，然后单击 `Fork` 按钮，生成自己目录下的仓库，比如 <https://github.com/USERNAME/Paddle>。
+
+## 克隆（Clone）
+
+将远程仓库 clone 到本地：
+
+```bash
+➜  git clone https://github.com/USERNAME/Paddle
+➜  cd Paddle
+```
+
+
+## 创建本地分支
+
+Paddle 目前使用[Git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发，测试，发行和维护，具体请参考 [Paddle 分支规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范)。
+
+所有的 feature 和 bug fix 的开发工作都应该在一个新的分支上完成，一般从 `develop` 分支上创建新分支。
+
+使用 `git checkout -b` 创建并切换到新分支。
+
+```bash
+➜  git checkout -b my-cool-stuff
+```
+
+值得注意的是，在 checkout 之前，需要保持当前分支目录 clean，否则会把 untracked 的文件也带到新分支上，这可以通过 `git status` 查看。
+
+## 使用 `pre-commit` 钩子
+
+Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码（C++，Python），在提交（commit）前自动检查一些基本事宜（如每个文件只有一个 EOL，Git 中不要添加大文件等）。
+
+`pre-commit`测试是 Travis-CI 中单元测试的一部分，不满足钩子的 PR 不能被提交到 Paddle，首先安装并在当前目录运行它：
+
+```bash
+➜  pip install pre-commit
+➜  pre-commit install
+```
+
+Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式，请确保 `clang-format` 版本在 3.8 以上。
+
+注：通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同，Paddle 开发人员使用的是`pip install pre-commit`。
+
+## 开始开发
+
+在本例中，我删除了 README.md 中的一行，并创建了一个新文件。
+
+通过 `git status` 查看当前状态，这会提示当前目录的一些变化，同时也可以通过 `git diff` 查看文件具体被修改的内容。
+
+```bash
+➜  git status
+On branch test
+Changes not staged for commit:
+  (use "git add <file>..." to update what will be committed)
+  (use "git checkout -- <file>..." to discard changes in working directory)
+
+	modified:   README.md
+
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+	test
+
+no changes added to commit (use "git add" and/or "git commit -a")
+```
+
+## 构建和测试
+
+编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家，我们的标准开发流程是把这些工具都装进一个Docker image，称为*开发镜像*，通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方（比如IDE配置里）都用 `docker run paddle:latest-dev`来代替。
+
+如要build这个开发镜像，在源码目录树的根目录中运行：
+
+```bash
+➜  docker build -t paddle:latest-dev .
+```
+
+随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU，但是支持AVX指令集，并且包括unit tests的PaddlePaddle，可以：
+
+```bash
+➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev
+```
+
+这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`，并且输出一个 `./build/paddle.deb`文件之外，还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*（`paddle:prod`）：
+
+```bash
+➜  docker build -t paddle:prod -f build/Dockerfile .
+```
+
+如果要运行所有的单元测试，可以用如下命令：
+
+```bash
+➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
+```
+
+关于构建和测试的更多信息，请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。
+
+## 提交（commit）
+
+接下来我们取消对 README.md 文件的改变，然后提交新添加的 test 文件。
+
+```bash
+➜  git checkout -- README.md
+➜  git status
+On branch test
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+	test
+
+nothing added to commit but untracked files present (use "git add" to track)
+➜  git add test
+```
+
+Git 每次提交代码，都需要写提交说明，这可以让其他人知道这次提交做了哪些改变，这可以通过`git commit` 完成。
+
+```bash
+➜  git commit
+CRLF end-lines remover...............................(no files to check)Skipped
+yapf.................................................(no files to check)Skipped
+Check for added large files..............................................Passed
+Check for merge conflicts................................................Passed
+Check for broken symlinks................................................Passed
+Detect Private Key...................................(no files to check)Skipped
+Fix End of Files.....................................(no files to check)Skipped
+clang-formater.......................................(no files to check)Skipped
+[my-cool-stuff c703c041] add test file
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 233
+```
+
+## 保持本地仓库最新
+
+在准备发起 Pull Request 之前，需要同步原仓库（<https://github.com/PaddlePaddle/Paddle>）最新的代码。
+
+首先通过 `git remote` 查看当前远程仓库的名字。
+
+```bash
+➜  git remote
+origin
+➜  git remote -v
+origin	https://github.com/USERNAME/Paddle (fetch)
+origin	https://github.com/USERNAME/Paddle (push)
+```
+
+这里 origin 是我们 clone 的远程仓库的名字，也就是自己用户名下的 Paddle，接下来我们创建一个原始 Paddle 仓库的远程主机，命名为 upstream。
+
+```bash
+➜  git remote add upstream https://github.com/PaddlePaddle/Paddle
+➜  git remote
+origin
+upstream
+```
+
+获取 upstream 的最新代码并更新当前分支。
+
+```bash
+➜  git fetch upstream
+➜  git pull upstream develop
+```
+
+## Push 到远程仓库
+
+将本地的修改推送到 GitHub 上，也就是 <https://github.com/USERNAME/Paddle>。
+
+```bash
+# 推送到远程仓库 origin 的 my-cool-stuff 分支上
+➜  git push origin my-cool-stuff
+```
+
+## 建立 Issue 并完成 Pull Request
+
+建立一个 Issue 描述问题，并记录它的编号。
+
+切换到所建分支，然后点击 `New pull request`。
+
+<img width="295" alt="screen shot 2017-04-26 at 9 09 28 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436054/a6d98c66-2ac4-11e7-9cb1-18dd13150230.png">
+
+选择目标分支：
+
+<img width="750" alt="screen shot 2017-04-26 at 9 11 52 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436139/f83b1e6c-2ac4-11e7-8c0e-add499023c46.png">
+
+在 PR 的描述说明中，填写 `resolve #Issue编号` 可以在这个 PR 被 merge 后，自动关闭对应的 Issue，具体请见 <https://help.github.com/articles/closing-issues-via-commit-messages/>。
+
+接下来等待 review，如果有需要修改的地方，参照上述步骤更新 origin 中的对应分支即可。
+
+## 删除远程分支
+
+在 PR 被 merge 进主仓库后，我们可以在 PR 的页面删除远程仓库的分支。
+
+<img width="775" alt="screen shot 2017-04-26 at 9 18 24 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436457/e4cdd472-2ac5-11e7-9272-badc76c4a23e.png">
+
+也可以使用 `git push origin :分支名` 删除远程分支，如：
+
+```bash
+➜  git push origin :my-cool-stuff
+```
+
+## 删除本地分支
+
+最后，删除本地分支。
+
+```bash
+# 切换到 develop 分支
+➜  git checkout develop 
+
+# 删除 my-cool-stuff 分支
+➜  git branch -D my-cool-stuff
+```
+
+至此，我们就完成了一次代码贡献的过程。
+
+## 提交代码的一些约定
+
+为了使评审人在评审代码时更好地专注于代码本身，请您每次提交代码时，遵守以下约定：
+
+1. 请保证Travis-CI 中单元测试能顺利通过。如果没过，说明提交的代码存在问题，评审人一般不做评审。
+2. 提交Pull Request前：
+   - 请注意commit的数量：
+     - 原因：如果仅仅修改一个文件但提交了十几个commit，每个commit只做了少量的修改，这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改，且不排除commit之间的修改存在相互覆盖的情况。
+     - 建议：每次提交时，保持尽量少的commit，可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit，可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。
+   - 请注意每个commit的名称：应能反映当前commit的内容，不能太随意。
+3. 如果解决了某个Issue的问题，请在该Pull Request的**第一个**评论框中加上：`fix #issue_number`，这样当该Pull Request被合并后，会自动关闭对应的Issue。关键词包括：close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved，请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
+
+此外，在回复评审人意见时，请您遵守以下约定：
+
+1. 评审人的每个意见都必须回复（这是开源社区的基本礼貌，别人帮了忙，应该说谢谢）：
+   - 对评审意见同意且按其修改完的，给个简单的`Done`即可；
+   - 对评审意见不同意的，请给出您自己的反驳理由。
+2. 如果评审意见比较多：
+   - 请给出总体的修改情况。
+   - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复，而非直接回复的方式。原因是每个回复都会发送一封邮件，会造成邮件灾难。
diff --git a/doc/v2/dev/contribute_to_paddle_en.md b/doc/v2/dev/contribute_to_paddle_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..b878f37a5b8e807e5aa346e0074a741f2f8b6cc5
--- /dev/null
+++ b/doc/v2/dev/contribute_to_paddle_en.md
@@ -0,0 +1,162 @@
+# Contribute Code
+
+You are welcome to contribute to project PaddlePaddle. To contribute to PaddlePaddle, you have to agree with the 
+[PaddlePaddle Contributor License Agreement](https://gist.github.com/wangkuiyi/0c22c7b1bd3bb7eb27d76f85c3a3e329).
+
+We sincerely appreciate your contribution.  This document explains our workflow and work style.
+
+## Workflow
+
+PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/).  The following steps guide usual contributions.
+
+1. Fork
+
+   Our development community has been growing fast; it doesn't make sense for everyone to write into the official repo.  So, please file Pull Requests from your fork.  To make a fork,  just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/).
+
+1. Clone
+
+   To make a copy of your fork to your local computers, please run
+
+   ```bash
+   git clone https://github.com/your-github-account/paddle
+   cd paddle
+   ```
+
+1. Create the local feature branch
+
+   For daily works like adding a new feature or fixing a bug, please open your feature branch before coding:
+
+   ```bash
+   git checkout -b my-cool-stuff
+   ```
+
+1. Commit
+
+   Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands:
+
+   ```bash
+   pip install pre-commit
+   pre-commit install
+   ```
+
+   Our pre-commit configuration requires clang-format 3.8 for auto-formatting C/C++ code and yapf for Python.
+
+   Once installed, `pre-commit` checks the style of code and documentation in every commit.  We will see something like the following when you run `git commit`:
+
+   ```
+   ➜  git commit
+   CRLF end-lines remover...............................(no files to check)Skipped
+   yapf.................................................(no files to check)Skipped
+   Check for added large files..............................................Passed
+   Check for merge conflicts................................................Passed
+   Check for broken symlinks................................................Passed
+   Detect Private Key...................................(no files to check)Skipped
+   Fix End of Files.....................................(no files to check)Skipped
+   clang-formater.......................................(no files to check)Skipped
+   [my-cool-stuff c703c041] add test file
+    1 file changed, 0 insertions(+), 0 deletions(-)
+    create mode 100644 233
+   ```
+
+	NOTE: The `yapf` installed by `pip install pre-commit` and `conda install -c conda-forge pre-commit` is slightly different. Paddle developers use `pip install pre-commit`.
+
+1. Build and test
+
+   Users can build PaddlePaddle natively on Linux and Mac OS X.  But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).
+
+1. Keep pulling
+
+   An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others' work early, and it's easier to resolve smaller conflicts.
+
+   ```bash
+   git remote add upstream https://github.com/PaddlePaddle/Paddle
+   git pull upstream develop
+   ```
+
+1. Push and file a pull request
+
+   You can "push" your local work into your forked repo:
+
+   ```bash
+   git push origin my-cool-stuff
+   ```
+
+   The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one.
+
+   To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/).
+
+   If your change is for fixing an issue, please write ["Fixes <issue-URL>"](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request.  Github would close the issue when the owners merge your pull request.
+
+   Please remember to specify some reviewers for your pull request.  If you don't know who the right ones are, please follow Github's recommendation.
+
+
+1. Delete local and remote branches
+
+   To keep your local workspace and your fork clean, you might want to remove merged branches:
+
+   ```bash
+   git push origin :my-cool-stuff
+   git checkout develop
+   git pull upstream develop
+   git branch -d my-cool-stuff
+   ```
+
+### Code Review
+
+-  Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email.  Please do this after your pull request passes the CI.
+
+- Please answer reviewers' every comment.  If you are to follow the comment, please write "Done"; please give a reason otherwise.
+
+- If you don't want your reviewers to get overwhelmed by email notifications, you might reply to their comments [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/).
+
+- Reduce the unnecessary commits.  Some developers commit often.  It is recommended to append a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`.
+
+
+## Coding Standard
+
+### Code Style
+
+Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html).
+
+Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/).
+
+Our build process helps to check the code style.  In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default.
+
+Please install pre-commit, which automatically reformats the changes to C/C++ and Python code whenever we run `git commit`.  To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43).
+
+### Unit Tests
+
+Please remember to add related unit tests.
+
+- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md).
+
+- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/).
+
+
+### Writing Logs
+
+We use [glog](https://github.com/google/glog) for logging in our C/C++ code.
+
+For general information, please use `LOG`.  For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose).  The reason is explained [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ).
+
+`VLOG` requires a *verbose level* parameter.  For example:
+
+```c++
+VLOG(3) << "Operator FC is taking " << num_inputs << " inputs.";
+```
+
+When we run a PaddlePaddle application or test, we can specify a verbose threshold.  For example:
+
+```bash
+GLOG_vmodule=buddy_allocator=2 \
+GLOG_v=10 \
+python \
+../python/paddle/v2/framework/tests/test_recurrent_op.py
+```
+
+This will enable VLOG messages generated by `buddy_allocator.{h,cc}` in the verbose range of 0 to 2, and VLOG messages from all other source files in the verbose range of 0 to 10, so you will see the above example VLOG message, which is at level 3.  This suggests that we output overall messages in lower verbose levels, so they display with higher probability.  When coding C++, please follow the verbose level convention as follows:
+
+- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework)
+- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)
+- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform)
+- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math)
diff --git a/doc/v2/dev/index_cn.rst b/doc/v2/dev/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..aee3c68de05de26df3cd79170fa7f4ecad4bf386
--- /dev/null
+++ b/doc/v2/dev/index_cn.rst
@@ -0,0 +1,24 @@
+开发标准
+========
+PaddlePaddle遵守如下三个部分的代码和文档规范。
+
+PaddlePaddle使用git做版本管理，docker作为构建和测试环境。代码中包含了Cuda, C++, Python, Shell等多种编程语言。语言规范遵守Google C++ Style, Pep-8, 代码库中包含自动化检查工具做风格检查。代码注释需要遵守Doxygen规范，不满足风格要求的代码会编译失败。关于如何使用git, 构建测试及代码开发, 我们提供了如下指南。
+
+..  toctree::
+  :maxdepth: 1
+
+  contribute_to_paddle_cn.md
+
+PaddlePaddle面向国内外用户，包含了中文和英文两部分的文档。设计文档和issue问题描述都推荐使用英文。对于设计文档，重在问题描述，背景阐述，然后才是解决方案。文档由Sphinx生成，因此代码注释也需要符合Sphinx文档标准。推荐本地使用paddlepaddle.org工具编译生成和预览文档，请参阅如下文档。
+
+..  toctree::
+  :maxdepth: 1
+
+  write_docs_cn.rst
+
+PaddlePaddle V2 使用新增Layer方式定义新的操作。组合基础API可以实现多种复杂Layer, 满足绝大多数应用。如需要定制Layer，请参阅如下文档，欢迎提交patch。
+
+..  toctree::
+  :maxdepth: 1
+
+  new_layer_cn.rst
diff --git a/doc/v2/dev/index_en.rst b/doc/v2/dev/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..cbff313fc5b9468b58159cf2b04e8464f9bebc78
--- /dev/null
+++ b/doc/v2/dev/index_en.rst
@@ -0,0 +1,28 @@
+Development
+------------
+
+
+PaddlePaddle adheres to the following three sections of code and document specifications.
+
+
+PaddlePaddle uses git for version control and Docker as the building and testing environment. The code includes Cuda, C++, Python, Shell and other programming languages, which comply with Google C++ Style, Pep-8, and the code base includes style checking by an automatic inspection tool. Code comments need to follow the Doxygen specification. The code that does not meet the style requirements will fail to compile. We provide the following guidelines for the use of Git, build tests and code development.
+
+..  toctree::
+  :maxdepth: 1
+
+  contribute_to_paddle_en.md
+
+
+PaddlePaddle is well documented in English and Chinese. We recommend writing design documents and issue descriptions in English. The design documents focus on problem descriptions, backgrounds, and are followed by solutions. As documents are generated by Sphinx, code comments should comply with the Sphinx documentation standard. We recommend using the paddlepaddle.org tool to compile, generate, and preview documents locally. Please refer to:
+
+..  toctree::
+  :maxdepth: 1
+
+  write_docs_en.rst
+
+PaddlePaddle V2 defines new operations by adding new Layers. You can implement various complex layers by combining basic APIs to satisfy most applications. If you want to customize a layer, please refer to the following document. Patches are welcome.
+
+..  toctree::
+  :maxdepth: 1
+
+  new_layer_en.rst
diff --git a/doc/v2/dev/new_layer_cn.rst b/doc/v2/dev/new_layer_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e5a14346123d342de0b67757cbbce654bd4180dc
--- /dev/null
+++ b/doc/v2/dev/new_layer_cn.rst
@@ -0,0 +1,389 @@
+==================
+如何实现新的网络层
+==================
+
+这份教程展示了如何在PaddlePaddle中实现一个自定义的网络层。在这里我们使用全连接层作为例子来展示实现新网络层所需要的四个步骤。
+
+1. 推导该层前向和后向传递的方程。
+2. 实现该层的C++类。
+3. 增加梯度检测的单元测试，以保证梯度的正确计算。
+4. 封装该层的Python接口。
+
+推导方程
+================
+
+首先我们需要推导该网络层的*前向传播*和*后向传播*的方程。前向传播给定输入，计算输出。后向传播给定输出的梯度，计算输入和参数的梯度。
+
+下图是一个全连接层的示意图。在全连接层中，每个输出节点都连接到所有的输入节点上。
+
+..  image:: src/FullyConnected.jpg
+    :align: center
+    :scale: 60 %
+
+一个网络层的前向传播部分把输入转化为相应的输出。
+全连接层以一个维度为 :math:`D_i` 的稠密向量作为输入，使用一个尺度为 :math:`D_i \times D_o` 的变换矩阵 :math:`W` 把 :math:`x` 映射到一个维度为 :math:`D_o` 的向量，并在乘积结果上再加上维度为 :math:`D_o` 的偏置向量 :math:`b` 。
+
+.. math::
+
+   y = f(W^T x + b)
+
+其中 :math:`f(.)` 是一个非线性的*激活方程*，例如sigmoid， tanh，以及Relu。
+
+变换矩阵 :math:`W` 和偏置向量 :math:`b`  是该网络层的*参数*。一个网络层的参数是在*反向传播*时被训练的。反向传播根据输出的梯度，分别计算每个参数的梯度，以及输入的梯度。优化器则用链式法则来对每个参数计算损失函数的梯度。
+
+假设损失函数是 :math:`c(y)` ，那么
+
+.. math::
+
+   \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
+
+假设 :math:`z = W^T x + b` ，那么
+
+.. math::
+
+   \frac{\partial y}{\partial z} = \frac{\partial f(z)}{\partial z}
+
+PaddlePaddle的base layer类可以自动计算上面的导数。
+
+因此，对全连接层来说，我们需要计算：
+
+.. math::
+
+   \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1
+
+其中 :math:`\mathbf 1` 是一个全1的向量， :math:`W_{ij}` 是矩阵 :math:`W` 第i行第j列的数值， :math:`z_j` 是向量 :math:`z` 的第j个值， :math:`x_i` 是向量 :math:`x` 的第i个值。
+
+最后我们使用链式法则计算 :math:`\frac{\partial c(y)}{\partial x}` 以及 :math:`\frac{\partial c(y)}{\partial W}` 。计算的细节将在下面的小节给出。
+
+实现C++类
+===================
+
+一个网络层的C++类需要实现初始化，前向和后向。全连接层的实现位于 :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h` 及 :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp` 。这里我们展示一份简化过的代码。
+
+这个类需要继承 :code:`paddle::Layer` 这个基类，并且需要重写基类中的以下几个虚函数：
+
+- 类的构造函数和析构函数。
+- :code:`init` 函数。用于初始化参数和设置。
+- :code:`forward` 。实现网络层的前向传播。
+- :code:`backward` 。实现网络层的后向传播。
+- :code:`prefetch` 。用来从参数服务器预取参数矩阵相应的行。如果网络层不需要远程稀疏更新，则不需要重写该函数。（大多数网络层不需要支持远程稀疏更新）
+
+
+头文件如下：
+
+.. code-block:: c++
+
+    namespace paddle {
+    /**
+     * 全连接层的每个输出都连接到上一层的所有的神经元上。
+     * 它的输入与经过学习的参数做内积并加上偏置（可选）。
+     *
+     * 配置文件接口是fc_layer。
+     */
+
+    class FullyConnectedLayer : public Layer {
+    protected:
+      WeightList weights_;
+      std::unique_ptr<Weight> biases_;
+
+    public:
+      explicit FullyConnectedLayer(const LayerConfig& config)
+          : Layer(config) {}
+      ~FullyConnectedLayer() {}
+
+      bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+      Weight& getWeight(int idx) { return *weights_[idx]; }
+
+      void prefetch();
+      void forward(PassType passType);
+      void backward(const UpdateCallback& callback = nullptr);
+    };
+    }  // namespace paddle
+
+头文件中把参数定义为类的成员变量。我们使用 :code:`Weight` 类作为参数的抽象，它支持多线程更新。该类的实现细节在“实现细节”中详细介绍。
+
+- :code:`weights_` 是存有一系列变换矩阵的权重。在当前的实现方式下，网络层可以有多个输入。因此，它可能有不止一个权重。每个权重对应一个输入。
+- :code:`biases_` 是存有偏置向量的权重。
+
+全连接层没有网络层配置的超参数。如果一个网络层需要配置的话，通常的做法是将配置存于 :code:`LayerConfig& config` 中，并在类构建函数中把它放入一个类成员变量里。
+
+下面的代码片段实现了 :code:`init` 函数。
+
+- 首先，所有的 :code:`init` 函数必须先调用基类中的函数 :code:`Layer::init(layerMap, parameterMap);` 。该语句会为每个层初始化其所需要的变量和连接。
+- 之后初始化所有的权重矩阵 :math:`W` 。当前的实现方式下，网络层可以有多个输入。因此，它可能有不止一个权重。
+- 最后，初始化偏置向量。
+
+
+.. code-block:: c++
+
+    bool FullyConnectedLayer::init(const LayerMap& layerMap,
+                                   const ParameterMap& parameterMap) {
+      /* 初始化父类 */
+      Layer::init(layerMap, parameterMap);
+
+      /* 初始化权重表 */
+      CHECK(inputLayers_.size() == parameters_.size());
+      for (size_t i = 0; i < inputLayers_.size(); i++) {
+        // 获得参数尺寸
+        size_t height = inputLayers_[i]->getSize();
+        size_t width = getSize();
+
+        // 新建一个权重
+        if (parameters_[i]->isSparse()) {
+          CHECK_LE(parameters_[i]->getSize(), width * height);
+        } else {
+          CHECK_EQ(parameters_[i]->getSize(), width * height);
+        }
+        Weight* w = new Weight(height, width, parameters_[i]);
+
+        // 将新建的权重加入权重表
+        weights_.emplace_back(w);
+      }
+
+      /* 初始化biases_ */
+      if (biasParameter_.get() != NULL) {
+        biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+      }
+
+      return true;
+    }
+
+实现前向传播的部分有下面几个步骤。
+
+- 每个层在其 :code:`forward` 函数的开头必须调用 :code:`Layer::forward(passType);` 。
+- 之后使用 :code:`reserveOutput(batchSize, size);` 为输出分配内存。由于我们支持训练数据有不同的批次大小，所以这一步是必要的。 :code:`reserveOutput`  会相应地改变输出的尺寸。为了保证效率，如果需要扩大矩阵，我们会重新分配内存；如果需要缩减矩阵，我们会继续使用现有的内存块。
+- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b` 。 :code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵，每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作，请参考 :code:`paddle/legacy/math/Matrix.h` 和 :code:`paddle/legacy/math/BaseMatrix.h` 。
+- 最终，使用 :code:`forwardActivation();` 进行激活操作。这会自动进行网络配置中声明的激活操作。
+
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::forward(PassType passType) {
+      Layer::forward(passType);
+
+      /* 若有必要，为output_申请内存 */
+      int batchSize = getInput(0).getBatchSize();
+      int size = getSize();
+
+      {
+        // 设置输出的尺寸
+        reserveOutput(batchSize, size);
+      }
+
+      MatrixPtr outV = getOutputValue();
+
+      // 对每个输入乘上变换矩阵
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        auto input = getInput(i);
+        CHECK(input.value) << "The input of 'fc' layer must be matrix";
+        i == 0 ? outV->mul(input.value, weights_[i]->getW(), 1, 0)
+               : outV->mul(input.value, weights_[i]->getW(), 1, 1);
+      }
+
+      /* 加上偏置向量 */
+      if (biases_.get() != NULL) {
+        outV->addBias(*(biases_->getW()), 1);
+      }
+
+      /* 激活 */ {
+        forwardActivation();
+      }
+    }
+
+实现后向传播的部分有下面几个步骤。
+
+- :code:`backwardActivation()` 计算激活函数的梯度。通过 :code:`getOutputGrad()` 来获得输出的梯度，调用该函数后，梯度会就地（不使用额外空间）乘上输出的梯度。
+- 计算偏置的梯度。注意，我们使用 :code:`biases_->getWGrad()` 来得到某个特定参数的梯度矩阵。在一个参数的梯度被更新后，**必须**要调用 :code:`getParameterPtr()->incUpdate(callback);` 。这用于在多线程和多机上更新参数。
+- 最后，计算转换矩阵和输入的梯度，并对相应的参数调用 :code:`incUpdate` 。PaddlePaddle可以通过该机制判断是否已经收集齐所有的梯度，从而可以做一些与计算重叠的工作（例如，网络通信）。
+
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::backward(const UpdateCallback& callback) {
+      /* 对激活求导 */ {
+        backwardActivation();
+      }
+
+      if (biases_ && biases_->getWGrad()) {
+        biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+        biases_->getParameterPtr()->incUpdate(callback);
+      }
+
+      bool syncFlag = hl_get_sync_flag();
+
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        /* 计算当前层权重的梯度 */
+        if (weights_[i]->getWGrad()) {
+          MatrixPtr input_T = getInputValue(i)->getTranspose();
+          MatrixPtr oGrad = getOutputGrad();
+          {
+            weights_[i]->getWGrad()->mul(input_T, oGrad, 1, 1);
+          }
+        }
+
+
+        /* 计算输入层的偏差 */
+        MatrixPtr preGrad = getInputGrad(i);
+        if (NULL != preGrad) {
+          MatrixPtr weights_T = weights_[i]->getW()->getTranspose();
+          preGrad->mul(getOutputGrad(), weights_T, 1, 1);
+        }
+
+        {
+          weights_[i]->getParameterPtr()->incUpdate(callback);
+        }
+      }
+    }
+
+:code:`prefetch` 函数指出了在训练时需要从参数服务器取出的行。仅在远程稀疏训练时有效。使用远程稀疏方式训练时，完整的参数矩阵被分布在不同的参数服务器上。当网络层用一个批次做训练时，该批次的输入中仅有一个子集是非零的。因此，该层仅需要这些非零样本位置所对应的变换矩阵的那些行。 :code:`prefetch` 表明了这些行的标号。
+
+大多数层不需要远程稀疏训练函数。这种情况下不需要重写该函数。
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::prefetch() {
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        auto* sparseParam =
+            dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
+        if (sparseParam) {
+          MatrixPtr input = getInputValue(i);
+          sparseParam->addRows(input);
+        }
+      }
+    }
+
+最后，使用 :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` 来注册该层。 :code:`fc` 是该层的标识符， :code:`FullyConnectedLayer` 是该层的类名。
+
+.. code-block:: c++
+
+    namespace paddle {
+    REGISTER_LAYER(fc, FullyConnectedLayer);
+    }
+
+若 :code:`cpp` 被放在 :code:`paddle/legacy/gserver/layers` 目录下，其会自动被加入编译列表。
+
+
+写梯度检查单元测试
+===============================
+
+写梯度检查单元测试是一个验证新实现的层是否正确的相对简单的办法。梯度检查单元测试通过有限差分法来验证一个层的梯度。首先对输入做一个小的扰动 :math:`\Delta x` ，然后观察到输出的变化为 :math:`\Delta y` ，那么，梯度就可以通过这个方程计算得到 :math:`\frac{\Delta y}{\Delta x }` 。之后，再用这个梯度去和 :code:`backward` 函数得到的梯度去对比，以保证梯度计算的正确性。需要注意的是梯度检查仅仅验证了梯度的计算，并不保证 :code:`forward` 和 :code:`backward` 函数的实现是正确的。你需要一些更复杂的单元测试来保证你实现的网络层是正确的。
+
+所有网络层的梯度检查单测都位于 :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步：
+
++ 生成网络层配置。网络层配置包含以下几项：
+   - 偏置参数的大小。（例子中是4096）
+   - 层的类型。（例子中是fc）
+   - 层的大小。（例子中是4096）
+   - 激活的类型。（例子中是softmax）
+   - dropout的比例。（例子中是0.1）
++ 配置网络层的输入。在这个例子里，我们仅有一个输入。
+   - 输入的类型（ :code:`INPUT_DATA` ），可以是以下几种：
+       - :code:`INPUT_DATA` ：稠密向量。
+       - :code:`INPUT_LABEL` ：整数。
+       - :code:`INPUT_DATA_TARGET` ：稠密向量，但不用于计算梯度。
+       - :code:`INPUT_SEQUENCE_DATA` ：含有序列信息的稠密向量。
+       - :code:`INPUT_HASSUB_SEQUENCE_DATA` ：含有序列信息和子序列信息的稠密向量。
+       - :code:`INPUT_SEQUENCE_LABEL` ：含有序列信息的整数。
+       - :code:`INPUT_SPARSE_NON_VALUE_DATA` ：0-1稀疏数据。
+       - :code:`INPUT_SPARSE_FLOAT_VALUE_DATA` ：浮点稀疏数据。
+   - 输入的名字。（例子中是 :code:`layer_0` ）
+   - 输入的大小。（例子中是8192）
+   - 非零数字的个数，仅对稀疏数据有效。
+   - 稀疏数据的格式，仅对稀疏数据有效。
++ 对每个输入，都需要调用一次 :code:`config.layerConfig.add_inputs();` 。
++ 调用 :code:`testLayerGrad` 来做梯度检查。它包含以下参数。
+   - 层和输入的配置。（例子中是 :code:`config` ）
+   - 网络层的类型。（例子中是 :code:`fc` ）
+   - 梯度检查的输入数据的批次大小。（例子中是100）
+   - 输入是否是转置的。大多数层需要设置为 :code:`false` 。（例子中是 :code:`false` ）
+   - 是否使用权重。有些层或者激活需要做归一化以保证它们的输出的和是一个常数。例如，softmax激活的输出的和总是1。在这种情况下，我们不能通过常规的梯度检查的方式来计算梯度。因此我们采用输出的加权和（非常数）来计算梯度。（例子中是 :code:`true` ，因为全连接层的激活可以是softmax）
+
+.. code-block:: c++
+
+    void testFcLayer(string format, size_t nnz) {
+      // Create layer configuration.
+      TestConfig config;
+      config.biasSize = 4096;
+      config.layerConfig.set_type("fc");
+      config.layerConfig.set_size(4096);
+      config.layerConfig.set_active_type("softmax");
+      config.layerConfig.set_drop_rate(0.1);
+      // Setup inputs.
+      config.inputDefs.push_back(
+          {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+        config.layerConfig.add_inputs();
+      LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
+                << config.inputDefs[0].sparse.format;
+      for (auto useGpu : {false, true}) {
+        testLayerGrad(config, "fc", 100, /* trans */ false, useGpu,
+                      /* weight */ true);
+      }
+    }
+
+如果你要为了测试而增加新的文件，例如 :code:`paddle/legacy/gserver/tests/test_FCGrad.cpp` ，你需要把该文件加入 :code:`paddle/legacy/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时，所有的单测都会被执行一次。注意，有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 :code:`ON` 。
+
+.. code-block:: cmake
+
+    add_unittest_without_exec(test_FCGrad
+        test_FCGrad.cpp
+        LayerGradUtil.cpp
+        TestUtil.cpp)
+
+    add_test(NAME test_FCGrad
+        COMMAND test_FCGrad)
+
+
+实现python封装
+========================
+
+python封装的实现使得我们可以在配置文件中使用新实现的网络层。所有的python封装都在 :code:`python/paddle/trainer/config_parser.py` 中。全连接层python封装的例子中包含下面几步：
+
+- 所有的Python封装都使用 :code:`@config_layer('fc')` 这样的装饰器。网络层的标识符为 :code:`fc` 。
+- 实现构造函数 :code:`__init__` 。
+	- 它首先调用基构造函数 :code:`super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)` 。 :code:`FCLayer` 是Python封装的类名。 :code:`fc` 是网络层的标识符。为了封装能够正确工作，这些名字必须要写对。
+	- 之后，计算变换矩阵的大小和格式（是否稀疏）。
+
+.. code-block:: python
+
+    @config_layer('fc')
+    class FCLayer(LayerBase):
+        def __init__(
+                self,
+                name,
+                size,
+                inputs,
+                bias=True,
+                **xargs):
+            super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
+            for input_index in xrange(len(self.inputs)):
+                input_layer = self.get_input_layer(input_index)
+                psize = self.config.size * input_layer.size
+                dims = [input_layer.size, self.config.size]
+                format = self.inputs[input_index].format
+                sparse = format == "csr" or format == "csc"
+                if sparse:
+                    psize = self.inputs[input_index].nnz
+                self.create_input_parameter(input_index, psize, dims, sparse, format)
+            self.create_bias_parameter(bias, self.config.size)
+
+在网络配置中，网络层的细节可以通过下面这些代码片段来指定。这个类的参数包括：
+
+- :code:`name` 是网络层实例的名字标识符。
+- :code:`type` 是网络层的类型，通过网络层的标识符来指定。
+- :code:`size` 是网络层输出的大小。
+- :code:`bias` 表明这个层的一个实例是否需要偏置。
+- :code:`inputs` 说明这个层的输入，输入是由一个list中的网络层实例的名字组成的。
+
+.. code-block:: python
+
+    Layer(
+        name = "fc1",
+        type = "fc",
+        size = 64,
+        bias = True,
+        inputs = [Input("pool3")]
+    )
+
+我们建议你为你的Python封装实现一个“助手”，使得搭模型时更方便。具体可以参考 :code:`python/paddle/trainer_config_helpers/layers.py` 。
diff --git a/doc/v2/dev/new_layer_en.rst b/doc/v2/dev/new_layer_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ad723738801908a5f48343574c204bdbfc97ee08
--- /dev/null
+++ b/doc/v2/dev/new_layer_en.rst
@@ -0,0 +1,390 @@
+================
+Write New Layers
+================
+
+This tutorial will guide you to write customized layers in PaddlePaddle. We will utilize fully connected layer as an example to guide you through the following steps for writing a new layer.
+
+- Derive equations for the forward and backward part of the layer.
+- Implement C++ class for the layer.
+- Write gradient check unit test to make sure the gradients are correctly computed.
+- Implement Python wrapper for the layer.
+
+Derive Equations
+================
+
+First we need to derive equations of the *forward* and *backward* part of the layer. The forward part computes the output given an input. The backward part computes the gradients of the input and the parameters given the gradients of the output.
+
+The illustration of a fully connected layer is shown in the following figure. In a fully connected layer, all output nodes are connected to all the input nodes.
+
+..  image:: src/FullyConnected.jpg
+    :align: center
+    :scale: 60 %
+
+The *forward part* of a layer transforms an input into the corresponding output.
+A fully connected layer takes a dense input vector :math:`x` with dimension :math:`D_i`. It uses a transformation matrix :math:`W` with size :math:`D_i \times D_o` to project :math:`x` into a :math:`D_o` dimensional vector, and adds a bias vector :math:`b` with dimension :math:`D_o` to the vector.
+
+.. math::
+
+   y = f(W^T x + b)
+
+where :math:`f(.)` is a nonlinear *activation* function, such as sigmoid, tanh, and Relu.
+
+The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter.
+
+Suppose our loss function is :math:`c(y)`, then
+
+.. math::
+
+   \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
+
+Suppose :math:`z = W^T x + b`, then
+
+.. math::
+
+   \frac{\partial y}{\partial z} = \frac{\partial f(z)}{\partial z}
+
+This derivative can be automatically computed by our base layer class.
+
+Then, for fully connected layer, we need to compute:
+
+.. math::
+
+   \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1
+
+where :math:`\mathbf 1` is an all one vector, :math:`W_{ij}` is the number at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`.
+
+Finally we can use the chain rule to calculate :math:`\frac{\partial c(y)}{\partial x}` and :math:`\frac{\partial c(y)}{\partial W}`. The details of the computation will be given in the next section.
+
+Implement C++ Class
+===================
+
+The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp`. We list a simplified version of the code below.
+
+It needs to derive from the base class :code:`paddle::Layer`, and it needs to override the following functions:
+
+- constructor and destructor.
+- :code:`init` function. It is used to initialize the parameters and settings.
+- :code:`forward`. It implements the forward part of the layer.
+- :code:`backward`. It implements the backward part of the layer.
+- :code:`prefetch`. It is used to determine which rows of the parameter matrix to prefetch from the parameter server. You do not need to override this function if your layer does not need remote sparse update. (Most layers do not need to support remote sparse update.)
+
+
+The header file is listed below:
+
+.. code-block:: c++
+
+    namespace paddle {
+    /**
+     * A layer has full connections to all neurons in the previous layer.
+     * It computes an inner product with a set of learned weights, and
+     * (optionally) adds biases.
+     *
+     * The config file api is fc_layer.
+     */
+
+    class FullyConnectedLayer : public Layer {
+    protected:
+      WeightList weights_;
+      std::unique_ptr<Weight> biases_;
+
+    public:
+      explicit FullyConnectedLayer(const LayerConfig& config)
+          : Layer(config) {}
+      ~FullyConnectedLayer() {}
+
+      bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+      Weight& getWeight(int idx) { return *weights_[idx]; }
+
+      void prefetch();
+      void forward(PassType passType);
+      void backward(const UpdateCallback& callback = nullptr);
+    };
+    }  // namespace paddle
+
+It defines the parameters as member variables of the class. We use the :code:`Weight` class as an abstraction of parameters. It supports multi-thread update. This class will be described in detail in the implementation details.
+
+- :code:`weights_` is a list of weights for the transformation matrices. The current implementation can have more than one input. Thus, it has a list of weights. One weight corresponds to an input.
+- :code:`biases_` is a weight for the bias vector.
+
+The fully connected layer does not have layer configuration hyper-parameters. If there are some layer hyper-parameters, a common practice is to store them in :code:`LayerConfig& config`, and put them into a class variable in the constructor.
+
+The following code snippet implements the :code:`init` function.
+
+- First, every :code:`init` function must call the :code:`init` function of the base class :code:`Layer::init(layerMap, parameterMap);`. This statement will initialize the required variables and connections for each layer.
+- Then it initializes all the weight matrices :math:`W`. The current implementation can have more than one input. Thus, it has a list of weights.
+- Finally, it initializes the bias.
+
+
+.. code-block:: c++
+
+    bool FullyConnectedLayer::init(const LayerMap& layerMap,
+                                   const ParameterMap& parameterMap) {
+      /* Initialize the basic parent class */
+      Layer::init(layerMap, parameterMap);
+
+      /* initialize the weightList */
+      CHECK(inputLayers_.size() == parameters_.size());
+      for (size_t i = 0; i < inputLayers_.size(); i++) {
+        // Obtain the parameter sizes
+        size_t height = inputLayers_[i]->getSize();
+        size_t width = getSize();
+
+        // create a new weight
+        if (parameters_[i]->isSparse()) {
+          CHECK_LE(parameters_[i]->getSize(), width * height);
+        } else {
+          CHECK_EQ(parameters_[i]->getSize(), width * height);
+        }
+        Weight* w = new Weight(height, width, parameters_[i]);
+
+        // append the new weight to the list
+        weights_.emplace_back(w);
+      }
+
+      /* initialize biases_ */
+      if (biasParameter_.get() != NULL) {
+        biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+      }
+
+      return true;
+    }
+
+The implementation of the forward part has the following steps.
+
+- Every layer must call :code:`Layer::forward(passType);` at the beginning of its :code:`forward` function.
+- Then it allocates memory for the output using :code:`reserveOutput(batchSize, size);`. This step is necessary because we support the batches to have different batch sizes. :code:`reserveOutput` will change the size of the output accordingly. For the sake of efficiency, we will allocate new memory if we want to expand the matrix, but we will reuse the existing memory block if we want to shrink the matrix.
+- Then it computes :math:`\sum_i W_i x + b` using Matrix operations. :code:`getInput(i).value` retrieves the matrix of the i-th input. Each input is a :math:`batchSize \times dim` matrix, where each row represents a single input in a batch. For a complete list of supported matrix operations, please refer to :code:`paddle/legacy/math/Matrix.h` and :code:`paddle/legacy/math/BaseMatrix.h`.
+- Finally it applies the activation function using :code:`forwardActivation();`. It will automatically apply the corresponding activation function specified in the network configuration.
+
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::forward(PassType passType) {
+      Layer::forward(passType);
+
+      /* malloc memory for the output_ if necessary */
+      int batchSize = getInput(0).getBatchSize();
+      int size = getSize();
+
+      {
+        // Set up the size of the output.
+        reserveOutput(batchSize, size);
+      }
+
+      MatrixPtr outV = getOutputValue();
+
+      // Apply the transformation matrix to each input.
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        auto input = getInput(i);
+        CHECK(input.value) << "The input of 'fc' layer must be matrix";
+        i == 0 ? outV->mul(input.value, weights_[i]->getW(), 1, 0)
+               : outV->mul(input.value, weights_[i]->getW(), 1, 1);
+      }
+
+      /* add the bias-vector */
+      if (biases_.get() != NULL) {
+        outV->addBias(*(biases_->getW()), 1);
+      }
+
+      /* activation */ {
+        forwardActivation();
+      }
+    }
+
+The implementation of the backward part has the following steps.
+
+- :code:`backwardActivation()` computes the gradients of the activation. The gradients will be multiplied in place into the gradients of the output, which can be retrieved using :code:`getOutputGrad()`.
+- Compute the gradients of bias. Notice that we can use :code:`biases_->getWGrad()` to get the gradient matrix of the corresponding parameter. After the gradient of one parameter is updated, it **MUST** call :code:`getParameterPtr()->incUpdate(callback);`. This is used for parameter update over multiple threads or multiple machines.
+- Then it computes the gradients of the transformation matrices and inputs, and it calls :code:`incUpdate` for the corresponding parameter. This gives the framework the chance to know whether it has gathered all the gradients for a parameter so that it can do some overlapping work (e.g., network communication).
+
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::backward(const UpdateCallback& callback) {
+      /* Do derivation for activations.*/ {
+        backwardActivation();
+      }
+
+      if (biases_ && biases_->getWGrad()) {
+        biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+        biases_->getParameterPtr()->incUpdate(callback);
+      }
+
+      bool syncFlag = hl_get_sync_flag();
+
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        /* Calculate the W-gradient for the current layer */
+        if (weights_[i]->getWGrad()) {
+          MatrixPtr input_T = getInputValue(i)->getTranspose();
+          MatrixPtr oGrad = getOutputGrad();
+          {
+            weights_[i]->getWGrad()->mul(input_T, oGrad, 1, 1);
+          }
+        }
+
+
+        /* Calculate the input layers error */
+        MatrixPtr preGrad = getInputGrad(i);
+        if (NULL != preGrad) {
+          MatrixPtr weights_T = weights_[i]->getW()->getTranspose();
+          preGrad->mul(getOutputGrad(), weights_T, 1, 1);
+        }
+
+        {
+          weights_[i]->getParameterPtr()->incUpdate(callback);
+        }
+      }
+    }
+
+The :code:`prefetch` function specifies the rows that need to be fetched from the parameter server during training. It is only useful for remote sparse training. In remote sparse training, the full parameter matrix is distributed across different parameter servers. When the layer uses a batch for training, only a subset of locations of the input is non-zero in this batch. Thus, this layer only needs the rows of the transformation matrix corresponding to the locations of these non-zero entries. The :code:`prefetch` function specifies the ids of these rows.
+
+Most of the layers do not need remote sparse training function. You do not need to override this function in this case.
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::prefetch() {
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        auto* sparseParam =
+            dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
+        if (sparseParam) {
+          MatrixPtr input = getInputValue(i);
+          sparseParam->addRows(input);
+        }
+      }
+    }
+
+Finally, you can use :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` to register the layer. :code:`fc` is the identifier of the layer, and :code:`FullyConnectedLayer` is the class name of the layer.
+
+.. code-block:: c++
+
+    namespace paddle {
+    REGISTER_LAYER(fc, FullyConnectedLayer);
+    }
+
+If the :code:`cpp` file is put into :code:`paddle/legacy/gserver/layers`, it will be automatically added to the compilation list.
+
+
+Write Gradient Check Unit Test
+===============================
+
+An easy way to verify the correctness of new layer's implementation is to write a gradient check unit test. Gradient check unit test utilizes finite difference method to verify the gradient of a layer. It modifies the input with a small perturbation :math:`\Delta x` and observes the changes of output :math:`\Delta y`, the gradient can be computed as :math:`\frac{\Delta y}{\Delta x }`. This gradient can be compared with the gradient computed by the :code:`backward` function of the layer to ensure the correctness of the gradient computation. Notice that the gradient check only tests the correctness of the gradient computation, it does not necessarily guarantee the correctness of the implementation of the :code:`forward` and :code:`backward` function. You need to write more sophisticated unit tests to make sure your layer is implemented correctly.
+
+All the gradient check unit tests are located in :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp`. You are recommended to put your test into a new test file if you are planning to write a new layer. The gradient check unit test of the fully connected layer is listed below. It has the following steps.
+
++ Create layer configuration. A layer configuration can include the following attributes:
+   - size of the bias parameter. (4096 in our example)
+   - type of the layer. (fc in our example)
+   - size of the layer. (4096 in our example)
+   - activation type. (softmax in our example)
+   - dropout rate. (0.1 in our example)
++ configure the input of the layer. In our example, we have only one input.
+   - type of the input. (:code:`INPUT_DATA` in our example) It can be one of the following types:
+       - :code:`INPUT_DATA`: dense vector.
+       - :code:`INPUT_LABEL`: integer.
+       - :code:`INPUT_DATA_TARGET`: dense vector, but it is not used to compute gradients.
+       - :code:`INPUT_SEQUENCE_DATA`: dense vector with sequence information.
+       - :code:`INPUT_HASSUB_SEQUENCE_DATA`: dense vector with both sequence and sub-sequence information.
+       - :code:`INPUT_SEQUENCE_LABEL`: integer with sequence information.
+       - :code:`INPUT_SPARSE_NON_VALUE_DATA`: 0-1 sparse data.
+       - :code:`INPUT_SPARSE_FLOAT_VALUE_DATA`: float sparse data.
+   - name of the input. (:code:`layer_0` in our example)
+   - size of the input. (8192 in our example)
+   - number of non-zeros, only useful for sparse inputs.
+   - format of sparse data, only useful for sparse inputs.
++ each input needs to call :code:`config.layerConfig.add_inputs();` once.
++ call :code:`testLayerGrad` to perform gradient checks. It has the following arguments.
+   - layer and input configurations. (:code:`config` in our example)
+   - type of the layer. (:code:`fc` in our example)
+   - batch size of the gradient check. (100 in our example)
+   - whether the input is transposed. Most layers need to set it to :code:`false`. (:code:`false` in our example)
+   - whether to use weights. Some layers or activations perform normalization so that the sum of their output is a constant. For example, the sum of output of a softmax activation is one. In this case, we cannot correctly compute the gradients using regular gradient check techniques. A weighted sum of the output, which is not a constant, is utilized to compute the gradients. (:code:`true` in our example, because the activation of a fully connected layer can be softmax)
+
+.. code-block:: c++
+
+    void testFcLayer(string format, size_t nnz) {
+      // Create layer configuration.
+      TestConfig config;
+      config.biasSize = 4096;
+      config.layerConfig.set_type("fc");
+      config.layerConfig.set_size(4096);
+      config.layerConfig.set_active_type("softmax");
+      config.layerConfig.set_drop_rate(0.1);
+      // Setup inputs.
+      config.inputDefs.push_back(
+          {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+        config.layerConfig.add_inputs();
+      LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
+                << config.inputDefs[0].sparse.format;
+      for (auto useGpu : {false, true}) {
+        testLayerGrad(config, "fc", 100, /* trans */ false, useGpu,
+                      /* weight */ true);
+      }
+    }
+
+If you are creating a new file for the test, such as :code:`paddle/legacy/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/legacy/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake.
+
+.. code-block:: bash
+
+    add_unittest_without_exec(test_FCGrad
+        test_FCGrad.cpp
+        LayerGradUtil.cpp
+        TestUtil.cpp)
+
+    add_test(NAME test_FCGrad
+        COMMAND test_FCGrad)
+
+
+Implement Python Wrapper
+========================
+
+Implementing Python wrapper allows us to use the added layer in configuration files. All the Python wrappers are in file :code:`python/paddle/legacy/trainer/config_parser.py`. An example of the Python wrapper for fully connected layer is listed below. It has the following steps:
+
+- Use :code:`@config_layer('fc')` as the decorator of the Python wrapper class. :code:`fc` is the identifier of the layer.
+- Implement the :code:`__init__` constructor function.
+	- It first calls the base constructor function :code:`super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)`. :code:`FCLayer` is the Python wrapper class name, and :code:`fc` is the layer identifier name. They must be correct in order for the wrapper to work.
+	- Then it computes the size and format (whether sparse) of each transformation matrix as well as the size.
+
+.. code-block:: python
+
+    @config_layer('fc')
+    class FCLayer(LayerBase):
+        def __init__(
+                self,
+                name,
+                size,
+                inputs,
+                bias=True,
+                **xargs):
+            super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
+            for input_index in xrange(len(self.inputs)):
+                input_layer = self.get_input_layer(input_index)
+                psize = self.config.size * input_layer.size
+                dims = [input_layer.size, self.config.size]
+                format = self.inputs[input_index].format
+                sparse = format == "csr" or format == "csc"
+                if sparse:
+                    psize = self.inputs[input_index].nnz
+                self.create_input_parameter(input_index, psize, dims, sparse, format)
+            self.create_bias_parameter(bias, self.config.size)
+
+In the network configuration, the layer can be specified using the following code snippet. The arguments of this class are:
+
+- :code:`name` is the name identifier of the layer instance.
+- :code:`type` is the type of the layer, specified using layer identifier.
+- :code:`size` is the output size of the layer.
+- :code:`bias` specifies whether this layer instance has bias.
+- :code:`inputs` specifies a list of layer instance names as inputs.
+
+.. code-block:: python
+
+    Layer(
+        name = "fc1",
+        type = "fc",
+        size = 64,
+        bias = True,
+        inputs = [Input("pool3")]
+    )
+
+You are also recommended to implement a helper for the Python wrapper, which makes it easier to write models. You can refer to :code:`python/paddle/trainer_config_helpers/layers.py` for examples.
diff --git a/doc/v2/dev/src/FullyConnected.jpg b/doc/v2/dev/src/FullyConnected.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b2241f401434e527f95ee4e0e541a3f2ff78fd1e
Binary files /dev/null and b/doc/v2/dev/src/FullyConnected.jpg differ
diff --git a/doc/v2/dev/src/doc_en.png b/doc/v2/dev/src/doc_en.png
new file mode 100644
index 0000000000000000000000000000000000000000..ed6b9178fba91a3bdf45ae797a9924f84146fbc8
Binary files /dev/null and b/doc/v2/dev/src/doc_en.png differ
diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4231f2bb5cd800c0cd86835b5d07e491fcde4989
--- /dev/null
+++ b/doc/v2/dev/write_docs_cn.rst
@@ -0,0 +1,136 @@
+#############
+如何贡献文档
+#############
+
+PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成的，PaddlePaddle.org工具可以帮助我们实现这一编译过程，并提供更好的预览效果。
+
+如何构建文档
+============
+
+PaddlePaddle的文档构建有两种方式，分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具，两种方式都有各自的优点，前者方便预览，后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。
+
+我们建议使用PaddlePaddle.org工具来构建文档。
+
+使用PaddlePaddle.org工具
+------------------------
+这个是目前推荐的使用方法。除了可以自动编译文档，还可以直接在网页中预览文档，需要注意的是，采用后续说明的其它方式虽然也可以预览文档，但是文档的样式与官网文档是不一致的，使用PaddlePaddle.org工具进行编译才能产生与官网文档样式一致的预览效果。
+
+PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后即可用以下命令启动工具
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令。
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+
+如果不想使用Docker，你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+
+工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请将PaddlePaddle工作目录指定给环境变量 CONTENT_DIR。
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+
+想了解更多PaddlePaddle.org工具的详细信息，可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。
+
+不使用PaddlePaddle.org工具
+--------------------------
+
+使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。该方法与 `从源码编译PaddlePaddle <http://paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html>`_ 相似，通过从源码中构建可用于编译PaddlePaddle文档的Docker镜像并运行，在进入Docker容器后使用源码中的脚本构建PaddlePaddle文档，具体步骤如下：
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+
+   # 从源码中构建可用于编译PaddlePaddle文档的Docker镜像
+   docker build -t paddle:dev .
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+   # 进入Docker容器后使用build.sh脚本构建PaddlePaddle文档
+   bash -x /paddle/paddle/scripts/docker/build.sh
+
+注：上述命令把当前目录（源码根目录）映射为 container 里的 :code:`/paddle` 目录。
+
+编译完成后，会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录，在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 、 ``api/en/html`` 共三个子目录，分别进入这些目录下，执行以下命令：
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。
+
+如果不想使用Docker，也可以使用以下命令直接构建PaddlePaddle文档，即
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   mkdir -p build
+   cd build
+   cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+
+   # 如果只需要构建使用文档，则执行以下命令
+   make -j $processors paddle_docs
+
+   # 如果只需要构建API，则执行以下命令
+   make -j $processors paddle_apis
+
+其中$processors代表启动和CPU核一样多的进程来并行编译，可以根据本机的CPU核数设置相应的值。
+
+编译完成后，同样会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录，如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录，选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录，分别进入这些子目录下，执行以下命令：
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。下图为生成的 ``v2`` 英文文档首页示例。注意，示例中由于使用了sphinx的原始主题，所以页面的风格与官网并不一致，但这并不影响开发者进行调试。
+
+..  image:: src/doc_en.png
+    :align: center
+    :scale: 60 %
+
+如何书写文档
+============
+
+PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程进行书写。
+
+如何更新www.paddlepaddle.org
+============================
+
+更新的文档以PR的形式提交到github中，提交方式参见 `如何贡献文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/dev/write_docs_cn.html>`_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ 和
+`英文文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。
+
+
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/v2/dev/write_docs_en.rst b/doc/v2/dev/write_docs_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6105455e202e4704aa25f0fd9916b9b61a569702
--- /dev/null
+++ b/doc/v2/dev/write_docs_en.rst
@@ -0,0 +1,139 @@
+########################
+Contribute Documentation
+########################
+
+PaddlePaddle's documentation includes both Chinese and English versions. The documentation is built using the ``cmake`` command to drive the ``sphinx`` compiler. The PaddlePaddle.org tool helps us to implement this compilation process and provides better preview results.
+
+How to build Documentation
+===========================
+
+PaddlePaddle's documentation is built in two ways: using the PaddlePaddle.org tool and without using it. Both methods have their own advantages. The former facilitates previewing, while the latter facilitates debugging by the developer. We could choose to build the documentation with Docker or without it in each of the above ways.
+
+We recommend using PaddlePaddle.org tool to build documentation.
+
+Using PaddlePaddle.org tool
+-----------------------------
+This is the recommended method to build documentation, because it can automatically compile the documentation and preview the documentation directly in a web page. Note that, although you can preview the documentation in other ways, its style may not be consistent with the official website. Compiling with the PaddlePaddle.org tool produces a preview that will be consistent with the official website documentation style.
+
+The PaddlePaddle.org tool can be used with Docker and Docker needs to be installed first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After installing Docker, you may use the following commands to activate the tool
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories. You may only clone the contents you need
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command.
+Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation.
+The compiled documentation will be stored in <paddlepaddle working directory>/.ppo_workspace/content.
+
+
+If you don't wish to use Docker, you can also activate the tool through Django. Use the following commands to set it up:
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+
+Specify the PaddlePaddle working directory for the environment variable CONTENT_DIR so that the tool could find where the working directory is.
+
+Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation.
+The compiled documentation will be stored in <paddlepaddle working directory>/.ppo_workspace/content.
+
+Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ for more information about the PaddlePaddle.org tool.
+
+
+Manually Building the Documentation
+-------------------------------------
+
+To build PaddlePaddle's documentation with Docker, you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. This method is quite similar to `Build From Sources <http://paddlepaddle.org/docs/develop/documentation/en/build_and_install/build_from_source_en.html>`_ , by constructing, from source code, a Docker image that can be used to build the PaddlePaddle documentation. Enter the Docker container and use the script ``build.sh`` in the source directory to build the PaddlePaddle documentation. The specific steps are as follows:
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+
+   # Construct a docker image from source code
+   docker build -t paddle:dev .
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+   # Use build.sh to build PaddlePaddle documentation
+   bash -x /paddle/paddle/scripts/docker/build.sh
+
+Note: The above commands map the current directory (source root directory) to the :code:`/paddle` directory in the container.
+
+After compiling, there should be two generated directories: ``doc/v2`` and ``doc/fluid``, where three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following commands:
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+Use a web browser and navigate to http://localhost:8088, and you will see the compiled ``v2`` and ``fluid`` Chinese/English documentation pages and English API pages.
+
+If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
+
+.. code-block:: bash
+
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   mkdir -p build
+   cd build
+   cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+
+   # If you only need to build documents, use the following commands
+   make -j $processors paddle_docs
+
+   # If you only need to build APIs, use the following commands
+   make -j $processors paddle_apis
+
+$processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine.
+
+After compiling, there should also be two generated directories: ``doc/v2`` and ``doc/fluid`` . If you chose to build documents, two subdirectories ``cn/html/`` and ``en/html`` will be generated in both directories. If you chose to build APIs, a subdirectory ``api/en/html`` will be generated in each of them. Please enter these directories respectively and execute the following commands:
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+Use a web browser and navigate to http://localhost:8088, and you will see the compiled ``v2`` and ``fluid`` Chinese/English documentation pages and English API pages. The following figure is an example of the home page of the built ``v2`` English documentation. Note that because the example uses the original sphinx theme, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
+
+..  image:: src/doc_en.png
+    :align: center
+    :scale: 60 %
+
+How to write Documentation
+===========================
+
+PaddlePaddle uses `sphinx`_ to compile documentation. Please check the sphinx official website for more details.
+
+How to update www.paddlepaddle.org
+===================================
+
+Please create PRs and submit them to GitHub; see `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ for details.
+The PaddlePaddle develop branch updates the documentation automatically once the PR is merged. Users may check the latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
+`English Docs <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ .
+
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/v2/faq/build_and_install/index_cn.rst b/doc/v2/faq/build_and_install/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0d644777287aea0a572adb6fa40f498f9c147af7
--- /dev/null
+++ b/doc/v2/faq/build_and_install/index_cn.rst
@@ -0,0 +1,224 @@
+.. _install_faq:
+
+###################
+编译安装与单元测试
+###################
+
+..  contents::
+
+1. 运行Docker GPU镜像出现 "CUDA driver version is insufficient"
+----------------------------------------------------------------
+
+用户在使用PaddlePaddle GPU的Docker镜像的时候，常常出现 `Cuda Error: CUDA driver version is insufficient for CUDA runtime version`, 原因在于没有把机器上CUDA相关的驱动和库映射到容器内部。
+具体的解决方法是：
+
+..  code-block:: bash
+
+    $ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/docs/0.11.0/documentation/zh/getstarted/build_and_install/docker_install_cn.html>`_ 。
+
+
+2. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致
+----------------------------------------------------------------
+
+这是目前CMake寻找Python的逻辑存在缺陷，如果系统安装了多个Python版本，CMake找到的Python库和Python解释器版本可能有不一致现象，导致编译PaddlePaddle失败。正确的解决方法是，
+用户强制指定特定的Python版本，具体操作如下：
+
+    ..  code-block:: bash
+
+        cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path>  -DPYTHON_INCLUDE_DIR=<inc_path>
+
+用户需要指定本机上Python的路径：``<exc_path>``, ``<lib_path>``, ``<inc_path>``
+
+3. CMake源码编译，Paddle版本号为0.0.0
+--------------------------------------
+
+如果运行 :code:`paddle version`, 出现 :code:`PaddlePaddle 0.0.0`；或者运行 :code:`cmake ..`，出现
+
+..  code-block:: bash
+
+    CMake Warning at cmake/version.cmake:20 (message):
+      Cannot add paddle version from git tag
+
+那么用户需要拉取所有的远程分支到本机，命令为 :code:`git fetch upstream`，然后重新cmake即可。
+
+4. paddlepaddle\*.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
+
+出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。最新的paddlepaddle python安装包支持Linux x86_64和MacOS 10.12操作系统，并安装了python 2.7和pip 9.0.1。
+
+更新 :code:`pip` 包的方法是\:
+
+..  code-block:: bash
+
+    pip install --upgrade pip
+
+如果还不行，可以执行 :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` 获取当前系统支持的python包的后缀，
+并对比是否和正在安装的后缀一致。
+
+如果系统支持的是 :code:`linux_x86_64` 而安装包是 :code:`manylinux1_x86_64` ，需要升级pip版本到最新；
+如果系统支持 :code:`manylinux1_x86_64` 而安装包（本地）是 :code:`linux_x86_64` ，可以重命名这个whl包为 :code:`manylinux1_x86_64` 再安装。
+
+5. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2
+------------------------------------------------------------------------------------------
+先查看一下是否曾经安装过paddle v1版本，有的话需要先卸载：
+
+pip uninstall py_paddle paddle
+
+然后安装paddle的python环境, 在build目录下执行
+
+pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
+
+6. 遇到“非法指令”或者是“illegal instruction”
+--------------------------------------------
+
+PaddlePaddle使用avx SIMD指令提高cpu执行效率，因此错误的使用二进制发行版可能会导致这种错误，请选择正确的版本。
+
+7.  python相关的单元测试都过不了
+--------------------------------
+
+如果出现以下python相关的单元测试都过不了的情况：
+
+..  code-block:: bash
+
+    24 - test_PyDataProvider (Failed)
+    26 - test_RecurrentGradientMachine (Failed)
+    27 - test_NetworkCompare (Failed)
+    28 - test_PyDataProvider2 (Failed)
+    32 - test_Prediction (Failed)
+    33 - test_Compare (Failed)
+    34 - test_Trainer (Failed)
+    35 - test_TrainerOnePass (Failed)
+    36 - test_CompareTwoNets (Failed)
+    37 - test_CompareTwoOpts (Failed)
+    38 - test_CompareSparse (Failed)
+    39 - test_recurrent_machine_generation (Failed)
+    40 - test_PyDataProviderWrapper (Failed)
+    41 - test_config_parser (Failed)
+    42 - test_swig_api (Failed)
+    43 - layers_test (Failed)
+
+并且查询PaddlePaddle单元测试的日志，提示：
+
+..  code-block:: bash
+
+    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+
+解决办法是：
+
+* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包，使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面，单元测试会引用site-packages里面的python包，而不是源码目录里 :code:`/python` 目录下的python包。同时，即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用，因为python的搜索路径是优先已经安装的python包。
+
+8. 下载MKLML库失败
+------------------
+
+..  code-block:: bash
+
+    make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] 错误 4
+    make[1]: *** [CMakeFiles/extern_mklml.dir/all] 错误 2
+    make[1]: *** 正在等待未完成的任务....
+
+原因：网速或SSL链接原因，导致MKLML库下载不成功。
+
+解决办法是：手动下载并安装，具体步骤如下。
+
+..  code-block:: bash
+
+    // 1. 进入对应的目录
+    cd build/third_party/mklml/src/extern_mklml
+
+    // 2. 查看包的大小， 正常情况下是75M，如果小于75M，即下载失败：
+    du -sh mklml_lnx_2018.0.1.20171007.tgz
+
+    // 3. 手动下载且解压缩，并手动生成download成功标签：
+    wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz 
+    tar zxf mklml_lnx_2018.0.1.20171007.tgz
+    touch ../extern_mklml-stamp/extern_mklml-download
+
+    // 4. 接着编译即可
+
+9. 在Mac上无法安装numpy等Python包，权限错误
+--------------------------------------------
+
+Mac上对自带的Python和包有严格的权限保护，最好不要在自带的Python上安装。建议用virtualenv建立一个新的Python环境来操作。
+
+virtualenv的基本原理是将机器上的Python运行所需的运行环境完整地拷贝一份。我们可以在一台机器上制造多份拷贝，并在这多个拷贝之间自由切换，这样就相当于在一台机器上拥有了多个相互隔离、互不干扰的Python环境。
+
+下面简单介绍下如何用virtualenv为Paddle生成一个专用的Python环境：
+
+安装virtualenv：
+::::::::::::::::
+
+virtualenv本身也是Python的一个包，可以用pip进行安装：
+
+..  code-block:: bash
+
+    sudo -H pip install virtualenv
+
+由于virtualenv需要安装给系统自带的Python，因此需要使用sudo权限。
+
+创建一个新的Python运行环境：
+::::::::::::::::::::::::::::
+
+..  code-block:: bash
+
+    virtualenv --no-site-packages paddle
+
+--no-site-packages 参数表示不拷贝已有的任何第三方包，创造一个完全干净的新Python环境。后面的paddle是我们为这个新创建的环境取的名字。
+
+执行完这一步后，当前目录下应该会出现一个名为paddle（或者你取的其他名字）的目录。这个目录里保存了运行一个Python环境所需要的各种文件。
+
+启动运行环境：
+::::::::::::::::
+
+..  code-block:: bash
+
+    source paddle/bin/activate
+
+执行后会发现命令提示符前面增加了(paddle)字样，说明已经成功启动了名为‘paddle’的Python环境。执行which python，可以发现使用的已经是刚刚创建的paddle目录下的Python。
+
+在这个环境中，我们可以自由地进行Paddle的安装、使用和开发工作，无需担心对系统自带Python的影响。
+
+退出运行环境：
+:::::::::::::::
+
+直接执行：
+
+..  code-block:: bash
+
+    deactivate
+
+可以看到命令提示符前面的(paddle)字样消失。
+
+自动启动某一Python环境：
+::::::::::::::::::::::::
+
+如果我们经常使用Paddle，我们每次打开终端后都需要执行一下source paddle/bin/activate来启动环境，比较繁琐。为了简便，可以修改终端的配置文件，来让终端每次启动后自动启动特定的Python环境。
+
+执行:
+
+..  code-block:: bash
+
+    vi ~/.bash_profile
+
+打开终端配置文件，并在文件的最后添加一行：
+
+..  code-block:: bash
+
+    source paddle/bin/activate
+
+保存并关闭文件。
+
+这样，每次打开终端时就会自动启动名为‘paddle’的Python环境了。
+
+10. 通过pip安装的PaddlePaddle在  :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so`
+------------------------------------------------------------------------------------------------------------------------
+出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`，
+但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`
+拷贝到 :code:`/usr/local/lib` 路径下，所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下，
+即： :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。
+
+**注意**：如果是在虚拟环境中安装PaddlePaddle， :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。
\ No newline at end of file
diff --git a/doc/v2/faq/build_and_install/index_en.rst b/doc/v2/faq/build_and_install/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7488ed8137d57785f36b9f1e1ed1269f864960bc
--- /dev/null
+++ b/doc/v2/faq/build_and_install/index_en.rst
@@ -0,0 +1,143 @@
+.. _install_faq:
+
+###############################
+Compile, Install, and Unit Test
+###############################
+
+..  contents::
+
+1. Insufficient CUDA driver version
+----------------------------------------------------------------
+
+Many users usually face issues like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The cause is that you may not map the local CUDA driver to a container directory.
+You can solve the issue by running the following commands:
+
+..  code-block:: bash
+
+    $ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+For more information about Docker's installation and usage, please refer to `PaddlePaddle Docker documentation <http://www.paddlepaddle.org/docs/0.11.0/documentation/zh/getstarted/build_and_install/docker_install_en.html>`_ .
+
+
+2. Version mismatch between PythonLibs and PythonInterpreter
+----------------------------------------------------------------
+
+This is caused by a flaw in how CMake looks up Python. If multiple versions of Python are installed, CMake may find mismatched versions of PythonLibs and PythonInterpreter, which makes the build fail. The fix is to explicitly specify the Python version to use, as follows.
+
+    ..  code-block:: bash
+
+        cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path>  -DPYTHON_INCLUDE_DIR=<inc_path>
+
+You should specify ``<exc_path>``, ``<lib_path>``, ``<inc_path>`` to your local paths.
+
+3. PaddlePaddle version is 0.0.0
+------------------------------------------------
+This issue occurs when running :code:`paddle version` reports :code:`PaddlePaddle 0.0.0`, or when running :code:`cmake ..` prints the following warning:
+
+..  code-block:: bash
+
+    CMake Warning at cmake/version.cmake:20 (message):
+      Cannot add paddle version from git tag
+
+You should pull all remote branches to your local machine with the command :code:`git fetch upstream` and then run :code:`cmake` again.
+
+4. paddlepaddle\*.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
+
+The primary cause of this issue is that no PaddlePaddle installation package matching your current system can be found. The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12, with Python 2.7 and pip 9.0.1.
+
+You can upgrade Pip with the following command\:
+
+..  code-block:: bash
+
+    pip install --upgrade pip
+
+If it does not work for you, you can run the command :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to get the suffix of Python package which your system may support and then compare it with the suffix of your installation.
+
+If the system supports :code:`linux_x86_64` and the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest version.
+
+If the system supports :code:`manylinux1_x86_64` and the local installation package is :code:`linux_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again.
+
+
+5. ImportError: No module named v2
+----------------------------------
+Please uninstall Paddle V1 if you have installed it before.
+
+..  code-block:: bash
+
+    pip uninstall py_paddle paddle
+
+Then install the PaddlePaddle Python packages: enter the build directory and run the following command:
+
+pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
+
+6. Illegal instruction
+-----------------------
+This issue may be caused by the wrong usage of PaddlePaddle binary version which uses avx SIMD instructions to increase the performance of cpu. Please choose the correct version.
+
+7.  Python unittest fails
+--------------------------------
+
+If the following python unittest testcases fail:
+
+..  code-block:: bash
+
+    24 - test_PyDataProvider (Failed)
+    26 - test_RecurrentGradientMachine (Failed)
+    27 - test_NetworkCompare (Failed)
+    28 - test_PyDataProvider2 (Failed)
+    32 - test_Prediction (Failed)
+    33 - test_Compare (Failed)
+    34 - test_Trainer (Failed)
+    35 - test_TrainerOnePass (Failed)
+    36 - test_CompareTwoNets (Failed)
+    37 - test_CompareTwoOpts (Failed)
+    38 - test_CompareSparse (Failed)
+    39 - test_recurrent_machine_generation (Failed)
+    40 - test_PyDataProviderWrapper (Failed)
+    41 - test_config_parser (Failed)
+    42 - test_swig_api (Failed)
+    43 - layers_test (Failed)
+
+Please check the PaddlePaddle unittest logs which may suggest the following:
+
+..  code-block:: bash
+
+    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+
+The solution is:
+
+* Remove old PaddlePaddle to make a clean environment for the unit tests. If PaddlePaddle package is already in Python's site-packages, unit tests would refer Python package in site-packages instead of Python package in the :code:`/python` directory of the source directory.  Setting :code:`PYTHONPATH` to :code:`/python` is also useless because Python's search path would give the priority to the installed Python package.
+
+
+8. Failed to download the MKLML library
+----------------------------------------------
+
+..  code-block:: bash
+
+    make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4
+    make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2
+    make[1]: *** waiting for the unfinished  jobs....
+
+Cause: The network speed or SSL link causes the MKLML library to download unsuccessfully.
+
+The solution is: manually download and install, the specific steps are as follows.
+
+..  code-block:: bash
+
+    // 1. enter the directory
+    cd build/third_party/mklml/src/extern_mklml
+
+    // 2. check the size of the package, normally 75M, if less than 75M, the download fails
+    du -sh mklml_lnx_2018.0.1.20171007.tgz
+
+    // 3. manually download and unzip and make the download success tag:
+    wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz 
+    tar zxf mklml_lnx_2018.0.1.20171007.tgz
+    touch ../extern_mklml-stamp/extern_mklml-download
+
+    // 4. then compile
+    
diff --git a/doc/v2/faq/cluster/index_cn.rst b/doc/v2/faq/cluster/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e59c1e1a54a0c876d1e6e89f88030de59fb9fc1a
--- /dev/null
+++ b/doc/v2/faq/cluster/index_cn.rst
@@ -0,0 +1,17 @@
+###############
+集群训练与预测
+###############
+
+..  contents::
+
+1. 集群多节点训练，日志中保存均为网络通信类错误
+------------------------------------------------
+
+集群多节点训练，日志报错为网络通信类错误，比如 :code:`Connection reset by peer` 等。
+此类报错通常是由于某一个节点的错误导致这个节点的训练进程退出，从而引发其他节点无法连接导致，可以参考下面的步骤排查：
+
+* 从 :code:`train.log` ， :code:`server.log` 找到最早报错的地方，查看是否是其他错误引发的报错（比如FPE，内存不足，磁盘空间不足等）。
+
+* 如果发现最早的报错就是网络通信的问题，很有可能是非独占方式执行导致的端口冲突，可以联系OP，看当前MPI集群是否支持resource=full参数提交，如果支持增加此参数提交，并更换job 端口。
+
+* 如果当前MPI集群并不支持任务独占模式，可以联系OP是否可以更换集群或升级当前集群。
diff --git a/doc/v2/faq/cluster/index_en.rst b/doc/v2/faq/cluster/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fa942a09625bef78b28456beeb735272b686e061
--- /dev/null
+++ b/doc/v2/faq/cluster/index_en.rst
@@ -0,0 +1,16 @@
+###############################
+Cluster Training and Prediction
+###############################
+
+.. contents::
+
+1. Network connection errors in the log during multi-node cluster training
+------------------------------------------------------------------------------------------
+During multi-node cluster training, the log may contain network connection errors, for example, :code:`Connection reset by peer`.
+This kind of error is usually caused by the abnormal exit of a training process in some node, and the other nodes cannot connect with this node any longer. Steps to troubleshoot the problem are as follows:
+
+* Find the earliest error in :code:`train.log` and :code:`server.log`, and check whether another fault caused the problem, such as an FPE, insufficient memory, or insufficient disk space.
+
+* If the first error in server.log says "Address already used", this may be caused by a port conflict due to non-exclusive execution. Contact the system administrator to check whether the current MPI cluster supports jobs submitted with the parameter :code:`resource=full`. If the current MPI cluster does not support this parameter, change the server port and try again.
+
+* If the current MPI cluster does not support an exclusive mode that allows a process to occupy a whole node, ask the administrator to replace or upgrade the cluster.
diff --git a/doc/v2/faq/index_cn.rst b/doc/v2/faq/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4537c7a481e2efbcfed5fa7be2c81c36e13cd108
--- /dev/null
+++ b/doc/v2/faq/index_cn.rst
@@ -0,0 +1,13 @@
+FAQ
+====
+
+本文档对关于PaddlePaddle的一些常见问题提供了解答。如果您的问题未在此处，请您到 `PaddlePaddle社区 <https://github.com/PaddlePaddle/Paddle/issues>`_ 查找答案或直接提 `issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_ ，我们会及时进行回复。
+ 
+..  toctree::
+  :maxdepth: 1
+
+  build_and_install/index_cn.rst
+  model/index_cn.rst
+  parameter/index_cn.rst
+  local/index_cn.rst
+  cluster/index_cn.rst
diff --git a/doc/v2/faq/index_en.rst b/doc/v2/faq/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3fa220792b252617848a1c76bc2be49928e35f64
--- /dev/null
+++ b/doc/v2/faq/index_en.rst
@@ -0,0 +1,13 @@
+FAQ
+====
+
+This document provides answers to some of the frequently asked questions about PaddlePaddle. If your question is not covered here, please search the `PaddlePaddle Community <https://github.com/PaddlePaddle/Paddle/issues>`_ for an answer or submit a new `issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_, and we will reply promptly.
+
+..  toctree::
+  :maxdepth: 1
+
+  build_and_install/index_en.rst
+  model/index_en.rst
+  parameter/index_en.rst
+  local/index_en.rst
+  cluster/index_en.rst
diff --git a/doc/v2/faq/local/index_cn.rst b/doc/v2/faq/local/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c6d3c5bfac5a276e253c248ffd415c7789b20b29
--- /dev/null
+++ b/doc/v2/faq/local/index_cn.rst
@@ -0,0 +1,259 @@
+###############
+本地训练与预测
+###############
+
+..  contents::
+
+1. 如何减少内存占用
+-------------------
+
+神经网络的训练本身是一个非常消耗内存和显存的工作，经常会消耗数10GB的内存和数GB的显存。
+PaddlePaddle的内存占用主要分为如下几个方面\:
+
+* DataProvider缓冲池内存（只针对内存）
+* 神经元激活内存（针对内存和显存）
+* 参数内存 （针对内存和显存）
+* 其他内存杂项
+
+其中，其他内存杂项是指PaddlePaddle本身所用的一些内存，包括字符串分配，临时变量等等，暂不考虑在内。
+
+减少DataProvider缓冲池内存
+++++++++++++++++++++++++++
+
+PyDataProvider使用的是异步加载，同时在内存里直接随机选取数据来做Shuffle。即
+
+..  graphviz::
+
+    digraph {
+        rankdir=LR;
+        数据文件 -> 内存池 -> PaddlePaddle训练
+    }
+
+所以，减小这个内存池即可减小内存占用，同时也可以加速开始训练前数据载入的过程。但是，这
+个内存池实际上决定了shuffle的粒度。所以，如果将这个内存池减小，又要保证数据是随机的，
+那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为
+
+..  literalinclude:: src/reduce_min_pool_size.py
+
+这样做可以极大的减少内存占用，并且可能会加速训练过程，详细文档参考 :ref:`api_pydataprovider2` 。
+
+神经元激活内存
+++++++++++++++
+
+神经网络在训练的时候，会对每一个激活暂存一些数据，如神经元激活值等。
+在反向传递的时候，这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系，
+一是batch size，另一个是每条序列(Sequence)长度。所以，其实也是和每个mini-batch中包含
+的时间步信息成正比。
+
+所以做法可以有两种：
+
+* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数，减小batch size可能会对训练结果产生影响。
+* 减小序列的长度，或者直接扔掉非常长的序列。比如，一个数据集大部分序列长度是100-200,
+  但是突然有一个10000长的序列，就很容易导致内存超限，特别是在LSTM等RNN中。
+
+参数内存
+++++++++
+
+PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需要使用不同大小的内存。
+例如使用 :code:`adadelta` 算法，则需要使用等于权重参数规模大约5倍的内存。举例，如果参数保存下来的模型目录
+文件为 :code:`100M`， 那么该优化算法至少需要 :code:`500M` 的内存。
+
+可以考虑使用一些优化算法，例如 :code:`momentum`。
+
+2. 如何加速训练速度
+-------------------
+
+加速PaddlePaddle训练可以考虑从以下几个方面\：
+
+* 减少数据载入的耗时
+* 加速训练速度
+* 利用分布式训练驾驭更多的计算资源
+
+减少数据载入的耗时
+++++++++++++++++++
+
+使用\ :code:`pydataprovider`\ 时，可以减少缓存池的大小，同时设置内存缓存功能，即可以极大的加速数据载入流程。
+:code:`DataProvider` 缓存池的减小，和之前通过减小缓存池来减小内存占用的原理一致。
+
+..  literalinclude:: src/reduce_min_pool_size.py
+
+同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法，将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话，会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里，在之后的 :code:`pass` 中，不会再从 :code:`python` 端读取数据，而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。
+
+
+加速训练速度
+++++++++++++
+
+PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任意一种。同时，与这个训练数据交互的Layer，需要将其Parameter设置成 sparse 更新模式，即设置 :code:`sparse_update=True` 。
+
+这里使用简单的 :code:`word2vec` 训练语言模型举例，具体使用方法为\:
+
+使用一个词前两个词和后两个词，来预测这个中间的词。这个任务的DataProvider为\:
+
+..  literalinclude:: src/word2vec_dataprovider.py
+
+这个任务的配置为\:
+
+..  literalinclude:: src/word2vec_config.py
+
+
+利用更多的计算资源
+++++++++++++++++++
+
+利用更多的计算资源可以分为以下几个方式来进行\:
+
+* 单机CPU训练
+
+  * 使用多线程训练。设置命令行参数 :code:`trainer_count`。
+
+* 单机GPU训练
+
+  * 使用显卡训练。设置命令行参数 :code:`use_gpu`。
+  * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。
+
+* 多机训练
+
+  * 请参考 :ref:`cluster_train` 。
+
+3. 如何指定GPU设备
+------------------
+
+例如机器上有4块GPU，编号从0开始，指定使用2、3号GPU：
+
+* 方式1：通过 `CUDA_VISIBLE_DEVICES <http://www.acceleware.com/blog/cudavisibledevices-masking-gpus>`_ 环境变量来指定特定的GPU。
+
+..      code-block:: bash
+
+        env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2
+
+* 方式2：通过命令行参数 ``--gpu_id`` 指定。
+
+..      code-block:: bash
+
+        paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
+
+
+4. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办?
+------------------------------------------------------------------------
+
+Paddle二进制在运行时捕获了浮点数异常，只要出现浮点数异常(即训练过程中出现NaN或者Inf)，立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。
+主要原因包括三个方面:
+
+* 训练过程中参数或者训练过程中的梯度尺度过大，导致参数累加，乘除等时候，导致了浮点数溢出。
+* 模型一直不收敛，发散到了一个数值特别大的地方。
+* 训练数据有问题，导致参数收敛到了一些奇异的情况。或者输入数据尺度过大，有些特征的取值达到数百万，这时进行矩阵乘法运算就可能导致浮点数溢出。
+
+这里有两种有效的解决方法：
+
+1. 设置 :code:`gradient_clipping_threshold` 参数，示例代码如下：
+
+..  code-block:: python
+
+    optimizer = paddle.optimizer.RMSProp(
+        learning_rate=1e-3,
+        gradient_clipping_threshold=10.0,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+具体可以参考  `nmt_without_attention  <https://github.com/PaddlePaddle/models/blob/develop/nmt_without_attention/train.py#L35>`_ 示例。
+
+2. 设置 :code:`error_clipping_threshold` 参数，示例代码如下：
+
+..  code-block:: python
+
+    decoder_inputs = paddle.layer.fc(
+        act=paddle.activation.Linear(),
+        size=decoder_size * 3,
+        bias_attr=False,
+        input=[context, current_word],
+        layer_attr=paddle.attr.ExtraLayerAttribute(
+            error_clipping_threshold=100.0))
+
+完整代码可以参考示例 `machine translation <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py#L66>`_ 。
+
+两种方法的区别：
+
+1. 两者都是对梯度的截断，但截断时机不同，前者在 :code:`optimizer` 更新网络参数时应用；后者在激活函数反向计算时被调用；
+2. 截断对象不同：前者截断可学习参数的梯度，后者截断回传给前层的梯度;
+
+除此之外，还可以通过减小学习率或者对数据进行归一化处理来解决这类问题。
+
+5.  如何调用 infer 接口输出多个layer的预测结果
+-----------------------------------------------
+
+* 将需要输出的层作为 :code:`paddle.inference.Inference()` 接口的 :code:`output_layer` 参数输入，代码如下：
+
+..  code-block:: python
+
+    inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters)
+
+* 指定要输出的字段进行输出。以输出 :code:`value` 字段为例，代码如下：
+
+..  code-block:: python
+
+    out = inferer.infer(input=data_batch, field=["value"])
+
+需要注意的是：
+
+* 如果指定了2个layer作为输出层，实际上需要的输出结果是两个矩阵；
+* 假设第一个layer的输出A是一个 N1 * M1 的矩阵，第二个 Layer 的输出B是一个 N2 * M2 的矩阵；
+* paddle.v2 默认会将A和B 横向拼接，当N1 和 N2 大小不一样时，会报如下的错误：
+
+..      code-block:: python
+
+    ValueError: all the input array dimensions except for the concatenation axis must match exactly
+
+多个层的输出矩阵的高度不一致导致拼接失败，这种情况常常发生在：
+
+* 同时输出序列层和非序列层；
+* 多个输出层处理多个不同长度的序列;
+
+此时可以在调用infer接口时通过设置 :code:`flatten_result=False` , 跳过“拼接”步骤，来解决上面的问题。这时，infer接口的返回值是一个python list:
+
+* list 中元素的个数等于网络中输出层的个数；
+* list 中每个元素是一个layer的输出结果矩阵，类型是numpy的ndarray；
+* 每一个layer输出矩阵的高度，在非序列输入时：等于样本数；序列输入时等于：输入序列中元素的总数；宽度等于配置中layer的size；
+
+6.  如何在训练过程中获得某一个layer的output
+-----------------------------------------------
+
+可以在event_handler中，通过 :code:`event.gm.getLayerOutputs("layer_name")` 获得在模型配置中某一层的name :code:`layer_name` 在当前
+mini-batch forward的output的值。获得的值类型均为 :code:`numpy.ndarray` ，可以通过这个输出来完成自定义的评估指标计算等功能。例如下面代码：
+
+..      code-block:: python
+
+        def score_diff(right_score, left_score):
+            return np.average(np.abs(right_score - left_score))
+
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 25 == 0:
+                    diff = score_diff(
+                        event.gm.getLayerOutputs("right_score")["right_score"][
+                            "value"],
+                        event.gm.getLayerOutputs("left_score")["left_score"][
+                            "value"])
+                    logger.info(("Pass %d Batch %d : Cost %.6f, "
+                                "average absolute diff scores: %.6f") %
+                                (event.pass_id, event.batch_id, event.cost, diff))
+
+注意：此方法不能获取 :code:`paddle.layer.recurrent_group` 里step的内容，但可以获取 :code:`paddle.layer.recurrent_group` 的输出。
+
+7.  如何在训练过程中获得参数的权重和梯度
+-----------------------------------------------
+
+在某些情况下，获得当前mini-batch的权重（或称作weights, parameters）有助于在训练时观察具体数值，方便排查以及快速定位问题。
+可以通过在 :code:`event_handler` 中打印其值（注意，需要使用 :code:`paddle.event.EndForwardBackward` 保证使用GPU训练时也可以获得），
+示例代码如下：
+
+..      code-block:: python
+
+        ...
+        parameters = paddle.parameters.create(cost)
+        ...
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndForwardBackward):
+                if event.batch_id % 25 == 0:
+                    for p in parameters.keys():
+                        logger.info("Param %s, Grad %s",
+                            parameters.get(p), parameters.get_grad(p))
+
+注意：“在训练过程中获得某一个layer的output”和“在训练过程中获得参数的权重和梯度”都会造成训练中的数据从C++拷贝到numpy，会对训练性能造成影响。不要在注重性能的训练场景下使用。
\ No newline at end of file
diff --git a/doc/v2/faq/local/index_en.rst b/doc/v2/faq/local/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fa95b1753dbe293811d7a8601497ad521fa3ecda
--- /dev/null
+++ b/doc/v2/faq/local/index_en.rst
@@ -0,0 +1,248 @@
+#############################
+Local Training and Prediction
+#############################
+
+..  contents::
+
+1. Reduce Memory Consumption
+----------------------------------------
+
+Training a neural network is memory-intensive: it often consumes tens of gigabytes of host memory and several gigabytes of device memory.
+The memory consumed by the PaddlePaddle framework mainly includes:
+
+* Cache memory for DataProvider (only on host memory),
+* Memory for neurons' activation information (on both host memory and device memory),
+* Memory for parameters (on both host memory and device memory),
+* Other memory demands.
+
+Other memory demands refer to the memory used by the PaddlePaddle framework itself, such as string allocation and temporary variables; they are not considered here.
+
+Reduce DataProvider Cache Memory
+++++++++++++++++++++++++++++++++++++++++
+
+PyDataProvider loads data asynchronously and shuffles it by randomly selecting data directly from a pool in host memory:
+
+..  graphviz::
+
+    digraph {
+        rankdir=LR;
+        "Data Files" -> "Host Memory Pool" -> "PaddlePaddle Training"
+    }
+
+Thus, reducing the DataProvider cache memory reduces memory consumption and also speeds up data loading before training starts. However, the size of the memory pool determines the granularity of the shuffle, so if the pool is made smaller and the data must still be random, it is best to shuffle the data file before each read. Possible code is:
+
+..  literalinclude:: src/reduce_min_pool_size.py
+
+In this way, the memory consumption can be significantly reduced and hence the training procedure can be accelerated. More details are demonstrated in :ref:`api_pydataprovider2`.
+
+The Neurons Activation Memory
+++++++++++++++++++++++++++++++++++++++++
+
+During training, a neural network stores some temporary data for each activation, such as the activation values (the output values of neurons). These data are used to update the parameters during back propagation. The memory consumed by these data depends mainly on two parameters: the batch size and the length of each sequence. The neuron activation memory is therefore proportional to the amount of time-step information contained in each mini-batch.
+
+Two practical ways:
+
+* Reduce the batch size. Setting a smaller value in the network configuration, e.g. in :code:`settings(batch_size=1000)`, can help. However, the batch size is itself a hyperparameter of the neural network, so reducing it may affect the training result.
+* Shorten the sequences, or discard excessively long ones. For example, if most sequences in a dataset are between 100 and 200 long but one sequence has a length of 10,000, it can easily lead to OOM (out of memory), especially in RNN models such as LSTM.
+
+The Parameters Memory
+++++++++++++++++++++++++++++++++++++++++
+
+The PaddlePaddle framework supports many optimizers, and different optimizers have different memory requirements.
+For example, :code:`adadelta` needs roughly 5 times as much memory as the weight parameters occupy,
+which means that :code:`adadelta` needs at least :code:`500M` of memory
+if the saved model file containing all parameters
+is :code:`100M`.
+
+Some optimization algorithms such as :code:`momentum` are worth giving a shot.
+
+2. Tricks To Speed Up Training
+----------------------------------------
+
+Training in PaddlePaddle can be sped up in the following ways:
+
+* Reduce the time consumption of data loading
+* Speed up training epochs
+* Introduce more computing resources by using distributed training
+
+Reduce The Time Consumption of Data Loading
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+
+When using :code:`pydataprovider`, data loading can be sped up considerably by reducing the size of the cache pool and enabling the in-memory cache. Reducing the :code:`DataProvider` cache pool works on the same principle as reducing the cache pool to cut memory consumption, as described earlier.
+
+..  literalinclude:: src/reduce_min_pool_size.py
+
+In addition, the :code:`@provider` interface has a :code:`cache` parameter that controls caching. If it is set to :code:`CacheType.CACHE_PASS_IN_MEM`, the data generated during the first :code:`pass` (a pass means that all training data have been fed through the network once) is cached in memory, and in subsequent passes the data is read directly from the in-memory cache instead of from the :code:`python` side. This also greatly reduces the time spent on data loading.
+
+
+Accelerating Training Epochs
+++++++++++++++++++++++++++++++++++++++++
+
+PaddlePaddle supports sparse training, which requires the training features to be one of :code:`sparse_binary_vector`, :code:`sparse_vector`, or :code:`integer_value`. In addition, the layer that interacts with this training data must set its parameters to sparse-update mode with :code:`sparse_update=True`.
+Take training a simple :code:`word2vec` language model as an example: the task is to predict a middle word from the two words before it and the two words after it. The DataProvider of this task is:
+
+..  literalinclude:: src/word2vec_dataprovider.py
+
+The configuration of this task is:
+
+..  literalinclude:: src/word2vec_config.py
+
+Introduce More Computing Resources
+++++++++++++++++++++++++++++++++++++++++
+More computing resources can be used in the following ways:
+
+* Single-machine CPU training
+
+  * Use multi-threaded training by setting the command-line argument :code:`trainer_count`.
+
+* Single GPU platform training
+
+  * Set :code:`use_gpu` to train on single GPU.
+  * Set :code:`use_gpu` and :code:`trainer_count` to enable multiple GPU training support.
+
+* Cluster Training
+
+  * Refer to :ref:`cluster_train`.
+
+3. Assign GPU Devices
+----------------------------------------
+
+Assume a machine has 4 GPUs, numbered from 0 to 3, and GPUs 2 and 3 are to be used:
+
+* Method 1: specify the GPUs to use with the
+  `CUDA_VISIBLE_DEVICES <http://www.acceleware.com/blog/cudavisibledevices-masking-gpus>`_ environment variable.
+
+..      code-block:: bash
+
+        env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2
+
+* Method 2: specify the GPU with the command-line argument ``--gpu_id``:
+
+..      code-block:: bash
+
+        paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
+
+
+4. How to Fix Training Termination Caused By :code:`Floating point exception` During Training.
+--------------------------------------------------------------------------------------------------------------
+
+The Paddle binary catches floating-point exceptions at runtime and terminates as soon as one occurs (that is, when NaN or Inf appears during training). Floating-point exceptions are usually caused by floating-point overflow or division by zero. There are three main causes:
+
+* Parameters or gradients during training are oversize, which leads to float overflow during calculation.
+* The model failed to converge and diverges to a big value.
+* The training data is problematic, causing the parameters to converge to singular values. Alternatively, the scale of the input data is too large, with some feature values reaching into the millions, so that matrix multiplication may overflow.
+
+Two ways to solve this problem:
+
+1. Set :code:`gradient_clipping_threshold` as:
+
+..  code-block:: python
+
+    optimizer = paddle.optimizer.RMSProp(
+        learning_rate=1e-3,
+        gradient_clipping_threshold=10.0,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+For details, refer to the `nmt_without_attention  <https://github.com/PaddlePaddle/models/blob/develop/nmt_without_attention/train.py#L35>`_ example.
+
+2. Set :code:`error_clipping_threshold` as:
+
+..  code-block:: python
+
+    decoder_inputs = paddle.layer.fc(
+        act=paddle.activation.Linear(),
+        size=decoder_size * 3,
+        bias_attr=False,
+        input=[context, current_word],
+        layer_attr=paddle.attr.ExtraLayerAttribute(
+            error_clipping_threshold=100.0))
+
+For the complete code, refer to the `machine translation <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py#L66>`_ example.
+
+The main differences between these two methods are:
+
+1. Both clip gradients, but at different times: the former is applied when the :code:`optimizer` updates the network parameters, while the latter is invoked during the backward computation of the activation functions.
+2. The clipping targets differ: the former clips the gradients of the trainable parameters, while the latter clips the gradients propagated back to the preceding layers.
+
+Moreover, such problems may also be fixed by using a smaller learning rate or by normalizing the data.
+
+5.  Fetch Multi Layers’ Prediction Result With Infer Interface
+--------------------------------------------------------------------------------
+
+* Pass the layers to be output as the :code:`output_layer` argument of the :code:`paddle.inference.Inference()` interface:
+
+..  code-block:: python
+
+    inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters)
+
+* Specify the fields to output. Taking :code:`value` as an example, it can be done with the following code:
+
+..  code-block:: python
+
+    out = inferer.infer(input=data_batch, field=["value"])
+
+It is important to note that:
+
+* If 2 layers are assigned as output layers, the output actually consists of 2 matrices.
+* Assume the output A of the first layer is a matrix of size N1 * M1 and the output B of the second layer is a matrix of size N2 * M2.
+* By default, paddle.v2 concatenates A and B horizontally. When N1 is not equal to N2, it raises the following error:
+
+..      code-block:: python
+
+    ValueError: all the input array dimensions except for the concatenation axis must match exactly
+
+Concatenation fails because the output matrices of the layers have different heights. This mainly happens when:
+
+* Output sequence layer and non sequence layer;
+* Multiple output layers process multiple sequence with different length;
+
+This issue can be avoided by setting :code:`flatten_result=False` when calling the infer interface, which skips the concatenation step. The infer interface then returns a Python list, in which
+
+* The number of elements equals to the number of output layers in the network;
+* Each element in list is a result matrix of a layer, which type is numpy.ndarray;
+* The height of each matrix outputted by each layer equals to the number of samples under non sequential mode or equals to the number of elements in the input sequence under sequential mode. Their width are both equal to the layer size in configuration.
+
+6.  Fetch the Output of A Certain Layer During Training
+--------------------------------------------------------------------------------
+
+In event_handler, the interface :code:`event.gm.getLayerOutputs("layer_name")` gives the forward output value organized in :code:`numpy.ndarray` corresponding to :code:`layer_name` in the mini-batch.
+The output can be used in custom measurements in following way:
+
+..      code-block:: python
+
+        def score_diff(right_score, left_score):
+            return np.average(np.abs(right_score - left_score))
+
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 25 == 0:
+                    diff = score_diff(
+                        event.gm.getLayerOutputs("right_score")["right_score"][
+                            "value"],
+                        event.gm.getLayerOutputs("left_score")["left_score"][
+                            "value"])
+                    logger.info(("Pass %d Batch %d : Cost %.6f, "
+                                "average absolute diff scores: %.6f") %
+                                (event.pass_id, event.batch_id, event.cost, diff))
+
+Note: this function can not get content of :code:`paddle.layer.recurrent_group` step, but output of  :code:`paddle.layer.recurrent_group` can be fetched.
+
+7.  Fetch Parameters’ Weight and Gradient During Training
+--------------------------------------------------------------------------------
+
+In some situations, inspecting the weights (also called parameters) of the current mini-batch during training helps to observe concrete values and to locate problems quickly. Their values can be printed in :code:`event_handler` (note that :code:`paddle.event.EndForwardBackward` must be used so that the values are also available when training on GPU). Example code is as follows:
+
+..      code-block:: python
+
+        ...
+        parameters = paddle.parameters.create(cost)
+        ...
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndForwardBackward):
+                if event.batch_id % 25 == 0:
+                    for p in parameters.keys():
+                        logger.info("Param %s, Grad %s",
+                            parameters.get(p), parameters.get_grad(p))
+
+Note that both “fetching the output of a certain layer during training” and “fetching the weights and gradients of parameters during training” copy training data from C++ to numpy, which affects training performance. Do not use them in performance-sensitive training scenarios.
diff --git a/doc/v2/faq/local/src/reduce_min_pool_size.py b/doc/v2/faq/local/src/reduce_min_pool_size.py
new file mode 100644
index 0000000000000000000000000000000000000000..cba96652f764d26c724ea22697e04572709bf6a4
--- /dev/null
+++ b/doc/v2/faq/local/src/reduce_min_pool_size.py
@@ -0,0 +1,21 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+@provider(min_pool_size=0, ...)
+def process(settings, filename):
+    os.system('shuf %s > %s.shuf' % (filename, filename))  # shuffle the file into <filename>.shuf before reading it.
+    with open('%s.shuf' % filename, 'r') as f:
+        for line in f:
+            yield get_sample_from_line(line)
diff --git a/doc/v2/faq/local/src/word2vec_config.py b/doc/v2/faq/local/src/word2vec_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5b84e8ed4de5123097026a5c7992b06fd321750
--- /dev/null
+++ b/doc/v2/faq/local/src/word2vec_config.py
@@ -0,0 +1,26 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+...  # the settings and the data provider definition are omitted.
+DICT_DIM = 3000  # dictionary dimension.
+word_ids = data_layer('word_ids', size=DICT_DIM)
+
+emb = embedding_layer(
+    input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))  # sparse_update=True turns on sparse updates for this parameter.
+emb_sum = pooling_layer(input=emb, pooling_type=SumPooling())
+predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax())
+outputs(
+    classification_cost(
+        input=predict, label=data_layer(
+            'label', size=DICT_DIM)))
diff --git a/doc/v2/faq/local/src/word2vec_dataprovider.py b/doc/v2/faq/local/src/word2vec_dataprovider.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fe67b6d6cbbbdc8a98d497f352cf114a882636f
--- /dev/null
+++ b/doc/v2/faq/local/src/word2vec_dataprovider.py
@@ -0,0 +1,24 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DICT_DIM = 3000
+
+
+@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)])
+def process(settings, filename):
+    with open(filename) as f:
+        # Yield the surrounding word ids together with the id of the inner word to predict,
+        # such as [28, 29, 10, 4], 4.
+        # It means the sentence is 28, 29, 4, 10, 4.
+        yield read_next_from_file(f)
diff --git a/doc/v2/faq/model/index_cn.rst b/doc/v2/faq/model/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6947948bc79f4dba63954c459afb940e3242c405
--- /dev/null
+++ b/doc/v2/faq/model/index_cn.rst
@@ -0,0 +1,80 @@
+#########
+模型配置
+#########
+
+..  contents::
+
+1. 出现 :code:`Duplicated layer name` 错误怎么办
+--------------------------------------------------
+
+出现该错误的原因一般是用户对不同layer的参数 :code:`name` 设置了相同的取值。遇到该错误时，先找出参数 :code:`name` 取值相同的layer，然后将这些layer的参数 :code:`name` 设置为不同的值。
+
+2. :code:`paddle.layer.memory` 的参数 :code:`name` 如何使用
+-------------------------------------------------------------
+
+* :code:`paddle.layer.memory` 用于获取特定layer上一时间步的输出，该layer是通过参数 :code:`name` 指定，即，:code:`paddle.layer.memory` 会关联参数 :code:`name` 取值相同的layer，并将该layer上一时间步的输出作为自身当前时间步的输出。
+
+* PaddlePaddle的所有layer都有唯一的name，用户通过参数 :code:`name` 设定，当用户没有显式设定时，PaddlePaddle会自动设定。而 :code:`paddle.layer.memory` 不是真正的layer，其name由参数 :code:`memory_name` 设定，当用户没有显式设定时，PaddlePaddle会自动设定。:code:`paddle.layer.memory` 的参数 :code:`name` 用于指定其要关联的layer，需要用户显式设定。
+
+3. 两种使用 drop_out 的方法有何区别
+------------------------------------
+
+* 在PaddlePaddle中使用dropout有两种方式
+
+  * 在相应layer的 :code:`layer_attr` 设置 :code:`drop_rate`，以 :code:`paddle.layer.fc` 为例，代码如下：
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
+
+  * 使用 :code:`paddle.layer.dropout`，以 :code:`paddle.layer.fc` 为例，代码如下：
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input)
+      drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
+
+* :code:`paddle.layer.dropout` 实际上使用了 :code:`paddle.layer.add_to`，并在该layer里采用第一种方式设置 :code:`drop_rate` 来使用dropout的。这种方式对内存消耗较大。
+
+* PaddlePaddle在激活函数里实现dropout，而不是在layer里实现。
+
+* :code:`paddle.layer.lstmemory`、:code:`paddle.layer.grumemory`、:code:`paddle.layer.recurrent` 不是通过一般的方式来实现对输出的激活，所以不能采用第一种方式在这几个layer里设置 :code:`drop_rate` 来使用dropout。若要对这几个layer使用dropout，可采用第二种方式，即使用 :code:`paddle.layer.dropout`。
+
+4. 不同的 recurrent layer 的区别
+----------------------------------
+以LSTM为例，在PaddlePaddle中包含以下 recurrent layer：
+
+* :code:`paddle.layer.lstmemory`
+* :code:`paddle.networks.simple_lstm`
+* :code:`paddle.networks.lstmemory_group`
+* :code:`paddle.networks.bidirectional_lstm`
+
+按照具体实现方式可以归纳为2类：
+
+1. 由 recurrent_group 实现的 recurrent layer：
+
+  * 用户在使用这一类recurrent layer时，可以访问由recurrent unit在一个时间步内计算得到的中间值（例如：hidden states, memory cells等）；
+  * 上述的 :code:`paddle.networks.lstmemory_group` 是这一类的 recurrent layer ；
+
+2. 将recurrent layer作为一个整体来实现：
+
+  * 用户在使用这一类recurrent layer，只能访问它们的输出值；
+  * 上述的 :code:`paddle.layer.lstmemory` 、 :code:`paddle.networks.simple_lstm` 和 :code:`paddle.networks.bidirectional_lstm` 属于这一类的实现；
+
+将recurrent layer作为一个整体来实现， 能够针对CPU和GPU的计算做更多优化， 所以相比于recurrent group的实现方式， 第二类 recurrent layer 计算效率更高。 在实际应用中，如果用户不需要访问LSTM的中间变量，而只需要获得recurrent layer计算的输出，我们建议使用第二类实现。
+
+此外，关于LSTM, PaddlePaddle中还包含 :code:`paddle.networks.lstmemory_unit` 这一计算单元：
+
+  * 不同于上述介绍的recurrent layer , :code:`paddle.networks.lstmemory_unit` 定义了LSTM单元在一个时间步内的计算过程，它并不是一个完整的recurrent layer，也不能接收序列数据作为输入；
+  * :code:`paddle.networks.lstmemory_unit` 只能在recurrent_group中作为step function使用；
+
+5. PaddlePaddle的softmax能否指定计算的维度
+-----------------------------------------
+
+PaddlePaddle的softmax不能指定计算维度，只能按行计算。
+在图像任务中，对于NCHW，如果需要在C维度计算softmax，可以先使用 :code:`paddle.layer.switch_order` 改变维度顺序，即将NCHW转换成NHWC，再做一定的reshape，最后计算softmax。
+
+6. PaddlePaddle是否支持维数可变的数据输入
+------------------------------------------
+
+PaddlePaddle提供的 :code:`paddle.data_type.dense_array` 支持维数可变的数据输入。在使用时，将对应数据层的维数设置成一个大于输入数据维数的值用于占位即可。
diff --git a/doc/v2/faq/model/index_en.rst b/doc/v2/faq/model/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..67a33e08e192e5627ac3b0abd76e979f21ed2079
--- /dev/null
+++ b/doc/v2/faq/model/index_en.rst
@@ -0,0 +1,81 @@
+###################
+Model Configuration
+###################
+
+..  contents::
+
+1. How to deal with error :code:`Duplicated layer name`
+----------------------------------------------------------
+
+The general reason for this error is that users may have set the same value for the attribute :code:`name` in different layers. Find the layers whose :code:`name` attribute has the same value and give them different names.
+
+2. How to use :code:`paddle.layer.memory`'s attribute :code:`name`
+----------------------------------------------------------------------
+
+* :code:`paddle.layer.memory` is used to get the output of a layer's last timestep and the layer is specified by the attribute :code:`name` . Thus,  :code:`paddle.layer.memory` will associate with the layer that has the same value of attribute :code:`name` , and uses the output of the layer's last timestep as the input of its current timestep.
+
+* All PaddlePaddle layers have a unique name, which is set by the attribute :code:`name` . PaddlePaddle will automatically set it for the user when it is not explicitly set. :code:`paddle.layer.memory` is not a real layer; its name is set by the attribute :code:`memory_name` , and PaddlePaddle will also set it automatically when the user does not set it explicitly. The attribute :code:`name` of :code:`paddle.layer.memory` is used to specify the layer it is associated with, and needs to be explicitly set by the user.
+
+
+3. What is the difference between the two ways of using dropout
+-----------------------------------------------------------------
+
+* There are two ways to use dropout in PaddlePaddle
+
+  * Set the :code:`drop_rate` parameter in the layer's :code:`layer_attr` attribute. Take :code:`paddle.layer.fc` as an example:
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
+
+  * Use :code:`paddle.layer.dropout` layer. Take :code:`paddle.layer.fc` as an example:
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input)
+      drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
+
+* :code:`paddle.layer.dropout` actually uses the :code:`paddle.layer.add_to` layer and sets :code:`drop_rate` as the previous method. This method is very memory intensive.
+
+* PaddlePaddle implements dropout in the activation function rather than in the layer.
+
+* :code:`paddle.layer.lstmemory`, :code:`paddle.layer.grumemory`, :code:`paddle.layer.recurrent` implement activation of output in an unusual way, so we cannot use dropout by setting :code:`drop_rate` . To use dropout for these layers, we could use the second method, which is to use :code:`paddle.layer.dropout`.
+
+4. The differences between different recurrent layers
+--------------------------------------------------------
+Take LSTM as an example. There are several kinds of recurrent layers in PaddlePaddle:
+
+* :code:`paddle.layer.lstmemory`
+* :code:`paddle.networks.simple_lstm`
+* :code:`paddle.networks.lstmemory_group`
+* :code:`paddle.networks.bidirectional_lstm`
+
+According to implementations, recurrent layer can be classified into 2 types:
+
+1. Recurrent layer implemented by recurrent_group:
+
+  * Using this type of recurrent layers, users can access the intermediate value calculated by the recurrent unit within a timestep (eg: hidden states, memory cells, etc.)
+  * :code:`paddle.networks.lstmemory_group` belongs to this type of recurrent layers.
+
+2. Recurrent layer implemented as a complete operation:
+
+  * Users can only access output values when using this type of recurrent layers.
+  * :code:`paddle.layer.lstmemory` , :code:`paddle.networks.simple_lstm` and :code:`paddle.networks.bidirectional_lstm` belong to this type of recurrent layer.
+
+By implementing recurrent layer as a complete operation, CPU and GPU calculations can be optimized. Therefore, the second type of recurrent layer is more efficient than the first one. In practical applications, we propose to use the second type of recurrent layers if there is no need to access the intermediate variable of LSTM.
+
+In addition, PaddlePaddle also contains a kind of LSTM calculation unit: :code:`paddle.networks.lstmemory_unit`:
+
+  * Unlike the recurrent layer described above, :code:`paddle.networks.lstmemory_unit` defines the computational process of an LSTM unit in a timestep. It is not a complete recurrent layer, nor can it receive sequence data as input.
+  * :code:`paddle.networks.lstmemory_unit` can only be used as a step function in recurrent_group.
+
+5. Can Softmax's calculation dimension be specified?
+--------------------------------------------------------------------
+
+We can't specify calculation dimension for PaddlePaddle's softmax. It can only be calculated by rows.
+In image tasks, for NCHW, if you need to calculate softmax in C dimension, you could use :code:`paddle.layer.switch_order` to change the dimension order, that is, convert NCHW to NHWC, then do the reshape operation and calculate softmax.
+
+6. Does PaddlePaddle support variable-dimensional data inputs
+----------------------------------------------------------------
+
+PaddlePaddle provides :code:`paddle.data_type.dense_array` to support variable-dimensional data input. Simply set the dimension of the data layer to a value larger than the dimension of the input data, so that it acts as a placeholder.
diff --git a/doc/v2/faq/parameter/index_cn.rst b/doc/v2/faq/parameter/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..987e8cf088be4ee8daa7c28fdc855506cbfd31c7
--- /dev/null
+++ b/doc/v2/faq/parameter/index_cn.rst
@@ -0,0 +1,201 @@
+#########
+参数设置
+#########
+
+..  contents::
+
+1. 如何选择SGD算法的学习率
+--------------------------
+
+在采用sgd/async_sgd进行训练时，一个重要的问题是选择正确的learning_rate。如果learning_rate太大，那么训练有可能不收敛，如果learning_rate太小，那么收敛可能很慢，导致训练时间过长。
+
+通常做法是从一个比较大的learning_rate开始试，如果不收敛，那减少学习率10倍继续试验，直到训练收敛为止。那么如何判断训练不收敛呢？可以估计出如果模型采用不变的输出最小的cost0是多少。
+
+如果训练过程的cost明显高于这个常数输出的cost，那么我们可以判断为训练不收敛。举一个例子，假如我们是三分类问题，采用multi-class-cross-entropy作为cost，数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass（或者更早）后，cost还大于这个数，那么可以认为训练不收敛，应该降低学习率。
+
+2. 如何设置学习率退火（learning rate annealing）
+------------------------------------------------
+
+在相应的优化算法里设置learning_rate_schedule及相关参数，以使用Adam算法为例，代码如下：
+
+..  code-block:: python
+
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=1e-3,
+        learning_rate_decay_a=0.5,
+        learning_rate_decay_b=0.75,
+        learning_rate_schedule="poly",)
+
+PaddlePaddle目前支持8种learning_rate_schedule，这8种learning_rate_schedule及其对应学习率计算方式如下：
+
+* "constant"
+  
+  lr = learning_rate
+
+* "poly"
+
+  lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
+
+  其中，num_samples_processed为已训练样本数，下同。
+
+* "caffe_poly"
+
+  lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
+
+* "exp"
+
+  lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
+
+* "discexp"
+
+  lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
+
+* "linear"
+
+  lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
+
+* "manual"
+
+  这是一种按已训练样本数分段取值的学习率退火方法。使用该learning_rate_schedule时，用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数，当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例，代码如下：
+
+  ..  code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="manual",
+          learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
+
+  在该示例中，当已训练样本数小于等于1000时，学习率为 :code:`1e-3 * 1.0`；当已训练样本数大于1000小于等于2000时，学习率为 :code:`1e-3 * 0.9`；当已训练样本数大于2000时，学习率为 :code:`1e-3 * 0.8`。
+
+* "pass_manual"
+
+  这是一种按已训练pass数分段取值的学习率退火方法。使用该learning_rate_schedule时，用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数，当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例，代码如下：
+
+  ..  code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="pass_manual",
+          learning_rate_args="1:1.0,2:0.9,3:0.8",)
+
+  在该示例中，当已训练pass数小于等于1时，学习率为 :code:`1e-3 * 1.0`；当已训练pass数大于1小于等于2时，学习率为 :code:`1e-3 * 0.9`；当已训练pass数大于2时，学习率为 :code:`1e-3 * 0.8`。
+
+3. 如何初始化参数
+-----------------
+
+默认情况下，PaddlePaddle使用均值0，标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式，PaddlePaddle目前提供两种参数初始化的方式\:
+
+* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
+* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
+
+比如设置一个全连接层的参数初始化方式和bias初始化方式，可以使用如下代码。
+
+..  code-block:: python
+
+    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
+                      bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
+
+上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[-1.0, 1.0]` 的均匀分布。
+
+4. 如何共享参数
+---------------
+
+PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字的参数，会共享参数。设置参数的名字，可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式，是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。
+
+简单的全连接网络，参数共享的配置示例为\:
+
+..  literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+
+这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
+
+5. 如何加载预训练参数
+------------------------
+
+* 对加载预训练参数的层，设置其参数属性 :code:`is_static=True`，使该层的参数在训练过程中保持不变。以embedding层为例，代码如下：
+
+..  code-block:: python
+
+    emb_para = paddle.attr.Param(name='emb', is_static=True)
+    paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para)
+
+
+* 从模型文件将预训练参数载入 :code:`numpy.array`，在创建parameters后，使用 :code:`parameters.set()` 加载预训练参数。PaddlePaddle保存的模型参数文件前16字节为头信息，用户将参数载入 :code:`numpy.array` 时须从第17字节开始。以embedding层为例，代码如下：
+
+..  code-block:: python
+
+    def load_parameter(file_name, h, w):
+        with open(file_name, 'rb') as f:
+            f.read(16)  # skip header.
+            return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+    parameters = paddle.parameters.create(my_cost)
+    parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
+
+6. 存储的参数格式是什么，如何和明文进行相互转化
+--------------------------------------------------
+
+PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数两部分组成。头信息中，1~4字节表示PaddlePaddle版本信息，请直接填充0；5~8字节表示每个参数占用的字节数，当保存的网络参数为float类型时为4，double类型时为8；9~16字节表示保存的参数总个数。
+
+将PaddlePaddle保存的模型参数还原回明文时，可以使用相应数据类型的 :code:`numpy.array` 加载具体网络参数，此时可以跳过PaddlePaddle模型参数文件的头信息。若在PaddlePaddle编译时，未指定按照double精度编译，默认情况下按照float精度计算，保存的参数也是float类型。这时在使用 :code:`numpy.array` 时，一般设置 :code:`dtype=float32` 。示例如下：
+
+..  code-block:: python
+
+    def read_parameter(fname, width):
+        s = open(fname).read()
+        # skip header
+        vec = np.fromstring(s[16:], dtype=np.float32)
+        # width is the size of the corresponding layer
+        np.savetxt(fname + ".csv", vec.reshape(width, -1),
+                fmt="%.6f", delimiter=",")
+
+
+将明文参数转化为PaddlePaddle可加载的模型参数时，首先构造头信息，再写入网络参数。下面的代码将随机生成的矩阵转化为可以被PaddlePaddle加载的模型参数。
+
+..  code-block:: python
+
+    def gen_rand_param(param_file, width, height, need_trans):
+        np.random.seed()
+        header = struct.pack("iil", 0, 4, height * width)
+        param = np.float32(np.random.rand(height, width))
+        with open(param_file, "w") as fparam:
+            fparam.write(header + param.tostring())
+
+7. A protocol message was rejected because it was too big
+------------------------------------------------------------
+
+如果在训练NLP相关模型时，出现以下错误：
+
+..  code-block:: bash
+
+    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes).  To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
+    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
+
+可能的原因是：传给dataprovider的某一个args过大，一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似：
+
+..  code-block:: python
+
+     src_dict = dict()
+     for line_count, line in enumerate(open(src_dict_path, "r")):
+        src_dict[line.strip()] = line_count
+
+     define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict": src_dict})
+
+解决方案是：将字典的地址作为args传给dataprovider，然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为：
+
+..  code-block:: python
+
+     define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict_path": src_dict_path})
+
+完整源码可参考 `sequence_recurrent <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_recurrent.py>`_ 示例。
+
+
diff --git a/doc/v2/faq/parameter/index_en.rst b/doc/v2/faq/parameter/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9edb8dd620f972d019db9c0063cefce616de0ebd
--- /dev/null
+++ b/doc/v2/faq/parameter/index_en.rst
@@ -0,0 +1,198 @@
+##################
+Parameter Settings
+##################
+
+.. contents::
+
+1. How to Choose the Learning Rate of SGD Algorithm
+------------------------------------------------------------
+
+An important issue when training with :code:`sgd/async_sgd` is to choose the correct value for :code:`learning_rate`. If it is too large, the training may not converge. If too small, the convergence may be slow, resulting in a long training time.
+
+Usually, we start with a relatively large learning rate. If the training does not converge, then we need to reduce the learning rate continuously by a factor of 10 until the training converges. We examine the convergence of the training by estimating the minimum cost at a constant output of the model.
+
+If the cost of the training process is significantly higher than the cost of this constant output, then we judge that the training does not converge. For example, suppose we have a three-class problem and use multi-class-cross-entropy as the cost, and the proportions of classes 0, 1, and 2 in the data are :code:`0.2, 0.5, 0.3`. The minimum cost that a constant output can achieve is then :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03`. If the cost is still greater than this number after training one pass (or even earlier), the training can be considered not to converge and the learning rate should be reduced.
+
+2. How to Implement Learning Rate Annealing
+------------------------------------------------
+
+We use the Adam algorithm as an example. Set the parameters of :code:`learning_rate_schedule` in the corresponding optimization algorithm as follows:
+
+.. code-block:: python
+
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=1e-3,
+        learning_rate_decay_a=0.5,
+        learning_rate_decay_b=0.75,
+        learning_rate_schedule="poly",)
+
+PaddlePaddle currently supports 8 learning rate schedules. The 8 learning rate schedules and their corresponding learning rates are calculated as follows:
+
+* "constant"
+  
+  lr = learning_rate
+
+* "poly"
+
+  lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
+
+  Variable :code:`num_samples_processed` is the number of trained samples.
+
+* "caffe_poly"
+
+  lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
+
+* "exp"
+
+  lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
+
+* "discexp"
+
+  lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
+
+* "linear"
+
+  lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
+
+* "manual"
+
+  This is a learning rate annealing method that is segmented by the number of trained samples. When using this learning rate schedule, we modify the learning rate attenuation factor piecewise function by changing the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
+
+  .. code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="manual",
+          learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
+
+  In this example, when the number of trained samples is less than or equal to 1000, the learning rate is :code:`1e-3 * 1.0`; when the number of trained samples is greater than 1000 and less than or equal to 2000, the learning rate is :code:`1e-3 * 0.9`; when the number of trained samples is greater than 2000, the learning rate is :code:`1e-3 * 0.8`.
+
+* "pass_manual"
+
+  This is a learning rate annealing method that piecewisely pick values according to the number of trained passes. When using this learning rate schedule, we set the learning rate attenuation factor piecewise function by the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
+
+  .. code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="pass_manual",
+          learning_rate_args="1:1.0,2:0.9,3:0.8",)
+
+  In this example, when the number of trained passes is less than or equal to 1, the learning rate is :code:`1e-3 * 1.0`; when the number of trained passes is greater than 1 and less than or equal to 2, the learning rate is :code:`1e-3 * 0.9`; when the number of trained passes is greater than 2, the learning rate is :code:`1e-3 * 0.8`.
+
+3. How to Initialize Parameters
+------------------------------------------------------------
+
+By default, PaddlePaddle initializes parameters with an average of 0 and a standard deviation of :math:`\frac{1}{\sqrt{d}}`, where :math:`d` is the width of the parameter matrix. This initialization method does not produce bad results under normal circumstances. If users want to customize the initialization method, PaddlePaddle provides two ways to initialize the parameters:
+
+* Gaussian distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
+* Uniform distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
+
+For example, to set the parameter initialization and the bias initialization of a fully connected layer, you can use the following code:
+
+.. code-block:: python
+
+    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
+                      bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
+
+The above code initializes all biases to 1.0 and initializes the parameters with a uniform distribution over :code:`[-1.0, 1.0]`.
+
+4. How to Share Parameters
+------------------------------------------------------------
+
+PaddlePaddle's parameters use :code:`name` as the ID. Parameters with the same name are shared. We can set the name of a parameter using :code:`ParamAttr(name="YOUR_PARAM_NAME")`. More conveniently, we can make the parameters to be shared use the same :code:`ParamAttr` object.
+
+A simple fully connected network has its configuration of parameter sharing as follows \:
+
+.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+
+Here :code:`hidden_a` and :code:`hidden_b` have the same parameter and bias. The two inputs of the softmax layer also use the same parameter :code:`softmax_param`.
+
+5. How to Load Pre-training Parameters
+------------------------------------------------------------
+* For layers that load pre-training parameters, set :code:`is_static = True` so that the parameters of that layer remain unchanged during the training process. Take the embedding layer as an example, the code is as follows:
+
+.. code-block:: python
+
+    emb_para = paddle.attr.Param(name='emb', is_static=True)
+    paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para)
+
+
+* Load pre-training parameters from the model file into :code:`numpy.array`. After creating the parameters, load the pre-training parameters using :code:`parameters.set()`. The first 16 bytes of the model parameter file saved by PaddlePaddle are the header information, so the user must start from the 17th byte when loading the parameters into :code:`numpy.array`. Take the embedding layer as an example, the code is as follows:
+
+.. code-block:: python
+
+    def load_parameter(file_name, h, w):
+        with open(file_name, 'rb') as f:
+            f.read(16)  # skip header.
+            return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+    parameters = paddle.parameters.create(my_cost)
+    parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
+
+6. Format of the Stored Parameter and How to Convert the File to Plain Text
+--------------------------------------------------------------------------------
+
+The model parameter file saved by PaddlePaddle consists of 16 bytes of header information followed by the network parameters. In the header information, bytes 1-4 hold PaddlePaddle's version information and should simply be filled with 0. Bytes 5-8 represent the number of bytes occupied by each parameter: 4 if the saved network parameters are of float type, 8 if they are of double type. Bytes 9-16 represent the total number of saved parameters.
+
+When restoring the model parameters saved by PaddlePaddle back to plain text, we use a :code:`numpy.array` of the corresponding data type to load the network parameters. At this time, you can skip the header information of the PaddlePaddle model parameter file. If PaddlePaddle was not explicitly compiled with double precision, it computes with float precision by default and the saved parameters are also of float type. In this case, when using :code:`numpy.array`, we generally set :code:`dtype=float32`. An example is as follows:
+
+.. code-block:: python
+
+    def read_parameter(fname, width):
+        s = open(fname).read()
+        # skip header
+        vec = np.fromstring(s[16:], dtype=np.float32)
+        # width is the size of the corresponding layer
+        np.savetxt(fname + ".csv", vec.reshape(width, -1),
+                fmt="%.6f", delimiter=",")
+
+
+When the plaintext parameters are converted into PaddlePaddle loadable model parameters, the header information is constructed first, then the network parameters are written. The following code converts the randomly generated matrix into model parameters that can be loaded by PaddlePaddle:
+
+.. code-block:: python
+
+    def gen_rand_param(param_file, width, height, need_trans):
+        np.random.seed()
+        header = struct.pack("iil", 0, 4, height * width)
+        param = np.float32(np.random.rand(height, width))
+        with open(param_file, "w") as fparam:
+            fparam.write(header + param.tostring())
+
+7. A Protocol Message Rejected Because of its Large Size
+------------------------------------------------------------
+
+If you are training NLP related models, and the following error occurs:
+
+.. code-block:: bash
+
+    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit( ) in google/protobuf/io/coded_stream.h.
+    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
+
+The possible reason is that one of the args passed to the dataprovider is too large, which is usually caused by directly passing a large dictionary. An incorrect call to :code:`define_py_data_sources2` looks like this:
+
+.. code-block:: python
+
+     src_dict = dict()
+     for line_count, line in enumerate(open(src_dict_path, "r")):
+        src_dict[line.strip()] = line_count
+
+     define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict": src_dict})
+
+The solution is to pass the path of the dictionary as args to the dataprovider, and then load the dictionary from that path inside the dataprovider. That is, the call to :code:`define_py_data_sources2` should be changed to:
+
+.. code-block:: python
+
+     define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict_path": src_dict_path})
+
+The full source code can be found in the `sequence_recurrent <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_recurrent.py>`_ example.
diff --git a/doc/v2/getstarted/concepts/src/infer.py b/doc/v2/getstarted/concepts/src/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..afe256f234a1c7d29c33f3b65b8302646df0c45c
--- /dev/null
+++ b/doc/v2/getstarted/concepts/src/infer.py
@@ -0,0 +1,32 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+import numpy as np
+
+paddle.init(use_gpu=False)
+x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
+y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+# load the model parameters that were generated by training
+with open('params_pass_90.tar', 'r') as f:
+    parameters = paddle.parameters.Parameters.from_tar(f)
+
+# Feed several input samples at once; the inference results are output in an array.
+i = [[[1, 2]], [[3, 4]], [[5, 6]]]
+print paddle.infer(output_layer=y_predict, parameters=parameters, input=i)
+# Will print:
+# [[ -3.24491572]
+#  [ -6.94668722]
+#  [-10.64845848]]
diff --git a/doc/v2/getstarted/concepts/src/train.py b/doc/v2/getstarted/concepts/src/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..a85d5d8a3acee61d11488e5b842831a79072680a
--- /dev/null
+++ b/doc/v2/getstarted/concepts/src/train.py
@@ -0,0 +1,71 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+import numpy as np
+
+# initialize PaddlePaddle with GPU disabled
+paddle.init(use_gpu=False)
+
+# network config: linear fc layer on the 2-d input x, squared-error cost against the 1-d label y
+x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
+y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
+cost = paddle.layer.square_error_cost(input=y_predict, label=y)
+
+# create the parameters of the network that computes the cost
+parameters = paddle.parameters.create(cost)
+# create optimizer (Momentum with momentum=0)
+optimizer = paddle.optimizer.Momentum(momentum=0)
+# create the SGD trainer from the cost, the parameters and the optimizer
+trainer = paddle.trainer.SGD(cost=cost,
+                             parameters=parameters,
+                             update_equation=optimizer)
+
+
+# event_handler: print the cost after every batch and save the parameters at the end of every 10th pass
+def event_handler(event):
+    if isinstance(event, paddle.event.EndIteration):
+        if event.batch_id % 1 == 0:
+            print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id,
+                                                  event.cost)
+    # save the model parameters to a tar file every 10 passes
+    if isinstance(event, paddle.event.EndPass):
+        if event.pass_id % 10 == 0:
+            with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
+                trainer.save_parameter_to_tar(f)
+
+
+# define training dataset reader: returns a generator function that yields one (x, y) sample at a time
+def train_reader():
+    train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]])
+    train_y = np.array([[-2], [-3], [-7], [-7]])
+
+    def reader():
+        for i in xrange(train_y.shape[0]):
+            yield train_x[i], train_y[i]
+
+    return reader
+
+
+# define feeding map: data layer name -> index within each sample yielded by the reader (x first, y second)
+feeding = {'x': 0, 'y': 1}
+
+# train for 100 passes with a batch size of 1
+trainer.train(
+    reader=paddle.batch(
+        train_reader(), batch_size=1),
+    feeding=feeding,
+    event_handler=event_handler,
+    num_passes=100)
diff --git a/doc/v2/getstarted/concepts/use_concepts_cn.rst b/doc/v2/getstarted/concepts/use_concepts_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..608f49f5a969b3291eb43bf2acf582af74e566a1
--- /dev/null
+++ b/doc/v2/getstarted/concepts/use_concepts_cn.rst
@@ -0,0 +1,155 @@
+############
+基本使用概念
+############
+
+PaddlePaddle是源于百度的一个深度学习平台。PaddlePaddle为深度学习研究人员提供了丰富的API，可以轻松地完成神经网络配置，模型训练等任务。
+这里将介绍PaddlePaddle的基本使用概念，并且展示了如何利用PaddlePaddle来解决一个经典的线性回归问题。
+在使用该文档之前，请参考 `安装文档 <../../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
+
+
+配置网络
+============
+
+加载PaddlePaddle
+----------------------
+
+在进行网络配置之前，首先需要加载相应的Python库，并进行初始化操作。
+
+..	code-block:: python
+
+    import paddle.v2 as paddle
+    import numpy as np
+    paddle.init(use_gpu=False)
+
+
+搭建神经网络
+-----------------------
+
+搭建神经网络就像使用积木搭建宝塔一样。在PaddlePaddle中，layer是我们的积木，而神经网络是我们要搭建的宝塔。我们使用不同的layer进行组合，来搭建神经网络。
+宝塔的底端需要坚实的基座来支撑，同样，神经网络也需要一些特定的layer作为输入接口，来完成网络的训练。
+
+例如，我们可以定义如下layer来描述神经网络的输入：
+
+..	code-block:: python
+
+    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
+    y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
+
+其中x表示输入数据是一个维度为2的稠密向量，y表示输入数据是一个维度为1的稠密向量。
+
+PaddlePaddle支持不同类型的输入数据，主要包括四种类型，和三种序列模式。
+
+四种数据类型：
+
+* dense_vector：稠密的浮点数向量。
+* sparse_binary_vector：稀疏的01向量，即大部分值为0，但有值的地方必须为1。
+* sparse_float_vector：稀疏的向量，即大部分值为0，但有值的部分可以是任何浮点数。
+* integer_value：整数标签。
+
+三种序列模式：
+
+* SequenceType.NO_SEQUENCE：不是一条序列
+* SequenceType.SEQUENCE：是一条时间序列
+* SequenceType.SUB_SEQUENCE： 是一条时间序列，且序列的每一个元素还是一个时间序列。
+
+不同的数据类型和序列模式返回的格式不同，列表如下：
+
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+|                      | NO_SEQUENCE         | SEQUENCE                          |  SUB_SEQUENCE                                  |
++======================+=====================+===================================+================================================+
+| dense_vector         | [f, f, ...]         | [[f, ...], [f, ...], ...]         | [[[f, ...], ...], [[f, ...], ...],...]         |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| sparse_binary_vector | [i, i, ...]         | [[i, ...], [i, ...], ...]         | [[[i, ...], ...], [[i, ...], ...],...]         |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| sparse_float_vector  | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| integer_value        |  i                  | [i, i, ...]                       | [[i, ...], [i, ...], ...]                      |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+
+其中，f代表一个浮点数，i代表一个整数。
+
+注意：对sparse_binary_vector和sparse_float_vector，PaddlePaddle存的是有值位置的索引。例如，
+
+- 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ，类型是sparse_binary_vector，返回的是 ``[1, 2]`` 。
+- 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ，类型是sparse_float_vector，返回的是 ``[(1, 0.5), (2, 0.7)]`` 。
+
+
+在定义输入layer之后，我们可以使用其他layer进行组合。在组合时，需要指定layer的输入来源。
+
+例如，我们可以定义如下的layer组合：
+
+..	code-block:: python
+
+    y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+    cost = paddle.layer.square_error_cost(input=y_predict, label=y)
+
+其中，x与y为之前描述的输入层；而y_predict是接收x作为输入，接上一个全连接层；cost接收y_predict与y作为输入，接上平方误差层。
+
+最后一层cost中记录了神经网络的所有拓扑结构，通过组合不同的layer，我们即可完成神经网络的搭建。
+
+
+训练模型
+============
+
+在完成神经网络的搭建之后，我们首先需要根据神经网络结构来创建所需要优化的parameters，并创建optimizer。
+之后，我们可以创建trainer来对网络进行训练。
+
+..	code-block:: python
+
+    parameters = paddle.parameters.create(cost)
+    optimizer = paddle.optimizer.Momentum(momentum=0)
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer)
+
+其中，trainer接收三个参数，包括神经网络拓扑结构、神经网络参数以及迭代方程。
+
+在搭建神经网络的过程中，我们仅仅对神经网络的输入进行了描述。而trainer需要读取训练数据进行训练，PaddlePaddle中通过reader来加载数据。
+
+..	code-block:: python
+
+    # define training dataset reader
+    def train_reader():
+        train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]])
+        train_y = np.array([[-2], [-3], [-7], [-7]])
+        def reader():
+            for i in xrange(train_y.shape[0]):
+                yield train_x[i], train_y[i]
+        return reader
+
+最终我们可以调用trainer的train方法启动训练：
+
+..	code-block:: python
+
+    # define feeding map
+    feeding = {'x': 0, 'y': 1}
+
+    # event_handler to print training info
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1 == 0:
+                print "Pass %d, Batch %d, Cost %f" % (
+                    event.pass_id, event.batch_id, event.cost)
+    # training
+    trainer.train(
+        reader=paddle.batch(train_reader(), batch_size=1),
+        feeding=feeding,
+        event_handler=event_handler,
+        num_passes=100)
+
+关于PaddlePaddle的更多使用方法请参考 `进阶指南 <../../howto/index_cn.html>`_。
+
+线性回归完整示例
+================
+
+下面给出在三维空间中使用线性回归拟合一条直线的例子：
+
+..  literalinclude:: src/train.py
+    :linenos:
+
+使用以上训练好的模型进行预测，取其中一个模型params_pass_90.tar，输入需要预测的向量组，然后打印输出：
+
+..  literalinclude:: src/infer.py
+    :linenos:
+
+有关线性回归的实际应用，可以参考PaddlePaddle book的 `第一章节 <http://book.paddlepaddle.org/index.html>`_。
diff --git a/doc/v2/getstarted/concepts/use_concepts_en.rst b/doc/v2/getstarted/concepts/use_concepts_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..406b0cbb913894dc333d8e4561c207793c33e475
--- /dev/null
+++ b/doc/v2/getstarted/concepts/use_concepts_en.rst
@@ -0,0 +1,3 @@
+Basic Concept
+=============
+TBD
diff --git a/doc/v2/getstarted/index_cn.rst b/doc/v2/getstarted/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..75af7354be93a6eeabfa9ccf86903505402a7ca6
--- /dev/null
+++ b/doc/v2/getstarted/index_cn.rst
@@ -0,0 +1,19 @@
+新手入门
+============
+
+
+如果需要快速了解PaddlePaddle的使用，可以参考以下指南。
+
+..  toctree::
+  :maxdepth: 1
+
+  quickstart_cn.rst
+
+
+在使用PaddlePaddle构建应用时，需要了解一些基本概念。
+这里以一个线性回归为例子，详细介绍了PaddlePaddle的使用流程，包括数据格式，模型配置与训练等。
+
+..  toctree::
+  :maxdepth: 1
+
+  concepts/use_concepts_cn.rst
diff --git a/doc/v2/getstarted/index_en.rst b/doc/v2/getstarted/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..94b306895c9ddf6140cf600131930a6675a583eb
--- /dev/null
+++ b/doc/v2/getstarted/index_en.rst
@@ -0,0 +1,19 @@
+GET STARTED
+============
+
+If you want to quickly know how to use PaddlePaddle, please refer to the following guide:
+
+..  toctree::
+  :maxdepth: 1
+
+  quickstart_en.rst
+  
+  
+While using PaddlePaddle to build applications, please understand some basic concepts.
+
+Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc.
+  
+..  toctree::
+  :maxdepth: 1
+  
+  concepts/use_concepts_en.rst
diff --git a/doc/v2/getstarted/quickstart_cn.rst b/doc/v2/getstarted/quickstart_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d511cead262dabafd095f68adb5ffc596a7fe596
--- /dev/null
+++ b/doc/v2/getstarted/quickstart_cn.rst
@@ -0,0 +1,47 @@
+快速开始
+========
+
+快速安装
+--------
+
+PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
+执行下面的命令完成快速安装，版本为cpu_avx_openblas：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+更详细的安装和编译方法参考：:ref:`install_steps` 。
+
+快速使用
+--------
+
+创建一个 housing.py 并粘贴此Python代码：
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
diff --git a/doc/v2/getstarted/quickstart_en.rst b/doc/v2/getstarted/quickstart_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..70f7fe0646068aa79cd72955c6848ac0250c2300
--- /dev/null
+++ b/doc/v2/getstarted/quickstart_en.rst
@@ -0,0 +1,51 @@
+Quick Start
+============
+
+Quick Install
+-------------
+
+You can install PaddlePaddle with a single pip command. Supported systems are
+CentOS 6 or above, Ubuntu 14.04 or above, and MacOS 10.12, with Python 2.7 installed.
+Run the following command to install the cpu_avx_openblas version:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+For more details about installation and build: :ref:`install_steps` .
+
+Quick Use
+---------
+
+Create a new file called housing.py, and paste this Python
+code:
+
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
diff --git a/doc/v2/howto/capi/compile_paddle_lib_cn.md b/doc/v2/howto/capi/compile_paddle_lib_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..8878ee9d85064ba27708ed92790aa9b83ba316e5
--- /dev/null
+++ b/doc/v2/howto/capi/compile_paddle_lib_cn.md
@@ -0,0 +1,181 @@
+## 安装、编译与链接C-API预测库
+
+### 直接下载安装
+
+从CI系统中下载最新的C-API开发包进行安装，用户可以从下面的表格中找到需要的版本：
+
+<table>
+<thead>
+<tr>
+<th>版本说明</th>
+<th>C-API</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>cpu_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cpu_avx_openblas</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cpu_noavx_openblas</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cuda7.5_cudnn5_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cuda8.0_cudnn5_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cuda8.0_cudnn7_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cuda9.0_cudnn7_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+</tbody></table>
+
+### 从源码编译
+
+用户也可以从 PaddlePaddle 核心代码编译C-API链接库，只需在编译时配置下面这些编译选项：
+
+<table>
+<thead>
+<tr>
+<th>选项</th>
+<th>值</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>WITH_C_API</td>
+<td>ON</td>
+</tr>
+<tr>
+<td>WITH_PYTHON</td>
+<td>OFF（推荐）</td>
+</tr>
+<tr>
+<td>WITH_SWIG_PY</td>
+<td>OFF（推荐）</td>
+</tr>
+<tr>
+<td>WITH_GOLANG</td>
+<td>OFF（推荐）</td>
+</tr>
+<tr>
+<td>WITH_GPU</td>
+<td>ON/OFF</td>
+</tr>
+<tr>
+<td>WITH_MKL</td>
+<td>ON/OFF</td>
+</tr></tbody></table>
+
+建议按照推荐值设置，以避免链接不必要的库。其它可选编译选项按需进行设定。
+
+下面的代码片段从github拉取最新代码，配置编译选项（需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径）：
+
+```shell
+PADDLE_ROOT=/path/of/capi
+git clone https://github.com/PaddlePaddle/Paddle.git
+cd Paddle
+mkdir build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      -DWITH_GOLANG=OFF \
+      -DWITH_PYTHON=OFF \
+      -DWITH_MKL=OFF \
+      -DWITH_GPU=OFF  \
+      ..
+```
+
+执行上述代码生成Makefile文件后，执行：`make && make install`。成功编译后，使用C-API所需的依赖（包括：（1）编译出的PaddlePaddle预测库和头文件；（2）第三方链接库和头文件）均会存放于`PADDLE_ROOT`目录中。
+
+编译成功后在 `PADDLE_ROOT` 下会看到如下目录结构（包括了编译出的PaddlePaddle头文件和链接库，以及第三方依赖链接库和头文件（如果需要，由链接方式决定））：
+
+```text
+├── include
+│   └── paddle
+│       ├── arguments.h
+│       ├── capi.h
+│       ├── capi_private.h
+│       ├── config.h
+│       ├── error.h
+│       ├── gradient_machine.h
+│       ├── main.h
+│       ├── matrix.h
+│       ├── paddle_capi.map
+│       └── vector.h
+├── lib
+│   ├── libpaddle_capi_engine.a
+│   ├── libpaddle_capi_layers.a
+│   ├── libpaddle_capi_shared.so
+│   └── libpaddle_capi_whole.a
+└── third_party
+    ├── gflags
+    │   ├── include
+    │   │   └── gflags
+    │   │       ├── gflags_completions.h
+    │   │       ├── gflags_declare.h
+    │   │       ...
+    │   └── lib
+    │       └── libgflags.a
+    ├── glog
+    │   ├── include
+    │   │   └── glog
+    │   │       ├── config.h
+    │   │       ...
+    │   └── lib
+    │       └── libglog.a
+    ├── openblas
+    │   ├── include
+    │   │   ├── cblas.h
+    │   │   ...
+    │   └── lib
+    │       ...
+    ├── protobuf
+    │   ├── include
+    │   │   └── google
+    │   │       └── protobuf
+    │   │           ...
+    │   └── lib
+    │       └── libprotobuf-lite.a
+    └── zlib
+        ├── include
+        │   ...
+        └── lib
+            ...
+
+```
+
+### 链接说明
+
+目前提供三种链接方式：
+
+1. 链接`libpaddle_capi_shared.so` 动态库（这种方式最为简便，链接相对容易，**在无特殊需求情况下，推荐使用此方式**），需注意：
+    1. 如果编译时指定编译CPU版本，且使用`OpenBLAS`数学库，在使用C-API开发预测程序时，只需要链接`libpaddle_capi_shared.so`这一个库。
+    1. 如果编译时指定编译CPU版本，且使用`MKL`数学库，由于`MKL`库有自己独立的动态库文件，在使用PaddlePaddle C-API开发预测程序时，需要自己链接MKL链接库。
+    1. 如果编译时指定编译GPU版本，CUDA相关库会在预测程序运行时动态装载，需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。
+
+2. 链接静态库 `libpaddle_capi_whole.a`，需注意：
+    1. 需要指定`-Wl,--whole-archive`链接选项。
+    1. 需要显式地链接 `gflags`、`glog`、`libz`、`protobuf` 等第三方库，可在`PADDLE_ROOT/third_party`下找到。
+    1. 如果在编译 C-API 时使用OpenBLAS数学库，需要显式地链接`libopenblas.a`。
+    1. 如果在编译 C-API 时使用MKL数学库，需要显式地链接MKL的动态库。
+
+3. 链接静态库 `libpaddle_capi_layers.a`和`libpaddle_capi_engine.a`，需注意：
+    1. 这种链接方式主要用于移动端预测。
+    1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。
+    1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。
+    1. 第三方依赖库需要按照与方式2同样方法显式地进行链接。
diff --git a/doc/v2/howto/capi/compile_paddle_lib_en.md b/doc/v2/howto/capi/compile_paddle_lib_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..70a6edef27e75af6b38d7d4824c928eba0d29b9a
--- /dev/null
+++ b/doc/v2/howto/capi/compile_paddle_lib_en.md
@@ -0,0 +1,180 @@
+## Install and Build
+
+### Download & Install 
+
+  Download the latest C-API development package from CI system and install. You can find the required version in the table below:
+<table>
+<thead>
+<tr>
+<th>Version Tips</th>
+<th>C-API</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>cpu_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cpu_avx_openblas</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cpu_noavx_openblas</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cuda7.5_cudnn5_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cuda8.0_cudnn5_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cuda8.0_cudnn7_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cuda9.0_cudnn7_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddle.tgz/?branch=0.14.0" rel="nofollow">paddle.tgz</a></td>
+</tr>
+</tbody></table>
+
+### From source
+
+  Users can also compile the C-API library from PaddlePaddle source code by compiling with the following compilation options:
+  
+<table>
+<thead>
+<tr>
+<th>Options</th>
+<th>Value</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>WITH_C_API</td>
+<td>ON</td>
+</tr>
+<tr>
+<td>WITH_PYTHON</td>
+<td>OFF（recommended）</td>
+</tr>
+<tr>
+<td>WITH_SWIG_PY</td>
+<td>OFF（recommended）</td>
+</tr>
+<tr>
+<td>WITH_GOLANG</td>
+<td>OFF（recommended）</td>
+</tr>
+<tr>
+<td>WITH_GPU</td>
+<td>ON/OFF</td>
+</tr>
+<tr>
+<td>WITH_MKL</td>
+<td>ON/OFF</td>
+</tr></tbody></table>
+
+It is best to set up with recommended values to avoid linking with unnecessary libraries. Set other compilation options as you need.
+
+The following code snippet pulls the latest code from github and configures the compilation options (replace PADDLE_ROOT with the installation path of the PaddlePaddle C-API inference library):
+
+```shell
+PADDLE_ROOT=/path/of/capi
+git clone https://github.com/PaddlePaddle/Paddle.git
+cd Paddle
+mkdir build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      -DWITH_GOLANG=OFF \
+      -DWITH_PYTHON=OFF \
+      -DWITH_MKL=OFF \
+      -DWITH_GPU=OFF  \
+      ..
+```
+
+After running the above code to generate the Makefile, run `make && make install`. After successful compilation, the dependencies required by the C-API (including: (1) the PaddlePaddle inference library and header files; (2) third-party libraries and header files) will be stored in the `PADDLE_ROOT` directory.
+
+If the compilation is successful, you will see the following directory structure under `PADDLE_ROOT` (it includes the PaddlePaddle header files and libraries, plus third-party libraries and header files if the chosen linking method requires them):
+
+```text
+├── include
+│   └── paddle
+│       ├── arguments.h
+│       ├── capi.h
+│       ├── capi_private.h
+│       ├── config.h
+│       ├── error.h
+│       ├── gradient_machine.h
+│       ├── main.h
+│       ├── matrix.h
+│       ├── paddle_capi.map
+│       └── vector.h
+├── lib
+│   ├── libpaddle_capi_engine.a
+│   ├── libpaddle_capi_layers.a
+│   ├── libpaddle_capi_shared.so
+│   └── libpaddle_capi_whole.a
+└── third_party
+    ├── gflags
+    │   ├── include
+    │   │   └── gflags
+    │   │       ├── gflags_completions.h
+    │   │       ├── gflags_declare.h
+    │   │       ...
+    │   └── lib
+    │       └── libgflags.a
+    ├── glog
+    │   ├── include
+    │   │   └── glog
+    │   │       ├── config.h
+    │   │       ...
+    │   └── lib
+    │       └── libglog.a
+    ├── openblas
+    │   ├── include
+    │   │   ├── cblas.h
+    │   │   ...
+    │   └── lib
+    │       ...
+    ├── protobuf
+    │   ├── include
+    │   │   └── google
+    │   │       └── protobuf
+    │   │           ...
+    │   └── lib
+    │       └── libprotobuf-lite.a
+    └── zlib
+        ├── include
+        │   ...
+        └── lib
+            ...
+
+```
+
+### Linking Description:
+
+There are three kinds of linking methods:
+
+1. Linking with the dynamic library `libpaddle_capi_shared.so` (this is the most convenient and easiest way; **it is recommended unless you have special requirements**). Note the following:
+    1. If the library was compiled as a CPU version with `OpenBLAS`, you only need to link `libpaddle_capi_shared.so` to develop an inference program with the C-API.
+    1. If the library was compiled as a CPU version with `MKL`, you need to link the MKL library yourself when developing an inference program with the PaddlePaddle C-API, because `MKL` has its own dynamic library.
+    1. If the library was compiled as a GPU version, the CUDA libraries are loaded dynamically when the inference program runs, so the CUDA library paths must be added to the `LD_LIBRARY_PATH` environment variable.
+
+2. Linking with the static library `libpaddle_capi_whole.a`. Note the following:
+    1. Specify the `-Wl,--whole-archive` linking option.
+    1. Explicitly link third-party libraries such as `gflags`, `glog`, `libz` and `protobuf`; you can find them under the `PADDLE_ROOT/third_party` directory.
+    1. If the C-API was compiled with the OpenBLAS library, you must explicitly link `libopenblas.a`.
+    1. If the C-API was compiled with MKL, you must explicitly link the MKL dynamic library.
+
+3. Linking with the static libraries `libpaddle_capi_layers.a` and `libpaddle_capi_engine.a`. Note the following:
+    1. This linking method is mainly used for mobile inference.
+    1. `libpaddle_capi_whole.a` is split into these two static libraries to reduce the size of the linked library.
+    1. Specify `-Wl,--whole-archive -lpaddle_capi_layers` and `-Wl,--no-whole-archive -lpaddle_capi_engine` for linking.
+    1. Third-party dependencies must be linked explicitly, in the same way as in method 2 above.
diff --git a/doc/v2/howto/capi/images/csr.png b/doc/v2/howto/capi/images/csr.png
new file mode 100644
index 0000000000000000000000000000000000000000..3dc10b8de4f6d3f517624956b1694b689405a031
Binary files /dev/null and b/doc/v2/howto/capi/images/csr.png differ
diff --git a/doc/v2/howto/capi/images/sequence_data.png b/doc/v2/howto/capi/images/sequence_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..6e47a46b8955dfe977e85898fe3c9f33ed28de7e
Binary files /dev/null and b/doc/v2/howto/capi/images/sequence_data.png differ
diff --git a/doc/v2/howto/capi/images/workflow_of_CAPI.png b/doc/v2/howto/capi/images/workflow_of_CAPI.png
new file mode 100644
index 0000000000000000000000000000000000000000..a4399ade048b3fe10d2d9c714bc34333ca068edb
Binary files /dev/null and b/doc/v2/howto/capi/images/workflow_of_CAPI.png differ
diff --git a/doc/v2/howto/capi/index_cn.rst b/doc/v2/howto/capi/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7f100717983f5e950b801e6b05ee48bfff273c62
--- /dev/null
+++ b/doc/v2/howto/capi/index_cn.rst
@@ -0,0 +1,26 @@
+C-API预测库
+==================
+
+当我们训练完一个神经网络模型之后，下一步就是用模型来做预测。预测就是准备输入数据，经过模型处理之后，得到预测结果的过程。
+
+相比于模型训练，预测有如下特点：
+
+#. 预测不需要训练过程中反向传播和参数更新的部分。
+#. 预测不需要标签(label)。
+#. 预测很多时候需要和用户系统整合在一起。
+
+因为上述特点，模型预测SDK需要单独设计，并具备以下特点：
+
+#. 预测SDK不包含反向传播和参数更新部分，以减小SDK的体积。
+#. 预测SDK需要提供一个简洁的用户接口，方便使用。
+#. 因为输入数据可能有多种结构，对输入数据的格式做清晰简洁的封装。
+#. 为了和用户系统兼容，SDK的接口需要是满足C标准的接口。
+
+PaddlePaddle提供了C-API，用于解决上述问题。关于C-API的使用，我们提供了如下指南：
+
+..  toctree::
+  :maxdepth: 1
+
+  compile_paddle_lib_cn.md
+  organization_of_the_inputs_cn.md
+  workflow_of_capi_cn.md
diff --git a/doc/v2/howto/capi/index_en.rst b/doc/v2/howto/capi/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4ec39c9d5223442cf6872edaf7befeb5053b538e
--- /dev/null
+++ b/doc/v2/howto/capi/index_en.rst
@@ -0,0 +1,26 @@
+C-API Inference Library
+========================
+
+After we train a neural network, we use it to do inference. Inference is the process of preparing input data and propagating it through the model to produce the result.
+
+Compared with model training, prediction has the following features:
+
+#. Inference does not require backpropagation and parameter updates, as required during training.
+#. Labels are not needed in prediction.
+#. Most of the time, predictions need to be integrated with the user system.
+
+Therefore, the model prediction SDK needs to be designed separately and has the following features:
+
+#. The predictive SDK does not include backpropagation and parameter updates to reduce the size of the SDK.
+#. The predictive SDK needs a simple user interface for ease of use.
+#. Since the input data may have a variety of structures, the format of the input data is clearly and compactly packaged.
+#. In order to be compatible with user's system, the SDK's interface must conform to the C-standard interface.
+
+PaddlePaddle provides C-API to solve the above problem. Following are the guidelines to use the C-API:
+
+..  toctree::
+  :maxdepth: 1
+
+  compile_paddle_lib_en.md
+  organization_of_the_inputs_en.md
+  workflow_of_capi_en.md
diff --git a/doc/v2/howto/capi/organization_of_the_inputs_cn.md b/doc/v2/howto/capi/organization_of_the_inputs_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..343526c213110cb9c6abaf9a12b3d634ad3fabe9
--- /dev/null
+++ b/doc/v2/howto/capi/organization_of_the_inputs_cn.md
@@ -0,0 +1,289 @@
+## 输入/输出数据组织
+
+这篇文档介绍在使用 PaddlePaddle C-API 时如何组织输入数据，以及如何解析神经网络前向计算的输出结果。
+
+### 输入/输出数据类型
+在C-API中，按照基本数据类型在PaddlePaddle内部的定义和实现，输入数据可分为：
+
+1. 一维整型数组
+1. 二维浮点型矩阵
+
+    - 稠密矩阵
+    - 稀疏矩阵
+
+说明：
+
+1. 一维数组**仅支持整型值**；
+    - 常用于自然语言处理任务，例如：表示词语在词典中的序号；
+    - 分类任务中类别标签；
+1. 逻辑上高于二维的数据（例如含有多个通道的图片，视频等）在程序实现中都会转化为二维矩阵，转化方法在相应的领域都有通用解决方案，需要使用者自己了解并完成转化；
+1. 二维矩阵可以表示行向量和列向量，任何时候如果需要浮点型数组（向量），都应使用C-API中的矩阵来表示，而不是C-API中的一维数组。
+1. 不论是一维整型数组还是二维浮点数矩阵，**为它们附加上序列信息将变成序列输入。PaddlePaddle 会通过判断数据是否附带有序列信息来判断一个向量/矩阵是否是一个序列**。当非序列输入时，无需关心和处理序列信息。关于什么是“序列信息”，下文会详细进行介绍。
+
+### 基本使用概念
+
+- 在PaddlePaddle内部，神经网络中一个计算层的输入/输出被组织为一个 `Argument` 结构体，如果神经网络有多个输入或者多个输出，每一个输入/输出都会对应有自己的`Argument`。
+- `Argument` 并不真正“存储”数据，而是将输入/输出信息有机地组织在一起。
+- 在`Argument`内部由`IVector`（对应着上文提到的一维整型数组）和`Matrix`（对应着上文提到的二维浮点型矩阵）来实际存储数据；由 `Sequence Start Positions` (下文详细解释) 来描述输入/输出的序列信息。
+
+- **注**：
+    1. 这篇文档之后部分将会统一使用`argument`来特指PaddlePaddle中神经网络计算层一个输入/输出数据。
+    1. 使用`paddle_ivector`来特指PaddlePaddle中的一维整型数组。
+    1. 使用`paddle_matrix`来特指PaddlePaddle中的二维浮点型矩阵。
+
+### 组织输入数据
+- 一维整型数组
+
+    概念上可以将`paddle_ivector`理解为一个一维的整型数组，通常用于表示离散的类别标签，或是在自然语言处理任务中表示词语在字典中的序号。下面的代码片段创建了含有三个元素`1`、`2`、`3`的`paddle_ivector`。
+    ```c
+    int ids[] = {1, 2, 3};
+    paddle_ivector ids_array =
+        paddle_ivector_create(ids, sizeof(ids) / sizeof(int), false, false);
+    CHECK(paddle_arguments_set_ids(in_args, 0, ids_array));
+    ```
+
+- **稠密矩阵**
+    - 一个`m×n`的稠密矩阵是一个由`m`行`n`列元素排列成的矩形阵列，矩阵里的元素是浮点数。对神经网络来说，矩阵的高度`m`是一次预测接受的样本数目，宽度`n`是神经网络定义时，`paddle.layer.data`的`size`。
+    - 下面的代码片段创建了一个高度为1，宽度为`layer_size`的稠密矩阵，矩阵中每个元素的值随机生成。
+
+    ```c
+    paddle_matrix mat = paddle_matrix_create(
+                            /* height = batch size */ 1,
+                            /* width = dimensionality of the data layer */ layer_size,
+                            /* whether to use GPU */ false);
+
+    paddle_real* array;
+    // Get the pointer pointing to the start address of the first row of the
+    // created matrix.
+    CHECK(paddle_matrix_get_row(mat, 0, &array));
+
+    // Fill the matrix with a randomly generated test sample.
+    srand(time(0));
+    for (int i = 0; i < layer_size; ++i) {
+      array[i] = rand() / ((float)RAND_MAX);
+    }
+
+    // Assign the matrix to the argument.
+    CHECK(paddle_arguments_set_value(in_args, 0, mat));
+    ```
+
+- **稀疏矩阵**
+
+  PaddlePaddle C-API 中 稀疏矩阵使用[CSR（Compressed Sparse Row Format）](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format))格式存储。下图是CSR存储稀疏矩阵的示意图。
+  <p align="center">
+  <img src="https://user-images.githubusercontent.com/5842774/34159369-009fd328-e504-11e7-9e08-36bc6dc5e505.png" width=700><br> 图1. 稀疏矩阵存储示意图
+  </p>
+
+  CSR存储格式通过：（1）非零元素的值（上图中的`values`）；（2）行偏移(上图中的`row offsets`)：每一行元素在`values`中的起始偏移，`row offsets`中元素个数总是等于行数 + 1；（3）非零元素的列号（上图中的`column indices`）来确定稀疏矩阵的内容。
+
+  在PaddlePaddle C-API中，通过调用以下接口创建稀疏矩阵：
+
+  ```c
+  PD_API paddle_matrix paddle_matrix_create_sparse(
+      uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
+  ```
+
+  1. 创建稀疏矩阵时需要显式地指定矩阵的（1）高度（`height`，在神经网络中等于一次预测处理的样本数）（2）宽度（`width`，`paddle.layer.data`的`size`）以及（3）非零元个数（`nnz`）。
+  1. 当上述接口第4个参数`isBinary`指定为`true`时，**只需要设置行偏移（`row_offset`）和列号(`column indices`)，不需要提供元素值（`values`）**，这时行偏移和列号指定的元素默认其值为1。
+
+  下面的代码片段创建了一个CPU上的二值稀疏矩阵：
+
+  ```c
+  paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, true, false);
+  int colIndices[] = {9, 93, 109};  // layer_size here is greater than 109.
+  int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)};
+
+  CHECK(paddle_matrix_sparse_copy_from(mat,
+                                 rowOffset,
+                                 sizeof(rowOffset) / sizeof(int),
+                                 colIndices,
+                                 sizeof(colIndices) / sizeof(int),
+                                 NULL /*values array is NULL.*/,
+                                 0 /*size of the value array is 0.*/));
+  CHECK(paddle_arguments_set_value(in_args, 0, mat));
+  ```
+  下面的代码片段创建了一个CPU上的带元素值的稀疏矩阵：
+  ```c
+  paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, false, false);
+  int colIndices[] = {9, 93, 109};  // layer_size here is greater than 109.
+  int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)};
+  float values[] = {0.5, 0.5, 0.5};
+
+  CHECK(paddle_matrix_sparse_copy_from(mat,
+                                 rowOffset,
+                                 sizeof(rowOffset) / sizeof(int),
+                                 colIndices,
+                                 sizeof(colIndices) / sizeof(int),
+                                 values,
+                                 sizeof(values) / sizeof(float)));
+  ```
+  注意事项：
+  1. 移动端预测**不支持**稀疏矩阵及相关的接口。
+
+### 组织序列信息
+
+多个排成一列的元素（可以是整型、浮点数、浮点数向量等）构成一个序列，元素之间的顺序是序列所携带的重要信息。不同序列可能会含有不同数目个元素。在 PaddlePaddle 中，序列输入/输出数据是在上文介绍的**数据输入（一维整型数组，二维浮点数矩阵）基础上，附加上序列信息**。下面详细解释什么是“序列信息”。
+
+我们将神经网络一次计算接受的所有输入样本称之为一个`batch`（可以含有一条或多条样本），每一个序列在整个`batch`中的偏移，就是PaddlePaddle中所指的**序列信息**，称之为“sequence start positions”。PaddlePaddle 支持两种序列类型：
+
+1. 单层序列
+    - 序列中的每一个元素是非序列，是进行计算的基本单位，不可再进行拆分。
+    - 例如：自然语言中的句子是一个序列，序列中的元素是词语；
+1. 双层序列
+    - 序列中的每一个元素又是一个序列。
+    - 例如：自然语言中的段落是一个双层序列；段落是由句子构成的序列；句子是由词语构成的序列。
+    - 双层序列在处理长序列的任务或是构建层级模型时会发挥作用。
+
+这篇文档之后部分会统一使用`sequence_start_positions`来特指：PaddlePaddle中神经网络计算层输入/输出所携带的序列信息。
+
+对双层序列来讲，不仅要提供每一个外层序列在整个`batch`中的偏移，每一个外层序列又含有若干个内层序列，需要同时提供每一个内层序列在整个`batch`中的偏移。也就是说：**双层序列需要分别为外层序列和内层序列设置`sequence_start_positions`信息**。
+
+**注：**
+1. 不论序列中的元素在内存中占用多少实际存储空间，`sequence_start_positions`表示的偏移是以“序列中的一个元素”作为统计的基本单位，而不是相对`batch`起始存储地址以数据的存储大小为单位的偏移。
+1. 非序列输入不携带`sequence_start_positions`，非序列输入无需构造`sequence_start_positions`。
+1. **不论是单层序列还是双层序列的序列信息，都使用`paddle_ivector`（也就是PaddlePaddle中的一维整型数组）来存储。**
+
+图2 是PaddlePaddle中单层序列和双层序列存储示意图。
+<p align="center">
+<img src="https://user-images.githubusercontent.com/5842774/34159714-1f81a9be-e505-11e7-8a8a-4902146ec899.png" width=800><br>图2. 序列输入示意图
+</p>
+
+- 单层序列
+
+    图2 (a) 展示了一个含有4个序列的`batch`输入：
+    1. 4个序列的长度分别为：5、3、2、4；
+    1. 这时的`sequence_start_positions`为：`[0, 5, 8, 10, 14]`；
+    1. 不论数据域是`paddle_ivector`类型还是`paddle_matrix`类型，都可以通过调用下面的接口为原有的数据输入附加上序列信息，使之变为一个单层序列输入，代码片段如下：
+
+    ```c
+    int seq_pos_array[] = {0, 5, 8, 10, 14};
+    paddle_ivector seq_pos = paddle_ivector_create(
+        seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false);
+    // Suppose the network only has one input data layer.
+    CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
+    ```
+
+- 双层序列
+
+    图2 (b) 展示了一个含有4个序列的`batch`输入；
+    1. 4个序列的长度分别为：5、3、2、4；这四个序列又分别含有3、2、1、2个子序列；
+    1. 这时的需要同时提供：
+        - 外层序列在`batch`中的起始偏移：`[0, 5, 8, 10, 14]`；
+        - 内层序列在`batch`中的起始偏移：`[0, 2, 3, 5, 7, 8, 10, 13, 14]`；
+    1. 不论数据域是`paddle_ivector`类型还是`paddle_matrix`类型，这时需要调用创建序列信息和为`argument`设置序列信息的接口**两次**，分别为数据输入添加外层序列和内层序列的序列信息，使之变为一个双层序列输入，代码片段如下：
+    ```c
+    // set the sequence start positions for the outter sequences.
+    int outter_seq_pos_array[] = {0, 5, 8, 10, 14};
+    paddle_ivector seq_pos =
+        paddle_ivector_create(outter_seq_pos_array,
+                              sizeof(outter_seq_pos_array) / sizeof(int),
+                              false,
+                              false);
+    // The third parameter of this API indicates the sequence level.
+    // 0 for the outter sequence. 1 for the inner sequence.
+    // If the input is a sequence not the nested sequence, the third parameter is
+    // fixed to be 0.
+    CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
+
+    // set the sequence start positions for the inner sequences.
+    int inner_seq_pos_array[] = {0, 2, 3, 5, 7, 8, 10, 13, 14};
+    paddle_ivector inner_seq_pos = paddle_ivector_create(
+        inner_seq_pos_array, sizeof(inner_seq_pos_array) / sizeof(int), false, false);
+    // The third parameter of this API indicates the sequence level.
+    // 0 for the outter sequence. 1 for the inner sequence.
+    CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 1, inner_seq_pos));
+    ```
+
+注意事项：
+1. 当一个`batch`中含有多个序列，**不支持序列长度为`0`的序列（也就是空输入）** 作为输入。不同计算层对空输入的处理策略有可能不同，潜在会引起未定义行为，或者引起运行时错误，请在输入时进行合法性检查。
+
+### Python 端数据类型说明
+
+下表列出了Python端训练接口暴露的数据类型（`paddle.layer.data`函数`type`字段的取值）对应于调用C-API需要创建的数据类型：
+
+<html>
+<table border="2" frame="border">
+<table>
+<thead>
+<tr>
+<th style="text-align:left">Python 端数据类型</th>
+<th style="text-align:left">C-API 输入数据类型</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align:left">paddle.data_type.integer_value</td>
+<td style="text-align:left">整型数组，无需附加序列信息</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.dense_vector</td>
+<td style="text-align:left">浮点型稠密矩阵，无需附加序列信息</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_binary_vector</td>
+<td style="text-align:left">浮点型稀疏矩阵，无需提供非零元的值，默认为1，无需附加序列信息</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_vector</td>
+<td style="text-align:left">浮点型稀疏矩阵，需提供非零元的值，无需附加序列信息</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.integer_value_sequence</td>
+<td style="text-align:left">整型数组，需附加序列信息</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.dense_vector_sequence</td>
+<td style="text-align:left">浮点型稠密矩阵，需附加序列信息</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_binary_vector_sequence</td>
+<td style="text-align:left">浮点型稀疏矩阵，无需提供非零元的值，默认为1，需附加序列信息</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_vector_sequence</td>
+<td style="text-align:left">浮点型稀疏矩阵，需提供非零元的值，需附加序列信息</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.integer_value_sub_sequence</td>
+<td style="text-align:left">整型数组，需附加双层序列信息</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.dense_vector_sub_sequence</td>
+<td style="text-align:left">浮点型稠密矩阵，需附加双层序列信息</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_binary_vector_sub_sequence</td>
+<td style="text-align:left">浮点型稀疏矩阵，无需提供非零元的值，默认为1，需附加双层序列信息</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_vector_sub_sequence</td>
+<td style="text-align:left">浮点型稀疏矩阵，需提供非零元的值，需附加双层序列信息</td>
+</tr>
+</tbody>
+</table>
+</html>
+<br>
+
+
+### 输出数据
+
+PaddlePaddle中一个计算层的输出数据组织方式和输入数据组织方式完全相同。一个输出数据同样被组织为一个`argument`，`argument`通过`paddle_matrix`或`paddle_ivector`存储数据，如果输出是一个序列，那么会携带有`sequence_start_positions`信息。调用C-API相关接口，读取需要的结果即可。
+
+### 总结
+
+- 在PaddlePaddle内部，神经网络中一个计算层的输入/输出被组织为`argument`。
+- `argument`并不真正“存储”数据，而是将输入/输出信息有机地组织在一起。
+- 在`argument`内部由`paddle_ivector`（一维整型数组）和`paddle_matrix`（二维浮点型矩阵）来实际存储数据。
+如果是一个序列输入/输出由 `sequence start positions` 来记录输入/输出的序列信息。
+
+于是，在组织神经网络输入时，需要思考完成以下工作：
+
+1. 为每一个输入/输出创建`argument`。
+    - C-API 中操作`argument`的接口请查看[arguments.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h)。
+1. 为每一个`argument`创建`paddle_matrix`或者`paddle_ivector`来存储数据。
+    - C-API 中操作`paddle_ivector`的接口请查看 [vector.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/vector.h)。
+    - C-API 中操作`paddle_matrix`的接口请查看[matrix.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/matrix.h)。
+1. 如果输入是序列数据，需要创建并填写`sequence_start_positions`信息。
+    - 通过调用 [`paddle_arguments_set_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L137) 来为一个`argument`添加序列信息。
+    - 通过调用 [`paddle_arguments_get_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L150) 来读取一个`argument`的序列信息。
+    - 接口说明请查看 [arguments.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h) 文件。
diff --git a/doc/v2/howto/capi/organization_of_the_inputs_en.md b/doc/v2/howto/capi/organization_of_the_inputs_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..250d3b2f749aed018e63527e817899c843dff996
--- /dev/null
+++ b/doc/v2/howto/capi/organization_of_the_inputs_en.md
@@ -0,0 +1,3 @@
+## Input/Output Data Organization
+
+TBD
diff --git a/doc/v2/howto/capi/workflow_of_capi_cn.md b/doc/v2/howto/capi/workflow_of_capi_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..db1568a2afbea3cca0d4e1fe053ba9536a60ab3d
--- /dev/null
+++ b/doc/v2/howto/capi/workflow_of_capi_cn.md
@@ -0,0 +1,124 @@
+## C-API使用流程
+
+这篇文档介绍 PaddlePaddle C-API 整体使用流程。
+
+### 使用流程
+
+使用 C-API 的工作流程如图1所示，分为（1）准备预测模型和（2）预测程序开发两大部分。
+
+<p align="center">
+<img src="https://user-images.githubusercontent.com/5842774/34658453-365f73ea-f46a-11e7-9b3f-0fd112b27bae.png" width=500><br> 图1. C-API使用流程示意图
+</p>
+
+- 准备预测模型
+
+    1. 只将神经网络结构进行序列化。
+        - 只对神经网络结构进行序列化，加载模型需同时指定：网络结构的序列化结果和模型参数存储目录。
+    1. 将网络结构定义和训练结束存储下来的模型参数文件（多个）合并入一个文件。
+        - 神经网络模型结构和训练好的模型将被序列化合并入一个文件。
+        - 预测时只需加载一个文件便于发布。
+    - **注意**：以上两种方式只需选择其一即可。
+- 调用 C-API 开发预测程序
+
+    1. 初始化PaddlePaddle运行环境。
+    1. 加载预测模型。
+    1. 创建神经网络输入，组织输入数据。
+    1. 进行前向计算，获得计算结果。
+    1. 清理和结束。
+
+### 准备预测模型
+
+准备预测模型部分，我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression)，网络接受一幅图片作为输入，将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense) 中的相关脚本。
+
+调用C-API开发预测程序需要一个训练好的模型，运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本，在终端执行`python mnist_v2.py`，会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。
+
+下面，我们将训练结束后存储下来的模型转换成预测模型。
+
+1. 序列化神经网络模型配置
+
+    PaddlePaddle 使用 protobuf 来传输网络配置文件中定义的网络结构和相关参数，使用 C-API 进行预测时，需要将网络结构使用 protobuf 进行序列化，写入文件中。
+
+    调用[`paddle.utils.dump_v2_config`](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/utils/dump_v2_config.py)中的`dump_v2_config`函数能够将使用 PaddlePaddle V2 API 定义的神经网络结构 dump 到指定文件中，示例代码如下：
+
+    ```python
+    from paddle.utils.dump_v2_config import dump_v2_config
+    from mnist_v2 import network
+
+    predict = network(is_infer=True)
+    dump_v2_config(predict, "trainer_config.bin", True)
+    ```
+
+    对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例，[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程，可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化，结果会写入当前运行目录下的`trainer_config.bin`文件中。
+
+    使用这种方式，需要**在运行时将神经网络的多个可学习参数放在同一个目录中**，C-API可以通过分别指定序列化后的网络结构文件和参数目录来加载训练好的模型。
+
+2. 合并模型文件(可选)
+
+    一些情况为了便于发布，希望能够将序列化后的神经网络结构和训练好的模型参数打包进一个文件。对于这样的需求，可以使用`paddle.utils.merge_model`中的`merge_v2_model`接口对神经网络结构和训练好的参数进行序列化，将序列化结果写入一个文件内。
+
+    代码示例如下：
+
+    ```python
+    from paddle.utils.merge_model import merge_v2_model
+    from mnist_v2 import network
+
+    net = network(is_infer=True)
+    param_file = "models/params_pass_4.tar"
+    output_file = "output.paddle.model"
+    merge_v2_model(net, param_file, output_file)
+    ```
+
+    对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例，可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式，运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。
+
+#### 注意事项
+1. 为使用C-API，在调用`dump_v2_config`序列化神经网络结构时，参数`binary`必须指定为`True`。
+1. **预测使用的网络结构往往不同于训练**，通常需要去掉网络中的：（1）类别标签层；（2）损失函数层；（3）`evaluator`等，只留下核心计算层，请注意是否需要修改网络结构。
+1. 预测时，可以获取网络中定义的任意多个（大于等于一个）层前向计算的结果，需要哪些层的计算结果作为输出，就将这些层加入一个Python list中，作为调用`dump_v2_config`的第一个参数。
+
+### 编写预测代码
+
+预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。
+
+#### step 1. 初始化PaddlePaddle运行环境
+第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/main.h#L27) 初始化PaddlePaddle运行环境，该接口接受两个参数：参数的个数和参数列表。
+
+#### step 2. 加载模型
+
+这里介绍C-API使用中的一个重要概念：Gradient Machine。
+
+概念上，在 PaddlePaddle 内部，一个GradientMachine类的对象管理着一组计算层（PaddlePaddle Layers）来完成前向和反向计算，并处理与之相关的所有细节。在调用C-API预测时，只需进行前向计算而无需调用反向计算。这篇文档之后部分会使用`gradient machine`来特指调用PaddlePaddle C-API创建的GradientMachine类的对象。每一个 `gradient machine` 都会管理维护一份训练好的模型，下面是C-API提供的，两种常用的模型加载方式：
+
+1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L61)接口，从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型；
+1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L88)接口，与其它`gradient machine`共享已经加载的预测模型。这种情况多出现在使用多线程预测时，通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/multi_thread/main.c)。
+
+- 注意事项
+
+    1. 使用PaddlePaddle V2 API训练，模型中所有可学习参数会被存为一个压缩文件，需要手动进行解压，将它们放在同一目录中，C-API不会直接加载 V2 API 存储的压缩文件。
+    1. 如果使用`merge model`方式将神经网络结构和训练好的参数序列化到一个文件，请参考此[示例](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)。
+    1. 通过灵活使用以上两个接口，加载模型还可以有其它多种方式，例如也可在程序运行过程中再加载另外一个模型。
+
+#### step 3. 创建神经网络输入，组织输入数据
+
+基本使用概念：
+- 在PaddlePaddle内部，神经网络中一个计算层的输入输出被组织为一个 `Argument` 结构体，如果神经网络有多个输入或者多个输出，每一个输入/输出都会对应有自己的`Argument`。
+- `Argument` 并不真正“存储”数据，而是将输入/输出数据有机地组织在一起。
+- 在`Argument`内部由：1. `Matrix`（二维矩阵，存储浮点类型输入/输出）；2. `IVector`（一维数组，**仅用于存储整型值**，多用于自然语言处理任务）来实际存储数据。
+
+C-API支持的所有输入数据类型和他们的组织方式，请参考“输入/输出数据组织”一节。
+
+这篇文档的之后部分会使用`argument`来特指PaddlePaddle C-API中神经网络的一个输入/输出，使用`paddle_matrix`**特指**`argument`中用于存储数据的`Matrix`类的对象。
+
+在组织神经网络输入，获取输出时，需要思考完成以下工作：
+
+1. 为每一个输入/输出创建`argument`；
+1. 为每一个`argument`创建`paddle_matrix`来存储数据；
+
+与输入不同的是，不需在使用C-API时为输出`argument`的`paddle_matrix`对象分配空间。前向计算之后PaddlePaddle内部已经分配/管理了每个计算层输出的存储空间。
+
+#### step 4. 前向计算
+
+完成上述准备之后，通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。
+
+#### step 5. 清理
+
+结束预测之后，对使用的中间变量和资源进行清理和释放。
diff --git a/doc/v2/howto/capi/workflow_of_capi_en.md b/doc/v2/howto/capi/workflow_of_capi_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..1692ecd56520675f02ad25ef73761330ebd0e740
--- /dev/null
+++ b/doc/v2/howto/capi/workflow_of_capi_en.md
@@ -0,0 +1,3 @@
+## C-API Workflow
+
+TBD
diff --git a/doc/v2/howto/cluster/cmd_argument_cn.md b/doc/v2/howto/cluster/cmd_argument_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..c0ba093cbf2eac5c3b60a0b071b31776a11998f3
--- /dev/null
+++ b/doc/v2/howto/cluster/cmd_argument_cn.md
@@ -0,0 +1,167 @@
+# 启动参数说明
+
+下面以`doc/howto/cluster/src/word2vec`中的代码作为实例，介绍使用PaddlePaddle v2 API完成分布式训练。
+
+## 启动参数服务器
+
+执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
+
+```bash
+$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
+```
+
+如果希望可以在后台运行pserver程序，并保存输出到一个日志文件，可以运行：
+
+```bash
+$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log &
+```
+
+参数说明
+
+- port：**必选，默认7164**，pserver监听的起始端口，根据ports_num决定总端口个数，从起始端口监听多个端口用于通信
+- ports_num：**必选，默认1**，监听的端口个数
+- ports_num_for_sparse：**必选，默认0**，用于稀疏类型参数通信的端口个数
+- num_gradient_servers：**必选，默认1**，当前训练任务pserver总数
+
+## 启动计算节点
+
+执行以下命令启动使用python编写的trainer程序（文件名为任意文件名，如train.py）
+
+```bash
+$ python train.py
+```
+
+trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过[环境变量](https://zh.wikipedia.org/wiki/环境变量)或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量，将会优先使用`paddle.init()`中传入的参数。
+
+使用环境变量：
+
+```bash
+export PADDLE_INIT_USE_GPU=False
+export PADDLE_INIT_TRAINER_COUNT=1
+export PADDLE_INIT_PORT=7164
+export PADDLE_INIT_PORTS_NUM=1
+export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
+export PADDLE_INIT_NUM_GRADIENT_SERVERS=1
+export PADDLE_INIT_TRAINER_ID=0
+export PADDLE_INIT_PSERVERS=127.0.0.1
+```
+
+使用参数：
+
+```python
+paddle.init(
+        use_gpu=False,
+        trainer_count=1,
+        port=7164,
+        ports_num=1,
+        ports_num_for_sparse=1,
+        num_gradient_servers=1,
+        trainer_id=0,
+        pservers="127.0.0.1")
+```
+
+参数说明
+
+- use_gpu： **可选，默认False**，是否启用GPU训练
+- trainer_count：**必选，默认1**，当前trainer的线程数目
+- port：**必选，默认7164**，连接到pserver的端口
+- ports_num：**必选，默认1**，连接到pserver的端口个数
+- ports_num_for_sparse：**必选，默认0**，和pserver之间用于稀疏类型参数通信的端口个数
+- num_gradient_servers：**必选，默认1**，当前训练任务trainer总数
+- trainer_id：**必选，默认0**，每个trainer的唯一ID，从0开始的整数
+- pservers：**必选，默认127.0.0.1**，当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开
+
+```python
+trainer = paddle.trainer.SGD(..., is_local=False)
+```
+
+参数说明
+
+- is_local: **必选, 默认True**, 是否使用PServer更新参数
+
+## 准备数据集
+
+参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py)，准备训练数据和验证数据集，我们使用paddle.dataset.imikolov数据集，并根据分布式训练并发数（trainer节点个数），在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。
+
+在线上系统中，通常会使用MapReduce任务的输出结果作为训练结果，这样训练文件的个数会比较多，而且个数并不确定。在trainer中可以使用下面取模的方法为每个trainer分配训练数据文件：
+
+```python
+import os
+train_list = []
+flist = os.listdir("/train_data/")
+for f in flist:
+  suffix = int(f.split("-")[1])
+  if suffix % TRAINER_COUNT == TRAINER_ID:
+    train_list.append(f)
+```
+
+示例程序`prepare.py`会把训练集和测试集分别分割成多个文件（例子中为3个，后缀为`-00000`、`-00001`和`-00002`）:
+
+```bash
+train.txt
+train.txt-00000
+train.txt-00001
+train.txt-00002
+test.txt
+test.txt-00000
+test.txt-00001
+test.txt-00002
+```
+
+在进行分布式训练时，每个trainer进程需要能够读取属于自己的一份数据。在一些分布式系统中，系统会提供一个分布式存储服务，这样保存在分布式存储中的数据可以被集群中的每个节点读取到。如果不使用分布式存储，则需要手动拷贝属于每个trainer节点的训练数据到对应的节点上。
+
+对于不同的训练任务，训练数据格式和训练程序的`reader()`会大不相同，所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。
+
+## 准备训练程序
+
+我们会为每个训练任务在每个节点上创建一个工作空间（workspace），其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。
+
+最后，工作空间应如下所示：
+
+```bash
+.
+|-- my_lib.py
+|-- word_dict.pickle
+|-- train.py
+|-- train_data_dir/
+|   |-- train.txt-00000
+|   |-- train.txt-00001
+|   |-- train.txt-00002
+`-- test_data_dir/
+    |-- test.txt-00000
+    |-- test.txt-00001
+    `-- test.txt-00002
+```
+
+- `my_lib.py`：会被`train.py`调用的一些用户定义的库函数，比如PIL库等。
+- `word_dict.pickle`：在`train.py`中会使用到的字典数据文件。
+- `train.py`：训练程序，代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py)。***注意：*** 对于本样例代码，在使用不同的分布式计算平台时，您可能需要修改`train.py`开头的部分（如下），以便获得训练数据的位置和获取环境变量配置：
+
+  ```python
+  cluster_train_file = "./train_data_dir/train/train.txt"
+  cluster_test_file = "./test_data_dir/test/test.txt"
+  node_id = os.getenv("OMPI_COMM_WORLD_RANK")
+  if not node_id:
+      raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK")
+  ```
+
+- `train_data_dir`：包含训练数据的目录，可以是从分布式存储挂载过来的，也可以是在任务启动前下载到本地的。
+- `test_data_dir`：包含测试数据集的目录。
+
+## 异步 SGD 更新
+
+我们可以通过设置 `optimize` 的参数使之支持异步SGD更新。
+例如，设置 `AdaGrad` optimize 的 `is_async` 和 `async_lagged_grad_discard_ratio` 参数：
+
+```python
+adagrad = paddle.optimizer.AdaGrad(
+    is_async=True,
+    async_lagged_grad_discard_ratio=1.6,
+    learning_rate=3e-3,
+    regularization=paddle.optimizer.L2Regularization(8e-4))
+```
+
+- `is_async`: 是否为异步SGD更新模式。
+- `async_lagged_grad_discard_ratio`: 异步SGD更新的步长控制，接收到足够的gradient(
+  `async_lagged_grad_discard_ratio * num_gradient_servers`)之后，后面的gradient
+  将会被抛弃。
diff --git a/doc/v2/howto/cluster/cmd_argument_en.md b/doc/v2/howto/cluster/cmd_argument_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..df1381a00fa0fa129eecffe002164c489a4183aa
--- /dev/null
+++ b/doc/v2/howto/cluster/cmd_argument_en.md
@@ -0,0 +1,169 @@
+# Command-line arguments
+
+We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
+
+## Starting parameter server
+
+Type the below command to start a parameter server which will wait for trainers to connect:
+
+```bash
+$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0
+```
+
+If you wish to run parameter servers in background, and save a log file, you can type:
+
+```bash
+$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0 &> pserver.log &
+```
+
+Parameter Description
+
+- port: **required, default 7164**, port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput.
+- ports_num: **required, default 1**, total number of ports will listen on.
+- ports_num_for_sparse: **required, default 0**, number of ports which serves sparse parameter update.
+- num_gradient_servers: **required, default 1**, total number of gradient servers.
+- nics: **optional, default xgbe0,xgbe1**, network device name which parameter server will listen on.
+
+## Starting trainer
+
+Type the command below to start the trainer(name the file whatever you want, like "train.py")
+
+```bash
+$ python train.py
+```
+
+Trainers must have network connectivity to the parameter servers to finish the job. Trainers need to know the port and IPs to locate the parameter servers. You can pass arguments to trainers through [environment variables](https://en.wikipedia.org/wiki/Environment_variable) or pass them to the `paddle.init()` function. Arguments passed to the `paddle.init()` function will override environment variables.
+
+Use environment variables:
+
+```bash
+export PADDLE_INIT_USE_GPU=False
+export PADDLE_INIT_TRAINER_COUNT=1
+export PADDLE_INIT_PORT=7164
+export PADDLE_INIT_PORTS_NUM=1
+export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
+export PADDLE_INIT_NUM_GRADIENT_SERVERS=1
+export PADDLE_INIT_TRAINER_ID=0
+export PADDLE_INIT_PSERVERS=127.0.0.1
+python train.py
+```
+
+Pass arguments:
+
+```python
+paddle.init(
+        use_gpu=False,
+        trainer_count=1,
+        port=7164,
+        ports_num=1,
+        ports_num_for_sparse=1,
+        num_gradient_servers=1,
+        trainer_id=0,
+        pservers="127.0.0.1")
+```
+
+Parameter Description
+
+- use_gpu: **optional, default False**, set to "True" to enable GPU training.
+- trainer_count: **required, default 1**, number of threads in current trainer.
+- port: **required, default 7164**, port to connect to parameter server.
+- ports_num: **required, default 1**, number of ports for communication.
+- ports_num_for_sparse: **required, default 0**, number of ports for sparse type calculation.
+- num_gradient_servers: **required, default 1**, number of trainers in current job.
+- trainer_id: **required, default 0**, ID for every trainer, start from 0.
+- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
+
+```python
+trainer = paddle.trainer.SGD(..., is_local=False)
+```
+
+Parameter Description
+
+- is_local: **required, default True**, whether update parameters by PServer.
+
+## Prepare Training Dataset
+
+Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the beginning of `prepare.py` to change the count of output files.
+
+In the real world, we often use `MapReduce` job's output as training data, so there will be lots of files. You can use `mod` to assign training file to trainers:
+
+```python
+import os
+train_list = []
+flist = os.listdir("/train_data/")
+for f in flist:
+  suffix = int(f.split("-")[1])
+  if suffix % TRAINER_COUNT == TRAINER_ID:
+    train_list.append(f)
+```
+
+Example code `prepare.py` will split training data and testing data into 3 files with digital suffix like `-00000`, `-00001` and `-00002`:
+
+```bash
+train.txt
+train.txt-00000
+train.txt-00001
+train.txt-00002
+test.txt
+test.txt-00000
+test.txt-00001
+test.txt-00002
+```
+
+When the job starts, every trainer needs to get its own part of the data. In some distributed systems a storage service is provided, so the data stored there can be accessed by all the trainer nodes. Without the storage service, you must copy the training data to each trainer node.
+
+Different training jobs may have different data format and `reader()` function, developers may need to write different data prepare scripts and `reader()` functions for their job.
+
+## Prepare Training program
+
+We'll create a *workspace* directory on each node, storing your training program, dependencies, mounted or downloaded dataset directory.
+
+Your workspace may look like:
+
+```bash
+.
+|-- my_lib.py
+|-- word_dict.pickle
+|-- train.py
+|-- train_data_dir/
+|   |-- train.txt-00000
+|   |-- train.txt-00001
+|   |-- train.txt-00002
+`-- test_data_dir/
+    |-- test.txt-00000
+    |-- test.txt-00001
+    `-- test.txt-00002
+```
+
+- `my_lib.py`: user defined libraries, like PIL libs. This is optional.
+- `word_dict.pickle`: dict file for training word embedding.
+- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platforms to retrieve configuration environment variables:
+
+  ```python
+  cluster_train_file = "./train_data_dir/train/train.txt"
+  cluster_test_file = "./test_data_dir/test/test.txt"
+  node_id = os.getenv("OMPI_COMM_WORLD_RANK")
+  if not node_id:
+      raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK")
+  ```
+
+- `train_data_dir`: containing training data. Mount from storage service or copy training data to here.
+- `test_data_dir`: containing testing data.
+
+## Async SGD Update
+
+We can set some parameters of the optimizer to make it support async SGD update.
+For example, we can set the `is_async` and `async_lagged_grad_discard_ratio` of the `AdaGrad` optimizer:
+
+```python
+adagrad = paddle.optimizer.AdaGrad(
+    is_async=True,
+    async_lagged_grad_discard_ratio=1.6,
+    learning_rate=3e-3,
+    regularization=paddle.optimizer.L2Regularization(8e-4))
+```
+
+- `is_async`: Is Async-SGD or not.
+- `async_lagged_grad_discard_ratio`: For async SGD gradient commit control.
+  When `async_lagged_grad_discard_ratio * num_gradient_servers` commits have passed,
+  the current async gradient will be discarded silently.
diff --git a/doc/v2/howto/cluster/index_cn.rst b/doc/v2/howto/cluster/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2583457c54116b7a1d797d4f7b7c2c4789c6d882
--- /dev/null
+++ b/doc/v2/howto/cluster/index_cn.rst
@@ -0,0 +1,36 @@
+分布式训练
+==========
+
+深度学习模型的效果好坏与数据量的大小往往有直接的关系：相同的模型，在增大训练数据集后一般都能取得更好的效果。但是当数据量增大到一定程度后，单台计算机已经难以承受。这时，使用多台计算机进行分布式训练就是一个很自然的解决方案。在分布式训练中，训练数据被分割为多份，参与训练的多台机器分别读取自己的数据进行训练，并协同对整体模型的参数进行更新。
+
+分布式训练一般有着如下图所示的架构：
+
+.. image:: src/ps_cn.png
+   :width: 500
+
+- 数据分片（Data shard): 用于训练神经网络的数据，被切分成多个部分，每个部分分别给每个trainer使用。
+- 计算节点（Trainer）: 每个trainer启动后读取切分好的一部分数据，开始神经网络的“前馈”和“后馈”计算，并和参数服务器通信。在完成一定量数据的训练后，上传计算得出的梯度（gradients），然后下载优化更新后的神经网络参数（parameters）。
+- 参数服务器（Parameter server）:每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度，并完成参数优化更新，再将更新后的参数下发到每个计算节点。
+
+通过计算节点和参数服务器的分布式协作，可以完成神经网络的同步随机梯度下降（SGD）方法的训练。PaddlePaddle同时支持同步随机梯度下降（SGD）和异步随机梯度下降（ASGD）。
+
+在开始集群训练之前，需要先进行集群配置、PaddlePaddle安装等准备工作，了解如何通过这些步骤来配置分布式训练所需的基本环境：
+
+..  toctree::
+  :maxdepth: 1
+
+  preparations_cn.md
+
+集群训练有大量可配置的参数，例如使用的机器数量、通信端口等。了解如何通过设置启动参数的方式，对分布式训练的过程进行配置：
+
+..  toctree::
+  :maxdepth: 1
+
+  cmd_argument_cn.md
+
+PaddlePaddle可以兼容各种不同的集群。每种集群各有优势，使用的具体方式也有区别：
+
+..  toctree::
+  :maxdepth: 1
+
+  multi_cluster/index_cn.rst
diff --git a/doc/v2/howto/cluster/index_en.rst b/doc/v2/howto/cluster/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..31eda57c4fb3947d92df45ea8dbb9274c9814140
--- /dev/null
+++ b/doc/v2/howto/cluster/index_en.rst
@@ -0,0 +1,38 @@
+Distributed Training
+====================
+
+The effectiveness of the deep learning model is often directly related to the scale of the data: it can generally achieve better results after increasing the size of the dataset on the same model. However, it can not fit in one single computer when the amount of data increases to a certain extent. At this point, using multiple computers for distributed training is a natural solution. In distributed training, the training data is divided into multiple copies (sharding), and multiple machines participating in the training read their own data for training and collaboratively update the parameters of the overall model.
+
+Distributed training generally has the architecture shown below:
+
+.. image:: src/ps_en.png
+   :width: 500
+
+- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
+- Trainer: each trainer reads its data shard and trains the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer downloads the optimized parameters and continues its training.
+- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
+
+Training a neural network with synchronous stochastic gradient descent can be achieved through the cooperation of trainers and parameter servers.
+
+PaddlePaddle supports both synchronous stochastic gradient descent (SGD) and asynchronous SGD.
+
+Before starting the cluster training, you need to prepare the cluster configuration, PaddlePaddle installation, and other preparations. To understand how to configure the basic environment for distributed training, check the link below:
+
+..  toctree::
+  :maxdepth: 1
+
+  preparations_en.md
+
+Cluster training has a large number of configurable parameters, such as the number of machines used, communication ports, etc. To learn how to configure the distributed training process by setting these startup parameters, check the link below:
+
+..  toctree::
+  :maxdepth: 1
+
+  cmd_argument_en.md
+
+PaddlePaddle is compatible with a variety of different clusters. Each cluster has its own advantages. To learn how to run PaddlePaddle on each type of cluster, check the link below:
+
+..  toctree::
+  :maxdepth: 1
+
+  multi_cluster/index_en.rst
diff --git a/doc/v2/howto/cluster/multi_cluster/fabric_cn.md b/doc/v2/howto/cluster/multi_cluster/fabric_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..0385e401b399a51fad112e604dc56cb2f84c0a4b
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/fabric_cn.md
@@ -0,0 +1,42 @@
+# 使用fabric启动集群训练
+
+## 准备一个Linux集群
+可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下，执行`kubectl -f ssh_servers.yaml`启动一个测试集群，并使用`kubectl get po -o wide`获得这些节点的IP地址。
+
+## 启动集群作业
+
+`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
+
+`paddle.py` 为方便作业启动提供了两个独特的命令选项。
+
+-  `job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
+-  `job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
+
+`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务，只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
+```
+sh run.sh
+```
+
+集群作业将会在几秒后启动。
+
+## 终止集群作业
+`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
+
+## 检查集群训练结果
+详细信息请检查 $workspace/log 里的日志，每一个节点都有相同的日志结构。
+
+`paddle_trainer.INFO`
+提供几乎所有训练的内部输出日志，与本地训练相同。可在这里检查运行时模型的收敛情况。
+
+`paddle_pserver2.INFO`
+提供 pserver 运行日志，有助于诊断分布式错误。
+
+`server.log`
+提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+`train.log`
+提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+## 检查模型输出
+运行完成后，模型文件将被写入节点 0 的 `output` 目录中。
+工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
diff --git a/doc/v2/howto/cluster/multi_cluster/fabric_en.md b/doc/v2/howto/cluster/multi_cluster/fabric_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..bac9ffe1526a06a3a23b1d8acf33a5fb74b7e50d
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/fabric_en.md
@@ -0,0 +1,43 @@
+# Fabric
+
+## Prepare a Linux cluster
+
+Run `kubectl -f ssh_servers.yaml` under the directory:  `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes.
+
+## Launching Cluster Job
+`paddle.py` provides automated scripts to start all PaddlePaddle cluster processes on different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically pass these options to the PaddlePaddle lower level processes.
+
+`paddle.py` provides two distinct command options for easy job launching.
+
+- `job_dispatch_package`: set it to a local `workspace` directory, which will be dispatched to all nodes configured in `conf.py`. This is helpful when workspace files are modified frequently; otherwise, repeatedly deploying the workspace to multiple nodes is tedious.
+- `job_workspace`: set it to an already deployed workspace directory; `paddle.py` will then skip the dispatch stage and directly launch the cluster job on all nodes. This helps to reduce heavy
+dispatch latency.
+
+`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then:
+```
+sh run.sh
+```
+
+The cluster Job will start in several seconds.
+
+## Kill Cluster Job
+`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed.
+
+## Check Cluster Training Result
+Check the logs in $workspace/log for details; each node has the same log structure.
+
+`paddle_trainer.INFO`
+It provides almost all internal output log for training,  same as local training. Check runtime model convergence here.
+
+`paddle_pserver2.INFO`
+It provides parameter server running log, which could help to diagnose distributed error.
+
+`server.log`
+It provides stderr and stdout of parameter server process. Check error log if training crashes.
+
+`train.log`
+It provides stderr and stdout of trainer process. Check error log if training crashes.
+
+## Check Model Output
+After one pass finished, model files will be written in `output` directory in node 0.
+`nodefile` in workspace indicates the node id of current cluster job.
diff --git a/doc/v2/howto/cluster/multi_cluster/index_cn.rst b/doc/v2/howto/cluster/multi_cluster/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..eabf95eda0b20f91913201a6b4e5b56fa440597e
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/index_cn.rst
@@ -0,0 +1,35 @@
+在不同集群中运行
+================
+用户的集群环境不尽相同，为了方便大家的部署，我们提供了多种的集群部署方式，方便提交集群训练任务，以下将一一介绍:
+
+`Kubernetes <http://kubernetes.io>`_ 是Google开源的容器集群的调度框架，支持大规模集群生产环境的完整集群方案。以下指南展示了PaddlePaddle对Kubernetes的支持：
+
+..  toctree::
+  :maxdepth: 1
+
+  k8s_cn.md
+  k8s_distributed_cn.md
+
+`OpenMPI <https://www.open-mpi.org>`_  是成熟的高性能并行计算框架，在HPC领域使用非常的广泛。以下指南介绍了如何使用OpenMPI来搭建PaddlePaddle的集群训练任务:
+
+..  toctree::
+  :maxdepth: 1
+
+  openmpi_cn.md
+
+`Fabric <http://www.fabfile.org>`_ 是一个方便的程序部署和管理工具。我们提供了使用Fabric 进行部署、管理的方法，如果想详细了解，请阅读以下指南:
+
+..  toctree::
+  :maxdepth: 1
+
+  fabric_cn.md
+
+我们也支持在AWS上部署PaddlePaddle，详细请了解:
+
+..  toctree::
+  :maxdepth: 1
+
+  k8s_aws_cn.md
+
+您可以在 `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ 找到以上相关的例子。
+
diff --git a/doc/v2/howto/cluster/multi_cluster/index_en.rst b/doc/v2/howto/cluster/multi_cluster/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9bc1eb2e3796d95dd69b165e916e263ea34b87f6
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/index_en.rst
@@ -0,0 +1,35 @@
+Use different clusters
+======================
+
+Users' cluster environments differ. To make deployment easier, we provide several cluster deployment methods for submitting cluster training tasks, which are introduced below:
+
+`Kubernetes <http://kubernetes.io>`_ is Google's open-source scheduling framework for container clusters, providing a complete cluster solution for large-scale production environments. The following guides show PaddlePaddle's support for Kubernetes:
+
+..  toctree::
+  :maxdepth: 1
+
+  k8s_en.md
+  k8s_distributed_en.md
+
+`OpenMPI <https://www.open-mpi.org>`_ is a mature high-performance parallel computing framework, which is widely used in the field of HPC. The following guide describes how to use OpenMPI to build PaddlePaddle's cluster training task:
+
+..  toctree::
+  :maxdepth: 1
+
+  openmpi_en.md
+
+`Fabric <http://www.fabfile.org>`_ is a convenient tool for program deployment and management. We provide a way to deploy and manage with Fabric. If you want to know more about it, please read the following guidelines:
+
+..  toctree::
+  :maxdepth: 1
+
+  fabric_en.md
+
+We also support the deployment of PaddlePaddle on AWS. Learn more about:
+
+..  toctree::
+  :maxdepth: 1
+
+  k8s_aws_en.md
+
+The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..afc753aa42f19631c49a451a797f28365e65ed1d
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
@@ -0,0 +1,672 @@
+# Kubernetes on AWS
+
+我们将向你展示怎么样在AWS的Kubernetes集群上运行分布式PaddlePaddle训练，让我们从核心概念开始
+
+## PaddlePaddle分布式训练的核心概念
+
+### 分布式训练任务
+
+一个分布式训练任务可以看做是一个Kubernetes任务
+每一个Kubernetes任务都有相应的配置文件，此配置文件指定了像任务的pod个数之类的环境变量信息
+
+在分布式训练任务中，我们可以如下操作：
+
+1. 在分布式文件系统中，准备分块数据和配置文件（在此次教学中，我们会用到亚马逊分布式存储服务（EFS））
+2. 创建和提交一个kubernetes任务配置到集群中开始训练
+
+### Parameter Server和Trainer
+
+在PaddlePaddle集群中有两个角色：参数服务器（pserver）和trainer。每一个参数服务器进程都会保存一部分模型的参数。每一个trainer都保存一份完整的模型参数，并可以利用本地数据更新模型。在这个训练过程中，trainer发送模型更新到参数服务器中，参数服务器的职责就是聚合这些更新，以便于trainer可以把全局模型同步到本地。
+
+为了能够和pserver通信，trainer需要知道每一个pserver的IP地址。在Kubernetes中利用服务发现机制（比如：DNS、hostname）要比静态的IP地址要好一些，因为任何一个pserver的pod都可能被杀掉，然后新的pod会被调度到另一个不同IP地址的node上。不过目前我们使用的仍是静态IP地址的方式，后续会对此进行改进。
+
+参数服务器和trainer一块被打包成一个docker镜像，这个镜像会运行在被Kubernetes集群调度的pod中。
+
+### 训练者ID
+
+每一个训练过程都需要一个训练ID，以0作为基础值，作为命令行参数传递。训练过程因此用这个ID去读取数据分片。
+
+### 训练
+
+PaddlePaddle容器的入口是一个shell脚本，这个脚本可以读取Kubernetes内预置的环境变量。这里可以定义任务identity，在任务中identity可以用来远程访问包含所有pod的Kubernetes apiserver服务。
+
+每一个pod通过ip来排序。每一个pod的序列作为“pod id”。因为我们会在每一个pod中运行训练和参数服务，可以用“pod id”作为训练ID。入口脚本详细工作流程如下：
+
+1. 查找apiserver得到pod信息，通过ip排序来分配一个trainer_id。
+2. 从EFS持久化卷中复制训练数据到容器中。
+3. 从环境变量中解析paddle pserver和 paddle trainer的启动参数，然后开始启动流程。
+4. `trainer_id`为0的trainer会自动把训练结果写入到EFS卷中。
+
+
+## AWS的Kubernetes中的PaddlePaddle
+
+### 选择AWS服务区域
+这个教程需要多个AWS服务工作在一个区域中。在AWS创建任何东西之前，请检查链接https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/ 选择一个可以提供如下服务的区域：EC2, EFS, CloudFormation, KMS, VPC, S3。在教程中我们使用“Oregon(us-west-2)”作为例子。
+
+### 创建aws账户和IAM账户
+
+在每一个aws账户下可以创建多个IAM用户。允许为每一个IAM用户赋予权限，作为IAM用户可以创建/操作aws集群
+
+注册aws账户，请遵循用户指南。在AWS账户下创建IAM用户和用户组，请遵循用户指南
+
+请注意此教程需要如下的IAM用户权限：
+
+- AmazonEC2FullAccess
+- AmazonS3FullAccess
+- AmazonRoute53FullAccess
+- AmazonRoute53DomainsFullAccess
+- AmazonElasticFileSystemFullAccess
+- AmazonVPCFullAccess
+- IAMUserSSHKeys
+- IAMFullAccess
+- NetworkAdministrator
+- AWSKeyManagementServicePowerUser
+
+
+### 下载kube-aws and kubectl
+
+#### kube-aws
+
+在AWS中[kube-aws](https://github.com/coreos/kube-aws)是一个自动部署集群的CLI工具
+
+##### kube-aws完整性验证
+提示：如果你用的是非官方版本（e.g RC release）的kube-aws，可以跳过这一步骤。引入coreos的应用程序签名公钥:
+
+```
+gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E
+```
+
+指纹验证：
+
+```
+gpg2 --fingerprint FC8A365E
+```
+正确的指纹是： `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
+
+我们可以从[发布页面](https://github.com/coreos/kube-aws/releases)中下载kube-aws，教程使用0.9.1版本。
+
+验证tar包的GPG签名：
+
+```
+PLATFORM=linux-amd64
+ # Or
+PLATFORM=darwin-amd64
+
+gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz
+```
+##### 安装kube-aws
+解压:
+
+```
+tar zxvf kube-aws-${PLATFORM}.tar.gz
+```
+
+添加到环境变量:
+
+```
+mv ${PLATFORM}/kube-aws /usr/local/bin
+```
+
+
+#### kubectl
+
+[kubectl](https://Kubernetes.io/docs/user-guide/kubectl-overview/) 是一个操作Kubernetes集群的命令行接口
+
+利用`curl`工具从Kubernetes发布页面中下载`kubectl`
+
+```
+# OS X
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl
+
+# Linux
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl
+```
+
+为kubectl添加可执行权限，并将其移动到PATH中的目录下 (e.g. `/usr/local/bin`):
+
+```
+chmod +x ./kubectl
+sudo mv ./kubectl /usr/local/bin/kubectl
+```
+
+### 配置AWS证书
+
+首先检查这里 [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) 安装AWS命令行工具
+
+然后配置aws账户信息:
+
+```
+aws configure
+```
+
+
+添加如下信息:
+
+
+```
+AWS Access Key ID: YOUR_ACCESS_KEY_ID
+AWS Secret Access Key: YOUR_SECRET_ACCESS_KEY
+Default region name: us-west-2
+Default output format: json
+```
+
+`YOUR_ACCESS_KEY_ID` 和 `YOUR_SECRET_ACCESS_KEY` 是创建aws账户和IAM账户时得到的IAM的key和密码 [Create AWS Account and IAM Account](#create-aws-account-and-iam-account)
+
+描述任何运行在你账户中的实例来验证凭据是否工作:
+
+```
+aws ec2 describe-instances
+```
+
+### 定义集群参数
+
+#### EC2秘钥对
+
+秘钥对将认证ssh访问你的EC2实例。秘钥对的公钥部分将配置到每一个COREOS节点中。
+
+遵循 [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) Keypair用户指南来创建EC2秘钥对
+
+你可以使用创建好的秘钥对名称来配置集群.
+
+秘钥对只能用于同一区域（region）内的EC2实例。在教程中使用 us-west-2 ，所以请确认在这个区域（Oregon）中创建秘钥对。
+
+在浏览器中下载一个`key-name.pem`文件用来访问EC2实例，我们待会会用到.
+
+
+#### KMS秘钥
+
+亚马逊的KMS秘钥用来加密和解密集群的TLS资产（assets）。如果你已经有可用的KMS秘钥，你可以跳过创建新秘钥这一步，直接提供现存秘钥的ARN字符串。
+
+利用aws命令行创建kms秘钥:
+
+```
+aws kms --region=us-west-2 create-key --description="kube-aws assets"
+{
+    "KeyMetadata": {
+        "CreationDate": 1458235139.724,
+        "KeyState": "Enabled",
+        "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx",
+        "AWSAccountId": "xxxxxxxxxxxxx",
+        "Enabled": true,
+        "KeyUsage": "ENCRYPT_DECRYPT",
+        "KeyId": "xxxxxxxxx",
+        "Description": "kube-aws assets"
+    }
+}
+```
+
+我们稍后用到`Arn` 的值.
+
+在IAM用户许可中添加多个内联策略.
+
+进入[IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home)。点击`Users`按钮，点击刚才创建的用户，然后点击`Add inline policy`按钮，选择`Custom Policy`
+
+粘贴内联策略:
+
+```
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205552000",
+            "Effect": "Allow",
+            "Action": [
+                "kms:Decrypt",
+                "kms:Encrypt"
+            ],
+            "Resource": [
+                "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*"
+            ]
+        },
+		{
+            "Sid": "Stmt1482205746000",
+            "Effect": "Allow",
+            "Action": [
+                "cloudformation:CreateStack",
+                "cloudformation:UpdateStack",
+                "cloudformation:DeleteStack",
+                "cloudformation:DescribeStacks",
+                "cloudformation:DescribeStackResource",
+                "cloudformation:GetTemplate",
+                "cloudformation:DescribeStackEvents"
+            ],
+            "Resource": [
+                "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*"
+            ]
+        }
+    ]
+}
+```
+`Version` : 值必须是"2012-10-17".
+`AWS_ACCOUNT_ID`: 你可以从命令行中获取:
+
+```
+aws sts get-caller-identity --output text --query Account
+```
+
+`MY_CLUSTER_NAME`: 选择一个你喜欢的MY_CLUSTER_NAME，稍后会用到。
+请注意，堆栈名称必须满足正则表达式：`[a-zA-Z][-a-zA-Z0-9]*`，也就是说名称中不能有"_"（可以使用"-"），否则kube-aws在下面步骤中会抛出异常
+
+#### 外部DNS名称
+
+当集群被创建后，控制器将会在一个DNS名称上暴露TLS加密的API.
+
+该DNS名称需要有一条CNAME记录指向集群的DNS名称，或者有一条A记录指向集群的IP地址。
+
+我们稍后会用到DNS名称，如果没有DNS名称的话，你可以选择一个（比如：`paddle`）还可以修改`/etc/hosts`用本机的DNS名称和集群IP关联。还可以在AWS上增加一个名称服务来关联paddle集群IP，稍后步骤中会查找集群IP.
+
+#### S3 bucket
+
+在启动Kubernetes集群前需要创建一个S3 bucket
+
+使用aws命令行创建S3 bucket存在一些bug，所以这里使用[s3 console](https://console.aws.amazon.com/s3/home?region=us-west-2)。
+
+点击 `Create Bucket`，填写一个唯一的BUCKET_NAME，并确保区域是us-west-2 (Oregon)。
+
+#### 初始化assets
+
+在本机创建一个目录用来存放产生的assets:
+
+```
+$ mkdir my-cluster
+$ cd my-cluster
+```
+
+利用KMS Arn、秘钥对名称和前一步产生的DNS名称来初始化集群的CloudFormation栈:
+
+```
+kube-aws init \
+--cluster-name=MY_CLUSTER_NAME \
+--external-dns-name=MY_EXTERNAL_DNS_NAME \
+--region=us-west-2 \
+--availability-zone=us-west-2a \
+--key-name=KEY_PAIR_NAME \
+--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
+```
+
+`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key)
+
+`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name)
+
+`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair)
+
+`--kms-key-arn`: the "Arn" in [KMS key](#kms-key)
+
+这里的`us-west-2a`用于参数`--availability-zone`，但不同AWS账户支持的可用区各不相同
+
+请通过`aws ec2 --region us-west-2 describe-availability-zones`检查`us-west-2a`是否可用，如果不可用，请切换到其他有效的可用区（e.g., `us-west-2b`）。
+
+现在在asset目录中就有了集群的主配置文件cluster.yaml。
+
+默认情况下kube-aws会创建一个工作节点，修改`cluster.yaml`让`workerCount`从1个节点变成3个节点.
+
+#### 呈现asset目录内容
+
+在这个简单的例子中，你可以使用kube-aws生成TLS身份和证书
+
+```
+kube-aws render credentials --generate-ca
+```
+
+下一步在asset目录中生成一组集群assets.
+
+```
+kube-aws render stack
+```
+用于创建、更新Kubernetes集群并与其交互的assets（模板和凭证）会生成在当前目录下
+
+### 启动Kubernetes集群
+
+#### 创建一个在CloudFormation模板上定义好的实例
+
+现在让我们创建集群（在命令行中选择任意的 `PREFIX`）
+
+```
+kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX
+```
+
+`BUCKET_NAME`: 在[S3 bucket](#s3-bucket)一节中创建的bucket名称
+
+
+#### 配置DNS
+
+你可以执行命令 `kube-aws status`来查看创建后集群的API.
+
+```
+$ kube-aws status
+Cluster Name:		paddle-cluster
+Controller DNS Name:	paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+```
+如果你有DNS名称，可以把A记录设置为负载均衡器的任一ip（查询方法见下文），或者设置CNAME指向`Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`)
+
+##### 查询IP地址
+
+用命令`dig`去检查负载均衡器的域名来获取ip地址.
+
+```
+$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+
+;; QUESTION SECTION:
+;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A
+
+;; ANSWER SECTION:
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.67.102.112
+```
+
+在上面的例子中，`54.241.164.52`, `54.67.102.112`这两个ip都将是工作状态
+
+*如果你有DNS名称*，设置记录到ip上，然后你可以直接跳到“进入集群”这一步
+
+*如果没有自己的DNS名称*
+
+编辑/etc/hosts文件用DNS关联IP
+
+##### 更新本地的DNS关联
+编辑`/etc/hosts`文件用DNS关联IP
+##### 在VPC上添加route53私有名称服务
+ - 打开[Route53 Console](https://console.aws.amazon.com/route53/home)
+ - 根据配置创建域名zone
+   - domain名称为: "paddle"
+   - Type: "Private hosted zone for amazon VPC"
+   - VPC ID: `<Your VPC ID>`
+
+   ![route53 zone setting](src/route53_create_zone.png)
+ - 添加记录
+    - 点击zone中刚创建的“paddle”
+    - 点击按钮“Create record set”
+        - Name : leave blank
+        - type: "A"
+        - Value: `<kube-controller ec2 private ip>`
+
+        ![route53 create recordset](src/route53_create_recordset.png)
+ - 检查名称服务
+    - 通过ssh连接到kube-aws创建的任一实例
+    - 运行命令"host paddle"，看看返回的ip是否为kube-controller的私有IP
+
+#### 进入集群
+
+集群运行后如下命令会看到:
+
+```
+$ kubectl --kubeconfig=kubeconfig get nodes
+NAME                                       STATUS    AGE
+ip-10-0-0-134.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-238.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-50.us-west-2.compute.internal    Ready     6m
+ip-10-0-0-55.us-west-2.compute.internal    Ready     6m
+```
+
+
+### 集群安装弹性文件系统
+
+训练数据存放在AWS上的EFS分布式文件系统中.
+
+1. 在[security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId)为EFS创建一个安全组
+  1. 查看`paddle-cluster-sg-worker`安全组的id（下图中为`sg-055ee37d`）
+  <center>![](src/worker_security_group.png)</center>
+
+  2. 增加安全组`paddle-efs` ，添加`ALL TCP`入站规则，并以`paddle-cluster-sg-worker`的group id作为自定义来源。VPC选择`paddle-cluster-vpc`, 确保可用区是在[Initialize Assets](#initialize-assets)的时候用到的那一个.
+  <center>![](src/add_security_group.png)</center>
+
+2. 利用`paddle-cluster-vpc`私有网络在[EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) 中创建弹性文件系统, 确定子网为`paddle-cluster-Subnet0`和安全组为`paddle-efs`.
+<center>![](src/create_efs.png)</center>
+
+
+### 开始在AWS上进行paddlepaddle的训练
+
+#### 配置Kubernetes卷指向EFS
+
+首先需要创建一个持久卷[PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) 到EFS上
+
+用 `pv.yaml`形式来保存
+```
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: efsvol
+spec:
+  capacity:
+    storage: 100Gi
+  accessModes:
+    - ReadWriteMany
+  nfs:
+    server: EFS_DNS_NAME
+    path: "/"
+```
+
+`EFS_DNS_NAME`: 我们创建的`paddle-efs`的描述信息中显示的DNS名称，形如`fs-2cbf7385.efs.us-west-2.amazonaws.com`
+
+运行下面的命令来创建持久卷:
+```
+kubectl --kubeconfig=kubeconfig create -f pv.yaml
+```
+下一步创建 [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/)来声明持久卷
+
+用`pvc.yaml`来保存.
+```
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: efsvol
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 50Gi
+```
+
+运行下面命令来创建持久卷声明:
+```
+kubectl --kubeconfig=kubeconfig create -f pvc.yaml
+```
+
+#### 准备训练数据
+
+启动Kubernetes job在我们创建的持久层上进行下载、保存并均匀拆分训练数据为3份.
+
+用`paddle-data-job.yaml`保存
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      containers:
+      - name: paddle-data
+        image: paddlepaddle/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/efs"
+          name: efs
+        env:
+        - name: OUT_DIR
+          value: /efs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: efs
+          persistentVolumeClaim:
+            claimName: efsvol
+      restartPolicy: Never
+```
+
+运行下面的命令来启动任务:
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml
+```
+任务运行大概需要7分钟，可以使用下面命令查看任务状态，直到`paddle-data`任务的`SUCCESSFUL`状态为`1`时成功，[这里](src/k8s_data/README.md)有怎样创建镜像的源码
+```
+$ kubectl --kubeconfig=kubeconfig get jobs
+NAME          DESIRED   SUCCESSFUL   AGE
+paddle-data   1         1            6m
+```
+数据准备完成后的结果是以镜像`paddlepaddle/paddle-tutorial:k8s_data`存放，可以点击这里[here](src/k8s_data/README.md)查看如何创建docker镜像源码
+
+#### 开始训练
+
+现在可以开始运行paddle的训练任务，用`paddle-cluster-job.yaml`进行保存
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: efs
+        persistentVolumeClaim:
+          claimName: efsvol
+      containers:
+      - name: trainer
+        image: paddlepaddle/paddle-tutorial:k8s_train
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: quick_start
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        - name: TRAINER_COUNT
+          value: "3"
+        volumeMounts:
+        - mountPath: "/home/jobpath"
+          name: efs
+        ports:
+        - name: jobport0
+          hostPort: 7164
+          containerPort: 7164
+        - name: jobport1
+          hostPort: 7165
+          containerPort: 7165
+        - name: jobport2
+          hostPort: 7166
+          containerPort: 7166
+        - name: jobport3
+          hostPort: 7167
+          containerPort: 7167
+      restartPolicy: Never
+```
+
+`parallelism: 3, completions: 3` 意思是这个任务会同时开启3个paddlepaddle的pod，当3个pod都成功结束后这个任务才算完成。
+
+`env` 参数代表容器的环境变量，在这里指定paddlepaddle的参数.
+
+`ports` 指定TCP端口7164 - 7167和`pserver`进行连接，port从`CONF_PADDLE_PORT`(7164)到`CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1`(7167)。我们为稠密和稀疏参数的更新使用多个端口来降低延迟
+
+运行下面命令来启动任务.
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-cluster-job.yaml
+```
+
+检查pods信息
+
+```
+$ kubectl --kubeconfig=kubeconfig get pods
+NAME                       READY     STATUS    RESTARTS   AGE
+paddle-cluster-job-cm469   1/1       Running   0          9m
+paddle-cluster-job-fnt03   1/1       Running   0          9m
+paddle-cluster-job-jx4xr   1/1       Running   0          9m
+```
+
+检查指定pod的控制台输出
+```
+kubectl --kubeconfig=kubeconfig log -f POD_NAME
+```
+
+`POD_NAME`: 任何一个pod的名称 (e.g., `paddle-cluster-job-cm469`).
+
+运行`kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job`来检查训练任务的状态，将会在大约20分钟完成
+
+`pserver`和`trainer`的细节都隐藏在docker镜像`paddlepaddle/paddle-tutorial:k8s_train`中，这里[here](src/k8s_train/README.md) 有创建docker镜像的源码.
+
+#### 检查训练输出
+
+训练输出（模型快照和日志）将被保存在EFS上。我们可以用ssh登录到EC2的工作节点上，查看mount过的EFS和训练输出.
+
+1. ssh登录EC2工作节点
+```
+chmod 400 key-name.pem
+ssh -i key-name.pem core@INSTANCE_IP
+```
+
+`INSTANCE_IP`: EC2上Kubernetes工作节点的公共IP地址，进入[EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) 中检查任何`paddle-cluster-kube-aws-worker`实例的 `public IP`
+
+2. 挂载EFS
+```
+mkdir efs
+sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs
+```
+
+`EFS_DNS_NAME`: 我们创建的`paddle-efs`的描述信息中显示的DNS名称，形如`fs-2cbf7385.efs.us-west-2.amazonaws.com`.
+
+现在文件夹`efs`中的目录结构类似于:
+```
+-- paddle-cluster-job
+    |-- ...
+    |-- output
+    |   |-- node_0
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_1
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_2
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- pass-00000
+    |   |   |-- ___fc_layer_0__.w0
+    |   |   |-- ___fc_layer_0__.wbias
+    |   |   |-- done
+    |   |   |-- path.txt
+    |   |   `-- trainer_config.lr.py
+    |   |-- pass-00001...
+```
+`server.log` 是`pserver`的log日志，`train.log`是`trainer`的log日志，模型快照和描述存放在`pass-0000*`。注意：`node_0`、`node_1`、`node_2`目录代表的是PaddlePaddle节点及其trainer ID，而不是Kubernetes节点.
+
+### Kubernetes集群卸载或删除
+
+#### 删除EFS
+
+到[EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) 中删除创建的EFS卷
+
+#### 删除安全组
+
+去[Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) 删除安全组`paddle-efs`.
+
+#### 删除S3 bucket
+
+进入 [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#)删除S3 bucket
+
+#### 销毁集群
+
+```
+kube-aws destroy
+```
+
+命令会立刻返回，但需要大约5分钟来销毁集群
+
+可以进入 [CloudFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active)检查销毁的过程。
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..8e8e87be711bd45177ed77c81c531606e801d1f0
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_aws_en.md
@@ -0,0 +1,688 @@
+# Kubernetes on AWS
+
+We will show you step by step on how to run distributed PaddlePaddle training on AWS cluster with Kubernetes. Let's start from core concepts.
+
+## Distributed PaddlePaddle Training Core Concepts
+
+### Distributed Training Job
+
+A distributed training job is represented by a [Kubernetes job](https://kubernetes.io/docs/user-guide/jobs/#what-is-a-job).
+
+Each Kubernetes job is described by a job config file, which specifies the information like the number of [pods](https://kubernetes.io/docs/user-guide/pods/#what-is-a-pod) in the job and environment variables.
+
+In a distributed training job, we would:
+
+1. prepare partitioned training data and configuration file on a distributed file system (in this tutorial we use Amazon Elastic File System), and
+1. create and submit the Kubernetes job config to the Kubernetes cluster to start the training job.
+
+### Parameter Servers and Trainers
+
+There are two roles in a PaddlePaddle cluster: *parameter server (pserver)* and *trainer*. Each parameter server process maintains a shard of the global model. Each trainer has its local copy of the model, and uses its local data to update the model. During the training process, trainers send model updates to parameter servers, parameter servers are responsible for aggregating these updates, so that trainers can synchronize their local copy with the global model.
+
+<center>![Model is partitioned into two shards. Managed by two parameter servers respectively.](src/pserver_and_trainer.png)</center>
+
+In order to communicate with pserver, trainer needs to know the ip address of each pserver. In kubernetes it's better to use a service discovery mechanism (e.g., DNS hostname) rather than static ip address, since any pserver's pod may be killed and a new pod could be scheduled onto another node of different ip address. However, now we are using static ip. This will be improved.
+
+Parameter server and trainer are packaged into a same docker image. They will run once pod is scheduled by kubernetes job.
+
+### Trainer ID
+
+Each trainer process requires a trainer ID, a zero-based index value, passed in as a command-line parameter. The trainer process thus reads the data partition indexed by this ID.
+
+### Training
+
+The entry-point of a container is a shell script. It can see some environment variables pre-defined by Kubernetes. This includes one that gives the job's identity, which can be used in a remote call to the Kubernetes apiserver that lists all pods in the job.
+
+We rank each pod by sorting them by their ips. The rank of each pod could be the "pod ID". Because we run one trainer and one parameter server in each pod, we can use this "pod ID" as the trainer ID. A detailed workflow of the entry-point script is as follows:
+
+1. Query the api server to get pod information, and assign the `trainer_id` by sorting the ip.
+1. Copy the training data from EFS persistent volume into container.
+1. Parse the `paddle pserver` and `paddle trainer` startup parameters from environment variables, and then start up the processes.
+1. Trainer with `trainer_id` 0 will automatically write results onto EFS volume.
+
+
+## PaddlePaddle on AWS with Kubernetes
+
+### Choose AWS Service Region
+This tutorial requires several AWS services work in the same region. Before we create anything in AWS, please check the following link
+https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/
+Choose a region which has the following services available: EC2, EFS, CloudFormation, KMS, VPC, S3.
+In this tutorial, we use "Oregon(us-west-2)" as example.
+
+### Create AWS Account and IAM Account
+
+Under each AWS account, we can create multiple [IAM](http://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html) users. This allows us to grant some privileges to each IAM user and to create/operate AWS clusters as an IAM user.
+
+To sign up an AWS account, please
+follow
+[this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html).
+To create IAM users and user groups under an AWS account, please
+follow
+[this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html).
+
+Please be aware that this tutorial needs the following privileges for the user in IAM:
+
+- AmazonEC2FullAccess
+- AmazonS3FullAccess
+- AmazonRoute53FullAccess
+- AmazonRoute53DomainsFullAccess
+- AmazonElasticFileSystemFullAccess
+- AmazonVPCFullAccess
+- IAMUserSSHKeys
+- IAMFullAccess
+- NetworkAdministrator
+- AWSKeyManagementServicePowerUser
+
+
+### Download kube-aws and kubectl
+
+#### kube-aws
+
+[kube-aws](https://github.com/coreos/kube-aws) is a CLI tool to automate cluster deployment to AWS.
+##### Verify kube-aws integrity
+Note: if you are using a non-official release (e.g. RC release) of kube-aws, you can skip this step.
+Import the CoreOS Application Signing Public Key:
+
+```
+gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E
+```
+
+Validate the key fingerprint:
+
+```
+gpg2 --fingerprint FC8A365E
+```
+The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
+
+We can download `kube-aws` from its [release page](https://github.com/coreos/kube-aws/releases). In this tutorial, we use version 0.9.1
+
+Validate the tarball's GPG signature:
+
+```
+PLATFORM=linux-amd64
+ # Or
+PLATFORM=darwin-amd64
+
+gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz
+```
+##### Install kube-aws
+Extract the binary:
+
+```
+tar zxvf kube-aws-${PLATFORM}.tar.gz
+```
+
+Add kube-aws to your path:
+
+```
+mv ${PLATFORM}/kube-aws /usr/local/bin
+```
+
+
+#### kubectl
+
+[kubectl](https://kubernetes.io/docs/user-guide/kubectl-overview/) is a command line interface for running commands against Kubernetes clusters.
+
+Download `kubectl` from the Kubernetes release artifact site with the `curl` tool.
+
+```
+# OS X
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl
+
+# Linux
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl
+```
+
+Make the kubectl binary executable and move it to your PATH (e.g. `/usr/local/bin`):
+
+```
+chmod +x ./kubectl
+sudo mv ./kubectl /usr/local/bin/kubectl
+```
+
+### Configure AWS Credentials
+
+First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface.
+
+And then configure your AWS account information:
+
+```
+aws configure
+```
+
+
+Fill in the required fields:
+
+
+```
+AWS Access Key ID: YOUR_ACCESS_KEY_ID
+AWS Secret Access Key: YOUR_SECRET_ACCESS_KEY
+Default region name: us-west-2
+Default output format: json
+```
+
+`YOUR_ACCESS_KEY_ID` and `YOUR_SECRET_ACCESS_KEY` are the IAM key and secret from [Create AWS Account and IAM Account](#create-aws-account-and-iam-account)
+
+Verify that your credentials work by describing any instances you may already have running on your account:
+
+```
+aws ec2 describe-instances
+```
+
+### Define Cluster Parameters
+
+#### EC2 key pair
+
+The key pair authenticates SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node.
+
+Follow [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) to create an EC2 key pair.
+
+After creating a key pair, you will use the key pair name to configure the cluster.
+
+Key pairs are only available to EC2 instances in the same region. We are using us-west-2 in our tutorial, so make sure to create key pairs in that region (Oregon).
+
+Your browser will download a `key-name.pem` file which is the key to access the EC2 instances. We will use it later.
+
+
+#### KMS key
+
+Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key.
+
+You can create a KMS key with the aws command line tool:
+
+```
+aws kms --region=us-west-2 create-key --description="kube-aws assets"
+{
+    "KeyMetadata": {
+        "CreationDate": 1458235139.724,
+        "KeyState": "Enabled",
+        "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx",
+        "AWSAccountId": "xxxxxxxxxxxxx",
+        "Enabled": true,
+        "KeyUsage": "ENCRYPT_DECRYPT",
+        "KeyId": "xxxxxxxxx",
+        "Description": "kube-aws assets"
+    }
+}
+```
+
+We will need to use the value of `Arn` later.
+
+And then let's add several inline policies in your IAM user permission.
+
+Go to [IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home). Click on button `Users`, click user that we just created, and then click on `Add inline policy` button, and select `Custom Policy`.
+
+Paste into following inline policies:
+
+```
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205552000",
+            "Effect": "Allow",
+            "Action": [
+                "kms:Decrypt",
+                "kms:Encrypt"
+            ],
+            "Resource": [
+                "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*"
+            ]
+        },
+		{
+            "Sid": "Stmt1482205746000",
+            "Effect": "Allow",
+            "Action": [
+                "cloudformation:CreateStack",
+                "cloudformation:UpdateStack",
+                "cloudformation:DeleteStack",
+                "cloudformation:DescribeStacks",
+                "cloudformation:DescribeStackResource",
+                "cloudformation:GetTemplate",
+                "cloudformation:DescribeStackEvents"
+            ],
+            "Resource": [
+                "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*"
+            ]
+        }
+    ]
+}
+```
+`Version` : Its value has to be exactly "2012-10-17".
+`AWS_ACCOUNT_ID`: You can get it from following command line:
+
+```
+aws sts get-caller-identity --output text --query Account
+```
+
+`MY_CLUSTER_NAME`: Pick a MY_CLUSTER_NAME that you like, you will use it later as well. 
+Please note, stack name must satisfy regular expression pattern: `[a-zA-Z][-a-zA-Z0-9]*`, which means no "_" in stack name (hyphens are allowed), or kube-aws will throw error in later steps.
+
+#### External DNS name
+
+When the cluster is created, the controller will expose the TLS-secured API on a DNS name.
+
+The DNS name should have a CNAME record pointing to the cluster DNS name, or an A record pointing to the cluster IP address.
+
+We will need to use DNS name later in tutorial. If you don't already own one, you can choose any DNS name (e.g., `paddle`) and modify `/etc/hosts` to associate cluster IP with that DNS name for your local machine. And add name service (route53) in aws to associate the IP to paddle for cluster. We will find the cluster IP in later steps.
+
+#### S3 bucket
+
+You need to create an S3 bucket before starting up the Kubernetes cluster.
+
+There are some bugs in aws cli in creating S3 bucket, so let's use the [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2).
+
+Click on `Create Bucket`, fill in a unique BUCKET_NAME, and make sure region is us-west-2 (Oregon).
+
+
+#### Initialize Assets
+
+Create a directory on your local machine to hold the generated assets:
+
+```
+$ mkdir my-cluster
+$ cd my-cluster
+```
+
+Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step:
+
+```
+kube-aws init \
+--cluster-name=MY_CLUSTER_NAME \
+--external-dns-name=MY_EXTERNAL_DNS_NAME \
+--region=us-west-2 \
+--availability-zone=us-west-2a \
+--key-name=KEY_PAIR_NAME \
+--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
+```
+
+`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key)
+
+`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name)
+
+`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair)
+
+`--kms-key-arn`: the "Arn" in [KMS key](#kms-key)
+
+Here `us-west-2a` is used for parameter `--availability-zone`, but supported availability zone varies among AWS accounts.
+
+Please check whether `us-west-2a` is supported by running `aws ec2 --region us-west-2 describe-availability-zones`; if not, switch to another supported availability zone (e.g., `us-west-2b` or `us-west-2c`).
+
+
+There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster.
+
+By default `kube-aws` will only create one worker node. Let's edit `cluster.yaml` and change `workerCount` from 1 to 3.
+
+
+#### Render contents of the asset directory
+
+In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
+
+```
+kube-aws render credentials --generate-ca
+```
+
+The next command generates the default set of cluster assets in your asset directory.
+
+```
+kube-aws render stack
+```
+Assets (templates and credentials) that are used to create, update and interact with your Kubernetes cluster will be created under your current folder.
+
+
+### Kubernetes Cluster Start Up
+
+#### Create the instances defined in the CloudFormation template
+
+Now let's create your cluster (choose any `PREFIX` for the command below):
+
+```
+kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX
+```
+
+`BUCKET_NAME`: the bucket name that you used in [S3 bucket](#s3-bucket)
+
+
+#### Configure DNS
+
+You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation.
+
+```
+$ kube-aws status
+Cluster Name:		paddle-cluster
+Controller DNS Name:	paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+```
+
+If you own a DNS name, set the A record to any of the IP addresses found in the next step. __Or__ you can set up a CNAME that points to the `Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`).
+
+##### Find IP address
+
+Use command `dig` to check the load balancer hostname to get the ip address.
+
+```
+$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+
+;; QUESTION SECTION:
+;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A
+
+;; ANSWER SECTION:
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.67.102.112
+```
+
+In the above output, both ip `54.241.164.52`, `54.67.102.112` will work.
+
+*If you own a DNS name*, set the A record to any of the above ip. Then you can skip to the step "Access the cluster".
+
+*If you do not own a DNS name*:
+##### Update local DNS association
+Edit `/etc/hosts` to associate above ip with the DNS name.
+##### Add Route53 private name service in VPC
+ - Open [Route53 Console](https://console.aws.amazon.com/route53/home)
+ - Create hosted zone with following config
+   - Domain name: "paddle"
+   - Type: "Private hosted zone for amazon VPC"
+   - VPC ID: `<Your VPC ID>`
+
+   ![route53 zone setting](src/route53_create_zone.png)
+ - Add A record
+    - Click on the zone "paddle" just created
+    - Click the button "Create record set"
+        - Name : leave blank
+        - type: "A"
+        - Value: `<kube-controller ec2 private ip>`
+
+        ![route53 create recordset](src/route53_create_recordset.png)
+ - Verify name service
+    - Connect to any instance created by kube-aws via ssh
+    - Run command "host paddle", see if the ip returned is the private ip of kube-controller
+
+#### Access the cluster
+
+Once the API server is running, you should see:
+
+```
+$ kubectl --kubeconfig=kubeconfig get nodes 
+NAME                                       STATUS    AGE
+ip-10-0-0-134.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-238.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-50.us-west-2.compute.internal    Ready     6m
+ip-10-0-0-55.us-west-2.compute.internal    Ready     6m
+```
+
+
+### Setup Elastic File System for Cluster
+
+Training data is usually served on a distributed filesystem, we use Elastic File System (EFS) on AWS.
+
+1. Create security group for EFS in [security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId)
+  1. Look up security group id for `paddle-cluster-sg-worker` (`sg-055ee37d` in the image below)
+  <center>![](src/worker_security_group.png)</center>
+  2. Add security group `paddle-efs` with `ALL TCP` inbound rule and custom source as group id of `paddle-cluster-sg-worker`. And VPC of `paddle-cluster-vpc`. Make sure availability zone is same as the one you used in [Initialize Assets](#initialize-assets).
+  <center>![](src/add_security_group.png)</center>
+
+2. Create the Elastic File System in [EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) with `paddle-cluster-vpc` VPC. Make sure subnet is `paddle-cluster-Subnet0` and security group is `paddle-efs`.
+<center>![](src/create_efs.png)</center>
+
+
+### Start PaddlePaddle Training Demo on AWS
+
+#### Configure Kubernetes Volume that Points to EFS
+
+First we need to create a [PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) to provision the EFS volume.
+
+Save following snippet as `pv.yaml`
+```
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: efsvol
+spec:
+  capacity:
+    storage: 100Gi
+  accessModes:
+    - ReadWriteMany
+  nfs:
+    server: EFS_DNS_NAME
+    path: "/"
+```
+
+`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Looks similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com`
+
+Run the following command to create a persistent volume:
+```
+kubectl --kubeconfig=kubeconfig create -f pv.yaml
+```
+
+Next let's create a [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/) to claim the persistent volume.
+
+Save following snippet as `pvc.yaml`.
+```
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: efsvol
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 50Gi
+```
+
+Run the following command to create a persistent volume claim:
+```
+kubectl --kubeconfig=kubeconfig create -f pvc.yaml
+```
+
+#### Prepare Training Data
+
+We will now launch a Kubernetes job that downloads, saves and evenly splits training data into 3 shards on the persistent volume that we just created.
+
+Save the following snippet as `paddle-data-job.yaml`
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      containers:
+      - name: paddle-data
+        image: paddlepaddle/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/efs"
+          name: efs
+        env:
+        - name: OUT_DIR
+          value: /efs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: efs
+          persistentVolumeClaim:
+            claimName: efsvol
+      restartPolicy: Never
+```
+
+Run following command to launch the job:
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml
+```
+
+Job may take 7 min to finish, use following command to check job status. Do not proceed until `SUCCESSFUL` for `paddle-data` job is `1`
+```
+$ kubectl --kubeconfig=kubeconfig get jobs
+NAME          DESIRED   SUCCESSFUL   AGE
+paddle-data   1         1            6m
+```
+
+Data preparation is done by docker image `paddlepaddle/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.
+
+#### Start Training
+
+Now we are ready to start paddle training job. Save following snippet as `paddle-cluster-job.yaml`
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: efs
+        persistentVolumeClaim:
+          claimName: efsvol
+      containers:
+      - name: trainer
+        image: paddlepaddle/paddle-tutorial:k8s_train
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: quick_start
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        - name: TRAINER_COUNT
+          value: "3"
+        volumeMounts:
+        - mountPath: "/home/jobpath"
+          name: efs
+        ports:
+        - name: jobport0
+          hostPort: 7164
+          containerPort: 7164
+        - name: jobport1
+          hostPort: 7165
+          containerPort: 7165
+        - name: jobport2
+          hostPort: 7166
+          containerPort: 7166
+        - name: jobport3
+          hostPort: 7167
+          containerPort: 7167
+      restartPolicy: Never
+```
+
+`parallelism: 3, completions: 3` means this job will simultaneously start 3 PaddlePaddle pods, and this job will be finished when there are 3 finished pods.
+
+`env` field represents container's environment variables, we specify PaddlePaddle parameters by environment variables.
+
+`ports` indicates that TCP ports 7164 - 7167 are exposed for communication between `pserver` and trainer. Ports are allocated contiguously from `CONF_PADDLE_PORT` (7164) to `CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1` (7167). We use multiple ports for dense and sparse parameter updates to improve latency.
+
+Run following command to launch the job.
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-cluster-job.yaml
+```
+
+Inspect individual pods
+
+```
+$ kubectl --kubeconfig=kubeconfig get pods
+NAME                       READY     STATUS    RESTARTS   AGE
+paddle-cluster-job-cm469   1/1       Running   0          9m
+paddle-cluster-job-fnt03   1/1       Running   0          9m
+paddle-cluster-job-jx4xr   1/1       Running   0          9m
+```
+
+Inspect individual console output
+```
+kubectl --kubeconfig=kubeconfig logs -f POD_NAME
+```
+
+`POD_NAME`: name of any pod (e.g., `paddle-cluster-job-cm469`).
+
+Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check training job status. It will complete in around 20 minutes.
+
+The details of starting `pserver` and `trainer` are hidden inside docker image `paddlepaddle/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.
+
+#### Inspect Training Output
+
+Training output (model snapshot and logs) will be saved in EFS. We can ssh into worker EC2 instance, mount EFS and check training output.
+
+1. ssh Into Worker EC2 instance
+```
+chmod 400 key-name.pem
+ssh -i key-name.pem core@INSTANCE_IP
+```
+
+`INSTANCE_IP`: public IP address of EC2 kubernetes worker node. Go to [EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) and check `public IP` of any `paddle-cluster-kube-aws-worker` instance.
+
+2. Mount EFS
+```
+mkdir efs
+sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs
+```
+
+`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Looks similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com`.
+
+Now folder `efs` will have structure similar to:
+```
+-- paddle-cluster-job
+    |-- ...
+    |-- output
+    |   |-- node_0
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_1
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_2
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- pass-00000
+    |   |   |-- ___fc_layer_0__.w0
+    |   |   |-- ___fc_layer_0__.wbias
+    |   |   |-- done
+    |   |   |-- path.txt
+    |   |   `-- trainer_config.lr.py
+    |   |-- pass-00001...
+```
+`server.log` contains log for `pserver`. `train.log` contains log for `trainer`. Model description and snapshot is stored in `pass-0000*`.
+
+### Kubernetes Cluster Tear Down
+
+#### Delete EFS
+
+Go to [EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) and delete the EFS volume that we created.
+
+#### Delete security group
+
+Go to [Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) and delete security group `paddle-efs`.
+
+
+#### Delete S3 Bucket
+
+Go to [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#) and delete the S3 bucket that we created.
+
+#### Destroy Cluster
+
+```
+kube-aws destroy
+```
+
+The command will return immediately, but it might take 5 min to tear down the whole cluster.
+
+You can go to [CloudFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active) to check the destroy process.
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..c1a11f7165a2f9da9dd044641274447e7943a597
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_cn.md
@@ -0,0 +1,206 @@
+# Kubernetes单机训练
+
+在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的PaddlePaddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。
+
+## 制作Docker镜像
+
+在一个功能齐全的Kubernetes机群里，通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话，一个分布式PaddlePaddle训练任务中
+的每个进程都可以从Ceph读取数据。在这个例子里，我们只演示一个单机作业，所以可以简化对环境的要求，把训练数据直接放在
+PaddlePaddle的Docker Image里。为此，我们需要制作一个包含训练数据的PaddlePaddle镜像。
+
+PaddlePaddle的 `paddlepaddle/paddle:cpu-demo-latest` 镜像里有PaddlePaddle的源码与demo，
+（请注意，默认的PaddlePaddle生产环境镜像 `paddlepaddle/paddle:latest` 是不包括源码的，PaddlePaddle的各版本镜像可以参考
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)），
+下面我们使用这个镜像来下载数据到Docker Container中，并把这个包含了训练数据的Container保存为一个新的镜像。
+
+### 运行容器
+
+```
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
+```
+
+### 下载数据
+
+进入容器`/root/paddle/demo/quick_start/data`目录，使用`get_data.sh`下载数据
+
+```
+$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
+
+Downloading Amazon Electronics reviews data...
+--2016-10-31 01:33:43--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
+Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
+HTTP request sent, awaiting response... 200 OK
+Length: 495854086 (473M) [application/x-gzip]
+Saving to: 'reviews_Electronics_5.json.gz'
+
+ 10% [=======>                                         ] 874,279     64.7KB/s  eta 2h 13m
+
+```
+
+### 修改启动脚本
+
+下载完数据后，修改`/root/paddle/demo/quick_start/train.sh`文件，内容如下（增加了一条cd命令）
+```
+set -e
+cd /root/paddle/demo/quick_start
+cfg=trainer_config.lr.py
+#cfg=trainer_config.emb.py
+#cfg=trainer_config.cnn.py
+#cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
+paddle train \
+  --config=$cfg \
+  --save_dir=./output \
+  --trainer_count=4 \
+  --log_period=20 \
+  --num_passes=15 \
+  --use_gpu=false \
+  --show_parameter_stats_period=100 \
+  --test_all_data_in_one_period=1 \
+  2>&1 | tee 'train.log'
+```
+
+### 提交镜像
+
+修改启动脚本后，退出容器，使用`docker commit`命令创建新镜像。
+
+```
+$ docker commit quick_start_data mypaddle/paddle:quickstart
+```
+
+## 使用 Kubernetes 进行训练
+
+>针对任务运行完成后容器自动退出的场景，Kubernetes有Job类型的资源来支持。下文就是用Job类型的资源来进行训练。
+
+### 编写yaml文件
+
+在训练时，输出结果可能会随着容器的销毁而被删除，需要在创建容器前挂载卷以便我们保存训练结果。使用我们之前构造的镜像，可以创建一个 [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job)，简单的yaml文件如下：
+
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: quickstart
+spec:
+  parallelism: 1
+  completions: 1
+  template:
+    metadata:
+      name: quickstart
+    spec:
+      volumes:
+      - name: output
+        hostPath: 
+          path: /home/work/paddle_output     
+      containers:
+      - name: pi
+        image: mypaddle/paddle:quickstart
+        command: ["bin/bash",  "-c", "/root/paddle/demo/quick_start/train.sh"]
+        volumeMounts:
+        - name: output
+          mountPath: /root/paddle/demo/quick_start/output
+      restartPolicy: Never
+```
+
+### 创建PaddlePaddle Job
+
+使用上文创建的yaml文件创建Kubernetes Job，命令为：
+
+```
+$ kubectl  create -f paddle.yaml
+```
+
+查看job的详细情况：
+
+```
+$ kubectl  get job
+NAME         DESIRED   SUCCESSFUL   AGE
+quickstart   1         0            58s
+
+$ kubectl  describe job quickstart
+Name:		quickstart
+Namespace:	default
+Image(s):	registry.baidu.com/public/paddle:cpu-demo-latest
+Selector:	controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
+Parallelism:	1
+Completions:	1
+Start Time:	Mon, 31 Oct 2016 11:20:16 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Pods Statuses:	0 Running / 1 Succeeded / 0 Failed
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+Events:
+  FirstSeen	LastSeen	Count	From			SubobjectPath	Type		Reason			Message
+  ---------	--------	-----	----			-------------	--------	------			-------
+  1m		1m		1	{job-controller }			Normal		SuccessfulCreate	Created pod: quickstart-fa0wx
+```
+
+### 查看训练结果
+
+根据Job对应的Pod信息，可以查看此Pod运行的宿主机。
+
+```
+kubectl  describe pod quickstart-fa0wx
+Name:		quickstart-fa0wx
+Namespace:	default
+Node:		paddle-demo-let02/10.206.202.44
+Start Time:	Mon, 31 Oct 2016 11:20:17 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Status:		Succeeded
+IP:		10.0.0.9
+Controllers:	Job/quickstart
+Containers:
+  quickstart:
+    Container ID:	docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
+    Image:		registry.baidu.com/public/paddle:cpu-demo-latest
+    Image ID:		docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
+    Port:
+    Command:
+      bin/bash
+      -c
+      /root/paddle/demo/quick_start/train.sh
+    QoS Tier:
+      cpu:		BestEffort
+      memory:		BestEffort
+    State:		Terminated
+      Reason:		Completed
+      Exit Code:	0
+      Started:		Mon, 31 Oct 2016 11:20:20 +0800
+      Finished:		Mon, 31 Oct 2016 11:21:46 +0800
+    Ready:		False
+    Restart Count:	0
+    Environment Variables:
+Conditions:
+  Type		Status
+  Ready 	False
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+```
+
+我们还可以登录到宿主机上查看训练结果。
+
+```
+[root@paddle-demo-let02 paddle_output]# ll
+total 60
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
+```
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..167089b8074b33e3b094fa3ec8e377630cec42ac
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_cn.md
@@ -0,0 +1,312 @@
+# Kubernetes分布式训练
+
+前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cluster/cluster_train_cn.html)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
+
+## 整体方案
+
+在训练之前，用户将配置与训练数据切分好放在分布式文件系统预先分配好的目录中(不同的分布式文件系统，需要使用其指定的方式挂载后并导入数据)，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：
+
+![paddle on kubernetes结构图](src/k8s-paddle-arch.png)
+
+上图描述了一个3节点的分布式训练场景，在每个Pod上都通过volume方式挂载分布式文件系统的一个目录用于保存训练数据和输出结果。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行，每个pod包含一个PaddlePaddle容器。在容器创建后，会启动pserver与trainer进程，读取volume中的数据进行这次分布式训练。
+
+根据前文的描述，要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练，按照下面步骤即可：
+
+1. [制作PaddlePaddle镜像](#制作镜像)
+1. [将训练文件与切分好的数据上传到共享存储](#准备训练数据)
+1. [编写本次训练的YAML文件，创建一个Kubernetes job](#创建Job)
+1. [训练结束后查看输出结果](#查看输出)
+
+下面就根据这几个步骤分别介绍。
+
+### 制作镜像
+
+PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行环境，用这个镜像创建的容器需要有以下两个功能：
+
+- 拷贝训练文件到容器内
+- 生成`paddle pserver`与`paddle train`进程的启动参数，并且启动训练
+
+因为官方镜像 `paddlepaddle/paddle:latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile)。
+
+```bash
+$ cd doc/howto/usage/k8s/src/k8s_train
+$ docker build -t [YOUR_REPO]/paddle:mypaddle .
+```
+
+然后将构建成功的镜像上传到镜像仓库。
+
+```bash
+docker push  [YOUR_REPO]/paddle:mypaddle
+```
+
+注意上述命令中`[YOUR_REPO]`表示读者所使用的Docker镜像仓库地址，读者需要替换成自己使用的仓库地址。下文使用`[YOUR_REPO]/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。
+
+### 准备训练数据
+
+这里我们通过在Kubernetes集群上启动一个Job来下载并切割数据，也可以通过修改[k8s_train](./src/k8s_train/README.md)的内容来定制image.
+
+在启动Job之前，需要根据不同的分布式存储来绑定一个[persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/),生成的数据将会存储在这个volume下.
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      hostNetwork: true
+      containers:
+      - name: paddle-data
+        image: paddlepaddle/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/mnt"
+          name: nfs
+        env:
+        - name: OUT_DIR
+          value: /home/work/mfs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: nfs
+          persistentVolumeClaim:
+            claimName: mfs
+      restartPolicy: Never
+```
+
+完成后volume中的文件内容大致如下：
+```bash
+[root@paddle-kubernetes-node0 nfsdir]$ tree -d
+.
+`-- paddle-cluster-job
+    |-- 0
+    |   `-- data
+    |-- 1
+    |   `-- data
+    |-- 2
+    |   `-- data
+    |-- output
+    |-- quick_start
+```
+
+目录中paddle-cluster-job是本次训练对应的job name，本次训练要求有3个PaddlePaddle节点，在paddle-cluster-job/data目录中存放切分好的数据，文件夹0，1，2分别代表3个节点的trainer_id。recommendation文件夹内存放训练文件，output文件夹存放训练结果与日志。
+
+### 创建Job
+
+Kubernetes可以通过YAML文件来创建相关对象，然后可以使用命令行工具创建job。
+
+Job YAML文件描述了这次训练使用的Docker镜像，需要启动的节点个数以及 `paddle pserver`与 `paddle train`进程启动的必要参数，也描述了容器需要使用的存储卷挂载的情况。YAML文件中各个字段的具体含义，可以查看[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job)。例如，本次训练的YAML文件可以写成：
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: jobpath
+        hostPath:
+          path: /home/work/mfs
+      containers:
+      - name: trainer
+        image: [YOUR_REPO]/paddle:mypaddle
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: recommendation
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        volumeMounts:
+        - name: jobpath
+          mountPath: /home/jobpath
+      restartPolicy: Never
+```
+
+文件中，`metadata`下的`name`表示这个job的名字。`parallelism，completions`字段表示这个job会同时开启3个PaddlePaddle节点，成功训练且退出的pod数目为3时，这个job才算成功结束。然后申明一个存储卷`jobpath`，代表宿主机目录`/home/work/mfs`，在对容器的描述`containers`字段中，将此目录挂载为容器的`/home/jobpath`目录，这样容器的`/home/jobpath`目录就成为了共享存储，放在这个目录里的文件其实是保存到了MFS上。
+
+`env`字段表示容器的环境变量，我们将`paddle`运行的一些参数通过这种方式传递到容器内：
+
+
+- JOB_PATH：共享存储挂载的路径
+- JOB_NAME：Job的名字
+- TRAIN_CONFIG_DIR：本次训练文件所在目录，与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
+- CONF_PADDLE_NIC：`paddle pserver`进程需要的`--nics`参数，即网卡名
+- CONF_PADDLE_PORT：`paddle pserver`的`--port`参数
+- CONF_PADDLE_PORTS_NUM：稠密更新的端口数量，即`--ports_num`参数
+- CONF_PADDLE_PORTS_NUM_SPARSE：稀疏更新的端口数量，即`--ports_num_for_sparse`参数
+- CONF_PADDLE_GRADIENT_NUM：训练节点数量，即`--num_gradient_servers`参数
+
+这些参数的具体描述，读者可以查看[这里](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html)。
+
+编写完YAML文件后，可以使用Kubernetes的命令行工具创建job。
+
+```bash
+kubectl create -f job.yaml
+```
+
+创建成功后，Kubernetes就会创建3个pod作为PaddlePaddle节点然后拉取镜像，启动容器开始训练。
+
+
+### 查看输出
+
+在训练过程中，可以在共享存储上查看输出的日志和模型，例如output目录下就存放了输出结果。注意node_0，node_1，node_2这几个目录表示PaddlePaddle节点与trainer_id，并不是Kubernetes中的node概念。
+
+```bash
+[root@paddle-kubernetes-node0 output]# tree -d
+.
+├── node_0
+│   ├── server.log
+│   └── train.log
+├── node_1
+│   ├── server.log
+│   └── train.log
+├── node_2
+......
+├── pass-00002
+│   ├── done
+│   ├── ___embedding_0__.w0
+│   ├── ___embedding_1__.w0
+......
+```
+
+我们可以通过日志查看容器训练的情况，例如：
+
+```bash
+[root@paddle-kubernetes-node0 node_0]# cat train.log
+I1116 09:10:17.123121    50 Util.cpp:155] commandline:
+ /usr/local/bin/../opt/paddle/bin/paddle_trainer
+    --nics=eth0 --port=7164
+    --ports_num=2 --comment=paddle_process_by_paddle
+    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
+    --ports_num_for_sparse=2 --config=./trainer_config.py
+    --trainer_count=4 --num_passes=10 --use_gpu=0
+    --log_period=50 --dot_period=10 --saving_period=1
+    --local=0 --trainer_id=0
+    --save_dir=/home/jobpath/paddle-cluster-job/output
+I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
+I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
+[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
+[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__]
+I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
+I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
+I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
+I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
+I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
+I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
+I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
+I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
+I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
+```
+
+
+## 一些细节的补充
+
+### 使用环境变量
+
+使用容器方式运行训练任务的Kubernetes Job，通常会使用环境变量配置Job的配置信息。`start_paddle.py`提供了一个启动脚本，将环境变量转换成paddle的命令行参数：
+```
+API = "/api/v1/namespaces/"
+JOBSELECTOR = "labelSelector=job-name="
+JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
+JOB_PATH_OUTPUT = JOB_PATH + "/output"
+JOBNAME = os.getenv("JOB_NAME")
+NAMESPACE = os.getenv("JOB_NAMESPACE")
+PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
+PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
+PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
+PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
+PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
+```
+
+### Pod间通信
+`start_paddle.py`脚本开始时，会先进行参数的初始化与解析。
+
+```python
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+                                     description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+```
+
+然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态（容器都已运行）时，再通过函数`getIdMap(podlist)`获取trainer_id。
+
+```python
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+* *注意*: `getPodList()`会获取当前namespace下的所有pod，如果已经有pod运行，可能会导致出错。这种集群节点管理方式会在将来使用[statefulsets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets/)代替。
+
+在函数`getIdMap(podlist)`内部，我们通过读取`podlist`中每个pod的IP地址，将IP排序生成的序号作为trainer_id。
+
+```python
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+在得到`idMap`后，通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。
+
+### 启动任务
+
+在函数`startPaddle`中，最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析，解析环境变量得到`PADDLE_NIC`，`PADDLE_PORT`，`PADDLE_PORTS_NUM`等参数，然后通过自身的IP地址在`idMap`中获取`trainerId`。
+
+```python
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..b2dc4da8451af317df76c5b3df328b6f58429610
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
@@ -0,0 +1,372 @@
+# Distributed Training on Kubernetes
+
+We introduced how to create a PaddlePaddle Job with a single node on Kubernetes in the
+previous document.
+In this article, we will introduce how to create a PaddlePaddle job with multiple nodes
+on a Kubernetes cluster.
+
+## Overall Architecture
+
+Before creating a training job, the users need to slice the training data and deploy
+the Python scripts along with it into the distributed file system
+(we can use different types of Kubernetes Volumes to mount different distributed
+file systems). Before training starts, the program will copy the training data into the
+Container and also save the models at the same path during training. The global architecture
+is as follows:
+
+![PaddlePaddle on Kubernetes Architecture](src/k8s-paddle-arch.png)
+
+The above figure describes a distributed training architecture which contains 3 nodes. Each
+Pod mounts a folder of the distributed file system through a Kubernetes Volume to save
+training data and models. Kubernetes creates 3 Pods for this training phase and schedules them on
+3 nodes; each Pod has a PaddlePaddle container. After the containers are created,
+PaddlePaddle starts up the communication between PServer and Trainer and reads training
+data for this training job.
+
+As described above, we can start up a PaddlePaddle distributed training job on a
+Kubernetes-ready cluster with the following steps:
+
+1. [Build PaddlePaddle Docker Image](#build-a-docker-image)
+1. [Split training data and upload to the distributed file system](#prepare-training-data)
+1. [Edit a YAML file and create a Kubernetes Job](#create-a-job)
+1. [Check the output](#checkout-the-output)
+
+We will introduce these steps as follows:
+
+### Build a Docker Image
+
+Training docker image needs to package the paddle pserver and paddle trainer runtimes, as well as two more processes before we can kick off the training:
+
+- Copying the training data into container.
+- Generating the initialization arguments for `Paddle PServer` and `Paddle Training` processes.
+
+Since the official PaddlePaddle docker image already has the runtimes we need, we'll take it as the base image and pack some additional scripts for the processes mentioned above to build our training image. For more detail, please refer to the following link:
+- https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
+
+
+```bash
+$ cd doc/v2/howto/cluster/multi_cluster/src/k8s_train
+$ docker build -t [YOUR_REPO]/paddle:mypaddle .
+```
+
+And then upload the new Docker Image to a Docker hub:
+
+```bash
+docker push  [YOUR_REPO]/paddle:mypaddle
+```
+
+**[NOTE]**, in the above command arguments, `[YOUR_REPO]` represents your Docker repository,
+you need to use your repository instead of it. We will use `[YOUR_REPO]/paddle:mypaddle` below to
+refer to the Docker Image which was built in this step.
+
+### Prepare Training Data
+
+We can download and split the training data by creating a Kubernetes Job, or customize your image
+by editing [k8s_train](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train).
+
+Before creating a Job, we need to bind a [persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes) according to the type of
+file system in use; the generated dataset will be saved on this volume.
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      hostNetwork: true
+      containers:
+      - name: paddle-data
+        image: paddlepaddle/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/mnt"
+          name: nfs
+        env:
+        - name: OUT_DIR
+          value: /home/work/mfs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: nfs
+          persistentVolumeClaim:
+            claimName: mfs
+      restartPolicy: Never
+```
+
+Create the Job with the following command:
+
+```bash
+> kubectl create -f xxx.yaml
+```
+
+If created successfully, you can see some information like this:
+
+```bash
+[root@paddle-kubernetes-node0 nfsdir]$ tree -d
+.
+`-- paddle-cluster-job
+    |-- 0
+    |   `-- data
+    |-- 1
+    |   `-- data
+    |-- 2
+    |   `-- data
+    |-- output
+    |-- quick_start
+```
+
+The `paddle-cluster-job` above is the job name for this training job; we need 3
+PaddlePaddle training nodes and save the split training data in the `paddle-cluster-job` path,
+the folders `0`, `1` and `2` represent the `trainer_id` on each node, the `quick_start` folder is used to store training data, and the `output` folder is used to store the models and logs.
+
+
+### Create a Job
+
+Kubernetes allows users to create objects with YAML files, and we can use a command-line tool
+to create them.
+
+The Job YAML file describes which Docker Image will be used in this training job, how many nodes will be created, what the startup arguments of the `Paddle PServer/Trainer` processes are, and what the type of Volumes is. You can find the details of the YAML fields in
+[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job).
+The following is an example for this training job:
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: jobpath
+        hostPath:
+          path: /home/work/mfs
+      containers:
+      - name: trainer
+        image: [YOUR_REPO]/paddle:mypaddle
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: recommendation
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        volumeMounts:
+        - name: jobpath
+          mountPath: /home/jobpath
+      restartPolicy: Never
+```
+
+In the above YAML file:
+- `metadata.name`, The job name.
+- `parallelism`, the Kubernetes Job would create `parallelism` Pods at the same time.
+- `completions`, the Job would become the success status only when the number of successful Pods (the exit code is 0)
+  is equal to `completions`.
+- `volumeMounts`, the name field `jobpath` is a key, the `mountPath` field represents
+  the path in the container, and we can define the `jobpath` in the `volumes` field, using `hostPath`
+  to configure the host path we want to mount.
+- `env`, the environment variables in the Container, we pass some startup arguments by
+  this approach, some details are as following:
+  - JOB_PATH: the mount path in the container
+  - JOB_NAME: the job name
+  - TRAIN_CONFIG_DIR: the job path in the container, we can find the training data path by
+    combining it with JOB_NAME.
+  - CONF_PADDLE_NIC: the argument `--nics` of `Paddle PServer` process, the network
+    device name.
+  - CONF_PADDLE_PORT: the argument `--port` of `Paddle PServer` process.
+  - CONF_PADDLE_PORTS_NUM: the argument `--ports_num` of `Paddle PServer`, the number of ports
+    for dense parameter update.
+  - CONF_PADDLE_PORTS_NUM_SPARSE: the argument `--ports_num_for_sparse` of `Paddle PServer`,
+    the number of ports for sparse parameter update.
+  - CONF_PADDLE_GRADIENT_NUM: the number of training nodes, the argument
+    `--num_gradient_servers` of `Paddle PServer` and `Paddle Trainer`.
+
+You can find more detailed information
+[here](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html).
+
+We can use the command-line tool of Kubernetes to create a Job when we finish the YAML file:
+
+```bash
+kubectl create -f job.yaml
+```
+
+Upon successful creation, Kubernetes would create 3 Pods as PaddlePaddle training node,
+pull the Docker image and begin to train.
+
+
+### Checkout the Output
+
+During training, we can check the logs and the output models, which are stored in
+the `output` folder.
+
+**NOTE**, `node_0`, `node_1` and `node_2` represent the
+`trainer_id` of the PaddlePaddle training job rather than the node id of Kubernetes.
+
+```bash
+[root@paddle-kubernetes-node0 output]# tree -d
+.
+├── node_0
+│   ├── server.log
+│   └── train.log
+├── node_1
+│   ├── server.log
+│   └── train.log
+├── node_2
+......
+├── pass-00002
+│   ├── done
+│   ├── ___embedding_0__.w0
+│   ├── ___embedding_1__.w0
+......
+```
+
+We can check the status of each training Pod by viewing the logs:
+
+```bash
+[root@paddle-kubernetes-node0 node_0]# cat train.log
+I1116 09:10:17.123121    50 Util.cpp:155] commandline:
+ /usr/local/bin/../opt/paddle/bin/paddle_trainer
+    --nics=eth0 --port=7164
+    --ports_num=2 --comment=paddle_process_by_paddle
+    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
+    --ports_num_for_sparse=2 --config=./trainer_config.py
+    --trainer_count=4 --num_passes=10 --use_gpu=0
+    --log_period=50 --dot_period=10 --saving_period=1
+    --local=0 --trainer_id=0
+    --save_dir=/home/jobpath/paddle-cluster-job/output
+I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
+I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
+[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
+[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__]
+I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
+I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
+I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
+I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
+I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
+I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
+I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
+I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
+I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
+```
+
+## Some Additional Details
+
+### Using Environment Variables
+
+Usually we use environment variables to configure the PaddlePaddle Job which runs in
+Kubernetes. `start_paddle.py` is a startup script that converts the environment variables
+into the startup arguments of the PaddlePaddle processes:
+
+```python
+API = "/api/v1/namespaces/"
+JOBSELECTOR = "labelSelector=job-name="
+JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
+JOB_PATH_OUTPUT = JOB_PATH + "/output"
+JOBNAME = os.getenv("JOB_NAME")
+NAMESPACE = os.getenv("JOB_NAMESPACE")
+PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
+PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
+PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
+PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
+PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
+```
+
+### Communication between Pods
+
+At the beginning of `start_paddle.py`, it initializes and parses the arguments.
+
+```python
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+                                     description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+```
+
+It then queries the status of all the other Pods of this Job with the function `getPodList()`, and fetches the `trainer_id` with the function `getIdMap(podlist)` once the status of all the Pods is `RUNNING`.
+
+```python
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+
+**NOTE**: `getPodList()` fetches all the Pods in the current namespace; if some
+Pods are already running, it may cause errors. We will use [StatefulSets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets) instead of
+Kubernetes Pod or Replicaset in the future.
+
+The function `getIdMap(podlist)` fetches the IP addresses of the Pods in `podlist` and then sorts them
+to generate `trainer_id`.
+
+```python
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+After getting the `idMap`, we can generate the arguments of `Paddle PServer` and `Paddle Trainer`
+and start them up with `startPaddle(idMap, train_args_dict)`.
+
+### Create Job
+
+The main goal of `startPaddle` is generating the arguments of the `Paddle PServer` and
+`Paddle Trainer` processes. Taking `Paddle Trainer` as an example, we parse the
+environment variables to get `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM`, etc., and
+finally find `trainerId` from `idMap` according to its IP address.
+
+```python
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..96ff652705726fc56fa0078593cd2a695fcdb5e2
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_en.md
@@ -0,0 +1,210 @@
+# Kubernetes
+
+In this article, we will introduce how to run a PaddlePaddle training job on a single CPU machine using Kubernetes. In the next article, we will introduce how to run a PaddlePaddle training job on a distributed cluster.
+
+## Build Docker Image
+
+In distributed Kubernetes cluster, we will use Ceph or other distributed
+storage system for storing training related data so that all processes in
+PaddlePaddle training can retrieve data from Ceph. In this example, we will
+only demo training job on single machine. In order to simplify the requirement
+of the environment, we will directly put training data into the PaddlePaddle Docker Image,
+so we need to create a PaddlePaddle Docker image that includes the training data.
+
+The production Docker Image `paddlepaddle/paddle:cpu-demo-latest` has the PaddlePaddle
+source code and demo. (Caution: Default PaddlePaddle Docker Image `paddlepaddle/paddle:latest` doesn't include
+the source code, PaddlePaddle's different versions of Docker Image can be referred here:
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_en.html)),
+so we run this Docker Image and download the training data, and then commit the whole
+Container to be a new Docker Image.
+
+### Run Docker Container
+
+```
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
+```
+
+### Download Training Data
+
+Go to the `/root/paddle/demo/quick_start/data` directory and use `get_data.sh` to download the training data.
+Then go to the `/root/paddle/demo/quick_start` directory and use `preprocess.sh` to pre-process the training data.
+
+```
+$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
+
+Downloading Amazon Electronics reviews data...
+--2016-10-31 01:33:43--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
+Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
+HTTP request sent, awaiting response... 200 OK
+Length: 495854086 (473M) [application/x-gzip]
+Saving to: 'reviews_Electronics_5.json.gz'
+
+ 10% [=======>                                         ] 874,279     64.7KB/s  eta 2h 13m
+
+```
+
+### Modify Startup Script
+
+After downloading the data, modify the `/root/paddle/demo/quick_start/train.sh` file so that its contents are as follows (one more `cd` command is added):
+```
+set -e
+cd /root/paddle/demo/quick_start
+cfg=trainer_config.lr.py
+#cfg=trainer_config.emb.py
+#cfg=trainer_config.cnn.py
+#cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
+paddle train \
+  --config=$cfg \
+  --save_dir=./output \
+  --trainer_count=4 \
+  --log_period=20 \
+  --num_passes=15 \
+  --use_gpu=false \
+  --show_parameter_stats_period=100 \
+  --test_all_data_in_one_period=1 \
+  2>&1 | tee 'train.log'
+```
+
+### Commit Docker Image
+
+```
+$ docker commit quick_start_data mypaddle/paddle:quickstart
+```
+
+## Use Kubernetes For Training
+
+We will use a Kubernetes Job for the training process. The following steps show how to do the training with Kubernetes.
+
+### Create Yaml Files
+
+The output in the container will be lost when the job finishes (the container stops running), so we need to mount a volume from the local disk when creating the container to store the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job), the yaml contents are as follows:
+
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: quickstart
+spec:
+  parallelism: 1
+  completions: 1
+  template:
+    metadata:
+      name: quickstart
+    spec:
+      volumes:
+      - name: output
+        hostPath: 
+          path: /home/work/paddle_output     
+      containers:
+      - name: pi
+        image: mypaddle/paddle:quickstart
+        command: ["bin/bash",  "-c", "/root/paddle/demo/quick_start/train.sh"]
+        volumeMounts:
+        - name: output
+          mountPath: /root/paddle/demo/quick_start/output
+      restartPolicy: Never
+```
+
+### Start PaddlePaddle Job
+
+Using the above yaml file to start the Kubernetes job.
+
+```
+$ kubectl  create -f paddle.yaml
+```
+
+Get the detailed status of the job:
+
+```
+$ kubectl  get job
+NAME         DESIRED   SUCCESSFUL   AGE
+quickstart   1         0            58s
+
+$ kubectl  describe job quickstart
+Name:		quickstart
+Namespace:	default
+Image(s):	registry.baidu.com/public/paddle:cpu-demo-latest
+Selector:	controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
+Parallelism:	1
+Completions:	1
+Start Time:	Mon, 31 Oct 2016 11:20:16 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Pods Statuses:	0 Running / 1 Succeeded / 0 Failed
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+Events:
+  FirstSeen	LastSeen	Count	From			SubobjectPath	Type		Reason			Message
+  ---------	--------	-----	----			-------------	--------	------			-------
+  1m		1m		1	{job-controller }			Normal		SuccessfulCreate	Created pod: quickstart-fa0wx
+```
+
+### Get Training Result
+
+We can use kubectl command to take a look at the status of related pod.
+
+```
+$ kubectl  describe pod quickstart-fa0wx
+Name:		quickstart-fa0wx
+Namespace:	default
+Node:		paddle-demo-let02/10.206.202.44
+Start Time:	Mon, 31 Oct 2016 11:20:17 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Status:		Succeeded
+IP:		10.0.0.9
+Controllers:	Job/quickstart
+Containers:
+  quickstart:
+    Container ID:	docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
+    Image:		registry.baidu.com/public/paddle:cpu-demo-latest
+    Image ID:		docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
+    Port:
+    Command:
+      bin/bash
+      -c
+      /root/paddle/demo/quick_start/train.sh
+    QoS Tier:
+      cpu:		BestEffort
+      memory:		BestEffort
+    State:		Terminated
+      Reason:		Completed
+      Exit Code:	0
+      Started:		Mon, 31 Oct 2016 11:20:20 +0800
+      Finished:		Mon, 31 Oct 2016 11:21:46 +0800
+    Ready:		False
+    Restart Count:	0
+    Environment Variables:
+Conditions:
+  Type		Status
+  Ready 	False
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+```
+
+We can also ssh to Kubernetes node to take a look at the training result.
+
+```
+[root@paddle-demo-let02 paddle_output]# ll
+total 60
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
+```
diff --git a/doc/v2/howto/cluster/multi_cluster/openmpi_cn.md b/doc/v2/howto/cluster/multi_cluster/openmpi_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..954b2215cc3136ec5b3e1cdc2f6d3f508f814516
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/openmpi_cn.md
@@ -0,0 +1,41 @@
+# 在OpenMPI集群中启动训练
+
+## 准备OpenMPI集群
+
+执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点：
+
+```bash
+cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
+
+## 启动集群作业
+
+您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务：
+
+```bash
+# 获得head和node节点的IP地址
+kubectl get po -o wide
+# 将node节点的IP地址保存到machines文件中
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# 拷贝必要的文件到head节点
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# ssh 登录到head节点
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- 以下操作均在head节点中执行 ---------------
+# 准备训练数据
+python prepare.py
+# 拷贝训练程序和字典文件到每台MPI节点
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# 创建日志目录
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# 拷贝训练数据到各自的节点
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# 启动训练任务
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
diff --git a/doc/v2/howto/cluster/multi_cluster/openmpi_en.md b/doc/v2/howto/cluster/multi_cluster/openmpi_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..a5c02b336b8a974f546499acae32edac24219be9
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/openmpi_en.md
@@ -0,0 +1,41 @@
+# OpenMPI
+
+## Prepare an OpenMPI cluster
+
+Run the following command to start a 3-node MPI cluster and one "head" node.
+
+```bash
+cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+Then you can log in to every OpenMPI node using ssh without entering any password.
+
+## Launching Cluster Job
+
+Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:
+
+```bash
+# find out node IP addresses
+kubectl get po -o wide
+# generate a "machines" file containing node IP addresses
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# copy necessary files onto "head" node
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# login to head node using ssh
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- in head node ---------------
+# prepare training data
+python prepare.py
+# copy training data and dict file to MPI nodes
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# create a directory for storing log files
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# copy training data to every node
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# start the job
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
diff --git a/doc/v2/howto/cluster/multi_cluster/src/add_security_group.png b/doc/v2/howto/cluster/multi_cluster/src/add_security_group.png
new file mode 100644
index 0000000000000000000000000000000000000000..bd34f46c9b0ada7027fd53e553e7d033255d25fc
Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/add_security_group.png differ
diff --git a/doc/v2/howto/cluster/multi_cluster/src/create_efs.png b/doc/v2/howto/cluster/multi_cluster/src/create_efs.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5f1526033d1daf401700989af1d25919bcb7675
Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/create_efs.png differ
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s-paddle-arch.png b/doc/v2/howto/cluster/multi_cluster/src/k8s-paddle-arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..b3800c4fe81302d35e49f7dbacb9221c4dfa5cde
Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/k8s-paddle-arch.png differ
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_data/Dockerfile b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6d3a12ae393aa594b8e6e9a5f726109426937284
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/Dockerfile
@@ -0,0 +1,7 @@
+FROM alpine
+# get_data.sh calls "split --number=...", a GNU coreutils option, hence the coreutils package.
+RUN apk update && apk upgrade && apk add coreutils
+COPY quick_start /quick_start
+COPY get_data.sh /bin/
+RUN chmod +x /bin/get_data.sh
+ENTRYPOINT ["/bin/get_data.sh"]
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_data/README.md b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..83cef7affd0ac4d3a1ca08ea5b046fa81e1bc630
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/README.md
@@ -0,0 +1,6 @@
+To build PaddlePaddle data preparation image in tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run following commands:
+
+```
+cp -r ../../../../../../demo/quick_start .
+docker build . -t prepare-data-image-name
+```
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_data/get_data.sh b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/get_data.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d187ba5ac8d03f69dfdefd4f63610ed7921575be
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/get_data.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+
+out_dir=$OUT_DIR
+split_count=$SPLIT_COUNT
+
+set -e
+
+mkdir -p $out_dir
+cp -r /quick_start $out_dir/
+
+mkdir -p $out_dir/0/data
+cd $out_dir/0/data
+wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
+tar zxvf preprocessed_data.tar.gz
+rm preprocessed_data.tar.gz
+
+split -d --number=l/$split_count -a 5 train.txt train.
+mv train.00000 train.txt
+
+cd $out_dir
+end=$(expr $split_count - 1)
+for i in $(seq 1 $end); do
+    mkdir -p $i/data
+    cp -r 0/data/* $i/data
+    mv $i/data/train.`printf %05d $i` $i/data/train.txt
+done;
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..77f021a89a70d934bf70424eaa3c6dc3f7c93a28
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
@@ -0,0 +1,6 @@
+FROM paddlepaddle/paddle:latest
+# Add the job launcher scripts on top of the PaddlePaddle base image.
+COPY start.sh /root/
+COPY start_paddle.py /root/
+RUN chmod +x /root/start.sh
+CMD ["bash","-c","/root/start.sh"]
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/README.md b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..96bf65497ffa23e90c4c9350504f86367b48daf2
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/README.md
@@ -0,0 +1,5 @@
+To build PaddlePaddle training image in tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run following command:
+
+```
+docker build . -t train-image-name
+```
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start.sh b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start.sh
new file mode 100755
index 0000000000000000000000000000000000000000..12dfe1e6386885a6989d3887f21c6922f137a9ae
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+set -eu
+
+jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR}
+cd /root
+cp -rf $jobconfig/* .
+
+python /root/start_paddle.py \
+  --dot_period=10 \
+  --ports_num=$CONF_PADDLE_PORTS_NUM \
+  --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \
+  --log_period=50 \
+  --num_passes=10 \
+  --trainer_count=$TRAINER_COUNT \
+  --saving_period=1 \
+  --local=0 \
+  --config=trainer_config.lr.py \
+  --use_gpu=0
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py
new file mode 100755
index 0000000000000000000000000000000000000000..935c12bb67e1fe08bc135a7a2220fcd43c548482
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py
@@ -0,0 +1,170 @@
+#!/usr/bin/python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+import time
+import socket
+import os
+import argparse
+
+# Cluster configuration, read once at import time from the pod environment.
+API = "/api/v1/namespaces/"  # URL path prefix of the pod query
+JOBSELECTOR = "labelSelector=job-name="  # query string; the job name is appended
+JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")  # TypeError if either is unset
+JOB_PATH_OUTPUT = JOB_PATH + "/output"  # used as --save_dir and as the log root
+JOBNAME = os.getenv("JOB_NAME")
+NAMESPACE = os.getenv("JOB_NAMESPACE")
+PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
+PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
+PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
+PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
+PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")  # passed as --num_gradient_servers
+# File whose contents are sent as the Bearer token when it exists.
+tokenpath = '/var/run/secrets/kubernetes.io/serviceaccount/token'
+
+
+def refine_unknown_args(cmd_args):
+    '''
+    refine unknown parameters to handle some special parameters
+    '''
+    new_args = []
+    for arg in cmd_args:
+        if arg.startswith("--") and arg.find("=") != -1:
+            equal_pos = arg.find("=")  # find first = pos
+            arglist = list(arg)
+            arglist[equal_pos] = " "
+            arg = "".join(arglist)
+            arg = arg.lstrip("-")
+            new_args += arg.split(" ")
+        elif arg.startswith("--") and arg.find("=") == -1:
+            arg = arg.lstrip("-")
+            new_args.append(arg)
+        else:
+            new_args.append(arg)
+    return new_args
+
+
+def isPodAllRunning(podlist):
+    '''
+    check all pod is running
+    '''
+    require = len(podlist["items"])
+    running = 0
+    for pod in podlist["items"]:
+        if pod["status"]["phase"] == "Running":
+            running += 1
+    print "waiting for pods running, require:", require, "running:", running
+    if require == running:
+        return True
+    return False
+
+
+def getPodList():
+    '''
+    get all container status of the job
+    '''
+    apiserver = "https://" + \
+        os.getenv("KUBERNETES_SERVICE_HOST") + ":" + \
+        os.getenv("KUBERNETES_SERVICE_PORT_HTTPS")
+
+    pod = API + NAMESPACE + "/pods?"
+    job = JOBNAME
+    if os.path.isfile(tokenpath):
+        tokenfile = open(tokenpath, mode='r')
+        token = tokenfile.read()
+        Bearer = "Bearer " + token
+        headers = {"Authorization": Bearer}
+        return requests.get(apiserver + pod + JOBSELECTOR + job,
+                            headers=headers,
+                            verify=False).json()
+    else:
+        return requests.get(apiserver + pod + JOBSELECTOR + job,
+                            verify=False).json()
+
+
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+
+
+def startPaddle(idMap={}, train_args_dict=None):
+    '''
+    start paddle pserver and trainer
+    '''
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+    logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId)
+    if not os.path.exists(JOB_PATH_OUTPUT):
+        os.makedirs(JOB_PATH_OUTPUT)
+    if not os.path.exists(logDir):
+        os.mkdir(logDir)
+    copyCommand = 'cp -rf ' + JOB_PATH + \
+        "/" + str(trainerId) + "/data/*" + " ./data/"
+    os.system(copyCommand)
+    startPserver = 'nohup paddle pserver' + \
+        " --port=" + str(PADDLE_PORT) + \
+        " --ports_num=" + str(PADDLE_PORTS_NUM) + \
+        " --ports_num_for_sparse=" + str(PADDLE_PORTS_NUM_SPARSE) + \
+        " --nics=" + PADDLE_NIC + \
+        " --comment=" + "paddle_process_by_paddle" + \
+        " --num_gradient_servers=" + str(PADDLE_SERVER_NUM) +\
+        " > " + logDir + "/server.log 2>&1 &"
+    print startPserver
+    os.system(startPserver)
+    # wait until pservers completely start
+    time.sleep(20)
+    startTrainer = program + args + " 2>&1 | tee " + \
+        logDir + "/train.log"
+    print startTrainer
+    os.system(startTrainer)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        prog="start_paddle.py", description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(20)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+    startPaddle(idMap, train_args_dict)
diff --git a/doc/v2/howto/cluster/multi_cluster/src/pserver_and_trainer.png b/doc/v2/howto/cluster/multi_cluster/src/pserver_and_trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..f41fe48920590333ad332bb51eb18e03dc251541
Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/pserver_and_trainer.png differ
diff --git a/doc/v2/howto/cluster/multi_cluster/src/route53_create_recordset.png b/doc/v2/howto/cluster/multi_cluster/src/route53_create_recordset.png
new file mode 100644
index 0000000000000000000000000000000000000000..34e476c7beac30fcdde13fccc4cc8d08b4be3d35
Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/route53_create_recordset.png differ
diff --git a/doc/v2/howto/cluster/multi_cluster/src/route53_create_zone.png b/doc/v2/howto/cluster/multi_cluster/src/route53_create_zone.png
new file mode 100644
index 0000000000000000000000000000000000000000..25b7ddb831c5cba97f4b2edddd27da3234d621af
Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/route53_create_zone.png differ
diff --git a/doc/v2/howto/cluster/multi_cluster/src/worker_security_group.png b/doc/v2/howto/cluster/multi_cluster/src/worker_security_group.png
new file mode 100644
index 0000000000000000000000000000000000000000..57eb0265a34ad4223b69600d2a3dd355482e0bf5
Binary files /dev/null and b/doc/v2/howto/cluster/multi_cluster/src/worker_security_group.png differ
diff --git a/doc/v2/howto/cluster/preparations_cn.md b/doc/v2/howto/cluster/preparations_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..ce40697e703503b66f6306e15ebdb0ce1329991d
--- /dev/null
+++ b/doc/v2/howto/cluster/preparations_cn.md
@@ -0,0 +1,16 @@
+## 环境准备
+
+1. 准备您的计算集群。计算集群通常由一组（几台到几千台规模）的Linux服务器组成。服务器之间可以通过局域网（LAN）联通，每台服务器具有集群中唯一的IP地址（或者可被DNS解析的主机名）。集群中的每台计算机通常被称为一个“节点”。
+1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。
+
+安装完成之后，执行下面的命令可以查看已经安装的版本（docker安装方式可以进入docker容器执行：`docker run -it paddlepaddle/paddle:[tag] /bin/bash`）：
+```bash
+$ paddle version
+PaddlePaddle 0.10.0, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
+```
diff --git a/doc/v2/howto/cluster/preparations_en.md b/doc/v2/howto/cluster/preparations_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b77b293907ae0548134fc65ceed3aa0ed0b845d
--- /dev/null
+++ b/doc/v2/howto/cluster/preparations_en.md
@@ -0,0 +1,17 @@
+## Preparations
+
+1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
+2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
+
+After installation, you can check the version by typing the command below (if you use Docker, run it inside a container: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
+
+```bash
+$ paddle version
+PaddlePaddle 0.10.0rc, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
+```
diff --git a/doc/v2/howto/cluster/src/Dockerfile b/doc/v2/howto/cluster/src/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..e178bf4da0f32fca9586b5b69a2c7419de5d9cb1
--- /dev/null
+++ b/doc/v2/howto/cluster/src/Dockerfile
@@ -0,0 +1,7 @@
+FROM paddlepaddle/paddle:latest
+
+MAINTAINER zjsxzong89@gmail.com
+# Exec-form CMD elements are passed verbatim, so "-c" must have no padding.
+COPY start.sh /root/
+COPY start_paddle.py /root/
+CMD ["bash", "-c", "/root/start.sh"]
\ No newline at end of file
diff --git a/doc/v2/howto/cluster/src/efs_mount.png b/doc/v2/howto/cluster/src/efs_mount.png
new file mode 100644
index 0000000000000000000000000000000000000000..0f9e3cab98445707e5e9baa18ddabe15cdf04576
Binary files /dev/null and b/doc/v2/howto/cluster/src/efs_mount.png differ
diff --git a/doc/v2/howto/cluster/src/managed_policy.png b/doc/v2/howto/cluster/src/managed_policy.png
new file mode 100644
index 0000000000000000000000000000000000000000..c7ecda555b81d7750e9292a9ab72d2f517f76a2a
Binary files /dev/null and b/doc/v2/howto/cluster/src/managed_policy.png differ
diff --git a/doc/v2/howto/cluster/src/ps_cn.png b/doc/v2/howto/cluster/src/ps_cn.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc
Binary files /dev/null and b/doc/v2/howto/cluster/src/ps_cn.png differ
diff --git a/doc/v2/howto/cluster/src/ps_en.png b/doc/v2/howto/cluster/src/ps_en.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/v2/howto/cluster/src/ps_en.png differ
diff --git a/doc/v2/howto/cluster/src/trainer.png b/doc/v2/howto/cluster/src/trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/v2/howto/cluster/src/trainer.png differ
diff --git a/doc/v2/howto/cluster/src/trainer_cn.png b/doc/v2/howto/cluster/src/trainer_cn.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc
Binary files /dev/null and b/doc/v2/howto/cluster/src/trainer_cn.png differ
diff --git a/doc/v2/howto/cluster/src/word2vec/api_train_v2.py b/doc/v2/howto/cluster/src/word2vec/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..9107e24c175f1fbf29d86e222e4b66031a5b505e
--- /dev/null
+++ b/doc/v2/howto/cluster/src/word2vec/api_train_v2.py
@@ -0,0 +1,114 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gzip
+import math
+
+import paddle.v2 as paddle
+
+embsize = 32
+hiddensize = 256
+N = 5
+
+
+def wordemb(inlayer):
+    wordemb = paddle.layer.embedding(
+        input=inlayer,
+        size=embsize,
+        param_attr=paddle.attr.Param(
+            name="_proj",
+            initial_std=0.001,
+            learning_rate=1,
+            l2_rate=0,
+            sparse_update=True))
+    return wordemb
+
+
+def main():
+    """Build the word-prediction network and train it for 30 passes."""
+    cluster_train = False  # False: local init; True: init with port/pserver settings
+
+    if not cluster_train:
+        paddle.init(use_gpu=False, trainer_count=1)
+    else:
+        paddle.init(
+            use_gpu=False,
+            trainer_count=2,
+            port=7164,
+            ports_num=1,
+            ports_num_for_sparse=1,
+            num_gradient_servers=1)
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+    # The first four words are embedded as context; nextword is the label.
+    Efirst = wordemb(firstword)
+    Esecond = wordemb(secondword)
+    Ethird = wordemb(thirdword)
+    Efourth = wordemb(fourthword)
+
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    hidden1 = paddle.layer.fc(input=contextemb,
+                              size=hiddensize,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(embsize * 8),
+                                  learning_rate=1))
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+
+    def event_handler(event):  # `trainer` is bound below, before train() runs
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:  # every 100th batch: save, test, report
+                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
+                               'w') as f:
+                    trainer.save_parameter_to_tar(f)
+                result = trainer.test(
+                    paddle.batch(
+                        paddle.dataset.imikolov.test(word_dict, N), 32))
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+
+    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+
+    parameters = paddle.parameters.create(cost)
+    adagrad = paddle.optimizer.AdaGrad(
+        learning_rate=3e-3,
+        regularization=paddle.optimizer.L2Regularization(8e-4))
+    trainer = paddle.trainer.SGD(cost,
+                                 parameters,
+                                 adagrad,
+                                 is_local=not cluster_train)
+    trainer.train(
+        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
+        num_passes=30,
+        event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/v2/howto/cluster/src/word2vec/api_train_v2_cluster.py b/doc/v2/howto/cluster/src/word2vec/api_train_v2_cluster.py
new file mode 100644
index 0000000000000000000000000000000000000000..791504094f3ecae925226ff1d90f20f91d4c018d
--- /dev/null
+++ b/doc/v2/howto/cluster/src/word2vec/api_train_v2_cluster.py
@@ -0,0 +1,137 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import os
+import paddle.v2 as paddle
+import pickle
+
+embsize = 32  # size passed to each embedding layer
+hiddensize = 256  # size of the hidden fc layer
+N = 5
+cluster_train_file = "./train_data_dir/train/train.txt"  # base path; the reader appends "-<5-digit node id>"
+cluster_test_file = "./test_data_dir/test/test.txt"  # base path, same suffix rule
+node_id = os.getenv("OMPI_COMM_WORLD_RANK")  # string from the environment
+if not node_id:  # unset or empty
+    raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK")
+
+
+def wordemb(inlayer):
+    wordemb = paddle.layer.embedding(
+        input=inlayer,
+        size=embsize,
+        param_attr=paddle.attr.Param(
+            name="_proj",
+            initial_std=0.001,
+            learning_rate=1,
+            l2_rate=0,
+            sparse_update=True))
+    return wordemb
+
+
+def cluster_reader_cluster(filename, node_id):
+    def cluster_reader():
+        with open("-".join([filename, "%05d" % int(node_id)]), "r") as f:
+            for l in f:
+                csv_data = [int(cell) for cell in l.split(",")]
+                yield tuple(csv_data)
+
+    return cluster_reader
+
+
+def main():
+    # get arguments from env
+
+    # for local training
+    TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"]
+    cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH
+    use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False")
+
+    if not cluster_train:
+        paddle.init(
+            use_gpu=use_gpu,
+            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")))
+    else:
+        paddle.init(
+            use_gpu=use_gpu,
+            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")),
+            port=int(os.getenv("PADDLE_INIT_PORT", "7164")),
+            ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")),
+            ports_num_for_sparse=int(
+                os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")),
+            num_gradient_servers=int(
+                os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")),
+            trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")),
+            pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1"))
+    fn = open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r")
+    word_dict = pickle.load(fn)
+    fn.close()
+    dict_size = len(word_dict)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+    Efirst = wordemb(firstword)
+    Esecond = wordemb(secondword)
+    Ethird = wordemb(thirdword)
+    Efourth = wordemb(fourthword)
+
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    hidden1 = paddle.layer.fc(input=contextemb,
+                              size=hiddensize,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(embsize * 8),
+                                  learning_rate=1))
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                result = trainer.test(
+                    paddle.batch(
+                        cluster_reader_cluster(cluster_test_file, node_id), 32))
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+
+    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+    parameters = paddle.parameters.create(cost)
+    adagrad = paddle.optimizer.AdaGrad(
+        learning_rate=3e-3,
+        regularization=paddle.optimizer.L2Regularization(8e-4))
+    trainer = paddle.trainer.SGD(cost,
+                                 parameters,
+                                 adagrad,
+                                 is_local=not cluster_train)
+    trainer.train(
+        paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32),
+        num_passes=30,
+        event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/v2/howto/cluster/src/word2vec/prepare.py b/doc/v2/howto/cluster/src/word2vec/prepare.py
new file mode 100644
index 0000000000000000000000000000000000000000..a42548fbf03a0298e1e397c868e4d531801ec89a
--- /dev/null
+++ b/doc/v2/howto/cluster/src/word2vec/prepare.py
@@ -0,0 +1,55 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+import tarfile
+import os
+import pickle
+
+SPLIT_COUNT = 3
+N = 5
+
+
+def file_len(fd):
+    for i, l in enumerate(fd):
+        pass
+    return i + 1
+
+
+def split_from_reader_by_line(filename, reader, split_count):
+    fn = open(filename, "w")
+    for batch_id, batch_data in enumerate(reader()):
+        batch_data_str = [str(d) for d in batch_data]
+        fn.write(",".join(batch_data_str))
+        fn.write("\n")
+    fn.close()
+
+    fn = open(filename, "r")
+    total_line_count = file_len(fn)
+    fn.close()
+    per_file_lines = total_line_count / split_count + 1
+    cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename)
+    os.system(cmd)
+
+
+word_dict = paddle.dataset.imikolov.build_dict()
+with open("word_dict.pickle", "w") as dict_f:  # written to the current directory
+    pickle.dump(word_dict, dict_f)
+# Dump the train and test readers to CSV and shard each file by line count.
+split_from_reader_by_line("train.txt",
+                          paddle.dataset.imikolov.train(word_dict, N),
+                          SPLIT_COUNT)
+split_from_reader_by_line("test.txt",
+                          paddle.dataset.imikolov.test(word_dict, N),
+                          SPLIT_COUNT)
diff --git a/doc/v2/howto/cmd_parameter/arguments_cn.md b/doc/v2/howto/cmd_parameter/arguments_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..2dea231ca5487978d59a4d0a570431722ed6b3bf
--- /dev/null
+++ b/doc/v2/howto/cmd_parameter/arguments_cn.md
@@ -0,0 +1,394 @@
+# 参数概述
+
+虽然Paddle看起来包含了众多参数，但是大部分参数是为开发者提供的，或者已经在集群提交环境中自动设置，因此用户并不需要关心它们。在此，根据这些参数的使用场合，我们将它们划分为不同的类别。例如，`通用`类别中的参数可用于所有场合。某些参数只可用于特定的层中，而有些参数需要在集群多机训练中使用等。
+
+<html>
+<table border="2" frame="border">
+<thead>
+<tr>
+<th scope="col" class="left"></th>
+<th scope="col" class="left">参数</th>
+<th scope="col" class="left">本地训练</th>
+<th scope="col" class="left">集群训练</th>
+<th scope="col" class="left">本地测试</th>
+<th scope="col" class="left">集群测试</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left" rowspan="9">通用</td>
+<td class="left">job</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">use_gpu</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">local</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">config</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">config_args</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">num_passes</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">trainer_count</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">version</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">show_layer_stat</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan="14">训练</td><td class="left">dot_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">saving_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">show_parameter_stats_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">init_model_path</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">load_missing_parameter_strategy</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">saving_period_by_batches</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">use_old_updater</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">enable_grad_share</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">grad_share_block_num</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_error_clipping</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_clipping</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">save_only_one</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">start_pass</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">训练/测试</td><td class="left">save_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">训练过程中测试</td><td class="left">test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">average_test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "5">测试</td><td class="left">model_list</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">test_wait</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">test_pass</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">predict_output_dir</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">distribute_test</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">Auc/正负对验证(PnpairValidation)</td><td class="left">predict_file</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "6">GPU</td><td class="left">gpu_id</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">parallel_nn</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">allow_only_one_model_on_one_gpu</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cudnn_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cuda_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cudnn_conv_workspace_limit_in_mb</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "4">递归神经网络(RNN)</td>
+<td class="left">beam_size</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">rnn_use_batch</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">prev_batch_state</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">diy_beam_search_prob_so</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "16">参数服务器(PServer)</td><td class="left">start_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">pservers</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">port</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">ports_num</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">ports_num_for_sparse</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">nics</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">rdma_tcp</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">small_messages</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">loadsave_parameters_in_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">log_period_server</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">pserver_num_threads</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">sock_send_buf_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">sock_recv_buf_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">num_gradient_servers</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">parameter_block_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">parameter_block_size_for_sparse</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "3">异步随机梯度下降(Async SGD)</td><td class="left">async_count</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">async_lagged_ratio_min</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">async_lagged_ratio_default</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "8">性能调优(Performance Tuning)</td><td class="left">log_barrier_abstract</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_barrier_lowest_nodes</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_barrier_show_log</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_batches</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_ratio</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_unbalance_degree</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_in_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">show_check_sparse_distribution_log</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">数据提供器(Data Provider)</td><td class="left">memory_threshold_on_load_data</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">随机数</td><td class="left">seed</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">thread_local_rand_use_global_seed</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">单元测试</td><td class="left">checkgrad_eps</td>
+<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">矩阵/向量</td><td class="left">enable_parallel_vector</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+</tbody>
+
+</table>
+</html>
diff --git a/doc/v2/howto/cmd_parameter/arguments_en.md b/doc/v2/howto/cmd_parameter/arguments_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..d1963067bda949b11ececefed3db7db1432c6223
--- /dev/null
+++ b/doc/v2/howto/cmd_parameter/arguments_en.md
@@ -0,0 +1,394 @@
+# Argument Outline
+
+It looks like there are a lot of arguments. However, most of them are for developers or are already set automatically in the cluster submitting environment, and users do not need to care about them. Here, we divide these arguments into several classes according to the scenario that they are used in. For example, the arguments in `common` can be used in all scenarios. Some arguments can only be used in certain layers. Some are needed for multi-machine training in a cluster, etc.
+
+<html>
+<table border="2" frame="border">
+<thead>
+<tr>
+<th scope="col" class="left"></th>
+<th scope="col" class="left">args</th>
+<th scope="col" class="left">local train</th>
+<th scope="col" class="left">cluster train</th>
+<th scope="col" class="left">local test</th>
+<th scope="col" class="left">cluster test</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left" rowspan="9">common</td>
+<td class="left">job</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">use_gpu</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">local</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">config</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">config_args</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">num_passes</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">trainer_count</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">version</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">show_layer_stat</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan="14">train</td><td class="left">dot_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">saving_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">show_parameter_stats_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">init_model_path</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">load_missing_parameter_strategy</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">saving_period_by_batches</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">use_old_updater</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">enable_grad_share</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">grad_share_block_num</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_error_clipping</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_clipping</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">save_only_one</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">start_pass</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">train/test</td><td class="left">save_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">testing during training</td><td class="left">test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">average_test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "5">test</td><td class="left">model_list</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">test_wait</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">test_pass</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">predict_output_dir</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">distribute_test</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">Auc/PnpairValidation</td><td class="left">predict_file</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "6">GPU</td><td class="left">gpu_id</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">parallel_nn</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">allow_only_one_model_on_one_gpu</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cudnn_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cuda_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cudnn_conv_workspace_limit_in_mb</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "4">RNN</td>
+<td class="left">beam_size</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">rnn_use_batch</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">prev_batch_state</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">diy_beam_search_prob_so</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "16">PServer</td><td class="left">start_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">pservers</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">port</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">ports_num</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">ports_num_for_sparse</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">nics</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">rdma_tcp</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">small_messages</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">loadsave_parameters_in_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">log_period_server</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">pserver_num_threads</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">sock_send_buf_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">sock_recv_buf_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">num_gradient_servers</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">parameter_block_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">parameter_block_size_for_sparse</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "3">Async SGD</td><td class="left">async_count</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">async_lagged_ratio_min</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">async_lagged_ratio_default</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "8">Performance Tuning</td><td class="left">log_barrier_abstract</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_barrier_lowest_nodes</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_barrier_show_log</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_batches</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_ratio</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_unbalance_degree</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_in_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">show_check_sparse_distribution_log</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">Data Provider</td><td class="left">memory_threshold_on_load_data</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">RandomNumber</td><td class="left">seed</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">thread_local_rand_use_global_seed</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">UnitTest</td><td class="left">checkgrad_eps</td>
+<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">Matrix/Vector</td><td class="left">enable_parallel_vector</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+</tbody>
+
+</table>
+</html>
diff --git a/doc/v2/howto/cmd_parameter/detail_introduction_cn.md b/doc/v2/howto/cmd_parameter/detail_introduction_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..b4625ba68cf23e5697554ba94efaf0b873f2c1de
--- /dev/null
+++ b/doc/v2/howto/cmd_parameter/detail_introduction_cn.md
@@ -0,0 +1,323 @@
+# 细节描述
+
+## 通用
+
+* `--job`
+  - 工作模式，包括: **train, test, checkgrad**，其中checkgrad主要为开发者使用，使用者不需要关心。
+  - 类型: string (默认: train)
+
+* `--config`
+  - 用于指定网络配置文件。
+  - 类型: string (默认: null).
+
+* `--use_gpu`
+  - 训练过程是否使用GPU，设置为true使用GPU模式，否则使用CPU模式。
+  - 类型: bool (默认: 1).
+
+* `--local`
+  - 训练过程是否为本地模式，设置为true使用本地训练或者使用集群上的一个节点，否则使用多机训练。
+  - 类型: bool (默认: 1).
+
+* `--trainer_count`
+  - 指定一台机器上使用的线程数。例如，trainer_count = 4, 意思是在GPU模式下使用4个GPU，或者在CPU模式下使用4个线程。每个线程（或GPU）分配到当前数据块样本数的四分之一。也就是说，如果在训练配置中设置batch_size为512，每个线程分配到128个样本用于训练。
+  - 类型: int32 (默认: 1).
+
+* `--num_passes`
+  - 当模式为`--job=train`时, 该参数的意思是训练num_passes轮。每轮会将数据集中的所有训练样本使用一次。当模式为`--job=test`时，意思是使用第test_pass个模型到第 num_passes-1 个模型测试数据。
+  - 类型: int32 (默认: 100).
+
+* `--config_args`
+  - 传递给配置文件的参数。格式: key1=value1,key2=value2.
+  - 类型: string (默认: null).
+
+* `--version`
+  - 是否打印版本信息。
+  - 类型: bool (默认: 0).
+
+* `--show_layer_stat`
+  - 是否显示**每个批次数据**中每层的数值统计.
+  - 类型: bool (默认: 0).
+
+## 训练
+
+* `--log_period`
+  - 每log_period个批次打印日志进度.
+  - 类型: int32 (默认: 100).
+
+* `--dot_period`
+  - 每dot_period个批次输出符号'.'.
+  - 类型: int32 (默认: 1).
+
+* `--saving_period`
+  - 每saving_period轮保存训练参数.
+  - 类型: int32 (默认: 1).
+
+* `--save_dir`
+  - 保存模型参数的目录，需要明确指定，但不需要提前创建。
+  - 类型: string (默认: null).
+
+* `--start_pass`
+  - 从start_pass轮开始训练，会加载上一轮的参数。
+  - 类型: int32 (默认: 0).
+
+* `--show_parameter_stats_period`
+  - 在训练过程中每show_parameter_stats_period个批次输出参数统计。默认不显示。
+  - 类型: int32 (默认: 0).
+
+* `--save_only_one`
+  - 只保存最后一轮的参数，而之前的参数将会被删除。
+  - 类型: bool (默认: 0).
+
+* `--load_missing_parameter_strategy`
+  - 当模型参数不存在时，指定加载的方式。目前支持fail/rand/zero三种操作.
+    - `fail`: 程序直接退出.
+    - `rand`: 根据网络配置中的**initial\_strategy**采用均匀分布或者高斯分布初始化。均匀分布的范围是: **[mean - std, mean + std]**, 其中mean和std是训练配置中的参数.
+    - `zero`: 所有参数置为零.
+  - 类型: string (默认: fail).
+
+* `--init_model_path`
+   - 初始化模型的路径。如果设置该参数，start\_pass将不起作用。同样也可以在测试模式中指定模型路径。
+   - 类型: string (默认: null).
+
+* `--saving_period_by_batches`
+   - 在一轮中每saving_period_by_batches个批次保存一次参数。
+   - 类型: int32 (默认: 0).
+
+* `--log_error_clipping`
+  - 当在网络层配置中设置**error_clipping_threshold**时，该参数指示是否打印错误截断日志。如果为true，**每批次**的反向传播将会打印日志信息。该截断会影响**输出的梯度**.
+  - 类型: bool (默认: 0).
+
+* `--log_clipping`
+  - 当在训练配置中设置**gradient_clipping_threshold**时，该参数指示是否打印日志截断信息。该截断会影响**权重更新的梯度**.
+  - 类型: bool (默认: 0).
+
+* `--use_old_updater`
+  - 是否使用旧的RemoteParameterUpdater。 默认使用ConcurrentRemoteParameterUpdater，主要为开发者使用，使用者通常无需关心.
+  - 类型: bool (默认: 0).
+
+* `--enable_grad_share`
+  - 启用梯度参数的阈值，在多CPU训练时共享该参数.
+  - 类型: int32 (默认: 100 \* 1024 \* 1024).
+
+* `--grad_share_block_num`
+  - 梯度参数的分块数目，在多CPU训练时共享该参数.
+  - 类型: int32 (默认: 64).
+
+## 测试
+
+* `--test_pass`
+  - 加载test_pass轮的模型用于测试.
+  - 类型: int32 (默认: -1).
+
+* `--test_period`
+  - 如果为0，每轮结束时对所有测试数据进行测试；如果不为0，每test_period个批次对所有测试数据进行测试.
+  - 类型: int32 (默认: 0).
+
+* `--test_wait`
+  - 指示当指定轮的测试模型不存在时，是否需要等待该轮模型参数。如果在训练期间同时发起另外一个进程进行测试，可以使用该参数.
+  - 类型: bool (默认: 0).
+
+* `--model_list`
+  - 测试时指定的存储模型列表的文件.
+  - 类型: string (默认: "", null).
+
+* `--predict_output_dir`
+  - 保存网络层输出结果的目录。该参数在网络配置的Outputs()中指定，默认为null，意思是不保存结果。在测试阶段，如果你想要保存某些层的特征图，请指定该目录。需要注意的是，网络层的输出是经过激活函数之后的值.
+  - 类型: string (默认: "", null).
+
+* `--average_test_period`
+  - 使用`average_test_period`个批次的参数平均值进行测试。该参数必须能被FLAGS_log_period整除，默认为0，意思是不使用平均参数执行测试.
+  - 类型: int32 (默认: 0).
+
+* `--distribute_test`
+  - 在分布式环境中测试，将多台机器的测试结果合并.
+  - 类型: bool (默认: 0).
+
+* `--predict_file`
+  - 保存预测结果的文件名。该参数默认为null，意思是不保存结果。目前该参数仅用于AucValidationLayer和PnpairValidationLayer层，每轮都会保存预测结果.
+  - 类型: string (默认: "", null).
+
+## GPU
+
+* `--gpu_id`
+  - 指示使用哪个GPU核.
+  - 类型: int32 (默认: 0).
+
+* `--allow_only_one_model_on_one_gpu`
+  - 如果为true，一个GPU设备上不允许配置多个模型.
+  - 类型: bool (默认: 1).
+
+* `--parallel_nn`
+  - 指示是否使用多线程来计算一个神经网络。如果为false，设置gpu_id指定使用哪个GPU核（训练配置中的设备属性将会无效）。如果为true，GPU核在训练配置中指定（gpu_id无效）.
+  - 类型: bool (默认: 0).
+
+* `--cudnn_dir`
+  - 选择路径来动态加载NVIDIA CuDNN库，例如，/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH
+  - 类型: string (默认: "", null)
+
+* `--cuda_dir`
+  - 选择路径来动态加载NVIDIA CUDA库，例如，/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH
+  - 类型: string (默认: "", null)
+
+* `--cudnn_conv_workspace_limit_in_mb`
+  - 指定cuDNN的最大工作空间容限，单位是MB，默认为4096MB=4GB. 
+  - 类型: int32 (默认: 4096MB=4GB)
+
+## 自然语言处理(NLP): RNN/LSTM/GRU
+* `--rnn_use_batch`
+  - 指示在简单的RecurrentLayer层的计算中是否使用批处理方法.
+  - 类型: bool (默认: 0).
+
+* `--prev_batch_state`
+  - 标识是否为连续的batch计算.
+  - 类型: bool (默认: 0).
+
+* `--beam_size`
+  - 集束搜索使用广度优先搜索的方式构建查找树。在树的每一层上，都会产生当前层状态的所有继承结果，按启发式损失的大小递增排序。然而，每层上只能保存固定数目个最好的状态，该数目是提前定义好的，称之为集束大小.
+  - 类型: int32 (默认: 1).
+
+* `--diy_beam_search_prob_so`
+  - 用户可以自定义beam search的方法，编译成动态库，供PaddlePaddle加载。 该参数用于指定动态库路径.
+  - 类型: string (默认: "", null).
+
+## 数据支持(DataProvider)
+
+* `--memory_threshold_on_load_data`
+  - 内存容限阈值，当超过该阈值时，停止加载数据.
+  - 类型: double (默认: 1.0).
+
+## 单元测试
+
+* `--checkgrad_eps`
+  - 使用checkgrad模式时的参数变化大小.
+  - 类型: double (默认: 1e-05).
+
+## 参数服务器和分布式通信
+
+* `--start_pserver`
+  - 指示是否开启参数服务器(parameter server).
+  - 类型: bool (默认: 0).
+
+* `--pservers`
+  - 参数服务器的IP地址，以逗号间隔.
+  - 类型: string (默认: "127.0.0.1").
+
+* `--port`
+  - 参数服务器的监听端口.
+  - 类型: int32 (默认: 20134).
+
+* `--ports_num`
+  - 发送参数的端口号，根据默认端口号递增.
+  - 类型: int32 (默认: 1).
+
+* `--trainer_id`
+  - 在分布式训练中，每个训练节点必须指定一个唯一的id号，从0到num_trainers-1。0号训练节点是主训练节点。使用者无需关心这个参数.
+  - 类型: int32 (默认: 0).
+
+* `--num_gradient_servers`
+  - 梯度服务器的数量，该参数在集群提交环境中自动设置.
+  - 类型: int32 (默认: 1).
+
+* `--small_messages`
+  - 如果消息数据太小，建议将该参数设为true，启动快速应答，无延迟.
+  - 类型: bool (默认: 0).
+
+* `--sock_send_buf_size`
+  - 限制套接字发送缓冲区的大小。如果仔细设置的话，可以有效减小网络的阻塞.
+  - 类型: int32 (默认: 1024 \* 1024 \* 40).
+
+* `--sock_recv_buf_size`
+  - 限制套接字接收缓冲区的大小.
+  - 类型: int32 (默认: 1024 \* 1024 \* 40).
+
+* `--parameter_block_size`
+  - 参数服务器的参数分块大小。如果未设置，将会自动计算出一个合适的值.
+  - 类型: int32 (默认: 0).
+
+* `--parameter_block_size_for_sparse`
+  - 参数服务器稀疏更新的参数分块大小。如果未设置，将会自动计算出一个合适的值.
+  - 类型: int32 (默认: 0).
+
+* `--log_period_server`
+  - 在参数服务器终端每log_period_server个批次打印日志进度.
+  - 类型: int32 (默认: 500).
+
+* `--loadsave_parameters_in_pserver`
+  - 在参数服务器上加载和保存参数，只有当设置了sparse_remote_update参数时才有效.
+  - 类型: bool (默认: 0).
+
+* `--pserver_num_threads`
+  - 同步执行操作的线程数.
+  - 类型: int32 (默认: 1).
+
+* `--ports_num_for_sparse`
+  - 发送参数的端口号，根据默认值递增(port + ports_num)，用于稀疏训练中.
+  - 类型: int32 (默认: 0).
+
+* `--nics`
+  - 参数服务器的网络设备名称，已经在集群提交环境中完成设置.
+  - 类型: string (默认: "xgbe0,xgbe1").
+
+* `--rdma_tcp`
+  - 使用rdma还是tcp传输协议，该参数已经在集群提交环境中完成设置.
+  - 类型: string (默认: "tcp").
+
+## 异步随机梯度下降(Async SGD)
+* `--async_count`
+  - 定义异步训练的长度，如果为0，则使用同步训练.
+  - 类型: int32 (默认: 0).
+
+* `--async_lagged_ratio_min`
+  - 控制`config_.async_lagged_grad_discard_ratio()`的最小值.
+  - 类型: double (默认: 1.0).
+
+* `--async_lagged_ratio_default`
+  - 如果在网络配置中未设置async_lagged_grad_discard_ratio，则使用该参数作为默认值.
+  - 类型: double (默认: 1.5).
+
+## 性能调优(Performance Tuning)
+
+* `--log_barrier_abstract`
+  - 如果为true，则显示阻隔性能的摘要信息.
+  - 类型: bool (默认: 1).
+
+* `--log_barrier_show_log`
+  - 如果为true，则总会显示阻隔摘要信息，即使间隔很小.
+  - 类型: bool (默认: 0).
+
+* `--log_barrier_lowest_nodes`
+  - 最少显示多少个节点.
+  - 类型: int32 (默认: 5).
+
+* `--check_sparse_distribution_in_pserver`
+  - 指示是否检查所有参数服务器上的稀疏参数的分布是否均匀.
+  - 类型: bool (默认: 0).
+
+* `--show_check_sparse_distribution_log`
+  - 指示是否显示参数服务器上的稀疏参数分布的日志细节.
+  - 类型: bool (默认: 0).
+
+* `--check_sparse_distribution_batches`
+  - 每运行多少个批次执行一次稀疏参数分布的检查.
+  - 类型: int32 (默认: 100).
+
+* `--check_sparse_distribution_ratio`
+  - 如果检查到分配在不同参数服务器上的参数的分布不均匀次数大于check_sparse_distribution_ratio *  check_sparse_distribution_batches次，程序停止.
+  - 类型: double (默认: 0.6).
+
+* `--check_sparse_distribution_unbalance_degree`
+  - 不同参数服务器上数据大小的最大值与最小值的比率.
+  - 类型: double (默认: 2).
+
+## 矩阵/向量/随机数
+* `--enable_parallel_vector`
+  - 启动并行向量的阈值.
+  - 类型: int32 (默认: 0).
+
+* `--seed`
+  - 随机数的种子。设为0时使用srand(time).
+  - 类型: int32 (默认: 1)
+
+* `--thread_local_rand_use_global_seed`
+  - 是否将全局种子应用于本地线程的随机数.
+  - 类型: bool (默认: 0).
diff --git a/doc/v2/howto/cmd_parameter/detail_introduction_en.md b/doc/v2/howto/cmd_parameter/detail_introduction_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..b681ebc81a355dfc1a7638a4463dff6979929a45
--- /dev/null
+++ b/doc/v2/howto/cmd_parameter/detail_introduction_en.md
@@ -0,0 +1,327 @@
+```eval_rst
+..  _cmd_detail_introduction:
+```
+
+# Detail Description
+
+## Common
+
+* `--job`
+  - Job mode, including: **train, test, checkgrad**. checkgrad is mainly for developers; users do not need to care about it.
+  - type: string (default: train)
+
+* `--config`
+  - Used to specify the network configuration file.
+  - type: string (default: null).
+
+* `--use_gpu`
+  - Whether to use GPU for training, false is cpu mode and true is gpu mode.
+  - type: bool (default: 1).
+
+* `--local`
+  - Whether the training is in local mode or not. True when training locally or using one node in cluster. False when using multiple machines in cluster.
+  - type: bool (default: 1).
+
+* `--trainer_count`
+  - Define the number of threads used in one machine. For example, trainer_count = 4 means using 4 GPUs in GPU mode and 4 threads in CPU mode. Each thread (or GPU) is assigned 1/4 of the samples in the current batch. That is to say, if batch_size is set to 512 in the trainer config, each thread trains 128 samples.
+  - type: int32 (default: 1).
+
+* `--num_passes`
+   - When `--job=train`, means training for num_passes passes. One pass means training all samples in dataset one time. When `--job=test`, means testing data from model of test_pass to  model of (num_passes - 1).
+   - type: int32 (default: 100).
+
+* `--config_args`
+  - arguments passed to config file. Format: key1=value1,key2=value2.
+  - type: string (default: null).
+
+* `--version`
+  - Whether to print version information.
+  - type: bool (default: 0).
+
+* `--show_layer_stat`
+  - Whether to show the statistics of each layer **per batch**.
+  - type: bool (default: 0).
+
+## Train
+
+* `--log_period`
+  - Log progress every log_period batches.
+  - type: int32 (default: 100).
+
+* `--dot_period`
+  - Print '.' every dot_period batches.
+  - type: int32 (default: 1).
+
+* `--saving_period`
+  - Save parameters every saving_period passes
+  - type: int32 (default: 1).
+
+* `--save_dir`
+  - Directory for saving model parameters. It needs to be specified, but no need to be created in advance.
+  - type: string (default: null).
+
+* `--start_pass`
+  - Start training from this pass. It will load parameters from the previous pass.
+  - type: int32 (default: 0).
+
+* `--show_parameter_stats_period`
+  - Show parameter statistic during training every show_parameter_stats_period batches. It will not show by default.
+  - type: int32 (default: 0).
+
+* `--save_only_one`
+  - Save the parameters only in last pass, while the previous parameters will be removed.
+  - type: bool (default: 0).
+
+* `--load_missing_parameter_strategy`
+  - Specify the loading operation when model file is missing. Now support fail/rand/zero three operations.
+    - `fail`: program will exit.
+    - `rand`: uniform or normal distribution according to **initial\_strategy** in network config. Uniform range is: **[mean - std, mean + std]**, where mean and std are configures in trainer config.
+    - `zero`: all parameters are zero.
+  - type: string (default: fail).
+
+* `--init_model_path`
+   - Path of the initialization model. If it was set, start\_pass will be ignored. It can be used to specify model path in testing mode as well.
+   - type: string (default: null).
+
+* `--saving_period_by_batches`
+   - Save parameters every saving_period_by_batches batches in one pass.
+   - type: int32 (default: 0).
+
+* `--log_error_clipping`
+  - Whether to print error clipping log when setting **error_clipping_threshold** in layer config. If it is true, log will be printed in backward propagation **per batch**. This clipping affects the **gradient of output**.
+  - type: bool (default: 0).
+
+* `--log_clipping`
+  - Whether to print the clipping log when setting **gradient_clipping_threshold** in trainer config. This clipping affects the **gradient w.r.t. (with respect to) weight**.
+  - type: bool (default: 0).
+
+* `--use_old_updater`
+  - Whether to use the old RemoteParameterUpdater. ConcurrentRemoteParameterUpdater is used by default. It is mainly for developers, and users usually do not need to care about it.
+  - type: bool (default: 0).
+
+* `--enable_grad_share`
+  - threshold for enable gradient parameter, which is shared for batch multi-cpu training.
+  - type: int32 (default: 100 \* 1024 \* 1024).
+
+* `--grad_share_block_num`
+  - block number of gradient parameter, which is shared for batch multi-cpu training.
+  - type: int32 (default: 64).
+
+## Test
+
+* `--test_pass`
+  - Load parameter from this pass to test.
+  - type: int32 (default: -1).
+
+* `--test_period`
+  - If 0, test on all test data at the end of each pass. If non-zero, test on all test data every test_period batches.
+  - type: int32 (default: 0).
+
+* `--test_wait`
+  - Whether to wait for the parameters of each pass if they do not exist. It can be used when the user launches another process to perform testing during the training process.
+  - type: bool (default: 0).
+
+* `--model_list`
+  - File that saves the model list when testing. 
+  - type: string (default: "", null).
+
+* `--predict_output_dir`
+  - Directory that saves the layer output. It is configured in Outputs() in network config. Default, this argument is null, meaning save nothing. Specify this directory if you want to save feature map of some layers in testing mode. Note that, layer outputs are values after activation function.
+  - type: string (default: "", null).
+
+* `--average_test_period`
+  - Do test on average parameter every `average_test_period` batches. It MUST be divisible by FLAGS_log_period. Default 0 means do not test on average parameter.
+  - type: int32 (default: 0).
+
+* `--distribute_test`
+  - Testing in distribute environment will merge results from multiple machines.
+  - type: bool (default: 0).
+
+* `--predict_file`
+  - File name for saving predicted result. Default, this argument is null, meaning save nothing. Now, this argument is only used in AucValidationLayer and PnpairValidationLayer, and saves predicted result every pass.
+  - type: string (default: "", null).
+
+## GPU
+
+* `--gpu_id`
+  - Which gpu core to use.
+  - type: int32 (default: 0).
+
+* `--allow_only_one_model_on_one_gpu`
+  - If true, do not allow multiple models on one GPU device.
+  - type: bool (default: 1).
+
+* `--parallel_nn`
+  - Whether to use multi-thread to calculate one neural network or not. If false, use gpu_id to specify which gpu core to use (the device property in trainer config will be ignored). If true, the gpu core is specified in trainer config (gpu_id will be ignored).
+  - type: bool (default: 0).
+
+* `--cudnn_dir`
+  - Choose path to dynamic load NVIDIA CuDNN library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH
+  - type: string (default: "", null)
+
+* `--cuda_dir`
+  - Choose path to dynamic load NVIDIA CUDA library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH
+  - type: string (default: "", null)
+
+* `--cudnn_conv_workspace_limit_in_mb`
+  - Specify cuDNN max workspace limit, in units MB, 4096MB=4GB by default. 
+  - type: int32 (default: 4096MB=4GB)
+
+## NLP: RNN/LSTM/GRU
+* `--rnn_use_batch`
+  - Whether to use batch method for calculation in simple RecurrentLayer.
+  - type: bool (default: 0).
+
+* `--prev_batch_state`
+  - Whether the current batch is continuous with the next batch.
+  - type: bool (default: 0).
+
+* `--beam_size`
+  - Beam search uses breadth-first search to build its search tree. At each level of the tree, it generates all successors of the states at the current level, sorting them in increasing order of heuristic cost. However, it only stores a predetermined number of best states at each level (called the beam size).
+  - type: int32 (default: 1).
+
+* `--diy_beam_search_prob_so`
+  - Specify shared dynamic library. It can be defined out of paddle by user.
+  - type: string (default: "", null).
+
+## DataProvider
+
+* `--memory_threshold_on_load_data`
+  - Stop loading data when memory is not sufficient.
+  - type: double (default: 1.0).
+
+## Unit Test
+
+* `--checkgrad_eps`
+  - parameter change size for checkgrad.
+  - type: double (default: 1e-05).
+
+## Parameter Server and Distributed Communication
+
+* `--start_pserver`
+  - Whether to start pserver (parameter server).
+  - type: bool (default: 0).
+
+* `--pservers`
+  - Comma separated IP addresses of pservers.
+  - type: string (default: "127.0.0.1").
+
+* `--port`
+  - Listening port for pserver.
+  - type: int32 (default: 20134).
+
+* `--ports_num`
+  - The ports number for parameter send, increment based on default port number.
+  - type: int32 (default: 1).
+
+* `--trainer_id`
+  - In distributed training, each trainer must be given a unique ID ranging from 0 to num_trainers-1. Trainer 0 is the master trainer. Users do not need to care about this flag.
+  - type: int32 (default: 0).
+
+* `--num_gradient_servers`
+  - Number of gradient servers. This argument is set automatically in cluster submitting environment.
+  - type: int32 (default: 1).
+
+* `--small_messages`
+  - If the message size is small, it is recommended to set it to true to enable quick ACK and no delay.
+  - type: bool (default: 0).
+
+* `--sock_send_buf_size`
+  - Restrict socket send buffer size. It can reduce network congestion if set carefully.
+  - type: int32 (default: 1024 \* 1024 \* 40).
+
+* `--sock_recv_buf_size`
+  - Restrict socket receive buffer size.
+  - type: int32 (default: 1024 \* 1024 \* 40).
+
+* `--parameter_block_size`
+  - Parameter block size for pserver, will automatically calculate a suitable value if it's not set.
+  - type: int32 (default: 0).
+
+* `--parameter_block_size_for_sparse`
+  - Parameter block size for sparse update pserver, will automatically calculate a suitable value if it's not set.
+  - type: int32 (default: 0).
+
+* `--log_period_server`
+  - Log progress every log_period_server batches at pserver end.
+  - type: int32 (default: 500).
+
+* `--loadsave_parameters_in_pserver`
+  - Load and save parameters in pserver. Only works when sparse_remote_update is set for the parameter.
+  - type: bool (default: 0).
+
+* `--pserver_num_threads`
+  - number of threads for sync op exec.
+  - type: int32 (default: 1).
+
+* `--ports_num_for_sparse`
+  - The ports number for parameter send, increment based on default (port + ports_num). It is used by sparse training.
+  - type: int32 (default: 0).
+
+* `--nics`
+  - Network device name for pservers, already set in cluster submitting environment.
+  - type: string (default: "xgbe0,xgbe1").
+
+* `--rdma_tcp`
+  - Use rdma or tcp transport protocol, already set in cluster submitting environment.
+  - type: string (default: "tcp").
+
+## Async SGD
+* `--async_count`
+  - Defines the asynchronous training length. If 0, synchronized training is used.
+  - type: int32 (default: 0).
+
+* `--async_lagged_ratio_min`
+  - Control the minimum value of `config_.async_lagged_grad_discard_ratio()`.
+  - type: double (default: 1.0).
+
+* `--async_lagged_ratio_default`
+  - If async_lagged_grad_discard_ratio is not set in network config, use it as default value.
+  - type: double (default: 1.5).
+
+## Performance Tuning
+
+* `--log_barrier_abstract`
+  - If true, show abstract barrier performance information.
+  - type: bool (default: 1).
+
+* `--log_barrier_show_log`
+  - If true, always show barrier abstract even with little gap.
+  - type: bool (default: 0).
+
+* `--log_barrier_lowest_nodes`
+  - How many of the lowest nodes will be logged.
+  - type: int32 (default: 5).
+
+* `--check_sparse_distribution_in_pserver`
+  - Whether to check that the distribution of sparse parameter on all pservers is balanced.
+  - type: bool (default: 0).
+
+* `--show_check_sparse_distribution_log`
+  - show log details for sparse parameter distribution in pserver.
+  - type: bool (default: 0).
+
+* `--check_sparse_distribution_batches`
+  - Running sparse parameter distribution check every so many batches.
+  - type: int32 (default: 100).
+
+* `--check_sparse_distribution_ratio`
+  - If parameters dispatched to different pservers have an unbalanced distribution for check_sparse_distribution_ratio *  check_sparse_distribution_batches times, crash program.
+  - type: double (default: 0.6).
+
+* `--check_sparse_distribution_unbalance_degree`
+  - The ratio of maximum data size / minimum data size for different pservers.
+  - type: double (default: 2).
+
+## Matrix/Vector/RandomNumber
+* `--enable_parallel_vector`
+  - threshold for enable parallel vector.
+  - type: int32 (default: 0).
+
+* `--seed`
+  - random number seed. 0 for srand(time)
+  - type: int32 (default: 1)
+
+* `--thread_local_rand_use_global_seed`
+  - Whether to use global seed in rand of thread local.
+  - type: bool (default: 0).
diff --git a/doc/v2/howto/cmd_parameter/index_cn.rst b/doc/v2/howto/cmd_parameter/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6900bb1443e611d326e8d5640e794ac2b9079beb
--- /dev/null
+++ b/doc/v2/howto/cmd_parameter/index_cn.rst
@@ -0,0 +1,26 @@
+..  _cmd_line_index:
+
+命令行参数设置
+===============
+深度学习算法的实现有着多样化的特点，运行环境、运行阶段、模型结构、训练策略等等这些都是常见的变化因素。PaddlePaddle支持用户灵活地设置各种命令行参数，以实现对模型训练或预测流程的控制。
+
+在这一部分，首先以几个实际场景为例，展示了部分命令行参数的使用:
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_cn.md
+
+接着对所有参数的使用场合进行概述和分类:
+
+..  toctree::
+  :maxdepth: 1
+
+  arguments_cn.md
+
+最后给出细节描述，详细解释这些参数的属性和意义:
+
+..  toctree::
+  :maxdepth: 1
+
+  detail_introduction_cn.md
diff --git a/doc/v2/howto/cmd_parameter/index_en.rst b/doc/v2/howto/cmd_parameter/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f49683948ef78f363e2439cc25332431830eeb24
--- /dev/null
+++ b/doc/v2/howto/cmd_parameter/index_en.rst
@@ -0,0 +1,26 @@
+..  _cmd_line_index:
+
+Set Command-line Parameters
+===========================
+The implementation of deep learning algorithms has a variety of characteristics, such as running environment, running stage, structure of the model and the training strategy. PaddlePaddle supports the user to set various command-line parameters flexibly, which helps to achieve control of the model training or prediction process.
+
+In this part, we take several actual scenarios as an example, and the use of some command-line parameters is displayed:
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_en.md
+
+Then, we summarize and classify the use of all command-line parameters:
+
+..  toctree::
+  :maxdepth: 1
+
+  arguments_en.md
+
+Finally, the detailed descriptions are given, and we try to explain the properties and significance of these command-line parameters in detail:
+
+..  toctree::
+  :maxdepth: 1
+
+  detail_introduction_en.md
diff --git a/doc/v2/howto/cmd_parameter/use_case_cn.md b/doc/v2/howto/cmd_parameter/use_case_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..db8c39d950771726346ff9c9481990abc13036cf
--- /dev/null
+++ b/doc/v2/howto/cmd_parameter/use_case_cn.md
@@ -0,0 +1,182 @@
+# 使用案例
+
+## 本地训练
+
+本地训练的实验，诸如图像分类，自然语言处理等，通常都会使用下面这些命令行参数。
+
+```
+paddle train \
+  --use_gpu=1/0 \                        #1:GPU,0:CPU(默认为1)
+  --config=network_config \
+  --save_dir=output \
+  --trainer_count=COUNT \                #(默认为1)
+  --test_period=M \                      #(默认为0) 
+  --num_passes=N \                       #(默认为100)
+  --log_period=K \                       #(默认为100)
+  --dot_period=1000 \                    #(默认为1)
+  #[--show_parameter_stats_period=100] \ #(默认为0)
+  #[--saving_period_by_batches=200] \    #(默认为0)
+```
+根据你的任务，可以选择是否使用参数`show_parameter_stats_period`和`saving_period_by_batches`。
+
+### 1) 将命令参数传给网络配置
+
+`config_args`是一个很有用的参数，用于将参数传递给网络配置。
+
+```
+--config_args=generating=1,beam_size=5,layer_num=10 \
+```
+`get_config_arg`可用于在网络配置中解析这些参数，如下所示：
+
+```
+generating = get_config_arg('generating', bool, False)
+beam_size = get_config_arg('beam_size', int, 3)
+layer_num = get_config_arg('layer_num', int, 8)
+```
+
+`get_config_arg`:
+
+```
+get_config_arg(name, type, default_value)
+```
+- name: `--config_args`中指定的名字
+- type: 值类型，包括bool, int, str, float等
+- default_value: 默认值
+
+### 2) 使用模型初始化网络
+
+增加如下参数：
+
+```
+--init_model_path=model_path
+--load_missing_parameter_strategy=rand
+```
+
+## 本地测试
+
+方法一：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \ 
+             --config=network_config \
+             --trainer_count=COUNT \ 
+             --init_model_path=model_path \
+```
+- 使用init\_model\_path指定测试的模型
+- 只能测试单个模型
+
+方法二：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \ 
+             --config=network_config \
+             --trainer_count=COUNT \ 
+             --model_list=model.list \
+```
+- 使用model_list指定测试的模型列表
+- 可以测试多个模型，文件model.list如下所示：
+
+```
+./alexnet_pass1
+./alexnet_pass2
+```
+
+方法三：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \
+             --config=network_config \
+             --trainer_count=COUNT \
+             --save_dir=model \
+             --test_pass=M \
+             --num_passes=N \
+```
+这种方式必须使用Paddle存储的模型路径格式，如：`model/pass-%5d`。测试的模型包括从第M轮到第N-1轮存储的所有模型。例如，M=12，N=14这种写法将会测试模型`model/pass-00012`和`model/pass-00013`。
+
+## 稀疏训练
+
+当输入是维度很高的稀疏数据时，通常使用稀疏训练来加速计算过程。例如，输入数据的字典维数是1百万，但是每个样本仅包含几个词。在Paddle中，稀疏矩阵的乘积应用于前向传播过程，而稀疏更新在反向传播之后的权重更新时进行。
+
+### 1) 本地训练
+
+用户需要在网络配置中指定**sparse\_update=True**。请参照网络配置的文档了解更详细的信息。
+
+### 2) 集群训练
+
+在集群上训练一个稀疏模型需要加上下面的参数。同时用户需要在网络配置中指定**sparse\_remote\_update=True**。请参照网络配置的文档了解更详细的信息。
+
+```
+--ports_num_for_sparse=1    #(默认为0)
+```
+
+## parallel_nn
+用户可以设置`parallel_nn`来混合使用GPU和CPU计算网络层的参数。也就是说，你可以将网络配置成某些层使用GPU计算，而其他层使用CPU计算。另一种方式是将网络层划分到不同的GPU上去计算，这样可以减小GPU内存，或者采用并行计算来加速某些层的更新。
+
+如果你想使用这些特性，你需要在网络配置中指定设备的ID号(表示为deviceId)，并且加上下面的命令行参数:
+
+```
+--parallel_nn=true
+```
+### 案例一：GPU和CPU混合使用
+请看下面的例子：
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
+
+default_device(0)
+
+fc1=fc_layer(...)
+fc2=fc_layer(...)
+fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
+
+```
+- default_device(0): 设置默认设备号为0。这意味着除了指定device=-1的层之外，其他所有层都会使用GPU计算，每层使用的GPU号依赖于参数trainer\_count和gpu\_id(默认为0)。在此，fc1和fc2层在GPU上计算。
+
+- device=-1: fc3层使用CPU计算。
+
+- trainer_count:
+  - trainer_count=1: 如果未设置gpu\_id，那么fc1和fc2层将会使用第1个GPU来计算。否则使用gpu\_id指定的GPU。
+
+  - trainer_count>1: 在trainer\_count个GPU上使用数据并行来计算某一层。例如，trainer\_count=2意味着0号和1号GPU将会使用数据并行来计算fc1和fc2层。
+
+### 案例二：在不同设备上指定层
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
+
+#network:
+fc2=fc_layer(input=l1, layer_attr=ExtraAttr(device=0), ...)
+fc3=fc_layer(input=l1, layer_attr=ExtraAttr(device=1), ...)
+fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
+```
+在本例中，我们假设一台机器上有4个GPU。
+
+- trainer_count=1:
+  - 使用0号GPU计算fc2层。
+  - 使用1号GPU计算fc3层。
+  - 使用CPU计算fc4层。
+
+- trainer_count=2:
+  - 使用0号和1号GPU计算fc2层。
+  - 使用2号和3号GPU计算fc3层。
+  - 使用CPU两线程计算fc4层。
+
+- trainer_count=4:
+  - 运行失败（注意到我们已经假设机器上有4个GPU），因为参数`allow_only_one_model_on_one_gpu`默认设置为真。
+
+**当`device!=-1`时设备ID号的分配：**
+
+```
+(deviceId + gpu_id + threadId * numLogicalDevices_) % numDevices_
+
+deviceId:             在层中指定
+gpu_id:               默认为0
+threadId:             线程ID号，范围: 0,1,..., trainer_count-1
+numDevices_:          机器的设备(GPU)数目
+numLogicalDevices_:   min(max(deviceId + 1), numDevices_)
+```
diff --git a/doc/v2/howto/cmd_parameter/use_case_en.md b/doc/v2/howto/cmd_parameter/use_case_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..e287f0c4b9617cbc6504596512bf408c56dc10f9
--- /dev/null
+++ b/doc/v2/howto/cmd_parameter/use_case_en.md
@@ -0,0 +1,182 @@
+# Use Case
+
+## Local Training
+
+These command line arguments are commonly used by local training experiments, such as image classification, natural language processing, etc.
+
+```
+paddle train \
+  --use_gpu=1/0 \                        #1:GPU,0:CPU(default:true)
+  --config=network_config \
+  --save_dir=output \
+  --trainer_count=COUNT \                #(default:1)
+  --test_period=M \                      #(default:0) 
+  --num_passes=N \                       #(default:100)
+  --log_period=K \                       #(default:100)
+  --dot_period=1000 \                    #(default:1)
+  #[--show_parameter_stats_period=100] \ #(default:0)
+  #[--saving_period_by_batches=200] \    #(default:0)
+```
+`show_parameter_stats_period` and `saving_period_by_batches` are optional according to your task.
+
+### 1) Pass Command Argument to Network config
+
+`config_args` is a useful parameter to pass arguments to network config.
+
+```
+--config_args=generating=1,beam_size=5,layer_num=10 \
+```
+And `get_config_arg` can be used to parse these arguments in network config as follows:
+
+```
+generating = get_config_arg('generating', bool, False)
+beam_size = get_config_arg('beam_size', int, 3)
+layer_num = get_config_arg('layer_num', int, 8)
+```
+
+`get_config_arg`:
+
+```
+get_config_arg(name, type, default_value)
+```
+- name: the name specified in the `--config_args`
+- type: value type, bool, int, str, float etc.
+- default_value: default value if not set.
+
+### 2) Use Model to Initialize Network
+
+add argument:
+
+```
+--init_model_path=model_path
+--load_missing_parameter_strategy=rand
+```
+
+## Local Testing
+
+Method 1:
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \ 
+             --config=network_config \
+             --trainer_count=COUNT \ 
+             --init_model_path=model_path \
+```
+- use init\_model\_path to specify test model.
+- can only test one model.
+
+Method 2:
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \ 
+             --config=network_config \
+             --trainer_count=COUNT \ 
+             --model_list=model.list \
+```
+- use model_list to specify test models
+- can test several models, where model.list looks like:
+
+```
+./alexnet_pass1
+./alexnet_pass2
+```
+
+Method 3:
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \
+             --config=network_config \
+             --trainer_count=COUNT \
+             --save_dir=model \
+             --test_pass=M \
+             --num_passes=N \
+```
+This way must use model path saved by Paddle like this: `model/pass-%5d`. Testing model is from M-th pass to (N-1)-th pass. For example: M=12 and N=14 will test `model/pass-00012` and `model/pass-00013`.
+
+## Sparse Training
+
+Sparse training is usually used to accelerate calculation when the input is sparse data with high dimension. For example, the dictionary dimension of the input data is 1 million, but one sample has just several words. In paddle, sparse matrix multiplication is used in forward propagation and sparse updating is performed on weight updating after backward propagation.
+
+### 1) Local training
+
+You need to set **sparse\_update=True** in network config.  Check the network config documentation for more details.
+
+### 2) cluster training
+
+Add the following argument for cluster training of a sparse model. At the same time you need to set **sparse\_remote\_update=True** in network config. Check the network config documentation for more details.
+
+```
+--ports_num_for_sparse=1    #(default: 0)
+```
+
+## parallel_nn
+`parallel_nn` can be set to mixed use of GPUs and CPUs to compute layers. That is to say, you can deploy network to use a GPU to compute some layers and use a CPU to compute other layers. The other way is to split layers into different GPUs, which can **reduce GPU memory** or **use parallel computation to accelerate some layers**.
+
+If you want to use these characteristics, you need to specify device ID in network config (denote it as deviceId) and add command line argument:
+
+```
+--parallel_nn=true
+```
+### case 1: Mixed Use of GPU and CPU
+Consider the following example:
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
+
+default_device(0)
+
+fc1=fc_layer(...)
+fc2=fc_layer(...)
+fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
+
+```
+- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer fc1 and fc2 are computed on the GPU.
+
+- device=-1: use the CPU for layer fc3.
+
+- trainer_count:
+  - trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers fc1 and fc2. Otherwise use the GPU with gpu\_id.
+
+  - trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer fc1 and fc2.
+
+### Case 2: Specify Layers in Different Devices
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
+
+#network:
+fc2=fc_layer(input=l1, layer_attr=ExtraAttr(device=0), ...)
+fc3=fc_layer(input=l1, layer_attr=ExtraAttr(device=1), ...)
+fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
+```
+In this case, we assume that there are 4 GPUs in one machine.
+
+- trainer_count=1:
+  - Use GPU 0 to compute layer fc2.
+  - Use GPU 1 to compute layer fc3.
+  - Use CPU to compute layer fc4.
+
+- trainer_count=2:
+  - Use GPU 0 and 1 to compute layer fc2.
+  - Use GPU 2 and 3 to compute layer fc3.
+  - Use CPU to compute fc4 in two threads.
+
+- trainer_count=4:
+  - It will fail (note, we have assumed that there are 4 GPUs in machine), because argument `allow_only_one_model_on_one_gpu` is true by default.
+
+**Allocation of device ID when `device!=-1`**:
+
+```
+(deviceId + gpu_id + threadId * numLogicalDevices_) % numDevices_
+
+deviceId:             specified in layer.
+gpu_id:               0 by default.
+threadId:             thread ID, range: 0,1,..., trainer_count-1
+numDevices_:          device (GPU) count in machine.
+numLogicalDevices_:   min(max(deviceId + 1), numDevices_)
+```
diff --git a/doc/v2/howto/index_cn.rst b/doc/v2/howto/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b0268907bceb11cd53a4630c3f8b8e0424abe247
--- /dev/null
+++ b/doc/v2/howto/index_cn.rst
@@ -0,0 +1,37 @@
+进阶使用
+========
+
+PaddlePaddle支持用户灵活地设置各种命令行参数，以实现对模型训练或预测流程的控制。使用方式请参考：
+
+..  toctree::
+  :maxdepth: 1
+
+  cmd_parameter/index_cn.rst
+
+PaddlePaddle支持在fabric集群、MPI集群、kubernetes集群上分布式训练任务，具体环境配置和使用说明请参考：
+
+..  toctree::
+  :maxdepth: 1
+
+  cluster/index_cn.rst
+
+PaddlePaddle提供了用于预测的C-API，关于C-API的使用，我们提供了如下指南:
+
+..  toctree::
+  :maxdepth: 1
+
+  capi/index_cn.rst
+
+PaddlePaddle支持多种灵活和高效的循环神经网络，具体配置使用方式请参考：
+
+..  toctree::
+  :maxdepth: 1
+
+  rnn/index_cn.rst
+
+关于如何使用内置的定时工具、nvprof 或 nvvp 来运行性能分析和调优，请参考：
+
+..  toctree::
+  :maxdepth: 1
+
+  optimization/gpu_profiling_cn.rst
diff --git a/doc/v2/howto/index_en.rst b/doc/v2/howto/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..35ef197f58f1f865e2cdbdebb567d5637284637a
--- /dev/null
+++ b/doc/v2/howto/index_en.rst
@@ -0,0 +1,37 @@
+HOW TO
+========
+
+PaddlePaddle provides the users the ability to flexibly set various command line parameters to control the model training and inference process. Please refer to the following instructions on using PaddlePaddle:
+
+..  toctree::
+  :maxdepth: 1
+
+  cmd_parameter/index_en.rst
+
+PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to:
+
+..  toctree::
+  :maxdepth: 1
+
+  cluster/index_en.rst
+
+PaddlePaddle provides a C-API for inference. We provide the following guidelines  for using the C-API:
+
+..  toctree::
+  :maxdepth: 1
+
+  capi/index_en.rst
+
+PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to:
+
+..  toctree::
+  :maxdepth: 1
+
+  rnn/index_en.rst
+
+For how to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to:
+
+..  toctree::
+  :maxdepth: 1
+
+  optimization/gpu_profiling_en.rst
diff --git a/doc/v2/howto/optimization/gpu_profiling_cn.rst b/doc/v2/howto/optimization/gpu_profiling_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f2396716bddd4810fa77c738d41f5482aa6d6055
--- /dev/null
+++ b/doc/v2/howto/optimization/gpu_profiling_cn.rst
@@ -0,0 +1,242 @@
+============
+GPU性能调优
+============
+
+..  contents::
+
+此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。
+
+- 什么是性能分析？
+- 为什么需要性能分析？
+- 如何进行性能分析？
+- 性能分析工具介绍
+- 详细教程
+- 性能分析小技巧
+
+什么是性能分析？
+================
+在软件工程的范畴里，性能分析（Profiling）是一个动态程序分析的术语，它可以指测量一个程序的空间（内存）复杂度或时间复杂度，
+也可以说是某些特定指令的使用情况，或者是函数调用的频率和耗时等。通常情况下，分析得到的信息用于协助进行程序的优化。
+
+简单来说，性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为，那程序分析工具是必不可少的利器。简单的性能分析，可以告诉您某个操作到底花了多长时间？而更深入的分析，甚至能解释为什么某个操作花了很长时间？
+
+为什么需要性能分析？
+============================
+训练好一个深层神经网络通常要耗费非常长的时间，所以性能也就逐步变成了深度学习领域最重要的指标。
+而优化性能的首要任务，是需要了解哪些步骤拖慢了整体。
+如果某一块根本就不怎么耗时，那也就不需要急着优化性能啦！
+
+如何进行性能分析？
+========================
+为了达到性能最优，您可以采用下面五个步骤：
+
+- 对代码进行性能分析
+- 找到运行慢的部分
+- 找到运行慢的原因
+- 修改成更快的版本
+- 再次对代码进行性能分析
+
+Usually, a processor has two key performance limits: floating-point throughput and
+memory throughput. A GPU also needs a high degree of parallelism to fulfill its potential.
+This is why they can be so fast.
+
+通常情况下，处理器有两个关键性能限制：一个是浮点计算量，另一个是内存操作量。
+GPU则还需要高并行性，才能发挥其全部能力。这正是它们速度快的原因。
+
+性能分析工具介绍
+======================
+就通常的GPU性能分析来说，市面上已经有NVIDIA或第三方提供的众多工具。
+
+**nvprof** 是Nvidia性能分析工具， **nvvp** 则是带GUI的Nvidia可视化性能分析工具。
+在这个教程中，我们主要会介绍nvprof和nvvp。
+
+:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate
+above profilers.
+
+:code:`paddle/legacy/math/tests` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
+
+.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+   :language: c++
+   :lines: 137-151
+   :linenos:
+
+上述的代码片段包含了两种方法，您可以任意使用一个或两个来对感兴趣的代码段做性能分析。
+
+1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装，可以用来计算CPU函数或cuda内核的时间消耗。
+
+2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
+program crashes when CPU version of PaddlePaddle invokes them.
+
+3. :code:`REGISTER_GPU_PROFILER` 是一个封装对象，封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfilerStop` 两个操作；同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。
+
+您会在接下来的部分中获得更多的细节介绍。
+
+详细教程
+============
+
+内置定时器
+------------
+
+如果想要启用PaddlePaddle的内置定时器，您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。
+接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。
+下面举个简单的例子：
+
+1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数（如高亮部分）。
+
+    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
+        :linenos:
+
+2. cmake配置中将 **WITH_TIMER** 打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_TIMER=ON
+        make
+
+3. 执行您的代码，并观察结果(如高亮部分）。
+
+    .. code-block:: bash
+        :emphasize-lines: 1,12-15
+
+        > ./paddle/legacy/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
+        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
+        [  PASSED  ] 1 test.
+
+nvprof 工具
+----------------
+
+要使用命令行分析工具 **nvprof**，您按如下步骤操作即可：
+
+1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中（参考强调部分）。
+
+    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 6-7
+        :linenos:
+
+2. cmake中将 **WITH_PROFILER** 配置打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_PROFILER=ON
+        make
+
+3. 使用 **nvprof** 来分析执行文件。
+
+    .. code-block:: bash
+
+        nvprof  ./paddle/legacy/math/tests/test_GpuProfiler
+
+然后，您就能获得如下的分析结果：
+
+.. code-block:: bash
+
+    ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
+    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
+
+
+nvvp 工具
+--------------
+
+如果想使用可视化的分析器 **nvvp**，您可以导入 :code:`nvprof -o ...` 的输出，或者从工具的界面里运行您的应用。
+
+**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启）
+
+..  image:: nvvp1.png
+    :align: center
+    :scale: 33%
+
+从内核函数的角度， **nvvp** 可以精确说明一个长耗时操作的具体原因。
+同时，如下图所示， **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。
+
+
+..  image:: nvvp2.png
+    :align: center
+    :scale: 33%
+
+而从应用的角度， **nvvp** 可以帮您提供一些定位性能瓶颈的建议。
+例如，下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议，为您做性能调优提供了方向。
+
+..  image:: nvvp3.png
+    :align: center
+    :scale: 33%
+
+..  image:: nvvp4.png
+    :align: center
+    :scale: 33%
+
+性能分析小技巧
+==================
+
+- 开始阶段，从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。
+- 接下来可以考虑下时间线的分析。
+- 如果真想挖掘内核深处的某个秘密，您最好先确认：这一块的耗时比例真的太高，值得深入分析。
+- 可能的情况下，试着让输出的分析数据和理论值对应。
+
+    1) 例如，如果我知道内核花了10ms来移动1GB数据，那我会期望分析工具统计到速度是100GB/s。
+    2) 若有不一致之处，很有可能实际应用就是没有按照您的预期情况运行。
+- 了解您的硬件：如果您的GPU理论可以达到6 TFLOPs（6万亿次浮点运算每秒），而当前已经有5.5 TFLOPs了，那估计这里的潜力就没啥好挖的了……
+
+性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果！
+当然，具体情况因人而异。
+
+参考资料
+===========
+Jeremy Appleyard, `GPU Profiling for Deep Learning <http://www.robots.ox.ac.uk/~seminars/seminars/Extra/2015_10_08_JeremyAppleyard.pdf>`_, 2015
diff --git a/doc/v2/howto/optimization/gpu_profiling_en.rst b/doc/v2/howto/optimization/gpu_profiling_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6e439be9bba8935cdd65f1c131cfd3725530ec0e
--- /dev/null
+++ b/doc/v2/howto/optimization/gpu_profiling_en.rst
@@ -0,0 +1,240 @@
+====================
+Tune GPU Performance 
+====================
+
+..  contents::
+
+This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using built-in timer, **nvprof** and **nvvp**.
+
+- What is profiling?
+- Why do we need profiling?
+- How to do profiling?
+- Profiler tools
+- Hands-on approach
+- Profiling tips
+
+What's profiling?
+=================
+In software engineering, profiling is a form of dynamic program analysis that measures the space (memory) or time
+complexity of a program, the usage of particular instructions, or the frequency and duration of function calls.
+Most commonly, profiling information serves to aid program optimization.
+
+Briefly, a profiler is used to measure application performance. Program analysis tools are extremely important for
+understanding program behavior. Simple profiling can tell you how long an operation takes. More advanced
+profiling can even explain why an operation takes a long time.
+
+Why do we need profiling?
+=========================
+Since training a deep neural network typically takes a very long time, performance is gradually becoming
+the most important thing in the deep learning field. The first step to improve performance is to understand what parts
+are slow.  There is no point in improving performance of a region which doesn’t take much time!
+
+
+How to do profiling?
+====================
+To achieve maximum performance, there are five steps you can take to reach your goals.
+
+- Profile the code
+- Find the slow parts
+- Work out why they’re slow
+- Make them fast
+- Profile the code again
+
+Usually, a processor has two key performance limits: floating-point throughput and
+memory throughput. A GPU also needs a high degree of parallelism to fulfill its potential.
+This is why they can be so fast.
+
+Profiler Tools
+==============
+For general GPU profiling, a number of tools are provided by both NVIDIA and third parties.
+
+**nvprof** is Nvidia profiler and **nvvp** is (GUI based) Nvidia visual profiler.
+In this tutorial, we will focus on nvprof and nvvp.
+
+:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate
+above profilers.
+
+.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+   :language: c++
+   :lines: 137-151
+   :linenos:
+
+The above code snippet includes two methods; you can use either or both of them to profile the regions of interest.
+
+1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both cpu functions and cuda kernels.
+
+2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
+program crashes when CPU version of PaddlePaddle invokes them.
+
+You can find more details about how to use both of them in the next section.
+
+Hands-on Approach
+=================
+
+Built-in Timer
+--------------
+
+To enable the built-in timer in PaddlePaddle, first you have to add :code:`REGISTER_TIMER_INFO` into the regions of interest.
+Then, all information could be stamped in the console via :code:`printStatus` or :code:`printAllStatus` function.
+As a simple example, consider the following:
+
+1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines).
+
+    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
+        :linenos:
+
+2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle.
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_TIMER=ON
+        make
+
+3. Execute your code and observe the results (see the emphasize-lines).
+
+    .. code-block:: bash
+        :emphasize-lines: 1,12-15
+
+        > ./paddle/legacy/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
+        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
+        [  PASSED  ] 1 test.
+
+nvprof profiler
+---------------
+
+To use the command line profiler **nvprof**, follow these steps:
+
+1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines).
+
+    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 6-7
+        :linenos:
+
+2. Configure cmake with **WITH_PROFILER** and recompile PaddlePaddle.
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_PROFILER=ON
+        make
+
+3. Use Nvidia profiler **nvprof** to profile the binary.
+
+    .. code-block:: bash
+
+        nvprof  ./paddle/legacy/math/tests/test_GpuProfiler
+
+Then, you can get the following profiling result:
+
+.. code-block:: bash
+
+    ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
+    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
+
+
+nvvp profiler
+-------------
+
+For the visual profiler **nvvp**, you can either import the output of :code:`nvprof -o ...` or
+run the application through the GUI.
+
+**Note: nvvp also supports CPU profiling** (Click the box in nvvp to enable profile execution on CPU).
+
+..  image:: nvvp1.png
+    :align: center
+    :scale: 33%
+
+From the perspective of kernel functions, **nvvp** can even illustrate why an operation takes a long time.
+As shown in the following figure, kernel's block usage, register usage and shared memory usage from :code:`nvvp`
+allow us to fully utilize all warps on the GPU.
+
+..  image:: nvvp2.png
+    :align: center
+    :scale: 33%
+
+From the perspective of application, **nvvp** can give you some suggestions to address performance bottleneck.
+For instance, some advice in data movement and compute utilization from the below figure can guide you to tune performance.
+
+..  image:: nvvp3.png
+    :align: center
+    :scale: 33%
+
+..  image:: nvvp4.png
+    :align: center
+    :scale: 33%
+
+Profiling tips
+==============
+
+- The **nvprof** and **nvvp** output is a very good place to start.
+- The timeline is a good place to go next.
+- Only dig deep into a kernel if it’s taking a significant amount of your time.
+- Where possible, try to match profiler output with theory.
+    1) For example, if I know I’m moving 1GB, and my kernel takes 10ms, I expect the profiler to report 100GB/s.
+    2) Discrepancies are likely to mean your application isn’t doing what you thought it was.
+- Know your hardware: If your GPU can do 6 TFLOPs, and you’re already doing 5.5 TFLOPs, you won’t go much faster!
+
+
+Profiling is a key step in optimization. Sometimes quite simple changes can lead to big improvements in performance.
+Your mileage may vary!
+
+Reference
+=========
+Jeremy Appleyard, `GPU Profiling for Deep Learning <http://www.robots.ox.ac.uk/~seminars/seminars/Extra/2015_10_08_JeremyAppleyard.pdf>`_, 2015
diff --git a/doc/v2/howto/optimization/nvvp1.png b/doc/v2/howto/optimization/nvvp1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77
Binary files /dev/null and b/doc/v2/howto/optimization/nvvp1.png differ
diff --git a/doc/v2/howto/optimization/nvvp2.png b/doc/v2/howto/optimization/nvvp2.png
new file mode 100644
index 0000000000000000000000000000000000000000..177c9db708da6863d1075f3e615f5962dbe18b29
Binary files /dev/null and b/doc/v2/howto/optimization/nvvp2.png differ
diff --git a/doc/v2/howto/optimization/nvvp3.png b/doc/v2/howto/optimization/nvvp3.png
new file mode 100644
index 0000000000000000000000000000000000000000..d8f393667d6569b6f1e61ffccac43fae5888b6db
Binary files /dev/null and b/doc/v2/howto/optimization/nvvp3.png differ
diff --git a/doc/v2/howto/optimization/nvvp4.png b/doc/v2/howto/optimization/nvvp4.png
new file mode 100644
index 0000000000000000000000000000000000000000..51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01
Binary files /dev/null and b/doc/v2/howto/optimization/nvvp4.png differ
diff --git a/doc/v2/howto/rnn/hierarchical_layer_cn.rst b/doc/v2/howto/rnn/hierarchical_layer_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2f8f408b40299890da694862a7b9418cf9ff07f2
--- /dev/null
+++ b/doc/v2/howto/rnn/hierarchical_layer_cn.rst
@@ -0,0 +1,89 @@
+###########################
+支持双层序列作为输入的Layer
+###########################
+
+..	contents::
+
+概述
+====
+
+在自然语言处理任务中，序列是一种常见的数据类型。一个独立的词语，可以看作是一个非序列输入，或者，我们称之为一个0层的序列；由词语构成的句子，是一个单层序列；若干个句子构成一个段落，是一个双层的序列。
+
+双层序列是一个嵌套的序列，它的每一个元素，又是一个单层的序列。这是一种非常灵活的数据组织方式，帮助我们构造一些复杂的输入信息。
+
+我们可以按照如下层次定义非序列，单层序列，以及双层序列。
+
++ 0层序列：一个独立的元素，类型可以是PaddlePaddle支持的任意输入数据类型
++ 单层序列：排成一列的多个元素，每个元素是一个0层序列，元素之间的顺序是重要的输入信息
++ 双层序列：排成一列的多个元素，每个元素是一个单层序列，称之为双层序列的一个子序列（subseq），subseq的每个元素是一个0层序列
+
+在 PaddlePaddle中，下面这些Layer能够接受双层序列作为输入，完成相应的计算。
+
+pooling
+========
+
+pooling 的使用示例如下。
+
+..	code-block:: bash
+
+        seq_pool = pooling(input=layer,
+                           pooling_type=pooling.Max(),
+                           agg_level=AggregateLevel.TO_SEQUENCE)
+        
+- `pooling_type` 目前支持两种，分别是：pooling.Max()和pooling.Avg()。
+
+- `agg_level=AggregateLevel.TO_NO_SEQUENCE` 时（默认值）：
+
+  - 作用：双层序列经过运算变成一个0层序列，或单层序列经过运算变成一个0层序列
+  - 输入：一个双层序列，或一个单层序列
+  - 输出：一个0层序列，即整个输入序列（单层或双层）的平均值（或最大值）
+
+- `agg_level=AggregateLevel.TO_SEQUENCE` 时：
+
+  - 作用：一个双层序列经过运算变成一个单层序列
+  - 输入：必须是一个双层序列
+  - 输出：一个单层序列，序列的每个元素是原来双层序列每个subseq元素的平均值（或最大值）
+
+last_seq 和 first_seq
+=====================
+
+last_seq 的使用示例如下（first_seq 类似）。
+
+..	code-block:: bash
+
+        last = last_seq(input=layer,
+                        agg_level=AggregateLevel.TO_SEQUENCE)
+        
+- `agg_level=AggregateLevel.TO_NO_SEQUENCE` 时（默认值）：
+
+  - 作用：一个双层序列经过运算变成一个0层序列，或一个单层序列经过运算变成一个0层序列
+  - 输入：一个双层序列或一个单层序列
+  - 输出：一个0层序列，即整个输入序列（双层或者单层）最后一个，或第一个元素。
+
+- `agg_level=AggregateLevel.TO_SEQUENCE` 时：
+  - 作用：一个双层序列经过运算变成一个单层序列
+  - 输入：必须是一个双层序列
+  - 输出：一个单层序列，其中每个元素是双层序列中每个subseq最后一个（或第一个）元素。
+
+expand
+======
+
+expand 的使用示例如下。
+
+..	code-block:: bash
+
+        ex = expand(input=layer1,
+                    expand_as=layer2,
+                    expand_level=ExpandLevel.FROM_NO_SEQUENCE)
+        
+- `expand_level=ExpandLevel.FROM_NO_SEQUENCE` 时（默认值）：
+
+  - 作用：一个0层序列经过运算扩展成一个单层序列，或者一个双层序列
+  - 输入：layer1必须是一个0层序列，是待扩展的数据；layer2 可以是一个单层序列，或者是一个双层序列，提供扩展的长度信息
+  - 输出：一个单层序列或一个双层序列，输出序列的类型（双层序列或单层序列）和序列中含有元素的数目同 layer2 一致。若输出是单层序列，单层序列的每个元素（0层序列），都是对layer1元素的拷贝；若输出是双层序列，双层序列每个subseq中每个元素（0层序列），都是对layer1元素的拷贝
+
+- `expand_level=ExpandLevel.FROM_SEQUENCE` 时：
+
+  - 作用：一个单层序列经过运算扩展成一个双层序列
+  - 输入：layer1必须是一个单层序列，是待扩展的数据；layer2 必须是一个双层序列，提供扩展的长度信息
+  - 输出：一个双层序列，序列中含有元素的数目同 layer2 一致。要求单层序列含有元素的数目（0层序列）和双层序列含有subseq 的数目一致。单层序列第i个元素（0层序列），被扩展为一个单层序列，构成了输出双层序列的第i个 subseq 。
diff --git a/doc/v2/howto/rnn/hierarchical_layer_en.rst b/doc/v2/howto/rnn/hierarchical_layer_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fb668f1babb47f49b2dab6d2411565e99599d8b0
--- /dev/null
+++ b/doc/v2/howto/rnn/hierarchical_layer_en.rst
@@ -0,0 +1,89 @@
+###################################################
+Layers that Support Hierarchical Sequences as Input
+###################################################
+ 
+.. contents::
+ 
+Overview
+========
+ 
+A sequence is a common data type in natural language processing tasks. An independent word can be regarded as a non-sequential input or a 0-level sequence. A sentence made up of words is a single-level sequence; a number of sentences make up a paragraph, which is a double-level sequence.
+ 
+A double-level sequence is a nested sequence where each element is a single-level sequence. This is a very flexible way of organizing data that helps us construct some complex input information.
+ 
+We can define non-sequences, single-level sequences, and double-level sequences at the following levels.
+ 
++ 0-level sequence: an independent element. Its type can be any input data type supported by PaddlePaddle;
++ Single-level sequence: multiple elements arranged in a row; each element is a 0-level sequence. The order of the elements is important input information;
++ Double-level sequence: multiple elements arranged in a row; each element is a single-level sequence called a subseq of the double-level sequence, and each element of the subseq is a 0-level sequence.
+ 
+In PaddlePaddle, the following layers accept double-level sequences as input and perform corresponding calculations.
+ 
+`pooling`
+=========
+ 
+The use of pooling is as follows:
+ 
+.. code-block:: bash
+ 
+        seq_pool = pooling(input=layer,
+                           pooling_type=pooling.Max(),
+                           agg_level=AggregateLevel.TO_SEQUENCE)
+        
+- `pooling_type` currently supports two types: pooling.Max() and pooling.Avg().
+ 
+- When `agg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
+ 
+  - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence 
+  - Input: a double-level sequence or a single-level sequence
+  - Output: a 0-level sequence which is the average (or maximum) of the entire input sequence (single or double)
+ 
+- When `agg_level=AggregateLevel.TO_SEQUENCE`:
+ 
+  - Effect: a double-level sequence will be transformed into a single-level sequence
+  - Input: a double-level sequence
+  - Output: a single-level sequence where each element of the sequence is the average (or maximum) value of each subseq element of the original double-level sequence.
+ 
+`last_seq` and `first_seq`
+==========================
+ 
+An example of using `last_seq` is as follows (usage of `first_seq` is similar).
+ 
+.. code-block:: bash
+ 
+        last = last_seq(input=layer,
+                        agg_level=AggregateLevel.TO_SEQUENCE)
+        
+- When `agg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
+ 
+  - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence
+  - Input: a double-level sequence or a single-level sequence
+  - Output: a 0-level sequence, which is the last or the first element of the input sequence (double or single level).
+ 
+- When `agg_level=AggregateLevel.TO_SEQUENCE`:
+  - Effect: a double-level sequence will be transformed into a single-level sequence
+  - Input: a double-level sequence
+  - Output: a single-level sequence in which each element is the last (or first) element of each subseq in the double-level sequence.
+ 
+`expand`
+========
+ 
+The use of expand is as follows.
+ 
+.. code-block:: bash
+ 
+        ex = expand(input=layer1,
+                    expand_as=layer2,
+                    expand_level=ExpandLevel.FROM_NO_SEQUENCE)
+        
+- When `expand_level=ExpandLevel.FROM_NO_SEQUENCE` (default):
+ 
+  - Effect: a 0-level sequence is extended to a single-level sequence or a double-level sequence
+  - Input: layer1 must be a 0-level sequence to be extended; layer2 can be a single-level sequence or a double-level sequence that provides the extended length information
+  - Output: a single-level sequence or a double-level sequence; the type of the output sequence and the number of elements contained in the sequence are the same as layer2. If the output is a single-level sequence, each element of the single-level sequence will be a copy of the layer1 element. If the output is a double-level sequence, each element in the double-level sequence will be a copy of the layer1 element
+ 
+- When `expand_level=ExpandLevel.FROM_SEQUENCE`:
+ 
+  - Effect: a single-level sequence is extended to a double-level sequence
+  - Input: layer1 must be a single-level sequence to be extended; layer2 must be a double-level sequence providing extended length information
+  - Output: a double-level sequence with the same number of elements as that of layer2. It is required that the number of elements in the single-level sequence be the same as the number of subseq in the double-level sequences. The i-th element of the single-level sequence (the 0-level sequence) is expanded into a single-level sequence that constitutes the i-th subseq of the output, the double-level sequence.
diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9d6d417075485dceb1ee71f527b408aa6a6638ea
--- /dev/null
+++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
@@ -0,0 +1,226 @@
+..  _algo_hrnn_rnn_api_compare:
+
+#####################
+单双层RNN API对比介绍
+#####################
+
+本文以PaddlePaddle的双层RNN单元测试为示例，用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型，来讲解如何使用双层RNN。本文中所有的例子，都只是介绍双层RNN的API接口，并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用，请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。
+
+示例1：双层RNN，子序列间无Memory
+================================
+
+在双层RNN中的经典情况是将内层的每一个时间序列数据，分别进行序列操作；并且内层的序列操作之间独立无依赖，即不需要使用Memory\ 。
+
+在本示例中，单层RNN和双层RNN的网络配置，都是将每一句分好词后的句子，使用LSTM作为encoder，压缩成一个向量。区别是RNN使用两层序列模型，将多句话看成一个整体同时使用encoder压缩。二者语意上完全一致。这组语义相同的示例配置如下：
+
+* 单层RNN\: `sequence_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_layer_group.conf>`_
+* 双层RNN\: `sequence_nest_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf>`_
+
+
+读取双层序列数据
+----------------
+
+首先，本示例中使用的原始数据如下\:
+
+- 本例中的原始数据一共有10个样本。每个样本由两部分组成，一个label（此处都为2）和一个已经分词后的句子。这个数据也被单层RNN网络直接使用。
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
+    :language: text
+
+
+- 双层序列数据一共有4个样本。 每个样本间用空行分开，整体数据和原始数据完全一样。但对于双层序列的LSTM来说，第一个样本同时encode两条数据成两个向量。这四条数据同时处理的句子数量为\ :code:`[2, 3, 2, 3]`\ 。
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
+    :language: text
+
+其次，对于两种不同的输入数据类型，不同DataProvider对比如下(`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequenceGen.py>`_)\：
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 21-39
+    :linenos:
+
+- 这是普通的单层时间序列的DataProvider代码，其说明如下：
+  
+  * DataProvider共返回两个数据，分别是words和label。即上述代码中的第19行。
+
+    - words是原始数据中的每一句话，所对应的词表index数组。它是integer_value_sequence类型的，即整数数组。words即为这个数据中的单层时间序列。
+    - label是原始数据中对于每一句话的分类标签，它是integer_value类型的。
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 42-71
+    :linenos:
+
+- 对于同样的数据，双层时间序列的DataProvider的代码。其说明如下：
+
+  - DataProvider共返回两组数据，分别是sentences和labels。即在双层序列的原始数据中，每一组内的所有句子和labels
+  - sentences是双层时间序列的数据。由于它内部包含了每组数据中的所有句子，且每个句子表示为对应的词表索引数组，因此它是integer_value_sub_sequence 类型的，即双层时间序列。
+  - labels是每组内每个句子的标签，故而是一个单层时间序列。
+
+
+模型配置
+------------------------------------------
+
+首先，我们看一下单层RNN的配置。代码中9-15行(高亮部分)即为单层RNN序列的使用代码。这里使用了PaddlePaddle预定义好的RNN处理函数。在这个函数中，RNN对于每一个时间步通过了一个LSTM网络。
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf
+    :language: python
+    :lines: 38-63
+    :linenos:
+    :emphasize-lines:  9-15
+
+
+其次，我们看一下语义相同的双层RNN的网络配置\:
+
+* PaddlePaddle中的许多layer并不在意输入是否是时间序列，例如\ :code:`embedding_layer`\ 。在这些layer中，所有的操作都是针对每一个时间步来进行的。
+
+* 在该配置的7-26行(高亮部分)，将双层时间序列数据先变换成单层时间序列数据，再对每一个单层时间序列进行处理。
+
+  * 使用\ :code:`recurrent_group`\ 这个函数进行变换，在变换时需要将输入序列传入。由于我们想要的变换是双层时间序列=> 单层时间序列，所以我们需要将输入数据标记成\ :code:`SubsequenceInput`\ 。
+  
+  * 在本例中，我们将原始数据的每一组，通过\ :code:`recurrent_group`\ 进行拆解，拆解成的每一句话再通过一个LSTM网络。这和单层RNN的配置是等价的。
+
+* 与单层RNN的配置类似，我们只需要使用LSTM encode成的最后一个向量。所以对\ :code:`recurrent_group`\ 进行了\ :code:`last_seq`\ 操作。但和单层RNN不同，我们是对每一个子序列取最后一个元素，因此\ :code:`agg_level=AggregateLevel.TO_SEQUENCE`\ 。
+
+* 至此，\ :code:`lstm_last`\ 便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
+    :language: python
+    :lines: 38-64
+    :linenos:
+    :emphasize-lines: 7-26
+
+示例2：双层RNN，子序列间有Memory
+================================
+
+本示例意图使用单层RNN和双层RNN实现两个完全等价的全连接RNN。
+
+* 对于单层RNN，输入数据为一个完整的时间序列，例如\ :code:`[4, 5, 2, 0, 9, 8, 1, 4]`\ 。
+
+* 对于双层RNN，输入数据为在单层RNN数据里面，任意将一些数据组合成双层时间序列，例如\ :code:`[ [4, 5, 2], [0, 9], [8, 1, 4]]`。
+
+模型配置
+------------------
+
+我们选取单双层序列配置中的不同部分，来对比分析两者语义相同的原因。
+
+- 单层RNN：过了一个很简单的recurrent_group。每一个时间步，当前的输入y和上一个时间步的输出rnn_state做了一个全连接。
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf
+    :language: python
+    :lines: 36-48
+
+- 双层RNN，外层memory是一个元素：
+
+  - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem，表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中，outer_mem是一个子句的最后一个向量，即整个双层group是将前一个子句的最后一个向量，作为下一个子句memory的初始状态。
+  - 从输入数据上看，单双层序列的句子是一样的，只是双层序列将其又做了子序列划分。因此双层序列的配置中，必须将前一个子句的最后一个元素，作为boot_layer传给下一个子句的memory，才能保证和单层序列的配置中“每个时间步都用了上一个时间步的输出结果”一致。
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf
+    :language: python
+    :lines: 39-66
+
+..  warning::
+    PaddlePaddle目前只支持在每个时间步中，Memory的时间序列长度一致的情况。
+
+示例3：双层RNN，输入不等长
+==========================
+
+.. role:: red
+
+.. raw:: html
+
+    <style> .red {color:red} </style>
+
+**输入不等长** 是指recurrent_group的多个输入序列，在每个时间步的子序列长度可以不相等。但序列输出时，需要指定与某一个输入的序列信息是一致的。使用\ :red:`targetInlink`\ 可以指定哪一个输入和输出序列信息一致，默认指定第一个输入。 
+
+示例3的配置分别为\ `单层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_\ 和\ `双层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ 。
+
+示例3对于单层RNN和双层RNN数据完全相同。
+
+* 对于单层RNN的数据一共有两个样本，他们分别是\ :code:`[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]`\ 和\ :code:`[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]`\ 。对于每一个单层RNN的数据，均有两组特征。
+
+* 在单层数据的基础上，双层RNN数据随意加了一些隔断，例如将第二条数据转化为\ :code:`[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]`\ 。
+
+* 需要注意的是PaddlePaddle目前只支持子序列数目一样的多输入双层RNN。例如本例中的两个特征，均有三个子序列。每个子序列长度可以不一致，但是子序列的数目必须一样。
+
+
+模型配置
+--------
+
+和示例2中的配置类似，示例3的配置使用了单层RNN和双层RNN，实现两个完全等价的全连接RNN。
+
+* 单层RNN\:
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 42-59
+    :linenos:
+
+* 双层RNN\ \:
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 41-80
+    :linenos:
+
+在上面代码中，单层和双层序列的使用和示例2中的示例类似，区别是同时处理了两个输入。而对于双层序列，两个输入的子序列长度也并不相同。但是，我们使用了\ :code:`targetInlink`\ 参数设置了外层\ :code:`recurrent_group`\ 的输出格式。所以外层输出的序列形状，和\ :code:`emb2`\ 的序列形状一致。
+
+
+词汇表
+======
+
+..  _glossary_memory:
+
+Memory
+------
+
+Memory是PaddlePaddle实现RNN时候使用的一个概念。RNN即时间递归神经网络，通常要求时间步之间具有一些依赖性，即当前时间步下的神经网络依赖前一个时间步神经网络中某一个神经元输出。如下图所示。
+
+..  graphviz:: src/glossary_rnn.dot
+
+上图中虚线的连接，即是跨越时间步的网络连接。PaddlePaddle在实现RNN的时候，将这种跨越时间步的连接用一个特殊的神经网络单元实现。这个神经网络单元就叫Memory。Memory可以缓存上一个时刻某一个神经元的输出，然后在下一个时间步输入给另一个神经元。使用Memory的RNN实现便如下图所示。
+
+..  graphviz:: src/glossary_rnn_with_memory.dot
+
+使用这种方式，PaddlePaddle可以比较简单的判断哪些输出是应该跨越时间步的，哪些不是。
+
+..  _glossary_timestep:
+
+时间步
+------
+
+参考时间序列。
+
+
+..  _glossary_sequence:
+
+时间序列
+--------
+
+时间序列(time series)是指一系列的特征数据。这些特征数据之间的顺序是有意义的。即特征的数组，而不是特征的集合。而这每一个数组元素，或者每一个系列里的特征数据，即为一个时间步(time step)。值得注意的是，时间序列、时间步的概念，并不真正的和『时间』有关。只要一系列特征数据中的『顺序』是有意义的，即为时间序列的输入。
+
+举例说明，例如文本分类中，我们通常将一句话理解成一个时间序列。比如一句话中的每一个单词，会变成词表中的位置。而这一句话就可以表示成这些位置的数组。例如 :code:`[9, 2, 3, 5, 3]` 。
+
+关于时间序列(time series)的更详细准确的定义，可以参考 `维基百科页面 Time series <https://en.wikipedia.org/wiki/Time_series>`_ 或者 `维基百科中文页面 时间序列 <https://zh.wikipedia.org/wiki/%E6%99%82%E9%96%93%E5%BA%8F%E5%88%97>`_ 。
+
+另外，Paddle中经常会将时间序列称为 :code:`Sequence` 。他们在Paddle的文档和API中是一个概念。
+
+..  _glossary_RNN:
+
+RNN
+---
+
+RNN 在PaddlePaddle的文档中，一般表示 :code:`Recurrent neural network`，即时间递归神经网络。详细介绍可以参考 `维基百科页面 Recurrent neural network <https://en.wikipedia.org/wiki/Recurrent_neural_network>`_ 或者 `中文维基百科页面 <https://zh.wikipedia.org/wiki/%E9%80%92%E5%BD%92%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C>`_ 中关于时间递归神经网络的介绍。
+
+RNN 一般在PaddlePaddle中，指对于一个时间序列输入数据，每一个时间步之间的神经网络具有一定的相关性。例如，某一个神经元的一个输入为上一个时间步网络中某一个神经元的输出。或者，从每一个时间步来看，神经网络的网络结构中具有有向环结构。
+
+..  _glossary_双层RNN:
+
+双层RNN
+-------
+
+双层RNN顾名思义，即RNN之间有一次嵌套关系。输入数据整体上是一个时间序列，而对于每一个内层特征数据而言，也是一个时间序列。即二维数组，或者数组的数组这个概念。 而双层RNN是可以处理这种输入数据的网络结构。
+
+例如，对于段落的文本分类，即将一段话进行分类。我们将一段话看成句子的数组，每个句子又是单词的数组。这便是一种双层RNN的输入数据。而将这个段落的每一句话用lstm编码成一个向量，再对每一句话的编码向量用lstm编码成一个段落的向量。再对这个段落向量进行分类，即为这个双层RNN的网络结构。
+
diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a4485f7b5edf21871444801230ab1ee191b1137b
--- /dev/null
+++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
@@ -0,0 +1,226 @@
+..  _algo_hrnn_rnn_api_compare:
+
+################################################
+API comparison between RNN and hierarchical RNN
+################################################
+
+This article takes PaddlePaddle's hierarchical RNN unit test as an example. We will use several examples to illustrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for the single-layer RNN and the other for the hierarchical RNN. Although the implementations are different, the two model configurations have the same effect. All of the examples in this article only describe the API of the hierarchical RNN; they do not use the hierarchical RNN to solve practical problems. If you want to understand the use of hierarchical RNN in specific problems, please refer to \ :ref:`algo_hrnn_demo`\ . The unit test file used in this article's examples is \ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ .
+
+Example 1: Hierarchical RNN without Memory between subsequences
+===============================================================
+
+The classical case in the hierarchical RNN is to perform sequence operations on each time series in the inner layer separately. The sequence operations in the inner layer are independent of each other, that is, they do not need to use Memory.
+
+In this example, the network configurations of both the single-layer RNN and the hierarchical RNN use LSTM as an encoder to compress each word-segmented sentence into a vector. The difference is that the hierarchical RNN uses a two-level sequence model, treating multiple sentences as a whole and compressing them with the encoder simultaneously. The two configurations are semantically identical. This pair of semantically identical example configurations is as follows:
+
+* RNN\: `sequence_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_layer_group.conf>`_
+* Hierarchical RNN\: `sequence_nest_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf>`_
+
+
+Reading hierarchical sequence data
+----------------------------------
+
+Firstly, the original data in this example is as follows \:
+
+- The original data in this example has 10 samples. Each sample includes two components: a label (all 2 here) and a word-segmented sentence. This data is also used directly by the single-layer RNN.
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
+    :language: text
+
+
+- The data for the hierarchical RNN has 4 samples. The samples are separated by blank lines, while the content of the data is the same as the original data. For the hierarchical LSTM, however, the first sample encodes two sentences into two vectors simultaneously. The numbers of sentences processed simultaneously by these 4 samples are \ :code:`[2, 3, 2, 3]`\ .
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
+    :language: text
+
+Secondly, as for these two types of different input data formats, the contrast of different DataProviders are as follows (`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequenceGen.py>`_)\：
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 21-39
+    :linenos:
+
+- This is the DataProvider code for an ordinary single-layer time series. Its description is as follows: 
+  
+  * DataProvider returns two parts, "words" and "label", as shown on line 19 of the above code.
+
+    - "words" is a list of word table indices corresponding to each word in the sentence in the original data. Its data type is integer_value_sequence, that is, a list of integers. So, "words" is the single-layer time series in the data.
+    - "label" is the categorical label of each sentence, whose data type is integer_value. 
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 42-71
+    :linenos:
+
+- As for the same data, the DataProvider code for hierarchical time series. Its description is as follows: 
+
+  - DataProvider returns two lists of data, that are "sentences" and "labels", corresponding to the sentences and labels in each group in the original data of hierarchical time series. 
+  - "sentences" is the hierarchical time series data. As it contains every sentence in each group, and each sentence is represented by a list of word table indices, its data type is integer_value_sub_sequence, which is a hierarchical time series.
+  - "labels" contains the categorical label of each sentence in a group, so it is a single-layer time series.
+
+
+Model configuration
+------------------------------------------
+
+Firstly, let's look at the configuration of the single-layer RNN. The highlighted part (lines 9 to 15) shows the usage of the single-layer RNN. Here we use the pre-defined RNN processing function in PaddlePaddle. In this function, for each time step, the RNN passes through an LSTM network.
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf
+    :language: python
+    :lines: 38-63
+    :linenos:
+    :emphasize-lines:  9-15
+
+
+Secondly, let's look at the model configuration of the hierarchical RNN, which has the same semantic meaning\:
+
+* Most layers in PaddlePaddle do not care about whether the input is time series or not, e.g. \ :code:`embedding_layer`\ . In these layers, every operation is processed on each time step. 
+
+* In the highlighted part (lines 7 to 26) of this configuration, we transform the hierarchical time series data into single-layer time series data, then process each single-layer time series.
+
+  * Use the function :code:`recurrent_group` to perform the transformation. The input sequences need to be passed in when transforming. As we want to transform a hierarchical time series into single-layer sequences, we need to label the input data as :code:`SubsequenceInput`.
+  
+  * In this example, we disassemble every group of the original data into sentences using \ :code:`recurrent_group`\ . Each of the disassembled sentences passes through an LSTM network. This is equivalent to single-layer RNN configuration. 
+
+* Similar to the single-layer RNN configuration, we only use the last vector encoded by the LSTM, so we apply the :code:`last_seq` operation to the output of :code:`recurrent_group`. But unlike the single-layer RNN, we take the last element of every subsequence, so we need to set :code:`agg_level=AggregateLevel.TO_SEQUENCE`.
+
+* At this point, :code:`lstm_last` has the same result as :code:`lstm_last` in the single-layer RNN configuration.
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
+    :language: python
+    :lines: 38-64
+    :linenos:
+    :emphasize-lines: 7-26
+
+Example 2: Hierarchical RNN with Memory between subsequences
+============================================================
+
+This example is intended to implement two fully-equivalent fully-connected RNNs using single-layer RNN and hierarchical RNN. 
+
+* As for single-layer RNN, input is a full time series, e.g. \ :code:`[4, 5, 2, 0, 9, 8, 1, 4]`\ .
+
+* As for the hierarchical RNN, the input is a hierarchical time series formed by arbitrarily grouping the data of the single-layer RNN into subsequences, e.g. :code:`[ [4, 5, 2], [0, 9], [8, 1, 4]]`.
+
+Model configuration
+-------------------
+
+We select the different parts between single-layer RNN and hierarchical RNN configurations, to compare and analyze the reason why they have same semantic meanings. 
+
+- single-layer RNN: the input passes through a simple recurrent_group. At each time step, the current input y and the previous time step's output rnn_state pass through a fully-connected layer.
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf
+    :language: python
+    :lines: 36-48
+
+- hierarchical RNN, the outer layer's memory is an element. 
+
+  - The recurrent_group of the inner layer's inner_step is nearly the same as that of the single-layer sequence, except for boot_layer=outer_mem, which means using the outer layer's outer_mem as the initial state of the inner layer's memory. In the outer layer's outer_step, outer_mem is the last vector of a subsequence; that is, the whole hierarchical group uses the last vector of the previous subsequence as the initial state of the next subsequence's memory.
+  - In terms of input data, the sentences for the single-layer and hierarchical RNNs are the same. The only difference is that the hierarchical RNN disassembles the sequence into subsequences. So in the hierarchical RNN configuration, we must use the last element of the previous subsequence as the boot_layer for the memory of the next subsequence, so that it is consistent with "every time step uses the output of the previous time step" in the single-layer RNN configuration.
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf
+    :language: python
+    :lines: 39-66
+
+..  warning::
+    Currently PaddlePaddle only supports the case that the lengths of the time series of Memory in each time step are the same. 
+
+Example 3: Hierarchical RNN with unequal length inputs
+======================================================
+
+.. role:: red
+
+.. raw:: html
+
+    <style> .red {color:red} </style>
+
+**Unequal length inputs** means that, among the multiple input sequences of recurrent_group, the lengths of the subsequences at each time step can be unequal. However, the sequence information of the output needs to be consistent with that of one of the input sequences. :red:`targetInlink` specifies which input sequence the output sequence is consistent with; by default it is the first input.
+
+The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ .
+
+The data for the configurations of Example 3's single-layer RNN and hierarchical RNN are exactly the same. 
+
+* For the single-layer RNN, the data has two samples, which are :code:`[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]` and :code:`[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]`. Each sample for the single-layer RNN has two groups of features.
+
+* On the basis of the single-layer RNN's data, the hierarchical RNN's data arbitrarily adds some partitions. For example, the second sample is transformed to :code:`[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]`.
+
+* Note that PaddlePaddle currently only supports multi-input hierarchical RNNs whose inputs have the same number of subsequences. In this example, the two features both have 3 subsequences. Although the length of each subsequence can be different, the number of subsequences must be the same.
+
+
+Model configuration
+-------------------
+
+Similar to Example 2's configuration, Example 3's configuration uses single-layer and hierarchical RNN to implement 2 fully-equivalent fully-connected RNNs. 
+
+* single-layer RNN\:
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 42-59
+    :linenos:
+
+* hierarchical RNN\ \:
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 41-80
+    :linenos:
+
+In the above code, the usage of the single-layer and hierarchical RNNs is similar to that in Example 2; the difference is that two inputs are processed simultaneously. As for the hierarchical RNN, the subsequence lengths of the two inputs are not equal. But we use the parameter :code:`targetInlink` to set the output format of the outer :code:`recurrent_group`, so the shape of the outer layer's output sequence is the same as that of :code:`emb2`.
+
+
+Glossary
+========
+
+..  _glossary_memory:
+
+Memory
+------
+
+Memory is a concept used by PaddlePaddle when implementing RNNs. An RNN, or recurrent neural network, usually requires some dependency between time steps; that is, the neural network at the current time step depends on the output of some neuron in the neural network at the previous time step, as the following figure shows:
+
+..  graphviz:: src/glossary_rnn.dot
+
+The dotted connections in the figure are the network connections across time steps. When PaddlePaddle implements an RNN, such a connection across time steps is implemented using a special neural network unit called Memory. Memory can cache the output of a neuron at the previous time step and then pass it as input to another neuron at the next time step. The implementation of an RNN using Memory is as follows:
+
+..  graphviz:: src/glossary_rnn_with_memory.dot
+
+With this method, PaddlePaddle can easily determine which outputs should cross time steps, and which should not. 
+
+..  _glossary_timestep:
+
+time step
+---------
+
+See time series.
+
+
+..  _glossary_sequence:
+
+time series
+-----------
+
+A time series is a series of feature data in which the order of the data is meaningful. So it is a list of features, not a set of features. Each element of this list, i.e. each piece of feature data in the series, is called a time step. Note that the concepts of time series and time step are not necessarily related to "time". As long as the "order" in a series of feature data is meaningful, it can be used as a time series input.
+
+For example, in a text classification task, we usually regard a sentence as a time series. Each word in the sentence is converted to its index in the word table, so the sentence can be represented as a list of these indices, e.g. :code:`[9, 2, 3, 5, 3]`.
+
+For a more detailed and accurate definition of the time series, please refer to `Wikipedia of Time series <https://en.wikipedia.org/wiki/Time_series>`_  or `Chinese Wikipedia of time series <https://zh.wikipedia.org/wiki/%E6%99%82%E9%96%93%E5%BA%8F%E5%88%97>`_  . 
+
+In addition, Paddle often refers to a time series as a :code:`Sequence`. They are the same concept in Paddle's documentation and APIs.
+
+..  _glossary_RNN:
+
+RNN
+---
+
+In PaddlePaddle's documentations, RNN is usually represented as :code:`Recurrent neural network` . For more information, please refer to `Wikipedia Recurrent neural network <https://en.wikipedia.org/wiki/Recurrent_neural_network>`_ or `Chinese Wikipedia <https://zh.wikipedia.org/wiki/%E9%80%92%E5%BD%92%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C>`_ . 
+
+In PaddlePaddle, RNN usually means, for the input data of a time series, the neural network between each time steps has a certain relevance. For example, the input of a certain neuron is the output of a certain neuron in the neural network of the last time step. Or, as for each time step, the network structure of the neural network has a directed ring structure. 
+
+..  _glossary_hierarchical_RNN:
+
+hierarchical RNN
+----------------
+
+Hierarchical RNN, as the name suggests, means there is a nested relationship between RNNs. The input data as a whole is a time series, and each element of the inner feature data is also a time series; in other words, it is a 2-dimensional array, or an array of arrays. A hierarchical RNN is a neural network structure that can process this type of input data.
+
+For example, consider text classification of a paragraph, i.e. classifying a paragraph of sentences. We can treat a paragraph as an array of sentences, and each sentence as an array of words. This is a type of input data for the hierarchical RNN. We encode each sentence of this paragraph into a vector using LSTM, then encode the sequence of sentence vectors into a vector for the paragraph using LSTM. Finally, we use this paragraph vector to perform classification. This is the neural network structure of this hierarchical RNN.
+
diff --git a/doc/v2/howto/rnn/index_cn.rst b/doc/v2/howto/rnn/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2032fb9e296ab024c68da1348064580c8c88d5be
--- /dev/null
+++ b/doc/v2/howto/rnn/index_cn.rst
@@ -0,0 +1,34 @@
+RNN模型
+===========
+循环神经网络（RNN）是对序列数据建模的重要工具。PaddlePaddle提供了灵活的接口以支持复杂循环神经网络的构建。
+这里将分为以下四个部分详细介绍如何使用PaddlePaddle搭建循环神经网络。
+
+第一部分由浅入深的展示了使用PaddlePaddle搭建循环神经网络的全貌：首先以简单的循环神经网络（vanilla RNN）为例，
+说明如何封装配置循环神经网络组件；然后更进一步的通过序列到序列（sequence to sequence）模型，逐步讲解如何构建完整而复杂的循环神经网络模型。
+
+..  toctree::
+  :maxdepth: 1
+
+  rnn_config_cn.rst
+
+Recurrent Group是PaddlePaddle中实现复杂循环神经网络的关键，第二部分阐述了PaddlePaddle中Recurrent Group的相关概念和原理，
+对Recurrent Group接口进行了详细说明。另外，对双层RNN（对应的输入为双层序列）及Recurrent Group在其中的使用进行了介绍。
+
+..  toctree::
+  :maxdepth: 1
+
+  recurrent_group_cn.md
+
+第三部分对双层序列进行了解释说明，列出了PaddlePaddle中支持双层序列作为输入的Layer，并对其使用进行了逐一介绍。
+
+..  toctree::
+  :maxdepth: 1
+
+  hierarchical_layer_cn.rst
+
+第四部分以PaddlePaddle的双层RNN单元测试中的网络配置为示例，辅以效果相同的单层RNN网络配置作为对比，讲解了多种情况下双层RNN的使用。
+
+..  toctree::
+  :maxdepth: 1
+
+  hrnn_rnn_api_compare_cn.rst
diff --git a/doc/v2/howto/rnn/index_en.rst b/doc/v2/howto/rnn/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6e8b5c61b23ca2725dc0c9761c8dd4165033973c
--- /dev/null
+++ b/doc/v2/howto/rnn/index_en.rst
@@ -0,0 +1,32 @@
+RNN Models
+==========
+Recurrent neural networks(RNN) are an important tool to model sequential data. PaddlePaddle provides flexible interface for building complex recurrent neural network. We will demonstrate how to use PaddlePaddle to build RNN models in the following 4 parts.
+
+In the first part, we will guide you through how to configure recurrent neural networks in PaddlePaddle, from simple to complex. First, we will use a vanilla recurrent neural network as an example to show how to configure a recurrent neural network architecture. Then we will use the sequence to sequence model as an example to demonstrate how you can configure complex recurrent neural network models step by step.
+
+..  toctree::
+  :maxdepth: 1
+
+  rnn_config_en.rst
+
+Recurrent Group is the key unit to build complex recurrent neural network models. The second part describes the related concepts and basic principles of Recurrent Group, and gives a detailed description of the Recurrent Group API. In addition, it also introduces the sequence-level RNN (with hierarchical sequences as input) and the usage of Recurrent Group in it.
+
+..  toctree::
+  :maxdepth: 1
+  
+  recurrent_group_en.md
+  
+In the third part, two-level sequence is demonstrated briefly and then layers supporting two-level sequence as input are listed and described respectively.
+
+..  toctree::
+  :maxdepth: 1
+  
+  hierarchical_layer_en.rst
+
+In the last part, the network configurations in the unit tests of hierarchical RNN are presented as examples to explain how to use hierarchical RNN in various cases. Each two-level sequence RNN configuration is compared with a single-layer sequence RNN configuration that has the same effect.
+
+..  toctree::
+  :maxdepth: 1
+  
+  hrnn_rnn_api_compare_en.rst
+
diff --git a/doc/v2/howto/rnn/recurrent_group_cn.md b/doc/v2/howto/rnn/recurrent_group_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..06dc9e089ab2b2b926fcb1bd034262f2c846f06f
--- /dev/null
+++ b/doc/v2/howto/rnn/recurrent_group_cn.md
@@ -0,0 +1,96 @@
+# Recurrent Group教程
+
+## 概述
+
+序列数据是自然语言处理任务面对的一种主要输入数据类型。
+
+一句话是由词语构成的序列，多句话进一步构成了段落。因此，段落可以看作是一个嵌套的双层的序列，这个序列的每个元素又是一个序列。
+
+双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式，帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入，我们可以设计搭建一个灵活的、层次化的RNN，分别从词语和句子级别编码输入数据，同时也能够引入更加复杂的记忆机制，更好地完成一些复杂的语言理解任务。
+
+在PaddlePaddle中，`recurrent_group`是一种任意复杂的RNN单元，用户只需定义RNN在一个时间步内完成的计算，PaddlePaddle负责完成信息和误差在时间序列上的传播。
+
+更进一步，`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算，最终实现一个层次化的复杂RNN。
+
+目前，在PaddlePaddle中，能够对双层序列进行处理的有`recurrent_group`和部分Layer，具体可参考文档：<a href = "hierarchical_layer_cn.html">支持双层序列作为输入的Layer</a>。
+ 
+## 相关概念
+
+### 基本原理
+`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算，PaddlePaddle负责完成信息和梯度在时间序列上的传播。
+
+PaddlePaddle中，`recurrent_group`的一个简单调用如下：
+
+``` python
+recurrent_group(step, input, reverse)
+```
+- step：一个可调用的函数，定义一个时间步之内RNN单元完成的计算
+- input：输入，必须是一个单层序列，或者一个双层序列
+- reverse：是否以逆序处理输入序列
+ 
+使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer，完成任意的运算逻辑。`recurrent_group` 的输入（即input）会成为step函数的输入，由于step 函数只关注于RNN一个时间步之内的计算，在这里`recurrent_group`替我们完成了原始输入数据的拆分。
+
+### 输入
+`recurrent_group`处理的输入序列主要分为以下三种类型：
+ 
+- **数据输入**：一个双层序列进入`recurrent_group`会被拆解为一个单层序列，一个单层序列进入`recurrent_group`会被拆解为非序列，然后交给step函数，这一过程对用户是完全透明的。可以有以下两种：1）通过data_layer拿到的用户输入；2）其它layer的输出。
+		
+- **只读Memory输入**：`StaticInput` 定义了一个只读的Memory，由`StaticInput`指定的输入不会被`recurrent_group`拆解，`recurrent_group` 循环展开的每个时间步总是能够引用所有输入，可以是一个非序列，或者一个单层序列。
+	  
+- **序列生成任务的输入**：`GeneratedInput`只用于在序列生成任务中指定输入数据。
+
+### 输入示例
+
+序列生成任务大多遵循encoder-decoder架构，encoder和decoder可以是能够处理序列的任意神经网络单元，而RNN是最流行的选择。
+
+给定encoder输出和当前词，decoder每次预测产生下一个最可能的词语。在这种结构中，decoder接受两个输入：
+    
+- 要生成的目标序列：是decoder的数据输入，也是decoder循环展开的依据，`recurrent_group`会对这类输入进行拆解。
+
+- encoder输出，可以是一个非序列，或者一个单层序列：是一个unbounded memory，decoder循环展开的每一个时间步会引用全部结果，不应该被拆解，这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turing Machines](https://arxiv.org/abs/1410.5401)。
+		
+在序列生成任务中，decoder RNN总是引用上一时刻预测出的词的词向量，作为当前时刻输入。`GeneratedInput`自动完成这一过程。
+		 
+### 输出
+`step`函数必须返回一个或多个Layer的输出，这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中，`recurrent_group` 会将每个时间步的输出拼接，这个过程对用户也是透明的。
+
+### memory
+memory只能在`recurrent_group`中定义和使用。memory不能独立存在，必须指向一个PaddlePaddle定义的Layer。引用memory得到这个layer上一时刻的输出，因此，可以将memory理解为一个时延操作。
+
+可以显式地指定一个layer的输出用于初始化memory。不指定时，memory默认初始化为0。
+
+## 双层RNN介绍
+`recurrent_group`帮助我们完成对输入序列的拆分，对输出的合并，以及计算逻辑在序列上的循环展开。
+
+利用这种特性，两个嵌套的`recurrent_group`能够处理双层序列，实现词语和句子两个级别的双层RNN结构。
+
+- 单层（word-level）RNN：每个状态（state）对应一个词（word）。
+- 双层（sequence-level）RNN：一个双层RNN由多个单层RNN组成，每个单层RNN（即双层RNN的每个状态）对应一个子句（subseq）。
+
+为了描述方便，下文以NLP任务为例，将含有子句（subseq）的段落定义为一个双层序列，将含有词语的句子定义为一个单层序列，那么0层序列即为一个词语。
+
+## 双层RNN的使用
+
+### 训练流程的使用方法
+使用 `recurrent_group`需要遵循以下约定：
+ 
+- **单进单出**：输入和输出都是单层序列。
+  - 如果有多个输入，不同输入序列含有的词语数必须严格相等。
+  - 输出一个单层序列，输出序列的词语数和输入序列一致。
+  - memory：在step函数中定义 memory指向一个layer，通过引用memory得到这个layer上一个时刻输出，形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory，每个时间步之内的运算是独立的。
+  - boot_layer：memory的初始状态，默认初始状态为0，memory的is_seq参数必须为false。
+ 
+- **双进双出**：输入和输出都是双层序列。
+  - 如果有多个输入序列，不同输入含有的子句（subseq）数必须严格相等，但子句含有的词语数可以不相等。
+  - 输出一个双层序列，子句（subseq）数、子句的单词数和指定的一个输入序列一致，默认为第一个输入。
+  - memory：在step函数中定义memory，指向一个layer，通过引用memory得到这个layer上一个时刻的输出，形成recurrent连接。定义在外层`recurrent_group` step函数中的memory，能够记录上一个subseq 的状态，可以是一个单层序列（只作为read-only memory），也可以是一个词语。如果没有定义memory，那么 subseq 之间的运算是独立的。
+  - boot_layer：memory 初始状态，可以是一个单层序列（只作为read-only memory）或一个向量。默认不设置，即初始状态为0。
+
+- **双进单出**：目前还未支持，会报错"In hierachical RNN, all out links should be from sequences now"。
+ 
+
+### 生成流程的使用方法
+使用`beam_search`需要遵循以下约定：
+
+- 单层RNN：从一个word生成下一个word。
+- 双层RNN：即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看，也不存在一个subseq直接生成下一个subseq的情况。
diff --git a/doc/v2/howto/rnn/recurrent_group_en.md b/doc/v2/howto/rnn/recurrent_group_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..de6b60f29eb97029a54609cd2194bb7faf3ffec5
--- /dev/null
+++ b/doc/v2/howto/rnn/recurrent_group_en.md
@@ -0,0 +1,96 @@
+# Recurrent Group Tutorial
+
+## Overview
+
+Sequential data is common in natural language processing.
+
+A sentence is a sequence of words, and many sentences in turn form a paragraph. Therefore, a paragraph can be viewed as a nested sequence with two levels, where each element of the sequence is another sequence. That is to say, sequential data can be recursive. An example of two-level recursive sequential data is an article composed of a sequence of sentences, each of which is a sequence of words.
+
+PaddlePaddle and PaddlePaddle v2 support two-level recursive sequential data. The two-level sequence is a very flexible data format, which helps us better describe more complex language data such as paragraphs and multi-round dialogues. Based on two-level sequence input, we can design and build a flexible, hierarchical RNN model that encodes input data at both the word and sentence levels. For the support of arbitrary levels, please refer to PaddlePaddle Fluid.
+
+In PaddlePaddle, `recurrent_group` is an arbitrarily complex RNN unit. The user only needs to define the calculation that the RNN will complete in one time step. PaddlePaddle is responsible for the propagation of information and error in time series.
+
+Furthermore, `recurrent_group` can also be extended to handle two-level sequence. By defining two nested `recurrent_group` operations at the clause level and the word level respectively, a hierarchical and complex RNN is finally achieved.
+
+Currently, in PaddlePaddle, `recurrent_group` and some Layers can process two-level sequences. For details, refer to the document: <a href = "hierarchical_layer_en.html">Layers for supporting double-layer sequences as input.</a>
+
+## Related Concepts
+
+### Basic Principle 
+`recurrent_group` is an arbitrarily complex RNN unit supported by PaddlePaddle. The user only needs to focus on the calculations that the RNN is designed to complete within a single time step. The PaddlePaddle is responsible for completing the propagation of information and gradients over time.
+
+In PaddlePaddle, a simple call to `recurrent_group` is as follows:
+
+``` python 
+recurrent_group(step, input, reverse) 
+```
+- step: A callable function that defines the calculations completed by the RNN unit within a time step
+- input: The input must be a single-layer sequence or a double-layer sequence
+- reverse: Whether to process the input sequence in reverse order
+
+The core of using `recurrent_group` is to design the logic of the step function. The step function can be freely combined with various layers supported by PaddlePaddle to complete arbitrary arithmetic logic. The input of `recurrent_group` (input) becomes the input of the step function. Since the step function only focuses on the calculation within one time step of RNN, here `recurrent_group` completes the splitting of the original input data for us.
+
+### Input
+The input sequence processed by `recurrent_group` is mainly divided into the following three types:
+
+- **Input Data**: When putting a two-level sequence into `recurrent_group`, it will be disassembled into a single-level sequence. When putting a single-level sequence into `recurrent_group`, it will be disassembled into a non-sequence and then passed to the step function. This process is completely transparent to the user. There are two possible types: 1) User input via data_layer; 2) Output from other layers.
+		
+- **Read-only Memory Input**: `StaticInput` defines a read-only Memory. The input specified by `StaticInput` will not be disassembled by `recurrent_group`, and each time step of the `recurrent_group` loop will always be able to reference all inputs. It may be a non-sequence or a single-layer sequence.
+	  
+- **Input of Sequence Generation Task**: `GeneratedInput` is only used to specify input data in a sequence generation task.
+
+### Input Example
+
+Sequence generation tasks mostly follow the encoder-decoder architecture. The encoder and decoder can be arbitrary neural network units capable of processing sequences, and RNN is the most popular choice.
+
+Given the encoder output and the current word, the decoder predicts the next most likely word each time. In this structure, the decoder accepts two inputs:
+
+- Target sequence to be generated: an input of the decoder and the basis of the decoder loop. `recurrent_group` will disassemble this input type.
+
+- Encoder output, a non-sequence or a single-level sequence: an unbounded memory. Each time step in the decoder loop will reference the entire result and should not be disassembled. This type of input must be specified via `StaticInput`. For more discussion on Unbounded Memory, please refer to the paper [Neural Turing Machines](https://arxiv.org/abs/1410.5401).
+
+In a sequence generation task, the decoder RNN always refers to the word vector of the word predicted at the previous moment as the current time input. `GeneratedInput` will automate this process.
+
+### Output
+The `step` function must return the output of one or more Layers. The output of this Layer will be the final output of the entire `recurrent_group`. In the output process, `recurrent_group` will concatenate the output of each time step, which is also transparent to the user.
+
+### Memory
+Memory can only be defined and used in `recurrent_group`. Memory cannot exist independently and must point to a layer defined by PaddlePaddle. Referencing a memory yields the output of that layer at the previous time step, so memory can be interpreted as a delay operation.
+
+The user can explicitly specify the output of a layer to initialize the memory. When not specified, memory is initialized to 0 by default.
+
+## Sequence-level RNN Introduction
+
+`recurrent_group` helps us to split the input sequence, merge the output, and loop through the sequence of computational logic.
+
+Using this feature, the two nested `recurrent_group` can handle the nested two-level sequences, implementing sequence-level RNN structures at both the word and sentence levels.
+
+- Word-level RNN:  each state corresponds to a word.
+- Sequence-level RNN: a sequence-level RNN consists of multiple word-level RNNs. Each word-level RNN (i.e., each state of the sequence-level RNN) corresponds to a subsequence.
+
+For convenience of description, the following takes the NLP task as an example. A paragraph containing a subsequence is defined as a two-level sequence, and a sentence containing a word is defined as a single-layer sequence. Then, the zero-level sequence is a word.
+
+## Usage of Sequence-level RNN
+
+### Usage of Training Process
+Using `recurrent_group` requires the following conventions:
+
+- **Single-input Single-output**: Both input and output are single layer sequences.
+  - If there are multiple inputs, the number of words in different input sequences must be exactly equal.
+  - A single-layer sequence is output, and the number of words in the output sequence is the same as the input sequence.
+  - memory: define a memory in the step function that points to a layer; referencing the memory yields that layer's output at the previous time step, which forms a recurrent connection. The is_seq parameter of memory must be false. If memory is not defined, the operations within each time step are independent.
+  - boot_layer: the initial state of memory, 0 by default. The is_seq parameter of memory must be false.
+ 
+- **Double-input Double-output**: Both input and output are two-level sequences.
+  - If there are multiple input sequences, the number of subsequences contained in different inputs must be strictly equal, but the number of words in the subsequences need not be equal.
+  - Outputs a two-level sequence. The number of subsequences and the number of words in each subsequence are the same as those of a specified input sequence, which is the first input by default.
+  - memory: define a memory in the step function that points to a layer; referencing the memory yields that layer's output at the previous time step, which forms a recurrent connection. The memory defined in the outer `recurrent_group` step function can record the state of the previous subsequence, either as a single-level sequence (only as read-only memory) or as a word. If memory is not defined, the operations between subsequences are independent.
+  - boot_layer: the initial state of memory. It is either a single-level sequence (only as read-only memory) or a vector. The default is not set, that is, the initial state is 0.
+
+- **Double-input Single-output**: not supported for now; it fails with the error "In hierachical RNN, all out links should be from sequences now".
+ 
+### Usage of Generation Process
+Using `beam_search` requires following these conventions:
+
+- Word-level RNN: generate the next word from a word.
+- Sequence-level RNN: the subsequences generated by the word-level RNN are concatenated into a new two-level sequence. Semantically, there is no case where a subsequence directly generates the next subsequence.
diff --git a/doc/v2/howto/rnn/rnn_config_cn.rst b/doc/v2/howto/rnn/rnn_config_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..63fa161fafed0f3a8ec8799af21304cbec62d813
--- /dev/null
+++ b/doc/v2/howto/rnn/rnn_config_cn.rst
@@ -0,0 +1,261 @@
+RNN配置
+========
+
+本教程将指导你如何在 PaddlePaddle
+中配置循环神经网络（RNN）。PaddlePaddle
+高度支持灵活和高效的循环神经网络配置。 在本教程中，您将了解如何：
+
+-  配置循环神经网络架构。
+-  使用学习完成的循环神经网络模型生成序列。
+
+我们将使用 vanilla 循环神经网络和 sequence to sequence
+模型来指导你完成这些步骤。sequence to sequence
+模型的代码可以在 `book/08.machine_translation <https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation>`_ 找到。
+wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/wmt14.py>`_ 。
+
+配置循环神经网络架构
+--------------------
+
+简单门控循环神经网络(Gated Recurrent Neural Network)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
+
+.. image:: src/bi_lstm.jpg
+      :align: center
+
+一般来说，循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。
+
+.. math::
+
+    x_{t+1} = f_x(x_t), y_t = f_y(x_t)
+
+其中 :math:`f_x(.)` 称为\ **单步函数**\ （即单时间步执行的函数，step
+function），而 :math:`f_y(.)` 称为\ **输出函数**\ 。在 vanilla
+循环神经网络中，单步函数和输出函数都非常简单。然而，PaddlePaddle
+可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to
+sequence
+模型演示如何配置复杂的循环神经网络模型。在本节中，我们将使用简单的
+vanilla
+循环神经网络作为使用\ ``recurrent_group``\ 配置简单循环神经网络的例子。
+注意，如果你只需要使用简单的RNN，GRU或LSTM，那么推荐使用\ ``grumemory``\ 和\ ``lstmemory``\ ，因为它们的计算效率比\ ``recurrent_group``\ 更高。
+
+对于 vanilla RNN，在每个时间步长，\ **单步函数**\ 为：
+
+.. math::
+
+    x_{t+1} = W_x x_t + W_i I_t + b
+
+其中 :math:`x_t` 是RNN状态，并且 :math:`I_t` 是输入，:math:`W_x` 和
+:math:`W_i` 分别是RNN状态和输入的变换矩阵。:math:`b` 是偏差。它的\ **输出函数**\ 只需要 :math:`x_t` 作为输出。
+
+``recurrent_group``\ 是构建循环神经网络的最重要的工具。
+它定义了\ **单步函数**\ ，\ **输出函数**\ 和循环神经网络的输入。注意，这个函数的\ ``step``\ 参数需要实现\ ``step function``\ （单步函数）和\ ``output function``\ （输出函数）：
+
+.. code:: python
+
+    def simple_rnn(input,
+                   size=None,
+                   name=None,
+                   reverse=False,
+                   rnn_bias_attr=None,
+                   act=None,
+                   rnn_layer_attr=None):
+        def __rnn_step__(ipt):
+           out_mem = paddle.layer.memory(name=name, size=size)
+           rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
+                                                 paddle.layer.full_matrix_projection(input=out_mem)],
+                                        name = name,
+                                        bias_attr = rnn_bias_attr,
+                                        act = act,
+                                        layer_attr = rnn_layer_attr,
+                                        size = size)
+           return rnn_out
+        return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
+                                            step=__rnn_step__,
+                                            reverse=reverse,
+                                            input=input)
+
+PaddlePaddle
+使用“Memory”（记忆模块）实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。
+Memory是在单步函数中循环使用的状态，例如 :math:`x_{t+1} = f_x(x_t)` 。
+一个Memory包含\ **输出**\ 和\ **输入**\ 。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有\ **boot
+layer(引导层)**\ ，其输出被用作Memory的初始值。
+在我们的例子中，门控循环单元的输出被用作输出Memory。请注意，\ ``rnn_out``\ 层的名称与\ ``out_mem``\ 的名称相同。这意味着\ ``rnn_out``
+(:math:`x_{t+1}`)的输出被用作\ ``out_mem``\ Memory的\ **输出**\ 。
+
+Memory也可以是序列。在这种情况下，在每个时间步中，我们有一个序列作为循环神经网络的状态。这在构造非常复杂的循环神经网络时是有用的。
+其他高级功能包括定义多个Memory，以及使用子序列来定义分级循环神经网络架构。
+
+我们在函数的结尾返回\ ``rnn_out``\ 。 这意味着 ``rnn_out``
+层的输出被用作门控循环神经网络的\ **输出**\ 函数。
+
+Sequence to Sequence Model with Attention
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+我们将使用 sequence to sequence model with attention
+作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
+
+.. image:: src/encoder-decoder-attention-model.png
+      :align: center
+
+在这个模型中，源序列 :math:`S = \{s_1, \dots, s_T\}` 
+用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态
+:math:`H_S = \{H_1, \dots, H_T\}` 被称为
+*编码向量*\ 。解码器是门控循环神经网络。当解码每一个 :math:`y_t` 时,
+这个门控循环神经网络生成一系列权重  :math:`W_S^t = \{W_1^t, \dots, W_T^t\}` ,
+用于计算编码向量的加权和。加权和用来生成 :math:`y_t` 。
+
+模型的编码器部分如下所示。它调用\ ``grumemory``\ 来表示门控循环神经网络。如果网络架构简单，那么这是推荐的循环神经网络使用方式，因为它比
+``recurrent_group``
+更快。我们已经实现了大多数常用的循环神经网络架构，可以参考 :ref:`api_trainer_config_helpers_layers` 了解更多细节。
+
+我们还将编码向量投射到 ``decoder_size``
+维空间。这通过获得反向循环网络的第一个实例，并将其投射到
+``decoder_size`` 维空间完成：
+
+.. code:: python
+
+    # 定义源语句的数据层
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
+    # 计算每个词的词向量
+    src_embedding = paddle.layer.embedding(
+        input=src_word_id,
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+    # 应用前向循环神经网络
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
+    # 应用反向递归神经网络（reverse=True表示反向循环神经网络）
+    src_backward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size, reverse=True)
+    # 将循环神经网络的前向和反向部分混合在一起
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
+
+    # 投射编码向量到 decoder_size
+    encoded_proj = paddle.layer.mixed(
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))
+
+    # 计算反向RNN的第一个实例
+    backward_first = paddle.layer.first_seq(input=src_backward)
+
+    # 投射反向RNN的第一个实例到 decoder size
+    decoder_boot = paddle.layer.mixed(
+       size=decoder_size,
+       act=paddle.activation.Tanh(),
+       input=paddle.layer.full_matrix_projection(backward_first))
+
+解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在
+``gru_decoder_with_attention`` 中定义：
+
+.. code:: python
+
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+    trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+    # 把目标词向量作为第三个输入追加到 group_inputs（对应单步函数的 current_word 参数）
+    group_inputs.append(trg_embedding)
+
+    # 对于配备有注意力机制的解码器，在训练中，
+    # 目标向量（groundtruth）是数据输入，
+    # 而源序列的编码向量可以被无边界的memory访问
+    # StaticInput 意味着不同时间步的输入都是相同的值，
+    # 否则它以一个序列输入，不同时间步的输入是不同的。
+    # 所有输入序列应该有相同的长度。
+    decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
+
+单步函数的实现如下所示。首先，它定义解码网络的\ **Memory**\ 。然后定义
+attention，门控循环单元单步函数和输出函数：
+
+.. code:: python
+
+    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
+        # 定义解码器的Memory
+        # Memory的输出定义在 gru_step 内
+        # 注意 gru_step 应该与它的Memory名字相同
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+        # 计算 attention 加权编码向量
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
+        # 混合当前词向量和attention加权编码向量
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])
+        # 定义门控循环单元循环神经网络单步函数
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
+        # 定义输出函数
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
+        return out
+
+生成序列
+--------
+
+训练模型后，我们可以使用它来生成序列。通常的做法是使用\ **beam search**
+生成序列。以下代码片段定义 beam search 算法。注意，\ ``beam_search``
+函数假设 ``step`` 的输出函数返回的是下一个时刻输出词的 softmax
+归一化概率向量。我们对模型进行了以下更改。
+
+-  使用 ``GeneratedInput`` 来表示 trg\_embedding。 ``GeneratedInput``
+   将上一时间步所生成的词的向量来作为当前时间步的输入。
+-  使用 ``beam_search`` 函数。这个函数需要设置：
+
+   -  ``bos_id``: 开始标记。每个句子都以开始标记开头。
+   -  ``eos_id``: 结束标记。每个句子都以结束标记结尾。
+   -  ``beam_size``: beam search 算法中的beam大小。
+   -  ``max_length``: 生成序列的最大长度。
+
+代码如下：
+
+.. code:: python
+
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+    # 在生成时，解码器基于编码源序列和最后生成的目标词预测下一目标词。
+    # 编码源序列（编码器输出）必须由只读Memory的 StaticInput 指定。
+    # 这里， GeneratedInputs 自动获取上一个生成的词，并在最开始初始化为起始词，如 <s>。
+    trg_embedding = paddle.layer.GeneratedInput(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
+    group_inputs.append(trg_embedding)
+    beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0, # Beginning token.
+            eos_id=1, # End of sentence token.
+            beam_size=beam_size,
+            max_length=max_length)
+
+    return beam_gen
+
+注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 `book/06.understand_sentiment <https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment>`_ 了解更多详细信息。
+
+完整的配置文件在 `book/08.machine_translation/train.py <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py>`_ 。
diff --git a/doc/v2/howto/rnn/rnn_config_en.rst b/doc/v2/howto/rnn/rnn_config_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f92edd108ff5c10a31b5f181f0f6dcb7a3f119f3
--- /dev/null
+++ b/doc/v2/howto/rnn/rnn_config_en.rst
@@ -0,0 +1,235 @@
+RNN Configuration
+=================
+
+This tutorial will guide you through configuring recurrent neural networks in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configuration. In this tutorial, you will learn how to:
+
+- configure recurrent neural network architecture.
+- generate sequence with learned recurrent neural network models.
+
+We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at `book/08.machine_translation <https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation>`_ .
+And the data preparation of this model can be found at `python/paddle/v2/dataset/wmt14.py <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/wmt14.py>`_ 
+
+===============================================
+Configure Recurrent Neural Network Architecture
+===============================================
+
+-------------------------------------
+Simple Gated Recurrent Neural Network
+-------------------------------------
+
+A recurrent neural network processes a sequence sequentially, one time step at a time. An example of the architecture of LSTM is listed below.
+
+.. image:: src/bi_lstm.jpg
+     :align: center
+
+Generally speaking, a recurrent network performs the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`.
+
+.. math::
+
+    x_{t+1} = f_x(x_t), y_t = f_y(x_t)
+
+
+where :math:`f_x(.)` is called **step function**, and :math:`f_y(.)` is called **output function**. In vanilla recurrent neural network, both the step function and the output function are very simple. However, PaddlePaddle supports the configuration of very complex architectures by modifying these two functions. We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. In this section, we will use a simple vanilla recurrent neural network as an example of configuring simple recurrent neural network using :code:`recurrent_group`. Notice that if you only need to use simple RNN, GRU, or LSTM, then :code:`grumemory` and :code:`lstmemory` are recommended because they are more computationally efficient than :code:`recurrent_group`.
+
+For vanilla RNN, at each time step, the **step function** is:
+
+.. math::
+
+    x_{t+1} = W_x x_t + W_i I_t + b
+
+where :math:`x_t` is the RNN state, and :math:`I_t` is the input, :math:`W_x` and :math:`W_i` are transformation matrices for RNN states and inputs, respectively. :math:`b` is the bias.
+Its **output function** simply takes :math:`x_t` as the output.
+
+:code:`recurrent_group` is the most important tool for constructing recurrent neural networks. It defines the **step function**, **output function** and the inputs of the recurrent neural network. Notice that the :code:`step` argument of this function implements both the :code:`step function` and the :code:`output function`:
+
+.. code-block:: python
+
+    def simple_rnn(input,
+                   size=None,
+                   name=None,
+                   reverse=False,
+                   rnn_bias_attr=None,
+                   act=None,
+                   rnn_layer_attr=None):
+        def __rnn_step__(ipt):
+           out_mem = paddle.layer.memory(name=name, size=size)
+           rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
+                                                 paddle.layer.full_matrix_projection(input=out_mem)],
+                                        name = name,
+                                        bias_attr = rnn_bias_attr,
+                                        act = act,
+                                        layer_attr = rnn_layer_attr,
+                                        size = size)
+           return rnn_out
+        return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
+                                            step=__rnn_step__,
+                                            reverse=reverse,
+                                            input=input)
+
+
+PaddlePaddle uses memory to construct the step function. **Memory** is the most important concept when constructing recurrent neural networks in PaddlePaddle. A memory is a state that is used recurrently in step functions, such as :math:`x_{t+1} = f_x(x_t)`. One memory contains an **output** and an **input**. The output of memory at the current time step is utilized as the input of the memory at the next time step. A memory can also have a **boot layer**, whose output is utilized as the initial value of the memory. In our case, the output of the gated recurrent unit is employed as the output memory. Notice that the name of the layer :code:`rnn_out` is the same as the name of :code:`out_mem`. This means the output of the layer :code:`rnn_out` (:math:`x_{t+1}`) is utilized as the **output** of :code:`out_mem` memory.
+
+A memory can also be a sequence. In this case, at each time step, we have a sequence as the state of the recurrent neural network. This can be useful when constructing very complex recurrent neural network. Other advanced functions include defining multiple memories, and defining hierarchical recurrent neural network architecture using sub-sequence.
+
+We return :code:`rnn_out` at the end of the function. It means that the output of the layer :code:`rnn_out` is utilized as the **output** function of the gated recurrent neural network.
+
+-----------------------------------------
+Sequence to Sequence Model with Attention
+-----------------------------------------
+We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. An illustration of the sequence to sequence model with attention is shown in the following figure.
+
+.. image:: src/encoder-decoder-attention-model.png
+      :align: center
+
+In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural network. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` are called the *encoder vector*. The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`.
+
+The encoder part of the model is listed below. It calls :code:`grumemory` to represent gated recurrent neural network. It is the recommended way of using recurrent neural network if the network architecture is simple, because it is faster than :code:`recurrent_group`. We have implemented most of the commonly used recurrent neural network architectures; you can refer to :ref:`api_trainer_config_helpers_layers` for more details.
+
+We also project the encoder vector to :code:`decoder_size` dimensional space, get the first instance of the backward recurrent network, and project it to :code:`decoder_size` dimensional space:
+
+.. code-block:: python
+
+    # Define the data layer of the source sentence.
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
+    # Calculate the word embedding of each word.
+    src_embedding = paddle.layer.embedding(
+        input=src_word_id,
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+    # Apply forward recurrent neural network.
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
+    # Apply backward recurrent neural network. reverse=True means backward recurrent neural network.
+    src_backward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size, reverse=True)
+    # Mix the forward and backward parts of the recurrent neural network together.
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
+
+    # Project encoding vector to decoder_size.
+    encoded_proj = paddle.layer.mixed(
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))
+
+    # Compute the first instance of the backward RNN.
+    backward_first = paddle.layer.first_seq(input=src_backward)
+
+    # Project the first instance of backward RNN to decoder size.
+    decoder_boot = paddle.layer.mixed(
+       size=decoder_size,
+       act=paddle.activation.Tanh(),
+       input=paddle.layer.full_matrix_projection(backward_first))
+
+
+The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`:
+
+.. code-block:: python
+
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+    trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+    # Append the target embedding as the third input (current_word in the step function).
+    group_inputs.append(trg_embedding)
+
+    # For decoder equipped with attention mechanism, in training,
+    # target embedding (the groundtruth) is the data input,
+    # while encoded source sequence is accessed as an unbounded memory.
+    # StaticInput means the same value is utilized at different time steps.
+    # Otherwise, it is a sequence input. Inputs at different time steps are different.
+    # All sequence inputs should have the same length.
+    decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
+
+
+The implementation of the step function is listed as below. First, it defines the **memory** of the decoder network. Then it defines attention, gated recurrent unit step function, and the output function:
+
+.. code-block:: python
+
+    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
+        # Defines the memory of the decoder.
+        # The output of this memory is defined in gru_step.
+        # Notice that the name of gru_step should be the same as the name of this memory.
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+        # Compute attention weighted encoder vector.
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
+        # Mix the current word embedding and the attention weighted encoder vector.
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])
+        # Define Gated recurrent unit recurrent neural network step function.
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
+        # Defines the output function.
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
+        return out
+
+
+=================
+Generate Sequence
+=================
+After training the model, we can use it to generate sequences. A common practice is to use **beam search** to generate sequences. The following code snippet defines a beam search algorithm. Notice that :code:`beam_search` function assumes the output function of the :code:`step` returns a softmax normalized probability vector of the next token. We made the following changes to the model.
+
+* use :code:`GeneratedInput` for trg_embedding. :code:`GeneratedInput` computes the embedding of the generated token at the last time step for the input at the current time step.
+* use :code:`beam_search` function. This function needs to set:
+
+  - :code:`bos_id`: the start token. Every sentence starts with the start token.
+  - :code:`eos_id`: the end token. Every sentence ends with the end token.
+  - :code:`beam_size`: the beam size used in beam search.
+  - :code:`max_length`: the maximum length of the generated sentences.
+    
+The code is listed below:
+
+.. code-block:: python
+
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+    # In generation, decoder predicts a next target word based on
+    # the encoded source sequence and the last generated target word.
+    # The encoded source sequence (encoder's output) must be specified by
+    # StaticInput which is a read-only memory.
+    # Here, GeneratedInputs automatically fetches the last generated word,
+    # which is initialized by a start mark, such as <s>.
+    trg_embedding = paddle.layer.GeneratedInput(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
+    group_inputs.append(trg_embedding)
+    beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0, # Beginning token.
+            eos_id=1, # End of sentence token.
+            beam_size=beam_size,
+            max_length=max_length)
+
+    return beam_gen
+
+
+Notice that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment <https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment>`_ for more details.
+
+The full configuration file is located at `book/08.machine_translation/train.py <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py>`_ .
diff --git a/doc/v2/howto/rnn/src/bi_lstm.jpg b/doc/v2/howto/rnn/src/bi_lstm.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..adec1606d64d6e35ffe7e62abfa9a09309b05c84
Binary files /dev/null and b/doc/v2/howto/rnn/src/bi_lstm.jpg differ
diff --git a/doc/v2/howto/rnn/src/encoder-decoder-attention-model.png b/doc/v2/howto/rnn/src/encoder-decoder-attention-model.png
new file mode 100644
index 0000000000000000000000000000000000000000..79f911d4ba12ac0c0d1a936c9df639c302786914
Binary files /dev/null and b/doc/v2/howto/rnn/src/encoder-decoder-attention-model.png differ
diff --git a/doc/v2/howto/rnn/src/glossary_rnn.dot b/doc/v2/howto/rnn/src/glossary_rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..2cd0fb1820c44b0e8e0b869f9d39fcad27efa758
--- /dev/null
+++ b/doc/v2/howto/rnn/src/glossary_rnn.dot
@@ -0,0 +1,42 @@
+digraph G{  // RNN unrolled over three consecutive timesteps: i-1, i and i+1
+	subgraph cluster_timestep0 {  // previous timestep; the "cluster" prefix makes Graphviz draw a box around it
+		label="recurrent timestep i-1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc0_0 [label="fc 0"]
+		fc0_1 [label="fc 1"]
+		fc0_2 [label="fc 2"]
+
+		fc0_0 -> fc0_1  // inside a timestep the layers form the chain fc 0 -> fc 1 -> fc 2
+		fc0_1 -> fc0_2
+	}
+
+	subgraph cluster_timestep1 {  // current timestep
+		label="recurrent timestep i"
+		node [style=filled];
+		fc1_0 [label="fc 0"]
+		fc1_1 [label="fc 1"]
+		fc1_2 [label="fc 2"]
+		color=blue  // this cluster is styled in blue instead of the light-gray background of its neighbours
+
+		fc1_0 -> fc1_1
+		fc1_1 -> fc1_2
+	}
+
+	subgraph cluster_timestep2 {  // next timestep
+		label="recurrent timestep i+1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc2_0 [label="fc 0"]
+		fc2_1 [label="fc 1"]
+		fc2_2 [label="fc 2"]
+
+		fc2_0 -> fc2_1
+		fc2_1 -> fc2_2
+	}
+	
+	
+	fc0_1 -> fc1_1 [style="dotted" constraint=false]  // dotted cross-timestep link from fc 1 to the next step's fc 1; constraint=false keeps it out of rank assignment
+	fc1_1 -> fc2_1 [style="dotted" constraint=false]
+
+}
\ No newline at end of file
diff --git a/doc/v2/howto/rnn/src/glossary_rnn_with_memory.dot b/doc/v2/howto/rnn/src/glossary_rnn_with_memory.dot
new file mode 100644
index 0000000000000000000000000000000000000000..0f101ec2d8f15aec76c57f328046b6b55cf0c7eb
--- /dev/null
+++ b/doc/v2/howto/rnn/src/glossary_rnn_with_memory.dot
@@ -0,0 +1,48 @@
+digraph G{  // RNN unrolled over timesteps i-1, i and i+1, with an explicit "memory" node in each timestep
+	subgraph cluster_timestep0 {  // previous timestep; the "cluster" prefix makes Graphviz draw a box around it
+		label="recurrent timestep i-1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc0_0 [label="fc 0"]
+		fc0_1 [label="fc 1"]
+		fc0_2 [label="fc 2"]
+		m0 [label="memory"]
+		fc0_0 -> fc0_1  // inside a timestep the layers form the chain fc 0 -> fc 1 -> fc 2
+		fc0_1 -> fc0_2
+		fc0_1 -> m0  // fc 1 and the memory node are linked in both directions
+		m0 -> fc0_1
+	}
+
+	subgraph cluster_timestep1 {  // current timestep
+		label="recurrent timestep i"
+		node [style=filled];
+		fc1_0 [label="fc 0"]
+		fc1_1 [label="fc 1"]
+		fc1_2 [label="fc 2"]
+		m1 [label="memory"]
+		color=blue  // this cluster is styled in blue instead of the light-gray background of its neighbours
+		fc1_0 -> fc1_1
+		fc1_1 -> fc1_2
+		fc1_1 -> m1
+		m1 -> fc1_1
+	}
+
+	subgraph cluster_timestep2 {  // next timestep
+		label="recurrent timestep i+1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc2_0 [label="fc 0"]
+		fc2_1 [label="fc 1"]
+		fc2_2 [label="fc 2"]
+		m2 [label="memory"]
+		fc2_0 -> fc2_1
+		fc2_1 -> fc2_2
+		fc2_1 -> m2
+		m2 -> fc2_1
+	}
+	
+	
+	m0 -> m1 [style="dotted" constraint=false]  // dotted cross-timestep link between memory nodes; constraint=false keeps it out of rank assignment
+	m1 -> m2 [style="dotted" constraint=false]
+
+}
\ No newline at end of file
diff --git a/doc/v2/howto/rnn/src/simple_full_hierarchical_recurrent.dot b/doc/v2/howto/rnn/src/simple_full_hierarchical_recurrent.dot
new file mode 100644
index 0000000000000000000000000000000000000000..ff278a0323bb2c3ef07bf6f016a3a8df05783581
--- /dev/null
+++ b/doc/v2/howto/rnn/src/simple_full_hierarchical_recurrent.dot
@@ -0,0 +1,30 @@
+digraph G {  // eight-element chain grouped into three boxed sub-sequences
+  rankdir=LR;  // lay the graph out left to right
+
+  subgraph cluster_t0 {  // first group: 4, 5, 2
+    a [label="4"]
+    b [label="5"]
+    c [label="2"]
+  }
+  
+  subgraph cluster_t1 {  // second group: 0, 9
+    d [label="0"]
+    e [label="9"]
+  }
+
+  subgraph cluster_t2 {  // third group: 8, 1, 4
+    f [label="8"]
+    g [label="1"]
+    h [label="4"]
+  }
+
+  a -> b;
+  b -> c;
+  c -> d [constraint=false];  // edge crossing into the next group; constraint=false keeps it out of rank assignment
+
+  d -> e;
+  e -> f [constraint=false];  // second group-crossing edge, treated the same way
+  
+  f -> g;
+  g -> h;
+}
\ No newline at end of file
diff --git a/doc/v2/howto/rnn/src/simple_full_recurrent.dot b/doc/v2/howto/rnn/src/simple_full_recurrent.dot
new file mode 100644
index 0000000000000000000000000000000000000000..cee281fbac993afbd0cc3416570f95965cdf0a59
--- /dev/null
+++ b/doc/v2/howto/rnn/src/simple_full_recurrent.dot
@@ -0,0 +1,19 @@
+digraph G {  // one flat chain of eight labelled nodes, with no grouping
+  rankdir=LR;  // lay the graph out left to right
+  a [label="4"]
+  b [label="5"]
+  c [label="2"]
+  d [label="0"]
+  e [label="9"]
+  f [label="8"]
+  g [label="1"]
+  h [label="4"]
+
+  a -> b;  // every node points to the next one: 4 -> 5 -> 2 -> 0 -> 9 -> 8 -> 1 -> 4
+  b -> c;
+  c -> d;
+  d -> e;
+  e -> f;
+  f -> g;
+  g -> h;
+}
\ No newline at end of file
diff --git a/doc/v2/images/FullyConnected.jpg b/doc/v2/images/FullyConnected.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b2241f401434e527f95ee4e0e541a3f2ff78fd1e
Binary files /dev/null and b/doc/v2/images/FullyConnected.jpg differ
diff --git a/doc/v2/images/add_security_group.png b/doc/v2/images/add_security_group.png
new file mode 100644
index 0000000000000000000000000000000000000000..bd34f46c9b0ada7027fd53e553e7d033255d25fc
Binary files /dev/null and b/doc/v2/images/add_security_group.png differ
diff --git a/doc/v2/images/bi_lstm.jpg b/doc/v2/images/bi_lstm.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..adec1606d64d6e35ffe7e62abfa9a09309b05c84
Binary files /dev/null and b/doc/v2/images/bi_lstm.jpg differ
diff --git a/doc/v2/images/checkpointing.png b/doc/v2/images/checkpointing.png
new file mode 100644
index 0000000000000000000000000000000000000000..c221e8474f90f37e31416cbb19c9452207a0d14c
Binary files /dev/null and b/doc/v2/images/checkpointing.png differ
diff --git a/doc/v2/images/create_efs.png b/doc/v2/images/create_efs.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5f1526033d1daf401700989af1d25919bcb7675
Binary files /dev/null and b/doc/v2/images/create_efs.png differ
diff --git a/doc/v2/images/csr.png b/doc/v2/images/csr.png
new file mode 100644
index 0000000000000000000000000000000000000000..3dc10b8de4f6d3f517624956b1694b689405a031
Binary files /dev/null and b/doc/v2/images/csr.png differ
diff --git a/doc/v2/images/data_dispatch.png b/doc/v2/images/data_dispatch.png
new file mode 100644
index 0000000000000000000000000000000000000000..5bdcc24d6a6d193cb014f8c38b362451fded5e54
Binary files /dev/null and b/doc/v2/images/data_dispatch.png differ
diff --git a/doc/v2/images/dataset.graffle b/doc/v2/images/dataset.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..c10a423ed16a23229a9ee33d11bfc82bb59646c8
Binary files /dev/null and b/doc/v2/images/dataset.graffle differ
diff --git a/doc/v2/images/dataset.png b/doc/v2/images/dataset.png
new file mode 100644
index 0000000000000000000000000000000000000000..2fb7f1cce3b6dd21489392557826e95a9f207c34
Binary files /dev/null and b/doc/v2/images/dataset.png differ
diff --git a/doc/v2/images/doc_en.png b/doc/v2/images/doc_en.png
new file mode 100644
index 0000000000000000000000000000000000000000..ed6b9178fba91a3bdf45ae797a9924f84146fbc8
Binary files /dev/null and b/doc/v2/images/doc_en.png differ
diff --git a/doc/v2/images/efs_mount.png b/doc/v2/images/efs_mount.png
new file mode 100644
index 0000000000000000000000000000000000000000..0f9e3cab98445707e5e9baa18ddabe15cdf04576
Binary files /dev/null and b/doc/v2/images/efs_mount.png differ
diff --git a/doc/v2/images/encoder-decoder-attention-model.png b/doc/v2/images/encoder-decoder-attention-model.png
new file mode 100644
index 0000000000000000000000000000000000000000..79f911d4ba12ac0c0d1a936c9df639c302786914
Binary files /dev/null and b/doc/v2/images/encoder-decoder-attention-model.png differ
diff --git a/doc/v2/images/engine.png b/doc/v2/images/engine.png
new file mode 100644
index 0000000000000000000000000000000000000000..1f5f65c2cc765a514a3ba9e7b7f468e1dc4b0c3b
Binary files /dev/null and b/doc/v2/images/engine.png differ
diff --git a/doc/v2/images/file_storage.graffle b/doc/v2/images/file_storage.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..50a17e70fa255495337c529a3bf12a5c0024a5be
Binary files /dev/null and b/doc/v2/images/file_storage.graffle differ
diff --git a/doc/v2/images/file_storage.png b/doc/v2/images/file_storage.png
new file mode 100644
index 0000000000000000000000000000000000000000..fccb4e3e7e738224c7f1584326bd5f351ce799aa
Binary files /dev/null and b/doc/v2/images/file_storage.png differ
diff --git a/doc/v2/images/glossary_rnn.dot b/doc/v2/images/glossary_rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..2cd0fb1820c44b0e8e0b869f9d39fcad27efa758
--- /dev/null
+++ b/doc/v2/images/glossary_rnn.dot
@@ -0,0 +1,42 @@
+digraph G{  // RNN unrolled over three consecutive timesteps: i-1, i and i+1
+	subgraph cluster_timestep0 {  // previous timestep; the "cluster" prefix makes Graphviz draw a box around it
+		label="recurrent timestep i-1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc0_0 [label="fc 0"]
+		fc0_1 [label="fc 1"]
+		fc0_2 [label="fc 2"]
+
+		fc0_0 -> fc0_1  // inside a timestep the layers form the chain fc 0 -> fc 1 -> fc 2
+		fc0_1 -> fc0_2
+	}
+
+	subgraph cluster_timestep1 {  // current timestep
+		label="recurrent timestep i"
+		node [style=filled];
+		fc1_0 [label="fc 0"]
+		fc1_1 [label="fc 1"]
+		fc1_2 [label="fc 2"]
+		color=blue  // this cluster is styled in blue instead of the light-gray background of its neighbours
+
+		fc1_0 -> fc1_1
+		fc1_1 -> fc1_2
+	}
+
+	subgraph cluster_timestep2 {  // next timestep
+		label="recurrent timestep i+1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc2_0 [label="fc 0"]
+		fc2_1 [label="fc 1"]
+		fc2_2 [label="fc 2"]
+
+		fc2_0 -> fc2_1
+		fc2_1 -> fc2_2
+	}
+	
+	
+	fc0_1 -> fc1_1 [style="dotted" constraint=false]  // dotted cross-timestep link from fc 1 to the next step's fc 1; constraint=false keeps it out of rank assignment
+	fc1_1 -> fc2_1 [style="dotted" constraint=false]
+
+}
\ No newline at end of file
diff --git a/doc/v2/images/glossary_rnn_with_memory.dot b/doc/v2/images/glossary_rnn_with_memory.dot
new file mode 100644
index 0000000000000000000000000000000000000000..0f101ec2d8f15aec76c57f328046b6b55cf0c7eb
--- /dev/null
+++ b/doc/v2/images/glossary_rnn_with_memory.dot
@@ -0,0 +1,48 @@
+digraph G{  // RNN unrolled over timesteps i-1, i and i+1, with an explicit "memory" node in each timestep
+	subgraph cluster_timestep0 {  // previous timestep; the "cluster" prefix makes Graphviz draw a box around it
+		label="recurrent timestep i-1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc0_0 [label="fc 0"]
+		fc0_1 [label="fc 1"]
+		fc0_2 [label="fc 2"]
+		m0 [label="memory"]
+		fc0_0 -> fc0_1  // inside a timestep the layers form the chain fc 0 -> fc 1 -> fc 2
+		fc0_1 -> fc0_2
+		fc0_1 -> m0  // fc 1 and the memory node are linked in both directions
+		m0 -> fc0_1
+	}
+
+	subgraph cluster_timestep1 {  // current timestep
+		label="recurrent timestep i"
+		node [style=filled];
+		fc1_0 [label="fc 0"]
+		fc1_1 [label="fc 1"]
+		fc1_2 [label="fc 2"]
+		m1 [label="memory"]
+		color=blue  // this cluster is styled in blue instead of the light-gray background of its neighbours
+		fc1_0 -> fc1_1
+		fc1_1 -> fc1_2
+		fc1_1 -> m1
+		m1 -> fc1_1
+	}
+
+	subgraph cluster_timestep2 {  // next timestep
+		label="recurrent timestep i+1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc2_0 [label="fc 0"]
+		fc2_1 [label="fc 1"]
+		fc2_2 [label="fc 2"]
+		m2 [label="memory"]
+		fc2_0 -> fc2_1
+		fc2_1 -> fc2_2
+		fc2_1 -> m2
+		m2 -> fc2_1
+	}
+	
+	
+	m0 -> m1 [style="dotted" constraint=false]  // dotted cross-timestep link between memory nodes; constraint=false keeps it out of rank assignment
+	m1 -> m2 [style="dotted" constraint=false]
+
+}
\ No newline at end of file
diff --git a/doc/v2/images/gradients.png b/doc/v2/images/gradients.png
new file mode 100644
index 0000000000000000000000000000000000000000..f031bcf8e4cec14e63075b8b9d2c7bbd9f1b1a3c
Binary files /dev/null and b/doc/v2/images/gradients.png differ
diff --git a/doc/v2/images/init_lock.graffle b/doc/v2/images/init_lock.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..fa9149f21b1311eed48ef72ec55e556559d0fc94
Binary files /dev/null and b/doc/v2/images/init_lock.graffle differ
diff --git a/doc/v2/images/init_lock.png b/doc/v2/images/init_lock.png
new file mode 100644
index 0000000000000000000000000000000000000000..92404ee6d6c0f9a7727952bae3c869ba338ecd7f
Binary files /dev/null and b/doc/v2/images/init_lock.png differ
diff --git a/doc/v2/images/k8s-paddle-arch.png b/doc/v2/images/k8s-paddle-arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..b3800c4fe81302d35e49f7dbacb9221c4dfa5cde
Binary files /dev/null and b/doc/v2/images/k8s-paddle-arch.png differ
diff --git a/doc/v2/images/layers.png b/doc/v2/images/layers.png
new file mode 100644
index 0000000000000000000000000000000000000000..306f79b7a844610915eb8944128f57d2b7a3065a
Binary files /dev/null and b/doc/v2/images/layers.png differ
diff --git a/doc/v2/images/managed_policy.png b/doc/v2/images/managed_policy.png
new file mode 100644
index 0000000000000000000000000000000000000000..c7ecda555b81d7750e9292a9ab72d2f517f76a2a
Binary files /dev/null and b/doc/v2/images/managed_policy.png differ
diff --git a/doc/v2/images/matrix.png b/doc/v2/images/matrix.png
new file mode 100644
index 0000000000000000000000000000000000000000..c33ce9cf0335e47cc8c1253304d0fe179186e6f2
Binary files /dev/null and b/doc/v2/images/matrix.png differ
diff --git a/doc/v2/images/nvvp1.png b/doc/v2/images/nvvp1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77
Binary files /dev/null and b/doc/v2/images/nvvp1.png differ
diff --git a/doc/v2/images/nvvp2.png b/doc/v2/images/nvvp2.png
new file mode 100644
index 0000000000000000000000000000000000000000..177c9db708da6863d1075f3e615f5962dbe18b29
Binary files /dev/null and b/doc/v2/images/nvvp2.png differ
diff --git a/doc/v2/images/nvvp3.png b/doc/v2/images/nvvp3.png
new file mode 100644
index 0000000000000000000000000000000000000000..d8f393667d6569b6f1e61ffccac43fae5888b6db
Binary files /dev/null and b/doc/v2/images/nvvp3.png differ
diff --git a/doc/v2/images/nvvp4.png b/doc/v2/images/nvvp4.png
new file mode 100644
index 0000000000000000000000000000000000000000..51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01
Binary files /dev/null and b/doc/v2/images/nvvp4.png differ
diff --git a/doc/v2/images/overview.png b/doc/v2/images/overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..8fb7bbb9dd654bf363d701d0c8cd4a557043d188
Binary files /dev/null and b/doc/v2/images/overview.png differ
diff --git a/doc/v2/images/paddle-cloud-in-data-center.png b/doc/v2/images/paddle-cloud-in-data-center.png
new file mode 100644
index 0000000000000000000000000000000000000000..da5d1a77562480ad1d886f5f21dbd84001d3d508
Binary files /dev/null and b/doc/v2/images/paddle-cloud-in-data-center.png differ
diff --git a/doc/v2/images/paddle-etcd.graffle b/doc/v2/images/paddle-etcd.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..f973dc9b9dbf72e9bc31e2d32822916cd281f8d9
Binary files /dev/null and b/doc/v2/images/paddle-etcd.graffle differ
diff --git a/doc/v2/images/paddle-etcd.png b/doc/v2/images/paddle-etcd.png
new file mode 100644
index 0000000000000000000000000000000000000000..57981ceb4b94f0f7d6dfa63f3d28c0402bf9cc31
Binary files /dev/null and b/doc/v2/images/paddle-etcd.png differ
diff --git a/doc/v2/images/paddle-model-sharding.graffle b/doc/v2/images/paddle-model-sharding.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..fba30f0ca2b47f0d202a432821d95e55aac37ec8
Binary files /dev/null and b/doc/v2/images/paddle-model-sharding.graffle differ
diff --git a/doc/v2/images/paddle-model-sharding.png b/doc/v2/images/paddle-model-sharding.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c3f6724ef46c6527e63a4cd8cb0b50fe0167124
Binary files /dev/null and b/doc/v2/images/paddle-model-sharding.png differ
diff --git a/doc/v2/images/paddle-ps-0.png b/doc/v2/images/paddle-ps-0.png
new file mode 100644
index 0000000000000000000000000000000000000000..47ef32806f182cab003da77f1556823b3f6d1721
Binary files /dev/null and b/doc/v2/images/paddle-ps-0.png differ
diff --git a/doc/v2/images/paddle-ps-1.png b/doc/v2/images/paddle-ps-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..f3125db73096c52bac6e7c60e1675552857c0774
Binary files /dev/null and b/doc/v2/images/paddle-ps-1.png differ
diff --git a/doc/v2/images/paddle-ps.graffle b/doc/v2/images/paddle-ps.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..0e536ffdd91cd696008b4c01bad3cb53edebdc16
Binary files /dev/null and b/doc/v2/images/paddle-ps.graffle differ
diff --git a/doc/v2/images/paddle-task-queues.graffle b/doc/v2/images/paddle-task-queues.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..4263ed8bfd2ef0e55058828bf23f2fac3595e5fd
Binary files /dev/null and b/doc/v2/images/paddle-task-queues.graffle differ
diff --git a/doc/v2/images/paddle-task-queues.png b/doc/v2/images/paddle-task-queues.png
new file mode 100644
index 0000000000000000000000000000000000000000..5f980266795776752cebd0c346b85c4a75a47780
Binary files /dev/null and b/doc/v2/images/paddle-task-queues.png differ
diff --git a/doc/v2/images/paddle-task-states.graffle b/doc/v2/images/paddle-task-states.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..cf1a0b9246d9386a949d2dbb8c32fe84f72eea83
Binary files /dev/null and b/doc/v2/images/paddle-task-states.graffle differ
diff --git a/doc/v2/images/paddle-task-states.png b/doc/v2/images/paddle-task-states.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ae43cb66c071aee9eb90d875e2373b29af9c3e0
Binary files /dev/null and b/doc/v2/images/paddle-task-states.png differ
diff --git a/doc/v2/images/ps_cn.png b/doc/v2/images/ps_cn.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc
Binary files /dev/null and b/doc/v2/images/ps_cn.png differ
diff --git a/doc/v2/images/ps_en.png b/doc/v2/images/ps_en.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/v2/images/ps_en.png differ
diff --git a/doc/v2/images/pserver_and_trainer.png b/doc/v2/images/pserver_and_trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..f41fe48920590333ad332bb51eb18e03dc251541
Binary files /dev/null and b/doc/v2/images/pserver_and_trainer.png differ
diff --git a/doc/v2/images/pserver_init.graffle b/doc/v2/images/pserver_init.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..5f3f1f52be8aa7f9049a8fcd6b7c93c8560c1676
Binary files /dev/null and b/doc/v2/images/pserver_init.graffle differ
diff --git a/doc/v2/images/pserver_init.png b/doc/v2/images/pserver_init.png
new file mode 100644
index 0000000000000000000000000000000000000000..dfe491ff98dd7db1c336093c80964a260df2cd90
Binary files /dev/null and b/doc/v2/images/pserver_init.png differ
diff --git a/doc/v2/images/route53_create_recordset.png b/doc/v2/images/route53_create_recordset.png
new file mode 100644
index 0000000000000000000000000000000000000000..34e476c7beac30fcdde13fccc4cc8d08b4be3d35
Binary files /dev/null and b/doc/v2/images/route53_create_recordset.png differ
diff --git a/doc/v2/images/route53_create_zone.png b/doc/v2/images/route53_create_zone.png
new file mode 100644
index 0000000000000000000000000000000000000000..25b7ddb831c5cba97f4b2edddd27da3234d621af
Binary files /dev/null and b/doc/v2/images/route53_create_zone.png differ
diff --git a/doc/v2/images/sequence_data.png b/doc/v2/images/sequence_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..6e47a46b8955dfe977e85898fe3c9f33ed28de7e
Binary files /dev/null and b/doc/v2/images/sequence_data.png differ
diff --git a/doc/v2/images/simple_full_hierarchical_recurrent.dot b/doc/v2/images/simple_full_hierarchical_recurrent.dot
new file mode 100644
index 0000000000000000000000000000000000000000..ff278a0323bb2c3ef07bf6f016a3a8df05783581
--- /dev/null
+++ b/doc/v2/images/simple_full_hierarchical_recurrent.dot
@@ -0,0 +1,30 @@
+digraph G {  // eight-element chain grouped into three boxed sub-sequences
+  rankdir=LR;  // lay the graph out left to right
+
+  subgraph cluster_t0 {  // first group: 4, 5, 2
+    a [label="4"]
+    b [label="5"]
+    c [label="2"]
+  }
+  
+  subgraph cluster_t1 {  // second group: 0, 9
+    d [label="0"]
+    e [label="9"]
+  }
+
+  subgraph cluster_t2 {  // third group: 8, 1, 4
+    f [label="8"]
+    g [label="1"]
+    h [label="4"]
+  }
+
+  a -> b;
+  b -> c;
+  c -> d [constraint=false];  // edge crossing into the next group; constraint=false keeps it out of rank assignment
+
+  d -> e;
+  e -> f [constraint=false];  // second group-crossing edge, treated the same way
+  
+  f -> g;
+  g -> h;
+}
\ No newline at end of file
diff --git a/doc/v2/images/simple_full_recurrent.dot b/doc/v2/images/simple_full_recurrent.dot
new file mode 100644
index 0000000000000000000000000000000000000000..cee281fbac993afbd0cc3416570f95965cdf0a59
--- /dev/null
+++ b/doc/v2/images/simple_full_recurrent.dot
@@ -0,0 +1,19 @@
+digraph G {  // one flat chain of eight labelled nodes, with no grouping
+  rankdir=LR;  // lay the graph out left to right
+  a [label="4"]
+  b [label="5"]
+  c [label="2"]
+  d [label="0"]
+  e [label="9"]
+  f [label="8"]
+  g [label="1"]
+  h [label="4"]
+
+  a -> b;  // every node points to the next one: 4 -> 5 -> 2 -> 0 -> 9 -> 8 -> 1 -> 4
+  b -> c;
+  c -> d;
+  d -> e;
+  e -> f;
+  f -> g;
+  g -> h;
+}
\ No newline at end of file
diff --git a/doc/v2/images/submit-job.graffle b/doc/v2/images/submit-job.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..677cdfb6d9a32168bf71729eb841fa1ca0dd31d6
Binary files /dev/null and b/doc/v2/images/submit-job.graffle differ
diff --git a/doc/v2/images/submit-job.png b/doc/v2/images/submit-job.png
new file mode 100644
index 0000000000000000000000000000000000000000..3046a460a7ba708079e88a560debaa215a694680
Binary files /dev/null and b/doc/v2/images/submit-job.png differ
diff --git a/doc/v2/images/trainer.graffle b/doc/v2/images/trainer.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..43415ed8cf61a5acfa34f8e56b9577f338dbf254
Binary files /dev/null and b/doc/v2/images/trainer.graffle differ
diff --git a/doc/v2/images/trainer.png b/doc/v2/images/trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/v2/images/trainer.png differ
diff --git a/doc/v2/images/trainer_cn.png b/doc/v2/images/trainer_cn.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc
Binary files /dev/null and b/doc/v2/images/trainer_cn.png differ
diff --git a/doc/v2/images/worker_security_group.png b/doc/v2/images/worker_security_group.png
new file mode 100644
index 0000000000000000000000000000000000000000..57eb0265a34ad4223b69600d2a3dd355482e0bf5
Binary files /dev/null and b/doc/v2/images/worker_security_group.png differ
diff --git a/doc/v2/images/workflow_of_CAPI.png b/doc/v2/images/workflow_of_CAPI.png
new file mode 100644
index 0000000000000000000000000000000000000000..a4399ade048b3fe10d2d9c714bc34333ca068edb
Binary files /dev/null and b/doc/v2/images/workflow_of_CAPI.png differ
diff --git a/doc/v2/index_cn.rst b/doc/v2/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0f645db6fc5d0f84bbe0cbb335677752e3a355ea
--- /dev/null
+++ b/doc/v2/index_cn.rst
@@ -0,0 +1,11 @@
+PaddlePaddle 文档
+======================
+
+..  toctree::
+  :maxdepth: 1
+
+  getstarted/index_cn.rst
+  build_and_install/index_cn.rst
+  howto/index_cn.rst
+  dev/index_cn.rst
+  faq/index_cn.rst
diff --git a/doc/v2/index_en.rst b/doc/v2/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..909f035cca3db2a02fd38462acc451375eceff40
--- /dev/null
+++ b/doc/v2/index_en.rst
@@ -0,0 +1,11 @@
+PaddlePaddle Documentation
+==========================
+
+..  toctree::
+  :maxdepth: 1
+
+  getstarted/index_en.rst
+  build_and_install/index_en.rst
+  howto/index_en.rst
+  dev/index_en.rst
+  faq/index_en.rst
diff --git a/external/Anakin b/external/Anakin
new file mode 160000
index 0000000000000000000000000000000000000000..beec126e4cfe762e4b6b542496069323dca35ee7
--- /dev/null
+++ b/external/Anakin
@@ -0,0 +1 @@
+Subproject commit beec126e4cfe762e4b6b542496069323dca35ee7
diff --git a/external/Paddle b/external/Paddle
new file mode 160000
index 0000000000000000000000000000000000000000..6f68fe71d60d4339e407b9fd1fd13990fb30c67a
--- /dev/null
+++ b/external/Paddle
@@ -0,0 +1 @@
+Subproject commit 6f68fe71d60d4339e407b9fd1fd13990fb30c67a
diff --git a/external/book b/external/book
new file mode 160000
index 0000000000000000000000000000000000000000..2b81d844673c1ba09fd596d70492375f2998ad36
--- /dev/null
+++ b/external/book
@@ -0,0 +1 @@
+Subproject commit 2b81d844673c1ba09fd596d70492375f2998ad36
diff --git a/external/models b/external/models
new file mode 160000
index 0000000000000000000000000000000000000000..d6024059de7ba447ab2859c23ef86e8519c127ae
--- /dev/null
+++ b/external/models
@@ -0,0 +1 @@
+Subproject commit d6024059de7ba447ab2859c23ef86e8519c127ae
diff --git a/external/paddle-mobile b/external/paddle-mobile
new file mode 160000
index 0000000000000000000000000000000000000000..73e2f989e78e59e6fafbf5d973e36ad17418c64a
--- /dev/null
+++ b/external/paddle-mobile
@@ -0,0 +1 @@
+Subproject commit 73e2f989e78e59e6fafbf5d973e36ad17418c64a
diff --git a/mobile b/mobile
deleted file mode 160000
index c3aa92ac28662d7a1553cd258ddd3f19412f5018..0000000000000000000000000000000000000000
--- a/mobile
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit c3aa92ac28662d7a1553cd258ddd3f19412f5018
diff --git a/paddle b/paddle
deleted file mode 160000
index 653686c753304f1b1d2a433cae96b96434e6c2d6..0000000000000000000000000000000000000000
--- a/paddle
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 653686c753304f1b1d2a433cae96b96434e6c2d6
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 16435dc01885bdd17fcbadc9d95055a9750bffad..0000000000000000000000000000000000000000
--- a/requirements.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-alabaster==0.7.10
-Babel==2.6.0
-backports.functools-lru-cache==1.5
-certifi==2018.4.16
-chardet==3.0.4
-CommonMark==0.5.4
-cycler==0.10.0
-docutils==0.14
-graphviz==0.8.3
-idna==2.6
-imagesize==1.0.0
-Jinja2==2.10
-kiwisolver==1.0.1
-LinkChecker==9.3
-Markdown==2.6.11
-MarkupSafe==1.0
-matplotlib==2.2.2
-nltk==3.3
-numpy==1.14.4
-opencv-python==3.4.1.15
-packaging==17.1
-paddlepaddle
-Pillow==5.1.0
-protobuf==3.1.0
-Pygments==2.2.0
-pyparsing==2.2.0
-python-dateutil==2.7.3
-pytz==2018.4
-rarfile==3.0
-recommonmark==0.4.0
-recordio==0.1.5
-requests==2.9.2
-scipy==1.1.0
-six==1.11.0
-snowballstemmer==1.2.1
-Sphinx==1.7.5
-sphinx-markdown-tables==0.0.3
-sphinx-rtd-theme==0.4.0
-sphinxcontrib-websupport==1.1.0
-subprocess32==3.5.1
-typing==3.6.4
-urllib3==1.22
diff --git a/scripts/build_doc_lib_lite.sh b/scripts/build_doc_lib_lite.sh
new file mode 100755
index 0000000000000000000000000000000000000000..561667587ec5eb771dafd97698ed6f4b45202faa
--- /dev/null
+++ b/scripts/build_doc_lib_lite.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Build the Paddle library with the gen_doc_lib_lite target of the external/Paddle submodule.
+# Run from the repository root; the first failing step aborts the script with that step's status.
+
+echo "1. Setup submodules"
+git submodule update --init --recursive || exit $?
+
+echo "2. Build Paddle library"
+cd external/Paddle || exit $?
+git branch  # informational only: prints the local branches of the Paddle checkout
+paddle/scripts/paddle_docker_build.sh gen_doc_lib_lite || exit $?
+cd ../..
diff --git a/scripts/deploy_docs.sh b/scripts/deploy_docs.sh
new file mode 100755
index 0000000000000000000000000000000000000000..768fc7e04d386f1292d79cab67ac3eae4642e62a
--- /dev/null
+++ b/scripts/deploy_docs.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Build the Paddle doc library, then deploy the docs inside docker. Pull-request builds and other branches exit early.
+exit_code=0
+if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit $exit_code; fi;
+
+# Deploy to the content server if it's a "develop" or "release/version" branch (or a "vX.Y[.Z]" name).
+# The "develop_doc" branch is reserved to test full deploy process without impacting the real content.
+if [ "$TRAVIS_BRANCH" == "develop_doc" ]; then
+    PPO_SCRIPT_BRANCH=develop
+elif [[ "$TRAVIS_BRANCH" == "develop"  ||  "$TRAVIS_BRANCH" =~ ^(v|release/)[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-[^[:space:]]*)?$ ]]; then
+    PPO_SCRIPT_BRANCH=master
+else
+    # Early exit, this branch doesn't require documentation build
+    echo "This branch doesn't require documentation build"
+    exit $exit_code;
+fi
+
+echo "Build Paddle library. This step is needed to compile Paddle API documents"
+cd external/Paddle || exit $?
+git branch  # informational only: prints the local branches of the Paddle checkout
+paddle/scripts/paddle_docker_build.sh gen_doc_lib || exit $?  # never deploy after a failed library build
+cd ../..
+
+export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/$PPO_SCRIPT_BRANCH/scripts/deploy/deploy_docs.sh
+
+echo "Deploy under docker environment"
+docker run -it \
+    -e CONTENT_DEC_PASSWD="$CONTENT_DEC_PASSWD" \
+    -e TRAVIS_BRANCH="$TRAVIS_BRANCH" \
+    -e DEPLOY_DOCS_SH="$DEPLOY_DOCS_SH" \
+    -e TRAVIS_PULL_REQUEST="$TRAVIS_PULL_REQUEST" \
+    -e PPO_SCRIPT_BRANCH="$PPO_SCRIPT_BRANCH" \
+    -e PADDLE_ROOT=/FluidDoc/external/Paddle \
+    -e PYTHONPATH=/FluidDoc/external/Paddle/build/python \
+    -v "$PWD:/FluidDoc" \
+    -w /FluidDoc \
+    paddlepaddle/paddle:latest-dev \
+    /bin/bash -c 'set -o pipefail; curl -fsS "$DEPLOY_DOCS_SH" | bash -s "$CONTENT_DEC_PASSWD" "$TRAVIS_BRANCH" /FluidDoc /FluidDoc/build/doc/ "$PPO_SCRIPT_BRANCH"' || exit_code=$(( exit_code | $? ))
+
+exit $exit_code
diff --git a/source/advanced_usage/deploy/anakin_example.md b/source/advanced_usage/deploy/anakin_example.md
deleted file mode 120000
index e6b9e18fe2d64b3fda6382bb23a6a818a3e17fbe..0000000000000000000000000000000000000000
--- a/source/advanced_usage/deploy/anakin_example.md
+++ /dev/null
@@ -1,28 +0,0 @@
-# Example
-Anakin目前只支持NCHW的格式
-示例文件在test/framework/net下
-
-## 在NV的GPU上运行CNN模型
-示例文件为打开example_nv_cnn_net.cpp，整体流程如下：
-- 将模型的的path设置为anakin模型的路径，初始化NV平台的图对象。 anakin模型可以通过转换器转化caffe或fluid的模型得到
-- 根据模型设置网络图的输入尺寸，进行图优化
-- 根据优化后的网络图初始化网络执行器
-- 取出网络的输入tensor，将数据拷贝到输入tensor
-- 运行推导
-- 取出网络的输出tensor
-
-以NV平台为例演示Anakin框架的使用方法，注意编译时需要打开GPU编译开关
-
-## 在X86上运行RNN模型
-示例文件为example_x86_rnn_net.cpp
-整体流程与在NV的GPU上运行CNN模型相似，不同之处如下：
-- 使用X86标识初始化图对象和网络执行器对象
-- rnn模型的输入尺寸是可变的，初始化图时的输入维度是维度的最大值，输入维度N代表总的词的个数。还需要设置输入tensor的seq_offset来标示这些词是如何划分为句子的,如{0,5,12}表示共有12个词，其中第0到第4个词是第一句话，第5到第11个词是第二句话
-
-以X86平台为例演示Anakin框架的使用方法，注意编译时需要打开X86编译开关
-
-## 在NV的GPU上使用Anakin的线程池运行CNN模型
-示例文件为example_nv_cnn_net_multi_thread.cpp ，示例使用worker的同步预测接口
-整体流程与在NV的GPU上运行CNN模型相似，不同之处如下：
-- 用模型地址和线程池大小初始化worker对象
-- 将输入tensor注入任务队列,获得输出tensor
diff --git a/source/advanced_usage/deploy/anakin_tutorial.md b/source/advanced_usage/deploy/anakin_tutorial.md
deleted file mode 120000
index 5efbc89abd469871b318c306e8cb03dd95f0c85b..0000000000000000000000000000000000000000
--- a/source/advanced_usage/deploy/anakin_tutorial.md
+++ /dev/null
@@ -1,639 +0,0 @@
-# Anakin 使用教程 ##
-
-本教程将会简略的介绍Anakin的工作原理，一些基本的Anakin API，以及如何调用这些API。
-  
-## 内容 ###
-
-- [Anakin的工作原理](#principle)
-- [Anakin APIs](#api)
-- [示例代码](#example)
-
-## <span id = 'principle'> Anakin的工作原理</span> ###
-
-![Anakin_principle](../pics/anakin_fm_ch.png)
-
-用Anakin来进行前向计算主要分为三个步骤：
-
-- 将外部模型通过[Anakin Parser](Converter_ch.md)解析为Anakin模型  
-  在使用Anakin之前，用户必须将所有其他模型转换成Anakin模型，我们提供了转换脚本，用户可通过[Anakin Parser](Converter_ch.md)进行模型转换。
-- 生成Anakin计算图
-  加载Anakin模型生成原始计算图，然后需要对原始计算图进行优化。你只需要调用相应的API优化即可。
-- 执行计算图  
-  Anakin会选择不同硬件平台执行计算图。
-
-
-## <span id ='api'>Anakin APIs </span> ###
-### Tensor ####
-
-`Tensor`提供基础的数据操作和管理，为ops提供统一的数据接口。`Tensor`包含以下几个属性：   
-
-- Buffer  
-   数据存储区
-- Shape  
-   数据的维度信息
-- Event  
-   用于异步计算的同步
-
- `Tensor` 类包含三个`Shape`对象， 分别是`_shape`, `_valid_shape`和 `_offset`。 `_shape`为`tensor`真正空间信息，`_valid_shape`表示当前`tensor`使用的空间信息， `_offset`表示当前`tensor`数据指针相对于真正数据空间的信息。 `Tensor`的不同维度分别与数学中的向量、矩阵等相对应，如下表所示。
-
-
-Dimensions | Math entity |
- :----: | :----:
-1 | vector
-2 | matrix
-3 | 3-tensor
-n | n-tensor
-
-#### 声明tensor对象
-
-`Tensor`接受三个模板参数:
-
-
-```c++
- template<typename TargetType, DataType datatype, typename LayOutType = NCHW>
- class Tensor .../* Inherit other class */{
-  //some implements
-  ...
- };
-```
-
-TargetType是平台类型，如X86，GPU等等，在Anakin内部有相应的标识与之对应；datatype是普通的数据类型，在Anakin内部也有相应的标志与之对应；[LayOutType](#layout)是数据分布类型，如batch x channel x height x width [NxCxHxW], 在Anakin内部用一个struct来标识。 Anakin中数据类型与基本数据类型的对应如下:
-
-1. <span id='target'>TargetType</span>
-
- Anakin TargetType | platform
-  :----: | :----:|
-  NV | NVIDIA GPU
-  ARM | ARM
-  AMD | AMD GPU
-  X86 | X86
-  NVHX86 | NVIDIA GPU with Pinned Memory
-
-2. <span id='datatype'>DataType</span>
-
-Anakin DataType | C++ | Description 
-:---: | :---: | :---: |
-AK_HALF | short | fp16
-AK_FLOAT | float | fp32
-AK_DOUBLE | double | fp64
-AK_INT8 | char | int8
-AK_INT16 | short | int16
-AK_INT32 | int | int32
-AK_INT64 | long | int64
-AK_UINT8 | unsigned char | uint8
-AK_UINT16 | unsigned short | uint16
-AK_UINT32 | unsigned int | uint32
-AK_STRING | std::string | /
-AK_BOOL | bool | /
-AK_SHAPE | / | Anakin Shape 
-AK_TENSOR | / | Anakin Tensor 
-
-
-3. <span id = 'layout'>LayOutType </span>
-
-Anakin LayOutType ( Tensor LayOut ) | Tensor Dimension | Tensor Support | Op Support
-:---: | :---: | :---: | :---: |
-W | 1-D | YES | NO
-HW | 2-D | YES | NO
-WH | 2-D | YES | NO
-NW | 2-D | YES | YES
-NHW | 3-D | YES |YES
-NCHW ( default ) | 4-D | YES | YES
-NHWC | 4-D | YES | NO
-NCHW_C4 | 5-D | YES | YES
-
-
-理论上，Anakin支持申明1维以上的tensor，但是对于Anakin中的Op来说，只支持NW、NHW、NCHW、NCHW_C4这四种LayOut，其中NCHW是默认的LayOutType，NCHW_C4是专门针对于int8这种数据类型的。
-
-
-例子
-
-> 下面的代码将展示如何使用tensor， 我们建议先看看这些示例。
-
-> 要想获得更多关于tensor的信息， 请参考 *source_path/core/tensor.h*
-
-> 1. 使用shape对象初始化tensor
-``` c++  
-  //create a null tensor. A null tensor holds for nothing.
-  //tensor's buffer  is resident at CPU and its datatype is AK_FLOAT.
-  //tensor's Layout is NCHW(default)
-   Tensor<X86, AK_FLOAT> mytensor;
-
-   //1. using shape object to create a tensor.
-   Shape shape1(NUM); //1-D shape. NUM is the number of dimention.
-   Tensor<X86, AK_FLOAT, W> mytensor1(shape1); //1-D tensor.
-
-  // A 4-D shape
-   Shape shape2(N, C, H, W); // batch x channel x height x width
-```
-
->`注意：Shape的维度必须和tensor的`[LayoutType](#layout)`相同，比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW，否则会出错。如下列代码所示`  
-
-
-```c++
-   // A 4-D tensor.
-   Tensor<X86, AK_FLOAT> mytensor2(shape2);  //right
-
-   //A 4-D tensor which is resident at GPU and its datatype is AK_INT8
-   Tensor<NV, AK_INT8> mytensor3(shape2);   //right
-   
-   Tensor<X86, AK_FLOAT, NHW> mytensor4(shape2); //wrong!! shape's dimetion must be equal to tensor's Layout.
-   Tensor<NV, AK_FLOAT, NCHW_C4> mytensor5(shape2); //wrong!!!!
-
-```
-
-> 2. 使用现有的数据和shape初始化tensor
-
-```c++
-
-   /**
-   *  A construtor of Tensor.
-   *  data_ptr is a pointer to any data type of data
-   *  TargetType is type of a platform [Anakin TargetType]
-   *  id : device id
-   *  shape: a Anakin shape
-   */
-   Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape);
-
-   //using existing data feed to a tensor
-   Tensor<X86, AK_FLOAT> mytensor(data_ptr, TargetType, device_id, shape); //shape must has dimention (N, C, H, W).
-
-```
-
-> 3. 使用tensor初始化tensor
-
-```c++
-   Tensor<NV, AK_FLOAT> tensor(exist_tensor);
-```
-
-
-> 提示： 你可以用` typedef Tensor<X86, AK_FLOAT> Tensor4d_X86 `方便定义tensor
-
-
-#### 填充tensor数据区
-
-
-填充数据区得看你申明tensor的方式， 下面展示了如何填充tensor的数据区。
-
-```c++
-首先来看看tensor的四种声明方式：
-
-1. Tensor<X86, AK_FLOAT> mytensor;
-2. Tensor<X86, AK_FLOAT, W> mytensor1(shape1);
-3. Tensor<X86, AK_FLOAT> mytensor(data_ptr, TargetType, device_id, shape);
-4. Tensor<NV, AK_FLOAT> tensor(exist_tensor);
-
-
-相关的声明方式的数据填充方法如下：
-
-1：声明一个空的tensor，此时没有为其分配内存，所以，我们需要手动的为其分配内存。
-            
-            //parama shape
-            mytensor.re_alloc(Shape shape); 
-
-            //Get writable pointer to mytensor.
-            //parama index (int): where you start to write.
-            //Dtype is your data type such int, float or double.
-            Dtype *p = mytensor.mutable_data(index/*=0*/);
-            //write data to mytensor
-            for(int i = 0; i < mytensor.size(); i++){
-              p[i] = 1.0f;
-            }
-            //do something ...
-
-2: 这种声明方式会自动分配内存 
-
-          //Get writable pointer to mytensor.
-          //parama index (int): where you start to write.
-          //Dtype is your data type such int, float or double.
-          Dtype *p = mytensor1.mutable_data(index/*=0*/);
-          //write data to mytensor
-          for(int i = 0; i < mytensor.size(); i++){
-            p[i] = 1.0f;
-          }
-          //do something ...
-
- 
-3：在该种声明方式中，我们仍不需要手动为其分配内存。但在构造函数内部是否为其分配内存，得依情况而定。如果data_ptr和申明的
-tensor都在同一个目标平台上，那么该tensor就会与data_ptr共享内存空间，相反，如果他们不在同一个平台上（如data_ptr在X86上，而
-tensor在GPU上），那么此时tensor就会开辟一个新的内存空间，并将data_ptr所指向的数据拷贝到tensor的buffer中。
-
-          //Get writable pointer to mytensor.
-          //parama index (int): where you start to write.
-          //Dtype is your data type such int, float or double.
-          Dtype *p = mytensor.mutable_data(index/*=0*/);
-          //write data to mytensor
-          for(int i = 0; i < mytensor.size(); i++){
-            p[i] = 1.0f;
-          }
-          //do something ...
-
-4：该种方式仍不需要手动分配内存
-
-          //Get writable pointer to mytensor.
-          //parama index (int): where you start to write.
-          //Dtype is your data type such int, float or double.
-          Dtype *p = mytensor.mutable_data(index/*=0*/);
-          //write data to mytensor
-          for(int i = 0; i < mytensor.size(); i++){
-            p[i] = 1.0f;
-          }
-          //do something ...
-
-
-另外，你还可以获取一个tensor的可读指针，示例如下：
-        //Get read-only pointer to mytensor.
-        //parama index (int): where you start to read.
-        //Dtype is your data type such int, float or double.
-         Dtype *p = mytensor.data(index/*=0*/);
-        //do something ...
-```
-
-如果想更详细的了解tensor，请查阅*source_path/saber/core/tensor.h*
-
-#### 获取tensor的shape
-
-```c++
-//some declarations
-// ...
-Shape shape = mytensor.shape();
-
-//Get a first dimetion size of tesor, if it has.
-int d1 = shape[0];
-
-//Get a second dimention size of tensor, if it has.
-int d2 = shape[1];
-
-...
-
-//Get a n-th dimention size of tensor, if it has.
-int dn = shape[n-1];
-
-
-//Get a tensor's dimention
-int dims = mytensor.dims();
-
-//Get the size of tensor.
-//size = d1 x d2 x ... x dn.
-int size = mytensor.size();
-
-//Get the size of tensor at interval [Di, Dj)
-// form i-th dimention to j-th dimention, but not including the j-th dimention.
-// which means di x (di+1) x ... x (dj -1)
-int size = mytensor.count(start, end);
-```
-
-#### 设置tensor的shape
-
-我们可以用tensor的成员函数set_shape来设置tensor的shape。 下面是set_shape的定义
-
-
-```c++
-/**
- * \brief set a tensor's shape
- * \param valid_shape [a Shape object]
- * \param shape [a Shape object]
- * \param offset [a Shape object]
- * \return the status of this operation, that means whether it success * or not.
- */
-SaberStatus set_shape(Shape valid_shape, Shape shape = Shape::zero(TensorAPI::layout_dims::value), Shape offset = Shape::minusone(TensorAPI::layout_dims::value)); 
-```
-
-这个成员函数只设置tensor的shape。这些shape对象(valid_shape, shape, offset)的[LayOutType](#layout)必须和当前的tensor的相应三个shape对象的LayOutType相同，如果不同就会出错，返回SaberInvalidValue。 如果相同，那么将成功设置tensor的shape。
-
-```c++
-
-// some declarations
-// ...
-//valid_shape, shape , offset are Shape object;
-//All these Shape object's LayOutType must be equal to mytensor's.
-mytensor.set_shape(valid_shape, shape, offset);
-
-```
-
-#### 重置 tensor的shape
-
-```c++
-//some declarations
-Shape shape, valid_shape, offset;
-
-//do some initializations
-... 
-mytensor.reshape(valid_shape, shape, offset);
-```
-
-注意： Reshape操作仍然需要shape的[LayOutType](#layout) 与tensor的相同
-
-
-### Graph ###
-
-`Graph`类负责加载Anakin模型生成计算图、对图进行优化、存储模型等操作。
-
-#### 图的声明
-
-与`Tensor`一样，graph也接受三个模板参数。
-
-```c++
-
-template<typename TargetType, DataType Dtype, Precision Ptype>
-class Graph ... /* inherit other class*/{
-  
-  //some implements
-  ...
-
-};
-```
-
-前面已经介绍过[TargetType](#target)和[DataType](#datatype)是Anakin内部自定义数据类型。[TargetType](#target)表示平台类型 (如NV、X86), [DataType](#datatype)是与C++/C中的基本数据类型相对应的Anakin基本数据类型。 [Precision](#precision)为op所支持的精度类型, 稍后我们再介绍它。
-
-
-```c++
-
-//Create a empty graph object.
-Graph graph = Graph<NV, AK_FLOAT, Precision::FP32> tmp();
-
-//Create a pointer to a empty graph.
-Graph *graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
-
-//Create a pointer to a empty graph.
-auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
-
-```
-
-#### 加载 Anakin 模型
-
-```c++
-//some declarations
-...
-auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
-std::string model_path = "the/path/to/where/your/models/are";
-const char *model_path1 = "the/path/to/where/your/models/are";
-
-//Loading Anakin model to generate a compute graph.
-auto status = graph->load(model_path);
-
-//Or this way.
-auto status = graph->load(model_path1);
-//Check whether load operation success.
-if(!status){
-  std::cout << "error" << endl;
-  //do something...
-}
-
-```
-
-#### 优化计算图
-
-```c++
-//some declarations
-...
-//Load graph.
-...
-//According to the ops of loaded graph, optimize compute graph.
-graph->Optimize();
-
-```
-
-> 注意： 第一次加载原始图，必须要优化。
-
-#### 保存模型
-
-你可以在任何时候保存模型， 特别的， 你可以保存一个优化的模型，这样，下次再加载模型时，就不必进行优化操作。
-
-
-```c++
-//some declarations
-...
-//Load graph.
-...
-// save a model
-//save_model_path: the path to where your model is.
-auto status = graph->save(save_model_path);
-
-//Checking
-if(!status){
-  cout << "error" << endl;
-  //do somethin...
-}
-```
-
-#### 重新设置计算图里的tensor的shape
-
-```c++
-//some declarations
-...
-//Load graph.
-...
-vector<int> shape{10, 256, 256, 10};
-//input_name : std::string.
-//Reshape a tensor named input_name.
-graph->Reshape(input_name, shape);//Note: shape is a vector, not a Shape object.
-```
-
-#### 设置 batch size
-
-`Graph` 支持重新设置batch size的大小。
-
-```c++
-//some declarations
-...
-//Load graph.
-...
-//input_name : std::string.
-//Reset a tensor named input_name.
-int new_batch_size = 4;
-graph->ResetBatchSize(input_name, new_batch_size);
-```
-
-###  Net ###
-
-
-`Net` 是计算图的执行器。你可以通过Net对象获得输入和输出
-#### Creating a graph executor
-
-`Net`接受四个模板参数。  
-
-
-```c++
-template<typename TargetType, DataType Dtype, Precision PType OpRunType RunType = OpRunType::ASYNC>
-class Net{
-  //some implements
-  ...
-
-};
-```
-由于有些Op可能支持多种精度，我们可以通过Precision来指定。OpRunType表示同步或异步类型，异步是默认类型。OpRunType::SYNC表示同步，在GPU上只有单个流；OpRunType::ASYNC表示异步，在GPU上有多个流并以异步方式执行。实际上，Precision和OpRunType都是enum class, 详细设计请参考*source_root/framework/core/types.h*.
-
-
-1. <span id = 'precision'> Precision </span>
-
-Precision | Op support
-:---: | :---:
-Precision::INT4 | NO
-Precision::INT8 | NO
-Precision::FP16 | NO
-Precision::FP32 | YES
-Precision::FP64 | NO
-
-现在Op的精度只支持FP32， 但在将来我们会支持剩下的Precision.
-
-
-
-2. OpRunType
-
-OpRunType | Sync/Async |Description
-:---: | :---: | :---:
-OpRunType::SYNC | Synchronization | single-stream on GPU
-OpRunType::ASYNC | Asynchronization | multi-stream on GPU
-
-用graph对象创建一个执行器。
-```c++
-//some declarations
-...
-//Create a pointer to a graph.
-auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
-//do something...
-...
-
-//create a executor
-Net<NV, AK_FLOAT, Precision::FP32> executor(*graph);
-
-```
-
-#### 获取输入输出tensor
-
-
-获取输入输出tensor，并填充输入tensor的buffer。如果想要获取输入和输出tensor，那么必须指定输入的名字，如"input_0", "input_1", "input_2", ..., 必须传入如上字符串才能够获得输入tensor。另外，如果想知道input_i对应哪个输入，你需要去dash board查看，如何使用dash board请看[Anakin Parser](Converter_ch.md)。请看如下示例代码
-
-```c++
-//some declaratinos
-...
-
-//create a executor
-//TargetType is NV [NVIDIA GPU]
-Net<NV, AK_FLOAT, Precision::FP32> executor(*graph);
-
-//Get the first input tensor.
-//The following tensors(tensor_in0, tensor_in2 ...) are resident at GPU.
-//Note: Member function get_in returns an pointer to tensor.
-Tensor<NV, AK_FLOAT>* tensor_in0 = executor.get_in("input_0");
-
-//If you have multiple input tensors
-//You just type this code below.
-Tensor<NV, AK_FLOAT>* tensor_in1 = executor.get_in("input_1");
-...
-auto tensor_inn = executor.get_in("input_n");
-```
-
-当得到输入tensor之后，就可以填充它的数据区了。
-
-```c++
-//This tensor is resident at GPU.
-auto tensor_d_in = executor.get_in("input_0");
-
-//If we want to feed above tensor, we must feed the tensor which is resident at host. And then copy the host tensor to the device's one.
-
-//using Tensor4d = Tensor<Ttype, Dtype>;
-Tensor4d<X86, AK_FLOAT> tensor_h_in; //host tensor;
-//Tensor<X86, AK_FLOAT> tensor_h_in; 
-
-//Allocate memory for host tensor.
-tensor_h_in.re_alloc(tensor_d_in->valid_shape());
-//Get a writable pointer to tensor.
-float *h_data = tensor_h_in.mutable_data();
-
-//Feed your tensor.
-/** example
-for(int i = 0; i < tensor_h_in.size(); i++){
-  h_data[i] = 1.0f;
-}
-*/
-//Copy host tensor's data to device tensor.
-tensor_d_in->copy_from(tensor_h_in);
-
-// And then
-```
-
-
-类似的，我们可以利用成员函数get_out来获得输出tensor。但与获得输入tensor不同的是， 我们需要指定输出tensor结点的名字，这个可以从dash board中看到，请从[Anakin Parser](Converter_ch.md)中查看dash board的使用方法。假如有个输出结点叫pred_out, 那么我们可以通过如下代码获得相应的输出tensor：
-```c++
-//Note: this tensor are resident at GPU.
-Tensor<NV, AK_FLOAT>* tensor_out_d = executor.get_out("pred_out");
-
-```
-
-
-#### Executing graph
-
-
-当一切准备就绪后，我们就可以执行真正的计算了！
-```c++
-executor.prediction();
-```
- 
-## <span id='example'> 示例代码 </span> ##
-
-下面的例子展示了如何调用Anakin。
-
-在这儿之前， 请确保你已经有了Anakin模型。如果还没有，那么请使用[Anakin Parser](Converter_ch.md)转换你的模型。
-
-### Single-thread
-
-单线程例子在 *source_root/test/framework/net/net_exec_test.cpp*
-
-```c++
-
-std::string model_path = "your_Anakin_models/xxxxx.anakin.bin";
-// Create an empty graph object.
-auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
-// Load Anakin model.
-auto status = graph->load(model_path);
-if(!status ) {
-    LOG(FATAL) << " [ERROR] " << status.info();
-}
-// Reshape
-graph->Reshape("input_0", {10, 384, 960, 10});
-// You must optimize graph for the first time.
-graph->Optimize();
-// Create a executer.
-Net<NV, AK_FLOAT, Precision::FP32> net_executer(*graph);
-
-//Get your input tensors through some specific string such as "input_0", "input_1", and 
-//so on. 
-//And then, feed the input tensor.
-//If you don't know Which input do these specific string ("input_0", "input_1") correspond with, you can launch dash board to find out.
-auto d_tensor_in_p = net_executer.get_in("input_0");
-Tensor4d<X86, AK_FLOAT> h_tensor_in;
-auto valid_shape_in = d_tensor_in_p->valid_shape();
-for (int i=0; i<valid_shape_in.size(); i++) {
-    LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i]; //see tensor's dimentions
-}
-h_tensor_in.re_alloc(valid_shape_in);
-float* h_data = h_tensor_in.mutable_data();
-for (int i=0; i<h_tensor_in.size(); i++) {
-    h_data[i] = 1.0f;
-}
-d_tensor_in_p->copy_from(h_tensor_in);
-
-//Do inference.
-net_executer.prediction();
-
-//Get result tensor through the name of output node.
-//And also, you need to see the dash board again to find out how many output nodes are and remember their name.
-
-//For example, you've got a output node named obj_pre_out
-//Then, you can get an output tensor.
-auto d_tensor_out_0_p = net_executer.get_out("obj_pred_out"); //get_out returns a pointer to output tensor.
-auto d_tensor_out_1_p = net_executer.get_out("lc_pred_out"); //get_out returns a pointer to output tensor.
-//......
-// do something else ...
-//...
-//save model.
-//You might not optimize the graph when you load the saved model again.
-std::string save_model_path = model_path + std::string(".saved");
-auto status = graph->save(save_model_path);
-if (!status ) {
-    LOG(FATAL) << " [ERROR] " << status.info();
-}
-
-```
diff --git a/source/advanced_usage/deploy/convert_paddle_to_anakin.md b/source/advanced_usage/deploy/convert_paddle_to_anakin.md
deleted file mode 120000
index 56ca582b2b47f404ede777712830731ea7f4e9b5..0000000000000000000000000000000000000000
--- a/source/advanced_usage/deploy/convert_paddle_to_anakin.md
+++ /dev/null
@@ -1,73 +0,0 @@
-# 模型转换指南
-
-Anakin 支持不同框架的模型预测。但由于格式的差别，Anakin 需要您预先转换模型。本文档介绍如何转换模型。
-
-## 简介
-
-Anakin 模型转换器输入支持 Caffe 和 Fluid 两种格式的预测模型，模型包含网络结构（model 或 prototxt）和权重参数（param 或 caffemodel）。   
-
-模型转换的输出是一个 bin 文件，它作为 Anakin 框架的 graph 参数导入。   
-
-您还可以使用模型转换器的 launch board 功能生成网络结构的 HTML 预览。   
-
-
-## 系统要求
-
-- python 2.7+
-- pyyaml
-- flask
-- protobuf 3.5+
-
-
-## 用法
-
-### 1、环境
-转换器所需的依赖标注于 *系统要求* 一节。
-
-### 2、配置
-您需要对 *config.yaml* 文件进行修改以告知您的需求。工程中给出了 *config.yaml* 示例，下面作进一步说明。
-
-#### config.yaml
-```bash
-OPTIONS:
-    Framework: CAFFE       # 依框架类型填写 CAFFE 或 FLUID
-    SavePath: ./output     # 转换结束后模型的保存位置
-    ResultName: googlenet  # 输出模型的名字
-    Config:
-        LaunchBoard: ON    # 是否生成网络结构预览页面
-        Server:
-            ip: 0.0.0.0
-            port: 8888     # 从一个可用端口访问预览页面
-        OptimizedGraph:    # 当您使用了 Anakin 框架的 Optimized 功能时，才应该打开此项
-            enable: OFF
-            path: /path/to/anakin_optimized_anakin_model/googlenet.anakin.bin.saved
-    LOGGER:
-        LogToPath: ./log/  # 生成日志的路径
-        WithColor: ON
-
-TARGET:
-    CAFFE:
-        # 当 Framework 为 CAFFE 时需填写
-        ProtoPaths:
-            - /path/to/caffe/src/caffe/proto/caffe.proto
-        PrototxtPath: /path/to/your/googlenet.prototxt
-        ModelPath: /path/to/your/googlenet.caffemodel
-
-    FLUID:
-        # 当 Framework 为 FLUID 时需填写
-        Debug: NULL
-        ProtoPaths:
-            - /
-        PrototxtPath: /path/to/fluid/inference_model
-        ModelPath: /path/to/fluid/inference_model
-	# ...
-```
-
-### 3、转换
-在完成配置文件的修改后，您只需执行 ```python converter.py``` 就可以进行模型转换了。
-
-
-### 4、预览
-最后一步，就是在浏览器中查看令人振奋的转换结果！网址是在 *config.yaml* 中配置的，例如 http://0.0.0.0:8888 。
-
-> 注意：若您使用了默认的 IP 地址 0.0.0.0，请在预览时使用真实的服务器地址 real_ip:port 替代它。
diff --git a/source/advanced_usage/deploy/how_to_add_anakin_op.md b/source/advanced_usage/deploy/how_to_add_anakin_op.md
deleted file mode 120000
index f2783eb9f591a31443f2a692ce0eb1bcc9b1063a..0000000000000000000000000000000000000000
--- a/source/advanced_usage/deploy/how_to_add_anakin_op.md
+++ /dev/null
@@ -1,405 +0,0 @@
-# 如何增加新的Operator
-
-## 基本概念
-
-简单介绍下几个同Operator相关的基本概念，详情请参考设计文档。
-
-```framework```: 上层的逻辑代码，负责从parser中获取参数及weights，添加op时主要修改framework/operator目录下的内容。
-
-```saber```: 底层的实现代码，Anakin通过saber封装了不同的backends，不同的实现(impl)分别特化出自己的实现，外层framework通过不同的template进入各自的impl完成调用。各个op的parameter放在saber/saber_funcs_param.h文件中，增加op主要修改saber/funcs下的内容。
-
-saber的文件结构：
-* saber/funcs下的是各个funcs的外部接口，这一层的op与具体的设备实现无关，只与各op完成的功能有关。由于跟实现(impl)无关，本层文件名均不带impl。
-* saber/funcs/impl下是各个op的impl声明，特定设备需要完成该层声明的特化版本，如saber/funcs/impl/x86实现了上一层impl声明的x86特化版本，saber/funcs/impl/cuda实现了上一层impl声明的NV特化版本。当增加新的backends时需要特化出新的实现。本层代码同实现相关，均带有```impl_```前缀。
-* saber/funcs/impl/cuda/base/cuda_c内有cuda```.cu```扩展名的文件，添加cuda的kernel需要在该文件目录下添加。
-* saber/funcs/impl/cuda/base/sass 内有不同架构的汇编代码编译的静态库。
-
-### 涉及到的基类及各个类之间的关系
-
-简单介绍相关的基类
-
-* ```anakin::Operator```: framework的operator基类，位于framework/core/operator/operator.h
-
-* ```anakin::saber::BaseFunc```: saber对外的op接口基类，提供统一的对外接口，位于saber/funcs/base.h。BaseFunc的```compute_output_shape```接口只根据input的shape和param的参数计算输出的shape，并通过```tensor```的```set_shape```接口(只设置shape，不分配空间)设置到output中。```operator()```接口为各个op的计算接口。
-
-* ```anakin::saber::ImplBase```: saber设备实现的op的接口，所有设备相关实现的基类。位于saber/funcs/impl/impl_base.h。实现版本中这里分为两类，一类以```vender_```为前缀，带有```vender_```代码意为使用第三方库来实现该op，如cudnn的conv，或mkl的conv等等，这类op的性能我们难以调优，因此单独列为一类。另一类是带有源码的saber实现，这些实现都带有```saber_```为前缀，此类实现带有源码，能够通过后续优化不断提升性能，实现起名时需要注意这一点。
-
-## 添加operator
-
-添加一个新的op需要以下几步：
-
-1. 添加saber的param
-2. 定义saber的Operator类
-3. 定义新的impl声明
-4. 完成新的impl实现
-5. 增加framework的实现或特化
-
-接下来就针对这几步，以一个简单例子为例介绍实现。
-
-例如我们要添加新的Mul op。给出计算公式如下：$$Out = alpha \cdot X * Y$$
-
-### 为operator增加param
-
-涉及到的文件：```saber/saber_funcs_param.h```。如果之前已经存在需要添加的op的param，这一步可以跳过。
-这里```XXXParam```是一个```struct```。包含一个无参数的构造函数，含参数的构造函数，复制构造函数，```operator=()```及```operator==()```。
-```
-template <typename opTensor> // 能够获得target, datatype, layout
-struct MulParam{
-  MulParam()
-    : alpha(0)
-  {}
-  MulParam(float alpha_in)
-    : alpha(alpha_in)
-  {}
-  MulParam(const MulParam& right)
-    : alpha(right.alpha)
-  {}
-  MulParam &operator=(const MulParam &right) {
-    alpha = right.alpha;
-  }
-  bool operator==(const MulParam &right) {
-    return alpha == right.alpha;
-  }
-  float alpha;
-};
-```
-
-### 定义Operator类
-涉及到的文件:```saber/funcs/mul.h```。如果之前定义过该op的类，这里需要修改输入的impl定义头文件。
-下面给出一个相对完整的定义结构供参考。
-```
-//不同的设备需要包含对应的operator实现.[详见](#impl)
-#ifdef NVIDIA_GPU
-#include "saber/funcs/impl/cuda/saber_mul.h"
-#include "saber/funcs/impl/cuda/vender_mul.h"
-#endif
-//如果一个设备现在还没有对应的operator实现，需要包含声明。[详见](#declare)
-#ifdef USE_X86_PLACE
-#include "saber/funcs/impl/impl_mul.h"
-#endif
-namespace anakin {
-namespace saber {
-template<typename TargetType,
-        DataType OpDtype,
-        DataType inDtype = AK_FLOAT,
-        DataType outDtype = AK_FLOAT,
-        typename LayOutType_op = NCHW,
-        typename LayOutType_in = NCHW,
-        typename LayOutType_out = NCHW>
-class Mul : public BaseFunc<
-        Tensor<TargetType, inDtype, LayOutType_in>,
-        Tensor<TargetType, outDtype, LayOutType_out>,
-        Tensor<TargetType, OpDtype, LayOutType_op>,
-        ImplBase, MulParam> {
-public:
-    using BaseFunc<
-            Tensor<TargetType, inDtype, LayOutType_in>,
-            Tensor<TargetType, outDtype, LayOutType_out>,
-            Tensor<TargetType, OpDtype, LayOutType_op>,
-            ImplBase, MulParam>::BaseFunc;
-    Mul() = default;
-    typedef Tensor<TargetType, inDtype, LayOutType_in> InDataTensor;
-    typedef Tensor<TargetType, outDtype, LayOutType_out> OutDataTensor;
-    typedef Tensor<TargetType, OpDtype, LayOutType_op> OpTensor;
-    typedef MulParam<OpTensor> Param_t;
-    typedef std::vector<InDataTensor *> Input_v;
-    typedef std::vector<OutDataTensor *> Output_v;
-    typedef std::vector<Shape> Shape_v;
-
-    virtual SaberStatus compute_output_shape(const Input_v &input,
-                                             Output_v &output, Param_t &param) override {
-        //计算输出的shape，
-        Shape output_shape = (input[0]->valid_shape());
-        /* code */
-        return output[0]->set_shape(output_shape);
-    }
-    virtual SaberStatus init_impl(ImplEnum implenum) override {
-      // 不同设备均使用此init_impl, 此接口创建对应impl的实现。
-      switch (implenum) {
-            case VENDER_IMPL:
-                this->_impl.push_back(new VenderMul <TargetType,
-                OpDtype, inDtype, outDtype,
-                LayOutType_op, LayOutType_in, LayOutType_out>);
-                return SaberSuccess;
-            case SABER_IMPL:
-                this->_impl.push_back(new SaberMul <TargetType,
-                OpDtype, inDtype, outDtype,
-                LayOutType_op, LayOutType_in, LayOutType_out>);
-                return SaberSuccess;
-            default:
-                return SaberUnImplError;
-        }
-    }
-private:
-    virtual void pick_best_static() override {
-        if (true) // some condition?
-            this->_best_impl = this->_impl[0];
-    }
-    virtual void pick_best_specify(ImplEnum implenum) override {
-        this->_best_impl = this->_impl[0];
-    }
-};
-} // namespace saber
-} // namespace anakin
-```
-
-### 为operator增加新的impl<span id="declare">声明</span>
-
-涉及的文件:```saber/funcs/impl/impl_mul.h```。不同的设备都特化同一个声明，特化版本放在对应的文件夹下，这里的声明就是给出所有设备的统一声明。下面给出一个参考。
-```
-#include "saber/funcs/impl/impl_macro.h"
-namespace anakin{
-namespace saber{
-DEFINE_OP_CLASS(Mul, MulParam); // 第一个参数是op的名字，第二个是对应param的名字
-}
-}
-```
-
-### 完成新的operator特定后端<span id="impl">实现</span>
-
-涉及的文件:```saber/funcs/impl/xxx/vender_mul.h```或```saber/funcs/impl/xxx/saber_mul.h```
-这里```xxx```指代特定的一种设备。```vender```是指的使用第三方库实现的op，```saber```指的源码实现的op。这里以cuda的vender实现为例，简单介绍一下特化出的函数的几个基本接口。
-
-```
-// include 对应的声明
-#include "saber/funcs/impl/impl_mul.h"
-
-namespace anakin{
-namespace saber{
-template <DataType OpDtype,
-    DataType inDtype,
-    DataType outDtype,
-    typename LayOutType_op,
-    typename LayOutType_in,
-    typename LayOutType_out>
-class VenderMul<NV, //偏特化出需要的后端。
-    OpDtype, inDtype, outDtype,
-    LayOutType_op, LayOutType_in, LayOutType_out> :
-    public ImplBase<
-        Tensor<NV, inDtype, LayOutType_in>,
-        Tensor<NV, outDtype, LayOutType_out>,
-        Tensor<NV, OpDtype, LayOutType_op>,
-        MulParam<Tensor<NV, OpDtype, LayOutType_op> > >
-{
-public:
-    typedef Tensor<NV, inDtype, LayOutType_in> DataTensor_in;
-    typedef Tensor<NV, outDtype, LayOutType_out> DataTensor_out;
-    typedef Tensor<NV, OpDtype, LayOutType_op> OpTensor;
-    typedef typename DataTensor_in::Dtype InDataType;
-    typedef typename DataTensor_out::Dtype OutDataType;
-    typedef typename OpTensor::Dtype OpDataType;
-    VenderMul(){}
-    ~VenderMul() {}
-
-    virtual SaberStatus init(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            MulParam<OpTensor>& param, Context<NV>& ctx) {
-        this->_ctx = ctx;
-        create(inputs, outputs, param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<DataTensor_in *>& inputs,
-                            std::vector<DataTensor_out *>& outputs,
-                            MulParam<OpTensor>& param, Context<NV>& ctx) {
-        // set内部参数
-    }
-
-    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
-                          std::vector<DataTensor_out*>& outputs,
-                        MulParam<OpTensor>& param) {
-        // dispatch kernel.
-    }
-
-private:
-};
-}
-}
-```
-```init```和```create```的区别：```init```接口是第一次初始化op的时候进入的接口，此函数只在第一次初始化op时调用，这个接口一般放一些只需要执行一次的代码，如malloc或者create之类的函数。```create```函数除了第一次init执行外，在输入发生变化或者param发生变化时会再次触发，create一般放置set函数，设置内部变量，当input发生变化时这里执行一些同input或weights直接相关的代码。但create因为触发位置在网络内，如果```create```函数执行了一些严重耗时的操作，这里会拖慢整个op的执行时间，需要慎重选择操作放置的位置。
-### 添加framework的特化
-
-涉及的文件:```framework/operators/mul.h```和```framework/operators/mul.cpp```。
-这里简单介绍下如何添加或修改framework内的operator
-
-```
-#include "framework/core/base.h"
-#include "framework/core/data_types.h"
-#include "framework/core/operator/operator.h"
-#include "utils/logger/logger.h"
-#include "saber/funcs/mul.h" // 需要包对应的saber头文件
-namespace anakin {
-namespace ops {
-template<typename Ttype, DataType Dtype, Precision Ptype>
-class MulHelper;
-
-template<typename Ttype, DataType Dtype, Precision Ptype>
-class Mul : public Operator<Ttype, Dtype, Ptype> {
-public:
-    Mul() {}
-    /// forward impl
-    virtual void operator() (OpContext<Ttype> &ctx,
-                             const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
-                             std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
-        LOG(ERROR) << "Not Impl Yet Operator power<TargetType:"<<"unknown"<<","
-                   <<type_id<typename DataTypeWarpper<Dtype>::type>().type_info()<<">";
-    }
-    friend class MulHelper<Ttype, Dtype, Ptype>;
-};
-template<typename Ttype, DataType Dtype, Precision Ptype>
-class MulHelper : public OperatorHelper<Ttype, Dtype, Ptype> {
-public:
-    MulHelper() = default;
-    ~MulHelper();
-    Status InitParam() override;
-
-    Status Init(OpContext<Ttype> &ctx,
-                const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
-                std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) override;
-    Status InferShape(const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
-                      std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) override;
-
-public:
-    saber::MulParam<Tensor4d<Ttype, Dtype>> _param_mul;
-    saber::Mul<Ttype, Dtype> _funcs_mul;
-};
-}
-} /* namespace anakin */
-```
-对应的```.cpp```文件如下：
-```
-#include "framework/operators/mul.h"
-
-namespace anakin {
-namespace ops {
-
-#ifdef USE_CUDA
-template<>
-void Mul<NV, AK_FLOAT, Precision::FP32>::operator()(
-    OpContext<NV>& ctx,
-    const std::vector<Tensor4dPtr<NV, AK_FLOAT> >& ins,
-    std::vector<Tensor4dPtr<NV, AK_FLOAT> >& outs) {
-    auto* impl =
-        static_cast<MulHelper<NV, AK_FLOAT, Precision::FP32>*>(this->_helper);
-    auto& param =
-        static_cast<MulHelper<NV, AK_FLOAT, Precision::FP32>*>(this->_helper)->_param_mul;
-    impl->_funcs_mul(ins, outs, param, ctx);
-}
-#endif
-
-template<typename Ttype, DataType Dtype, Precision Ptype>
-Status MulHelper<Ttype, Dtype, Ptype>::InitParam() {
-    auto alpha = GET_PARAMETER(float, alpha);
-    MulParam<Tensor4d<Ttype, Dtype>> param_mul(alpha);
-    _param_mul = param_mul;
-    return Status::OK();
-}
-
-template<typename Ttype, DataType Dtype, Precision Ptype>
-Status MulHelper<Ttype, Dtype, Ptype>::Init(OpContext<Ttype>& ctx,
-        const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
-        std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
-
-    SABER_CHECK(_funcs_mul.init(ins, outs, _param_mul, SPECIFY, VENDER_IMPL, ctx));
-    return Status::OK();
-}
-
-template<typename Ttype, DataType Dtype, Precision Ptype>
-Status MulHelper<Ttype, Dtype, Ptype>::InferShape(const
-        std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
-        std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
-    SABER_CHECK(_funcs_mul.compute_output_shape(ins, outs, _param_mul));
-    return Status::OK();
-}
-
-#ifdef USE_CUDA
-template class MulHelper<NV, AK_FLOAT, Precision::FP32>;
-#endif
-#ifdef USE_ARM_PLACE
-template class MulHelper<ARM, AK_FLOAT, Precision::FP32>;
-#endif
-// register helper
-#ifdef USE_CUDA
-ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, NV, AK_FLOAT, Precision::FP32);
-#endif
-#ifdef USE_ARM_PLACE
-ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, ARM, AK_FLOAT, Precision::FP32);
-#endif
-//! register op
-ANAKIN_REGISTER_OP(Mul)
-.Doc("Mul operator")
-#ifdef USE_CUDA
-.__alias__<NV, AK_FLOAT, Precision::FP32>("mul")
-#endif
-#ifdef USE_ARM_PLACE
-.__alias__<ARM, AK_FLOAT, Precision::FP32>("mul")
-#endif
-.num_in(1)
-.num_out(1)
-.Args<float>("alpha", " alpha of Mul "); //注册
-
-} /* namespace ops */
-
-} /* namespace anakin */
-```
-
-## 实现单元测试
-涉及的文件:```test/saber/xxx/test_saber_funcs_mul_xxx.cpp```
-在对应的test下需要添加新的单元测试
-
-```
-TEST(TestSaberFuncNV, test_depthwise_conv) {
-
-    // init tensors and some param.
-
-    // start Reshape & doInfer
-    Context<NV> ctx1(0, 1, 1);
-
-    // create param
-    MulParam<Tensor<NV, AK_FLOAT, NCHW> > param(alpha);
-
-    std::vector<Tensor<NV, AK_FLOAT, NCHW>*> input;
-    std::vector<Tensor<NV, AK_FLOAT, NCHW>*> output;
-
-    // create saber op
-    Mul<NV, AK_FLOAT, AK_FLOAT, AK_FLOAT, NCHW> mul;
-
-    // compute output shape
-    mul.compute_output_shape(input, output, param);
-
-    // re_alloc output tensors memory based on output shape
-    output[0]->re_alloc(output[0]->shape());
-
-    // init saber op(calling init and create)
-    mul.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-
-    // call operator()
-    mul(input, output, param, ctx1);
-
-    // cuda specified, record events
-    cudaStream_t cuda_stream = ctx1.get_compute_stream();
-    output[0]->record_event(cuda_stream);
-    output_dev.sync();
-    
-    // param changed 
-    param.alpha = 2.0;
-    // auto calling saber op(create and dispatch)
-    mul(input, output, param, ctx1);
-
-    cudaDeviceSynchronize();
-    CUDA_CHECK(cudaPeekAtLastError());
-}
-
-int main(int argc, const char** argv){
-    anakin::saber::Env<NV>::env_init();
-
-    // initial logger
-    //logger::init(argv[0]);
-    InitTest();
-    RUN_ALL_TESTS(argv[0]);
-    return 0;
-}
-
-```
-## 调试及注意事项
-
-一个op需要有对外的op接口和内部实现，由于存在saber/funcs/impl的非特化版本声明，当有op在某种设备下没有对应实现时，也能够编译，但此时是没有任何实现的空实现，
diff --git a/source/advanced_usage/deploy/how_to_support_new_device_in_anakin.md b/source/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
deleted file mode 120000
index a1f75f5e95cfb90f26d3782ba30a6d1887a70424..0000000000000000000000000000000000000000
--- a/source/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
+++ /dev/null
@@ -1,459 +0,0 @@
-# 如何支持一个新的设备
-
-## 概览
-
-添加一个新的设备需要以下3个步骤：
-
-* [在`CMakeList`中添加设备的支持](#0001)
-* [在`saber`中添加设备的实现](#0002)
-* [在`framework`中添加设备的具体化或实例化](#0003)
-
-假设新设备的名称为`TNEW`, 以下将以这个设备名称进行演示。
-
-## <span id = '0001'> 在`CMakeList`中添加设备的支持 </span> ##
-
-* 修改根目录`CMakeList.txt`
-```cmake
-#select the plantform to build
-anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." NO)
-anakin_option(USE_X86_PLACE "Select the build mode for X86 place." NO)
-anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO)
-anakin_option(USE_TNEW_PLACE "Select the build mode for ARM place." YES)
-```
-
-* 修改`saber/CMakeList.txt`
-
-根据新增设备的目录完善`saber`目录下的`CMakeList.txt`。
-```cmake
-if(USE_TNEW_PLACE)
-    anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
-    anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
-endif()
-```
-
-* 修改`test/CMakeList.txt`
-
-新增设备的单测文件放在`test/saber/tnew`目录下，修改`test`目录下的`CMakeList.txt`。
-```cmake
-if(USE_TNEW_PLACE)
-    anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/tnew "cpp" ANAKIN_TEST_CASE_SRC)
-endif()
-```
-
-* 修改`cmake/anakin_config.h.in`
-```c++
-// plantform to use
-#cmakedefine USE_GPU_PLACE
-
-#cmakedefine USE_X86_PLACE
-
-#cmakedefine USE_ARM_PLACE
-
-#cmakedefine USE_TNEW_PLACE
-```
-
-* 其他依赖和编译选项    
-修改`cmake`目录下的`compiler_options.cmake`和`find_modules.cmake`
-
-
-## <span id = '0002'> 在`saber`中添加设备的实现 </span> ##
-`saber`是`Anakin`的基础计算库，对外提供设备无关的统一的API，设备相关的实现都会封装到`TargetWrapper`中。
-
-### 在`saber/saber_types.h`中添加设备
-
-```c++
-enum TargetTypeEnum {
-    eINVALID = -1,
-    eNV = 1,
-    eAMD = 2,
-    eARM = 3,
-    eX86 = 4,
-    eNVHX86 = 5,
-    eTNEW = 6
-};
-
-typedef TargetType<eNV> NV;
-typedef TargetType<eARM> ARM;
-typedef TargetType<eAMD> AMD;
-typedef TargetType<eX86> X86;
-typedef TargetType<eTNEW> TNEW;
-
-```
-
-### 在`saber/core`中添加设备的实现
-
-1. 在`target_traits.h`中添加新设备
-
-* 增加设备类型
-```c++
-struct __cuda_device{};
-struct __arm_device{};
-struct __amd_device{};
-struct __x86_device{};
-struct __tnew_device{};
-```
-
-* `TargetTypeTraits`模板具体化
-```c++
-template <>
-struct TargetTypeTraits<TNEW> {
-    typedef __xxx_target target_category;//根据实际设备是host端还是device端进行选择
-    typedef __tnew_device target_type;
-};
-```
-
-2. 在`data_traits.h`中特化`DataTrait`模板类
-
-如果设备需要特殊的数据类型，则特化出设备的`DataTrait`类的实现，例如opencl数据类型的实现如下：
-```c++
-#ifdef USE_OPENCL
-struct ClMem{
-    ClMem(){
-        dmem = nullptr;
-        offset = 0;
-    }
-
-    ClMem(cl_mem* mem_in, int offset_in = 0) {
-        dmem = mem_in;
-        offset = offset_in;
-    }
-
-    ClMem(ClMem& right) {
-        dmem = right.dmem;
-        offset = right.offset;
-    }
-
-    ClMem& operator=(ClMem& right) {
-        this->dmem = right.dmem;
-        this->offset = right.offset;
-        return *this;
-    }
-
-    ClMem& operator+(int offset_in) {
-        this->offset += offset_in;
-        return *this;
-    }
-
-    int offset{0};
-    cl_mem* dmem;
-};
-
-template <>
-struct DataTrait<AMD, AK_FLOAT> {
-    typedef ClMem Dtype;
-    typedef float dtype;
-};
-
-template <>
-struct DataTrait<AMD, AK_DOUBLE> {
-    typedef ClMem Dtype;
-    typedef double dtype;
-};
-
-template <>
-struct DataTrait<AMD, AK_INT8> {
-    typedef ClMem Dtype;
-    typedef char dtype;
-};
-#endif //use_opencl
-```
-
-3. 在`target_wrapper.h`中特化`TargetWrapper`模板类
-
-特化`TargetWrapper`模板类，在`target_wrapper.h`中声明函数，具体如下：
-```c++
-template <>
-struct TargetWrapper<TNEW, __xxx_target> { //根据TNEW的具体类型修改__xxx_target，__host_target或者__device_target
-
-    typedef xxx_event event_t;          //根据设备实现xxx_event
-    typedef xxx_stream stream_t;        //根据设备实现xxx_stream
-
-    static void get_device_count(int& count);
-
-    static void set_device(int id);
-
-    //We should add strategy to avoid malloc directly
-    static void mem_alloc(void** ptr, size_t n);
-
-    static void mem_free(void* ptr);
-
-    static void mem_set(void* ptr, int value, size_t n);
-
-    static void create_event(event_t& event, bool flag = false);
-
-    static void create_stream(stream_t& stream);
-
-    static void create_stream_with_flag(stream_t& stream, unsigned int flag);
-
-    static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority);
-
-    static void destroy_stream(stream_t& stream);
-
-    static void destroy_event(event_t& event);
-
-    static void record_event(event_t& event, stream_t stream);
-
-    static void query_event(event_t& event);
-
-    static void sync_event(event_t& event);
-
-    static void sync_stream(event_t& event, stream_t& stream);
-
-    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-                            size_t count, __DtoD);
-
-    static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-                             size_t count, stream_t& stream, __DtoD);
-
-    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-                            size_t count, __HtoD);
-
-    static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-                             size_t count, stream_t& stream, __HtoD);
-
-    static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-                            size_t count, __DtoH);
-
-    static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
-                             size_t count, stream_t& stream, __DtoH);
-
-    static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
-                                int src_dev, size_t count);
-
-    static void async_memcpy_p2p(void* dst, int dst_dev, const void* src, \
-                                 int src_dev, size_t count, stream_t& stream);
-
-    static int get_device_id();
-};
-
-```
-
-4. 在`impl/`目录下添加设备目录和实现
-
-在`saber/core/impl`目录下添加设备目录`tnew`。
-* 实现`TargetWrapper<TNEW, __xxx_target>`结构体中各函数的定义。    
-如果`TargetWrapper<TNEW, __xxx_target>`的实现与默认的模板类一致，则不用特化出该类。
-
-```c++
-typedef TargetWrapper<TNEW, __xxx_target> TNEW_API;
-void TNEW_API::get_device_count(int &count) {
-    // add implementation
-}
-
-void TNEW_API::set_device(int id){
-    // add implementation
-}
-        
-void TNEW_API::mem_alloc(void** ptr, size_t n){
-    // add implementation
-}
-        
-void TNEW_API::mem_free(void* ptr){
-    if(ptr != nullptr){
-        // add implementation
-    }
-}
-...
-
-```
-
-* 特化实现`device.h`中的`Device<TNEW>`
-
-```c++
-template <>
-void Device<TNEW>::create_stream() {
-    // add implementation
-}
-
-template <>
-void Device<TNEW>::get_info() {
-
-    // add implementation
-}
-
-```
-
-### 在`saber/funcs`中实现设备相关的op
-
-参考[如何增加新的Operator](addCustomOp.md)
-
-
-## <span id = '0003'> 在`framework`中添加设备的具体化或实例化 </span> ##
-
-### `framework/core`
-
-* `net.cpp`中添加实例化
-
-```c++
-#ifdef USE_TNEW_PLACE
-template class Net<TNEW, AK_FLOAT, Precision::FP32, OpRunType::ASYNC>;
-template class Net<TNEW, AK_FLOAT, Precision::FP32, OpRunType::SYNC>;
-#endif
-```
-
-* `operator_func.cpp`中添加实例化
-
-```c++
-#ifdef USE_TNEW_PLACE
-template class OperatorFunc<TNEW, AK_FLOAT, Precision::FP32>;
-#endif
-```
-
-* `worker.cpp`中添加实例化
-
-```c++
-#ifdef USE_TNEW_PLACE
-template class Worker<TNEW, AK_FLOAT, Precision::FP32, OpRunType::ASYNC>;
-template class Worker<TNEW, AK_FLOAT, Precision::FP32, OpRunType::SYNC>;
-#endif
-```
-
-* `operator_attr.cpp`中添加实例化
-
-```c++
-template
-OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::FP32>(const std::string& op_name);
-template
-OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::FP16>(const std::string& op_name);
-template
-OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::INT8>(const std::string& op_name);
-```
-
-* `parameter.h`中添加设备的实现
-
-```c++
-#ifdef USE_TNEW_PLACE
-template<typename Dtype>
-class PBlock<Dtype, TNEW> {
-public:
-	typedef Tensor4d<TNEW, DataTypeRecover<Dtype>::type> type;
-
-	PBlock() {
-		_inner_tensor = std::make_shared<type>(); 
-	}
-	...
-}
-#endif //TNEW
-```
-
-* `type_traits_extend.h`中添加设备的实现
-
-```c++
-template<>
-struct target_host<saber::TNEW> {
-    typedef saber::X86 type; //根据TNEW选择正确的host type
-};
-```
-
-### `framework/graph`
-
-* `graph.cpp`中添加实例化
-  
-```c++
-  #ifdef USE_TNEW_PLACE
-  template class Graph<TNEW, AK_FLOAT, Precision::FP32>;
-  template class Graph<TNEW, AK_FLOAT, Precision::FP16>;
-  template class Graph<TNEW, AK_FLOAT, Precision::INT8>;
-  #endif
-```
-
-### `framework/model_parser`
-
-* `parser.cpp`中添加实例化
-  
-```c++
-  #ifdef USE_TNEW_PLACE
-  template
-  Status load<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
-          const char* model_path);
-  template
-  Status load<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
-          const char* model_path);
-  template
-  Status load<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
-          const char* model_path);
-  
-  template
-  Status save<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
-          std::string& model_path);
-  template
-  Status save<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
-          std::string& model_path);
-  template
-  Status save<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
-          std::string& model_path);
-  
-  template
-  Status load<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
-          std::string& model_path);
-  template
-  Status load<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
-          std::string& model_path);
-  template
-  Status load<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
-          std::string& model_path);
-  
-  template
-  Status save<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
-          const char* model_path);
-  template
-  Status save<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
-          const char* model_path);
-  template
-  Status save<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
-          const char* model_path);
-  #endif
-```
-
-* `model_io.cpp`中添加实例化
-
-```c++
-#ifdef USE_TNEW_PLACE
-template class NodeIO<TNEW, AK_FLOAT, Precision::FP32>;
-template class NodeIO<TNEW, AK_FLOAT, Precision::FP16>;
-template class NodeIO<TNEW, AK_FLOAT, Precision::INT8>;
-#endif
-```
-
-### `framework/operators`
-
-为`framework/operators`目录下所有op添加实例化或具体化
-以`activation.cpp`为例，实例化如下：
-
-```c++
-#ifdef USE_TNEW_PLACE
-INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32);
-INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16);
-INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8);
-template class ActivationHelper<TNEW, AK_FLOAT, Precision::FP32>;
-ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32);
-#endif
-```
-
-如果TNEW设备函数的实现与现有模板实现不一致，可以特化实现如下（以init()为例）：
-```c++
-#ifdef USE_TNEW_PLACE
-INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32);
-INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16);
-INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8);
-template <>
-Status ActivationHelper<TNEW, AK_FLOAT, Precision::FP32>::Init(OpContext<TNEW> &ctx,\
-        const std::vector<Tensor4dPtr<TNEW, AK_FLOAT> >& ins, \
-                std::vector<Tensor4dPtr<TNEW, AK_FLOAT> >& outs) {
-    SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, SPECIFY, SABER_IMPL, ctx)); //在这里选择实现方式
-    return Status::OK();
-}
-ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32);
-#endif
-```
-
-在`ANAKIN_REGISTER_OP(Activation)`中添加TNEW的注册
-
-```c++
-#ifdef USE_TNEW_PLACE
-.__alias__<TNEW, AK_FLOAT, Precision::FP32>("activation")
-#endif
-```
-
-## 注意事项
-不要修改`Tensor`/`Buffer`/`Env`/`Context`这些类函数的接口和实现
diff --git a/source/advanced_usage/deploy/install_anakin.md b/source/advanced_usage/deploy/install_anakin.md
deleted file mode 120000
index bb7c1950308622e3de292268a718e6ec688e6ae6..0000000000000000000000000000000000000000
--- a/source/advanced_usage/deploy/install_anakin.md
+++ /dev/null
@@ -1,69 +0,0 @@
-## 从源码编译安装Anakin ##
-
-我们已经在CentOS 7.3上成功地安装和测试了Anakin，对于其他操作系统，我们将很快支持。
-
-### 安装概览 ###
-
-* [在CentOS上安装 Anakin]()
-* [在Ubuntu上安装 Anakin]()
-* [在ARM上安装 Anakin](run_on_arm_ch.md)
-* [验证安装]()
-
-
-### 在CentOS上安装 Anakin ###
-#### 1. 系统要求 ####
-
-*  make 3.82+
-*  cmake 2.8.12+
-*  gcc 4.8.2+
-*  g++ 4.8.2+
-*  其他需要补充的。。。
-
-#### 2. 编译CPU版Anakin ####
-
-暂时不支持
-
-#### 3. 编译支持NVIDIA GPU的Anakin ####
-
-- 3.1. 安装依赖
-  - 3.1.1 protobuf  
-    >$ git clone https://github.com/google/protobuf  
-    >$ cd protobuf  
-    >$ git submodule update --init --recursive  
-    >$ ./autogen.sh  
-    >$ ./configure --prefix=/path/to/your/install_dir  
-    >$ make  
-    >$ make check  
-    >$ make install  
-    >$ sudo ldconfig
-
-
-    如安装protobuf遇到任何问题，请访问[这里](https://github.com/google/protobuf/blob/master/src/README.md)
-
-- 3.2 CUDA Toolkit
-  - [CUDA 8.0](https://developer.nvidia.com/cuda-zone) or higher. 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
-  - [cuDNN v7](https://developer.nvidia.com/cudnn). 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/). 
-- 3.3  编译Anakin
-  >$ git clone https:/xxxxx  
-  >$ cd anakin  
-  >$ mkdir build  
-  >$ cmake ..  
-  >$ make
-
-
-#### 4. 编译支持AMD GPU的Anakin ####
-
-暂时还不支持
-
-
-### 在Ubuntu上安装 Anakin ###
-
-暂时还不支持
-
-
-### 在ARM上安装 Anakin ###
-
-暂时还不支持
-
-### 验证安装 ###
-we are coming soon...
diff --git a/source/advanced_usage/deploy/mobile_build.md b/source/advanced_usage/deploy/mobile_build.md
deleted file mode 120000
index e51593164987d548e256ddebbc5fa8d960fb5255..0000000000000000000000000000000000000000
--- a/source/advanced_usage/deploy/mobile_build.md
+++ /dev/null
@@ -1,59 +0,0 @@
-# 环境搭建
-## 使用 docker
-### 1. 安装 docker
-安装 docker 的方式，参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/)
-### 2. 使用 docker 搭建构建环境
-首先进入 paddle-mobile 的目录下，执行 `docker build`
-以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行)
-```
-$ docker build -t paddle-mobile:dev - < Dockerfile
-```
-使用 `docker images` 可以看到我们新建的 image
-```
-$ docker images
-REPOSITORY      TAG     IMAGE ID       CREATED         SIZE
-paddle-mobile   dev     33b146787711   45 hours ago    372MB
-```
-### 3. 使用 docker 构建
-进入 paddle-mobile 目录，执行 docker run
-```
-$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
-root@5affd29d4fc5:/ # cd /paddle-mobile
-# 生成构建 android 产出的 Makefile
-root@5affd29d4fc5:/ # rm CMakeCache.txt
-root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
-# 生成构建 linux 产出的 Makefile
-root@5affd29d4fc5:/ # rm CMakeCache.txt
-root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
-```
-### 4. 设置编译选项
-可以通过 ccmake 设置编译选项
-```
-root@5affd29d4fc5:/ # ccmake .
-                                                     Page 1 of 1
- CMAKE_ASM_FLAGS
- CMAKE_ASM_FLAGS_DEBUG
- CMAKE_ASM_FLAGS_RELEASE
- CMAKE_BUILD_TYPE
- CMAKE_INSTALL_PREFIX             /usr/local
- CMAKE_TOOLCHAIN_FILE             /paddle-mobile/tools/toolchains/arm-android-neon.cmake
- CPU                              ON
- DEBUGING                         ON
- FPGA                             OFF
- LOG_PROFILE                      ON
- MALI_GPU                         OFF
- NET                              googlenet
- USE_EXCEPTION                    ON
- USE_OPENMP                       OFF
-```
-修改选项后，按 `c`, `g` 更新 Makefile
-### 5. 构建
-使用 make 命令进行构建
-```
-root@5affd29d4fc5:/ # make
-```
-### 6. 查看构建产出
-构建产出可以在 host 机器上查看，在 paddle-mobile 的目录下，build 以及 test/build 下，可以使用 adb 指令或者 scp 传输到 device 上执行
-
-## 不使用 docker
-不使用 docker 的方法，可以直接用 cmake 生成 makefile 后构建。使用 ndk 构建 android 应用需要正确设置 NDK_ROOT。构建 linux 应用需要安装 arm-linux-gnueabi-gcc 或者类似的交叉编译工具，可能需要设置 CC，CXX 环境变量，或者在 tools/toolchains/ 中修改 arm-linux-gnueabi.cmake，或者增加自己需要的 toolchain file。
diff --git a/source/advanced_usage/deploy/mobile_dev.md b/source/advanced_usage/deploy/mobile_dev.md
deleted file mode 120000
index 474380f9dbfd2fb8a06630cb1ca3ca5cd14ca9d9..0000000000000000000000000000000000000000
--- a/source/advanced_usage/deploy/mobile_dev.md
+++ /dev/null
@@ -1,72 +0,0 @@
-# iOS开发文档
-
-## 编译
-
-### 一. 使用 build.sh 编译
-
-```sh 
-sh build.sh ios
-
-# 如果只想编译某个特定模型的 op, 则需执行以下命令
-sh build.sh ios googlenet
-
-# 在这个文件夹下, 你可以拿到生成的 .a 库
-cd ../build/release/ios/build
-
-```
-
-### 二. 使用 xcode 编译
-
-我们提供了 ios 开发更为熟悉的 xcode 编译环境:
-在 ios/ 目录下打开 PaddleMobile.xcworkspace 即可编译 PaddleMobile 或者 运行 Demo
-
-### 三. 集成
-
-#### 如使用 c++ 接口
-将 
-
-```
-libpaddle-mobile.a 
-io.h  
-program.h 
-types.h 
-lod_tensor.h 
-tensor.h
-```
-拖入工程, io.h 为接口文件, 可在 [github](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/io.h)上查看接口注释
-
-#### 如使用 oc 接口
-将在xcode 编译生成的
-```
-libPaddleMobile.a 
-PaddleMobile.h
-```
-拖入工程, 接口如下:
-
-```
-/*
-	创建单例对象
-*/
-+ (instancetype)sharedInstance;
-
-/*
-	load 模型, 开辟内存
-*/
-- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
-
-/*
-	进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
-*/
-- (NSArray *)predict:(CGImageRef)image means:(NSArray<NSNumber *> *)means scale:(float)scale;
-
-/*
-	进行预测
-*/
-- (NSArray *)predict:(CGImageRef)image;
-
-/*
-	清理内存
-*/
-- (void)clear;
-
-```
diff --git a/source/advanced_usage/deploy/native_infer.rst b/source/advanced_usage/deploy/native_infer.rst
deleted file mode 100644
index e1eee3f818796e895362caab10846cf59b557162..0000000000000000000000000000000000000000
--- a/source/advanced_usage/deploy/native_infer.rst
+++ /dev/null
@@ -1,108 +0,0 @@
-Paddle 预测 API
-===============
-
-为了更简单方便的预测部署，Fluid 提供了一套高层 API
-用来隐藏底层不同的优化实现。
-
-`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__
-包括
-
--  头文件 ``paddle_inference_api.h`` 定义了所有的接口
--  库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``
--  库文件 ``libpaddle_inference_api.so`` 或
-   ``libpaddle_inference_api.a``
-
-编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。
-
-下面是一些 API 概念的介绍
-
-PaddleTensor
-------------
-
-PaddleTensor 定义了预测最基本的输入输出的数据格式，其定义是
-
-.. code:: cpp
-
-    struct PaddleTensor {
-      std::string name;  // variable name.
-      std::vector<int> shape;
-      PaddleBuf data;  // blob of data.
-      PaddleDType dtype;
-    };
-
--  ``name`` 用于指定输入数据对应的 模型中variable 的名字
-   （暂时没有用，但会在后续支持任意 target 时启用）
--  ``shape`` 表示一个 Tensor 的 shape
--  ``data`` 数据以连续内存的方式存储在\ ``PaddleBuf``
-   中，\ ``PaddleBuf``
-   可以接收外面的数据或者独立\ ``malloc``\ 内存，详细可以参考头文件中相关定义。
--  ``dtype`` 表示 Tensor 的数据类型
-
-engine
-------
-
-高层 API 底层有多种优化实现，我们称之为 engine，目前有三种 engine
-
--  原生 engine，由 paddle 原生的 forward operator
-   组成，可以天然支持所有paddle 训练出的模型，
--  Anakin engine，封装了
-   `Anakin <https://github.com/PaddlePaddle/Anakin>`__
-   ，在某些模型上性能不错，但只能接受自带模型格式，无法支持所有 paddle
-   模型，
--  TensorRT mixed engine，用子图的方式支持了
-   `TensorRT <https://developer.nvidia.com/tensorrt>`__ ，支持所有paddle
-   模型，并自动切割部分计算子图到 TensorRT 上加速（WIP）
-
-其实现为
-
-.. code:: cpp
-
-    enum class PaddleEngineKind {
-      kNative = 0,       // Use the native Fluid facility.
-      kAnakin,           // Use Anakin for inference.
-      kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops.
-    };
-
-预测部署过程
-------------
-
-总体上分为以下步骤
-
-1. 用合适的配置创建 ``PaddlePredictor``
-2. 创建输入用的 ``PaddleTensor``\ ，传入到 ``PaddlePredictor`` 中
-3. 获取输出的 ``PaddleTensor`` ，将结果取出
-
-下面完整演示一个简单的模型，部分细节代码隐去
-
-.. code:: cpp
-
-    #include "paddle_inference_api.h"
-
-    // 创建一个 config，并修改相关设置
-    paddle::NativeConfig config;
-    config.model_dir = "xxx";
-    config.use_gpu = false;
-    // 创建一个原生的 PaddlePredictor
-    auto predictor =
-          paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-    // 创建输入 tensor
-    int64_t data[4] = {1, 2, 3, 4};
-    paddle::PaddleTensor tensor{.name = "",
-                                .shape = std::vector<int>({4, 1}),
-                                .data = PaddleBuf(data, sizeof(data)),
-                                .dtype = PaddleDType::INT64};
-    // 创建输出 tensor，输出 tensor 的内存可以复用
-    std::vector<paddle::PaddleTensor> outputs;
-    // 执行预测
-    CHECK(predictor->Run(slots, &outputs));
-    // 获取 outputs ...
-
-编译时，联编 ``libpaddle_fluid.a/.so`` 和
-``libpaddle_inference_api.a/.so`` 便可。
-
-详细代码参考
-------------
-
--  `inference
-   demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference/demo>`__
--  `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc>`__
diff --git a/source/advanced_usage/deploy/native_inference_engine.rst b/source/advanced_usage/deploy/native_inference_engine.rst
deleted file mode 100644
index e1eee3f818796e895362caab10846cf59b557162..0000000000000000000000000000000000000000
--- a/source/advanced_usage/deploy/native_inference_engine.rst
+++ /dev/null
@@ -1,108 +0,0 @@
-Paddle 预测 API
-===============
-
-为了更简单方便的预测部署，Fluid 提供了一套高层 API
-用来隐藏底层不同的优化实现。
-
-`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__
-包括
-
--  头文件 ``paddle_inference_api.h`` 定义了所有的接口
--  库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``
--  库文件 ``libpaddle_inference_api.so`` 或
-   ``libpaddle_inference_api.a``
-
-编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。
-
-下面是一些 API 概念的介绍
-
-PaddleTensor
-------------
-
-PaddleTensor 定义了预测最基本的输入输出的数据格式，其定义是
-
-.. code:: cpp
-
-    struct PaddleTensor {
-      std::string name;  // variable name.
-      std::vector<int> shape;
-      PaddleBuf data;  // blob of data.
-      PaddleDType dtype;
-    };
-
--  ``name`` 用于指定输入数据对应的 模型中variable 的名字
-   （暂时没有用，但会在后续支持任意 target 时启用）
--  ``shape`` 表示一个 Tensor 的 shape
--  ``data`` 数据以连续内存的方式存储在\ ``PaddleBuf``
-   中，\ ``PaddleBuf``
-   可以接收外面的数据或者独立\ ``malloc``\ 内存，详细可以参考头文件中相关定义。
--  ``dtype`` 表示 Tensor 的数据类型
-
-engine
-------
-
-高层 API 底层有多种优化实现，我们称之为 engine，目前有三种 engine
-
--  原生 engine，由 paddle 原生的 forward operator
-   组成，可以天然支持所有paddle 训练出的模型，
--  Anakin engine，封装了
-   `Anakin <https://github.com/PaddlePaddle/Anakin>`__
-   ，在某些模型上性能不错，但只能接受自带模型格式，无法支持所有 paddle
-   模型，
--  TensorRT mixed engine，用子图的方式支持了
-   `TensorRT <https://developer.nvidia.com/tensorrt>`__ ，支持所有paddle
-   模型，并自动切割部分计算子图到 TensorRT 上加速（WIP）
-
-其实现为
-
-.. code:: cpp
-
-    enum class PaddleEngineKind {
-      kNative = 0,       // Use the native Fluid facility.
-      kAnakin,           // Use Anakin for inference.
-      kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops.
-    };
-
-预测部署过程
-------------
-
-总体上分为以下步骤
-
-1. 用合适的配置创建 ``PaddlePredictor``
-2. 创建输入用的 ``PaddleTensor``\ ，传入到 ``PaddlePredictor`` 中
-3. 获取输出的 ``PaddleTensor`` ，将结果取出
-
-下面完整演示一个简单的模型，部分细节代码隐去
-
-.. code:: cpp
-
-    #include "paddle_inference_api.h"
-
-    // 创建一个 config，并修改相关设置
-    paddle::NativeConfig config;
-    config.model_dir = "xxx";
-    config.use_gpu = false;
-    // 创建一个原生的 PaddlePredictor
-    auto predictor =
-          paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-    // 创建输入 tensor
-    int64_t data[4] = {1, 2, 3, 4};
-    paddle::PaddleTensor tensor{.name = "",
-                                .shape = std::vector<int>({4, 1}),
-                                .data = PaddleBuf(data, sizeof(data)),
-                                .dtype = PaddleDType::INT64};
-    // 创建输出 tensor，输出 tensor 的内存可以复用
-    std::vector<paddle::PaddleTensor> outputs;
-    // 执行预测
-    CHECK(predictor->Run(slots, &outputs));
-    // 获取 outputs ...
-
-编译时，联编 ``libpaddle_fluid.a/.so`` 和
-``libpaddle_inference_api.a/.so`` 便可。
-
-详细代码参考
-------------
-
--  `inference
-   demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference/demo>`__
--  `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc>`__
diff --git a/source/advanced_usage/deploy/run_anakin_on_arm.md b/source/advanced_usage/deploy/run_anakin_on_arm.md
deleted file mode 120000
index ebeb38f534ebfc8cb5a41d103abe3bb1de7e379a..0000000000000000000000000000000000000000
--- a/source/advanced_usage/deploy/run_anakin_on_arm.md
+++ /dev/null
@@ -1,151 +0,0 @@
-## 源码编译 Anakin ##
-
-目前Anakin支持ARM Android平台，采用Android NDK交叉编译工具链，已在mac os和centos上编译和测试通过。
-
-### 安装概览 ###
-
-* [系统需求](#0001)
-* [安装第三方依赖](#0002)
-* [Anakin源码编译](#0003)
-* [验证安装](#0004)
-
-
-### <span id = '0001'> 1. 系统需求 </span> ###
-
-*  宿主机: linux, mac    
-*  cmake 3.8.2+    
-*  Android NDK r14, Linux 版本[从这里下载](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip)
-
-### <span id = '0002'> 2. 安装第三方依赖 </span> ###
-
-- 2.1 protobuf3.4.0     
-   源码从这里[下载](https://github.com/google/protobuf/releases/tag/v3.4.0)    
- - 2.1.1 为宿主机编译protobuf     
- ```bash
-   $ tar -xzf protobuf-3.4.0.tar.gz  
-   $ cd protobuf-3.4.0   
-   $ ./autogen.sh  
-   $ ./configure    
-   $ make  
-   $ make check   
-   $ make install
-   ```
-   上述 $make install 执行后，可在 /usr/local/include/google 找到 libprotobuf 所需的头文件,将整个google文件夹拷贝至Anakin/third-party/arm-android/protobuf/下，
-   如有问题，请点[这里](https://github.com/google/protobuf/blob/v3.4.0/src/README.md)。
-   然后将已经生成文件清除。
- ```bash
-   $ make distclean
-   ```
- - 2.1.2 交叉编译Android`armeabi-v7a`的protobuf，注意设置ANDROID_NDK的路径，以及ARCH_ABI、HOSTOSN的值，   
- ```bash
-
-   $ export ANDROID_NDK=your_ndk_path 
-   $ ARCH_ABI="arm-linux-androideabi-4.9"
-   $ HOSTOSN="darwin-x86_64"
-   $ export SYSROOT=$ANDROID_NDK/platforms/android-9/arch-arm  
-   $ export PREBUILT=$ANDROID_NDK/toolchains/$ARCH_ABI
-   $ export LDFLAGS="--sysroot=$SYSROOT"
-   $ export LD="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/arm-linux-androideabi/bin/ld $LDFLAGS"
-   $ export LIBS="-llog $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/libgnustl_static.a"
-   $ export CPPFLAGS=""
-   $ export INCLUDES="-I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/include/ -I$ANDROID_NDK/platforms/android-9/arch-arm/usr/include/ -I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/include/"
-   $ export CXXFLAGS="-march=armv7-a -mfloat-abi=softfp -DGOOGLE_PROTOBUF_NO_RTTI --sysroot=$SYSROOT"
-   $ export CCFLAGS="$CXXFLAGS"
-   $ export CXX="$PREBUILT/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-g++ $CXXFLAGS"
-   $ export CC="$CXX"
-   $ export RANLIB="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-ranlib"  
-   $ ./autogen.sh  
-   $ ./configure --host=arm-linux-androideabi --with-sysroot=$SYSROOT --enable-cross-compile --with-protoc=protoc --disable-shared CXX="$CXX" CC="$CC" LD="$LD"  
-   $ make
-  ```
-  
-  编译生成 *.a 静态库，若希望编译*.so 动态链接库 ，请在./configure参数中改--disable-shared为--disable-static --enable-shared。  
-  生成文件在src/.libs/下，将生成的文件拷贝至Anakin/third-party/arm-android/protobuf/lib下。  
-  在[cmake](../../cmake/find_modules.cmake)中更新`ARM_RPOTO_ROOT`的路径。        
-  ```cmake
-  set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf")
-  ```
-  
-- 2.2 opencv 2.4.3+(optional)    
-    Anakin只在examples示例中使用opencv   
-    Android系统的opencv从[这里下载](https://opencv.org/releases.html)    
-    解压后将 `3rdparty/libs/armeabi-v7a`中的库文件拷贝到`libs/armeabi-v7a`    
-    在[cmake](../../cmake/find_modules.cmake)中搜索`anakin_find_opencv`, 
-    并设置 `include_directories` 和 `LINK_DIRECTORIES`为自己安装的库的路径。   
-    ```cmake
-    include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/)
-    LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/)
-    ```
-### <span id = '0003'> 3. Anakin源码编译 </span> ###
-
-#### 编译Android版本
-
-   克隆[源码](https://github.com/PaddlePaddle/Anakin/tree/arm)
-```bash
-    cd your_dir
-    git clone https://github.com/PaddlePaddle/Anakin.git
-    cd Anakin
-    git fetch origin arm
-    git checkout arm
-  ```
-  修改`android_build.sh`    
-- 修改NDK路径    
-  ```bash
-    #modify "your_ndk_path" to your NDK path
-    export ANDROID_NDK=your_ndk_path
-  ```
-- 修改ARM 处理器架构     
-  对于32位ARM处理器, 将ANDROID_ABI 设置为 `armeabi-v7a with NEON`， 
-  对于64位ARM处理器, 可以将ANDROID_ABI 设置为 `armeabi-v7a with NEON`或者`arm64-v8a`。        
-  目前我们只支持 `armeabi-v7a with NEON`；`arm64-v8a` 还在开发中。      
-  ```bash
-      -DANDROID_ABI="armeabi-v7a with NEON"
-  ```
-- 设置Android API    
-  根据Android系统的版本设置API level， 例如API Level 21 -> Android 5.0.1    
-  ```bash
-      -DANDROID_NATIVE_API_LEVEL=21
-  ```
-
-- 选择编译静态库或动态库    
-  设置`BUILD_SHARED=NO`编译静态库    
-  设置`BUILD_SHARED=YES`编译动态库    
-  ```bash
-      -DBUILD_SHARED=NO
-  ```
-- OpenMP多线程支持    
-  设置`USE_OPENMP=YES`开启OpenMP多线程    
-  ```bash
-      -DUSE_OPENMP=YES
-  ```
-  
-- 编译单测文件    
-  设置`BUILD_WITH_UNIT_TEST=YES`将会编译单测文件    
-    ```bash
-        -DBUILD_WITH_UNIT_TEST=YES
-    ```
-
-- 编译示例文件    
-  设置`BUILD_EXAMPLES=YES`将会编译示例文件    
-    ```bash
-        -DBUILD_EXAMPLES=YES
-    ```
-  
-- 开启opencv    
-  如果使用opencv，设置`USE_OPENCV=YES`    
-    ```bash
-        -DUSE_OPENCV=YES
-    ```
-    
-- 开始编译    
-  运行脚本 `android_build.sh` 将自动编译Anakin     
-  ```bash
-      ./android_build.sh
-  ```
-
-### <span id = '0004'> 4. 验证安装 </span> ###    
-  编译好的库会放在目录`${Anakin_root}/output`下；    
-  编译好的单测文件会放在`${Anakin_root}/output/unit_test`目录下；    
-  编译好的示例文件会放在`${Anakin_root}/output/examples`目录下。
-  
-  对于Android系统，打开设备的调试模式，通过ADB可以访问的目录是`data/local/tmp`，通过ADB push将测试文件、模型和数据发送到设备目录， 运行测试文件。
diff --git a/source/advanced_usage/development/contribute_to_paddle.md b/source/advanced_usage/development/contribute_to_paddle.md
deleted file mode 120000
index a2242fa22cafa7899d7202d5032bc22d9debba4b..0000000000000000000000000000000000000000
--- a/source/advanced_usage/development/contribute_to_paddle.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../paddle/doc/fluid/dev/contribute_to_paddle_cn.md
\ No newline at end of file
diff --git a/source/advanced_usage/development/cpu_profiling_cn.md b/source/advanced_usage/development/cpu_profiling_cn.md
deleted file mode 120000
index 855ddcb09b88db49cb482850a5bbe9a3ee8c3278..0000000000000000000000000000000000000000
--- a/source/advanced_usage/development/cpu_profiling_cn.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../paddle/doc/fluid/howto/optimization/cpu_profiling_cn.md
\ No newline at end of file
diff --git a/source/advanced_usage/development/gpu_profiling_cn.rst b/source/advanced_usage/development/gpu_profiling_cn.rst
deleted file mode 120000
index f2396716bddd4810fa77c738d41f5482aa6d6055..0000000000000000000000000000000000000000
--- a/source/advanced_usage/development/gpu_profiling_cn.rst
+++ /dev/null
@@ -1,242 +0,0 @@
-============
-GPU性能调优
-============
-
-..  contents::
-
-此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。
-
-- 什么是性能分析？
-- 为什么需要性能分析？
-- 如何进行性能分析？
-- 性能分析工具介绍
-- 详细教程
-- 性能分析小技巧
-
-什么是性能分析？
-================
-在软件工程的范畴里，性能分析（Profiling）是一个动态程序分析的术语，它可以指测量一个程序的空间（内存）复杂度或时间复杂度，
-也可以说是某些特定指令的使用情况，或者是函数调用的频率和耗时等。通常情况下，分析得到的信息用于协助进行程序的优化。
-
-简单来说，性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为，那程序分析工具是必不可少的利器。简单的性能分析，可以告诉您某个操作到底花了多长时间？而更深入的分析，甚至能解释为什么某个操作花了很长时间？
-
-为什么需要性能分析？
-============================
-训练好一个深层神经网络通常要耗费非常长的时间，所以性能也就逐步变成了深度学习领域最重要的指标。
-而优化性能的首要任务，是需要了解哪些步骤拖慢了整体。
-如果某一块根本就不怎么耗时，那也就不需要急着优化性能啦！
-
-如何进行性能分析？
-========================
-为了达到性能最优，您可以采用下面五个步骤：
-
-- 对代码进行性能分析
-- 找到运行慢的部分
-- 找到运行慢的原因
-- 修改成更快的版本
-- 再次对代码进行性能分析
-
-Usually, processor has two key performance limits include float point throughput and
-memory throughput. For GPU,  it also need more parallelism to fulfill its potential.
-This is why they can be so fast.
-
-通常情况下，处理器有两个关键性能限制：一个是浮点计算量，另一个是内存操作量。
-GPU则还需要高并行性，才能发挥其全部能力。这正是它们速度快的原因。
-
-性能分析工具介绍
-======================
-就通常的GPU性能分析来说，市面上已经有NVIDIA或第三方提供的众多工具。
-
-**nvprof** 是Nvidia性能分析工具， **nvvp** 则是带GUI的Nvidia可视化性能分析工具。
-在这个教程中，我们主要会介绍nvprof和nvvp。
-
-:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate
-above profilers.
-
-:code:`paddle/legacy/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
-
-.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
-   :language: c++
-   :lines: 137-151
-   :linenos:
-
-上述的代码片段包含了两种方法，您可以任意使用一个或两个来对感兴趣的代码段做性能分析。
-
-1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装，可以用来计算CPU函数或cuda内核的时间消耗。
-
-2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
-program crashes when CPU version of PaddlePaddle invokes them.
-
-3. :code:`REGISTER_GPU_PROFILER` 是一个封装对象，封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfileStop` 两个操作；同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。
-
-您会在接下来的部分中获得更多的细节介绍。
-
-详细教程
-============
-
-内置定时器
-------------
-
-如果想要启用PaddlePaddle的内置定时器，您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。
-接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。
-下面举个简单的例子：
-
-1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数（如高亮部分）。
-
-    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
-        :language: c++
-        :lines: 137-151
-        :emphasize-lines: 8-12,14
-        :linenos:
-
-2. cmake配置中将 **WITH_TIMER** 打开，重新编译PaddlePaddle。
-
-    .. code-block:: bash
-
-        cmake .. -DWITH_TIMER=ON
-        make
-
-3. 执行您的代码，并观察结果(如高亮部分）。
-
-    .. code-block:: bash
-        :emphasize-lines: 1,12-15
-
-        > ./paddle/legacy/math/tests/test_GpuProfiler
-        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler
-        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
-        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
-        [==========] Running 1 test from 1 test case.
-        [----------] Global test environment set-up.
-        [----------] 1 test from Profiler
-        [ RUN      ] Profiler.BilinearFwdBwd
-        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
-        gSizeX = 64, imgSizeY = 64"
-        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
-        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
-        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
-        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
-        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
-        [       OK ] Profiler.BilinearFwdBwd (136 ms)
-        [----------] 1 test from Profiler (136 ms total)
-
-        [----------] Global test environment tear-down
-        [==========] 1 test from 1 test case ran. (136 ms total)
-        [  PASSED  ] 1 test.
-
-nvprof 工具
-----------------
-
-要使用命令行分析工具 **nvprof**，您按如下步骤操作即可：
-
-1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中（参考强调部分）。
-
-    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
-        :language: c++
-        :lines: 137-151
-        :emphasize-lines: 6-7
-        :linenos:
-
-2. cmake中将 **WITH_PROFILER** 配置打开，重新编译PaddlePaddle。
-
-    .. code-block:: bash
-
-        cmake .. -DWITH_PROFILER=ON
-        make
-
-3. 使用 **nvprof** 来分析执行文件。
-
-    .. code-block:: bash
-
-        nvprof  ./paddle/legacy/math/tests/test_GpuProfiler
-
-然后，您就能获得如下的分析结果：
-
-.. code-block:: bash
-
-    ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler
-    ==78544== Profiling result:
-    Time(%)     Time     Calls       Avg       Min       Max  Name
-    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
-    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
-    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
-    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
-
-    ==78544== API calls:
-    Time(%)     Time     Calls       Avg       Min       Max  Name
-    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
-    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
-    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
-    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
-    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
-    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
-    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
-    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
-    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
-    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
-    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
-    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
-    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
-    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
-    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
-    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
-    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
-    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
-    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
-    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
-    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
-    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
-    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
-    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
-    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
-    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
-    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
-
-
-nvvp 工具
---------------
-
-如果想使用可视化的分析器 **nvvp**，您可以导入 :code:`nvprof -o ...` 的输出，或者从工具的界面里运行您的应用。
-
-**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启）
-
-..  image:: nvvp1.png
-    :align: center
-    :scale: 33%
-
-从内核函数的角度， **nvvp** 可以精确说明一个长耗时操作的具体原因。
-同时，如下图所示， **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。
-
-
-..  image:: nvvp2.png
-    :align: center
-    :scale: 33%
-
-而从应用的角度， **nvvp** 可以帮您提供一些定位性能瓶颈的建议。
-例如，下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议，为您做性能调优提供了方向。
-
-..  image:: nvvp3.png
-    :align: center
-    :scale: 33%
-
-..  image:: nvvp4.png
-    :align: center
-    :scale: 33%
-
-性能分析小技巧
-==================
-
-- 开始阶段，从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。
-- 接下来可以考虑下时间线的分析。
-- 如果真想挖掘内核深处的某个秘密，您最好先确认：这一块的耗时比例真的太高，值得深入分析。
-- 可能的情况下，试着让输出的分析数据和理论值对应。
-
-    1) 例如，如果我知道内核花了10ms来移动1GB数据，那我会期望分析工具统计到速度是100GB/s。
-    2) 若有不一致之处，很有可能实际应用就是没有按照您的预期情况运行。
-- 了解您的硬件：如果您的GPU理论可以达到6 TFLOPs（6万亿次浮点运算每秒），而当前已经有5.5 TFLOPs了，那估计这里的潜力就没啥好挖的了……
-
-性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果！
-当然，具体情况因人而异。
-
-参考资料
-===========
-Jeremy Appleyard, `GPU Profiling for Deep Learning <http://www.robots.ox.ac.uk/~seminars/seminars/Extra/2015_10_08_JeremyAppleyard.pdf>`_, 2015
diff --git a/source/advanced_usage/development/host_memory_profiling_cn.md b/source/advanced_usage/development/host_memory_profiling_cn.md
deleted file mode 120000
index b32efeb71263f9aca15d7713865ebae936be4fff..0000000000000000000000000000000000000000
--- a/source/advanced_usage/development/host_memory_profiling_cn.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../paddle/doc/fluid/howto/optimization/host_memory_profiling_cn.md
\ No newline at end of file
diff --git a/source/advanced_usage/development/new_op.md b/source/advanced_usage/development/new_op.md
deleted file mode 120000
index f3ead50ea0fc0d01531f05265eafc7f11341134b..0000000000000000000000000000000000000000
--- a/source/advanced_usage/development/new_op.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../paddle/doc/fluid/dev/new_op_cn.md
\ No newline at end of file
diff --git a/source/advanced_usage/development/timeline_cn.md b/source/advanced_usage/development/timeline_cn.md
deleted file mode 120000
index d2980e6ab2a3141ed05aecf986b9a54d536ad87e..0000000000000000000000000000000000000000
--- a/source/advanced_usage/development/timeline_cn.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../paddle/doc/fluid/howto/optimization/timeline_cn.md
\ No newline at end of file
diff --git a/source/advanced_usage/development/write_docs.rst b/source/advanced_usage/development/write_docs.rst
deleted file mode 120000
index 901197c6aaf5f6b75b40b70fc3367ce9ac6a2935..0000000000000000000000000000000000000000
--- a/source/advanced_usage/development/write_docs.rst
+++ /dev/null
@@ -1 +0,0 @@
-../../../paddle/doc/fluid/dev/write_docs_cn.rst
\ No newline at end of file
diff --git a/source/api_guides/high_level/index.rst b/source/api_guides/high_level/index.rst
deleted file mode 100644
index be91b4195845750079b7171f13497c2ec6d677bd..0000000000000000000000000000000000000000
--- a/source/api_guides/high_level/index.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-.. _api_guide_high_level_api:
-
-##############
-High level API
-##############
-
diff --git a/source/api_guides/index.rst b/source/api_guides/index.rst
deleted file mode 100644
index fe624a6ae20297d4cc4f4b91d51d7cee721ef82a..0000000000000000000000000000000000000000
--- a/source/api_guides/index.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-#########
-API Guide
-#########
-
-..  todo::
-
-    Complete this doc
-
-
-..  toctree::
-    :maxdepth: 4
-
-    high_level/index.rst
-    low_level/index.rst
\ No newline at end of file
diff --git a/source/api_guides/low_level/executor/executor.rst b/source/api_guides/low_level/executor/executor.rst
deleted file mode 100644
index aed355785c7b506711837fae3121f4ff4dbc98e2..0000000000000000000000000000000000000000
--- a/source/api_guides/low_level/executor/executor.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-..  _api_guide_executor:
-
-########
-Executor
-########
-
-:code:`Executor` 即 :code:`执行器` 。PaddlePaddle Fluid中有两种执行器可以选择。
-:code:`Executor` 实现了一个简易的执行器，所有Operator会被顺序执行。用户可以使用
-Python脚本驱动 :code:`Executor` 执行。默认情况下 :code:`Executor` 是单线程的，如果
-想使用数据并行，请参考另一个执行器， :ref:`api_guide_parallel_executor` 。
-
-:code:`Executor` 的代码逻辑非常简单。建议用户在调试过程中，先使用
-:code:`Executor` 跑通模型，再切换到多设备计算，甚至多机计算。
-
-:code:`Executor` 在构造的时候接受一个 :code:`Place`， 它们可以是 :ref:`api_fluid_CPUPlace`
-或 :ref:`api_fluid_CUDAPlace` 。 :code:`Executor` 在执行的时候可以选择执行的
-:ref:`api_guide_low_level_program` 。
-
-简单的使用方法，请参考 :ref:`quick_start_fit_a_line` , API Reference 请参考
-:ref:`api_fluid_Executor` 。
diff --git a/source/api_guides/low_level/executor/parallel_executor.rst b/source/api_guides/low_level/executor/parallel_executor.rst
deleted file mode 100644
index 01c48dd6abfe1c5c89dfdfbe4a46765ef6ee7d90..0000000000000000000000000000000000000000
--- a/source/api_guides/low_level/executor/parallel_executor.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. _api_guide_parallel_executor:
-
-################
-ParallelExecutor
-################
\ No newline at end of file
diff --git a/source/api_guides/low_level/index.rst b/source/api_guides/low_level/index.rst
deleted file mode 100644
index dda51d83922ca6a314d050b3de6f805842d938e4..0000000000000000000000000000000000000000
--- a/source/api_guides/low_level/index.rst
+++ /dev/null
@@ -1,82 +0,0 @@
-#############
-Low level API
-#############
-
-Layers
-######
-
-神经网络的主体API是一些层函数，他们包括
-
-..  toctree::
-    :maxdepth: 1
-
-    layers/math.rst
-    layers/activations.rst
-    layers/convolution.rst
-    layers/pooling.rst
-    layers/preprocessing.rst
-    layers/io.rst
-    layers/metrics.rst
-    layers/detection.rst
-
-执行引擎
-########
-
-..  toctree::
-
-    executor/executor.rst
-    executor/parallel_executor.rst
-
-数据读取
-########
-
-参数属性与参数初始化(ParamAttr)
-###############################
-
-
-预测引擎
-########
-
-..  _api_guide_low_level_program:
-
-Program
-#######
-
-Block
-#####
-
-Variable
-########
-
-Scope
-#####
-
-Place
-#####
-
-
-CreateOperator
-##############
-
-Backward
-########
-
-模型平均(Model Average)
-#######################
-
-Optimizers
-##########
-
-正则化
-######
-
-Transpiler
-##########
-
-Gradient Clipping
-#################
-
-
-调试工具/VisualDL
-#################
-
diff --git a/source/api_guides/low_level/layers/activations.rst b/source/api_guides/low_level/layers/activations.rst
deleted file mode 100644
index 88191af24241abe525547fe1069e502ccdc4d9c1..0000000000000000000000000000000000000000
--- a/source/api_guides/low_level/layers/activations.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-########
-激活函数
-########
-
diff --git a/source/api_guides/low_level/layers/convolution.rst b/source/api_guides/low_level/layers/convolution.rst
deleted file mode 100644
index 64df924a86171020adc90f6fd17d98d3e2e671d8..0000000000000000000000000000000000000000
--- a/source/api_guides/low_level/layers/convolution.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-########
-卷积操作
-########
diff --git a/source/api_guides/low_level/layers/detection.rst b/source/api_guides/low_level/layers/detection.rst
deleted file mode 100644
index 00a3fc40cb62d349f0d94b16c782cc263ddc9739..0000000000000000000000000000000000000000
--- a/source/api_guides/low_level/layers/detection.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-########
-图像检测
-########
-
diff --git a/source/api_guides/low_level/layers/io.rst b/source/api_guides/low_level/layers/io.rst
deleted file mode 100644
index 8b036355af923194663c8671f1ea0dd42773129e..0000000000000000000000000000000000000000
--- a/source/api_guides/low_level/layers/io.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-########
-输入输出
-########
-
-
-..  _api_guide_reader:
-
-Reader相关API
-#############
\ No newline at end of file
diff --git a/source/api_guides/low_level/layers/math.rst b/source/api_guides/low_level/layers/math.rst
deleted file mode 100644
index 2c8cc56091a7adcf10f2bbc24a32ca8c522b93cc..0000000000000000000000000000000000000000
--- a/source/api_guides/low_level/layers/math.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-########
-数学算子
-########
-
diff --git a/source/api_guides/low_level/layers/metrics.rst b/source/api_guides/low_level/layers/metrics.rst
deleted file mode 100644
index 42a82f606a862144e4a92af08cc3e99f2d689148..0000000000000000000000000000000000000000
--- a/source/api_guides/low_level/layers/metrics.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-########
-评价指标
-########
\ No newline at end of file
diff --git a/source/api_guides/low_level/layers/pooling.rst b/source/api_guides/low_level/layers/pooling.rst
deleted file mode 100644
index ecca8a465debba9173e6ab1b1cb2c40ecc641f06..0000000000000000000000000000000000000000
--- a/source/api_guides/low_level/layers/pooling.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-########
-池化操作
-########
-
-
diff --git a/source/api_guides/low_level/layers/preprocessing.rst b/source/api_guides/low_level/layers/preprocessing.rst
deleted file mode 100644
index 7ab81de5759a2de1dbdad27c794ab7d99ae89373..0000000000000000000000000000000000000000
--- a/source/api_guides/low_level/layers/preprocessing.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-##########
-预处理操作
-##########
-
-
-
-图像预处理操作
-##############
-
-
-语音预处理操作
-##############
\ No newline at end of file
diff --git a/source/api_guides/low_level/lodtensor.rst b/source/api_guides/low_level/lodtensor.rst
deleted file mode 100644
index fe585bed8b08148bc4d0b74c70371ec4f8b87f34..0000000000000000000000000000000000000000
--- a/source/api_guides/low_level/lodtensor.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-..  _api_guide_lod_tensor:
-
-#########
-LoDTensor
-#########
diff --git a/source/api_guides/low_level/recordio.rst b/source/api_guides/low_level/recordio.rst
deleted file mode 100644
index 9e28cb25176a1a5509f3a9a9af09fec73e1e1b41..0000000000000000000000000000000000000000
--- a/source/api_guides/low_level/recordio.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-############
-RecordIO文件
-############
-
-
-RecordIO转换API
-###############
-
-
-
-.. _api_guide_recordio_file_format:
-
-RecordIO文件格式
-################
diff --git a/source/api_reference/average.rst b/source/api_reference/average.rst
deleted file mode 100644
index 48f208301e708964ca781f3b87d842646e73e092..0000000000000000000000000000000000000000
--- a/source/api_reference/average.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_average:
-
-=============
-fluid.average
-=============
-
-.. _api_fluid_average_WeightedAverage:
-
-WeightedAverage
----------------
-
-..  autoclass:: paddle.fluid.average.WeightedAverage
-    :members:
-    :noindex:
-
diff --git a/source/api_reference/backward.rst b/source/api_reference/backward.rst
deleted file mode 100644
index 037e96d0c7ac094203ef0d80aa2b497a71304cb6..0000000000000000000000000000000000000000
--- a/source/api_reference/backward.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_backward:
-
-==============
-fluid.backward
-==============
-
-.. _api_fluid_backward_append_backward:
-
-append_backward
----------------
-
-..  autofunction:: paddle.fluid.backward.append_backward
-    :noindex:
-
-.. _api_fluid_backward_calc_gradient:
-
-calc_gradient
--------------
-
-..  autofunction:: paddle.fluid.backward.calc_gradient
-    :noindex:
-
diff --git a/source/api_reference/clip.rst b/source/api_reference/clip.rst
deleted file mode 100644
index 8d5b531db4ce557fbe23892c4489316e1da66e23..0000000000000000000000000000000000000000
--- a/source/api_reference/clip.rst
+++ /dev/null
@@ -1,45 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_clip:
-
-==========
-fluid.clip
-==========
-
-.. _api_fluid_clip_ErrorClipByValue:
-
-ErrorClipByValue
-----------------
-
-..  autoclass:: paddle.fluid.clip.ErrorClipByValue
-    :members:
-    :noindex:
-
-.. _api_fluid_clip_GradientClipByValue:
-
-GradientClipByValue
--------------------
-
-..  autoclass:: paddle.fluid.clip.GradientClipByValue
-    :members:
-    :noindex:
-
-.. _api_fluid_clip_GradientClipByNorm:
-
-GradientClipByNorm
-------------------
-
-..  autoclass:: paddle.fluid.clip.GradientClipByNorm
-    :members:
-    :noindex:
-
-.. _api_fluid_clip_GradientClipByGlobalNorm:
-
-GradientClipByGlobalNorm
-------------------------
-
-..  autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm
-    :members:
-    :noindex:
-
diff --git a/source/api_reference/data b/source/api_reference/data
deleted file mode 120000
index 5aa0dd5cb56f08380cc5a6a23ad057f8c1184fa2..0000000000000000000000000000000000000000
--- a/source/api_reference/data
+++ /dev/null
@@ -1 +0,0 @@
-../../paddle/doc/fluid/api/data
\ No newline at end of file
diff --git a/source/api_reference/data_feeder.rst b/source/api_reference/data_feeder.rst
deleted file mode 100644
index aed1bad8c91fefaee56d5c160ee5b98d6d1ae8cf..0000000000000000000000000000000000000000
--- a/source/api_reference/data_feeder.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_data_feeder:
-
-=================
-fluid.data_feeder
-=================
-
-.. _api_fluid_data_feeder_DataFeeder:
-
-DataFeeder
-----------
-
-..  autoclass:: paddle.fluid.data_feeder.DataFeeder
-    :members:
-    :noindex:
-
diff --git a/source/api_reference/executor.rst b/source/api_reference/executor.rst
deleted file mode 100644
index 0345e5b29d7f02a47ef3eb0d1e7d7535058c8137..0000000000000000000000000000000000000000
--- a/source/api_reference/executor.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_executor:
-
-==============
-fluid.executor
-==============
-
-.. _api_fluid_executor_Executor:
-
-Executor
---------
-
-..  autoclass:: paddle.fluid.executor.Executor
-    :members:
-    :noindex:
-
-.. _api_fluid_executor_global_scope:
-
-global_scope
-------------
-
-..  autofunction:: paddle.fluid.executor.global_scope
-    :noindex:
-
-.. _api_fluid_executor_scope_guard:
-
-scope_guard
------------
-
-..  autofunction:: paddle.fluid.executor.scope_guard
-    :noindex:
-
-.. _api_fluid_executor__switch_scope:
-
-_switch_scope
--------------
-
-..  autofunction:: paddle.fluid.executor._switch_scope
-    :noindex:
-
-.. _api_fluid_executor_fetch_var:
-
-fetch_var
----------
-
-..  autofunction:: paddle.fluid.executor.fetch_var
-    :noindex:
-
diff --git a/source/api_reference/fluid.rst b/source/api_reference/fluid.rst
deleted file mode 100644
index 4b330fdebd6ef82708cbfb0f22bad88bd8651514..0000000000000000000000000000000000000000
--- a/source/api_reference/fluid.rst
+++ /dev/null
@@ -1,380 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid:
-
-=====
-fluid
-=====
-
-.. _api_fluid_Block:
-
-Block
------
-
-..  autoclass:: paddle.fluid.Block
-    :members:
-    :noindex:
-
-.. _api_fluid_Variable:
-
-Variable
---------
-
-..  autoclass:: paddle.fluid.Variable
-    :members:
-    :noindex:
-
-.. _api_fluid_Program:
-
-Program
--------
-
-..  autoclass:: paddle.fluid.Program
-    :members:
-    :noindex:
-
-.. _api_fluid_Operator:
-
-Operator
---------
-
-..  autoclass:: paddle.fluid.Operator
-    :members:
-    :noindex:
-
-.. _api_fluid_default_startup_program:
-
-default_startup_program
------------------------
-
-..  autofunction:: paddle.fluid.default_startup_program
-    :noindex:
-
-.. _api_fluid_default_main_program:
-
-default_main_program
---------------------
-
-..  autofunction:: paddle.fluid.default_main_program
-    :noindex:
-
-.. _api_fluid_program_guard:
-
-program_guard
--------------
-
-..  autofunction:: paddle.fluid.program_guard
-    :noindex:
-
-.. _api_fluid_get_var:
-
-get_var
--------
-
-..  autofunction:: paddle.fluid.get_var
-    :noindex:
-
-.. _api_fluid_Executor:
-
-Executor
---------
-
-..  autoclass:: paddle.fluid.Executor
-    :members:
-    :noindex:
-
-.. _api_fluid_global_scope:
-
-global_scope
-------------
-
-..  autofunction:: paddle.fluid.global_scope
-    :noindex:
-
-.. _api_fluid_scope_guard:
-
-scope_guard
------------
-
-..  autofunction:: paddle.fluid.scope_guard
-    :noindex:
-
-.. _api_fluid__switch_scope:
-
-_switch_scope
--------------
-
-..  autofunction:: paddle.fluid._switch_scope
-    :noindex:
-
-.. _api_fluid_fetch_var:
-
-fetch_var
----------
-
-..  autofunction:: paddle.fluid.fetch_var
-    :noindex:
-
-.. _api_fluid_Go:
-
-Go
---
-
-..  autoclass:: paddle.fluid.Go
-    :members:
-    :noindex:
-
-.. _api_fluid_make_channel:
-
-make_channel
-------------
-
-..  autofunction:: paddle.fluid.make_channel
-    :noindex:
-
-.. _api_fluid_channel_send:
-
-channel_send
-------------
-
-..  autofunction:: paddle.fluid.channel_send
-    :noindex:
-
-.. _api_fluid_channel_recv:
-
-channel_recv
-------------
-
-..  autofunction:: paddle.fluid.channel_recv
-    :noindex:
-
-.. _api_fluid_channel_close:
-
-channel_close
--------------
-
-..  autofunction:: paddle.fluid.channel_close
-    :noindex:
-
-.. _api_fluid_Select:
-
-Select
-------
-
-..  autoclass:: paddle.fluid.Select
-    :members:
-    :noindex:
-
-.. _api_fluid_Trainer:
-
-Trainer
--------
-
-..  autoclass:: paddle.fluid.Trainer
-    :members:
-    :noindex:
-
-.. _api_fluid_BeginEpochEvent:
-
-BeginEpochEvent
----------------
-
-..  autoclass:: paddle.fluid.BeginEpochEvent
-    :members:
-    :noindex:
-
-.. _api_fluid_EndEpochEvent:
-
-EndEpochEvent
--------------
-
-..  autoclass:: paddle.fluid.EndEpochEvent
-    :members:
-    :noindex:
-
-.. _api_fluid_BeginStepEvent:
-
-BeginStepEvent
---------------
-
-..  autoclass:: paddle.fluid.BeginStepEvent
-    :members:
-    :noindex:
-
-.. _api_fluid_EndStepEvent:
-
-EndStepEvent
-------------
-
-..  autoclass:: paddle.fluid.EndStepEvent
-    :members:
-    :noindex:
-
-.. _api_fluid_CheckpointConfig:
-
-CheckpointConfig
-----------------
-
-..  autoclass:: paddle.fluid.CheckpointConfig
-    :members:
-    :noindex:
-
-.. _api_fluid_Inferencer:
-
-Inferencer
-----------
-
-..  autoclass:: paddle.fluid.Inferencer
-    :members:
-    :noindex:
-
-.. _api_fluid_DistributeTranspiler:
-
-DistributeTranspiler
---------------------
-
-..  autoclass:: paddle.fluid.DistributeTranspiler
-    :members:
-    :noindex:
-
-.. _api_fluid_memory_optimize:
-
-memory_optimize
----------------
-
-..  autofunction:: paddle.fluid.memory_optimize
-    :noindex:
-
-.. _api_fluid_release_memory:
-
-release_memory
---------------
-
-..  autofunction:: paddle.fluid.release_memory
-    :noindex:
-
-.. _api_fluid_ParallelExecutor:
-
-ParallelExecutor
-----------------
-
-..  autoclass:: paddle.fluid.ParallelExecutor
-    :members:
-    :noindex:
-
-.. _api_fluid_ExecutionStrategy:
-
-ExecutionStrategy
------------------
-
-..  autoclass:: paddle.fluid.ExecutionStrategy
-    :members:
-    :noindex:
-
-.. _api_fluid_BuildStrategy:
-
-BuildStrategy
--------------
-
-..  autoclass:: paddle.fluid.BuildStrategy
-    :members:
-    :noindex:
-
-.. _api_fluid_create_lod_tensor:
-
-create_lod_tensor
------------------
-
-..  autofunction:: paddle.fluid.create_lod_tensor
-    :noindex:
-
-.. _api_fluid_create_random_int_lodtensor:
-
-create_random_int_lodtensor
----------------------------
-
-..  autofunction:: paddle.fluid.create_random_int_lodtensor
-    :noindex:
-
-.. _api_fluid_LoDTensor:
-
-LoDTensor
----------
-
-..  autoclass:: paddle.fluid.LoDTensor
-    :members:
-    :noindex:
-
-.. _api_fluid_CPUPlace:
-
-CPUPlace
---------
-
-..  autoclass:: paddle.fluid.CPUPlace
-    :members:
-    :noindex:
-
-.. _api_fluid_CUDAPlace:
-
-CUDAPlace
----------
-
-..  autoclass:: paddle.fluid.CUDAPlace
-    :members:
-    :noindex:
-
-.. _api_fluid_CUDAPinnedPlace:
-
-CUDAPinnedPlace
----------------
-
-..  autoclass:: paddle.fluid.CUDAPinnedPlace
-    :members:
-    :noindex:
-
-.. _api_fluid_Tensor:
-
-Tensor
-------
-
-..  autoclass:: paddle.fluid.Tensor
-    :members:
-    :noindex:
-
-.. _api_fluid_ParamAttr:
-
-ParamAttr
----------
-
-..  autoclass:: paddle.fluid.ParamAttr
-    :members:
-    :noindex:
-
-.. _api_fluid_WeightNormParamAttr:
-
-WeightNormParamAttr
--------------------
-
-..  autoclass:: paddle.fluid.WeightNormParamAttr
-    :members:
-    :noindex:
-
-.. _api_fluid_DataFeeder:
-
-DataFeeder
-----------
-
-..  autoclass:: paddle.fluid.DataFeeder
-    :members:
-    :noindex:
-
-.. _api_fluid_Scope:
-
-Scope
------
-
-..  autoclass:: paddle.fluid.Scope
-    :members:
-    :noindex:
-
diff --git a/source/api_reference/gen_doc.py b/source/api_reference/gen_doc.py
deleted file mode 100644
index 011480a4a6d0f512d537eec39ed4cb2201d49eeb..0000000000000000000000000000000000000000
--- a/source/api_reference/gen_doc.py
+++ /dev/null
@@ -1,127 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import argparse
-import sys
-import types
-
-import paddle.fluid as fluid
-
-
-def parse_arg():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--submodules', nargs="*")
-    parser.add_argument(
-        'module', type=str, help='Generate the documentation of which module')
-    return parser.parse_args()
-
-
-class DocGenerator(object):
-    def __init__(self, module_name=None, stream=sys.stdout):
-        if module_name == "":
-            module_name = None
-        self.stream = stream
-        if module_name is None:
-            self.module_name = "fluid"
-        else:
-            self.module_name = "fluid." + module_name
-        if module_name is None:
-            self.module = fluid
-        else:
-            if not hasattr(fluid, module_name):
-                raise ValueError("Cannot find fluid.{0}".format(module_name))
-            else:
-                self.module = getattr(fluid, module_name)
-        self.stream.write('''..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-''')
-        self._print_ref_raw_("_".join(self.module_name.split(".")))
-        self._print_header_(self.module_name, dot='=', is_title=True)
-
-    def print_submodule(self, submodule_name):
-        submodule = getattr(self.module, submodule_name)
-        if submodule is None:
-            raise ValueError("Cannot find submodule {0}".format(submodule_name))
-        self.print_section(submodule_name)
-
-        for item in submodule.__all__:
-            self.print_item(item)
-
-    def print_current_module(self):
-        for item in self.module.__all__:
-            self.print_item(item)
-
-    def print_section(self, name):
-        self._print_header_(name, dot='=', is_title=False)
-
-    def print_item(self, name):
-        item = getattr(self.module, name, None)
-        if item is None:
-           return
-        if isinstance(item, types.TypeType):
-            self.print_class(name)
-        elif isinstance(item, types.FunctionType):
-            self.print_method(name)
-        else:
-            pass
-
-    def print_class(self, name):
-        self._print_ref_(name)
-        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autoclass:: paddle.{0}.{1}
-    :members:
-    :noindex:
-
-'''.format(self.module_name, name))
-
-    def print_method(self, name):
-        self._print_ref_(name)
-        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autofunction:: paddle.{0}.{1}
-    :noindex:
-
-'''.format(self.module_name, name))
-
-    def _print_header_(self, name, dot, is_title):
-        dot_line = dot * len(name)
-        if is_title:
-            self.stream.write(dot_line)
-            self.stream.write('\n')
-        self.stream.write(name)
-        self.stream.write('\n')
-        self.stream.write(dot_line)
-        self.stream.write('\n')
-        self.stream.write('\n')
-
-    def _print_ref_(self, name):
-        self._print_ref_raw_("_".join(self.module_name.split(".") + [name]))
-
-    def _print_ref_raw_(self, anchor):
-        self.stream.write(".. _api_{0}:\n\n".format(anchor))
-
-
-def main():
-    args = parse_arg()
-    gen = DocGenerator(args.module)
-    if args.submodules is None:
-        gen.print_current_module()
-    else:
-        for submodule_name in args.submodules:
-            gen.print_submodule(submodule_name)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/source/api_reference/initializer.rst b/source/api_reference/initializer.rst
deleted file mode 100644
index f3fc2f2a67a914c4ed1584ae0f0127a21ecc1a6c..0000000000000000000000000000000000000000
--- a/source/api_reference/initializer.rst
+++ /dev/null
@@ -1,133 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_initializer:
-
-=================
-fluid.initializer
-=================
-
-.. _api_fluid_initializer_Constant:
-
-Constant
---------
-
-..  autoclass:: paddle.fluid.initializer.Constant
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_Uniform:
-
-Uniform
--------
-
-..  autoclass:: paddle.fluid.initializer.Uniform
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_Normal:
-
-Normal
-------
-
-..  autoclass:: paddle.fluid.initializer.Normal
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_Xavier:
-
-Xavier
-------
-
-..  autoclass:: paddle.fluid.initializer.Xavier
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_Bilinear:
-
-Bilinear
---------
-
-..  autoclass:: paddle.fluid.initializer.Bilinear
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_MSRA:
-
-MSRA
-----
-
-..  autoclass:: paddle.fluid.initializer.MSRA
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_force_init_on_cpu:
-
-force_init_on_cpu
------------------
-
-..  autofunction:: paddle.fluid.initializer.force_init_on_cpu
-    :noindex:
-
-.. _api_fluid_initializer_init_on_cpu:
-
-init_on_cpu
------------
-
-..  autofunction:: paddle.fluid.initializer.init_on_cpu
-    :noindex:
-
-.. _api_fluid_initializer_ConstantInitializer:
-
-ConstantInitializer
--------------------
-
-..  autoclass:: paddle.fluid.initializer.ConstantInitializer
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_UniformInitializer:
-
-UniformInitializer
-------------------
-
-..  autoclass:: paddle.fluid.initializer.UniformInitializer
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_NormalInitializer:
-
-NormalInitializer
------------------
-
-..  autoclass:: paddle.fluid.initializer.NormalInitializer
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_XavierInitializer:
-
-XavierInitializer
------------------
-
-..  autoclass:: paddle.fluid.initializer.XavierInitializer
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_BilinearInitializer:
-
-BilinearInitializer
--------------------
-
-..  autoclass:: paddle.fluid.initializer.BilinearInitializer
-    :members:
-    :noindex:
-
-.. _api_fluid_initializer_MSRAInitializer:
-
-MSRAInitializer
----------------
-
-..  autoclass:: paddle.fluid.initializer.MSRAInitializer
-    :members:
-    :noindex:
-
diff --git a/source/api_reference/io.rst b/source/api_reference/io.rst
deleted file mode 100644
index f89115537c7c6802b5b81b75aaa99f739bd6e23e..0000000000000000000000000000000000000000
--- a/source/api_reference/io.rst
+++ /dev/null
@@ -1,129 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_io:
-
-========
-fluid.io
-========
-
-.. _api_fluid_io_save_vars:
-
-save_vars
----------
-
-..  autofunction:: paddle.fluid.io.save_vars
-    :noindex:
-
-.. _api_fluid_io_save_params:
-
-save_params
------------
-
-..  autofunction:: paddle.fluid.io.save_params
-    :noindex:
-
-.. _api_fluid_io_save_persistables:
-
-save_persistables
------------------
-
-..  autofunction:: paddle.fluid.io.save_persistables
-    :noindex:
-
-.. _api_fluid_io_load_vars:
-
-load_vars
----------
-
-..  autofunction:: paddle.fluid.io.load_vars
-    :noindex:
-
-.. _api_fluid_io_load_params:
-
-load_params
------------
-
-..  autofunction:: paddle.fluid.io.load_params
-    :noindex:
-
-.. _api_fluid_io_load_persistables:
-
-load_persistables
------------------
-
-..  autofunction:: paddle.fluid.io.load_persistables
-    :noindex:
-
-.. _api_fluid_io_save_inference_model:
-
-save_inference_model
---------------------
-
-..  autofunction:: paddle.fluid.io.save_inference_model
-    :noindex:
-
-.. _api_fluid_io_load_inference_model:
-
-load_inference_model
---------------------
-
-..  autofunction:: paddle.fluid.io.load_inference_model
-    :noindex:
-
-.. _api_fluid_io_get_inference_program:
-
-get_inference_program
----------------------
-
-..  autofunction:: paddle.fluid.io.get_inference_program
-    :noindex:
-
-.. _api_fluid_io_save_checkpoint:
-
-save_checkpoint
----------------
-
-..  autofunction:: paddle.fluid.io.save_checkpoint
-    :noindex:
-
-.. _api_fluid_io_load_checkpoint:
-
-load_checkpoint
----------------
-
-..  autofunction:: paddle.fluid.io.load_checkpoint
-    :noindex:
-
-.. _api_fluid_io_clean_checkpoint:
-
-clean_checkpoint
-----------------
-
-..  autofunction:: paddle.fluid.io.clean_checkpoint
-    :noindex:
-
-.. _api_fluid_io_load_persist_vars_without_grad:
-
-load_persist_vars_without_grad
-------------------------------
-
-..  autofunction:: paddle.fluid.io.load_persist_vars_without_grad
-    :noindex:
-
-.. _api_fluid_io_save_persist_vars_without_grad:
-
-save_persist_vars_without_grad
-------------------------------
-
-..  autofunction:: paddle.fluid.io.save_persist_vars_without_grad
-    :noindex:
-
-.. _api_fluid_io_get_latest_checkpoint_serial:
-
-get_latest_checkpoint_serial
-----------------------------
-
-..  autofunction:: paddle.fluid.io.get_latest_checkpoint_serial
-    :noindex:
-
diff --git a/source/api_reference/layers.rst b/source/api_reference/layers.rst
deleted file mode 100644
index 743f97c3eb966e547db1728670282189a48ccb11..0000000000000000000000000000000000000000
--- a/source/api_reference/layers.rst
+++ /dev/null
@@ -1,1764 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_layers:
-
-============
-fluid.layers
-============
-
-control_flow
-============
-
-.. _api_fluid_layers_split_lod_tensor:
-
-split_lod_tensor
-----------------
-
-..  autofunction:: paddle.fluid.layers.split_lod_tensor
-    :noindex:
-
-.. _api_fluid_layers_merge_lod_tensor:
-
-merge_lod_tensor
-----------------
-
-..  autofunction:: paddle.fluid.layers.merge_lod_tensor
-    :noindex:
-
-.. _api_fluid_layers_BlockGuard:
-
-BlockGuard
-----------
-
-..  autoclass:: paddle.fluid.layers.BlockGuard
-    :members:
-    :noindex:
-
-.. _api_fluid_layers_BlockGuardWithCompletion:
-
-BlockGuardWithCompletion
-------------------------
-
-..  autoclass:: paddle.fluid.layers.BlockGuardWithCompletion
-    :members:
-    :noindex:
-
-.. _api_fluid_layers_WhileGuard:
-
-WhileGuard
-----------
-
-..  autoclass:: paddle.fluid.layers.WhileGuard
-    :members:
-    :noindex:
-
-.. _api_fluid_layers_While:
-
-While
------
-
-..  autoclass:: paddle.fluid.layers.While
-    :members:
-    :noindex:
-
-.. _api_fluid_layers_Switch:
-
-Switch
-------
-
-..  autoclass:: paddle.fluid.layers.Switch
-    :members:
-    :noindex:
-
-.. _api_fluid_layers_lod_rank_table:
-
-lod_rank_table
---------------
-
-..  autofunction:: paddle.fluid.layers.lod_rank_table
-    :noindex:
-
-.. _api_fluid_layers_max_sequence_len:
-
-max_sequence_len
-----------------
-
-..  autofunction:: paddle.fluid.layers.max_sequence_len
-    :noindex:
-
-.. _api_fluid_layers_lod_tensor_to_array:
-
-lod_tensor_to_array
--------------------
-
-..  autofunction:: paddle.fluid.layers.lod_tensor_to_array
-    :noindex:
-
-.. _api_fluid_layers_array_to_lod_tensor:
-
-array_to_lod_tensor
--------------------
-
-..  autofunction:: paddle.fluid.layers.array_to_lod_tensor
-    :noindex:
-
-.. _api_fluid_layers_increment:
-
-increment
----------
-
-..  autofunction:: paddle.fluid.layers.increment
-    :noindex:
-
-.. _api_fluid_layers_array_write:
-
-array_write
------------
-
-..  autofunction:: paddle.fluid.layers.array_write
-    :noindex:
-
-.. _api_fluid_layers_create_array:
-
-create_array
-------------
-
-..  autofunction:: paddle.fluid.layers.create_array
-    :noindex:
-
-.. _api_fluid_layers_less_than:
-
-less_than
----------
-
-..  autofunction:: paddle.fluid.layers.less_than
-    :noindex:
-
-.. _api_fluid_layers_equal:
-
-equal
------
-
-..  autofunction:: paddle.fluid.layers.equal
-    :noindex:
-
-.. _api_fluid_layers_array_read:
-
-array_read
-----------
-
-..  autofunction:: paddle.fluid.layers.array_read
-    :noindex:
-
-.. _api_fluid_layers_shrink_memory:
-
-shrink_memory
--------------
-
-..  autofunction:: paddle.fluid.layers.shrink_memory
-    :noindex:
-
-.. _api_fluid_layers_array_length:
-
-array_length
-------------
-
-..  autofunction:: paddle.fluid.layers.array_length
-    :noindex:
-
-.. _api_fluid_layers_IfElse:
-
-IfElse
-------
-
-..  autoclass:: paddle.fluid.layers.IfElse
-    :members:
-    :noindex:
-
-.. _api_fluid_layers_DynamicRNN:
-
-DynamicRNN
-----------
-
-..  autoclass:: paddle.fluid.layers.DynamicRNN
-    :members:
-    :noindex:
-
-.. _api_fluid_layers_ConditionalBlock:
-
-ConditionalBlock
-----------------
-
-..  autoclass:: paddle.fluid.layers.ConditionalBlock
-    :members:
-    :noindex:
-
-.. _api_fluid_layers_StaticRNN:
-
-StaticRNN
----------
-
-..  autoclass:: paddle.fluid.layers.StaticRNN
-    :members:
-    :noindex:
-
-.. _api_fluid_layers_reorder_lod_tensor_by_rank:
-
-reorder_lod_tensor_by_rank
---------------------------
-
-..  autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank
-    :noindex:
-
-.. _api_fluid_layers_ParallelDo:
-
-ParallelDo
-----------
-
-..  autoclass:: paddle.fluid.layers.ParallelDo
-    :members:
-    :noindex:
-
-.. _api_fluid_layers_Print:
-
-Print
------
-
-..  autofunction:: paddle.fluid.layers.Print
-    :noindex:
-
-.. _api_fluid_layers_is_empty:
-
-is_empty
---------
-
-..  autofunction:: paddle.fluid.layers.is_empty
-    :noindex:
-
-device
-======
-
-.. _api_fluid_layers_get_places:
-
-get_places
-----------
-
-..  autofunction:: paddle.fluid.layers.get_places
-    :noindex:
-
-io
-==
-
-.. _api_fluid_layers_data:
-
-data
-----
-
-..  autofunction:: paddle.fluid.layers.data
-    :noindex:
-
-.. _api_fluid_layers_BlockGuardServ:
-
-BlockGuardServ
---------------
-
-..  autoclass:: paddle.fluid.layers.BlockGuardServ
-    :members:
-    :noindex:
-
-.. _api_fluid_layers_ListenAndServ:
-
-ListenAndServ
--------------
-
-..  autoclass:: paddle.fluid.layers.ListenAndServ
-    :members:
-    :noindex:
-
-.. _api_fluid_layers_Send:
-
-Send
-----
-
-..  autofunction:: paddle.fluid.layers.Send
-    :noindex:
-
-.. _api_fluid_layers_Recv:
-
-Recv
-----
-
-..  autofunction:: paddle.fluid.layers.Recv
-    :noindex:
-
-.. _api_fluid_layers_open_recordio_file:
-
-open_recordio_file
-------------------
-
-..  autofunction:: paddle.fluid.layers.open_recordio_file
-    :noindex:
-
-.. _api_fluid_layers_open_files:
-
-open_files
-----------
-
-..  autofunction:: paddle.fluid.layers.open_files
-    :noindex:
-
-.. _api_fluid_layers_read_file:
-
-read_file
----------
-
-..  autofunction:: paddle.fluid.layers.read_file
-    :noindex:
-
-.. _api_fluid_layers_shuffle:
-
-shuffle
--------
-
-..  autofunction:: paddle.fluid.layers.shuffle
-    :noindex:
-
-.. _api_fluid_layers_batch:
-
-batch
------
-
-..  autofunction:: paddle.fluid.layers.batch
-    :noindex:
-
-.. _api_fluid_layers_double_buffer:
-
-double_buffer
--------------
-
-..  autofunction:: paddle.fluid.layers.double_buffer
-    :noindex:
-
-.. _api_fluid_layers_random_data_generator:
-
-random_data_generator
----------------------
-
-..  autofunction:: paddle.fluid.layers.random_data_generator
-    :noindex:
-
-.. _api_fluid_layers_Preprocessor:
-
-Preprocessor
-------------
-
-..  autoclass:: paddle.fluid.layers.Preprocessor
-    :members:
-    :noindex:
-
-.. _api_fluid_layers_load:
-
-load
-----
-
-..  autofunction:: paddle.fluid.layers.load
-    :noindex:
-
-nn
-==
-
-.. _api_fluid_layers_fc:
-
-fc
---
-
-..  autofunction:: paddle.fluid.layers.fc
-    :noindex:
-
-.. _api_fluid_layers_embedding:
-
-embedding
----------
-
-..  autofunction:: paddle.fluid.layers.embedding
-    :noindex:
-
-.. _api_fluid_layers_dynamic_lstm:
-
-dynamic_lstm
-------------
-
-..  autofunction:: paddle.fluid.layers.dynamic_lstm
-    :noindex:
-
-.. _api_fluid_layers_dynamic_lstmp:
-
-dynamic_lstmp
--------------
-
-..  autofunction:: paddle.fluid.layers.dynamic_lstmp
-    :noindex:
-
-.. _api_fluid_layers_dynamic_gru:
-
-dynamic_gru
------------
-
-..  autofunction:: paddle.fluid.layers.dynamic_gru
-    :noindex:
-
-.. _api_fluid_layers_gru_unit:
-
-gru_unit
---------
-
-..  autofunction:: paddle.fluid.layers.gru_unit
-    :noindex:
-
-.. _api_fluid_layers_linear_chain_crf:
-
-linear_chain_crf
-----------------
-
-..  autofunction:: paddle.fluid.layers.linear_chain_crf
-    :noindex:
-
-.. _api_fluid_layers_crf_decoding:
-
-crf_decoding
-------------
-
-..  autofunction:: paddle.fluid.layers.crf_decoding
-    :noindex:
-
-.. _api_fluid_layers_cos_sim:
-
-cos_sim
--------
-
-..  autofunction:: paddle.fluid.layers.cos_sim
-    :noindex:
-
-.. _api_fluid_layers_cross_entropy:
-
-cross_entropy
--------------
-
-..  autofunction:: paddle.fluid.layers.cross_entropy
-    :noindex:
-
-.. _api_fluid_layers_square_error_cost:
-
-square_error_cost
------------------
-
-..  autofunction:: paddle.fluid.layers.square_error_cost
-    :noindex:
-
-.. _api_fluid_layers_chunk_eval:
-
-chunk_eval
-----------
-
-..  autofunction:: paddle.fluid.layers.chunk_eval
-    :noindex:
-
-.. _api_fluid_layers_sequence_conv:
-
-sequence_conv
--------------
-
-..  autofunction:: paddle.fluid.layers.sequence_conv
-    :noindex:
-
-.. _api_fluid_layers_conv2d:
-
-conv2d
-------
-
-..  autofunction:: paddle.fluid.layers.conv2d
-    :noindex:
-
-.. _api_fluid_layers_conv3d:
-
-conv3d
-------
-
-..  autofunction:: paddle.fluid.layers.conv3d
-    :noindex:
-
-.. _api_fluid_layers_sequence_pool:
-
-sequence_pool
--------------
-
-..  autofunction:: paddle.fluid.layers.sequence_pool
-    :noindex:
-
-.. _api_fluid_layers_sequence_softmax:
-
-sequence_softmax
-----------------
-
-..  autofunction:: paddle.fluid.layers.sequence_softmax
-    :noindex:
-
-.. _api_fluid_layers_softmax:
-
-softmax
--------
-
-..  autofunction:: paddle.fluid.layers.softmax
-    :noindex:
-
-.. _api_fluid_layers_pool2d:
-
-pool2d
-------
-
-..  autofunction:: paddle.fluid.layers.pool2d
-    :noindex:
-
-.. _api_fluid_layers_pool3d:
-
-pool3d
-------
-
-..  autofunction:: paddle.fluid.layers.pool3d
-    :noindex:
-
-.. _api_fluid_layers_batch_norm:
-
-batch_norm
-----------
-
-..  autofunction:: paddle.fluid.layers.batch_norm
-    :noindex:
-
-.. _api_fluid_layers_beam_search_decode:
-
-beam_search_decode
-------------------
-
-..  autofunction:: paddle.fluid.layers.beam_search_decode
-    :noindex:
-
-.. _api_fluid_layers_conv2d_transpose:
-
-conv2d_transpose
-----------------
-
-..  autofunction:: paddle.fluid.layers.conv2d_transpose
-    :noindex:
-
-.. _api_fluid_layers_conv3d_transpose:
-
-conv3d_transpose
-----------------
-
-..  autofunction:: paddle.fluid.layers.conv3d_transpose
-    :noindex:
-
-.. _api_fluid_layers_sequence_expand:
-
-sequence_expand
----------------
-
-..  autofunction:: paddle.fluid.layers.sequence_expand
-    :noindex:
-
-.. _api_fluid_layers_lstm_unit:
-
-lstm_unit
----------
-
-..  autofunction:: paddle.fluid.layers.lstm_unit
-    :noindex:
-
-.. _api_fluid_layers_reduce_sum:
-
-reduce_sum
-----------
-
-..  autofunction:: paddle.fluid.layers.reduce_sum
-    :noindex:
-
-.. _api_fluid_layers_reduce_mean:
-
-reduce_mean
------------
-
-..  autofunction:: paddle.fluid.layers.reduce_mean
-    :noindex:
-
-.. _api_fluid_layers_reduce_max:
-
-reduce_max
-----------
-
-..  autofunction:: paddle.fluid.layers.reduce_max
-    :noindex:
-
-.. _api_fluid_layers_reduce_min:
-
-reduce_min
-----------
-
-..  autofunction:: paddle.fluid.layers.reduce_min
-    :noindex:
-
-.. _api_fluid_layers_reduce_prod:
-
-reduce_prod
------------
-
-..  autofunction:: paddle.fluid.layers.reduce_prod
-    :noindex:
-
-.. _api_fluid_layers_sequence_first_step:
-
-sequence_first_step
--------------------
-
-..  autofunction:: paddle.fluid.layers.sequence_first_step
-    :noindex:
-
-.. _api_fluid_layers_sequence_last_step:
-
-sequence_last_step
-------------------
-
-..  autofunction:: paddle.fluid.layers.sequence_last_step
-    :noindex:
-
-.. _api_fluid_layers_dropout:
-
-dropout
--------
-
-..  autofunction:: paddle.fluid.layers.dropout
-    :noindex:
-
-.. _api_fluid_layers_split:
-
-split
------
-
-..  autofunction:: paddle.fluid.layers.split
-    :noindex:
-
-.. _api_fluid_layers_ctc_greedy_decoder:
-
-ctc_greedy_decoder
-------------------
-
-..  autofunction:: paddle.fluid.layers.ctc_greedy_decoder
-    :noindex:
-
-.. _api_fluid_layers_edit_distance:
-
-edit_distance
--------------
-
-..  autofunction:: paddle.fluid.layers.edit_distance
-    :noindex:
-
-.. _api_fluid_layers_l2_normalize:
-
-l2_normalize
-------------
-
-..  autofunction:: paddle.fluid.layers.l2_normalize
-    :noindex:
-
-.. _api_fluid_layers_matmul:
-
-matmul
-------
-
-..  autofunction:: paddle.fluid.layers.matmul
-    :noindex:
-
-.. _api_fluid_layers_topk:
-
-topk
-----
-
-..  autofunction:: paddle.fluid.layers.topk
-    :noindex:
-
-.. _api_fluid_layers_warpctc:
-
-warpctc
--------
-
-..  autofunction:: paddle.fluid.layers.warpctc
-    :noindex:
-
-.. _api_fluid_layers_sequence_reshape:
-
-sequence_reshape
-----------------
-
-..  autofunction:: paddle.fluid.layers.sequence_reshape
-    :noindex:
-
-.. _api_fluid_layers_transpose:
-
-transpose
----------
-
-..  autofunction:: paddle.fluid.layers.transpose
-    :noindex:
-
-.. _api_fluid_layers_im2sequence:
-
-im2sequence
------------
-
-..  autofunction:: paddle.fluid.layers.im2sequence
-    :noindex:
-
-.. _api_fluid_layers_nce:
-
-nce
----
-
-..  autofunction:: paddle.fluid.layers.nce
-    :noindex:
-
-.. _api_fluid_layers_beam_search:
-
-beam_search
------------
-
-..  autofunction:: paddle.fluid.layers.beam_search
-    :noindex:
-
-.. _api_fluid_layers_row_conv:
-
-row_conv
---------
-
-..  autofunction:: paddle.fluid.layers.row_conv
-    :noindex:
-
-.. _api_fluid_layers_multiplex:
-
-multiplex
----------
-
-..  autofunction:: paddle.fluid.layers.multiplex
-    :noindex:
-
-.. _api_fluid_layers_layer_norm:
-
-layer_norm
-----------
-
-..  autofunction:: paddle.fluid.layers.layer_norm
-    :noindex:
-
-.. _api_fluid_layers_softmax_with_cross_entropy:
-
-softmax_with_cross_entropy
---------------------------
-
-..  autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
-    :noindex:
-
-.. _api_fluid_layers_smooth_l1:
-
-smooth_l1
----------
-
-..  autofunction:: paddle.fluid.layers.smooth_l1
-    :noindex:
-
-.. _api_fluid_layers_one_hot:
-
-one_hot
--------
-
-..  autofunction:: paddle.fluid.layers.one_hot
-    :noindex:
-
-.. _api_fluid_layers_autoincreased_step_counter:
-
-autoincreased_step_counter
---------------------------
-
-..  autofunction:: paddle.fluid.layers.autoincreased_step_counter
-    :noindex:
-
-.. _api_fluid_layers_reshape:
-
-reshape
--------
-
-..  autofunction:: paddle.fluid.layers.reshape
-    :noindex:
-
-.. _api_fluid_layers_lod_reset:
-
-lod_reset
----------
-
-..  autofunction:: paddle.fluid.layers.lod_reset
-    :noindex:
-
-.. _api_fluid_layers_lrn:
-
-lrn
----
-
-..  autofunction:: paddle.fluid.layers.lrn
-    :noindex:
-
-.. _api_fluid_layers_pad:
-
-pad
----
-
-..  autofunction:: paddle.fluid.layers.pad
-    :noindex:
-
-.. _api_fluid_layers_label_smooth:
-
-label_smooth
-------------
-
-..  autofunction:: paddle.fluid.layers.label_smooth
-    :noindex:
-
-.. _api_fluid_layers_roi_pool:
-
-roi_pool
---------
-
-..  autofunction:: paddle.fluid.layers.roi_pool
-    :noindex:
-
-.. _api_fluid_layers_dice_loss:
-
-dice_loss
----------
-
-..  autofunction:: paddle.fluid.layers.dice_loss
-    :noindex:
-
-.. _api_fluid_layers_image_resize:
-
-image_resize
-------------
-
-..  autofunction:: paddle.fluid.layers.image_resize
-    :noindex:
-
-.. _api_fluid_layers_image_resize_short:
-
-image_resize_short
-------------------
-
-..  autofunction:: paddle.fluid.layers.image_resize_short
-    :noindex:
-
-.. _api_fluid_layers_resize_bilinear:
-
-resize_bilinear
----------------
-
-..  autofunction:: paddle.fluid.layers.resize_bilinear
-    :noindex:
-
-.. _api_fluid_layers_gather:
-
-gather
-------
-
-..  autofunction:: paddle.fluid.layers.gather
-    :noindex:
-
-.. _api_fluid_layers_random_crop:
-
-random_crop
------------
-
-..  autofunction:: paddle.fluid.layers.random_crop
-    :noindex:
-
-.. _api_fluid_layers_mean_iou:
-
-mean_iou
---------
-
-..  autofunction:: paddle.fluid.layers.mean_iou
-    :noindex:
-
-.. _api_fluid_layers_relu:
-
-relu
-----
-
-..  autofunction:: paddle.fluid.layers.relu
-    :noindex:
-
-.. _api_fluid_layers_log:
-
-log
----
-
-..  autofunction:: paddle.fluid.layers.log
-    :noindex:
-
-.. _api_fluid_layers_crop:
-
-crop
-----
-
-..  autofunction:: paddle.fluid.layers.crop
-    :noindex:
-
-ops
-===
-
-.. _api_fluid_layers_mean:
-
-mean
-----
-
-..  autofunction:: paddle.fluid.layers.mean
-    :noindex:
-
-.. _api_fluid_layers_mul:
-
-mul
----
-
-..  autofunction:: paddle.fluid.layers.mul
-    :noindex:
-
-.. _api_fluid_layers_scale:
-
-scale
------
-
-..  autofunction:: paddle.fluid.layers.scale
-    :noindex:
-
-.. _api_fluid_layers_sigmoid_cross_entropy_with_logits:
-
-sigmoid_cross_entropy_with_logits
----------------------------------
-
-..  autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
-    :noindex:
-
-.. _api_fluid_layers_elementwise_add:
-
-elementwise_add
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_add
-    :noindex:
-
-.. _api_fluid_layers_elementwise_div:
-
-elementwise_div
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_div
-    :noindex:
-
-.. _api_fluid_layers_elementwise_sub:
-
-elementwise_sub
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_sub
-    :noindex:
-
-.. _api_fluid_layers_elementwise_mul:
-
-elementwise_mul
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_mul
-    :noindex:
-
-.. _api_fluid_layers_elementwise_max:
-
-elementwise_max
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_max
-    :noindex:
-
-.. _api_fluid_layers_elementwise_min:
-
-elementwise_min
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_min
-    :noindex:
-
-.. _api_fluid_layers_elementwise_pow:
-
-elementwise_pow
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_pow
-    :noindex:
-
-.. _api_fluid_layers_clip:
-
-clip
-----
-
-..  autofunction:: paddle.fluid.layers.clip
-    :noindex:
-
-.. _api_fluid_layers_clip_by_norm:
-
-clip_by_norm
-------------
-
-..  autofunction:: paddle.fluid.layers.clip_by_norm
-    :noindex:
-
-.. _api_fluid_layers_logical_and:
-
-logical_and
------------
-
-..  autofunction:: paddle.fluid.layers.logical_and
-    :noindex:
-
-.. _api_fluid_layers_logical_or:
-
-logical_or
-----------
-
-..  autofunction:: paddle.fluid.layers.logical_or
-    :noindex:
-
-.. _api_fluid_layers_logical_xor:
-
-logical_xor
------------
-
-..  autofunction:: paddle.fluid.layers.logical_xor
-    :noindex:
-
-.. _api_fluid_layers_logical_not:
-
-logical_not
------------
-
-..  autofunction:: paddle.fluid.layers.logical_not
-    :noindex:
-
-.. _api_fluid_layers_uniform_random_batch_size_like:
-
-uniform_random_batch_size_like
-------------------------------
-
-..  autofunction:: paddle.fluid.layers.uniform_random_batch_size_like
-    :noindex:
-
-.. _api_fluid_layers_gaussian_random:
-
-gaussian_random
----------------
-
-..  autofunction:: paddle.fluid.layers.gaussian_random
-    :noindex:
-
-.. _api_fluid_layers_gaussian_random_batch_size_like:
-
-gaussian_random_batch_size_like
--------------------------------
-
-..  autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
-    :noindex:
-
-.. _api_fluid_layers_scatter:
-
-scatter
--------
-
-..  autofunction:: paddle.fluid.layers.scatter
-    :noindex:
-
-.. _api_fluid_layers_sum:
-
-sum
----
-
-..  autofunction:: paddle.fluid.layers.sum
-    :noindex:
-
-.. _api_fluid_layers_slice:
-
-slice
------
-
-..  autofunction:: paddle.fluid.layers.slice
-    :noindex:
-
-.. _api_fluid_layers_polygon_box_transform:
-
-polygon_box_transform
----------------------
-
-..  autofunction:: paddle.fluid.layers.polygon_box_transform
-    :noindex:
-
-.. _api_fluid_layers_shape:
-
-shape
------
-
-..  autofunction:: paddle.fluid.layers.shape
-    :noindex:
-
-.. _api_fluid_layers_iou_similarity:
-
-iou_similarity
---------------
-
-..  autofunction:: paddle.fluid.layers.iou_similarity
-    :noindex:
-
-.. _api_fluid_layers_maxout:
-
-maxout
-------
-
-..  autofunction:: paddle.fluid.layers.maxout
-    :noindex:
-
-.. _api_fluid_layers_sigmoid:
-
-sigmoid
--------
-
-..  autofunction:: paddle.fluid.layers.sigmoid
-    :noindex:
-
-.. _api_fluid_layers_logsigmoid:
-
-logsigmoid
-----------
-
-..  autofunction:: paddle.fluid.layers.logsigmoid
-    :noindex:
-
-.. _api_fluid_layers_exp:
-
-exp
----
-
-..  autofunction:: paddle.fluid.layers.exp
-    :noindex:
-
-.. _api_fluid_layers_tanh:
-
-tanh
-----
-
-..  autofunction:: paddle.fluid.layers.tanh
-    :noindex:
-
-.. _api_fluid_layers_tanh_shrink:
-
-tanh_shrink
------------
-
-..  autofunction:: paddle.fluid.layers.tanh_shrink
-    :noindex:
-
-.. _api_fluid_layers_softshrink:
-
-softshrink
-----------
-
-..  autofunction:: paddle.fluid.layers.softshrink
-    :noindex:
-
-.. _api_fluid_layers_sqrt:
-
-sqrt
-----
-
-..  autofunction:: paddle.fluid.layers.sqrt
-    :noindex:
-
-.. _api_fluid_layers_abs:
-
-abs
----
-
-..  autofunction:: paddle.fluid.layers.abs
-    :noindex:
-
-.. _api_fluid_layers_ceil:
-
-ceil
-----
-
-..  autofunction:: paddle.fluid.layers.ceil
-    :noindex:
-
-.. _api_fluid_layers_floor:
-
-floor
------
-
-..  autofunction:: paddle.fluid.layers.floor
-    :noindex:
-
-.. _api_fluid_layers_cos:
-
-cos
----
-
-..  autofunction:: paddle.fluid.layers.cos
-    :noindex:
-
-.. _api_fluid_layers_sin:
-
-sin
----
-
-..  autofunction:: paddle.fluid.layers.sin
-    :noindex:
-
-.. _api_fluid_layers_round:
-
-round
------
-
-..  autofunction:: paddle.fluid.layers.round
-    :noindex:
-
-.. _api_fluid_layers_reciprocal:
-
-reciprocal
-----------
-
-..  autofunction:: paddle.fluid.layers.reciprocal
-    :noindex:
-
-.. _api_fluid_layers_square:
-
-square
-------
-
-..  autofunction:: paddle.fluid.layers.square
-    :noindex:
-
-.. _api_fluid_layers_softplus:
-
-softplus
---------
-
-..  autofunction:: paddle.fluid.layers.softplus
-    :noindex:
-
-.. _api_fluid_layers_softsign:
-
-softsign
---------
-
-..  autofunction:: paddle.fluid.layers.softsign
-    :noindex:
-
-.. _api_fluid_layers_brelu:
-
-brelu
------
-
-..  autofunction:: paddle.fluid.layers.brelu
-    :noindex:
-
-.. _api_fluid_layers_leaky_relu:
-
-leaky_relu
-----------
-
-..  autofunction:: paddle.fluid.layers.leaky_relu
-    :noindex:
-
-.. _api_fluid_layers_soft_relu:
-
-soft_relu
----------
-
-..  autofunction:: paddle.fluid.layers.soft_relu
-    :noindex:
-
-.. _api_fluid_layers_elu:
-
-elu
----
-
-..  autofunction:: paddle.fluid.layers.elu
-    :noindex:
-
-.. _api_fluid_layers_relu6:
-
-relu6
------
-
-..  autofunction:: paddle.fluid.layers.relu6
-    :noindex:
-
-.. _api_fluid_layers_pow:
-
-pow
----
-
-..  autofunction:: paddle.fluid.layers.pow
-    :noindex:
-
-.. _api_fluid_layers_stanh:
-
-stanh
------
-
-..  autofunction:: paddle.fluid.layers.stanh
-    :noindex:
-
-.. _api_fluid_layers_hard_sigmoid:
-
-hard_sigmoid
-------------
-
-..  autofunction:: paddle.fluid.layers.hard_sigmoid
-    :noindex:
-
-.. _api_fluid_layers_swish:
-
-swish
------
-
-..  autofunction:: paddle.fluid.layers.swish
-    :noindex:
-
-.. _api_fluid_layers_uniform_random:
-
-uniform_random
---------------
-
-..  autofunction:: paddle.fluid.layers.uniform_random
-    :noindex:
-
-.. _api_fluid_layers_hard_shrink:
-
-hard_shrink
------------
-
-..  autofunction:: paddle.fluid.layers.hard_shrink
-    :noindex:
-
-.. _api_fluid_layers_cumsum:
-
-cumsum
-------
-
-..  autofunction:: paddle.fluid.layers.cumsum
-    :noindex:
-
-.. _api_fluid_layers_thresholded_relu:
-
-thresholded_relu
-----------------
-
-..  autofunction:: paddle.fluid.layers.thresholded_relu
-    :noindex:
-
-tensor
-======
-
-.. _api_fluid_layers_create_tensor:
-
-create_tensor
--------------
-
-..  autofunction:: paddle.fluid.layers.create_tensor
-    :noindex:
-
-.. _api_fluid_layers_create_parameter:
-
-create_parameter
-----------------
-
-..  autofunction:: paddle.fluid.layers.create_parameter
-    :noindex:
-
-.. _api_fluid_layers_create_global_var:
-
-create_global_var
------------------
-
-..  autofunction:: paddle.fluid.layers.create_global_var
-    :noindex:
-
-.. _api_fluid_layers_cast:
-
-cast
-----
-
-..  autofunction:: paddle.fluid.layers.cast
-    :noindex:
-
-.. _api_fluid_layers_concat:
-
-concat
-------
-
-..  autofunction:: paddle.fluid.layers.concat
-    :noindex:
-
-.. _api_fluid_layers_sums:
-
-sums
-----
-
-..  autofunction:: paddle.fluid.layers.sums
-    :noindex:
-
-.. _api_fluid_layers_assign:
-
-assign
-------
-
-..  autofunction:: paddle.fluid.layers.assign
-    :noindex:
-
-.. _api_fluid_layers_fill_constant_batch_size_like:
-
-fill_constant_batch_size_like
------------------------------
-
-..  autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
-    :noindex:
-
-.. _api_fluid_layers_fill_constant:
-
-fill_constant
--------------
-
-..  autofunction:: paddle.fluid.layers.fill_constant
-    :noindex:
-
-.. _api_fluid_layers_argmin:
-
-argmin
-------
-
-..  autofunction:: paddle.fluid.layers.argmin
-    :noindex:
-
-.. _api_fluid_layers_argmax:
-
-argmax
-------
-
-..  autofunction:: paddle.fluid.layers.argmax
-    :noindex:
-
-.. _api_fluid_layers_ones:
-
-ones
-----
-
-..  autofunction:: paddle.fluid.layers.ones
-    :noindex:
-
-.. _api_fluid_layers_zeros:
-
-zeros
------
-
-..  autofunction:: paddle.fluid.layers.zeros
-    :noindex:
-
-.. _api_fluid_layers_reverse:
-
-reverse
--------
-
-..  autofunction:: paddle.fluid.layers.reverse
-    :noindex:
-
-learning_rate_scheduler
-=======================
-
-.. _api_fluid_layers_exponential_decay:
-
-exponential_decay
------------------
-
-..  autofunction:: paddle.fluid.layers.exponential_decay
-    :noindex:
-
-.. _api_fluid_layers_natural_exp_decay:
-
-natural_exp_decay
------------------
-
-..  autofunction:: paddle.fluid.layers.natural_exp_decay
-    :noindex:
-
-.. _api_fluid_layers_inverse_time_decay:
-
-inverse_time_decay
-------------------
-
-..  autofunction:: paddle.fluid.layers.inverse_time_decay
-    :noindex:
-
-.. _api_fluid_layers_polynomial_decay:
-
-polynomial_decay
-----------------
-
-..  autofunction:: paddle.fluid.layers.polynomial_decay
-    :noindex:
-
-.. _api_fluid_layers_piecewise_decay:
-
-piecewise_decay
----------------
-
-..  autofunction:: paddle.fluid.layers.piecewise_decay
-    :noindex:
-
-.. _api_fluid_layers_noam_decay:
-
-noam_decay
-----------
-
-..  autofunction:: paddle.fluid.layers.noam_decay
-    :noindex:
-
-.. _api_fluid_layers_append_LARS:
-
-append_LARS
------------
-
-..  autofunction:: paddle.fluid.layers.append_LARS
-    :noindex:
-
-detection
-=========
-
-.. _api_fluid_layers_prior_box:
-
-prior_box
----------
-
-..  autofunction:: paddle.fluid.layers.prior_box
-    :noindex:
-
-.. _api_fluid_layers_multi_box_head:
-
-multi_box_head
---------------
-
-..  autofunction:: paddle.fluid.layers.multi_box_head
-    :noindex:
-
-.. _api_fluid_layers_bipartite_match:
-
-bipartite_match
----------------
-
-..  autofunction:: paddle.fluid.layers.bipartite_match
-    :noindex:
-
-.. _api_fluid_layers_target_assign:
-
-target_assign
--------------
-
-..  autofunction:: paddle.fluid.layers.target_assign
-    :noindex:
-
-.. _api_fluid_layers_detection_output:
-
-detection_output
-----------------
-
-..  autofunction:: paddle.fluid.layers.detection_output
-    :noindex:
-
-.. _api_fluid_layers_ssd_loss:
-
-ssd_loss
---------
-
-..  autofunction:: paddle.fluid.layers.ssd_loss
-    :noindex:
-
-.. _api_fluid_layers_detection_map:
-
-detection_map
--------------
-
-..  autofunction:: paddle.fluid.layers.detection_map
-    :noindex:
-
-.. _api_fluid_layers_iou_similarity:
-
-iou_similarity
---------------
-
-..  autofunction:: paddle.fluid.layers.iou_similarity
-    :noindex:
-
-.. _api_fluid_layers_box_coder:
-
-box_coder
----------
-
-..  autofunction:: paddle.fluid.layers.box_coder
-    :noindex:
-
-metric_op
-=========
-
-.. _api_fluid_layers_accuracy:
-
-accuracy
---------
-
-..  autofunction:: paddle.fluid.layers.accuracy
-    :noindex:
-
-.. _api_fluid_layers_auc:
-
-auc
----
-
-..  autofunction:: paddle.fluid.layers.auc
-    :noindex:
-
-tensor
-======
-
-.. _api_fluid_layers_create_tensor:
-
-create_tensor
--------------
-
-..  autofunction:: paddle.fluid.layers.create_tensor
-    :noindex:
-
-.. _api_fluid_layers_create_parameter:
-
-create_parameter
-----------------
-
-..  autofunction:: paddle.fluid.layers.create_parameter
-    :noindex:
-
-.. _api_fluid_layers_create_global_var:
-
-create_global_var
------------------
-
-..  autofunction:: paddle.fluid.layers.create_global_var
-    :noindex:
-
-.. _api_fluid_layers_cast:
-
-cast
-----
-
-..  autofunction:: paddle.fluid.layers.cast
-    :noindex:
-
-.. _api_fluid_layers_concat:
-
-concat
-------
-
-..  autofunction:: paddle.fluid.layers.concat
-    :noindex:
-
-.. _api_fluid_layers_sums:
-
-sums
-----
-
-..  autofunction:: paddle.fluid.layers.sums
-    :noindex:
-
-.. _api_fluid_layers_assign:
-
-assign
-------
-
-..  autofunction:: paddle.fluid.layers.assign
-    :noindex:
-
-.. _api_fluid_layers_fill_constant_batch_size_like:
-
-fill_constant_batch_size_like
------------------------------
-
-..  autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
-    :noindex:
-
-.. _api_fluid_layers_fill_constant:
-
-fill_constant
--------------
-
-..  autofunction:: paddle.fluid.layers.fill_constant
-    :noindex:
-
-.. _api_fluid_layers_argmin:
-
-argmin
-------
-
-..  autofunction:: paddle.fluid.layers.argmin
-    :noindex:
-
-.. _api_fluid_layers_argmax:
-
-argmax
-------
-
-..  autofunction:: paddle.fluid.layers.argmax
-    :noindex:
-
-.. _api_fluid_layers_ones:
-
-ones
-----
-
-..  autofunction:: paddle.fluid.layers.ones
-    :noindex:
-
-.. _api_fluid_layers_zeros:
-
-zeros
------
-
-..  autofunction:: paddle.fluid.layers.zeros
-    :noindex:
-
-.. _api_fluid_layers_reverse:
-
-reverse
--------
-
-..  autofunction:: paddle.fluid.layers.reverse
-    :noindex:
-
diff --git a/source/api_reference/metrics.rst b/source/api_reference/metrics.rst
deleted file mode 100644
index 5ac2416bc3a0cec0ddc0414e36035e45a8857446..0000000000000000000000000000000000000000
--- a/source/api_reference/metrics.rst
+++ /dev/null
@@ -1,90 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_metrics:
-
-=============
-fluid.metrics
-=============
-
-.. _api_fluid_metrics_MetricBase:
-
-MetricBase
-----------
-
-..  autoclass:: paddle.fluid.metrics.MetricBase
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_CompositeMetric:
-
-CompositeMetric
----------------
-
-..  autoclass:: paddle.fluid.metrics.CompositeMetric
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_Precision:
-
-Precision
----------
-
-..  autoclass:: paddle.fluid.metrics.Precision
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_Recall:
-
-Recall
-------
-
-..  autoclass:: paddle.fluid.metrics.Recall
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_Accuracy:
-
-Accuracy
---------
-
-..  autoclass:: paddle.fluid.metrics.Accuracy
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_ChunkEvaluator:
-
-ChunkEvaluator
---------------
-
-..  autoclass:: paddle.fluid.metrics.ChunkEvaluator
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_EditDistance:
-
-EditDistance
-------------
-
-..  autoclass:: paddle.fluid.metrics.EditDistance
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_DetectionMAP:
-
-DetectionMAP
-------------
-
-..  autoclass:: paddle.fluid.metrics.DetectionMAP
-    :members:
-    :noindex:
-
-.. _api_fluid_metrics_Auc:
-
-Auc
----
-
-..  autoclass:: paddle.fluid.metrics.Auc
-    :members:
-    :noindex:
-
diff --git a/source/api_reference/nets.rst b/source/api_reference/nets.rst
deleted file mode 100644
index 8872f9da38186ff6394c594f042e5c756a5febce..0000000000000000000000000000000000000000
--- a/source/api_reference/nets.rst
+++ /dev/null
@@ -1,41 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_nets:
-
-==========
-fluid.nets
-==========
-
-.. _api_fluid_nets_simple_img_conv_pool:
-
-simple_img_conv_pool
---------------------
-
-..  autofunction:: paddle.fluid.nets.simple_img_conv_pool
-    :noindex:
-
-.. _api_fluid_nets_sequence_conv_pool:
-
-sequence_conv_pool
-------------------
-
-..  autofunction:: paddle.fluid.nets.sequence_conv_pool
-    :noindex:
-
-.. _api_fluid_nets_glu:
-
-glu
----
-
-..  autofunction:: paddle.fluid.nets.glu
-    :noindex:
-
-.. _api_fluid_nets_scaled_dot_product_attention:
-
-scaled_dot_product_attention
-----------------------------
-
-..  autofunction:: paddle.fluid.nets.scaled_dot_product_attention
-    :noindex:
-
diff --git a/source/api_reference/optimizer.rst b/source/api_reference/optimizer.rst
deleted file mode 100644
index 234ce23b7f3816b17a595ff3d084db7c5a7964de..0000000000000000000000000000000000000000
--- a/source/api_reference/optimizer.rst
+++ /dev/null
@@ -1,180 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_optimizer:
-
-===============
-fluid.optimizer
-===============
-
-.. _api_fluid_optimizer_SGD:
-
-SGD
----
-
-..  autoclass:: paddle.fluid.optimizer.SGD
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Momentum:
-
-Momentum
---------
-
-..  autoclass:: paddle.fluid.optimizer.Momentum
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Adagrad:
-
-Adagrad
--------
-
-..  autoclass:: paddle.fluid.optimizer.Adagrad
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Adam:
-
-Adam
-----
-
-..  autoclass:: paddle.fluid.optimizer.Adam
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Adamax:
-
-Adamax
-------
-
-..  autoclass:: paddle.fluid.optimizer.Adamax
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_DecayedAdagrad:
-
-DecayedAdagrad
---------------
-
-..  autoclass:: paddle.fluid.optimizer.DecayedAdagrad
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Ftrl:
-
-Ftrl
-----
-
-..  autoclass:: paddle.fluid.optimizer.Ftrl
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_SGDOptimizer:
-
-SGDOptimizer
-------------
-
-..  autoclass:: paddle.fluid.optimizer.SGDOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_MomentumOptimizer:
-
-MomentumOptimizer
------------------
-
-..  autoclass:: paddle.fluid.optimizer.MomentumOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_AdagradOptimizer:
-
-AdagradOptimizer
-----------------
-
-..  autoclass:: paddle.fluid.optimizer.AdagradOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_AdamOptimizer:
-
-AdamOptimizer
--------------
-
-..  autoclass:: paddle.fluid.optimizer.AdamOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_AdamaxOptimizer:
-
-AdamaxOptimizer
----------------
-
-..  autoclass:: paddle.fluid.optimizer.AdamaxOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_DecayedAdagradOptimizer:
-
-DecayedAdagradOptimizer
------------------------
-
-..  autoclass:: paddle.fluid.optimizer.DecayedAdagradOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_RMSPropOptimizer:
-
-RMSPropOptimizer
-----------------
-
-..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_FtrlOptimizer:
-
-FtrlOptimizer
--------------
-
-..  autoclass:: paddle.fluid.optimizer.FtrlOptimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Adadelta:
-
-Adadelta
---------
-
-..  autoclass:: paddle.fluid.optimizer.Adadelta
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_ModelAverage:
-
-ModelAverage
-------------
-
-..  autoclass:: paddle.fluid.optimizer.ModelAverage
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_Optimizer:
-
-Optimizer
----------
-
-..  autoclass:: paddle.fluid.optimizer.Optimizer
-    :members:
-    :noindex:
-
-.. _api_fluid_optimizer_RMSPropOptimizer:
-
-RMSPropOptimizer
-----------------
-
-..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
-    :members:
-    :noindex:
-
diff --git a/source/api_reference/param_attr.rst b/source/api_reference/param_attr.rst
deleted file mode 100644
index 25a030fe966ba28942bc9ae44bc22c021e9e64c4..0000000000000000000000000000000000000000
--- a/source/api_reference/param_attr.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_param_attr:
-
-================
-fluid.param_attr
-================
-
-.. _api_fluid_param_attr_ParamAttr:
-
-ParamAttr
----------
-
-..  autoclass:: paddle.fluid.param_attr.ParamAttr
-    :members:
-    :noindex:
-
-.. _api_fluid_param_attr_WeightNormParamAttr:
-
-WeightNormParamAttr
--------------------
-
-..  autoclass:: paddle.fluid.param_attr.WeightNormParamAttr
-    :members:
-    :noindex:
-
diff --git a/source/api_reference/profiler.rst b/source/api_reference/profiler.rst
deleted file mode 100644
index 30e2a4f43e811a7b0237b55a194d47b2f7b4cdd3..0000000000000000000000000000000000000000
--- a/source/api_reference/profiler.rst
+++ /dev/null
@@ -1,49 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_profiler:
-
-==============
-fluid.profiler
-==============
-
-.. _api_fluid_profiler_cuda_profiler:
-
-cuda_profiler
--------------
-
-..  autofunction:: paddle.fluid.profiler.cuda_profiler
-    :noindex:
-
-.. _api_fluid_profiler_reset_profiler:
-
-reset_profiler
---------------
-
-..  autofunction:: paddle.fluid.profiler.reset_profiler
-    :noindex:
-
-.. _api_fluid_profiler_profiler:
-
-profiler
---------
-
-..  autofunction:: paddle.fluid.profiler.profiler
-    :noindex:
-
-.. _api_fluid_profiler_start_profiler:
-
-start_profiler
---------------
-
-..  autofunction:: paddle.fluid.profiler.start_profiler
-    :noindex:
-
-.. _api_fluid_profiler_stop_profiler:
-
-stop_profiler
--------------
-
-..  autofunction:: paddle.fluid.profiler.stop_profiler
-    :noindex:
-
diff --git a/source/api_reference/recordio_writer.rst b/source/api_reference/recordio_writer.rst
deleted file mode 100644
index 32440b349d64466a4d299c6956ecb4d0a40305a1..0000000000000000000000000000000000000000
--- a/source/api_reference/recordio_writer.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_recordio_writer:
-
-=====================
-fluid.recordio_writer
-=====================
-
-.. _api_fluid_recordio_writer_convert_reader_to_recordio_file:
-
-convert_reader_to_recordio_file
--------------------------------
-
-..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
-    :noindex:
-
-.. _api_fluid_recordio_writer_convert_reader_to_recordio_files:
-
-convert_reader_to_recordio_files
---------------------------------
-
-..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
-    :noindex:
-
diff --git a/source/api_reference/regularizer.rst b/source/api_reference/regularizer.rst
deleted file mode 100644
index 6bdfbe6dc49bf1dce7488c6f193b71b8ddc3f0fc..0000000000000000000000000000000000000000
--- a/source/api_reference/regularizer.rst
+++ /dev/null
@@ -1,53 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_regularizer:
-
-=================
-fluid.regularizer
-=================
-
-.. _api_fluid_regularizer_append_regularization_ops:
-
-append_regularization_ops
--------------------------
-
-..  autofunction:: paddle.fluid.regularizer.append_regularization_ops
-    :noindex:
-
-.. _api_fluid_regularizer_L1Decay:
-
-L1Decay
--------
-
-..  autoclass:: paddle.fluid.regularizer.L1Decay
-    :members:
-    :noindex:
-
-.. _api_fluid_regularizer_L2Decay:
-
-L2Decay
--------
-
-..  autoclass:: paddle.fluid.regularizer.L2Decay
-    :members:
-    :noindex:
-
-.. _api_fluid_regularizer_L1DecayRegularizer:
-
-L1DecayRegularizer
-------------------
-
-..  autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
-    :members:
-    :noindex:
-
-.. _api_fluid_regularizer_L2DecayRegularizer:
-
-L2DecayRegularizer
-------------------
-
-..  autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
-    :members:
-    :noindex:
-
diff --git a/source/api_reference/transpiler.rst b/source/api_reference/transpiler.rst
deleted file mode 100644
index 9c9b3dec91b6cdfa3c57d57201eaa9581621aa3f..0000000000000000000000000000000000000000
--- a/source/api_reference/transpiler.rst
+++ /dev/null
@@ -1,52 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-.. _api_fluid_transpiler:
-
-================
-fluid.transpiler
-================
-
-.. _api_fluid_transpiler_DistributeTranspiler:
-
-DistributeTranspiler
---------------------
-
-..  autoclass:: paddle.fluid.transpiler.DistributeTranspiler
-    :members:
-    :noindex:
-
-.. _api_fluid_transpiler_memory_optimize:
-
-memory_optimize
----------------
-
-..  autofunction:: paddle.fluid.transpiler.memory_optimize
-    :noindex:
-
-.. _api_fluid_transpiler_release_memory:
-
-release_memory
---------------
-
-..  autofunction:: paddle.fluid.transpiler.release_memory
-    :noindex:
-
-.. _api_fluid_transpiler_HashName:
-
-HashName
---------
-
-..  autoclass:: paddle.fluid.transpiler.HashName
-    :members:
-    :noindex:
-
-.. _api_fluid_transpiler_RoundRobin:
-
-RoundRobin
-----------
-
-..  autoclass:: paddle.fluid.transpiler.RoundRobin
-    :members:
-    :noindex:
-
diff --git a/source/beginners_guide/basics/image_classification/image/cifar.png b/source/beginners_guide/basics/image_classification/image/cifar.png
deleted file mode 100644
index f3c5f2f7b0c84f83382b70124dcd439586ed4eb0..0000000000000000000000000000000000000000
Binary files a/source/beginners_guide/basics/image_classification/image/cifar.png and /dev/null differ
diff --git a/source/beginners_guide/basics/image_classification/image/variations.png b/source/beginners_guide/basics/image_classification/image/variations.png
deleted file mode 100644
index b4ebbbe6a50f5fd7cd0cccb52cdac5653e34654c..0000000000000000000000000000000000000000
Binary files a/source/beginners_guide/basics/image_classification/image/variations.png and /dev/null differ
diff --git a/source/beginners_guide/basics/image_classification/index.md b/source/beginners_guide/basics/image_classification/index.md
deleted file mode 100644
index 0f5a13d8b910870082ce3b233d338de505208dea..0000000000000000000000000000000000000000
--- a/source/beginners_guide/basics/image_classification/index.md
+++ /dev/null
@@ -1,559 +0,0 @@
-
-# 图像分类
-
-本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)，更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/168.html)。
-
-## 背景介绍
-
-图像相比文字能够提供更加生动、容易理解及更具艺术感的信息，是人们传递与交换信息的重要来源。在本教程中，我们专注于图像识别领域的一个重要问题，即图像分类。
-
-图像分类是根据图像的语义信息将不同类别图像区分开来，是计算机视觉中重要的基本问题，也是图像检测、图像分割、物体跟踪、行为分析等其他高层视觉任务的基础。图像分类在很多领域有广泛应用，包括安防领域的人脸识别和智能视频分析等，交通领域的交通场景识别，互联网领域基于内容的图像检索和相册自动归类，医学领域的图像识别等。
-
-
-一般来说，图像分类通过手工特征或特征学习方法对整个图像进行全部描述，然后使用分类器判别物体类别，因此如何提取图像的特征至关重要。在深度学习算法之前使用较多的是基于词袋(Bag of Words)模型的物体分类方法。词袋方法从自然语言处理中引入，即一句话可以用一个装了词的袋子表示其特征，袋子中的词为句子中的单词、短语或字。对于图像而言，词袋方法需要构建字典。最简单的词袋模型框架可以设计为**底层特征抽取**、**特征编码**、**分类器设计**三个过程。
-
-而基于深度学习的图像分类方法，可以通过有监督或无监督的方式**学习**层次化的特征描述，从而取代了手工设计或选择图像特征的工作。深度学习模型中的卷积神经网络(Convolution Neural Network, CNN)近年来在图像领域取得了惊人的成绩，CNN直接利用图像像素信息作为输入，最大程度上保留了输入图像的所有信息，通过卷积操作进行特征的提取和高层抽象，模型输出直接是图像识别的结果。这种基于"输入-输出"直接端到端的学习方法取得了非常好的效果，得到了广泛的应用。
-
-本教程主要介绍图像分类的深度学习模型，以及如何使用PaddlePaddle训练CNN模型。
-
-## 效果展示
-
-图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果，即模型可以正确识别图像上的主要物体。
-
-![dogCatClassification](./image/dog_cat.png)
-<p align="center">
-图1. 通用图像分类展示
-</p>
-
-
-图2展示了细粒度图像分类-花卉识别的效果，要求模型可以正确识别花的类别。
-
-![flowersClassification](./image/flowers.png)
-<p align="center">
-图2. 细粒度图像分类展示
-</p>
-
-
-一个好的模型既要对不同类别识别正确，同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动，较好的模型会像聪明的人类一样能够正确识别。
-
-![imageVariations](./image/variations.png)
-<p align="center">
-图3. 扰动图片展示[22]
-</p>
-
-## 模型概览
-
-图像识别领域大量的研究成果都是建立在[PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/)、[ImageNet](http://image-net.org/)等公开的数据集上，很多图像识别算法通常在这些数据集上进行测试和比较。PASCAL VOC是2005年发起的一个视觉挑战赛，ImageNet是2010年发起的大规模视觉识别竞赛(ILSVRC)的数据集，在本章中我们基于这些竞赛的一些论文介绍图像分类模型。
-
-在2012年之前的传统图像分类方法可以用背景描述中提到的三步完成，但通常完整建立图像识别模型一般包括底层特征学习、特征编码、空间约束、分类器设计、模型融合等几个阶段。
-1). **底层特征提取**: 通常从图像中按照固定步长、尺度提取大量局部特征描述。常用的局部特征包括SIFT(Scale-Invariant Feature Transform, 尺度不变特征转换) \[[1](#参考文献)\]、HOG(Histogram of Oriented Gradient, 方向梯度直方图) \[[2](#参考文献)\]、LBP(Local Binary Pattern, 局部二值模式) \[[3](#参考文献)\] 等，一般也采用多种特征描述子，防止丢失过多的有用信息。
-2). **特征编码**: 底层特征中包含了大量冗余与噪声，为了提高特征表达的鲁棒性，需要使用一种特征变换算法对底层特征进行编码，称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。
-3). **空间特征约束**: 特征编码之后一般会经过空间特征约束，也称作**特征汇聚**。特征汇聚是指在一个空间范围内，对每一维特征取最大值或者平均值，可以获得一定特征不变性的特征表达。金字塔特征匹配是一种常用的特征汇聚方法，这种方法提出将图像均匀分块，在分块内做特征汇聚。
-4). **通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述，接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器，在传统图像分类任务上性能很好。
-
-这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征，两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]。
-
-Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破，效果大幅度超越传统方法，获得了ILSVRC2012冠军，该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后，涌现了一系列CNN模型，不断地在ImageNet上刷新成绩，如图4展示。随着模型变得越来越深以及精妙的结构设计，Top-5的错误率也越来越低，降到了3.5%附近。而在同样的ImageNet数据集上，人眼的辨识错误率大概在5.1%，也就是目前的深度学习模型的识别能力已经超过了人眼。
-
-![ilsvrc](./image/ilsvrc.png)
-<p align="center">
-图4. ILSVRC图像分类Top-5错误率
-</p>
-
-### CNN
-
-传统CNN包含卷积层、全连接层等组件，并采用softmax多类别分类器和多类交叉熵损失函数，一个典型的卷积神经网络如图5所示，我们先介绍用来构造CNN的常见组件。
-
-![cnnStructure](./image/lenet.png)
-<p align="center">
-图5. CNN网络示例[20]
-</p>
-
-- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征，发掘出图片局部关联性质和空间不变性质。
-- 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作，可以过滤掉一些不重要的高频信息。
-- 全连接层(fully-connected layer，或者fc layer): 输入层到隐藏层的神经元是全部连接的。
-- 非线性变化: 卷积层、全连接层后面一般都会接非线性变化层，例如Sigmoid、Tanh、ReLu等来增强网络的表达能力，在CNN里最常使用的为ReLu激活函数。
-- Dropout \[[10](#参考文献)\] : 在模型训练阶段随机让一些隐层节点权重不工作，提高网络的泛化能力，一定程度上防止过拟合。
-
-另外，在训练过程中由于每层参数不断更新，会导致下一次输入分布发生变化，这样导致训练过程需要精心设计超参数。如2015年Sergey Ioffe和Christian Szegedy提出了Batch Normalization (BN)算法 \[[14](#参考文献)\] 中，每个batch对网络中的每一层特征都做归一化，使得每层分布相对稳定。BN算法不仅起到一定的正则作用，而且弱化了一些超参数的设计。经过实验证明，BN算法加速了模型收敛过程，在后来较深的模型中被广泛使用。
-
-接下来我们主要介绍VGG，GoogleNet和ResNet网络结构。
-
-### VGG
-
-牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构，它的核心是五组卷积操作，每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积，卷积核的数目由较浅组的64增多到最深组的512，同一组内的卷积核数目是一样的。卷积之后接两层全连接层，之后是分类层。由于每组内卷积层的不同，有11、13、16、19层这几种模型，下图展示一个16层的网络结构。VGG模型结构相对简洁，提出之后也有很多文章基于此模型进行研究，如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。
-
-![vgg16](./image/vgg16.png)
-<p align="center">
-图6. 基于ImageNet的VGG16模型
-</p>
-
-### GoogleNet
-
-GoogleNet \[[12](#参考文献)\] 在2014年ILSVRC中获得了冠军，在介绍该模型之前我们先来了解NIN(Network in Network)模型 \[[13](#参考文献)\] 和Inception模块，因为GoogleNet模型由多组Inception模块组成，模型设计借鉴了NIN的一些思想。
-
-NIN模型主要有两个特点：1) 引入了多层感知卷积网络(Multi-Layer Perceptron Convolution, MLPconv)代替一层线性卷积网络。MLPconv是一个微小的多层卷积网络，即在线性卷积后面增加若干层1x1的卷积，这样可以提取出高度非线性特征。2) 传统的CNN最后几层一般都是全连接层，参数较多。而NIN模型设计最后一层卷积层包含类别维度大小的特征图，然后采用全局均值池化(Avg-Pooling)替代全连接层，得到类别维度大小的向量，再进行分类。这种替代全连接层的方式有利于减少参数。
-
-Inception模块如下图7所示，图(a)是最简单的设计，输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数，拼接后会导致特征的通道数较大，经过几层这样的模块堆积后，通道数会越来越大，导致参数和计算量也随之增大。为了改善这个缺点，图(b)引入3个1x1卷积层进行降维，所谓的降维就是减少通道数，同时如NIN模型中提到的1x1卷积也可以修正线性特征。
-
-![inception](./image/inception.png)
-<p align="center">
-图7. Inception模块
-</p>
-
-GoogleNet由多组Inception模块堆积而成。另外，在网络最后也没有采用传统的多层全连接层，而是像NIN网络一样采用了均值池化层；但与NIN不同的是，池化层后面接了一层到类别数映射的全连接层。除了这两个特点之外，由于网络中间层特征也很有判别性，GoogleNet在中间层添加了两个辅助分类器，在后向传播中增强梯度并且增强正则化，而整个网络的损失函数是这三个分类器的损失加权求和。
-
-GoogleNet整体网络结构如图8所示，总共22层网络：开始由3层普通的卷积组成；接下来由三组子网络组成，第一组子网络包含2个Inception模块，第二组包含5个Inception模块，第三组包含2个Inception模块；然后接均值池化层、全连接层。
-
-![googleNet](./image/googlenet.jpeg)
-<p align="center">
-图8. GoogleNet[12]
-</p>
-
-
-上面介绍的是GoogleNet第一版模型(称作GoogleNet-v1)。GoogleNet-v2 \[[14](#参考文献)\] 引入BN层；GoogleNet-v3 \[[16](#参考文献)\] 对一些卷积层做了分解，进一步提高网络非线性能力和加深网络；GoogleNet-v4 \[[17](#参考文献)\] 引入下面要讲的ResNet设计思路。从v1到v4每一版的改进都会带来准确度的提升，限于篇幅，这里不再详细介绍v2到v4的结构。
-
-
-### ResNet
-
-ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类、图像物体定位和图像物体检测比赛的冠军。针对训练卷积神经网络时加深网络导致准确度下降的问题，ResNet提出了采用残差学习。在已有设计思路(BN, 小卷积核，全卷积网络)的基础上，引入了残差模块。每个残差模块包含两条路径，其中一条路径是输入特征的直连通路，另一条路径对该特征做两到三次卷积操作得到该特征的残差，最后再将两条路径上的特征相加。
-
-残差模块如图9所示，左边是基本模块连接方式，由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式，之所以称为瓶颈，是因为上面的1x1卷积用来降维(图示例即256->64)，下面的1x1卷积用来升维(图示例即64->256)，这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。
-
-![ResNetBlock](./image/resnet_block.jpg)
-<p align="center">
-图9. 残差模块
-</p>
-
-图10展示了50、101、152层网络连接示意图，使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快，成功的训练了上百乃至近千层的卷积神经网络。
-
-![ResNet](./image/resnet.png)
-<p align="center">
-图10. 基于ImageNet的ResNet模型
-</p>
-
-
-## 数据准备
-
-通用图像分类公开的标准数据集常用的有[CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html)、[ImageNet](http://image-net.org/)、[COCO](http://mscoco.org/)等，常用的细粒度图像分类数据集包括[CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html)、[Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/)、[Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/)等。其中ImageNet数据集规模相对较大，如[模型概览](#模型概览)一章所讲，大量研究成果基于ImageNet。ImageNet数据从2010年来稍有变化，常用的是ImageNet-2012数据集，该数据集包含1000个类别：训练集包含1,281,167张图片，每个类别数据732至1300张不等，验证集包含50,000张图片，平均每个类别50张图片。
-
-由于ImageNet数据集较大，下载和训练较慢，为了方便大家学习，我们使用[CIFAR10](<https://www.cs.toronto.edu/~kriz/cifar.html>)数据集。CIFAR10数据集包含60,000张32x32的彩色图片，10个类别，每个类包含6,000张。其中50,000张图片作为训练集，10000张作为测试集。图11从每个类别中随机抽取了10张图片，展示了所有的类别。
-
-![CIFAR](./image/cifar.png)
-<p align="center">
-图11. CIFAR10数据集[21]
-</p>
-
-Paddle API提供了自动加载cifar数据集模块 `paddle.dataset.cifar`。
-
-通过输入`python train.py`，就可以开始训练模型了，以下小节将详细介绍`train.py`的相关内容。
-
-### 模型结构
-
-#### Paddle 初始化
-
-让我们从导入 Paddle Fluid API 和辅助模块开始。
-
-```python
-import paddle
-import paddle.fluid as fluid
-import numpy
-import sys
-```
-
-本教程中我们提供了VGG和ResNet两个模型的配置。
-
-#### VGG
-
-首先介绍VGG模型结构，由于CIFAR10图片大小和数量相比ImageNet数据小很多，因此这里的模型针对CIFAR10数据做了一定的适配。卷积部分引入了BN和Dropout操作。
-VGG核心模块的输入是数据层，`vgg_bn_drop` 定义了16层VGG结构，每层卷积后面引入BN层和Dropout层，详细的定义如下：
-
-```python
-def vgg_bn_drop(input):
-def conv_block(ipt, num_filter, groups, dropouts):
-return fluid.nets.img_conv_group(
-input=ipt,
-pool_size=2,
-pool_stride=2,
-conv_num_filter=[num_filter] * groups,
-conv_filter_size=3,
-conv_act='relu',
-conv_with_batchnorm=True,
-conv_batchnorm_drop_rate=dropouts,
-pool_type='max')
-
-conv1 = conv_block(input, 64, 2, [0.3, 0])
-conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
-drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-fc1 = fluid.layers.fc(input=drop, size=512, act=None)
-bn = fluid.layers.batch_norm(input=fc1, act='relu')
-drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
-predict = fluid.layers.fc(input=fc2, size=10, act='softmax')
-return predict
-```
-
-1. 首先定义了一组卷积网络，即conv_block。卷积核大小为3x3，池化窗口大小为2x2，窗口滑动大小为2，groups决定每组VGG模块是几次连续的卷积操作，dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`fluid.nets`中预定义的模块，由若干组 Conv->BN->ReLu->Dropout 和 一组 Pooling 组成。
-
-2. 五组卷积操作，即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0，即不使用Dropout操作。
-
-3. 最后接两层512维的全连接。
-
-4. 通过上面VGG网络提取高层特征，然后经过全连接层映射到类别维度大小的向量，再通过Softmax归一化得到每个类别的概率，也可称作分类器。
-
-### ResNet
-
-ResNet模型的第1、3、4步和VGG模型相同，这里不再介绍。主要介绍第2步即CIFAR10数据集上ResNet核心模块。
-
-先介绍`resnet_cifar10`中的一些基本函数，再介绍网络连接过程。
-
-- `conv_bn_layer` : 带BN的卷积层。
-- `shortcut` : 残差模块的"直连"路径，"直连"实际分两种形式：残差模块输入和输出特征通道数不等时，采用1x1卷积的升维操作；残差模块输入和输出通道相等时，采用直连操作。
-- `basicblock` : 一个基础残差模块，即图9左边所示，由两组3x3卷积组成的路径和一条"直连"路径组成。
-- `bottleneck` : 一个瓶颈残差模块，即图9右边所示，由上下1x1卷积和中间3x3卷积组成的路径和一条"直连"路径组成。
-- `layer_warp` : 一组残差模块，由若干个残差模块堆积而成。每组中第一个残差模块滑动窗口大小与其他可以不同，以用来减少特征图在垂直和水平方向的大小。
-
-```python
-def conv_bn_layer(input,
-ch_out,
-filter_size,
-stride,
-padding,
-act='relu',
-bias_attr=False):
-tmp = fluid.layers.conv2d(
-input=input,
-filter_size=filter_size,
-num_filters=ch_out,
-stride=stride,
-padding=padding,
-act=None,
-bias_attr=bias_attr)
-return fluid.layers.batch_norm(input=tmp, act=act)
-
-
-def shortcut(input, ch_in, ch_out, stride):
-if ch_in != ch_out:
-return conv_bn_layer(input, ch_out, 1, stride, 0, None)
-else:
-return input
-
-
-def basicblock(input, ch_in, ch_out, stride):
-tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
-tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True)
-short = shortcut(input, ch_in, ch_out, stride)
-return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
-
-
-def layer_warp(block_func, input, ch_in, ch_out, count, stride):
-tmp = block_func(input, ch_in, ch_out, stride)
-for i in range(1, count):
-tmp = block_func(tmp, ch_out, ch_out, 1)
-return tmp
-```
-
-`resnet_cifar10` 的连接结构主要有以下几个过程。
-
-1. 底层输入连接一层 `conv_bn_layer`，即带BN的卷积层。
-2. 然后连接3组残差模块即下面配置3组 `layer_warp` ，每组采用图 9 左边残差模块组成。
-3. 最后对网络做均值池化并返回该层。
-
-注意：除过第一层卷积层和最后一层全连接层之外，要求三组 `layer_warp` 总的含参层数能够被6整除，即 `resnet_cifar10` 的 depth 要满足 `$(depth - 2) % 6 == 0$` 。
-
-```python
-def resnet_cifar10(ipt, depth=32):
-# depth should be one of 20, 32, 44, 56, 110, 1202
-assert (depth - 2) % 6 == 0
-n = (depth - 2) / 6
-nStages = {16, 64, 128}
-conv1 = conv_bn_layer(ipt, ch_out=16, filter_size=3, stride=1, padding=1)
-res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
-res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
-res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
-pool = fluid.layers.pool2d(
-input=res3, pool_size=8, pool_type='avg', pool_stride=1)
-predict = fluid.layers.fc(input=pool, size=10, act='softmax')
-return predict
-```
-
-## Inference Program 配置
-
-网络输入定义为 `data_layer` (数据层)，在图像分类中即为图像像素信息。CIFAR10是RGB 3通道32x32大小的彩色图，因此输入数据大小为3072(3x32x32)。
-
-```python
-def inference_program():
-# The image is 32 * 32 with RGB representation.
-data_shape = [3, 32, 32]
-images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-
-predict = resnet_cifar10(images, 32)
-# predict = vgg_bn_drop(images) # un-comment to use vgg net
-return predict
-```
-
-## Train Program 配置
-
-然后我们需要设置训练程序 `train_program`。它首先从推理程序中进行预测。
-在训练期间，它将从预测中计算 `avg_cost`。
-在有监督训练中需要输入图像对应的类别信息，同样通过`fluid.layers.data`来定义。训练中采用多类交叉熵作为损失函数，并作为网络的输出，预测阶段定义网络的输出为分类器得到的概率信息。
-
-**注意:** 训练程序应该返回一个数组，第一个返回参数必须是 `avg_cost`。训练器使用它来计算梯度。
-
-```python
-def train_program():
-predict = inference_program()
-
-label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-cost = fluid.layers.cross_entropy(input=predict, label=label)
-avg_cost = fluid.layers.mean(cost)
-accuracy = fluid.layers.accuracy(input=predict, label=label)
-return [avg_cost, accuracy]
-```
-
-## Optimizer Function 配置
-
-在下面的 `Adam optimizer`，`learning_rate` 是训练的速度，与网络的训练收敛速度有关系。
-
-```python
-def optimizer_program():
-return fluid.optimizer.Adam(learning_rate=0.001)
-```
-
-## 训练模型
-
-### Trainer 配置
-
-现在，我们需要配置 `Trainer`。`Trainer` 需要接受训练程序 `train_program`, `place` 和优化器 `optimizer_func`。
-
-```python
-use_cuda = False
-place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-trainer = fluid.Trainer(
-train_func=train_program,
-optimizer_func=optimizer_program,
-place=place)
-```
-
-### Data Feeders 配置
-
-`cifar.train10()` 每次产生一条样本，在完成shuffle和batch之后，作为训练的输入。
-
-```python
-# Each batch will yield 128 images
-BATCH_SIZE = 128
-
-# Reader for training
-train_reader = paddle.batch(
-paddle.reader.shuffle(paddle.dataset.cifar.train10(), buf_size=50000),
-batch_size=BATCH_SIZE)
-
-# Reader for testing. A separated data set for testing.
-test_reader = paddle.batch(
-paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
-```
-
-### Event Handler
-
-可以使用`event_handler`回调函数来观察训练过程，或进行测试等, 该回调函数是`trainer.train`函数里设定。
-
-`event_handler_plot`可以用来利用回调数据来打点画图:
-
-![png](./image/train_and_test.png)
-
-```python
-params_dirname = "image_classification_resnet.inference.model"
-
-from paddle.v2.plot import Ploter
-
-train_title = "Train cost"
-test_title = "Test cost"
-cost_ploter = Ploter(train_title, test_title)
-
-step = 0
-def event_handler_plot(event):
-global step
-if isinstance(event, fluid.EndStepEvent):
-if step % 1 == 0:
-cost_ploter.append(train_title, step, event.metrics[0])
-cost_ploter.plot()
-step += 1
-if isinstance(event, fluid.EndEpochEvent):
-avg_cost, accuracy = trainer.test(
-reader=test_reader,
-feed_order=['pixel', 'label'])
-cost_ploter.append(test_title, step, avg_cost)
-
-# save parameters
-if params_dirname is not None:
-trainer.save_params(params_dirname)
-```
-
-`event_handler` 用来在训练过程中输出文本日志
-
-```python
-params_dirname = "image_classification_resnet.inference.model"
-
-# event handler to track training and testing process
-def event_handler(event):
-if isinstance(event, fluid.EndStepEvent):
-if event.step % 100 == 0:
-print("\nPass %d, Batch %d, Cost %f, Acc %f" %
-(event.step, event.epoch, event.metrics[0],
-event.metrics[1]))
-else:
-sys.stdout.write('.')
-sys.stdout.flush()
-
-if isinstance(event, fluid.EndEpochEvent):
-# Test against with the test dataset to get accuracy.
-avg_cost, accuracy = trainer.test(
-reader=test_reader, feed_order=['pixel', 'label'])
-
-print('\nTest with Pass {0}, Loss {1:2.2}, Acc {2:2.2}'.format(event.epoch, avg_cost, accuracy))
-
-# save parameters
-if params_dirname is not None:
-trainer.save_params(params_dirname)
-```
-
-### 训练
-
-通过`trainer.train`函数训练:
-
-**注意:** 在CPU上，每个 Epoch 将花费大约15～20分钟。这部分可能需要一段时间。请随意修改代码，在GPU上运行测试，以提高训练速度。
-
-```python
-trainer.train(
-reader=train_reader,
-num_epochs=2,
-event_handler=event_handler,
-feed_order=['pixel', 'label'])
-```
-
-一轮训练log示例如下所示，经过1个pass， 训练集上平均 Accuracy 为0.59 ，测试集上平均  Accuracy 为0.6 。
-
-```text
-Pass 0, Batch 0, Cost 3.869598, Acc 0.164062
-...................................................................................................
-Pass 100, Batch 0, Cost 1.481038, Acc 0.460938
-...................................................................................................
-Pass 200, Batch 0, Cost 1.340323, Acc 0.523438
-...................................................................................................
-Pass 300, Batch 0, Cost 1.223424, Acc 0.593750
-..........................................................................................
-Test with Pass 0, Loss 1.1, Acc 0.6
-```
-
-图12是训练的分类错误率曲线图，运行到第200个pass后基本收敛，最终得到测试集上分类错误率为8.54%。
-
-![CIFARErrorRate](./image/plot.png)
-<p align="center">
-图12. CIFAR10数据集上VGG模型的分类错误率
-</p>
-
-## 应用模型
-
-可以使用训练好的模型对图片进行分类，下面程序展示了如何使用 `fluid.Inferencer` 接口进行推断，可以打开注释，更改加载的模型。
-
-### 生成预测输入数据
-
-`dog.png` is an example image of a dog. Turn it into a numpy array to match the data feeder format.
-
-```python
-# Prepare testing data.
-from PIL import Image
-import numpy as np
-import os
-
-def load_image(file):
-im = Image.open(file)
-im = im.resize((32, 32), Image.ANTIALIAS)
-
-im = np.array(im).astype(np.float32)
-# The storage order of the loaded image is W(width),
-# H(height), C(channel). PaddlePaddle requires
-# the CHW order, so transpose them.
-im = im.transpose((2, 0, 1))  # CHW
-im = im / 255.0
-
-# Add one dimension to mimic the list format.
-im = numpy.expand_dims(im, axis=0)
-return im
-
-cur_dir = os.getcwd()
-img = load_image(cur_dir + '/image/dog.png')
-```
-
-### Inferencer 配置和预测
-
-`Inferencer` 需要一个 `infer_func` 和 `param_path` 来设置网络和经过训练的参数。
-我们可以简单地插入前面定义的推理程序。
-现在我们准备做预测。
-
-```python
-inferencer = fluid.Inferencer(
-infer_func=inference_program, param_path=params_dirname, place=place)
-
-# inference
-results = inferencer.infer({'pixel': img})
-print("infer results: ", results)
-```
-
-## 总结
-
-传统图像分类方法由多个阶段构成，框架较为复杂，而端到端的CNN模型结构可一步到位，而且大幅度提升了分类准确率。本文我们首先介绍VGG、GoogleNet、ResNet三个经典的模型；然后基于CIFAR10数据集，介绍如何使用PaddlePaddle配置和训练CNN模型，尤其是VGG和ResNet模型；最后介绍如何使用PaddlePaddle的API接口对图片进行预测和特征提取。对于其他数据集比如ImageNet，配置和训练流程是同样的，大家可以自行进行实验。
-
-
-## 参考文献
-
-[1] D. G. Lowe, [Distinctive image features from scale-invariant keypoints](http://www.cs.ubc.ca/~lowe/papers/ijcv04.pdf). IJCV, 60(2):91-110, 2004.
-
-[2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005.
-
-[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28.
-
-[4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003.
-
-[5] B. Olshausen, D. Field, [Sparse Coding with an Overcomplete Basis Set: A Strategy Employed by V1?](http://redwood.psych.cornell.edu/papers/olshausen_field_1997.pdf), Vision Research, vol. 37, pp. 3311-3325, 1997.
-
-[6] Wang, J., Yang, J., Yu, K., Lv, F., Huang, T., and Gong, Y. (2010). [Locality-constrained Linear Coding for image classification](http://ieeexplore.ieee.org/abstract/document/5540018/). In CVPR.
-
-[7] Perronnin, F., Sánchez, J., & Mensink, T. (2010). [Improving the fisher kernel for large-scale image classification](http://dl.acm.org/citation.cfm?id=1888101). In ECCV (4).
-
-[8] Lin, Y., Lv, F., Cao, L., Zhu, S., Yang, M., Cour, T., Yu, K., and Huang, T. (2011). [Large-scale image classification: Fast feature extraction and SVM training](http://ieeexplore.ieee.org/document/5995477/). In CVPR.
-
-[9] Krizhevsky, A., Sutskever, I., and Hinton, G. (2012). [ImageNet classification with deep convolutional neural networks](http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf). In NIPS.
-
-[10] G.E. Hinton, N. Srivastava, A. Krizhevsky, I. Sutskever, and R.R. Salakhutdinov. [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). arXiv preprint arXiv:1207.0580, 2012.
-
-[11] K. Chatfield, K. Simonyan, A. Vedaldi, A. Zisserman. [Return of the Devil in the Details: Delving Deep into Convolutional Nets](https://arxiv.org/abs/1405.3531). BMVC, 2014。
-
-[12] Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A., [Going deeper with convolutions](https://arxiv.org/abs/1409.4842). In: CVPR. (2015)
-
-[13] Lin, M., Chen, Q., and Yan, S. [Network in network](https://arxiv.org/abs/1312.4400). In Proc. ICLR, 2014.
-
-[14] S. Ioffe and C. Szegedy. [Batch normalization: Accelerating deep network training by reducing internal covariate shift](https://arxiv.org/abs/1502.03167). In ICML, 2015.
-
-[15] K. He, X. Zhang, S. Ren, J. Sun. [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). CVPR 2016.
-
-[16] Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. [Rethinking the inception architecture for computer vision](https://arxiv.org/abs/1512.00567). In: CVPR. (2016).
-
-[17] Szegedy, C., Ioffe, S., Vanhoucke, V. [Inception-v4, inception-resnet and the impact of residual connections on learning](https://arxiv.org/abs/1602.07261). arXiv:1602.07261 (2016).
-
-[18] Everingham, M., Eslami, S. M. A., Van Gool, L., Williams, C. K. I., Winn, J. and Zisserman, A. [The Pascal Visual Object Classes Challenge: A Retrospective](http://link.springer.com/article/10.1007/s11263-014-0733-5). International Journal of Computer Vision, 111(1), 98-136, 2015.
-
-[19] He, K., Zhang, X., Ren, S., and Sun, J. [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852). ArXiv e-prints, February 2015.
-
-[20] http://deeplearning.net/tutorial/lenet.html
-
-[21] https://www.cs.toronto.edu/~kriz/cifar.html
-
-[22] http://cs231n.github.io/classification/
-
-<br/>
-<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/source/beginners_guide/basics/label_semantic_roles/index.md b/source/beginners_guide/basics/label_semantic_roles/index.md
deleted file mode 100644
index 331c093d784d7c9ba23c571fee4955da3d7be22f..0000000000000000000000000000000000000000
--- a/source/beginners_guide/basics/label_semantic_roles/index.md
+++ /dev/null
@@ -1,568 +0,0 @@
-# 语义角色标注
-
-本教程源代码目录在[book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)，更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/178.html)。
-
-## 背景介绍
-
-自然语言分析技术大致分为三个层面：词法分析、句法分析和语义分析。语义角色标注是实现浅层语义分析的一种方式。在一个句子中，谓词是对主语的陈述或说明，指出“做什么”、“是什么”或“怎么样”，代表了一个事件的核心，跟谓词搭配的名词称为论元。语义角色是指论元在动词所指事件中担任的角色。主要有：施事者（Agent）、受事者（Patient）、客体（Theme）、经验者（Experiencer）、受益者（Beneficiary）、工具（Instrument）、处所（Location）、目标（Goal）和来源（Source）等。
-
-请看下面的例子，“遇到” 是谓词（Predicate，通常简写为“Pred”），“小明”是施事者（Agent），“小红”是受事者（Patient），“昨天” 是事件发生的时间（Time），“公园”是事情发生的地点（Location）。
-
-$$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_{\mbox{Time}}\mbox{在[公园]}_{\mbox{Location}}\mbox{[遇到]}_{\mbox{Predicate}}\mbox{了[小红]}_{\mbox{Patient}}\mbox{。}$$
-
-语义角色标注（Semantic Role Labeling，SRL）以句子的谓词为中心，不对句子所包含的语义信息进行深入分析，只分析句子中各成分与谓词之间的关系，即句子的谓词（Predicate）- 论元（Argument）结构，并用语义角色来描述这些结构关系，是许多自然语言理解任务（如信息抽取，篇章分析，深度问答等）的一个重要中间步骤。在研究中一般都假定谓词是给定的，所要做的就是找出给定谓词的各个论元和它们的语义角色。
-
-传统的SRL系统大多建立在句法分析基础之上，通常包括5个流程：
-
-1. 构建一棵句法分析树，例如，图1是对上面例子进行依存句法分析得到的一棵句法树。
-2. 从句法树上识别出给定谓词的候选论元。
-3. 候选论元剪除；一个句子中的候选论元可能很多，候选论元剪除就是从大量的候选项中剪除那些最不可能成为论元的候选项。
-4. 论元识别：这个过程是从上一步剪除之后的候选中判断哪些是真正的论元，通常当做一个二分类问题来解决。
-5. 对第4步的结果，通过多分类得到论元的语义角色标签。可以看到，句法分析是基础，并且后续步骤常常会构造的一些人工特征，这些特征往往也来自句法分析。
-
-![dependencyParsing](./image/dependency_parsing.png)
-<div  align="center">
-图1. 依存句法分析句法树示例
-</div>
-
-然而，完全句法分析需要确定句子所包含的全部句法信息，并确定句子各成分之间的关系，是一个非常困难的任务，目前技术下的句法分析准确率并不高，句法分析的细微错误都会导致SRL的错误。为了降低问题的复杂度，同时获得一定的句法结构信息，“浅层句法分析”的思想应运而生。浅层句法分析也称为部分句法分析（partial parsing）或语块划分（chunking）。和完全句法分析得到一棵完整的句法树不同，浅层句法分析只需要识别句子中某些结构相对简单的独立成分，例如：动词短语，这些被识别出来的结构称为语块。为了回避 “无法获得准确率较高的句法树” 所带来的困难，一些研究\[[1](#参考文献)\]也提出了基于语块（chunk）的SRL方法。基于语块的SRL方法将SRL作为一个序列标注问题来解决。序列标注任务一般都会采用BIO表示方式来定义序列标注的标签集，我们先来介绍这种表示方法。在BIO表示法中，B代表语块的开始，I代表语块的中间，O代表不属于任何语块（Outside）。通过B、I、O 三种标记将不同的语块赋予不同的标签，例如：对于一个角色为A的论元，将它所包含的第一个语块赋予标签B-A，将它所包含的其它语块赋予标签I-A，不属于任何论元的语块赋予标签O。
-
-我们继续以上面的这句话为例，图2展示了BIO表示方法。
-
-![bioExample](./image/bio_example.png)
-<div  align="center">
-图2. BIO标注方法示例
-</div>
-
-从上面的例子可以看到，根据序列标注结果可以直接得到论元的语义角色标注结果，是一个相对简单的过程。这种简单性体现在：（1）依赖浅层句法分析，降低了句法分析的要求和难度；（2）没有了候选论元剪除这一步骤；（3）论元的识别和论元标注是同时实现的。这种一体化处理论元识别和论元标注的方法，简化了流程，降低了错误累积的风险，往往能够取得更好的结果。
-
-与基于语块的SRL方法类似，在本教程中我们也将SRL看作一个序列标注问题，不同的是，我们只依赖输入文本序列，不依赖任何额外的语法解析结果或是复杂的人造特征，利用深度神经网络构建一个端到端学习的SRL系统。我们以[CoNLL-2004 and CoNLL-2005 Shared Tasks](http://www.cs.upc.edu/~srlconll/)任务中SRL任务的公开数据集为例，实践下面的任务：给定一句话和这句话里的一个谓词，通过序列标注的方式，从句子中找到谓词对应的论元，同时标注它们的语义角色。
-
-## 模型概览
-
-循环神经网络（Recurrent Neural Network）是一种对序列建模的重要模型，在自然语言处理任务中有着广泛的应用。不同于前馈神经网络（Feed-forward Neural Network），RNN能够处理输入之间前后关联的问题。LSTM是RNN的一种重要变种，常用来学习长序列中蕴含的长程依赖关系，我们在[情感分析](https://github.com/PaddlePaddle/book/tree/develop/05.understand_sentiment)一篇中已经介绍过，这一篇中我们依然利用LSTM来解决SRL问题。
-
-### 栈式循环神经网络（Stacked Recurrent Neural Network）
-
-深层网络有助于形成层次化特征，网络上层在下层已经学习到的初级特征基础上，形成更复杂的高级特征。尽管LSTM沿时间轴展开后等价于一个非常“深”的前馈网络，但由于LSTM各个时间步参数共享，`$t-1$`时刻状态到`$t$`时刻的映射，始终只经过了一次非线性映射，也就是说单层LSTM对状态转移的建模是 “浅” 的。堆叠多个LSTM单元，令前一个LSTM`$t$`时刻的输出，成为下一个LSTM单元`$t$`时刻的输入，帮助我们构建起一个深层网络，我们把它称为第一个版本的栈式循环神经网络。深层网络提高了模型拟合复杂模式的能力，能够更好地建模跨不同时间步的模式\[[2](#参考文献)\]。
-
-然而，训练一个深层LSTM网络并非易事。纵向堆叠多个LSTM单元可能遇到梯度在纵向深度上传播受阻的问题。通常，堆叠4层LSTM单元可以正常训练，当层数达到4~8层时，会出现性能衰减，这时必须考虑一些新的结构以保证梯度纵向顺畅传播，这是训练深层LSTM网络必须解决的问题。我们可以借鉴LSTM解决 “梯度消失梯度爆炸” 问题的智慧之一：在记忆单元（Memory Cell）这条信息传播的路线上没有非线性映射，当梯度反向传播时既不会衰减、也不会爆炸。因此，深层LSTM模型也可以在纵向上添加一条保证梯度顺畅传播的路径。
-
-一个LSTM单元完成的运算可以被分为三部分：（1）输入到隐层的映射（input-to-hidden） ：每个时间步输入信息`$x$`会首先经过一个矩阵映射，再作为遗忘门，输入门，记忆单元，输出门的输入，注意，这一次映射没有引入非线性激活；（2）隐层到隐层的映射（hidden-to-hidden）：这一步是LSTM计算的主体，包括遗忘门，输入门，记忆单元更新，输出门的计算；（3）隐层到输出的映射（hidden-to-output）：通常是简单的对隐层向量进行激活。我们在第一个版本的栈式网络的基础上，加入一条新的路径：除上一层LSTM输出之外，将前层LSTM的输入到隐层的映射作为一个新的输入，同时加入一个线性映射去学习一个新的变换。
-
-图3是最终得到的栈式循环神经网络结构示意图。
-
-![lstmStructure](./image/stacked_lstm.png)
-<p align="center">
-图3. 基于LSTM的栈式循环神经网络结构示意图
-</p>
-
-### 双向循环神经网络（Bidirectional Recurrent Neural Network）
-
-在LSTM中，`$t$`时刻的隐藏层向量编码了到`$t$`时刻为止所有输入的信息，但`$t$`时刻的LSTM可以看到历史，却无法看到未来。在绝大多数自然语言处理任务中，我们几乎总是能拿到整个句子。这种情况下，如果能够像获取历史信息一样，得到未来的信息，对序列学习任务会有很大的帮助。
-
-为了克服这一缺陷，我们可以设计一种双向循环网络单元，它的思想简单且直接：对上一节的栈式循环神经网络进行一个小小的修改，堆叠多个LSTM单元，让每一层LSTM单元分别以：正向、反向、正向 …… 的顺序学习上一层的输出序列。于是，从第2层开始，`$t$`时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。
-
-![lstmStructure](./image/bidirectional_stacked_lstm.png)
-<p align="center">
-图4. 基于LSTM的双向循环神经网络结构示意图
-</p>
-
-需要说明的是，这种双向RNN结构和Bengio等人在机器翻译任务中使用的双向RNN结构\[[3](#参考文献), [4](#参考文献)\] 并不相同，我们会在后续[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)任务中，介绍另一种双向循环神经网络。
-
-### 条件随机场 (Conditional Random Field)
-
-使用神经网络模型解决问题的思路通常是：前层网络学习输入的特征表示，网络的最后一层在特征基础上完成最终的任务。在SRL任务中，深层LSTM网络学习输入的特征表示，条件随机场（Conditional Random Field， CRF）在特征的基础上完成序列标注，处于整个网络的末端。
-
-CRF是一种概率化结构模型，可以看作是一个概率无向图模型，结点表示随机变量，边表示随机变量之间的概率依赖关系。简单来讲，CRF学习条件概率`$P(Y|X)$`，其中 `$X = (x_1, x_2, ... , x_n)$` 是输入序列，`$Y = (y_1, y_2, ... , y_n)$` 是标记序列；解码过程是给定 `$X$`序列求解令`$P(Y|X)$`最大的`$Y$`序列，即`$Y^* = \mbox{arg max}_{Y} P(Y | X)$`。
-
-序列标注任务只需要考虑输入和输出都是一个线性序列，并且由于我们只是将输入序列作为条件，不做任何条件独立假设，因此输入序列的元素之间并不存在图结构。综上，在序列标注任务中使用的是如图5所示的定义在链式图上的CRF，称之为线性链条件随机场（Linear Chain Conditional Random Field）。
-
-![linear_chain_crf](./image/linear_chain_crf.png)
-<p align="center">
-图5. 序列标注任务中使用的线性链条件随机场
-</p>
-
-根据线性链条件随机场上的因子分解定理\[[5](#参考文献)\]，在给定观测序列`$X$`时，一个特定标记序列`$Y$`的概率可以定义为：
-
-$$p(Y | X) = \frac{1}{Z(X)} \text{exp}\left(\sum_{i=1}^{n}\left(\sum_{j}\lambda_{j}t_{j} (y_{i - 1}, y_{i}, X, i) + \sum_{k} \mu_k s_k (y_i, X, i)\right)\right)$$
-
-其中`$Z(X)$`是归一化因子，`$t_j$` 是定义在边上的特征函数，依赖于当前和前一个位置，称为转移特征，表示对于输入序列`$X$`及其标注序列在 `$i$`及`$i - 1$`位置上标记的转移概率。`$s_k$`是定义在结点上的特征函数，称为状态特征，依赖于当前位置，表示对于观察序列`$X$`及其`$i$`位置的标记概率。`$\lambda_j$` 和 `$\mu_k$` 分别是转移特征函数和状态特征函数对应的权值。实际上，`$t$`和`$s$`可以用相同的数学形式表示，再对转移特征和状态特征在各个位置`$i$`求和有：`$f_{k}(Y, X) = \sum_{i=1}^{n}f_k({y_{i - 1}, y_i, X, i})$`，把`$f$`统称为特征函数，于是`$P(Y|X)$`可表示为：
-
-$$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$
-
-`$\omega$`是特征函数对应的权值，是CRF模型要学习的参数。训练时，对于给定的输入序列和对应的标记序列集合`$D = \left[(X_1,  Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$` ，通过正则化的极大似然估计，求解如下优化目标：
-
-$$\DeclareMathOperator*{\argmax}{arg\,max} L(W, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$
-
-这个优化目标可以通过反向传播算法和整个神经网络一起求解。解码时，对于给定的输入序列`$X$`，通过解码算法（通常有：维特比算法、Beam Search）求出令条件概率`$\bar{P}(Y|X)$`最大的输出序列 `$\bar{Y}$`。
-
-### 深度双向LSTM（DB-LSTM）SRL模型
-
-在SRL任务中，输入是 “谓词” 和 “一句话”，目标是从这句话中找到谓词的论元，并标注论元的语义角色。如果一个句子含有`$n$`个谓词，这个句子会被处理`$n$`次。一个最为直接的模型是下面这样：
-
-1. 构造输入；
-- 输入1是谓词，输入2是句子
-- 将输入1扩展成和输入2一样长的序列，用one-hot方式表示；
-2. one-hot方式的谓词序列和句子序列通过词表，转换为实向量表示的词向量序列；
-3. 将步骤2中的2个词向量序列作为双向LSTM的输入，学习输入序列的特征表示；
-4. CRF以步骤3中模型学习到的特征为输入，以标记序列为监督信号，实现序列标注；
-
-大家可以尝试上面这种方法。这里，我们提出一些改进，引入两个简单但对提高系统性能非常有效的特征：
-
-- 谓词上下文：上面的方法中，只用到了谓词的词向量表达谓词相关的所有信息，这种方法始终是非常弱的，特别是如果谓词在句子中出现多次，有可能引起一定的歧义。从经验出发，谓词前后若干个词的一个小片段，能够提供更丰富的信息，帮助消解歧义。于是，我们把这样的经验也添加到模型中，为每个谓词同时抽取一个“谓词上下文” 片段，也就是从这个谓词前后各取`$n$`个词构成的一个窗口片段；
-- 谓词上下文区域标记：为句子中的每一个词引入一个0-1二值变量，表示它们是否在“谓词上下文”片段中；
-
-修改后的模型如下（图6是一个深度为4的模型结构示意图）：
-
-1. 构造输入
-- 输入1是句子序列，输入2是谓词序列，输入3是谓词上下文，从句子中抽取这个谓词前后各`$n$`个词，构成谓词上下文，用one-hot方式表示，输入4是谓词上下文区域标记，标记了句子中每一个词是否在谓词上下文中；
-- 将输入2~3均扩展为和输入1一样长的序列；
-2. 输入1~4均通过词表取词向量转换为实向量表示的词向量序列；其中输入1、3共享同一个词表，输入2和4各自独有词表；
-3. 第2步的4个词向量序列作为双向LSTM模型的输入；LSTM模型学习输入序列的特征表示，得到新的特征表示序列；
-4. CRF以第3步中LSTM学习到的特征为输入，以标记序列为监督信号，完成序列标注；
-
-![db_lstm_network](./image/db_lstm_network.png)
-<div  align="center">
-图6. SRL任务上的深层双向LSTM模型
-</div>
-
-
-## 数据介绍
-
-在此教程中，我们选用[CoNLL 2005](http://www.cs.upc.edu/~srlconll/)SRL任务开放出的数据集作为示例。需要特别说明的是，CoNLL 2005 SRL任务的训练数据集和开发集在比赛之后并非免费进行公开，目前，能够获取到的只有测试集，包括Wall Street Journal的23节和Brown语料集中的3节。在本教程中，我们以测试集中的WSJ数据为训练集来讲解模型。但是，由于测试集中样本的数量远远不够，如果希望训练一个可用的神经网络SRL系统，请考虑付费获取全量数据。
-
-原始数据中同时包括了词性标注、命名实体识别、语法解析树等多种信息。本教程中，我们使用test.wsj文件夹中的数据进行训练和测试，并只会用到words文件夹（文本序列）和props文件夹（标注结果）下的数据。本教程使用的数据目录如下：
-
-```text
-conll05st-release/
-└── test.wsj
-├── props  # 标注结果
-└── words  # 输入文本序列
-```
-
-标注信息源自Penn TreeBank\[[7](#参考文献)\]和PropBank\[[8](#参考文献)\]的标注结果。PropBank标注结果的标签和我们在文章一开始示例中使用的标注结果标签不同，但原理是相同的，关于标注结果标签含义的说明，请参考论文\[[9](#参考文献)\]。
-
-原始数据需要进行数据预处理才能被PaddlePaddle处理，预处理包括下面几个步骤:
-
-1. 将文本序列和标记序列合并到一条记录中；
-2. 一个句子如果含有`$n$`个谓词，这个句子会被处理`$n$`次，变成`$n$`条独立的训练样本，每个样本一个不同的谓词；
-3. 抽取谓词上下文和构造谓词上下文区域标记；
-4. 构造以BIO法表示的标记；
-5. 依据词典获取词对应的整数索引。
-
-
-```python
-# import paddle.v2.dataset.conll05 as conll05
-# conll05.corpus_reader函数完成上面第1步和第2步.
-# conll05.reader_creator函数完成上面第3步到第5步.
-# conll05.test函数可以获取处理之后的每条样本来供PaddlePaddle训练.
-```
-
-预处理完成之后一条训练样本包含9个特征，分别是：句子序列、谓词、谓词上下文（占 5 列）、谓词上下文区域标记、标注序列。下表是一条训练样本的示例。
-
-| 句子序列 | 谓词 | 谓词上下文（窗口 = 5） | 谓词上下文区域标记 | 标注序列 |
-|---|---|---|---|---|
-| A | set | n't been set . × | 0 | B-A1 |
-| record | set | n't been set . × | 0 | I-A1 |
-| date | set | n't been set . × | 0 | I-A1 |
-| has | set | n't been set . × | 0 | O |
-| n't | set | n't been set . × | 1 | B-AM-NEG |
-| been | set | n't been set . × | 1 | O |
-| set | set | n't been set . × | 1 | B-V |
-| . | set | n't been set . × | 1 | O |
-
-
-除数据之外，我们同时提供了以下资源：
-
-| 文件名称 | 说明 |
-|---|---|
-| word_dict | 输入句子的词典，共计44068个词 |
-| label_dict | 标记的词典，共计106个标记 |
-| predicate_dict | 谓词的词典，共计3162个词 |
-| emb | 一个训练好的词表，32维 |
-
-我们在英文维基百科上训练语言模型得到了一份词向量用来初始化SRL模型。在SRL模型训练过程中，词向量不再被更新。关于语言模型和词向量可以参考[词向量](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md) 这篇教程。我们训练语言模型的语料共有995,000,000个token，词典大小控制为4,900,000词。CoNLL 2005训练语料中有5%的词不在这4,900,000个词中，我们将它们全部看作未登录词，用`<unk>`表示。
-
-获取词典，打印词典大小：
-
-```python
-import math, os
-import numpy as np
-import paddle
-import paddle.v2.dataset.conll05 as conll05
-import paddle.fluid as fluid
-import time
-
-with_gpu = os.getenv('WITH_GPU', '0') != '0'
-
-word_dict, verb_dict, label_dict = conll05.get_dict()
-word_dict_len = len(word_dict)
-label_dict_len = len(label_dict)
-pred_dict_len = len(verb_dict)
-
-print word_dict_len
-print label_dict_len
-print pred_dict_len
-```
-
-## 模型配置说明
-
-- 定义输入数据维度及模型超参数。
-
-```python
-mark_dict_len = 2   # 谓上下文区域标志的维度，是一个0-1 2值特征，因此维度为2
-word_dim = 32       # 词向量维度
-mark_dim = 5        # 谓词上下文区域通过词表被映射为一个实向量，这个是相邻的维度
-hidden_dim = 512    # LSTM隐层向量的维度 ： 512 / 4
-depth = 8           # 栈式LSTM的深度
-mix_hidden_lr = 1e-3
-
-IS_SPARSE = True
-PASS_NUM = 10
-BATCH_SIZE = 10
-
-embedding_name = 'emb'
-```
-
-这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维，关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。
-
-- 如上文提到，我们用基于英文维基百科训练好的词向量来初始化序列输入、谓词上下文总共6个特征的embedding层参数，在训练中不更新。
-
-```python
-# 这里加载PaddlePaddle上版保存的二进制模型
-def load_parameter(file_name, h, w):
-with open(file_name, 'rb') as f:
-f.read(16)  # skip header.
-return np.fromfile(f, dtype=np.float32).reshape(h, w)
-```
-
-- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。
-
-```python
-def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
-**ignored):
-# 8 features
-predicate_embedding = fluid.layers.embedding(
-input=predicate,
-size=[pred_dict_len, word_dim],
-dtype='float32',
-is_sparse=IS_SPARSE,
-param_attr='vemb')
-
-mark_embedding = fluid.layers.embedding(
-input=mark,
-size=[mark_dict_len, mark_dim],
-dtype='float32',
-is_sparse=IS_SPARSE)
-
-word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-# Since word vector lookup table is pre-trained, we won't update it this time.
-# trainable being False prevents updating the lookup table during training.
-emb_layers = [
-fluid.layers.embedding(
-size=[word_dict_len, word_dim],
-input=x,
-param_attr=fluid.ParamAttr(
-name=embedding_name, trainable=False)) for x in word_input
-]
-emb_layers.append(predicate_embedding)
-emb_layers.append(mark_embedding)
-
-# 8 LSTM units are trained through alternating left-to-right / right-to-left order
-# denoted by the variable `reverse`.
-hidden_0_layers = [
-fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
-for emb in emb_layers
-]
-
-hidden_0 = fluid.layers.sums(input=hidden_0_layers)
-
-lstm_0 = fluid.layers.dynamic_lstm(
-input=hidden_0,
-size=hidden_dim,
-candidate_activation='relu',
-gate_activation='sigmoid',
-cell_activation='sigmoid')
-
-# stack L-LSTM and R-LSTM with direct edges
-input_tmp = [hidden_0, lstm_0]
-
-# In PaddlePaddle, state features and transition features of a CRF are implemented
-# by a fully connected layer and a CRF layer seperately. The fully connected layer
-# with linear activation learns the state features, here we use fluid.layers.sums
-# (fluid.layers.fc can be uesed as well), and the CRF layer in PaddlePaddle:
-# fluid.layers.linear_chain_crf only
-# learns the transition features, which is a cost layer and is the last layer of the network.
-# fluid.layers.linear_chain_crf outputs the log probability of true tag sequence
-# as the cost by given the input sequence and it requires the true tag sequence
-# as target in the learning process.
-
-for i in range(1, depth):
-mix_hidden = fluid.layers.sums(input=[
-fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
-fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
-])
-
-lstm = fluid.layers.dynamic_lstm(
-input=mix_hidden,
-size=hidden_dim,
-candidate_activation='relu',
-gate_activation='sigmoid',
-cell_activation='sigmoid',
-is_reverse=((i % 2) == 1))
-
-input_tmp = [mix_hidden, lstm]
-
-# 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射，
-# 经过一个全连接层映射到标记字典的维度，来学习 CRF 的状态特征
-feature_out = fluid.layers.sums(input=[
-fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
-fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
-])
-
-return feature_out
-```
-
-## 训练模型
-
-- 我们根据网络拓扑结构和模型参数来构造出trainer用来训练，在构造时还需指定优化方法，这里使用最基本的SGD方法(momentum设置为0)，同时设定了学习率、正则等。
-
-- 数据介绍部分提到CoNLL 2005训练集付费，这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本，包含9个特征，shuffle和组完batch后作为训练的输入。
-
-- 通过feeding来指定每一个数据和data_layer的对应关系。 例如 下面feeding表示: conll05.test()产生数据的第0列对应word_data层的特征。
-
-- 可以使用event_handler回调函数来观察训练过程，或进行测试等。这里我们打印了训练过程的cost，该回调函数是trainer.train函数里设定。
-
-- 通过trainer.train函数训练
-
-```python
-def train(use_cuda, save_dirname=None, is_local=True):
-# define network topology
-
-# 句子序列
-word = fluid.layers.data(
-name='word_data', shape=[1], dtype='int64', lod_level=1)
-
-# 谓词
-predicate = fluid.layers.data(
-name='verb_data', shape=[1], dtype='int64', lod_level=1)
-
-# 谓词上下文5个特征
-ctx_n2 = fluid.layers.data(
-name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
-ctx_n1 = fluid.layers.data(
-name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
-ctx_0 = fluid.layers.data(
-name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
-ctx_p1 = fluid.layers.data(
-name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
-ctx_p2 = fluid.layers.data(
-name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
-
-# 谓词上下区域标志
-mark = fluid.layers.data(
-name='mark_data', shape=[1], dtype='int64', lod_level=1)
-
-# define network topology
-feature_out = db_lstm(**locals())
-
-# 标注序列
-target = fluid.layers.data(
-name='target', shape=[1], dtype='int64', lod_level=1)
-
-# 学习 CRF 的转移特征
-crf_cost = fluid.layers.linear_chain_crf(
-input=feature_out,
-label=target,
-param_attr=fluid.ParamAttr(
-name='crfw', learning_rate=mix_hidden_lr))
-
-avg_cost = fluid.layers.mean(crf_cost)
-
-sgd_optimizer = fluid.optimizer.SGD(
-learning_rate=fluid.layers.exponential_decay(
-learning_rate=0.01,
-decay_steps=100000,
-decay_rate=0.5,
-staircase=True))
-
-sgd_optimizer.minimize(avg_cost)
-
-# The CRF decoding layer is used for evaluation and inference.
-# It shares weights with CRF layer.  The sharing of parameters among multiple layers
-# is specified by using the same parameter name in these layers. If true tag sequence
-# is provided in training process, `fluid.layers.crf_decoding` calculates labelling error
-# for each input token and sums the error over the entire sequence.
-# Otherwise, `fluid.layers.crf_decoding`  generates the labelling tags.
-crf_decode = fluid.layers.crf_decoding(
-input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
-
-train_data = paddle.batch(
-paddle.reader.shuffle(
-paddle.dataset.conll05.test(), buf_size=8192),
-batch_size=BATCH_SIZE)
-
-place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-
-feeder = fluid.DataFeeder(
-feed_list=[
-word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
-],
-place=place)
-exe = fluid.Executor(place)
-
-def train_loop(main_program):
-exe.run(fluid.default_startup_program())
-embedding_param = fluid.global_scope().find_var(
-embedding_name).get_tensor()
-embedding_param.set(
-load_parameter(conll05.get_embedding(), word_dict_len, word_dim),
-place)
-
-start_time = time.time()
-batch_id = 0
-for pass_id in xrange(PASS_NUM):
-for data in train_data():
-cost = exe.run(main_program,
-feed=feeder.feed(data),
-fetch_list=[avg_cost])
-cost = cost[0]
-
-if batch_id % 10 == 0:
-print("avg_cost:" + str(cost))
-if batch_id != 0:
-print("second per batch: " + str((time.time(
-) - start_time) / batch_id))
-# Set the threshold low to speed up the CI test
-if float(cost) < 60.0:
-if save_dirname is not None:
-fluid.io.save_inference_model(save_dirname, [
-'word_data', 'verb_data', 'ctx_n2_data',
-'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
-'ctx_p2_data', 'mark_data'
-], [feature_out], exe)
-return
-
-batch_id = batch_id + 1
-
-train_loop(fluid.default_main_program())
-```
-
-
-## 应用模型
-
-训练完成之后，需要依据某个我们关心的性能指标选择最优的模型进行预测，可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例。
-
-```python
-def infer(use_cuda, save_dirname=None):
-if save_dirname is None:
-return
-
-place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-exe = fluid.Executor(place)
-
-inference_scope = fluid.core.Scope()
-with fluid.scope_guard(inference_scope):
-# Use fluid.io.load_inference_model to obtain the inference program desc,
-# the feed_target_names (the names of variables that will be fed
-# data using feed operators), and the fetch_targets (variables that
-# we want to obtain data from using fetch operators).
-[inference_program, feed_target_names,
-fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-# Setup inputs by creating LoDTensors to represent sequences of words.
-# Here each word is the basic element of these LoDTensors and the shape of
-# each word (base_shape) should be [1] since it is simply an index to
-# look up for the corresponding word vector.
-# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
-# which has only one lod level. Then the created LoDTensors will have only
-# one higher level structure (sequence of words, or sentence) than the basic
-# element (word). Hence the LoDTensor will hold data for three sentences of
-# length 3, 4 and 2, respectively.
-# Note that lod info should be a list of lists.
-lod = [[3, 4, 2]]
-base_shape = [1]
-# The range of random integers is [low, high]
-word = fluid.create_random_int_lodtensor(
-lod, base_shape, place, low=0, high=word_dict_len - 1)
-pred = fluid.create_random_int_lodtensor(
-lod, base_shape, place, low=0, high=pred_dict_len - 1)
-ctx_n2 = fluid.create_random_int_lodtensor(
-lod, base_shape, place, low=0, high=word_dict_len - 1)
-ctx_n1 = fluid.create_random_int_lodtensor(
-lod, base_shape, place, low=0, high=word_dict_len - 1)
-ctx_0 = fluid.create_random_int_lodtensor(
-lod, base_shape, place, low=0, high=word_dict_len - 1)
-ctx_p1 = fluid.create_random_int_lodtensor(
-lod, base_shape, place, low=0, high=word_dict_len - 1)
-ctx_p2 = fluid.create_random_int_lodtensor(
-lod, base_shape, place, low=0, high=word_dict_len - 1)
-mark = fluid.create_random_int_lodtensor(
-lod, base_shape, place, low=0, high=mark_dict_len - 1)
-
-# Construct feed as a dictionary of {feed_target_name: feed_target_data}
-# and results will contain a list of data corresponding to fetch_targets.
-assert feed_target_names[0] == 'word_data'
-assert feed_target_names[1] == 'verb_data'
-assert feed_target_names[2] == 'ctx_n2_data'
-assert feed_target_names[3] == 'ctx_n1_data'
-assert feed_target_names[4] == 'ctx_0_data'
-assert feed_target_names[5] == 'ctx_p1_data'
-assert feed_target_names[6] == 'ctx_p2_data'
-assert feed_target_names[7] == 'mark_data'
-
-results = exe.run(inference_program,
-feed={
-feed_target_names[0]: word,
-feed_target_names[1]: pred,
-feed_target_names[2]: ctx_n2,
-feed_target_names[3]: ctx_n1,
-feed_target_names[4]: ctx_0,
-feed_target_names[5]: ctx_p1,
-feed_target_names[6]: ctx_p2,
-feed_target_names[7]: mark
-},
-fetch_list=fetch_targets,
-return_numpy=False)
-print(results[0].lod())
-np_data = np.array(results[0])
-print("Inference Shape: ", np_data.shape)
-```
-
-整个程序的入口如下：
-
-```python
-def main(use_cuda, is_local=True):
-if use_cuda and not fluid.core.is_compiled_with_cuda():
-return
-
-# Directory for saving the trained model
-save_dirname = "label_semantic_roles.inference.model"
-
-train(use_cuda, save_dirname, is_local)
-infer(use_cuda, save_dirname)
-
-
-main(use_cuda=False)
-```
-
-## 总结
-
-语义角色标注是许多自然语言理解任务的重要中间步骤。这篇教程中我们以语义角色标注任务为例，介绍如何利用PaddlePaddle进行序列标注任务。教程中所介绍的模型来自我们发表的论文\[[10](#参考文献)\]。由于 CoNLL 2005 SRL任务的训练数据目前并非完全开放，教程中只使用测试数据作为示例。在这个过程中，我们希望减少对其它自然语言处理工具的依赖，利用神经网络数据驱动、端到端学习的能力，得到一个和传统方法可比、甚至更好的模型。在论文中我们证实了这种可能性。关于模型更多的信息和讨论可以在论文中找到。
-
-## 参考文献
-1. Sun W, Sui Z, Wang M, et al. [Chinese semantic role labeling with shallow parsing](http://www.aclweb.org/anthology/D09-1#page=1513)[C]//Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 3-Volume 3. Association for Computational Linguistics, 2009: 1475-1483.
-2. Pascanu R, Gulcehre C, Cho K, et al. [How to construct deep recurrent neural networks](https://arxiv.org/abs/1312.6026)[J]. arXiv preprint arXiv:1312.6026, 2013.
-3. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](https://arxiv.org/abs/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014.
-4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[J]. arXiv preprint arXiv:1409.0473, 2014.
-5. Lafferty J, McCallum A, Pereira F. [Conditional random fields: Probabilistic models for segmenting and labeling sequence data](http://www.jmlr.org/papers/volume15/doppa14a/source/biblio.bib.old)[C]//Proceedings of the eighteenth international conference on machine learning, ICML. 2001, 1: 282-289.
-6. 李航. 统计学习方法[J]. 清华大学出版社, 北京, 2012.
-7. Marcus M P, Marcinkiewicz M A, Santorini B. [Building a large annotated corpus of English: The Penn Treebank](http://repository.upenn.edu/cgi/viewcontent.cgi?article=1246&context=cis_reports)[J]. Computational linguistics, 1993, 19(2): 313-330.
-8. Palmer M, Gildea D, Kingsbury P. [The proposition bank: An annotated corpus of semantic roles](http://www.mitpressjournals.org/doi/pdfplus/10.1162/0891201053630264)[J]. Computational linguistics, 2005, 31(1): 71-106.
-9. Carreras X, Màrquez L. [Introduction to the CoNLL-2005 shared task: Semantic role labeling](http://www.cs.upc.edu/~srlconll/st05/papers/intro.pdf)[C]//Proceedings of the Ninth Conference on Computational Natural Language Learning. Association for Computational Linguistics, 2005: 152-164.
-10. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
-
-<br/>
-<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/source/beginners_guide/basics/machine_translation/index.md b/source/beginners_guide/basics/machine_translation/index.md
deleted file mode 100644
index 06dc48bdb6860da582587b9e7b6f5ab580173ef3..0000000000000000000000000000000000000000
--- a/source/beginners_guide/basics/machine_translation/index.md
+++ /dev/null
@@ -1,448 +0,0 @@
-# 机器翻译
-
-本教程源代码目录在[book/machine_translation](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)，更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/179.html)。
-
-## 背景介绍
-
-机器翻译（machine translation, MT）是用计算机来实现不同语言之间翻译的技术。被翻译的语言通常称为源语言（source language），翻译成的结果语言称为目标语言（target language）。机器翻译即实现从源语言到目标语言转换的过程，是自然语言处理的重要研究领域之一。
-
-早期机器翻译系统多为基于规则的翻译系统，需要由语言学家编写两种语言之间的转换规则，再将这些规则录入计算机。该方法对语言学家的要求非常高，而且我们几乎无法总结一门语言会用到的所有规则，更何况两种甚至更多的语言。因此，传统机器翻译方法面临的主要挑战是无法得到一个完备的规则集合\[[1](#参考文献)\]。
-
-为解决以上问题，统计机器翻译（Statistical Machine Translation, SMT）技术应运而生。在统计机器翻译技术中，转化规则是由机器自动从大规模的语料中学习得到的，而非我们人主动提供规则。因此，它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题，但仍然存在许多挑战：1）人为设计许多特征（feature），但永远无法覆盖所有的语言现象；2）难以利用全局的特征；3）依赖于许多预处理环节，如词语对齐、分词或符号化（tokenization）、规则抽取、句法分析等，而每个环节的错误会逐步累积，对翻译的影响也越来越大。
-
-近年来，深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类：1）仍以统计机器翻译系统为框架，只是利用神经网络来改进其中的关键模块，如语言模型、调序模型等（见图1的左半部分）；2）不再以统计机器翻译系统为框架，而是直接用神经网络将源语言映射到目标语言，即端到端的神经网络机器翻译（End-to-End Neural Machine Translation, End-to-End NMT）（见图1的右半部分），简称为NMT模型。
-![nmt](./image/nmt.png)
-<p align="center">
-图1. 基于神经网络的机器翻译系统
-</p>
-
-本教程主要介绍NMT模型，以及如何用PaddlePaddle来训练一个NMT模型。
-
-## 效果展示
-
-以中英翻译（中文翻译到英文）的模型为例，当模型训练完毕时，如果输入如下已分词的中文句子：
-```text
-这些 是 希望 的 曙光 和 解脱 的 迹象 .
-```
-如果设定显示翻译结果的条数（即[柱搜索算法](#柱搜索算法)的宽度）为3，生成的英语句子如下：
-```text
-0 -5.36816   These are signs of hope and relief . <e>
-1 -6.23177   These are the light of hope and relief . <e>
-2 -7.7914  These are the light of hope and the relief of hope . <e>
-```
-- 左起第一列是生成句子的序号；左起第二列是该条句子的得分（从大到小），分值越高越好；左起第三列是生成的英语句子。
-- 另外有两个特殊标志：`<e>`表示句子的结尾，`<unk>`表示未登录词（unknown word），即未在训练字典中出现的词。
-
-## 模型概览
-
-本节依次介绍双向循环神经网络（Bi-directional Recurrent Neural Network），NMT模型中典型的编码器-解码器（Encoder-Decoder）框架以及柱搜索（beam search）算法。
-
-### 双向循环神经网络
-
-我们已经在[语义角色标注](https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/README.cn.md)一章中介绍了一种双向循环神经网络，这里介绍Bengio团队在论文\[[2](#参考文献),[4](#参考文献)\]中提出的另一种结构。该结构的目的是输入一个序列，得到其在每个时刻的特征表示，即输出的每个时刻都用定长向量表示到该时刻的上下文语义信息。
-
-具体来说，该双向循环神经网络分别在时间维以顺序和逆序——即前向（forward）和后向（backward）——依次处理输入序列，并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点，都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN，其中有六个权重矩阵：输入到前向隐层和后向隐层的权重矩阵（`$W_1, W_3$`），隐层到隐层自己的权重矩阵（`$W_2,W_5$`），前向隐层和后向隐层到输出层的权重矩阵（`$W_4, W_6$`）。注意，该网络的前向隐层和后向隐层之间没有连接。
-
-![bi_rnn](./image/bi_rnn.png)
-<p align="center">
-图3. 按时间步展开的双向循环神经网络
-</p>
-
-### 编码器-解码器框架
-
-编码器-解码器（Encoder-Decoder）\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量，解码阶段通过最大化预测序列概率，从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。
-![encoder_decoder](./image/encoder_decoder.png)
-<p align="center">
-图4. 编码器-解码器框架
-</p>
-
-#### 编码器
-
-编码阶段分为三步：
-
-1. one-hot vector表示：将源语言句子`$x=\left \{ x_1,x_2,...,x_T \right \}$`的每个词`$x_i$`表示成一个列向量`$w_i\in \left \{ 0,1 \right \}^{\left | V \right |},i=1,2,...,T$`。这个向量`$w_i$`的维度与词汇表大小`$\left | V \right |$` 相同，并且只有一个维度上有值1（该位置对应该词在词汇表中的位置），其余全是0。
-
-2. 映射到低维语义空间的词向量：one-hot vector表示存在两个问题，1）生成的向量维度往往很大，容易造成维数灾难；2）难以刻画词与词之间的关系（如语义相似性，也就是无法很好地表达语义）。因此，需再将one-hot vector映射到低维的语义空间，由一个固定维度的稠密向量（称为词向量）表示。记映射矩阵为`$C\in R^{K\times \left | V \right |}$`，用`$s_i=Cw_i$`表示第`$i$`个词的词向量，`$K$`为向量维度。
-
-3. 用RNN编码源语言词序列：这一过程的计算公式为`$h_i=\phi _\theta \left ( h_{i-1}, s_i \right )$`，其中`$h_0$`是一个全零的向量，`$\phi _\theta$`是一个非线性激活函数，最后得到的`$\mathbf{h}=\left \{ h_1,..., h_T \right \}$`就是RNN依次读入源语言`$T$`个词的状态编码序列。整句话的向量表示可以采用`$\mathbf{h}$`在最后一个时间步`$T$`的状态编码，或使用时间维上的池化（pooling）结果。
-
-第3步也可以使用双向循环神经网络实现更复杂的句编码表示，具体可以用双向GRU实现。前向GRU按照词序列`$(x_1,x_2,...,x_T)$`的顺序依次编码源语言端词，并得到一系列隐层状态`$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$`。类似的，后向GRU按照`$(x_T,x_{T-1},...,x_1)$`的顺序依次编码源语言端词，得到`$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$`。最后对于词`$x_i$`，通过拼接两个GRU的结果得到它的隐层状态，即`$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$`。
-
-![encoder_attention](./image/encoder_attention.png)
-<p align="center">
-图5. 使用双向GRU的编码器
-</p>
-
-#### 解码器
-
-机器翻译任务的训练过程中，解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是：
-
-1. 每一个时刻，根据源语言句子的编码信息（又叫上下文向量，context vector）`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`，计算出下一个隐层状态`$z_{i+1}$`。计算公式如下：
-
-$$z_{i+1}=\phi _{\theta '}\left ( c,u_i,z_i \right )$$
-
-其中`$\phi _{\theta '}$`是一个非线性激活函数；`$c=q\mathbf{h}$`是源语言句子的上下文向量，在不使用[注意力机制](#注意力机制)时，如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素，则可以定义`$c=h_T$`；`$u_i$`是目标语言序列的第`$i$`个单词，`$u_0$`是目标语言序列的开始标记`<s>`，表示解码开始；`$z_i$`是`$i$`时刻解码RNN的隐层状态，`$z_0$`是一个全零的向量。
-
-2. 将`$z_{i+1}$`通过`softmax`归一化，得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下：
-
-$$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
-
-其中`$W_sz_{i+1}+b_z$`是对每个可能的输出单词进行打分，再用softmax归一化就可以得到第`$i+1$`个词的概率`$p_{i+1}$`。
-
-3. 根据`$p_{i+1}$`和`$u_{i+1}$`计算代价。
-4. 重复步骤1~3，直到目标语言序列中的所有词处理完毕。
-
-机器翻译任务的生成过程，通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异，具体介绍请见[柱搜索算法](#柱搜索算法)。
-
-### 柱搜索算法
-
-柱搜索（[beam search](http://en.wikipedia.org/wiki/Beam_search)）是一种启发式图搜索算法，用于在图或树中搜索有限集合中的最优扩展节点，通常用在解空间非常大的系统（如机器翻译、语音识别）中，原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`<s>你好<e>`”，就算目标语言字典中只有3个词（`<s>`, `<e>`, `hello`），也可能生成无限句话（`hello`循环出现的次数不定），为了找到其中较好的翻译结果，我们可采用柱搜索算法。
-
-柱搜索算法使用广度优先策略建立搜索树，在树的每一层，按照启发代价（heuristic cost）（本教程中，为生成词的log概率之和）对节点进行排序，然后仅留下预先确定的个数（文献中通常称为beam width、beam size、柱宽度等）的节点。只有这些节点会在下一层继续扩展，其他节点就被剪掉了，也就是说保留了质量较高的节点，剪枝了质量较差的节点。因此，搜索所占用的空间和时间大幅减少，但缺点是无法保证一定获得最优解。
-
-使用柱搜索算法的解码阶段，目标是最大化生成序列的概率。思路是：
-
-1. 每一个时刻，根据源语言句子的编码信息`$c$`、生成的第`$i$`个目标语言序列单词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`，计算出下一个隐层状态`$z_{i+1}$`。
-2. 将`$z_{i+1}$`通过`softmax`归一化，得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。
-3. 根据`$p_{i+1}$`采样出单词`$u_{i+1}$`。
-4. 重复步骤1~3，直到获得句子结束标记`<e>`或超过句子的最大生成长度为止。
-
-注意：`$z_{i+1}$`和`$p_{i+1}$`的计算公式同[解码器](#解码器)中的一样。且由于生成时的每一步都是通过贪心法实现的，因此并不能保证得到全局最优解。
-
-## 数据介绍
-
-本教程使用[WMT-14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/)数据集中的[bitexts(after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)作为训练集，[dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)作为测试集和生成集。
-
-### 数据预处理
-
-我们的预处理流程包括两步：
-- 将每个源语言到目标语言的平行语料库文件合并为一个文件：
-  - 合并每个`XXX.src`和`XXX.trg`文件为`XXX`。
-  - `XXX`中的第`$i$`行内容为`XXX.src`中的第`$i$`行和`XXX.trg`中的第`$i$`行连接，用'\t'分隔。
-- 创建训练数据的“源字典”和“目标字典”。每个字典都有**DICTSIZE**个单词，包括：语料中词频最高的（DICTSIZE - 3）个单词，和3个特殊符号`<s>`（序列的开始）、`<e>`（序列的结束）和`<unk>`（未登录词）。
-
-### 示例数据
-
-因为完整的数据集数据量较大，为了验证训练流程，PaddlePaddle接口paddle.dataset.wmt14中默认提供了一个经过预处理的[较小规模的数据集](http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz)。
-
-该数据集有193319条训练数据，6003条测试数据，词典长度为30000。因为数据规模限制，使用该数据集训练出来的模型效果无法保证。
-
-## 模型配置说明
-
-下面我们开始根据输入数据的形式配置模型。首先引入所需的库函数以及定义全局变量。
-
-```python
-import contextlib
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.framework as framework
-import paddle.fluid.layers as pd
-from paddle.fluid.executor import Executor
-from functools import partial
-import os
-
-dict_size = 30000
-source_dict_dim = target_dict_dim = dict_size
-hidden_dim = 32
-word_dim = 16
-batch_size = 2
-max_length = 8
-topk_size = 50
-beam_size = 2
-
-decoder_size = hidden_dim
-```
-
-然后如下实现编码器框架：
-
-```python
-def encoder(is_sparse):
-src_word_id = pd.data(
-name="src_word_id", shape=[1], dtype='int64', lod_level=1)
-src_embedding = pd.embedding(
-input=src_word_id,
-size=[dict_size, word_dim],
-dtype='float32',
-is_sparse=is_sparse,
-param_attr=fluid.ParamAttr(name='vemb'))
-
-fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
-lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
-encoder_out = pd.sequence_last_step(input=lstm_hidden0)
-return encoder_out
-```
-
-再实现训练模式下的解码器：
-
-```python
-def train_decoder(context, is_sparse):
-trg_language_word = pd.data(
-name="target_language_word", shape=[1], dtype='int64', lod_level=1)
-trg_embedding = pd.embedding(
-input=trg_language_word,
-size=[dict_size, word_dim],
-dtype='float32',
-is_sparse=is_sparse,
-param_attr=fluid.ParamAttr(name='vemb'))
-
-rnn = pd.DynamicRNN()
-with rnn.block():
-current_word = rnn.step_input(trg_embedding)
-pre_state = rnn.memory(init=context)
-current_state = pd.fc(input=[current_word, pre_state],
-size=decoder_size,
-act='tanh')
-
-current_score = pd.fc(input=current_state,
-size=target_dict_dim,
-act='softmax')
-rnn.update_memory(pre_state, current_state)
-rnn.output(current_score)
-
-return rnn()
-```
-
-实现推测模式下的解码器：
-
-```python
-def decode(context, is_sparse):
-init_state = context
-array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
-counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
-
-# fill the first element with init_state
-state_array = pd.create_array('float32')
-pd.array_write(init_state, array=state_array, i=counter)
-
-# ids, scores as memory
-ids_array = pd.create_array('int64')
-scores_array = pd.create_array('float32')
-
-init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
-init_scores = pd.data(
-name="init_scores", shape=[1], dtype="float32", lod_level=2)
-
-pd.array_write(init_ids, array=ids_array, i=counter)
-pd.array_write(init_scores, array=scores_array, i=counter)
-
-cond = pd.less_than(x=counter, y=array_len)
-
-while_op = pd.While(cond=cond)
-with while_op.block():
-pre_ids = pd.array_read(array=ids_array, i=counter)
-pre_state = pd.array_read(array=state_array, i=counter)
-pre_score = pd.array_read(array=scores_array, i=counter)
-
-# expand the lod of pre_state to be the same with pre_score
-pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
-
-pre_ids_emb = pd.embedding(
-input=pre_ids,
-size=[dict_size, word_dim],
-dtype='float32',
-is_sparse=is_sparse)
-
-# use rnn unit to update rnn
-current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
-size=decoder_size,
-act='tanh')
-current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
-# use score to do beam search
-current_score = pd.fc(input=current_state_with_lod,
-size=target_dict_dim,
-act='softmax')
-topk_scores, topk_indices = pd.topk(current_score, k=topk_size)
-selected_ids, selected_scores = pd.beam_search(
-pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
-
-pd.increment(x=counter, value=1, in_place=True)
-
-# update the memories
-pd.array_write(current_state, array=state_array, i=counter)
-pd.array_write(selected_ids, array=ids_array, i=counter)
-pd.array_write(selected_scores, array=scores_array, i=counter)
-
-pd.less_than(x=counter, y=array_len, cond=cond)
-
-translation_ids, translation_scores = pd.beam_search_decode(
-ids=ids_array, scores=scores_array)
-
-return translation_ids, translation_scores
-```
-
-进而，我们定义一个`train_program`来使用`encoder`和`train_decoder`计算出的结果，在标记数据的帮助下来计算误差。我们还定义了一个`optimizer_func`来定义优化器。
-
-```python
-def train_program(is_sparse):
-context = encoder(is_sparse)
-rnn_out = train_decoder(context, is_sparse)
-label = pd.data(
-name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
-cost = pd.cross_entropy(input=rnn_out, label=label)
-avg_cost = pd.mean(cost)
-return avg_cost
-
-
-def optimizer_func():
-return fluid.optimizer.Adagrad(
-learning_rate=1e-4,
-regularization=fluid.regularizer.L2DecayRegularizer(
-regularization_coeff=0.1))
-```
-
-## 训练模型
-
-### 定义训练环境
-定义您的训练环境，可以指定训练是发生在CPU还是GPU上。
-
-```python
-use_cuda = False
-place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-```
-
-### 定义数据提供器
-下一步是为训练和测试定义数据提供器。提供器读入一个大小为 `batch_size`的数据。`paddle.dataset.wmt14.train` 每次会在乱序化后提供一个大小为`batch_size`的数据，乱序化的大小为缓存大小`buf_size`。
-
-```python
-train_reader = paddle.batch(
-paddle.reader.shuffle(
-paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-batch_size=batch_size)
-```
-
-### 构造训练器(trainer)
-训练器需要一个训练程序和一个训练优化函数。
-
-```python
-is_sparse = False
-trainer = fluid.Trainer(
-train_func=partial(train_program, is_sparse),
-place=place,
-optimizer_func=optimizer_func)
-```
-
-### 提供数据
-
-`feed_order`用来定义每条产生的数据和`paddle.fluid.layers.data`之间的映射关系。比如，`wmt14.train`产生的第一列的数据对应的是`src_word_id`这个特征。
-
-```python
-feed_order = [
-'src_word_id', 'target_language_word', 'target_language_next_word'
-]
-```
-
-### 事件处理器
-回调函数`event_handler`在一个之前定义好的事件发生后会被调用。例如，我们可以在每步训练结束后查看误差。
-
-```python
-def event_handler(event):
-if isinstance(event, fluid.EndStepEvent):
-if event.step % 10 == 0:
-print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step))
-
-if event.step == 20:
-trainer.stop()
-```
-
-### 开始训练
-最后，我们传入训练循环数（`num_epochs`）和一些别的参数，调用 `trainer.train` 来开始训练。
-
-```python
-EPOCH_NUM = 1
-
-trainer.train(
-reader=train_reader,
-num_epochs=EPOCH_NUM,
-event_handler=event_handler,
-feed_order=feed_order)
-```
-
-## 应用模型
-
-### 定义解码部分
-
-使用上面定义的 `encoder` 和 `decode` 函数来推测翻译后的对应id和分数。
-
-```python
-context = encoder(is_sparse)
-translation_ids, translation_scores = decode(context, is_sparse)
-```
-
-### 定义数据
-
-我们先初始化id和分数来生成tensors来作为输入数据。在这个预测例子中，我们用`wmt14.test`数据中的第一个记录来做推测，最后我们用"源字典"和"目标字典"来打印对应的句子结果。
-
-```python
-init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
-init_scores_data = np.array(
-[1. for _ in range(batch_size)], dtype='float32')
-init_ids_data = init_ids_data.reshape((batch_size, 1))
-init_scores_data = init_scores_data.reshape((batch_size, 1))
-init_lod = [1] * batch_size
-init_lod = [init_lod, init_lod]
-
-init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
-init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
-
-test_data = paddle.batch(
-paddle.reader.shuffle(
-paddle.dataset.wmt14.test(dict_size), buf_size=1000),
-batch_size=batch_size)
-
-feed_order = ['src_word_id']
-feed_list = [
-framework.default_main_program().global_block().var(var_name)
-for var_name in feed_order
-]
-feeder = fluid.DataFeeder(feed_list, place)
-
-src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-```
-
-### 测试
-现在我们可以进行预测了。我们要在`feed_order`提供对应参数，放在`executor`上运行以取得id和分数结果
-
-```python
-exe = Executor(place)
-exe.run(framework.default_startup_program())
-
-for data in test_data():
-feed_data = map(lambda x: [x[0]], data)
-feed_dict = feeder.feed(feed_data)
-feed_dict['init_ids'] = init_ids
-feed_dict['init_scores'] = init_scores
-
-results = exe.run(
-framework.default_main_program(),
-feed=feed_dict,
-fetch_list=[translation_ids, translation_scores],
-return_numpy=False)
-
-result_ids = np.array(results[0])
-result_scores = np.array(results[1])
-
-print("Original sentence:")
-print(" ".join([src_dict[w] for w in feed_data[0][0]]))
-print("Translated sentence:")
-print(" ".join([trg_dict[w] for w in result_ids]))
-print("Corresponding score: ", result_scores)
-
-break
-```
-
-## 总结
-
-端到端的神经网络机器翻译是近几年兴起的一种全新的机器翻译方法。本章中，我们介绍了NMT中典型的“编码器-解码器”框架。由于NMT是一个典型的Seq2Seq（Sequence to Sequence，序列到序列）学习问题，因此，Seq2Seq中的query改写（query rewriting）、摘要、单轮对话等问题都可以用本教程的模型来解决。
-
-## 参考文献
-
-1. Koehn P. [Statistical machine translation](https://books.google.com.hk/books?id=4v_Cx1wIMLkC&printsec=frontcover&hl=zh-CN&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false)[M]. Cambridge University Press, 2009.
-2. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://www.aclweb.org/anthology/D/D14/D14-1179.pdf)[C]//Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2014: 1724-1734.
-3. Chung J, Gulcehre C, Cho K H, et al. [Empirical evaluation of gated recurrent neural networks on sequence modeling](https://arxiv.org/abs/1412.3555)[J]. arXiv preprint arXiv:1412.3555, 2014.
-4.  Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[C]//Proceedings of ICLR 2015, 2015.
-5. Papineni K, Roukos S, Ward T, et al. [BLEU: a method for automatic evaluation of machine translation](http://dl.acm.org/citation.cfm?id=1073135)[C]//Proceedings of the 40th annual meeting on association for computational linguistics. Association for Computational Linguistics, 2002: 311-318.
-
-<br/>
-<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/source/beginners_guide/basics/recommender_system/index.md b/source/beginners_guide/basics/recommender_system/index.md
deleted file mode 100644
index b0845ca816ae650799015831b2c7c5888a5843c7..0000000000000000000000000000000000000000
--- a/source/beginners_guide/basics/recommender_system/index.md
+++ /dev/null
@@ -1,528 +0,0 @@
-# 个性化推荐
-
-本教程源代码目录在[book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)，更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/176.html)。
-
-## 背景介绍
-
-在网络技术不断发展和电子商务规模不断扩大的背景下，商品数量和种类快速增长，用户需要花费大量时间才能找到自己想买的商品，这就是信息超载问题。为了解决这个难题，推荐系统（Recommender System）应运而生。
-
-个性化推荐系统是信息过滤系统（Information Filtering System）的子集，它可以用在很多领域，如电影、音乐、电商和 Feed 流推荐等。推荐系统通过分析、挖掘用户行为，发现用户的个性化需求与兴趣特点，将用户可能感兴趣的信息或商品推荐给用户。与搜索引擎不同，推荐系统不需要用户准确地描述出自己的需求，而是根据分析历史行为建模，主动提供满足用户兴趣和需求的信息。
-
-传统的推荐系统方法主要有：
-
-- 协同过滤推荐（Collaborative Filtering Recommendation）：该方法收集分析用户历史行为、活动、偏好，计算一个用户与其他用户的相似度，利用目标用户的相似用户对商品评价的加权评价值，来预测目标用户对特定商品的喜好程度。优点是可以给用户推荐未浏览过的新产品；缺点是对于没有任何行为的新用户存在冷启动的问题，同时也存在用户与商品之间的交互数据不够多造成的稀疏问题，会导致模型难以找到相近用户。
-- 基于内容过滤推荐[[1](#参考文献)]（Content-based Filtering Recommendation）：该方法利用商品的内容描述，抽象出有意义的特征，通过计算用户的兴趣和商品描述之间的相似度，来给用户做推荐。优点是简单直接，不需要依据其他用户对商品的评价，而是通过商品属性进行商品相似度度量，从而推荐给用户所感兴趣商品的相似商品；缺点是对于没有任何行为的新用户同样存在冷启动的问题。
-- 组合推荐[[2](#参考文献)]（Hybrid Recommendation）：运用不同的输入和技术共同进行推荐，以弥补各自推荐技术的缺点。
-
-其中协同过滤是应用最广泛的技术之一，它又可以分为多个子类：基于用户 （User-Based）的推荐[[3](#参考文献)] 、基于物品（Item-Based）的推荐[[4](#参考文献)]、基于社交网络关系（Social-Based）的推荐[[5](#参考文献)]、基于模型（Model-based）的推荐等。1994年明尼苏达大学推出的GroupLens系统[[3](#参考文献)]一般被认为是推荐系统成为一个相对独立的研究方向的标志。该系统首次提出了基于协同过滤来完成推荐任务的思想，此后，基于该模型的协同过滤推荐引领了推荐系统十几年的发展方向。
-
-深度学习具有优秀的自动提取特征的能力，能够学习多层次的抽象特征表示，并对异质或跨域的内容信息进行学习，可以一定程度上处理推荐系统冷启动问题[[6](#参考文献)]。本教程主要介绍个性化推荐的深度学习模型，以及如何使用PaddlePaddle实现模型。
-
-## 效果展示
-
-我们使用包含用户信息、电影信息与电影评分的数据集作为个性化推荐的应用场景。当我们训练好模型后，只需要输入对应的用户ID和电影ID，就可以得出一个匹配的分数（范围[0,5]，分数越高视为兴趣越大），然后根据所有电影的推荐得分排序，推荐给用户可能感兴趣的电影。
-
-```
-Input movie_id: 1962
-Input user_id: 1
-Prediction Score is 4.25
-```
-
-## 模型概览
-
-本章中，我们首先介绍YouTube的视频推荐系统[[7](#参考文献)]，然后介绍我们实现的融合推荐模型。
-
-### YouTube的深度神经网络推荐系统
-
-YouTube是世界上最大的视频上传、分享和发现网站，YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成：候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选，排序网络对候选进行打分排序，输出排名最高的数十个结果。系统结构如图1所示：
-
-![YouTube_Overview](./image/YouTube_Overview.png)
-<p align="center">
-图1. YouTube 推荐系统结构
-</p>
-
-#### 候选生成网络（Candidate Generation Network）
-
-候选生成网络将推荐问题建模为一个类别数极大的多类分类问题：对于一个YouTube用户，使用其观看历史（视频ID）、搜索词记录（search tokens）、人口学信息（如地理位置、用户登录设备）、二值特征（如性别，是否登录）和连续特征（如用户年龄）等，对视频库中所有视频进行多分类，得到每一类别的分类结果（即每一个视频的推荐概率），最终输出概率较高的几百个视频。
-
-首先，将观看历史及搜索词记录这类历史信息，映射为向量后取平均值得到定长表示；同时，输入人口学特征以优化新用户的推荐效果，并将二值特征和连续特征归一化处理到[0, 1]范围。接下来，将所有特征表示拼接为一个向量，并输入给非线性多层感知器（MLP，详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程）处理。最后，训练时将MLP的输出给softmax做分类，预测时计算用户的综合特征（MLP的输出）与所有视频的相似度，取得分最高的`$k$`个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。
-
-![Deep_candidate_generation_model_architecture](./image/Deep_candidate_generation_model_architecture.png)
-<p align="center">
-图2. 候选生成网络结构
-</p>
-
-对于一个用户`$U$`，预测此刻用户要观看的视频`$\omega$`为视频`$i$`的概率公式为：
-
-$$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$
-
-其中`$u$`为用户`$U$`的特征表示，`$V$`为视频库集合，`$v_i$`为视频库中第`$i$`个视频的特征表示。`$u$`和`$v_i$`为长度相等的向量，两者点积可以通过全连接层实现。
-
-考虑到softmax分类的类别数非常多，为了保证一定的计算效率：1）训练阶段，使用负样本类别采样将实际计算的类别数缩小至数千；2）推荐（预测）阶段，忽略softmax的归一化计算（不影响结果），将类别打分问题简化为点积（dot product）空间中的最近邻（nearest neighbor）搜索问题，取与`$u$`最近的`$k$`个视频作为生成的候选。
-
-#### 排序网络（Ranking Network）
-排序网络的结构类似于候选生成网络，但是它的目标是对候选进行更细致的打分排序。和传统广告排序中的特征抽取方法类似，这里也构造了大量的用于视频排序的相关特征（如视频 ID、上次观看时间等）。这些特征的处理方式和候选生成网络类似，不同之处是排序网络的顶部是一个加权逻辑回归（weighted logistic regression），它对所有候选视频进行打分，从高到低排序后将分数较高的一些视频返回给用户。
-
-### 融合推荐模型
-本节会使用卷积神经网络（Convolutional Neural Networks）来学习电影名称的表示。下面会依次介绍文本卷积神经网络以及融合推荐模型。
-
-#### 文本卷积神经网络（CNN）
-
-卷积神经网络经常用来处理具有类似网格拓扑结构（grid-like topology）的数据。例如，图像可以视为二维网格的像素点，自然语言可以视为一维的词序列。卷积神经网络可以提取多种局部特征，并对其进行组合抽象得到更高级的特征表示。实验表明，卷积神经网络能高效地对图像及文本问题进行建模处理。
-
-卷积神经网络主要由卷积（convolution）和池化（pooling）操作构成，其应用及组合方式灵活多变，种类繁多。本小节我们以如图3所示的网络进行讲解：
-
-![text_cnn](./image/text_cnn.png)
-<p align="center">
-图3. 卷积神经网络文本分类模型
-</p>
-
-假设待处理句子的长度为`$n$`，其中第`$i$`个词的词向量（word embedding）为`$x_i\in\mathbb{R}^k$`，`$k$`为维度大小。
-
-首先，进行词向量的拼接操作：将每`$h$`个词拼接起来形成一个大小为`$h$`的词窗口，记为`$x_{i:i+h-1}$`，它表示词序列`$x_{i},x_{i+1},\ldots,x_{i+h-1}$`的拼接，其中，`$i$`表示词窗口中第一个词在整个句子中的位置，取值范围从`$1$`到`$n-h+1$`，`$x_{i:i+h-1}\in\mathbb{R}^{hk}$`。
-
-其次，进行卷积操作：把卷积核(kernel)`$w\in\mathbb{R}^{hk}$`应用于包含`$h$`个词的窗口`$x_{i:i+h-1}$`，得到特征`$c_i=f(w\cdot x_{i:i+h-1}+b)$`，其中`$b\in\mathbb{R}$`为偏置项（bias），`$f$`为非线性激活函数，如`$sigmoid$`。将卷积核应用于句子中所有的词窗口`$\{x_{1:h},x_{2:h+1},\ldots,x_{n-h+1:n}\}$`，产生一个特征图（feature map）：
-
-$$c=[c_1,c_2,\ldots,c_{n-h+1}], c \in \mathbb{R}^{n-h+1}$$
-
-接下来，对特征图采用时间维度上的最大池化（max pooling over time）操作得到此卷积核对应的整句话的特征`$\hat c$`，它是特征图中所有元素的最大值：
-
-$$\hat c=max(c)$$
-
-#### 模型概览
-
-在融合推荐模型的电影推荐系统中：
-
-1. 首先，使用用户特征和电影特征作为神经网络的输入，其中：
-
-- 用户特征融合了四个属性信息，分别是用户ID、性别、职业和年龄。
-
-- 电影特征融合了三个属性信息，分别是电影ID、电影类型ID和电影名称。
-
-2. 对用户特征，将用户ID映射为维度大小为256的向量表示，输入全连接层，并对其他三个属性也做类似的处理。然后将四个属性的特征表示分别全连接并相加。
-
-3. 对电影特征，将电影ID以类似用户ID的方式进行处理，电影类型ID以向量的形式直接输入全连接层，电影名称用文本卷积神经网络得到其定长向量表示。然后将三个属性的特征表示分别全连接并相加。
-
-4. 得到用户和电影的向量表示后，计算二者的余弦相似度作为推荐系统的打分。最后，用该相似度打分和用户真实打分的差异的平方作为该回归模型的损失函数。
-
-![rec_regression_network](./image/rec_regression_network.png)
-<p align="center">
-图4. 融合推荐模型
-</p>
-
-## 数据准备
-
-### 数据介绍与下载
-
-我们以 [MovieLens 百万数据集（ml-1m）](http://files.grouplens.org/datasets/movielens/ml-1m.zip)为例进行介绍。ml-1m 数据集包含了 6,000 位用户对 4,000 部电影的 1,000,000 条评价（评分范围 1~5 分，均为整数），由 GroupLens Research 实验室搜集整理。
-
-Paddle在API中提供了自动加载数据的模块。数据模块为 `paddle.dataset.movielens`
-
-
-```python
-import paddle
-movie_info = paddle.dataset.movielens.movie_info()
-print movie_info.values()[0]
-```
-
-
-```python
-# Run this block to show dataset's documentation
-# help(paddle.dataset.movielens)
-```
-
-在原始数据中包含电影的特征数据，用户的特征数据，和用户对电影的评分。
-
-例如，其中某一个电影特征为:
-
-
-```python
-movie_info = paddle.dataset.movielens.movie_info()
-print movie_info.values()[0]
-```
-
-<MovieInfo id(1), title(Toy Story ), categories(['Animation', "Children's", 'Comedy'])>
-
-
-这表示，电影的id是1，标题是《Toy Story》，该电影被分到三个类别中。这三个类别是动画，儿童，喜剧。
-
-
-```python
-user_info = paddle.dataset.movielens.user_info()
-print user_info.values()[0]
-```
-
-<UserInfo id(1), gender(F), age(1), job(10)>
-
-
-这表示，该用户ID是1，女性，年龄比18岁还年轻。职业ID是10。
-
-
-其中，年龄使用下列分布
-*  1:  "Under 18"
-* 18:  "18-24"
-* 25:  "25-34"
-* 35:  "35-44"
-* 45:  "45-49"
-* 50:  "50-55"
-* 56:  "56+"
-
-职业是从下面几种选项里面选择得出:
-*  0:  "other" or not specified
-*  1:  "academic/educator"
-*  2:  "artist"
-*  3:  "clerical/admin"
-*  4:  "college/grad student"
-*  5:  "customer service"
-*  6:  "doctor/health care"
-*  7:  "executive/managerial"
-*  8:  "farmer"
-*  9:  "homemaker"
-* 10:  "K-12 student"
-* 11:  "lawyer"
-* 12:  "programmer"
-* 13:  "retired"
-* 14:  "sales/marketing"
-* 15:  "scientist"
-* 16:  "self-employed"
-* 17:  "technician/engineer"
-* 18:  "tradesman/craftsman"
-* 19:  "unemployed"
-* 20:  "writer"
-
-而对于每一条训练/测试数据，均为 <用户特征> + <电影特征> + 评分。
-
-例如，我们获得第一条训练数据:
-
-
-```python
-train_set_creator = paddle.dataset.movielens.train()
-train_sample = next(train_set_creator())
-uid = train_sample[0]
-mov_id = train_sample[len(user_info[uid].value())]
-print "User %s rates Movie %s with Score %s"%(user_info[uid], movie_info[mov_id], train_sample[-1])
-```
-
-User <UserInfo id(1), gender(F), age(1), job(10)> rates Movie <MovieInfo id(1193), title(One Flew Over the Cuckoo's Nest ), categories(['Drama'])> with Score [5.0]
-
-
-即用户1对电影1193的评价为5分。
-
-## 模型配置说明
-
-下面我们开始根据输入数据的形式配置模型。首先引入所需的库函数以及定义全局变量。
-
-
-```python
-import math
-import sys
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import paddle.fluid.nets as nets
-
-IS_SPARSE = True
-USE_GPU = False
-BATCH_SIZE = 256
-```
-
-然后为我们的用户特征综合模型定义模型配置
-
-```python
-def get_usr_combined_features():
-
-USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
-
-uid = layers.data(name='user_id', shape=[1], dtype='int64')
-
-usr_emb = layers.embedding(
-input=uid,
-dtype='float32',
-size=[USR_DICT_SIZE, 32],
-param_attr='user_table',
-is_sparse=IS_SPARSE)
-
-usr_fc = layers.fc(input=usr_emb, size=32)
-
-USR_GENDER_DICT_SIZE = 2
-
-usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
-
-usr_gender_emb = layers.embedding(
-input=usr_gender_id,
-size=[USR_GENDER_DICT_SIZE, 16],
-param_attr='gender_table',
-is_sparse=IS_SPARSE)
-
-usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
-
-USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
-usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
-
-usr_age_emb = layers.embedding(
-input=usr_age_id,
-size=[USR_AGE_DICT_SIZE, 16],
-is_sparse=IS_SPARSE,
-param_attr='age_table')
-
-usr_age_fc = layers.fc(input=usr_age_emb, size=16)
-
-USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
-usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
-
-usr_job_emb = layers.embedding(
-input=usr_job_id,
-size=[USR_JOB_DICT_SIZE, 16],
-param_attr='job_table',
-is_sparse=IS_SPARSE)
-
-usr_job_fc = layers.fc(input=usr_job_emb, size=16)
-
-concat_embed = layers.concat(
-input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
-
-usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
-
-return usr_combined_features
-```
-
-如上述代码所示，对于每个用户，我们输入4维特征。其中包括user_id,gender_id,age_id,job_id。这几维特征均是简单的整数值。为了后续神经网络处理这些特征方便，我们借鉴NLP中的语言模型，将这几维离散的整数值，变换成embedding取出。分别形成usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb。
-
-然后，我们对于所有的用户特征，均输入到一个全连接层(fc)中。将所有特征融合为一个200维度的特征。
-
-进而，我们对每一个电影特征做类似的变换，网络配置为:
-
-
-```python
-def get_mov_combined_features():
-
-MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
-
-mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
-
-mov_emb = layers.embedding(
-input=mov_id,
-dtype='float32',
-size=[MOV_DICT_SIZE, 32],
-param_attr='movie_table',
-is_sparse=IS_SPARSE)
-
-mov_fc = layers.fc(input=mov_emb, size=32)
-
-CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
-
-category_id = layers.data(
-name='category_id', shape=[1], dtype='int64', lod_level=1)
-
-mov_categories_emb = layers.embedding(
-input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
-
-mov_categories_hidden = layers.sequence_pool(
-input=mov_categories_emb, pool_type="sum")
-
-MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
-
-mov_title_id = layers.data(
-name='movie_title', shape=[1], dtype='int64', lod_level=1)
-
-mov_title_emb = layers.embedding(
-input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
-
-mov_title_conv = nets.sequence_conv_pool(
-input=mov_title_emb,
-num_filters=32,
-filter_size=3,
-act="tanh",
-pool_type="sum")
-
-concat_embed = layers.concat(
-input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
-
-mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
-
-return mov_combined_features
-```
-
-电影标题名称(title)是一个序列的整数，整数代表的是这个词在索引序列中的下标。这个序列会被送入 `sequence_conv_pool` 层，这个层会在时间维度上使用卷积和池化。因为如此，所以输出会是固定长度，尽管输入的序列长度各不相同。
-
-最后，我们定义一个`inference_program`来使用余弦相似度计算用户特征与电影特征的相似性。
-
-```python
-def inference_program():
-usr_combined_features = get_usr_combined_features()
-mov_combined_features = get_mov_combined_features()
-
-inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
-scale_infer = layers.scale(x=inference, scale=5.0)
-
-return scale_infer
-```
-
-进而，我们定义一个`train_program`来使用`inference_program`计算出的结果，在标记数据的帮助下来计算误差。我们还定义了一个`optimizer_func`来定义优化器。
-
-```python
-def train_program():
-
-scale_infer = inference_program()
-
-label = layers.data(name='score', shape=[1], dtype='float32')
-square_cost = layers.square_error_cost(input=scale_infer, label=label)
-avg_cost = layers.mean(square_cost)
-
-return [avg_cost, scale_infer]
-
-
-def optimizer_func():
-return fluid.optimizer.SGD(learning_rate=0.2)
-```
-
-
-## 训练模型
-
-### 定义训练环境
-定义您的训练环境，可以指定训练是发生在CPU还是GPU上。
-
-```python
-use_cuda = False
-place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-```
-
-### 定义数据提供器
-下一步是为训练和测试定义数据提供器。提供器读入一个大小为 `BATCH_SIZE`的数据。`paddle.dataset.movielens.train` 每次会在乱序化后提供一个大小为`BATCH_SIZE`的数据，乱序化的大小为缓存大小`buf_size`。
-
-```python
-train_reader = paddle.batch(
-paddle.reader.shuffle(
-paddle.dataset.movielens.train(), buf_size=8192),
-batch_size=BATCH_SIZE)
-
-test_reader = paddle.batch(
-paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
-```
-
-### 构造训练器(trainer)
-训练器需要一个训练程序和一个训练优化函数。
-
-```python
-trainer = fluid.Trainer(
-train_func=train_program, place=place, optimizer_func=optimizer_func)
-```
-
-### 提供数据
-
-`feed_order`用来定义每条产生的数据和`paddle.fluid.layers.data`之间的映射关系。比如，`movielens.train`产生的第一列的数据对应的是`user_id`这个特征。
-
-```python
-feed_order = [
-'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
-'movie_title', 'score'
-]
-```
-
-### 事件处理器
-回调函数`event_handler`在一个之前定义好的事件发生后会被调用。例如，我们可以在每步训练结束后查看误差。
-
-```python
-# Specify the directory path to save the parameters
-params_dirname = "recommender_system.inference.model"
-
-from paddle.v2.plot import Ploter
-test_title = "Test cost"
-plot_cost = Ploter(test_title)
-
-
-def event_handler(event):
-if isinstance(event, fluid.EndStepEvent):
-avg_cost_set = trainer.test(
-reader=test_reader, feed_order=feed_order)
-
-# get avg cost
-avg_cost = np.array(avg_cost_set).mean()
-
-plot_cost.append(test_title, event.step, avg_cost_set[0])
-plot_cost.plot()
-
-print("avg_cost: %s" % avg_cost)
-print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
-float(avg_cost)))
-
-if event.step == 20: # Adjust this number for accuracy
-trainer.save_params(params_dirname)
-trainer.stop()
-```
-
-### 开始训练
-最后，我们传入训练循环数（`num_epochs`）和一些别的参数，调用 `trainer.train` 来开始训练。
-
-```python
-trainer.train(
-num_epochs=1,
-event_handler=event_handler,
-reader=train_reader,
-feed_order=feed_order)
-```
-
-## 应用模型
-
-### 构建预测器
-传入`inference_program`和`params_dirname`来初始化一个预测器, `params_dirname`用来存放训练过程中的各个参数。
-
-```python
-inferencer = fluid.Inferencer(
-inference_program, param_path=params_dirname, place=place)
-```
-
-### 生成测试用输入数据
-使用 create_lod_tensor(data, lod, place) 的API来生成细节层次的张量。`data`是一个序列，每个元素是一个索引号的序列。`lod`是细节层次的信息，对应于`data`。比如，data = [[10, 2, 3], [2, 3]] 意味着它包含两个序列，长度分别是3和2。于是相应地 lod = [[3, 2]]，它表明其包含一层细节信息，意味着 `data` 有两个序列，长度分别是3和2。
-
-在这个预测例子中，我们试着预测用户ID为1的用户对于电影'Hunchback of Notre Dame'的评分
-
-```python
-infer_movie_id = 783
-infer_movie_name = paddle.dataset.movielens.movie_info()[infer_movie_id].title
-user_id = fluid.create_lod_tensor([[1]], [[1]], place)
-gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
-age_id = fluid.create_lod_tensor([[0]], [[1]], place)
-job_id = fluid.create_lod_tensor([[10]], [[1]], place)
-movie_id = fluid.create_lod_tensor([[783]], [[1]], place) # Hunchback of Notre Dame
-category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place) # Animation, Children's, Musical
-movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]],
-place) # 'hunchback','of','notre','dame','the'
-```
-
-### 测试
-现在我们可以进行预测了。我们要提供的`feed_order`应该和训练过程一致。
-
-
-```python
-results = inferencer.infer(
-{
-'user_id': user_id,
-'gender_id': gender_id,
-'age_id': age_id,
-'job_id': job_id,
-'movie_id': movie_id,
-'category_id': category_id,
-'movie_title': movie_title
-},
-return_numpy=False)
-```
-
-## 总结
-
-本章介绍了传统的推荐系统方法和YouTube的深度神经网络推荐系统，并以电影推荐为例，使用PaddlePaddle训练了一个个性化推荐神经网络模型。推荐系统几乎涵盖了电商系统、社交网络、广告推荐、搜索引擎等领域的方方面面，而在图像处理、自然语言处理等领域已经发挥重要作用的深度学习技术，也将会在推荐系统领域大放异彩。
-
-## 参考文献
-
-1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). *The Adaptive Web*. p. 325.
-2. Robin Burke , [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Springer-Verlag, Berlin, Germany, Lecture Notes in Computer Science, Vol. 4321, May 2007, 978-3-540-72078-2.
-3. P. Resnick, N. Iacovou, etc. “[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)”, Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp.175-186.
-4. Sarwar, Badrul, et al. "[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)" *Proceedings of the 10th international conference on World Wide Web*. ACM, 2001.
-5. Kautz, Henry, Bart Selman, and Mehul Shah. "[Referral Web: combining social networks and collaborative filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)" Communications of the ACM 40.3 (1997): 63-65. APA
-6. Yuan, Jianbo, et al. ["Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach."](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016).
-7. Covington P, Adams J, Sargin E. [Deep neural networks for youtube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)[C]//Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016: 191-198.
-
-
-<br/>
-<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/source/beginners_guide/basics/understand_sentiment/index.md b/source/beginners_guide/basics/understand_sentiment/index.md
deleted file mode 100644
index 792781aed97285953214525bd98c4b7884103103..0000000000000000000000000000000000000000
--- a/source/beginners_guide/basics/understand_sentiment/index.md
+++ /dev/null
@@ -1,354 +0,0 @@
-# 情感分析
-
-本教程源代码目录在[book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)，更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/177.html)。
-
-## 背景介绍
-
-在自然语言处理中，情感分析一般是指判断一段文本所表达的情绪状态。其中，一段文本可以是一个句子，一个段落或一个文档。情绪状态可以是两类，如（正面，负面），（高兴，悲伤）；也可以是三类，如（积极，消极，中性）等等。情感分析的应用场景十分广泛，如把用户在购物网站（亚马逊、天猫、淘宝等）、旅游网站、电影评论网站上发表的评论分成正面评论和负面评论；或为了分析用户对于某一产品的整体使用感受，抓取产品的用户评论并进行情感分析等等。表格1展示了对电影评论进行情感分析的例子：
-
-| 电影评论       | 类别  |
-| --------     | -----  |
-| 在冯小刚这几年的电影里，算最好的一部的了| 正面 |
-| 很不好看，好像一个地方台的电视剧     | 负面 |
-| 圆方镜头全程炫技，色调背景美则美矣，但剧情拖沓，口音不伦不类，一直努力却始终无法入戏| 负面|
-|剧情四星。但是圆镜视角加上婺源的风景整个非常有中国写意山水画的感觉，看得实在太舒服了。。|正面|
-
-<p align="center">表格 1 电影评论情感分析</p>
-
-在自然语言处理中，情感分析属于典型的**文本分类**问题，即把需要进行情感分析的文本划分为其所属类别。文本分类涉及文本表示和分类方法两个问题。在深度学习的方法出现之前，主流的文本表示方法为词袋模型BOW(bag of words)，话题模型等等；分类方法有SVM(support vector machine), LR(logistic regression)等等。
-
-对于一段文本，BOW表示会忽略其词顺序、语法和句法，将这段文本仅仅看做是一个词集合，因此BOW方法并不能充分表示文本的语义信息。例如，句子“这部电影糟糕透了”和“一个乏味，空洞，没有内涵的作品”在情感分析中具有很高的语义相似度，但是它们的BOW表示的相似度为0。又如，句子“一个空洞，没有内涵的作品”和“一个不空洞而且有内涵的作品”的BOW相似度很高，但实际上它们的意思很不一样。
-
-本章我们所要介绍的深度学习模型克服了BOW表示的上述缺陷，它在考虑词顺序的基础上把文本映射到低维度的语义空间，并且以端对端（end to end）的方式进行文本表示及分类，其性能相对于传统方法有显著的提升\[[1](#参考文献)\]。
-
-## 模型概览
-本章所使用的文本表示模型为卷积神经网络（Convolutional Neural Networks）和循环神经网络(Recurrent Neural Networks)及其扩展。下面依次介绍这几个模型。
-
-### 文本卷积神经网络简介（CNN）
-
-我们在[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过应用于文本数据的卷积神经网络模型的计算过程，这里进行一个简单的回顾。
-
-对卷积神经网络来说，首先使用卷积处理输入的词向量序列，产生一个特征图（feature map），对特征图采用时间维度上的最大池化（max pooling over time）操作得到此卷积核对应的整句话的特征，最后，将所有卷积核得到的特征拼接起来即为文本的定长向量表示，对于文本分类问题，将其连接至softmax即构建出完整的模型。在实际应用中，我们会使用多个卷积核来处理句子，窗口大小相同的卷积核堆叠起来形成一个矩阵，这样可以更高效的完成运算。另外，我们也可使用窗口大小不同的卷积核来处理句子，[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节的图3作为示意画了四个卷积核，不同颜色表示不同大小的卷积核操作。
-
-对于一般的短文本分类问题，上文所述的简单的文本卷积网络即可达到很高的正确率\[[1](#参考文献)\]。若想得到更抽象更高级的文本特征表示，可以构建深层文本卷积神经网络\[[2](#参考文献),[3](#参考文献)\]。
-
-### 循环神经网络（RNN）
-
-循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上，循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据（词序列），近年来，循环神经网络及其变体（如long short term memory\[[5](#参考文献)\]等）在自然语言处理的多个领域，如语言模型、句法解析、语义角色标注（或一般的序列标注）、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。
-
-![rnn](./image/rnn.png)
-<p align="center">
-图1. 循环神经网络按时间展开的示意图
-</p>
-
-循环神经网络按时间展开后如图1所示：在第`$t$`时刻，网络读入第`$t$`个输入`$x_t$`（向量表示）及前一时刻隐层的状态值`$h_{t-1}$`（向量表示，`$h_0$`一般初始化为`$0$`向量），计算得出本时刻隐层的状态值`$h_t$`，重复这一步骤直至读完所有输入。如果将循环神经网络所表示的函数记为`$f$`，则其公式可表示为：
-
-$$h_t=f(x_t,h_{t-1})=\sigma(W_{xh}x_t+W_{hh}h_{t-1}+b_h)$$
-
-其中`$W_{xh}$`是输入到隐层的矩阵参数，`$W_{hh}$`是隐层到隐层的矩阵参数，`$b_h$`为隐层的偏置向量（bias）参数，`$\sigma$`为`$sigmoid$`函数。
-
-在处理自然语言时，一般会先将词（one-hot表示）映射为其词向量（word embedding）表示，然后再作为循环神经网络每一时刻的输入`$x_t$`。此外，可以根据实际需要的不同在循环神经网络的隐层上连接其它层。如，可以把一个循环神经网络的隐层输出连接至下一个循环神经网络的输入构建深层（deep or stacked）循环神经网络，或者提取最后一个时刻的隐层状态作为句子表示进而使用分类模型等等。
-
-### 长短期记忆网络（LSTM）
-
-对于较长的序列数据，循环神经网络的训练过程中容易出现梯度消失或爆炸现象\[[6](#参考文献)\]。为了解决这一问题，Hochreiter S, Schmidhuber J. (1997)提出了LSTM(long short term memory\[[5](#参考文献)\])。
-
-相比于简单的循环神经网络，LSTM增加了记忆单元`$c$`、输入门`$i$`、遗忘门`$f$`及输出门`$o$`。这些门及记忆单元组合起来大大提升了循环神经网络处理长序列数据的能力。若将基于LSTM的循环神经网络表示的函数记为`$F$`，则其公式为：
-
-$$ h_t=F(x_t,h_{t-1})$$
-
-`$F$`由下列公式组合而成\[[7](#参考文献)\]：
-$$ i_t = \sigma{(W_{xi}x_t+W_{hi}h_{t-1}+W_{ci}c_{t-1}+b_i)} $$
-$$ f_t = \sigma(W_{xf}x_t+W_{hf}h_{t-1}+W_{cf}c_{t-1}+b_f) $$
-$$ c_t = f_t\odot c_{t-1}+i_t\odot tanh(W_{xc}x_t+W_{hc}h_{t-1}+b_c) $$
-$$ o_t = \sigma(W_{xo}x_t+W_{ho}h_{t-1}+W_{co}c_{t}+b_o) $$
-$$ h_t = o_t\odot tanh(c_t) $$
-其中，`$i_t, f_t, c_t, o_t$`分别表示输入门，遗忘门，记忆单元及输出门的向量值，带角标的`$W$`及`$b$`为模型参数，`$tanh$`为双曲正切函数，`$\odot$`表示逐元素（elementwise）的乘法操作。输入门控制着新输入进入记忆单元`$c$`的强度，遗忘门控制着记忆单元维持上一时刻值的强度，输出门控制着输出记忆单元的强度。三种门的计算方式类似，但有着完全不同的参数，它们各自以不同的方式控制着记忆单元`$c$`，如图2所示：
-
-![lstm](./image/lstm.png)
-<p align="center">
-图2. 时刻`$t$`的LSTM [7]
-</p>
-
-LSTM通过给简单的循环神经网络增加记忆及控制门的方式，增强了其处理远距离依赖问题的能力。类似原理的改进还有Gated Recurrent Unit (GRU)\[[8](#参考文献)\]，其设计更为简洁一些。**这些改进虽然各有不同，但是它们的宏观描述却与简单的循环神经网络一样（如图2所示），即隐状态依据当前输入及前一时刻的隐状态来改变，不断地循环这一过程直至输入处理完毕：**
-
-$$ h_t=Recurrent(x_t,h_{t-1})$$
-
-其中，`$Recurrent$`可以表示简单的循环神经网络、GRU或LSTM。
-
-### 栈式双向LSTM（Stacked Bidirectional LSTM）
-
-对于正常顺序的循环神经网络，`$h_t$`包含了`$t$`时刻之前的输入信息，也就是上文信息。同样，为了得到下文信息，我们可以使用反方向（将输入逆序处理）的循环神经网络。结合构建深层循环神经网络的方法（深层神经网络往往能得到更抽象和高级的特征表示），我们可以通过构建更加强有力的基于LSTM的栈式双向循环神经网络\[[9](#参考文献)\]，来对时序数据进行建模。
-
-如图3所示（以三层为例），奇数层LSTM正向，偶数层LSTM反向，高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入，对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示（这一表示充分融合了文本的上下文信息，并且对文本进行了深层次抽象），最后我们将文本表示连接至softmax构建分类模型。
-
-![stacked_lstm](./image/stacked_lstm.jpg)
-<p align="center">
-图3. 栈式双向LSTM用于文本分类
-</p>
-
-
-## 数据集介绍
-
-我们以[IMDB情感分析数据集](http://ai.stanford.edu/%7Eamaas/data/sentiment/)为例进行介绍。IMDB数据集的训练集和测试集分别包含25000个已标注过的电影评论。其中，负面评论的得分小于等于4，正面评论的得分大于等于7，满分10分。
-```text
-aclImdb
-|- test
-|-- neg
-|-- pos
-|- train
-|-- neg
-|-- pos
-```
-Paddle在`dataset/imdb.py`中实现了imdb数据集的自动下载和读取，并提供了读取字典、训练数据、测试数据等API。
-
-## 配置模型
-
-在该示例中，我们实现了两种文本分类算法，分别基于[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过的文本卷积神经网络，以及[栈式双向LSTM](#栈式双向LSTM（Stacked Bidirectional LSTM）)。我们首先引入要用到的库和定义全局变量：
-
-```python
-import paddle
-import paddle.fluid as fluid
-from functools import partial
-import numpy as np
-
-CLASS_DIM = 2
-EMB_DIM = 128
-HID_DIM = 512
-BATCH_SIZE = 128
-USE_GPU = False
-```
-
-
-### 文本卷积神经网络
-我们构建神经网络`convolution_net`，示例代码如下。
-需要注意的是：`fluid.nets.sequence_conv_pool` 包含卷积和池化层两个操作。
-
-```python
-def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
-emb = fluid.layers.embedding(
-input=data, size=[input_dim, emb_dim], is_sparse=True)
-conv_3 = fluid.nets.sequence_conv_pool(
-input=emb,
-num_filters=hid_dim,
-filter_size=3,
-act="tanh",
-pool_type="sqrt")
-conv_4 = fluid.nets.sequence_conv_pool(
-input=emb,
-num_filters=hid_dim,
-filter_size=4,
-act="tanh",
-pool_type="sqrt")
-prediction = fluid.layers.fc(
-input=[conv_3, conv_4], size=class_dim, act="softmax")
-return prediction
-```
-
-网络的输入`input_dim`表示的是词典的大小，`class_dim`表示类别数。这里，我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。
-
-### 栈式双向LSTM
-
-栈式双向神经网络`stacked_lstm_net`的代码片段如下：
-
-```python
-def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
-
-emb = fluid.layers.embedding(
-input=data, size=[input_dim, emb_dim], is_sparse=True)
-
-fc1 = fluid.layers.fc(input=emb, size=hid_dim)
-lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
-
-inputs = [fc1, lstm1]
-
-for i in range(2, stacked_num + 1):
-fc = fluid.layers.fc(input=inputs, size=hid_dim)
-lstm, cell = fluid.layers.dynamic_lstm(
-input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
-inputs = [fc, lstm]
-
-fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
-lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
-
-prediction = fluid.layers.fc(input=[fc_last, lstm_last],
-size=class_dim,
-act='softmax')
-return prediction
-```
-以上的栈式双向LSTM抽象出了高级特征并把其映射到和分类类别数同样大小的向量上。最后一个全连接层的`softmax`激活函数（`act='softmax'`）用来计算分类属于某个类别的概率。
-
-重申一下，此处我们可以调用`convolution_net`或`stacked_lstm_net`的任何一个。我们以`convolution_net`为例。
-
-接下来我们定义预测程序（`inference_program`）。预测程序使用`convolution_net`来对`fluid.layers.data`的输入进行预测。
-
-```python
-def inference_program(word_dict):
-data = fluid.layers.data(
-name="words", shape=[1], dtype="int64", lod_level=1)
-
-dict_dim = len(word_dict)
-net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
-return net
-```
-
-我们这里定义了`train_program`。它使用了从`inference_program`返回的结果来计算误差。我们同时定义了优化函数`optimizer_func`。
-
-因为是有监督的学习，训练集的标签也通过`fluid.layers.data`定义。在训练过程中，交叉熵（`fluid.layers.cross_entropy`）被用作损失函数。
-
-在测试过程中，分类器会计算各个输出的概率。第一个返回的数值规定为 损耗(cost)。
-
-```python
-def train_program(word_dict):
-prediction = inference_program(word_dict)
-label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-cost = fluid.layers.cross_entropy(input=prediction, label=label)
-avg_cost = fluid.layers.mean(cost)
-accuracy = fluid.layers.accuracy(input=prediction, label=label)
-return [avg_cost, accuracy]
-
-
-def optimizer_func():
-return fluid.optimizer.Adagrad(learning_rate=0.002)
-```
-
-## 训练模型
-
-### 定义训练环境
-
-定义您的训练是在CPU上还是在GPU上：
-
-
-```python
-use_cuda = False
-place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-```
-
-### 定义数据提供器
-
-下一步是为训练和测试定义数据提供器。提供器读入一个大小为 BATCH_SIZE的数据。paddle.dataset.imdb.train 每次会在乱序化后提供一个大小为BATCH_SIZE的数据，乱序化的大小为缓存大小buf_size。
-
-注意：读取IMDB的数据可能会花费几分钟的时间，请耐心等待。
-
-```python
-print("Loading IMDB word dict....")
-word_dict = paddle.dataset.imdb.word_dict()
-
-print ("Reading training data....")
-train_reader = paddle.batch(
-paddle.reader.shuffle(
-paddle.dataset.imdb.train(word_dict), buf_size=25000),
-batch_size=BATCH_SIZE)
-```
-
-### 构造训练器(trainer)
-训练器需要一个训练程序和一个训练优化函数。
-
-```python
-trainer = fluid.Trainer(
-train_func=partial(train_program, word_dict),
-place=place,
-optimizer_func=optimizer_func)
-```
-
-### 提供数据
-
-`feed_order`用来定义每条产生的数据和`fluid.layers.data`之间的映射关系。比如，`imdb.train`产生的第一列的数据对应的是`words`这个特征。
-
-```python
-feed_order = ['words', 'label']
-```
-
-### 事件处理器
-
-回调函数event_handler在一个之前定义好的事件发生后会被调用。例如，我们可以在每步训练结束后查看误差。
-
-```python
-# Specify the directory path to save the parameters
-params_dirname = "understand_sentiment_conv.inference.model"
-
-def event_handler(event):
-if isinstance(event, fluid.EndStepEvent):
-print("Step {0}, Epoch {1} Metrics {2}".format(
-event.step, event.epoch, map(np.array, event.metrics)))
-
-if event.step == 10:
-trainer.save_params(params_dirname)
-trainer.stop()
-```
-
-### 开始训练
-
-最后，我们传入训练循环数（num_epoch）和一些别的参数，调用 trainer.train 来开始训练。
-
-```python
-trainer.train(
-num_epochs=1,
-event_handler=event_handler,
-reader=train_reader,
-feed_order=feed_order)
-```
-
-## 应用模型
-
-### 构建预测器
-
-传入`inference_program`和`params_dirname`来初始化一个预测器, `params_dirname`用来存放训练过程中的各个参数。
-
-```python
-inferencer = fluid.Inferencer(
-inference_program, param_path=params_dirname, place=place)
-```
-
-### 生成测试用输入数据
-
-为了进行预测，我们任意选取3个评论。请随意选取您看好的3个。我们把评论中的每个词对应到`word_dict`中的id。如果词典中没有这个词，则设为`unknown`。
-然后我们用`create_lod_tensor`来创建细节层次的张量。
-
-```python
-reviews_str = [
-'read the book forget the movie', 'this is a great movie', 'this is very bad'
-]
-reviews = [c.split() for c in reviews_str]
-
-UNK = word_dict['<unk>']
-lod = []
-for c in reviews:
-lod.append([word_dict.get(words, UNK) for words in c])
-
-base_shape = [[len(c) for c in lod]]
-
-tensor_words = fluid.create_lod_tensor(lod, base_shape, place)
-```
-
-## 预测
-
-现在我们可以对每一条评论进行正面或者负面的预测啦。
-
-```python
-results = inferencer.infer({'words': tensor_words})
-
-for i, r in enumerate(results[0]):
-print("Predict probability of ", r[0], " to be positive and ", r[1], " to be negative for review \'", reviews_str[i], "\'")
-
-```
-
-
-## 总结
-
-本章我们以情感分析为例，介绍了使用深度学习的方法进行端对端的短文本分类，并且使用PaddlePaddle完成了全部相关实验。同时，我们简要介绍了两种文本处理模型：卷积神经网络和循环神经网络。在后续的章节中我们会看到这两种基本的深度学习模型在其它任务上的应用。
-
-
-## 参考文献
-1. Kim Y. [Convolutional neural networks for sentence classification](http://arxiv.org/pdf/1408.5882)[J]. arXiv preprint arXiv:1408.5882, 2014.
-2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modelling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014.
-3. Yann N. Dauphin, et al. [Language Modeling with Gated Convolutional Networks](https://arxiv.org/pdf/1612.08083v1.pdf)[J] arXiv preprint arXiv:1612.08083, 2016.
-4. Siegelmann H T, Sontag E D. [On the computational power of neural nets](http://research.cs.queensu.ca/home/akl/cisc879/papers/SELECTED_PAPERS_FROM_VARIOUS_SOURCES/05070215382317071.pdf)[C]//Proceedings of the fifth annual workshop on Computational learning theory. ACM, 1992: 440-449.
-5. Hochreiter S, Schmidhuber J. [Long short-term memory](http://web.eecs.utk.edu/~itamar/courses/ECE-692/Bobby_paper1.pdf)[J]. Neural computation, 1997, 9(8): 1735-1780.
-6. Bengio Y, Simard P, Frasconi P. [Learning long-term dependencies with gradient descent is difficult](http://www-dsi.ing.unifi.it/~paolo/ps/tnn-94-gradient.pdf)[J]. IEEE transactions on neural networks, 1994, 5(2): 157-166.
-7. Graves A. [Generating sequences with recurrent neural networks](http://arxiv.org/pdf/1308.0850)[J]. arXiv preprint arXiv:1308.0850, 2013.
-8. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://arxiv.org/pdf/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014.
-9. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
-
-<br/>
-<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/source/beginners_guide/basics/word2vec/index.md b/source/beginners_guide/basics/word2vec/index.md
deleted file mode 100644
index dff9471aa79a61f00515b5d3d4c364fd72707f9e..0000000000000000000000000000000000000000
--- a/source/beginners_guide/basics/word2vec/index.md
+++ /dev/null
@@ -1,440 +0,0 @@
-
-# 词向量
-
-本教程源代码目录在[book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)，更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/175.html)。
-
-## 背景介绍
-
-本章我们介绍词的向量表征，也称为word embedding。词向量是自然语言处理中常见的一个操作，是搜索引擎、广告系统、推荐系统等互联网服务背后常见的基础技术。
-
-在这些互联网服务里，我们经常要比较两个词或者两段文本之间的相关性。为了做这样的比较，我们往往先要把词表示成计算机适合处理的方式。最自然的方式恐怕莫过于向量空间模型(vector space model)。
-在这种方式里，每个词被表示成一个实数向量（one-hot vector），其长度为字典大小，每个维度对应字典里的一个词，除了这个词对应维度上的值是1，其他元素都是0。
-
-One-hot vector虽然自然，但是用处有限。比如，在互联网广告系统里，如果用户输入的query是“母亲节”，而有一个广告的关键词是“康乃馨”。虽然按照常理，我们知道这两个词之间是有联系的——母亲节通常应该送给母亲一束康乃馨；但是这两个词对应的one-hot vectors之间的距离度量，无论是欧氏距离还是余弦相似度(cosine similarity)，由于其向量正交，都认为这两个词毫无相关性。 得出这种与我们相悖的结论的根本原因是：每个词本身的信息量都太小。所以，仅仅给定两个词，不足以让我们准确判别它们是否相关。要想精确计算相关性，我们还需要更多的信息——从大量数据里通过机器学习方法归纳出来的知识。
-
-在机器学习领域里，各种“知识”被各种模型表示，词向量模型(word embedding model)就是其中的一类。通过词向量模型可将一个 one-hot vector映射到一个维度更低的实数向量（embedding vector），如`$embedding(Mother's\ Day) = [0.3, 4.2, -1.5, ...], embedding(Carnation) = [0.2, 5.6, -2.3, ...]$`。在这个映射到的实数向量表示中，希望两个语义（或用法）上相似的词对应的词向量“更像”，这样如“母亲节”和“康乃馨”的对应词向量的余弦相似度就不再为零了。
-
-词向量模型可以是概率模型、共生矩阵(co-occurrence matrix)模型或神经元网络模型。在用神经网络求词向量之前，传统做法是统计一个词语的共生矩阵`$X$`。`$X$`是一个`$|V| \times |V|$` 大小的矩阵，`$X_{ij}$`表示在所有语料中，词汇表`V`(vocabulary)中第i个词和第j个词同时出现的次数，`$|V|$`为词汇表的大小。对`$X$`做矩阵分解（如奇异值分解，Singular Value Decomposition \[[5](#参考文献)\]），得到的`$U$`即视为所有词的词向量：
-
-$$X = USV^T$$
-
-但这样的传统做法有很多问题：<br/>
-1) 由于很多词没有出现，导致矩阵极其稀疏，因此需要对词频做额外处理来达到好的矩阵分解效果；<br/>
-2) 矩阵非常大，维度太高(通常达到`$10^6*10^6$`的数量级)；<br/>
-3) 需要手动去掉停用词（如although, a,...），不然这些频繁出现的词也会影响矩阵分解的效果。
-
-
-基于神经网络的模型不需要计算存储一个在全语料上统计的大表，而是通过学习语义信息得到词向量，因此能很好地解决以上问题。在本章里，我们将展示基于神经网络训练词向量的细节，以及如何用PaddlePaddle训练一个词向量模型。
-
-
-## 效果展示
-
-本章中，当词向量训练好后，我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影（如下图所示）。从图中可以看出，语义相关的词语（如a, the, these; big, huge）在投影上距离很近，语意无关的词（如say, business; decision, japan）在投影上的距离很远。
-
-![2d_similarity](./image/2d_similarity.png)
-<p align="center">
-图1. 词向量的二维投影
-</p>
-
-另一方面，我们知道两个向量的余弦值在`$[-1,1]$`的区间内：两个完全相同的向量余弦值为1, 两个相互垂直的向量之间余弦值为0，两个方向完全相反的向量余弦值为-1，即相关性和余弦值大小成正比。因此我们还可以计算两个词向量的余弦相似度:
-
-```
-please input two words: big huge
-similarity: 0.899180685161
-
-please input two words: from company
-similarity: -0.0997506977351
-```
-
-以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到，我们将在[应用模型](#应用模型)中详细描述用法。
-
-
-## 模型概览
-
-在这里我们介绍三个训练词向量的模型：N-gram模型，CBOW模型和Skip-gram模型，它们的中心思想都是通过上下文得到一个词出现的概率。对于N-gram模型，我们会先介绍语言模型的概念，并在之后的[训练模型](#训练模型)中，带大家用PaddlePaddle实现它。而后两个模型，是近年来最有名的神经元词向量模型，由 Tomas Mikolov 在Google 研发\[[3](#参考文献)\]，虽然它们很浅很简单，但训练效果很好。
-
-### 语言模型
-
-在介绍词向量模型之前，我们先来引入一个概念：语言模型。
-语言模型旨在为语句的联合概率函数`$P(w_1, ..., w_T)$`建模, 其中`$w_i$`表示句子中的第i个词。语言模型的目标是，希望模型对有意义的句子赋予大概率，对没意义的句子赋予小概率。
-这样的模型可以应用于很多领域，如机器翻译、语音识别、信息检索、词性标注、手写识别等，它们都希望能得到一个连续序列的概率。 以信息检索为例，当你在搜索“how long is a football bame”时（bame是一个医学名词），搜索引擎会提示你是否希望搜索"how long is a football game", 这是因为根据语言模型计算出“how long is a football bame”的概率很低，而与bame近似的，可能引起错误的词中，game会使该句生成的概率最大。
-
-对语言模型的目标概率`$P(w_1, ..., w_T)$`，如果假设文本中每个词都是相互独立的，则整句话的联合概率可以表示为其中所有词语概率的乘积，即：
-
-$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t)$$
-
-然而我们知道语句中的每个词出现的概率都与其前面的词紧密相关, 所以实际上通常用条件概率表示语言模型：
-
-$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t | w_1, ... , w_{t-1})$$
-
-
-
-### N-gram neural model
-
-在计算语言学中，n-gram是一种重要的文本表示方法，表示一个文本中连续的n个项。基于具体的应用场景，每一项可以是一个字母、单词或者音节。 n-gram模型也是统计语言模型中的一种重要方法，用n-gram训练语言模型时，一般用每个n-gram的历史n-1个词语组成的内容来预测第n个词。
-
-Yoshua Bengio等科学家就于2003年在著名论文 Neural Probabilistic Language Models \[[1](#参考文献)\] 中介绍如何学习一个神经元网络表示的词向量模型。文中的神经概率语言模型（Neural Network Language Model，NNLM）通过一个线性映射和一个非线性隐层连接，同时学习了语言模型和词向量，即通过学习大量语料得到词语的向量表达，通过这些向量得到整个句子的概率。用这种方法学习语言模型可以克服维度灾难（curse of dimensionality）,即训练和测试数据不同导致的模型不准。注意：由于“神经概率语言模型”说法较为泛泛，我们在这里不用其NNLM的本名，考虑到其具体做法，本文中称该模型为N-gram neural model。
-
-我们在上文中已经讲到用条件概率建模语言模型，即一句话中第`$t$`个词的概率和该句话的前`$t-1$`个词相关。可实际上越远的词语其实对该词的影响越小，那么如果考虑一个n-gram, 每个词都只受其前面`n-1`个词的影响，则有：
-
-$$P(w_1, ..., w_T) = \prod_{t=n}^TP(w_t|w_{t-1}, w_{t-2}, ..., w_{t-n+1})$$
-
-给定一些真实语料，这些语料中都是有意义的句子，N-gram模型的优化目标则是最大化目标函数:
-
-$$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
-
-其中`$f(w_t, w_{t-1}, ..., w_{t-n+1})$`表示根据历史n-1个词得到当前词`$w_t$`的条件概率，`$R(\theta)$`表示参数正则项。
-
-![nnlm](./image/nnlm.png)
-<p align="center">
-图2. N-gram神经网络模型
-</p>
-
-图2展示了N-gram神经网络模型，从下往上看，该模型分为以下几个部分：
-- 对于每个样本，模型输入`$w_{t-n+1},...w_{t-1}$`, 输出句子第t个词为字典中`|V|`个词的概率。
-
-每个输入词`$w_{t-n+1},...w_{t-1}$`首先通过映射矩阵映射到词向量`$C(w_{t-n+1}),...C(w_{t-1})$`。
-
-- 然后所有词语的词向量连接成一个大向量，并经过一个非线性映射得到历史词语的隐层表示：
-
-$$g=Utanh(\theta^Tx + b_1) + Wx + b_2$$
-
-其中，`$x$`为所有词语的词向量连接成的大向量，表示文本历史特征；`$\theta$`、`$U$`、`$b_1$`、`$b_2$`和`$W$`分别为词向量层到隐层连接的参数。`$g$`表示未经归一化的所有输出单词概率，`$g_i$`表示未经归一化的字典中第`$i$`个单词的输出概率。
-
-- 根据softmax的定义，通过归一化`$g_i$`, 生成目标词`$w_t$`的概率为：
-
-$$P(w_t | w_{t-1}, ..., w_{t-n+1}) = \frac{e^{g_{w_t}}}{\sum_i^{|V|} e^{g_i}}$$
-
-- 整个网络的损失值(cost)为多类分类交叉熵，用公式表示为
-
-$$J(\theta) = -\sum_{i=1}^N\sum_{k=1}^{|V|}y_k^{i}log(softmax(g_k^i))$$
-
-其中`$y_k^i$`表示第`$i$`个样本第`$k$`类的真实标签(0或1)，`$softmax(g_k^i)$`表示第i个样本第k类softmax输出的概率。
-
-
-
-### Continuous Bag-of-Words model(CBOW)
-
-CBOW模型通过一个词的上下文（各N个词）预测当前词。当N=2时，模型如下图所示：
-
-![cbow](./image/cbow.png)
-<p align="center">
-图3. CBOW模型
-</p>
-
-具体来说，不考虑上下文的词语输入顺序，CBOW是用上下文词语的词向量的均值来预测当前词。即：
-
-$$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$
-
-其中`$x_t$`为第`$t$`个词的词向量，分类分数（score）向量 `$z=U*context$`，最终的分类`$y$`采用softmax，损失函数采用多类分类交叉熵。
-
-### Skip-gram model
-
-CBOW的好处是对上下文词语的分布在词向量上进行了平滑，去掉了噪声，因此在小数据集上很有效。而Skip-gram的方法中，用一个词预测其上下文，得到了当前词上下文的很多样本，因此可用于更大的数据集。
-
-![skipgram](./image/skipgram.png)
-<p align="center">
-图4. Skip-gram模型
-</p>
-
-如上图所示，Skip-gram模型的具体做法是，将一个词的词向量映射到`$2n$`个词的词向量（`$2n$`表示当前输入词的前后各`$n$`个词），然后分别通过softmax得到这`$2n$`个词的分类损失值之和。
-
-
-## 数据准备
-
-### 数据介绍
-
-本教程使用Penn Treebank （PTB）（经Tomas Mikolov预处理过的版本）数据集。PTB数据集较小，训练速度快，应用于Mikolov的公开语言模型训练工具\[[2](#参考文献)\]中。其统计情况如下：
-
-<p align="center">
-<table>
-<tr>
-<td>训练数据</td>
-<td>验证数据</td>
-<td>测试数据</td>
-</tr>
-<tr>
-<td>ptb.train.txt</td>
-<td>ptb.valid.txt</td>
-<td>ptb.test.txt</td>
-</tr>
-<tr>
-<td>42068句</td>
-<td>3370句</td>
-<td>3761句</td>
-</tr>
-</table>
-</p>
-
-
-### 数据预处理
-
-本章训练的是5-gram模型，表示在PaddlePaddle训练时，每条数据的前4个词用来预测第5个词。PaddlePaddle提供了对应PTB数据集的python包`paddle.dataset.imikolov`，自动做数据的下载与预处理，方便大家使用。
-
-预处理会把数据集中的每一句话前后加上开始符号`<s>`以及结束符号`<e>`。然后依据窗口大小（本教程中为5），从头到尾每次向右滑动窗口并生成一条数据。
-
-如"I have a dream that one day" 一句提供了5条数据：
-
-```text
-<s> I have a dream
-I have a dream that
-have a dream that one
-a dream that one day
-dream that one day <e>
-```
-
-最后，每个输入会按其单词在字典里的位置，转化成整数的索引序列，作为PaddlePaddle的输入。
-
-## 编程实现
-
-本配置的模型结构如下图所示：
-
-![ngram](./image/ngram.png)
-<p align="center">
-图5. 模型配置中的N-gram神经网络模型
-</p>
-
-首先，加载所需要的包：
-
-```python
-import paddle
-import paddle.fluid as fluid
-import numpy
-from functools import partial
-import math
-import os
-import sys
-```
-
-然后，定义参数：
-```python
-EMBED_SIZE = 32  # word vector dimension
-HIDDEN_SIZE = 256  # hidden layer dimension
-N = 5  # train 5-gram
-BATCH_SIZE = 32  # batch size
-
-# can use CPU or GPU
-use_cuda = os.getenv('WITH_GPU', '0') != '0'
-
-word_dict = paddle.dataset.imikolov.build_dict()
-dict_size = len(word_dict)
-```
-
-不同于之前的PaddlePaddle v2版本，在新的Fluid版本里，我们不必再手动计算词向量。PaddlePaddle提供了一个内置的方法`fluid.layers.embedding`，我们就可以直接用它来构造 N-gram 神经网络。
-
-- 我们来定义我们的 N-gram 神经网络结构。这个结构在训练和预测中都会使用到。因为词向量比较稀疏，我们传入参数 `is_sparse=True`, 可以加速稀疏矩阵的更新。
-
-```python
-def inference_program(is_sparse):
-first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
-second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
-third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
-fourth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
-
-embed_first = fluid.layers.embedding(
-input=first_word,
-size=[dict_size, EMBED_SIZE],
-dtype='float32',
-is_sparse=is_sparse,
-param_attr='shared_w')
-embed_second = fluid.layers.embedding(
-input=second_word,
-size=[dict_size, EMBED_SIZE],
-dtype='float32',
-is_sparse=is_sparse,
-param_attr='shared_w')
-embed_third = fluid.layers.embedding(
-input=third_word,
-size=[dict_size, EMBED_SIZE],
-dtype='float32',
-is_sparse=is_sparse,
-param_attr='shared_w')
-embed_fourth = fluid.layers.embedding(
-input=fourth_word,
-size=[dict_size, EMBED_SIZE],
-dtype='float32',
-is_sparse=is_sparse,
-param_attr='shared_w')
-
-concat_embed = fluid.layers.concat(
-input=[embed_first, embed_second, embed_third, embed_fourth], axis=1)
-hidden1 = fluid.layers.fc(input=concat_embed,
-size=HIDDEN_SIZE,
-act='sigmoid')
-predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
-return predict_word
-```
-
-- 基于以上的神经网络结构，我们可以如下定义我们的`训练`方法
-
-```python
-def train_program(is_sparse):
-# The declaration of 'next_word' must be after the invoking of inference_program,
-# or the data input order of train program would be [next_word, firstw, secondw,
-# thirdw, fourthw], which is not correct.
-predict_word = inference_program(is_sparse)
-next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
-cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
-avg_cost = fluid.layers.mean(cost)
-return avg_cost
-```
-
-- 现在我们可以开始训练啦。如今的版本较之以前就简单了许多。我们有现成的训练和测试集：`paddle.dataset.imikolov.train()`和`paddle.dataset.imikolov.test()`。两者都会返回一个读取器。在PaddlePaddle中，读取器是一个Python的函数，每次调用，会读取下一条数据。它是一个Python的generator。
-
-`paddle.batch` 会读入一个读取器，然后输出一个批次化了的读取器。`event_handler`亦可以一并传入`trainer.train`来时不时的输出每个步骤，批次的训练情况。
-
-```python
-def optimizer_func():
-# Note here we need to choose more sophisticated optimizers
-# such as AdaGrad with a decay rate. The normal SGD converges
-# very slowly.
-# optimizer=fluid.optimizer.SGD(learning_rate=0.001),
-return fluid.optimizer.AdagradOptimizer(
-learning_rate=3e-3,
-regularization=fluid.regularizer.L2DecayRegularizer(8e-4))
-
-
-def train(use_cuda, train_program, params_dirname):
-train_reader = paddle.batch(
-paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
-test_reader = paddle.batch(
-paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
-
-place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-def event_handler(event):
-if isinstance(event, fluid.EndStepEvent):
-# We output cost every 10 steps.
-if event.step % 10 == 0:
-outs = trainer.test(
-reader=test_reader,
-feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw'])
-avg_cost = outs[0]
-
-print "Step %d: Average Cost %f" % (event.step, avg_cost)
-
-# If average cost is lower than 5.8, we consider the model good enough to stop.
-# Note 5.8 is a relatively high value. In order to get a better model, one should
-# aim for avg_cost lower than 3.5. But the training could take longer time.
-if avg_cost < 5.8:
-trainer.save_params(params_dirname)
-trainer.stop()
-
-if math.isnan(avg_cost):
-sys.exit("got NaN loss, training failed.")
-
-trainer = fluid.Trainer(
-train_func=train_program,
-optimizer_func=optimizer_func,
-place=place)
-
-trainer.train(
-reader=train_reader,
-num_epochs=1,
-event_handler=event_handler,
-feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw'])
-```
-
-- `trainer.train`将会开始训练。从`event_handler`返回的监控情况如下：
-
-```python
-Step 0: Average Cost 7.337213
-Step 10: Average Cost 6.136128
-Step 20: Average Cost 5.766995
-...
-```
-
-## 模型应用
-在模型训练后，我们可以用它做一些预测。
-
-### 预测下一个词
-我们可以用我们训练过的模型，在得知之前的 N-gram 后，预测下一个词。
-
-```python
-def infer(use_cuda, inference_program, params_dirname=None):
-place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-inferencer = fluid.Inferencer(
-infer_func=inference_program, param_path=params_dirname, place=place)
-
-# Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
-# is simply an index to look up for the corresponding word vector and hence
-# the shape of word (base_shape) should be [1]. The length-based level of
-# detail (lod) info of each LoDtensor should be [[1]] meaning there is only
-# one lod_level and there is only one sequence of one word on this level.
-# Note that lod info should be a list of lists.
-
-data1 = [[211]]  # 'among'
-data2 = [[6]]    # 'a'
-data3 = [[96]]   # 'group'
-data4 = [[4]]    # 'of'
-lod = [[1]]
-
-first_word  = fluid.create_lod_tensor(data1, lod, place)
-second_word = fluid.create_lod_tensor(data2, lod, place)
-third_word  = fluid.create_lod_tensor(data3, lod, place)
-fourth_word = fluid.create_lod_tensor(data4, lod, place)
-
-result = inferencer.infer(
-{
-'firstw': first_word,
-'secondw': second_word,
-'thirdw': third_word,
-'fourthw': fourth_word
-},
-return_numpy=False)
-
-print(numpy.array(result[0]))
-most_possible_word_index = numpy.argmax(result[0])
-print(most_possible_word_index)
-print([
-key for key, value in word_dict.iteritems()
-if value == most_possible_word_index
-][0])
-```
-
-在经历3分钟的短暂训练后，我们得到如下的预测。我们的模型预测 `among a group of` 的下一个词是`a`。这比较符合文法规律。如果我们训练时间更长，比如几个小时，那么我们会得到的下一个预测是 `workers`。
-
-
-```python
-[[0.00106646 0.0007907  0.00072041 ... 0.00049024 0.00041355 0.00084464]]
-6
-a
-```
-
-整个程序的入口很简单：
-
-```python
-def main(use_cuda, is_sparse):
-if use_cuda and not fluid.core.is_compiled_with_cuda():
-return
-
-params_dirname = "word2vec.inference.model"
-
-train(
-use_cuda=use_cuda,
-train_program=partial(train_program, is_sparse),
-params_dirname=params_dirname)
-
-infer(
-use_cuda=use_cuda,
-inference_program=partial(inference_program, is_sparse),
-params_dirname=params_dirname)
-
-
-main(use_cuda=use_cuda, is_sparse=True)
-```
-
-
-## 总结
-本章中，我们介绍了词向量、语言模型和词向量的关系、以及如何通过训练神经网络模型获得词向量。在信息检索中，我们可以根据向量间的余弦夹角，来判断query和文档关键词这二者间的相关性。在句法分析和语义分析中，训练好的词向量可以用来初始化模型，以得到更好的效果。在文档分类中，有了词向量之后，可以用聚类的方法将文档中同义词进行分组，也可以用 N-gram 来预测下一个词。希望大家在本章后能够自行运用词向量进行相关领域的研究。
-
-
-## 参考文献
-1. Bengio Y, Ducharme R, Vincent P, et al. [A neural probabilistic language model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)[J]. journal of machine learning research, 2003, 3(Feb): 1137-1155.
-2. Mikolov T, Kombrink S, Deoras A, et al. [Rnnlm-recurrent neural network language modeling toolkit](http://www.fit.vutbr.cz/~imikolov/rnnlm/rnnlm-demo.pdf)[C]//Proc. of the 2011 ASRU Workshop. 2011: 196-201.
-3. Mikolov T, Chen K, Corrado G, et al. [Efficient estimation of word representations in vector space](https://arxiv.org/pdf/1301.3781.pdf)[J]. arXiv preprint arXiv:1301.3781, 2013.
-4. Maaten L, Hinton G. [Visualizing data using t-SNE](https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf)[J]. Journal of Machine Learning Research, 2008, 9(Nov): 2579-2605.
-5. https://en.wikipedia.org/wiki/Singular_value_decomposition
-
-<br/>
-<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/source/beginners_guide/install/install_doc.rst b/source/beginners_guide/install/install_doc.rst
deleted file mode 100644
index d862eded2ff9892f9d92469ff6fa7f54ad01fb3e..0000000000000000000000000000000000000000
--- a/source/beginners_guide/install/install_doc.rst
+++ /dev/null
@@ -1,526 +0,0 @@
-.. _how_to_install:
-
-安装说明
-^^^^^^^^
-
-您可以使用我们提供的安装包，或使用源代码，安装PaddlePaddle。
-
-.. _install_linux:
-
-在Linux安装PaddlePaddle
------------------------
-
-推荐您使用 `pip <https://pypi.org/project/pip/>`_
-安装，它是Linux系统下最简单的安装方式。
-
-注意事项：
-
-- PaddlePaddle Python API 依赖Python 2.7版本。
-
-执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
-
-  .. code-block:: bash
-
-     pip install paddlepaddle
-
-当前的默认版本为0.13.0，cpu_avx_openblas，您可以通过指定版本号来安装其它版本，例如：
-
-  .. code-block:: bash
-
-      pip install paddlepaddle==0.12.0
-
-
-如果需要安装支持GPU的版本（cuda9.0_cudnn7_avx_openblas），需要执行：
-
-  .. code-block:: bash
-
-     pip install paddlepaddle-gpu
-
-当前的默认版本是0.13.0，PaddlePaddle针对不同需求提供了更多版本的安装包，部分列表如下：
-
-=================================   ========================================
-版本号                               版本说明
-=================================   ========================================
-paddlepaddle-gpu==0.13.0            使用CUDA 9.0和cuDNN 7编译的0.13.0版本
-paddlepaddle-gpu==0.12.0            使用CUDA 8.0和cuDNN 5编译的0.12.0版本
-paddlepaddle-gpu==0.11.0.post87     使用CUDA 8.0和cuDNN 7编译的0.11.0版本
-paddlepaddle-gpu==0.11.0.post8      使用CUDA 8.0和cuDNN 5编译的0.11.0版本
-paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版本
-=================================   ========================================
-
-您可以在 `Release History <https://pypi.org/project/paddlepaddle-gpu/#history>`_
-中找到paddlepaddle-gpu的各个发行版本。
-
-如果需要获取并安装最新的（开发分支）PaddlePaddle，可以从我们的CI系统中下载最新的whl
-安装包和c-api开发包并安装，您可以从下面的表格中找到需要的版本：
-
-如果在点击下面链接时出现如下登陆界面，点击“Log in as guest”即可开始下载：
-
-.. image:: paddleci.png
-   :scale: 50 %
-   :align: center
-
-..  csv-table:: 各个版本最新的whl包
-    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m"
-    :widths: 1, 3, 3
-
-    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
-    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
-    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
-    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
-
-.. _FAQ:
-
-安装常见问题和解决方法
-======================
-
-- paddlepaddle*.whl is not a supported wheel on this platform.
-
-出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。
-请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，
-需要使用最新的pip (>9.0.0) 才可以安装。
-
-可以使用下面的命令更新您的pip：
-
-  .. code-block:: bash
-
-      pip install --upgrade pip
-
-如果仍然存在问题，可以执行：
-
-    .. code-block:: bash
-
-        python -c "import pip; print(pip.pep425tags.get_supported())"
-
-获取当前系统支持的安装包格式，并检查和需安装的包是否匹配。pypi安装包
-可以在 `这里 <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 找到。
-
-如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新；
-如果系统支持 manylinux1_x86_64 而安装包（本地）是 linux_x86_64，
-可以重命名这个whl包为 manylinux1_x86_64 再安装。
-
-
-.. _install_windows:
-
-在windows安装PaddlePaddle
-------------------------------
-
-若您的系统为windows，您可以通过Docker来使用PaddlePaddle。
-
-推荐您下载 `PaddlePaddle快速安装包 <http://paddle-windows.bj.bcebos.com/PaddlePaddle-windows.zip>`_，
-该安装包能够帮助您判断、安装适合的Docker，并引导您在Docker中使用PaddlePaddle。
-
-..
- todo: windows的安装包要放在百度云上
-
-注意事项：
-
-* 系统要求：windows7&8&10。
-
-* 下载安装包后，请您右键选择“以管理员身份运行”。
-
-* PaddlePaddle不支持在windows使用GPU。
-
-Docker安装完成后，请您执行下面的步骤：
-
-请您右键选择“以管理员身份运行”，来启动Docker客户端
-
-获取Image ID
-
-.. code-block:: bash
-
-   docker images
-
-启动Docker
-
-.. code-block:: bash
-
-   docker run -d -it imageid /bin/bash
-
-获取Docker Container
-
-.. code-block:: bash
-
-   docker ps -a
-
-进入Container
-
-.. code-block:: bash
-
-   docker attach container
-
-.. _others:
-
-其他安装方式
--------------
-
-.. _source:
-从源码编译
-==========
-
-.. _requirements:
-
-需要的软硬件
-"""""""""""""
-
-为了编译PaddlePaddle，我们需要
-
-1. 一台电脑，可以装的是 Linux, Windows 或者 MacOS 操作系统
-2. Docker
-
-不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker 镜像里。
-
-.. _build_step:
-
-编译方法
-"""""""""""""
-
-PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
-可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。或者
-参考下述可选步骤，从源码中构建用于编译PaddlePaddle的Docker镜像。
-
-如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `附录：编译依赖`_ 之后才能开始编译的步骤。
-
-编译PaddlePaddle，需要执行：
-
-.. code-block:: bash
-
-   # 1. 获取源码
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   cd Paddle
-   # 2. 可选步骤：源码中构建用于编译PaddlePaddle的Docker镜像
-   docker build -t paddle:dev .
-   # 3. 执行下面的命令编译CPU-Only的二进制
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
-   # 4. 或者也可以使用为上述可选步骤构建的镜像（必须先执行第2步）
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
-
-注：上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。如果使用自行
-构建的镜像（上述第4步）会执行 :code:`Dockerfile` 描述的默认入口程序 :code:`docker_build.sh` 可以省略步骤3中
-最后的执行脚本的命令。
-
-编译完成后会在build/python/dist目录下生成输出的whl包，可以选择在当前机器安装也可以拷贝到目标机器安装：
-
-.. code-block:: bash
-
-   pip install build/python/dist/*.whl
-
-如果机器中已经安装过PaddlePaddle，有两种方法：
-
-.. code-block:: bash
-
-   # 1. 先卸载之前的版本，再重新安装
-   pip uninstall paddlepaddle
-   pip install build/python/dist/*.whl
-
-   # 2. 直接升级到更新的版本
-   pip install build/python/dist/*.whl -U
-
-.. _run_test:
-
-执行单元测试
-"""""""""""""
-
-如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：
-
-设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
-开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
-
-.. code-block:: bash
-
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
-
-如果期望执行其中一个单元测试，（比如 :code:`test_sum_op` ）：
-
-.. code-block:: bash
-
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
-   cd /paddle/build
-   ctest -R test_sum_op -V
-
-.. _faq_docker:
-
-常见问题
-"""""""""""""
-
-- 什么是 Docker?
-
-  如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。
-
-- Docker 还是虚拟机？
-
-  有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
-
-- 为什么用 Docker?
-
-  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。
-
-  另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。
-
-- 可以选择不用Docker吗？
-
-  当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程，是因为这个流程比其他方法都更简便。
-
-- 学习 Docker 有多难？
-
-  理解 Docker 并不难，大概花十分钟看一下 `这篇文章 <https://zhuanlan.zhihu.com/p/19902938>`_。
-  这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
-
-- 可以用 IDE 吗？
-
-  当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
-
-  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
-
-  .. code-block:: bash
-
-    (global-set-key "\C-cc" 'compile)
-    (setq compile-command
-     "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-
-  就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
-
-- 可以并行编译吗？
-
-  是的。我们的 Docker image 运行一个 `Bash 脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh>`_。这个脚本调用 :code:`make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
-
-- Docker 需要 sudo
-
-  如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 Docker。此外，PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术，比如 rkt。
-
-- 在 Windows/MacOS 上编译很慢
-
-  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考 `这个issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_。
-
-- 磁盘不够
-
-  本文中的例子里， :code:`docker run` 命令里都用了 :code:`--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 :code:`docker ps -a` 命令看到停止后但是没有删除的 containers。 :code:`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `这篇文章 <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
-
-
-.. _compile_deps:
-
-附录：编译依赖
-"""""""""""""
-
-PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。
-
-.. csv-table:: PaddlePaddle编译依赖
-   :header: "依赖", "版本", "说明"
-   :widths: 10, 15, 30
-
-   "CMake", ">=3.2", ""
-   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
-   "Python", "2.7.x", "依赖libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
-   "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "可选"
-
-
-.. _build_options:
-
-附录：编译选项
-"""""""""""""
-
-PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。
-用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考
-`官方文档 <https://cmake.org/cmake-tutorial>`_ 。
-
-在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如：
-
-..  code-block:: bash
-
-    cmake .. -DWITH_GPU=OFF
-
-..  csv-table:: 编译选项说明
-    :header: "选项", "说明", "默认值"
-    :widths: 1, 7, 2
-
-    "WITH_GPU", "是否支持GPU", "ON"
-    "WITH_C_API", "是否仅编译CAPI", "OFF"
-    "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
-    "WITH_DSO", "是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。", "ON"
-    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
-    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
-    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
-    "WITH_TESTING", "是否开启单元测试", "OFF"
-    "WITH_DOC", "是否编译中英文文档", "OFF"
-    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
-    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF"
-    "WITH_MKL", "是否使用MKL数学库，如果为否则是用OpenBLAS", "ON"
-
-BLAS
-+++++
-
-PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
-`OpenBLAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
-还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
-
-如果关闭MKL，则会使用OpenBLAS作为BLAS库。
-
-CUDA/cuDNN
-+++++++++++
-
-PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
-使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构，加速编译。
-
-PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cuDNN是同一个版本。
-我们推荐使用最新版本的cuDNN。
-
-编译选项的设置
-++++++++++++++
-
-PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如
-
-..  code-block:: bash
-
-    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
-
-**注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（** :code:`rm -rf` **）后，再指定。**
-
-.. _install_docker:
-
-使用Docker安装运行
-==================
-
-使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境。
-您可以在 `Docker官网 <https://docs.docker.com/get-started/>`_
-获得基本的Docker安装和使用方法。
-
-在了解Docker的基本使用方法之后，即可开始下面的步骤：
-
-.. _docker_pull:
-
-获取PaddlePaddle的Docker镜像
-""""""""""""""""""""""""""""
-
-执行下面的命令获取最新的PaddlePaddle Docker镜像，版本为cpu_avx_mkl：
-
-  .. code-block:: bash
-
-     docker pull paddlepaddle/paddle
-
-对于国内用户，我们提供了加速访问的镜像源：
-
-  .. code-block:: bash
-
-     docker pull docker.paddlepaddlehub.com/paddle
-
-下载GPU版本（cuda8.0_cudnn5_avx_mkl）的Docker镜像：
-
-  .. code-block:: bash
-
-     docker pull paddlepaddle/paddle:latest-gpu
-     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
-
-选择下载使用不同的BLAS库的Docker镜像：
-
-  .. code-block:: bash
-
-     # 默认是使用MKL的镜像
-     docker pull paddlepaddle/paddle
-     # 使用OpenBLAS的镜像
-     docker pull paddlepaddle/paddle:latest-openblas
-
-下载指定版本的Docker镜像，可以从 `DockerHub网站 <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 获取可选的tag，并执行下面的命令：
-
-  .. code-block:: bash
-
-     docker pull paddlepaddle/paddle:[tag]
-     # 比如：
-     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
-
-.. _docker_run:
-
-在Docker中执行PaddlePaddle训练程序
-"""""""""""""""""""""""""""""""""""
-
-假设您已经在当前目录（比如在/home/work）编写了一个PaddlePaddle的程序 :code:`train.py` （可以参考
-`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_
-编写），就可以使用下面的命令开始执行训练：
-
-  .. code-block:: bash
-
-     cd /home/work
-     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
-
-上述命令中， :code:`-it` 参数说明容器以交互式运行； :code:`-v $PWD:/work`
-指定将当前路径（Linux中$PWD变量会展开为当前路径的绝对路径）挂载到容器内部的 :code:`/work`
-目录； :code:`paddlepaddle/paddle` 指定需要使用的容器； 最后 :code:`/work/train.py`
-为容器内执行的命令，即运行训练程序。
-
-当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：
-
-  .. code-block:: bash
-     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
-     cd /work
-     python train.py
-
-**注：PaddlePaddle Docker镜像为了减小体积，默认没有安装vim，您可以在容器中执行** :code:`apt-get install -y vim` **安装后，在容器中编辑代码。**
-
-.. _docker_run_book:
-
-使用Docker启动PaddlePaddle Book教程
-""""""""""""""""""""""""""""""""""""
-
-使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook，可以通过网页浏览。
-PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
-如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
-大家可以通过它阅读教程，或者制作和分享带有代码、公式、图表、文字的交互式文档。
-
-我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
-
-  .. code-block:: bash
-
-     docker run -p 8888:8888 paddlepaddle/book
-
-国内用户可以使用下面的镜像源来加速访问：
-
-  .. code-block:: bash
-
-    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
-
-然后在浏览器中输入以下网址：
-
-  .. code-block:: text
-
-     http://localhost:8888/
-
-就这么简单，享受您的旅程！
-
-.. _docker_run_gpu:
-
-使用Docker执行GPU训练
-""""""""""""""""""""""""""""
-
-为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用
-`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ 来运行镜像。
-请不要忘记提前在物理机上安装GPU最新驱动。
-
-  .. code-block:: bash
-
-     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
-
-**注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**
-
-  .. code-block:: bash
-
-     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
-
-**关于AVX：**
-
-AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
-是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
-`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。
-
-以下指令能检查Linux电脑是否支持AVX：
-
-   .. code-block:: bash
-
-      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-如果输出是No，就需要选择使用no-AVX的镜像
diff --git a/source/beginners_guide/quick_start/fit_a_line/README.cn.md b/source/beginners_guide/quick_start/fit_a_line/README.cn.md
deleted file mode 100644
index ba43ada5100ed1db7192de9c795b4b8a6596d705..0000000000000000000000000000000000000000
--- a/source/beginners_guide/quick_start/fit_a_line/README.cn.md
+++ /dev/null
@@ -1,329 +0,0 @@
-```eval_rst
-..  _quick_start_fit_a_line:
-```
-# 线性回归
-让我们从经典的线性回归（Linear Regression \[[1](#参考文献)\]）模型开始这份教程。在这一章里，你将使用真实的数据集建立起一个房价预测模型，并且了解到机器学习中的若干重要概念。
-
-本教程源代码目录在[book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。
-
-## 背景介绍
-给定一个大小为`$n$`的数据集  `${\{y_{i}, x_{i1}, ..., x_{id}\}}_{i=1}^{n}$`，其中`$x_{i1}, \ldots, x_{id}$`是第`$i$`个样本`$d$`个属性上的取值，`$y_i$`是该样本待预测的目标。线性回归模型假设目标`$y_i$`可以被属性间的线性组合描述，即
-
-$$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b,  i=1,\ldots,n$$
-
-例如，在我们将要建模的房价预测问题里，`$x_{ij}$`是描述房子`$i$`的各种属性（比如房间的个数、周围学校和医院的个数、交通状况等），而 `$y_i$`是房屋的价格。
-
-初看起来，这个假设实在过于简单了，变量间的真实关系很难是线性的。但由于线性回归模型有形式简单和易于建模分析的优点，它在实际问题中得到了大量的应用。很多经典的统计学习、机器学习书籍\[[2,3,4](#参考文献)\]也选择对线性模型独立成章重点讲解。
-
-## 效果展示
-我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中，每个点的横坐标表示同一类房屋真实价格的中位数，纵坐标表示线性回归模型根据特征预测的结果，当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确，则点离虚线越近。
-
-![BostonHousePricePredictions](./image/predictions.png)
-<p align="center">图1. 预测值 V.S. 真实值</p>
-
-## 模型概览
-
-### 模型定义
-
-在波士顿房价数据集中，和房屋相关的值共有14个：前13个用来描述房屋相关的各种信息，即模型中的 `$x_i$`；最后一个值为我们要预测的该类房屋价格的中位数，即模型中的 `$y_i$`。因此，我们的模型就可以表示成：
-
-$$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$
-
-`$\hat{Y}$` 表示模型的预测结果，用来和真实值`$Y$`区分。模型要学习的参数即：`$\omega_1, \ldots, \omega_{13}, b$`。
-
-建立模型后，我们需要给模型一个优化目标，使得学到的参数能够让预测值`$\hat{Y}$`尽可能地接近真实值`$Y$`。这里我们引入损失函数（[Loss Function](https://en.wikipedia.org/wiki/Loss_function)，或Cost Function）这个概念。 输入任意一个数据样本的目标值`$y_{i}$`和模型给出的预测值`$\hat{y_{i}}$`，损失函数输出一个非负的实值。这个实值通常用来反映模型误差的大小。
-
-对于线性回归模型来讲，最常见的损失函数就是均方误差（Mean Squared Error， [MSE](https://en.wikipedia.org/wiki/Mean_squared_error)）了，它的形式是：
-
-$$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
-
-即对于一个大小为`$n$`的测试集，`$MSE$`是`$n$`个数据预测结果误差平方的均值。
-
-### 训练过程
-
-定义好模型结构之后，我们要通过以下几个步骤进行模型训练
-1. 初始化参数，其中包括权重`$\omega_i$`和偏置`$b$`，对其进行初始化（如0均值，1方差）。
-2. 网络正向传播计算网络输出和损失函数。
-3. 根据损失函数进行反向误差传播 （[backpropagation](https://en.wikipedia.org/wiki/Backpropagation)），将网络误差从输出层依次向前传递, 并更新网络中的参数。
-4. 重复2~3步骤，直至网络训练误差达到规定的程度或训练轮次达到设定值。
-
-## 数据集
-
-### 数据集介绍
-这份数据集共506行，每行包含了波士顿郊区的一类房屋的相关信息及该类房屋价格的中位数。其各维属性的意义如下：
-
-<p align="center">
-<table>
-    <thead>
-    <tr>
-        <th>属性名</th>
-        <th>解释</th>
-        <th>类型</th>
-    </tr>
-    </thead>
-    <tbody>
-    <tr>
-        <td>CRIM</td>
-        <td>该镇的人均犯罪率</td>
-        <td>连续值</td>
-    </tr>
-    <tr>
-        <td>ZN</td>
-        <td>占地面积超过25,000平方呎的住宅用地比例</td>
-        <td>连续值</td>
-    </tr>
-    <tr>
-        <td>INDUS</td>
-        <td>非零售商业用地比例</td>
-        <td>连续值</td>
-    </tr>
-    <tr>
-        <td>CHAS</td>
-        <td>是否邻近 Charles River</td>
-        <td>离散值，1=邻近；0=不邻近</td>
-    </tr>
-    <tr>
-        <td>NOX</td>
-        <td>一氧化氮浓度</td>
-        <td>连续值</td>
-    </tr>
-    <tr>
-        <td>RM</td>
-        <td>每栋房屋的平均客房数</td>
-        <td>连续值</td>
-    </tr>
-    <tr>
-        <td>AGE</td>
-        <td>1940年之前建成的自用单位比例</td>
-        <td>连续值</td>
-    </tr>
-    <tr>
-        <td>DIS</td>
-        <td>到波士顿5个就业中心的加权距离</td>
-        <td>连续值</td>
-    </tr>
-    <tr>
-        <td>RAD</td>
-        <td>到径向公路的可达性指数</td>
-        <td>连续值</td>
-    </tr>
-    <tr>
-        <td>TAX</td>
-        <td>全值财产税率</td>
-        <td>连续值</td>
-    </tr>
-    <tr>
-        <td>PTRATIO</td>
-        <td>学生与教师的比例</td>
-        <td>连续值</td>
-    </tr>
-    <tr>
-        <td>B</td>
-        <td>1000(BK - 0.63)^2，其中BK为黑人占比</td>
-        <td>连续值</td>
-    </tr>
-    <tr>
-        <td>LSTAT</td>
-        <td>低收入人群占比</td>
-        <td>连续值</td>
-    </tr>
-    <tr>
-        <td>MEDV</td>
-        <td>同类房屋价格的中位数</td>
-        <td>连续值</td>
-    </tr>
-    </tbody>
-</table>
-</p>
-
-### 数据预处理
-#### 连续值与离散值
-观察一下数据，我们的第一个发现是：所有的13维属性中，有12维的连续值和1维的离散值（CHAS）。离散值虽然也常使用类似0、1、2这样的数字表示，但是其含义与连续值是不同的，因为这里的差值没有实际意义。例如，我们用0、1、2来分别表示红色、绿色和蓝色的话，我们并不能因此说“蓝色和红色”比“绿色和红色”的距离更远。所以通常对一个有`$d$`个可能取值的离散属性，我们会将它们转为`$d$`个取值为0或1的二值属性或者将每个可能取值映射为一个多维向量。不过就这里而言，因为CHAS本身就是一个二值属性，就省去了这个麻烦。
-
-#### 属性的归一化
-另外一个稍加观察即可发现的事实是，各维属性的取值范围差别很大（如图2所示）。例如，属性B的取值范围是[0.32, 396.90]，而属性NOX的取值范围是[0.3850, 0.8170]。这里就要用到一个常见的操作-归一化（normalization）了。归一化的目标是把各维属性的取值范围放缩到差不多的区间，例如[-0.5,0.5]。这里我们使用一种很常见的操作方法：减掉均值，然后除以原取值范围。
-
-做归一化（或 [Feature scaling](https://en.wikipedia.org/wiki/Feature_scaling)）至少有以下3个理由：
-- 过大或过小的数值范围会导致计算时的浮点上溢或下溢。
-- 不同的数值范围会导致不同属性对模型的重要性不同（至少在训练的初始阶段如此），而这个隐含的假设常常是不合理的。这会对优化的过程造成困难，使训练时间大大的加长。
-- 很多的机器学习技巧/模型（例如L1，L2正则项，向量空间模型-Vector Space Model）都基于这样的假设：所有的属性取值都差不多是以0为均值且取值范围相近的。
-
-![featureScale](./image/ranges.png)
-<p align="center">图2. 各维属性的取值范围</p>
-
-#### 整理训练集与测试集
-我们将数据集分割为两份：一份用于调整模型的参数，即进行模型的训练，模型在这份数据集上的误差被称为**训练误差**；另外一份被用来测试，模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据，所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素：更多的训练数据会降低参数估计的方差，从而得到更可信的模型；而更多的测试数据会降低测试误差的方差，从而得到更可信的测试误差。我们这个例子中设置的分割比例为`$8:2$`。
-
-
-在更复杂的模型训练过程中，我们往往还会多使用一种数据集：验证集。因为复杂的模型中常常还有一些超参数（[Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization)）需要调节，所以我们会尝试多种超参数的组合来分别训练多个模型，然后对比它们在验证集上的表现选择相对最好的一组超参数，最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单，我们暂且忽略掉这个过程。
-
-## 训练
-
-`fit_a_line/trainer.py`演示了训练的整体过程。
-
-### 配置数据提供器(Datafeeder)
-首先我们引入必要的库：
-```python
-import paddle
-import paddle.fluid as fluid
-import numpy
-```
-
-我们通过uci_housing模块引入了数据集合[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)
-
-其中，在uci_housing模块中封装了：
-
-1. 数据下载的过程。下载数据保存在~/.cache/paddle/dataset/uci_housing/housing.data。
-2. [数据预处理](#数据预处理)的过程。
-
-接下来我们定义了用于训练和测试的数据提供器。提供器每次读入一个大小为`BATCH_SIZE`的数据批次。如果用户希望加一些随机性，她可以同时定义一个批次大小和一个缓存大小。这样的话，每次数据提供器会从缓存中随机读取批次大小那么多的数据。
-
-```python
-BATCH_SIZE = 20
-
-train_reader = paddle.batch(
-paddle.reader.shuffle(
-paddle.dataset.uci_housing.train(), buf_size=500),
-batch_size=BATCH_SIZE)
-
-test_reader = paddle.batch(
-paddle.reader.shuffle(
-paddle.dataset.uci_housing.test(), buf_size=500),
-batch_size=BATCH_SIZE)
-```
-
-### 配置训练程序
-训练程序的目的是定义一个训练模型的网络结构。对于线性回归来讲，它就是一个从输入到输出的简单的全连接层。更加复杂的结构，比如卷积神经网络，递归神经网络等会在随后的章节中介绍。训练程序必须返回`平均损失`作为第一个返回值，因为它会被后面反向传播算法所用到。
-
-```python
-def train_program():
-y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-# feature vector of length 13
-x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-y_predict = fluid.layers.fc(input=x, size=1, act=None)
-
-loss = fluid.layers.square_error_cost(input=y_predict, label=y)
-avg_loss = fluid.layers.mean(loss)
-
-return avg_loss
-```
-
-### 定义运算场所
-我们可以定义运算是发生在CPU还是GPU
-
-```python
-use_cuda = False
-place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-```
-
-### 创建训练器
-训练器会读入一个训练程序和一些必要的其他参数：
-
-```python
-trainer = fluid.Trainer(
-train_func=train_program,
-place=place,
-optimizer_func=fluid.optimizer.SGD(learning_rate=0.001))
-```
-
-### 开始提供数据
-PaddlePaddle提供了读取数据者发生器机制来读取训练数据。读取数据者会一次提供多列数据，因此我们需要一个Python的list来定义读取顺序。
-
-```python
-feed_order=['x', 'y']
-```
-
-除此之外，可以定义一个事件响应器来处理类似`打印训练进程`的事件：
-
-```python
-# Specify the directory path to save the parameters
-params_dirname = "fit_a_line.inference.model"
-
-# Plot data
-from paddle.v2.plot import Ploter
-train_title = "Train cost"
-test_title = "Test cost"
-plot_cost = Ploter(train_title, test_title)
-
-step = 0
-
-# event_handler to print training and testing info
-def event_handler_plot(event):
-global step
-if isinstance(event, fluid.EndStepEvent):
-if event.step % 10 == 0: # every 10 batches, record a test cost
-test_metrics = trainer.test(
-reader=test_reader, feed_order=feed_order)
-
-plot_cost.append(test_title, step, test_metrics[0])
-plot_cost.plot()
-
-if test_metrics[0] < 10.0:
-# If the accuracy is good enough, we can stop the training.
-print('loss is less than 10.0, stop')
-trainer.stop()
-
-# We can save the trained parameters for the inferences later
-if params_dirname is not None:
-trainer.save_params(params_dirname)
-
-step += 1
-```
-
-### 开始训练
-我们现在可以通过调用`trainer.train()`来开始训练
-
-```python
-%matplotlib inline
-
-# The training could take up to a few minutes.
-trainer.train(
-reader=train_reader,
-num_epochs=100,
-event_handler=event_handler_plot,
-feed_order=feed_order)
-```
-
-![trainTestCost](./image/train_and_test.png)
-
-## 预测
-提供一个`inference_program`和一个`params_dirname`来初始化预测器。`params_dirname`用来存储我们的参数。
-
-### 设定预测程序
-类似于`trainer.train`，预测器需要一个预测程序来做预测。我们可以稍加修改我们的训练程序来把预测值包含进来。
-
-
-```python
-def inference_program():
-x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-y_predict = fluid.layers.fc(input=x, size=1, act=None)
-return y_predict
-```
-
-### 预测
-预测器会从`params_dirname`中读取已经训练好的模型，来对从未遇见过的数据进行预测。
-
-```python
-inferencer = fluid.Inferencer(
-infer_func=inference_program, param_path=params_dirname, place=place)
-
-batch_size = 10
-tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
-
-results = inferencer.infer({'x': tensor_x})
-print("infer results: ", results[0])
-```
-
-## 总结
-在这章里，我们借助波士顿房价这一数据集，介绍了线性回归模型的基本概念，以及如何使用PaddlePaddle实现训练和测试的过程。很多的模型和技巧都是从简单的线性回归模型演化而来，因此弄清楚线性模型的原理和局限非常重要。
-
-
-## 参考文献
-1. https://en.wikipedia.org/wiki/Linear_regression
-2. Friedman J, Hastie T, Tibshirani R. The elements of statistical learning[M]. Springer, Berlin: Springer series in statistics, 2001.
-3. Murphy K P. Machine learning: a probabilistic perspective[M]. MIT press, 2012.
-4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128.
-
-<br/>
-<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/source/beginners_guide/quick_start/recognize_digits/README.cn.md b/source/beginners_guide/quick_start/recognize_digits/README.cn.md
deleted file mode 100644
index 71d64339d8633f7113df682e509b988ec06edf23..0000000000000000000000000000000000000000
--- a/source/beginners_guide/quick_start/recognize_digits/README.cn.md
+++ /dev/null
@@ -1,430 +0,0 @@
-# 识别数字
-
-本教程源代码目录在[book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)，更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/167.html)。
-
-## 背景介绍
-当我们学习编程的时候，编写的第一个程序一般是实现打印"Hello World"。而机器学习（或深度学习）的入门教程，一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题，比较简单，同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集，包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵，标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。
-
-![MNIST](./image/mnist_example_image.png)
-<p align="center">图1. MNIST图片示例</p>
-
-MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3（SD-3）和Special Database 1（SD-1）构建而来。由于SD-3是由美国人口调查局的员工进行标注，SD-1是由美国高中生进行标注，因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集（60000条数据）和测试集（10000条数据），其中训练集来自250位不同的标注员，此外还保证了训练集和测试集的标注员是不完全相同的。
-
-Yann LeCun早先在手写字符识别上做了很多研究，并在研究过程中提出了卷积神经网络（Convolutional Neural Network），大幅度地提高了手写字符的识别能力，也因此成为了深度学习领域的奠基人之一。如今的深度学习领域，卷积神经网络占据了至关重要的地位，从最早Yann LeCun提出的简单LeNet，到如今ImageNet大赛上的优胜模型VGGNet、GoogLeNet、ResNet等（请参见[图像分类](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification) 教程），人们在图像分类领域，利用卷积神经网络得到了一系列惊人的结果。
-
-有很多算法在MNIST上进行实验。1998年，LeCun分别用单层线性分类器、多层感知器（Multilayer Perceptron, MLP）和多层卷积神经网络LeNet进行实验，使得测试集上的误差不断下降（从12%下降到0.7%）\[[1](#参考文献)\]。此后，科学家们又基于K近邻（K-Nearest Neighbors）算法\[[2](#参考文献)\]、支持向量机（SVM）\[[3](#参考文献)\]、神经网络\[[4-7](#参考文献)\]和Boosting方法\[[8](#参考文献)\]等做了大量实验，并采用多种预处理方法（如去除歪曲、去噪、模糊等）来提高识别的准确率。
-
-本教程中，我们从简单的模型Softmax回归开始，带大家入门手写字符识别，并逐步进行模型优化。
-
-
-## 模型概览
-
-基于MNIST数据训练一个分类器，在介绍本教程使用的三个基本图像分类网络前，我们先给出一些定义：
-- `$X$`是输入：MNIST图片是`$28\times28$` 的二维图像，为了进行计算，我们将其转化为`$784$`维向量，即`$X=\left ( x_0, x_1, \dots, x_{783} \right )$`。
-- `$Y$`是输出：分类器的输出是10类数字（0-9），即`$Y=\left ( y_0, y_1, \dots, y_9 \right )$`，每一维`$y_i$`代表图片分类为第`$i$`类数字的概率。
-- `$L$`是图片的真实标签：`$L=\left ( l_0, l_1, \dots, l_9 \right )$`也是10维，但只有一维为1，其他都为0。
-
-### Softmax回归(Softmax Regression)
-
-最简单的Softmax回归模型是先将输入层经过一个全连接层得到的特征，然后直接通过softmax 函数进行多分类\[[9](#参考文献)\]。
-
-输入层的数据`$X$`传到输出层，在激活操作之前，会乘以相应的权重 `$W$` ，并加上偏置变量 `$b$` ，具体如下：
-
-$$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$
-
-其中 `$ \text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}} $`
-
-对于有 `$N$` 个类别的多分类问题，指定 `$N$` 个输出节点，`$N$` 维结果向量经过softmax将归一化为 `$N$` 个[0,1]范围内的实数值，分别表示该样本属于这 `$N$` 个类别的概率。此处的 `$y_i$` 即对应该图片为数字 `$i$` 的预测概率。
-
-在分类问题中，我们一般采用交叉熵代价损失函数（cross entropy），公式如下：
-
-$$  \text{crossentropy}(label, y) = -\sum_i label_ilog(y_i) $$
-
-图2为softmax回归的网络图，图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
-
-![softmaxRegression](./image/softmax_regression.png)
-<p align="center">图2. softmax回归网络结构图</p>
-
-### 多层感知器(Multilayer Perceptron, MLP)
-
-Softmax回归模型采用了最简单的两层神经网络，即只有输入层和输出层，因此其拟合能力有限。为了达到更好的识别效果，我们考虑在输入层和输出层中间加上若干个隐藏层\[[10](#参考文献)\]。
-
-1.  经过第一个隐藏层，可以得到 `$ H_1 = \phi(W_1X + b_1) $`，其中`$\phi$`代表激活函数，常见的有sigmoid、tanh或ReLU等函数。
-2.  经过第二个隐藏层，可以得到 `$ H_2 = \phi(W_2H_1 + b_2) $`。
-3.  最后，再经过输出层，得到的`$Y=\text{softmax}(W_3H_2 + b_3)$`，即为最后的分类结果向量。
-
-
-图3为多层感知器的网络结构图，图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
-
-![multilayerPerceptron](./image/mlp.png)
-<p align="center">图3. 多层感知器网络结构图</p>
-
-### 卷积神经网络(Convolutional Neural Network, CNN)
-
-在多层感知器模型中，将图像展开成一维向量输入到网络中，忽略了图像的位置和结构信息，而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构：输入的二维图像，先经过两次卷积层到池化层，再经过全连接层，最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。
-
-![cnnStructure](./image/cnn.png)
-<p align="center">图4. LeNet-5卷积神经网络结构</p>
-
-#### 卷积层
-
-卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积，即离散二维滤波器（也称作卷积核）与二维图像做卷积操作，简单的讲是二维滤波器滑动到二维图像上所有位置，并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域，不同卷积核可以提取不同的特征，例如边沿、线性、角等特征。在深层卷积神经网络中，通过卷积操作可以提取出图像低级到复杂的特征。
-
-![cnn](./image/conv_layer.png)
-<p align="center">图5. 卷积层图片</p>
-
-图5给出一个卷积计算过程的示例图，输入图像大小为`$H=5,W=5,D=3$`，即`$5 \times 5$`大小的3通道（RGB，也称作深度）彩色图像。这个示例图中包含两（用`$K$`表示）组卷积核，即图中滤波器`$W_0$`和`$W_1$`。在卷积计算中，通常对不同的输入通道采用不同的卷积核，如图示例中每组卷积核包含（`$D=3$`）个`$3 \times 3$`（用`$F \times F$`表示）大小的卷积核。另外，这个示例中卷积核在图像的水平方向（`$W$`方向）和垂直方向（`$H$`方向）的滑动步长为2（用`$S$`表示）；对输入图像周围各填充1（用`$P$`表示）个0，即图中输入层原始数据为蓝色部分，灰色部分是进行了大小为1的扩展，用0来进行扩展。经过卷积操作得到输出为`$3 \times 3 \times 2$`（用`$H_{o} \times W_{o} \times K$`表示）大小的特征图，即`$3 \times 3$`大小的2通道特征图，其中`$H_o$`计算公式为：`$H_o = (H - F + 2 \times P)/S + 1$`，`$W_o$`同理。 而输出特征图中的每个像素，是每组滤波器与输入图像每个特征图的内积再求和，再加上偏置`$b_o$`，偏置通常对于每个输出特征图是共享的。输出特征图`$o[:,:,0]$`中的最后一个`$-2$`计算如图5右下角公式所示。
-
-在卷积操作中卷积核是可学习的参数，经过上面示例介绍，每层卷积的参数大小为`$D \times F \times F \times K$`。在多层感知器模型中，神经元通常是全部连接，参数较多。而卷积层的参数较少，这也是由卷积层的主要特性即局部连接和共享权重所决定。
-
-- 局部连接：每个神经元仅与输入神经元的一块区域连接，这块局部区域称作感受野（receptive field）。在图像卷积操作中，即神经元在空间维度（spatial dimension，即上图示例H和W所在的平面）是局部连接，但在深度上是全部连接。对于二维图像本身而言，也是局部像素关联较强。这种局部连接保证了学习后的过滤器能够对于局部的输入特征有最强的响应。局部连接的思想，也是受启发于生物学里面的视觉系统结构，视觉皮层的神经元就是局部接受信息的。
-
-- 权重共享：计算同一个深度切片的神经元时采用的滤波器是共享的。例如图4中计算`$o[:,:,0]$`的每个每个神经元的滤波器均相同，都为`$W_0$`，这样可以很大程度上减少参数。共享权重在一定程度上讲是有意义的，例如图片的底层边缘特征与特征在图中的具体位置无关。但是在一些场景中是无意的，比如输入的图片是人脸，眼睛和头发位于不同的位置，希望在不同的位置学到不同的特征 (参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/))。请注意权重只是对于同一深度切片的神经元是共享的，在卷积层，通常采用多组卷积核提取不同特征，即对应不同深度切片的特征，不同深度切片的神经元权重是不共享。另外，偏重对同一深度切片的所有神经元都是共享的。
-
-通过介绍卷积计算过程及其特性，可以看出卷积是线性操作，并具有平移不变性（shift-invariant），平移不变性即在图像每个位置执行相同的操作。卷积层的局部连接和权重共享使得需要学习的参数大大减小，这样也有利于训练较大卷积神经网络。
-
-#### 池化层
-
-![pooling](./image/max_pooling.png)
-<p align="center">图6. 池化层图片</p>
-
-池化是非线性下采样的一种形式，主要作用是通过减少网络的参数来减小计算量，并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域，对于每个矩形框的数取最大值作为输出层，如图6所示。
-
-更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。
-
-### 常见激活函数介绍
-- sigmoid激活函数： `$ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $`
-
-- tanh激活函数： `$ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $`
-
-实际上，tanh函数只是规模变化的sigmoid函数，将sigmoid函数值放大2倍之后再向下平移1个单位：tanh(x) = 2sigmoid(2x) - 1 。
-
-- ReLU激活函数： `$ f(x) = max(0, x) $`
-
-更详细的介绍请参考[维基百科激活函数](https://en.wikipedia.org/wiki/Activation_function)。
-
-## 数据介绍
-
-PaddlePaddle在API中提供了自动加载[MNIST](http://yann.lecun.com/exdb/mnist/)数据的模块`paddle.dataset.mnist`。加载后的数据位于`/home/username/.cache/paddle/dataset/mnist`下：
-
-| 文件名称                | 说明                       |
-|-------------------------|----------------------------|
-| train-images-idx3-ubyte | 训练数据图片，60,000条数据 |
-| train-labels-idx1-ubyte | 训练数据标签，60,000条数据 |
-| t10k-images-idx3-ubyte  | 测试数据图片，10,000条数据 |
-| t10k-labels-idx1-ubyte  | 测试数据标签，10,000条数据 |
-
-## Fluid API 概述
-
-演示将使用最新的 `Fluid API`。Fluid API是最新的 PaddlePaddle API。它在不牺牲性能的情况下简化了模型配置。
-我们建议使用 Fluid API，因为它更容易学起来。
-
-下面是快速的 Fluid API 概述。
-1. `inference_program`：指定如何从数据输入中获得预测的函数。
-这是指定网络流的地方。
-
-1. `train_program`：指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。
-这是指定损失计算的地方。
-
-1. `optimizer_func`: “指定优化器配置的函数。优化器负责减少损失并驱动培训。Paddle 支持多种不同的优化器。
-
-1. `Trainer`：PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。
-通过 `event_handler` 回调函数，用户可以监控培训的进展。
-
-1. `Inferencer`：Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。
-然后，它可以推断数据和返回预测。
-
-在这个演示中，我们将深入了解它们。
-
-## 配置说明
-加载 PaddlePaddle 的 Fluid API 包。
-
-```python
-import paddle
-import paddle.fluid as fluid
-```
-
-### Program Functions 配置
-
-我们需要设置“推理程序”函数。我们想用这个程序来演示三个不同的分类器，每个分类器都定义为 Python 函数。
-我们需要将图像数据馈送到分类器。Paddle 为读取数据提供了一个特殊的层 `layer.data` 层。
-让我们创建一个数据层来读取图像并将其连接到分类网络。
-
-- Softmax回归：只通过一层简单的以softmax为激活函数的全连接层，就可以得到分类的结果。
-
-```python
-def softmax_regression():
-img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
-predict = fluid.layers.fc(
-input=img, size=10, act='softmax')
-return predict
-```
-
-- 多层感知器：下面代码实现了一个含有两个隐藏层（即全连接层）的多层感知器。其中两个隐藏层的激活函数均采用ReLU，输出层的激活函数用Softmax。
-
-```python
-def multilayer_perceptron():
-img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
-# 第一个全连接层，激活函数为ReLU
-hidden = fluid.layers.fc(input=img, size=200, act='relu')
-# 第二个全连接层，激活函数为ReLU
-hidden = fluid.layers.fc(input=hidden, size=200, act='relu')
-# 以softmax为激活函数的全连接输出层，输出层的大小必须为数字的个数10
-prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
-return prediction
-```
-
-- 卷积神经网络LeNet-5: 输入的二维图像，首先经过两次卷积层到池化层，再经过全连接层，最后使用以softmax为激活函数的全连接层作为输出层。
-
-```python
-def convolutional_neural_network():
-img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
-# 第一个卷积-池化层
-conv_pool_1 = fluid.nets.simple_img_conv_pool(
-input=img,
-filter_size=5,
-num_filters=20,
-pool_size=2,
-pool_stride=2,
-act="relu")
-conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
-# 第二个卷积-池化层
-conv_pool_2 = fluid.nets.simple_img_conv_pool(
-input=conv_pool_1,
-filter_size=5,
-num_filters=50,
-pool_size=2,
-pool_stride=2,
-act="relu")
-# 以softmax为激活函数的全连接输出层，输出层的大小必须为数字的个数10
-prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
-return prediction
-```
-
-#### Train Program 配置
-然后我们需要设置训练程序 `train_program`。它首先从分类器中进行预测。
-在训练期间，它将从预测中计算 `avg_cost`。
-
-**注意:** 训练程序应该返回一个数组，第一个返回参数必须是 `avg_cost`。训练器使用它来计算梯度。
-
-请随意修改代码，测试 Softmax 回归 `softmax_regression`, `MLP` 和 卷积神经网络 `convolutional neural network` 分类器之间的不同结果。
-
-```python
-def train_program():
-label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-# predict = softmax_regression() # uncomment for Softmax回归
-# predict = multilayer_perceptron() # uncomment for 多层感知器
-predict = convolutional_neural_network() # uncomment for LeNet5卷积神经网络
-cost = fluid.layers.cross_entropy(input=predict, label=label)
-avg_cost = fluid.layers.mean(cost)
-acc = fluid.layers.accuracy(input=predict, label=label)
-return [avg_cost, acc]
-
-
-# 该模型运行在单个CPU上
-```
-
-#### Optimizer Function 配置
-
-在下面的 `Adam optimizer`，`learning_rate` 是训练的速度，与网络的训练收敛速度有关系。
-
-```python
-def optimizer_program():
-return fluid.optimizer.Adam(learning_rate=0.001)
-```
-
-### 数据集 Feeders 配置
-
-下一步，我们开始训练过程。`paddle.dataset.movielens.train()`和`paddle.dataset.movielens.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数，每次调用的时候返回一个Python yield generator。
-
-下面`shuffle`是一个reader decorator，它接受一个reader A，返回另一个reader B —— reader B 每次读入`buffer_size`条训练数据到一个buffer里，然后随机打乱其顺序，并且逐条输出。
-
-`batch`是一个特殊的decorator，它的输入是一个reader，输出是一个batched reader —— 在PaddlePaddle里，一个reader每次yield一条训练数据，而一个batched reader每次yield一个minibatch。
-
-```python
-train_reader = paddle.batch(
-paddle.reader.shuffle(
-paddle.dataset.mnist.train(), buf_size=500),
-batch_size=64)
-
-test_reader = paddle.batch(
-paddle.dataset.mnist.test(), batch_size=64)
-```
-
-### Trainer 配置
-
-现在，我们需要配置 `Trainer`。`Trainer` 需要接受训练程序 `train_program`, `place` 和优化器 `optimizer`。
-
-```python
-# 该模型运行在单个CPU上
-use_cuda = False # set to True if training with GPU
-place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-
-trainer = fluid.Trainer(
-train_func=train_program, place=place, optimizer_func=optimizer_program)
-```
-
-#### Event Handler 配置
-
-Fluid API 在训练期间为回调函数提供了一个钩子。用户能够通过机制监控培训进度。
-我们将在这里演示两个 `event_handler` 程序。请随意修改 Jupyter 笔记本 ，看看有什么不同。
-
-`event_handler` 用来在训练过程中输出训练结果
-
-```python
-# Save the parameter into a directory. The Inferencer can load the parameters from it to do infer
-params_dirname = "recognize_digits_network.inference.model"
-lists = []
-def event_handler(event):
-if isinstance(event, fluid.EndStepEvent):
-if event.step % 100 == 0:
-# event.metrics maps with train program return arguments.
-# event.metrics[0] will yeild avg_cost and event.metrics[1] will yeild acc in this example.
-print "Pass %d, Batch %d, Cost %f" % (
-event.step, event.epoch, event.metrics[0])
-
-if isinstance(event, fluid.EndEpochEvent):
-avg_cost, acc = trainer.test(
-reader=test_reader, feed_order=['img', 'label'])
-
-print("Test with Epoch %d, avg_cost: %s, acc: %s" % (event.epoch, avg_cost, acc))
-
-# save parameters
-trainer.save_params(params_dirname)
-lists.append((event.epoch, avg_cost, acc))
-```
-
-`event_handler_plot` 可以用来在训练过程中画图如下：
-
-![png](./image/train_and_test.png)
-
-```python
-from paddle.v2.plot import Ploter
-
-train_title = "Train cost"
-test_title = "Test cost"
-cost_ploter = Ploter(train_title, test_title)
-step = 0
-lists = []
-
-# event_handler to plot a figure
-def event_handler_plot(event):
-global step
-if isinstance(event, fluid.EndStepEvent):
-if step % 100 == 0:
-# event.metrics maps with train program return arguments.
-# event.metrics[0] will yeild avg_cost and event.metrics[1] will yeild acc in this example.
-cost_ploter.append(train_title, step, event.metrics[0])
-cost_ploter.plot()
-step += 1
-if isinstance(event, fluid.EndEpochEvent):
-# save parameters
-trainer.save_params(params_dirname)
-
-avg_cost, acc = trainer.test(
-reader=test_reader, feed_order=['img', 'label'])
-cost_ploter.append(test_title, step, avg_cost)
-lists.append((event.epoch, avg_cost, acc))
-```
-
-#### 开始训练
-
-既然我们设置了 `event_handler` 和 `data reader`，我们就可以开始训练模型了。
-
-`feed_order` 用于将数据目录映射到 `train_program`
-
-```python
-trainer.train(
-num_epochs=5,
-event_handler=event_handler,
-reader=train_reader,
-feed_order=['img', 'label'])
-```
-
-训练过程是完全自动的，event_handler里打印的日志类似如下所示：
-
-```
-Pass 0, Batch 0, Cost 0.125650
-Pass 100, Batch 0, Cost 0.161387
-Pass 200, Batch 0, Cost 0.040036
-Pass 300, Batch 0, Cost 0.023391
-Pass 400, Batch 0, Cost 0.005856
-Pass 500, Batch 0, Cost 0.003315
-Pass 600, Batch 0, Cost 0.009977
-Pass 700, Batch 0, Cost 0.020959
-Pass 800, Batch 0, Cost 0.105560
-Pass 900, Batch 0, Cost 0.239809
-Test with Epoch 0, avg_cost: 0.053097883707459624, acc: 0.9822850318471338
-```
-
-训练之后，检查模型的预测准确度。用 MNIST 训练的时候，一般 softmax回归模型的分类准确率为约为 92.34%，多层感知器为97.66%，卷积神经网络可以达到 99.20%。
-
-
-## 应用模型
-
-可以使用训练好的模型对手写体数字图片进行分类，下面程序展示了如何使用 `fluid.Inferencer` 接口进行推断。
-
-### Inference 配置
-
-`Inference` 需要一个 `infer_func` 和 `param_path` 来设置网络和经过训练的参数。
-我们可以简单地插入在此之前定义的分类器。
-
-```python
-inferencer = fluid.Inferencer(
-# infer_func=softmax_regression, # uncomment for softmax regression
-# infer_func=multilayer_perceptron, # uncomment for MLP
-infer_func=convolutional_neural_network,  # uncomment for LeNet5
-param_path=params_dirname,
-place=place)
-```
-
-### 生成预测输入数据
-
-`infer_3.png` 是数字 3 的一个示例图像。把它变成一个 numpy 数组以匹配数据馈送格式。
-
-```python
-# Prepare the test image
-import os
-import numpy as np
-from PIL import Image
-def load_image(file):
-im = Image.open(file).convert('L')
-im = im.resize((28, 28), Image.ANTIALIAS)
-im = np.array(im).reshape(1, 1, 28, 28).astype(np.float32)
-im = im / 255.0 * 2.0 - 1.0
-return im
-
-cur_dir = cur_dir = os.getcwd()
-img = load_image(cur_dir + '/image/infer_3.png')
-```
-
-### 预测
-
-现在我们准备做预测。
-
-```python
-results = inferencer.infer({'img': img})
-lab = np.argsort(results)  # probs and lab are the results of one batch data
-print "Label of image/infer_3.png is: %d" % lab[0][0][-1]
-```
-
-## 总结
-
-本教程的softmax回归、多层感知器和卷积神经网络是最基础的深度学习模型，后续章节中复杂的神经网络都是从它们衍生出来的，因此这几个模型对之后的学习大有裨益。同时，我们也观察到从最简单的softmax回归变换到稍复杂的卷积神经网络的时候，MNIST数据集上的识别准确率有了大幅度的提升，原因是卷积层具有局部连接和共享权重的特性。在之后学习新模型的时候，希望大家也要深入到新模型相比原模型带来效果提升的关键之处。此外，本教程还介绍了PaddlePaddle模型搭建的基本流程，从dataprovider的编写、网络层的构建，到最后的训练和预测。对这个流程熟悉以后，大家就可以用自己的数据，定义自己的网络模型，并完成自己的训练和预测任务了。
-
-## 参考文献
-
-1. LeCun, Yann, Léon Bottou, Yoshua Bengio, and Patrick Haffner. ["Gradient-based learning applied to document recognition."](http://ieeexplore.ieee.org/abstract/document/726791/) Proceedings of the IEEE 86, no. 11 (1998): 2278-2324.
-2. Wejéus, Samuel. ["A Neural Network Approach to Arbitrary SymbolRecognition on Modern Smartphones."](http://www.diva-portal.org/smash/record.jsf?pid=diva2%3A753279&dswid=-434) (2014).
-3. Decoste, Dennis, and Bernhard Schölkopf. ["Training invariant support vector machines."](http://link.springer.com/article/10.1023/A:1012454411458) Machine learning 46, no. 1-3 (2002): 161-190.
-4. Simard, Patrice Y., David Steinkraus, and John C. Platt. ["Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.8494&rep=rep1&type=pdf) In ICDAR, vol. 3, pp. 958-962. 2003.
-5. Salakhutdinov, Ruslan, and Geoffrey E. Hinton. ["Learning a Nonlinear Embedding by Preserving Class Neighbourhood Structure."](http://www.jmlr.org/proceedings/papers/v2/salakhutdinov07a/salakhutdinov07a.pdf) In AISTATS, vol. 11. 2007.
-6. Cireşan, Dan Claudiu, Ueli Meier, Luca Maria Gambardella, and Jürgen Schmidhuber. ["Deep, big, simple neural nets for handwritten digit recognition."](http://www.mitpressjournals.org/doi/abs/10.1162/NECO_a_00052) Neural computation 22, no. 12 (2010): 3207-3220.
-7. Deng, Li, Michael L. Seltzer, Dong Yu, Alex Acero, Abdel-rahman Mohamed, and Geoffrey E. Hinton. ["Binary coding of speech spectrograms using a deep auto-encoder."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.185.1908&rep=rep1&type=pdf) In Interspeech, pp. 1692-1695. 2010.
-8. Kégl, Balázs, and Róbert Busa-Fekete. ["Boosting products of base classifiers."](http://dl.acm.org/citation.cfm?id=1553439) In Proceedings of the 26th Annual International Conference on Machine Learning, pp. 497-504. ACM, 2009.
-9. Rosenblatt, Frank. ["The perceptron: A probabilistic model for information storage and organization in the brain."](http://psycnet.apa.org/journals/rev/65/6/386/) Psychological review 65, no. 6 (1958): 386.
-10. Bishop, Christopher M. ["Pattern recognition."](http://users.isr.ist.utl.pt/~wurmd/Livros/school/Bishop%20-%20Pattern%20Recognition%20And%20Machine%20Learning%20-%20Springer%20%202006.pdf) Machine Learning 128 (2006): 1-58.
-
-<br/>
-<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span> 由 <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作，采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
diff --git a/source/beginners_guide/quick_start/recognize_digits/image/conv_layer.png b/source/beginners_guide/quick_start/recognize_digits/image/conv_layer.png
deleted file mode 100644
index c751892ba0be3ae803b5933c3f33487ecfb6fe7f..0000000000000000000000000000000000000000
Binary files a/source/beginners_guide/quick_start/recognize_digits/image/conv_layer.png and /dev/null differ
diff --git a/source/conf.py b/source/conf.py
deleted file mode 100644
index 42b24c810e72301a7d626665e3fb96ed68a4377a..0000000000000000000000000000000000000000
--- a/source/conf.py
+++ /dev/null
@@ -1,314 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# PaddlePaddle Fluid documentation build configuration file, created by
-# sphinx-quickstart on Thu Jun  7 17:04:53 2018.
-#
-# This file is execfile()d with the current directory set to its
-# containing dir.
-#
-# Note that not all possible configuration values are present in this
-# autogenerated file.
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
-
-import sys
-import os
-import shlex
-from recommonmark.parser import CommonMarkParser
-from recommonmark.transform import AutoStructify
-import paddle.fluid
-import sphinx.ext.napoleon
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
-
-# -- General configuration ------------------------------------------------
-
-# If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.mathjax',
-    'sphinx.ext.viewcode',
-    'sphinx.ext.todo',
-    'sphinx_markdown_tables',
-    'sphinx.ext.napoleon',
-]
-
-# napoleon configuration
-napoleon_use_admonition_for_examples = True
-napoleon_use_admonition_for_notes = True
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-source_parsers = {
-    '.md': CommonMarkParser,
-}
-
-source_suffix = ['.rst', '.md']
-
-# The encoding of source files.
-#source_encoding = 'utf-8-sig'
-
-# The master toctree document.
-master_doc = 'index'
-
-# General information about the project.
-project = u'PaddlePaddle Fluid'
-copyright = u'2018, paddle-dev@baidu.com'
-author = u'paddle-dev@baidu.com'
-
-# The version info for the project you're documenting, acts as replacement for
-# |version| and |release|, also used in various other places throughout the
-# built documents.
-#
-# The short X.Y version.
-version = '0.14.0'
-# The full version, including alpha/beta/rc tags.
-release = '0.14.0'
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = 'zh_CN'
-# i18n
-locale_dirs = ['locale/']   # path is example but recommended.
-gettext_compact = False     # optional.
-
-# There are two options for replacing |today|: either, you set today to some
-# non-false value, then it is used:
-#today = ''
-# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-exclude_patterns = []
-
-# The reST default role (used for this markup: `text`) to use for all
-# documents.
-#default_role = None
-
-# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
-
-# If true, the current module name will be prepended to all description
-# unit titles (such as .. function::).
-#add_module_names = True
-
-# If true, sectionauthor and moduleauthor directives will be shown in the
-# output. They are ignored by default.
-#show_authors = False
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
-
-# A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
-
-# If true, keep warnings as "system message" paragraphs in the built documents.
-#keep_warnings = False
-
-# If true, `todo` and `todoList` produce output, else they produce nothing.
-todo_include_todos = False
-
-
-# -- Options for HTML output ----------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-html_theme = 'sphinx_rtd_theme'
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further.  For a list of options available for each theme, see the
-# documentation.
-#html_theme_options = {}
-
-# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
-
-# The name for this set of Sphinx documents.  If None, it defaults to
-# "<project> v<release> documentation".
-#html_title = None
-
-# A shorter title for the navigation bar.  Default is the same as html_title.
-#html_short_title = None
-
-# The name of an image file (relative to this directory) to place at the top
-# of the sidebar.
-#html_logo = None
-
-# The name of an image file (within the static path) to use as favicon of the
-# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
-# pixels large.
-#html_favicon = None
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
-
-# Add any extra paths that contain custom files (such as robots.txt or
-# .htaccess) here, relative to this directory. These files are copied
-# directly to the root of the documentation.
-#html_extra_path = []
-
-# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
-# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
-
-# If true, SmartyPants will be used to convert quotes and dashes to
-# typographically correct entities.
-#html_use_smartypants = True
-
-# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
-
-# Additional templates that should be rendered to pages, maps page names to
-# template names.
-#html_additional_pages = {}
-
-# If false, no module index is generated.
-#html_domain_indices = True
-
-# If false, no index is generated.
-#html_use_index = True
-
-# If true, the index is split into individual pages for each letter.
-#html_split_index = False
-
-# If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
-
-# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
-
-# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
-
-# If true, an OpenSearch description file will be output, and all pages will
-# contain a <link> tag referring to it.  The value of this option must be the
-# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
-
-# This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
-
-# Language to be used for generating the HTML full-text search index.
-# Sphinx supports the following languages:
-#   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
-#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
-#html_search_language = 'en'
-
-# A dictionary with options for the search language support, empty by default.
-# Now only 'ja' uses this config value
-#html_search_options = {'type': 'default'}
-
-# The name of a javascript file (relative to the configuration directory) that
-# implements a search results scorer. If empty, the default will be used.
-#html_search_scorer = 'scorer.js'
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'PaddlePaddleFluiddoc'
-
-# -- Options for LaTeX output ---------------------------------------------
-
-latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
-
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
-
-# Latex figure (float) alignment
-#'figure_align': 'htbp',
-}
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title,
-#  author, documentclass [howto, manual, or own class]).
-latex_documents = [
-  (master_doc, 'PaddlePaddleFluid.tex', u'PaddlePaddle Fluid Documentation',
-   u'paddle-dev@baidu.com', 'manual'),
-]
-
-# The name of an image file (relative to this directory) to place at the top of
-# the title page.
-#latex_logo = None
-
-# For "manual" documents, if this is true, then toplevel headings are parts,
-# not chapters.
-#latex_use_parts = False
-
-# If true, show page references after internal links.
-#latex_show_pagerefs = False
-
-# If true, show URL addresses after external links.
-#latex_show_urls = False
-
-# Documents to append as an appendix to all manuals.
-#latex_appendices = []
-
-# If false, no module index is generated.
-#latex_domain_indices = True
-
-
-# -- Options for manual page output ---------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'paddlepaddlefluid', u'PaddlePaddle Fluid Documentation',
-     [author], 1)
-]
-
-# If true, show URL addresses after external links.
-#man_show_urls = False
-
-
-# -- Options for Texinfo output -------------------------------------------
-
-# Grouping the document tree into Texinfo files. List of tuples
-# (source start file, target name, title, author,
-#  dir menu entry, description, category)
-texinfo_documents = [
-  (master_doc, 'PaddlePaddleFluid', u'PaddlePaddle Fluid Documentation',
-   author, 'PaddlePaddleFluid', 'One line description of project.',
-   'Miscellaneous'),
-]
-
-# Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
-
-# If false, no module index is generated.
-#texinfo_domain_indices = True
-
-# How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
-
-# If true, do not generate a @detailmenu in the "Top" node's menu.
-#texinfo_no_detailmenu = False
-
-def setup(app):
-    app.add_config_value('recommonmark_config', {
-            'auto_toc_tree_section': 'Contents',
-            'enable_inline_math': True,
-            'enable_eval_rst': True,
-            'enable_math': True
-            }, True)
-    app.add_transform(AutoStructify)
diff --git a/source/index.rst b/source/index.rst
deleted file mode 100644
index d3c56d918fbb39a4447dc9bf057f4f57ebf5b7c3..0000000000000000000000000000000000000000
--- a/source/index.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-.. PaddlePaddle Fluid documentation master file, created by
-   sphinx-quickstart on Thu Jun  7 17:04:53 2018.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
-##############
-欢迎来到 Fluid
-##############
-
-..  todo::
-    内容简介，导引
-
-
-..  toctree::
-    :maxdepth: 1
-
-    beginners_guide/index.rst
-    user_guides/index.rst
-    advanced_usage/index.rst
-    api_guides/index.rst
-    api_reference/index.rst
-    faq.rst
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
-
diff --git a/source/locale/en/LC_MESSAGES/advanced_usage/deploy/index.po b/source/locale/en/LC_MESSAGES/advanced_usage/deploy/index.po
deleted file mode 100644
index f3401c178c6d8504e8871f1c26741dc6956a3f1d..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/advanced_usage/deploy/index.po
+++ /dev/null
@@ -1,32 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/advanced_usage/deploy/index.rst:3
-msgid "预测部署"
-msgstr ""
-
-#: ../../source/advanced_usage/deploy/index.rst:6
-msgid "服务端"
-msgstr ""
-
-#: ../../source/advanced_usage/deploy/index.rst:10
-msgid "移动端"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/advanced_usage/development/index.po b/source/locale/en/LC_MESSAGES/advanced_usage/development/index.po
deleted file mode 100644
index 35f9edb9f81ee28242cea91c17c08417ff7f3dfb..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/advanced_usage/development/index.po
+++ /dev/null
@@ -1,44 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/advanced_usage/development/index.rst:3
-msgid "如何开发PaddlePaddle"
-msgstr ""
-
-#: ../../source/advanced_usage/development/index.rst:7
-msgid "如何贡献代码"
-msgstr ""
-
-#: ../../source/advanced_usage/development/index.rst:10
-msgid "如何贡献文档"
-msgstr ""
-
-#: ../../source/advanced_usage/development/index.rst:13
-msgid "如何写新的operator"
-msgstr ""
-
-#: ../../source/advanced_usage/development/index.rst:16
-msgid "CPU性能调优"
-msgstr ""
-
-#: ../../source/advanced_usage/development/index.rst:19
-msgid "GPU性能调优"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/advanced_usage/index.po b/source/locale/en/LC_MESSAGES/advanced_usage/index.po
deleted file mode 100644
index 3104ea279ccafe5992c1fb0bd924152723fcb696..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/advanced_usage/index.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/advanced_usage/index.rst:3
-msgid "进阶使用"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_guides/high_level/index.po b/source/locale/en/LC_MESSAGES/api_guides/high_level/index.po
deleted file mode 100644
index 608cc3cd5f8de296cbf4d830075d6202d3b25924..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_guides/high_level/index.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_guides/high_level/index.rst:3
-msgid "High level API"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_guides/index.po b/source/locale/en/LC_MESSAGES/api_guides/index.po
deleted file mode 100644
index 3a92dcfa3c0c65736f139b0c2fc79b978c850d00..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_guides/index.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_guides/index.rst:3
-msgid "API Guide"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_guides/low_level/executor/executor.po b/source/locale/en/LC_MESSAGES/api_guides/low_level/executor/executor.po
deleted file mode 100644
index 6e65164a5ab0d82f2eb8ed111918665658c1603c..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_guides/low_level/executor/executor.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_guides/low_level/executor/executor.rst:3
-msgid "Executor"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_guides/low_level/executor/parallel_executor.po b/source/locale/en/LC_MESSAGES/api_guides/low_level/executor/parallel_executor.po
deleted file mode 100644
index ad04ba662ecffd2c3f5253cf7500b0ba6dc50780..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_guides/low_level/executor/parallel_executor.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_guides/low_level/executor/parallel_executor.rst:3
-msgid "ParallelExecutor"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_guides/low_level/index.po b/source/locale/en/LC_MESSAGES/api_guides/low_level/index.po
deleted file mode 100644
index 2a07e33d681ee7f0ee5eacca1cf6d394b912c54b..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_guides/low_level/index.po
+++ /dev/null
@@ -1,88 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_guides/low_level/index.rst:3
-msgid "Low level API"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:6
-msgid "Layers"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:8
-msgid "神经网络的主体API是一些层函数，他们包括"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:23
-msgid "执行引擎"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:31
-msgid "数据读取"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:34
-msgid "参数属性与参数初始化(ParamAttr)"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:38
-msgid "预测引擎"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:41
-msgid "Program/Block/Variable"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:44
-msgid "Scope"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:47
-msgid "CreateOperator"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:50
-msgid "Backward"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:53
-msgid "模型平均(Model Average)"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:56
-msgid "Optimizers"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:59
-msgid "正则化"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:62
-msgid "Transpiler"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:65
-msgid "Gradient Clipping"
-msgstr ""
-
-#: ../../source/api_guides/low_level/index.rst:68
-msgid "调试工具/VisualDL"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/activations.po b/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/activations.po
deleted file mode 100644
index 6a3003541aa5e8430611fa41c540bb448f4f64e7..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/activations.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_guides/low_level/layers/activations.rst:3
-msgid "激活函数"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/convolution.po b/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/convolution.po
deleted file mode 100644
index 406fcbdf9a79adcfb132cc3c1fdbab80e30af175..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/convolution.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_guides/low_level/layers/convolution.rst:3
-msgid "卷积操作"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/detection.po b/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/detection.po
deleted file mode 100644
index f5a0fc47d482ca6a56b542bfe1d028c2b71237fa..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/detection.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_guides/low_level/layers/detection.rst:3
-msgid "图像检测"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/io.po b/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/io.po
deleted file mode 100644
index 953893d569fe1f332aa4d6f7d2aa545d77fdc0ca..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/io.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_guides/low_level/layers/io.rst:3
-msgid "输入输出"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/math.po b/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/math.po
deleted file mode 100644
index e6c1472bd51c82ba8582251953e27561420aa88d..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/math.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_guides/low_level/layers/math.rst:3
-msgid "数学算子"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/metrics.po b/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/metrics.po
deleted file mode 100644
index 570cdeb05e82165fdfeb6ccc4f2219ee0afbd9cb..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/metrics.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_guides/low_level/layers/metrics.rst:3
-msgid "评价指标"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/pooling.po b/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/pooling.po
deleted file mode 100644
index 66b0abe3db23f5e354efb0c22e0d8b6638a88731..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/pooling.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_guides/low_level/layers/pooling.rst:3
-msgid "池化操作"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/preprocessing.po b/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/preprocessing.po
deleted file mode 100644
index 470bd9fdb17f0f1b7b6cdc1cc9281649b7867244..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_guides/low_level/layers/preprocessing.po
+++ /dev/null
@@ -1,32 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_guides/low_level/layers/preprocessing.rst:3
-msgid "预处理操作"
-msgstr ""
-
-#: ../../source/api_guides/low_level/layers/preprocessing.rst:8
-msgid "图像预处理操作"
-msgstr ""
-
-#: ../../source/api_guides/low_level/layers/preprocessing.rst:12
-msgid "语音预处理操作"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/clip.po b/source/locale/en/LC_MESSAGES/api_reference/clip.po
deleted file mode 100644
index b94536fc5571f6ec13feb817b5fbab8157279590..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/clip.po
+++ /dev/null
@@ -1,51 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/clip.rst:6
-msgid "fluid.clip"
-msgstr ""
-
-#: ../../source/api_reference/clip.rst:11
-msgid "ErrorClipByValue"
-msgstr ""
-
-#: ../../source/api_reference/clip.rst:20
-msgid "GradientClipByValue"
-msgstr ""
-
-#: ../../source/api_reference/clip.rst:29
-msgid "GradientClipByNorm"
-msgstr ""
-
-#: ../../source/api_reference/clip.rst:38
-msgid "GradientClipByGlobalNorm"
-msgstr ""
-
-#: ../../source/api_reference/clip.rst:47
-msgid "append_gradient_clip_ops"
-msgstr ""
-
-#: ../../source/api_reference/clip.rst:55
-msgid "error_clip_callback"
-msgstr ""
-
-#~ msgid "clip"
-#~ msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/data/data_reader.po b/source/locale/en/LC_MESSAGES/api_reference/data/data_reader.po
deleted file mode 100644
index 3c943d0639e6cbb44f253b436f8aaba38fc1bd43..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/data/data_reader.po
+++ /dev/null
@@ -1,546 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/data/data_reader.rst:3 of paddle.reader:14
-msgid "Data Reader Interface"
-msgstr ""
-
-#: ../../source/api_reference/data/data_reader.rst:7
-msgid "DataTypes"
-msgstr ""
-
-#: of paddle.v2.data_type.dense_array:1
-msgid ""
-"Dense Array. It means the input feature is dense array with float type. "
-"For example, if the input is an image with 28*28 pixels, the input of "
-"Paddle neural network could be a dense vector with dimension 784 or a "
-"numpy array with shape (28, 28)."
-msgstr ""
-
-#: of paddle.v2.data_type.dense_array:6
-msgid ""
-"For the 2-D convolution operation, each sample in one mini-batch must "
-"have the similarly size in PaddlePaddle now. But, it supports variable-"
-"dimension feature across mini-batch. For the variable-dimension, the "
-"param dim is not used. While the data reader must yield numpy array and "
-"the data feeder will set the data shape correctly."
-msgstr ""
-
-#: of paddle.reader.buffered paddle.reader.chain paddle.reader.compose
-#: paddle.reader.creator.np_array paddle.reader.firstn
-#: paddle.reader.map_readers paddle.reader.shuffle
-#: paddle.v2.data_type.InputType paddle.v2.data_type.dense_array
-#: paddle.v2.data_type.integer_value paddle.v2.data_type.integer_value_sequence
-#: paddle.v2.data_type.sparse_binary_vector
-#: paddle.v2.data_type.sparse_binary_vector_sequence
-#: paddle.v2.data_type.sparse_float_vector
-#: paddle.v2.data_type.sparse_float_vector_sequence
-#: paddle.v2.data_type.sparse_non_value_slot
-#: paddle.v2.data_type.sparse_value_slot paddle.v2.minibatch.batch
-msgid "参数"
-msgstr ""
-
-#: of paddle.v2.data_type.dense_array:12
-#: paddle.v2.data_type.sparse_binary_vector:4
-#: paddle.v2.data_type.sparse_float_vector:4
-#: paddle.v2.data_type.sparse_non_value_slot:4
-#: paddle.v2.data_type.sparse_value_slot:4
-msgid "dimension of this vector."
-msgstr ""
-
-#: of paddle.v2.data_type.dense_array:14
-msgid "sequence type of input."
-msgstr ""
-
-#: of paddle.reader.buffered paddle.reader.chain paddle.reader.compose
-#: paddle.reader.creator.np_array paddle.reader.creator.recordio
-#: paddle.reader.creator.text_file paddle.reader.firstn
-#: paddle.reader.map_readers paddle.reader.shuffle
-#: paddle.v2.data_type.dense_array paddle.v2.data_type.integer_value
-#: paddle.v2.data_type.sparse_binary_vector
-#: paddle.v2.data_type.sparse_binary_vector_sequence
-#: paddle.v2.data_type.sparse_float_vector
-#: paddle.v2.data_type.sparse_float_vector_sequence
-#: paddle.v2.data_type.sparse_non_value_slot
-#: paddle.v2.data_type.sparse_value_slot paddle.v2.minibatch.batch
-msgid "返回"
-msgstr ""
-
-#: of paddle.v2.data_type.dense_array:16
-#: paddle.v2.data_type.sparse_binary_vector:8
-#: paddle.v2.data_type.sparse_float_vector:8
-#: paddle.v2.data_type.sparse_non_value_slot:8
-#: paddle.v2.data_type.sparse_value_slot:8
-msgid "An input type object."
-msgstr ""
-
-#: of paddle.reader.chain paddle.reader.firstn paddle.reader.map_readers
-#: paddle.reader.shuffle paddle.v2.data_type.dense_array
-#: paddle.v2.data_type.integer_value paddle.v2.data_type.sparse_binary_vector
-#: paddle.v2.data_type.sparse_binary_vector_sequence
-#: paddle.v2.data_type.sparse_float_vector
-#: paddle.v2.data_type.sparse_float_vector_sequence
-#: paddle.v2.data_type.sparse_non_value_slot
-#: paddle.v2.data_type.sparse_value_slot paddle.v2.minibatch.batch
-msgid "返回类型"
-msgstr ""
-
-#: of paddle.v2.data_type.integer_value:1
-msgid "Data type of integer."
-msgstr ""
-
-#: of paddle.v2.data_type.integer_value:3
-#: paddle.v2.data_type.sparse_binary_vector:6
-#: paddle.v2.data_type.sparse_float_vector:6
-#: paddle.v2.data_type.sparse_non_value_slot:6
-#: paddle.v2.data_type.sparse_value_slot:6
-msgid "sequence type of this input."
-msgstr ""
-
-#: of paddle.v2.data_type.integer_value:5
-msgid "range of this integer."
-msgstr ""
-
-#: of paddle.v2.data_type.integer_value:7
-#: paddle.v2.data_type.sparse_binary_vector_sequence:6
-#: paddle.v2.data_type.sparse_float_vector_sequence:6
-msgid "An input type object"
-msgstr ""
-
-#: of paddle.v2.data_type.integer_value_sequence:1
-msgid "Data type of a sequence of integer."
-msgstr ""
-
-#: of paddle.v2.data_type.integer_value_sequence:3
-msgid "range of each element."
-msgstr ""
-
-#: of paddle.v2.data_type.sparse_binary_vector:1
-#: paddle.v2.data_type.sparse_non_value_slot:1
-msgid ""
-"Sparse binary vector. It means the input feature is a sparse vector and "
-"the every element in this vector is either zero or one."
-msgstr ""
-
-#: of paddle.v2.data_type.sparse_binary_vector_sequence:2
-msgid ""
-"Data type of a sequence of sparse vector, which every element is either "
-"zero"
-msgstr ""
-
-#: of paddle.v2.data_type.sparse_binary_vector_sequence:2
-msgid "or one."
-msgstr ""
-
-#: of paddle.v2.data_type.sparse_binary_vector_sequence:4
-#: paddle.v2.data_type.sparse_float_vector_sequence:4
-msgid "dimension of sparse vector."
-msgstr ""
-
-#: of paddle.v2.data_type.sparse_float_vector:1
-#: paddle.v2.data_type.sparse_value_slot:1
-msgid ""
-"Sparse vector. It means the input feature is a sparse vector. Most of the"
-" elements in this vector are zero, others could be any float value."
-msgstr ""
-
-#: of paddle.v2.data_type.sparse_float_vector_sequence:1
-msgid ""
-"Data type of a sequence of sparse vector, which most elements are zero, "
-"others could be any float value."
-msgstr ""
-
-#: of paddle.v2.data_type.InputType:1
-msgid "InputType is the base class for paddle input types."
-msgstr ""
-
-#: of paddle.v2.data_type.InputType:5
-msgid "this is a base class, and should never be used by user."
-msgstr ""
-
-#: of paddle.v2.data_type.InputType:7
-msgid ""
-"dimension of input. If the input is an integer, it means the value range."
-" Otherwise, it means the size of layer."
-msgstr ""
-
-#: of paddle.v2.data_type.InputType:10
-msgid ""
-"sequence type of input. 0 means it is not a sequence. 1 means it is a "
-"variable length sequence. 2 means it is a nested sequence."
-msgstr ""
-
-#: of paddle.v2.data_type.InputType:14
-msgid "data type of input."
-msgstr ""
-
-#: ../../source/api_reference/data/data_reader.rst:50
-msgid "DataFeeder"
-msgstr ""
-
-#: ../../source/api_reference/data/data_reader.rst:57
-msgid "Reader"
-msgstr ""
-
-#: of paddle.reader:1
-msgid ""
-"At training and testing time, PaddlePaddle programs need to read data. To"
-" ease the users' work to write data reading code, we define that"
-msgstr ""
-
-#: of paddle.reader:4
-msgid ""
-"A *reader* is a function that reads data (from file, network, random "
-"number generator, etc) and yields data items."
-msgstr ""
-
-#: of paddle.reader:6
-msgid "A *reader creator* is a function that returns a reader function."
-msgstr ""
-
-#: of paddle.reader:7
-msgid ""
-"A *reader decorator* is a function, which accepts one or more readers, "
-"and returns a reader."
-msgstr ""
-
-#: of paddle.reader:9
-msgid ""
-"A *batch reader* is a function that reads data (from *reader*, file, "
-"network, random number generator, etc) and yields a batch of data items."
-msgstr ""
-
-#: of paddle.reader:16
-msgid ""
-"Indeed, *data reader* doesn't have to be a function that reads and yields"
-" data items. It can be any function with no parameter that creates a "
-"iterable (anything can be used in :code:`for x in iterable`)\\:"
-msgstr ""
-
-#: of paddle.reader:24
-msgid ""
-"Element produced from the iterable should be a **single** entry of data, "
-"**not** a mini batch. That entry of data could be a single item, or a "
-"tuple of items. Item should be of `supported type "
-"<http://www.paddlepaddle.org/doc/ui/data_provider "
-"/pydataprovider2.html?highlight=dense_vector#input-types>`_ (e.g., numpy "
-"1d array of float32, int, list of int)"
-msgstr ""
-
-#: of paddle.reader:31
-msgid "An example implementation for single item data reader creator:"
-msgstr ""
-
-#: of paddle.reader:41
-msgid "An example implementation for multiple item data reader creator:"
-msgstr ""
-
-#: of paddle.reader:52
-msgid "TODO(yuyang18): Should we add whole design doc here?"
-msgstr ""
-
-#: of paddle.reader.map_readers:1
-msgid ""
-"Creates a data reader that outputs return value of function using output "
-"of each data readers as arguments."
-msgstr ""
-
-#: of paddle.reader.map_readers:4
-msgid "function to use. The type of func should be (Sample) => Sample"
-msgstr ""
-
-#: of paddle.reader.map_readers:6
-msgid "readers whose outputs will be used as arguments of func."
-msgstr ""
-
-#: of paddle.reader.map_readers
-msgid "type"
-msgstr ""
-
-#: of paddle.reader.map_readers:5
-msgid "callable"
-msgstr ""
-
-#: of paddle.reader.map_readers:7
-msgid "the created data reader."
-msgstr ""
-
-#: of paddle.reader.buffered:1
-msgid "Creates a buffered data reader."
-msgstr ""
-
-#: of paddle.reader.buffered:3
-msgid ""
-"The buffered data reader will read and save data entries into a buffer. "
-"Reading from the buffered data reader will proceed as long as the buffer "
-"is not empty."
-msgstr ""
-
-#: of paddle.reader.buffered:7 paddle.reader.firstn:3
-#: paddle.v2.minibatch.batch:3
-msgid "the data reader to read from."
-msgstr ""
-
-#: of paddle.reader.buffered:9
-msgid "max buffer size."
-msgstr ""
-
-#: of paddle.reader.buffered:12
-msgid "the buffered data reader."
-msgstr ""
-
-#: of paddle.reader.compose:1
-msgid "Creates a data reader whose output is the combination of input readers."
-msgstr ""
-
-#: of paddle.reader.compose:3
-msgid ""
-"If input readers output following data entries: (1, 2)    3    (4, 5) The"
-" composed reader will output: (1, 2, 3, 4, 5)"
-msgstr ""
-
-#: of paddle.reader.compose:8
-msgid "readers that will be composed together."
-msgstr ""
-
-#: of paddle.reader.compose:9
-msgid ""
-"if True, will check if input readers are aligned correctly. If False, "
-"will not check alignment and trailing outputs will be discarded. Defaults"
-" to True."
-msgstr ""
-
-#: of paddle.reader.chain:12 paddle.reader.compose:14
-msgid "the new data reader."
-msgstr ""
-
-#: of paddle.reader.compose
-msgid "引发"
-msgstr ""
-
-#: of paddle.reader.compose:16
-msgid ""
-"outputs of readers are not aligned. Will not raise when check_alignment "
-"is set to False."
-msgstr ""
-
-#: of paddle.reader.chain:1
-msgid ""
-"Creates a data reader whose output is the outputs of input data readers "
-"chained together."
-msgstr ""
-
-#: of paddle.reader.chain:4
-msgid ""
-"If input readers output following data entries: [0, 0, 0] [1, 1, 1] [2, "
-"2, 2] The chained reader will output: [0, 0, 0, 1, 1, 1, 2, 2, 2]"
-msgstr ""
-
-#: of paddle.reader.chain:11
-msgid "input readers."
-msgstr ""
-
-#: of paddle.reader.shuffle:1
-msgid "Creates a data reader whose data output is shuffled."
-msgstr ""
-
-#: of paddle.reader.shuffle:3
-msgid ""
-"Output from the iterator that created by original reader will be buffered"
-" into shuffle buffer, and then shuffled. The size of shuffle buffer is "
-"determined by argument buf_size."
-msgstr ""
-
-#: of paddle.reader.shuffle:7
-msgid "the original reader whose output will be shuffled."
-msgstr ""
-
-#: of paddle.reader.shuffle:9
-msgid "shuffle buffer size."
-msgstr ""
-
-#: of paddle.reader.shuffle:12
-msgid "the new reader whose output is shuffled."
-msgstr ""
-
-#: of paddle.reader.firstn:1
-msgid "Limit the max number of samples that reader could return."
-msgstr ""
-
-#: of paddle.reader.firstn:5
-msgid "the max number of samples that return."
-msgstr ""
-
-#: of paddle.reader.firstn:7
-msgid "the decorated reader."
-msgstr ""
-
-#: of paddle.reader.xmap_readers:1
-msgid ""
-"Use multiprocess to map samples from reader by a mapper defined by user. "
-"And this function contains a buffered decorator. :param mapper:  a "
-"function to map sample. :type mapper: callable :param reader: the data "
-"reader to read from :type reader: callable :param process_num: process "
-"number to handle original sample :type process_num: int :param "
-"buffer_size: max buffer size :type buffer_size: int :param order: keep "
-"the order of reader :type order: bool :return: the decarated reader "
-":rtype: callable"
-msgstr ""
-
-#: of paddle.reader.PipeReader:1
-msgid ""
-"PipeReader read data by stream from a command, take it's stdout into a "
-"pipe buffer and redirect it to the parser to parse, then yield data as "
-"your desired format."
-msgstr ""
-
-#: of paddle.reader.PipeReader:5
-msgid ""
-"You can using standard linux command or call another program to read "
-"data, from HDFS, Ceph, URL, AWS S3 etc:"
-msgstr ""
-
-#: of paddle.reader.PipeReader:14
-msgid "An example:"
-msgstr ""
-
-#: of paddle.reader.PipeReader.get_line
-msgid "param cut_lines"
-msgstr ""
-
-#: of paddle.reader.PipeReader.get_line:1
-msgid "cut buffer to lines"
-msgstr ""
-
-#: of paddle.reader.PipeReader.get_line
-msgid "type cut_lines"
-msgstr ""
-
-#: of paddle.reader.PipeReader.get_line:2
-msgid "bool"
-msgstr ""
-
-#: of paddle.reader.PipeReader.get_line
-msgid "param line_break"
-msgstr ""
-
-#: of paddle.reader.PipeReader.get_line:3
-msgid "line break of the file, like"
-msgstr ""
-
-#: of paddle.reader.PipeReader.get_line:7
-msgid "or"
-msgstr ""
-
-#: of paddle.reader.PipeReader.get_line
-msgid "type line_break"
-msgstr ""
-
-#: of paddle.reader.PipeReader.get_line:5 paddle.reader.PipeReader.get_line:8
-msgid "string"
-msgstr ""
-
-#: of paddle.reader.PipeReader.get_line
-msgid "return"
-msgstr ""
-
-#: of paddle.reader.PipeReader.get_line:7
-msgid "one line or a buffer of bytes"
-msgstr ""
-
-#: of paddle.reader.PipeReader.get_line
-msgid "rtype"
-msgstr ""
-
-#: of paddle.reader.creator:1
-msgid ""
-"Creator package contains some simple reader creator, which could be used "
-"in user program."
-msgstr ""
-
-#: of paddle.reader.creator.np_array:1
-msgid ""
-"Creates a reader that yields elements of x, if it is a numpy vector. Or "
-"rows of x, if it is a numpy matrix. Or any sub-hyperplane indexed by the "
-"highest dimension."
-msgstr ""
-
-#: of paddle.reader.creator.np_array:5
-msgid "the numpy array to create reader from."
-msgstr ""
-
-#: of paddle.reader.creator.np_array:6
-msgid "data reader created from x."
-msgstr ""
-
-#: of paddle.reader.creator.text_file:1
-msgid ""
-"Creates a data reader that outputs text line by line from given text "
-"file. Trailing new line ('\\\\n') of each line will be removed."
-msgstr ""
-
-#: of paddle.reader.creator.recordio paddle.reader.creator.text_file
-msgid "path"
-msgstr ""
-
-#: of paddle.reader.creator.text_file:4
-msgid "path of the text file."
-msgstr ""
-
-#: of paddle.reader.creator.text_file:5
-msgid "data reader of text file"
-msgstr ""
-
-#: of paddle.reader.creator.recordio:1
-msgid "Creates a data reader from given RecordIO file paths separated by \",\","
-msgstr ""
-
-#: of paddle.reader.creator.recordio:2
-msgid "glob pattern is supported."
-msgstr ""
-
-#: of paddle.reader.creator.recordio:3
-msgid "path of recordio files, can be a string or a string list."
-msgstr ""
-
-#: of paddle.reader.creator.recordio:4
-msgid "data reader of recordio files."
-msgstr ""
-
-#: ../../source/api_reference/data/data_reader.rst:68
-msgid "minibatch"
-msgstr ""
-
-#: of paddle.v2.minibatch.batch:1
-msgid "Create a batched reader."
-msgstr ""
-
-#: of paddle.v2.minibatch.batch:5
-msgid "size of each mini-batch"
-msgstr ""
-
-#: of paddle.v2.minibatch.batch:7
-msgid "drop the last batch, if the size of last batch is not equal to batch_size."
-msgstr ""
-
-#: of paddle.v2.minibatch.batch:9
-msgid "the batched reader."
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/data/dataset.po b/source/locale/en/LC_MESSAGES/api_reference/data/dataset.po
deleted file mode 100644
index 8db2cfd5f80ac9177247c99ddfdabbdd19ba687a..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/data/dataset.po
+++ /dev/null
@@ -1,577 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/data/dataset.rst:2
-msgid "Dataset"
-msgstr ""
-
-#: of paddle.dataset:1
-msgid "Dataset package."
-msgstr ""
-
-#: ../../source/api_reference/data/dataset.rst:9
-msgid "mnist"
-msgstr ""
-
-#: of paddle.dataset.mnist:1
-msgid "MNIST dataset."
-msgstr ""
-
-#: of paddle.dataset.mnist:3
-msgid ""
-"This module will download dataset from http://yann.lecun.com/exdb/mnist/ "
-"and parse training set and test set into paddle reader creators."
-msgstr ""
-
-#: of paddle.dataset.mnist.train:1
-msgid "MNIST training set creator."
-msgstr ""
-
-#: of paddle.dataset.cifar.test10:3 paddle.dataset.cifar.test100:3
-#: paddle.dataset.cifar.train10:3 paddle.dataset.mnist.test:3
-#: paddle.dataset.mnist.train:3
-msgid ""
-"It returns a reader creator, each sample in the reader is image pixels in"
-" [0, 1] and label in [0, 9]."
-msgstr ""
-
-#: of paddle.dataset.cifar.test10 paddle.dataset.cifar.test100
-#: paddle.dataset.cifar.train10 paddle.dataset.cifar.train100
-#: paddle.dataset.conll05.test paddle.dataset.imdb.test
-#: paddle.dataset.imdb.train paddle.dataset.imikolov.test
-#: paddle.dataset.imikolov.train paddle.dataset.mnist.test
-#: paddle.dataset.mnist.train paddle.dataset.uci_housing.test
-#: paddle.dataset.uci_housing.train paddle.dataset.wmt14.test
-#: paddle.dataset.wmt14.train paddle.dataset.wmt16.get_dict
-#: paddle.dataset.wmt16.test paddle.dataset.wmt16.train
-#: paddle.dataset.wmt16.validation
-msgid "返回"
-msgstr ""
-
-#: of paddle.dataset.cifar.train10:6 paddle.dataset.cifar.train100:6
-#: paddle.dataset.conll05.test:8 paddle.dataset.imdb.train:8
-#: paddle.dataset.imikolov.train:12 paddle.dataset.mnist.train:6
-#: paddle.dataset.uci_housing.train:6 paddle.dataset.wmt14.train:7
-msgid "Training reader creator"
-msgstr ""
-
-#: of paddle.dataset.cifar.test10 paddle.dataset.cifar.test100
-#: paddle.dataset.cifar.train10 paddle.dataset.cifar.train100
-#: paddle.dataset.conll05.test paddle.dataset.imdb.test
-#: paddle.dataset.imdb.train paddle.dataset.imikolov.test
-#: paddle.dataset.imikolov.train paddle.dataset.mnist.test
-#: paddle.dataset.mnist.train paddle.dataset.uci_housing.test
-#: paddle.dataset.uci_housing.train paddle.dataset.wmt14.test
-#: paddle.dataset.wmt14.train paddle.dataset.wmt16.get_dict
-#: paddle.dataset.wmt16.test paddle.dataset.wmt16.train
-#: paddle.dataset.wmt16.validation
-msgid "返回类型"
-msgstr ""
-
-#: of paddle.dataset.mnist.test:1
-msgid "MNIST test set creator."
-msgstr ""
-
-#: of paddle.dataset.cifar.test10:6 paddle.dataset.cifar.test100:6
-#: paddle.dataset.mnist.test:6
-msgid "Test reader creator."
-msgstr ""
-
-#: of paddle.dataset.cifar.convert:1 paddle.dataset.imdb.convert:1
-#: paddle.dataset.imikolov.convert:1 paddle.dataset.mnist.convert:1
-#: paddle.dataset.movielens.convert:1 paddle.dataset.sentiment.convert:1
-#: paddle.dataset.wmt14.convert:1
-msgid "Converts dataset to recordio format"
-msgstr ""
-
-#: ../../source/api_reference/data/dataset.rst:16
-msgid "cifar"
-msgstr ""
-
-#: of paddle.dataset.cifar:1
-msgid "CIFAR dataset."
-msgstr ""
-
-#: of paddle.dataset.cifar:3
-msgid ""
-"This module will download dataset from "
-"https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into"
-" paddle reader creators."
-msgstr ""
-
-#: of paddle.dataset.cifar:7
-msgid ""
-"The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,"
-" with 6000 images per class. There are 50000 training images and 10000 "
-"test images."
-msgstr ""
-
-#: of paddle.dataset.cifar:11
-msgid ""
-"The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 "
-"classes containing 600 images each. There are 500 training images and 100"
-" testing images per class."
-msgstr ""
-
-#: of paddle.dataset.cifar.train100:1
-msgid "CIFAR-100 training set creator."
-msgstr ""
-
-#: of paddle.dataset.cifar.train100:3
-msgid ""
-"It returns a reader creator, each sample in the reader is image pixels in"
-" [0, 1] and label in [0, 99]."
-msgstr ""
-
-#: of paddle.dataset.cifar.test100:1
-msgid "CIFAR-100 test set creator."
-msgstr ""
-
-#: of paddle.dataset.cifar.train10:1
-msgid "CIFAR-10 training set creator."
-msgstr ""
-
-#: of paddle.dataset.cifar.test10:1
-msgid "CIFAR-10 test set creator."
-msgstr ""
-
-#: ../../source/api_reference/data/dataset.rst:23
-msgid "conll05"
-msgstr ""
-
-#: of paddle.dataset.conll05:1
-msgid ""
-"Conll05 dataset. Paddle semantic role labeling Book and demo use this "
-"dataset as an example. Because Conll05 is not free in public, the default"
-" downloaded URL is test set of Conll05 (which is public). Users can "
-"change URL and MD5 to their Conll dataset. And a pre-trained word vector "
-"model based on Wikipedia corpus is used to initialize SRL model."
-msgstr ""
-
-#: of paddle.dataset.conll05.get_dict:1
-msgid "Get the word, verb and label dictionary of Wikipedia corpus."
-msgstr ""
-
-#: of paddle.dataset.conll05.get_embedding:1
-msgid "Get the trained word vector based on Wikipedia corpus."
-msgstr ""
-
-#: of paddle.dataset.conll05.test:1
-msgid "Conll05 test set creator."
-msgstr ""
-
-#: of paddle.dataset.conll05.test:3
-msgid ""
-"Because the training dataset is not free, the test dataset is used for "
-"training. It returns a reader creator, each sample in the reader is nine "
-"features, including sentence sequence, predicate, predicate context, "
-"predicate context flag and tagged sequence."
-msgstr ""
-
-#: ../../source/api_reference/data/dataset.rst:30
-msgid "imdb"
-msgstr ""
-
-#: of paddle.dataset.imdb:1
-msgid "IMDB dataset."
-msgstr ""
-
-#: of paddle.dataset.imdb:3
-#, python-format
-msgid ""
-"This module downloads IMDB dataset from "
-"http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a "
-"set of 25,000 highly polar movie reviews for training, and 25,000 for "
-"testing. Besides, this module also provides API for building dictionary."
-msgstr ""
-
-#: of paddle.dataset.imdb.build_dict:1
-msgid ""
-"Build a word dictionary from the corpus. Keys of the dictionary are "
-"words, and values are zero-based IDs of these words."
-msgstr ""
-
-#: of paddle.dataset.imdb.train:1
-msgid "IMDB training set creator."
-msgstr ""
-
-#: of paddle.dataset.imdb.test:3 paddle.dataset.imdb.train:3
-msgid ""
-"It returns a reader creator, each sample in the reader is an zero-based "
-"ID sequence and label in [0, 1]."
-msgstr ""
-
-#: of paddle.dataset.imdb.test paddle.dataset.imdb.train
-#: paddle.dataset.imikolov.test paddle.dataset.imikolov.train
-#: paddle.dataset.wmt16.get_dict paddle.dataset.wmt16.test
-#: paddle.dataset.wmt16.train paddle.dataset.wmt16.validation
-msgid "参数"
-msgstr ""
-
-#: of paddle.dataset.imdb.test:6 paddle.dataset.imdb.train:6
-#: paddle.dataset.imikolov.test:6 paddle.dataset.imikolov.train:6
-msgid "word dictionary"
-msgstr ""
-
-#: of paddle.dataset.imdb.test:1
-msgid "IMDB test set creator."
-msgstr ""
-
-#: of paddle.dataset.imdb.test:8 paddle.dataset.imikolov.test:12
-#: paddle.dataset.uci_housing.test:6 paddle.dataset.wmt14.test:7
-msgid "Test reader creator"
-msgstr ""
-
-#: ../../source/api_reference/data/dataset.rst:37
-msgid "imikolov"
-msgstr ""
-
-#: of paddle.dataset.imikolov:1
-msgid "imikolov's simple dataset."
-msgstr ""
-
-#: of paddle.dataset.imikolov:3
-msgid ""
-"This module will download dataset from "
-"http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test "
-"set into paddle reader creators."
-msgstr ""
-
-#: of paddle.dataset.imikolov.train:1
-msgid "imikolov training set creator."
-msgstr ""
-
-#: of paddle.dataset.imikolov.test:3 paddle.dataset.imikolov.train:3
-msgid "It returns a reader creator, each sample in the reader is a word ID tuple."
-msgstr ""
-
-#: of paddle.dataset.imikolov.test:8 paddle.dataset.imikolov.train:8
-msgid "sliding window size if type is ngram, otherwise max length of sequence"
-msgstr ""
-
-#: of paddle.dataset.imikolov.test:10 paddle.dataset.imikolov.train:10
-msgid "data type (ngram or sequence)"
-msgstr ""
-
-#: of paddle.dataset.imikolov.test:1
-msgid "imikolov test set creator."
-msgstr ""
-
-#: of paddle.dataset.imikolov.build_dict:1
-msgid ""
-"Build a word dictionary from the corpus,  Keys of the dictionary are "
-"words, and values are zero-based IDs of these words."
-msgstr ""
-
-#: ../../source/api_reference/data/dataset.rst:44
-msgid "movielens"
-msgstr ""
-
-#: of paddle.dataset.movielens:1
-msgid "Movielens 1-M dataset."
-msgstr ""
-
-#: of paddle.dataset.movielens:3
-msgid ""
-"Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 "
-"movies, which was collected by GroupLens Research. This module will "
-"download Movielens 1-M dataset from "
-"http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse "
-"training set and test set into paddle reader creators."
-msgstr ""
-
-#: of paddle.dataset.movielens.get_movie_title_dict:1
-msgid "Get movie title dictionary."
-msgstr ""
-
-#: of paddle.dataset.movielens.max_movie_id:1
-msgid "Get the maximum value of movie id."
-msgstr ""
-
-#: of paddle.dataset.movielens.max_user_id:1
-msgid "Get the maximum value of user id."
-msgstr ""
-
-#: of paddle.dataset.movielens.movie_categories:1
-msgid "Get movie categoriges dictionary."
-msgstr ""
-
-#: of paddle.dataset.movielens.max_job_id:1
-msgid "Get the maximum value of job id."
-msgstr ""
-
-#: of paddle.dataset.movielens.user_info:1
-msgid "Get user info dictionary."
-msgstr ""
-
-#: of paddle.dataset.movielens.movie_info:1
-msgid "Get movie info dictionary."
-msgstr ""
-
-#: of paddle.dataset.movielens.MovieInfo:1
-msgid "Movie id, title and categories information are stored in MovieInfo."
-msgstr ""
-
-#: of paddle.dataset.movielens.UserInfo:1
-msgid "User id, gender, age, and job information are stored in UserInfo."
-msgstr ""
-
-#: ../../source/api_reference/data/dataset.rst:57
-msgid "sentiment"
-msgstr ""
-
-#: of paddle.dataset.sentiment:1
-msgid ""
-"The script fetch and preprocess movie_reviews data set that provided by "
-"NLTK"
-msgstr ""
-
-#: of paddle.dataset.sentiment:3
-msgid "TODO(yuyang18): Complete dataset."
-msgstr ""
-
-#: of paddle.dataset.sentiment.train:1
-msgid "Default training set reader creator"
-msgstr ""
-
-#: of paddle.dataset.sentiment.test:1
-msgid "Default test set reader creator"
-msgstr ""
-
-#: of paddle.dataset.sentiment.get_word_dict:1
-msgid "Sorted the words by the frequency of words which occur in sample :return:"
-msgstr ""
-
-#: of paddle.dataset.sentiment.get_word_dict:3
-msgid "words_freq_sorted"
-msgstr ""
-
-#: ../../source/api_reference/data/dataset.rst:64
-msgid "uci_housing"
-msgstr ""
-
-#: of paddle.dataset.uci_housing:1
-msgid "UCI Housing dataset."
-msgstr ""
-
-#: of paddle.dataset.uci_housing:3
-msgid ""
-"This module will download dataset from https://archive.ics.uci.edu/ml"
-"/machine-learning-databases/housing/ and parse training set and test set "
-"into paddle reader creators."
-msgstr ""
-
-#: of paddle.dataset.uci_housing.train:1
-msgid "UCI_HOUSING training set creator."
-msgstr ""
-
-#: of paddle.dataset.uci_housing.test:3 paddle.dataset.uci_housing.train:3
-msgid ""
-"It returns a reader creator, each sample in the reader is features after "
-"normalization and price number."
-msgstr ""
-
-#: of paddle.dataset.uci_housing.test:1
-msgid "UCI_HOUSING test set creator."
-msgstr ""
-
-#: ../../source/api_reference/data/dataset.rst:71
-msgid "wmt14"
-msgstr ""
-
-#: of paddle.dataset.wmt14:1
-msgid ""
-"WMT14 dataset. The original WMT14 dataset is too large and a small set of"
-" data for set is provided. This module will download dataset from "
-"http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and "
-"parse training set and test set into paddle reader creators."
-msgstr ""
-
-#: of paddle.dataset.wmt14.train:1
-msgid "WMT14 training set creator."
-msgstr ""
-
-#: of paddle.dataset.wmt14.test:3 paddle.dataset.wmt14.train:3
-msgid ""
-"It returns a reader creator, each sample in the reader is source language"
-" word ID sequence, target language word ID sequence and next word ID "
-"sequence."
-msgstr ""
-
-#: of paddle.dataset.wmt14.test:1
-msgid "WMT14 test set creator."
-msgstr ""
-
-#: ../../source/api_reference/data/dataset.rst:78
-msgid "wmt16"
-msgstr ""
-
-#: of paddle.dataset.wmt16:1
-msgid ""
-"ACL2016 Multimodal Machine Translation. Please see this website for more "
-"details: http://www.statmt.org/wmt16/multimodal-task.html#task1"
-msgstr ""
-
-#: of paddle.dataset.wmt16:4
-msgid ""
-"If you use the dataset created for your task, please cite the following "
-"paper: Multi30K: Multilingual English-German Image Descriptions."
-msgstr ""
-
-#: of paddle.dataset.wmt16:12
-msgid "@article{elliott-EtAl:2016:VL16,"
-msgstr ""
-
-#: of paddle.dataset.wmt16:8
-msgid ""
-"author    = {{Elliott}, D. and {Frank}, S. and {Sima\"an}, K. and "
-"{Specia}, L.}, title     = {Multi30K: Multilingual English-German Image "
-"Descriptions}, booktitle = {Proceedings of the 6th Workshop on Vision and"
-" Language}, year      = {2016}, pages     = {70--74}, year      = 2016"
-msgstr ""
-
-#: of paddle.dataset.wmt16:14
-msgid "}"
-msgstr ""
-
-#: of paddle.dataset.wmt16.train:1
-msgid "WMT16 train set reader."
-msgstr ""
-
-#: of paddle.dataset.wmt16.train:3
-msgid ""
-"This function returns the reader for train data. Each sample the reader "
-"returns is made up of three fields: the source language word index "
-"sequence, target language word index sequence and next word index "
-"sequence."
-msgstr ""
-
-#: of paddle.dataset.wmt16.train:8
-msgid ""
-"NOTE: The original like for training data is: "
-"http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz"
-msgstr ""
-
-#: of paddle.dataset.wmt16.test:11 paddle.dataset.wmt16.train:12
-#: paddle.dataset.wmt16.validation:11
-msgid ""
-"paddle.dataset.wmt16 provides a tokenized version of the original dataset"
-" by using moses's tokenization script: https://github.com/moses-"
-"smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl"
-msgstr ""
-
-#: of paddle.dataset.wmt16.test:15 paddle.dataset.wmt16.train:16
-#: paddle.dataset.wmt16.validation:15
-msgid ""
-"Size of the source language dictionary. Three special tokens will be "
-"added into the dictionary: <s> for start mark, <e> for end mark, and "
-"<unk> for unknown word."
-msgstr ""
-
-#: of paddle.dataset.wmt16.test:20 paddle.dataset.wmt16.train:21
-#: paddle.dataset.wmt16.validation:20
-msgid ""
-"Size of the target language dictionary. Three special tokens will be "
-"added into the dictionary: <s> for start mark, <e> for end mark, and "
-"<unk> for unknown word."
-msgstr ""
-
-#: of paddle.dataset.wmt16.get_dict:3 paddle.dataset.wmt16.test:25
-#: paddle.dataset.wmt16.train:26 paddle.dataset.wmt16.validation:25
-msgid ""
-"A string indicating which language is the source language. Available "
-"options are: \"en\" for English and \"de\" for Germany."
-msgstr ""
-
-#: of paddle.dataset.wmt16.train:31
-msgid "The train reader."
-msgstr ""
-
-#: of paddle.dataset.wmt16.test:1
-msgid "WMT16 test set reader."
-msgstr ""
-
-#: of paddle.dataset.wmt16.test:3
-msgid ""
-"This function returns the reader for test data. Each sample the reader "
-"returns is made up of three fields: the source language word index "
-"sequence, target language word index sequence and next word index "
-"sequence."
-msgstr ""
-
-#: of paddle.dataset.wmt16.test:7
-msgid ""
-"NOTE: The original like for test data is: "
-"http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz"
-msgstr ""
-
-#: of paddle.dataset.wmt16.test:30
-msgid "The test reader."
-msgstr ""
-
-#: of paddle.dataset.wmt16.validation:1
-msgid "WMT16 validation set reader."
-msgstr ""
-
-#: of paddle.dataset.wmt16.validation:3
-msgid ""
-"This function returns the reader for validation data. Each sample the "
-"reader returns is made up of three fields: the source language word index"
-" sequence, target language word index sequence and next word index "
-"sequence."
-msgstr ""
-
-#: of paddle.dataset.wmt16.validation:7
-msgid ""
-"NOTE: The original like for validation data is: "
-"http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz"
-msgstr ""
-
-#: of paddle.dataset.wmt16.validation:30
-msgid "The validation reader."
-msgstr ""
-
-#: of paddle.dataset.wmt16.convert:1
-msgid "Converts dataset to recordio format."
-msgstr ""
-
-#: of paddle.dataset.wmt16.fetch:1
-msgid "download the entire dataset."
-msgstr ""
-
-#: of paddle.dataset.wmt16.get_dict:1
-msgid "return the word dictionary for the specified language."
-msgstr ""
-
-#: of paddle.dataset.wmt16.get_dict:7
-msgid "Size of the specified language dictionary."
-msgstr ""
-
-#: of paddle.dataset.wmt16.get_dict:9
-msgid ""
-"If reverse is set to False, the returned python dictionary will use word "
-"as key and use index as value. If reverse is set to True, the returned "
-"python dictionary will use index as key and word as value."
-msgstr ""
-
-#: of paddle.dataset.wmt16.get_dict:15
-msgid "The word dictionary for the specific language."
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/data/image.po b/source/locale/en/LC_MESSAGES/api_reference/data/image.po
deleted file mode 100644
index b6d35dd9ea7244c552aaafe77db06a10756549f5..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/data/image.po
+++ /dev/null
@@ -1,219 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/data/image.rst:2
-msgid "Image Interface"
-msgstr ""
-
-#: of paddle.v2.image:1
-msgid ""
-"This file contains some common interfaces for image preprocess. Many "
-"users are confused about the image layout. We introduce the image layout "
-"as follows."
-msgstr ""
-
-#: of paddle.v2.image:5
-msgid "CHW Layout"
-msgstr ""
-
-#: of paddle.v2.image:7
-msgid "The abbreviations: C=channel, H=Height, W=Width"
-msgstr ""
-
-#: of paddle.v2.image:8
-msgid ""
-"The default layout of image opened by cv2 or PIL is HWC. PaddlePaddle "
-"only supports the CHW layout. And CHW is simply a transpose of HWC. It "
-"must transpose the input image."
-msgstr ""
-
-#: of paddle.v2.image:12
-msgid "Color format: RGB or BGR"
-msgstr ""
-
-#: of paddle.v2.image:14
-msgid ""
-"OpenCV use BGR color format. PIL use RGB color format. Both formats can "
-"be used for training. Noted that, the format should be keep consistent "
-"between the training and inference peroid."
-msgstr ""
-
-#: of paddle.v2.image.load_image_bytes:1
-msgid "Load an color or gray image from bytes array."
-msgstr ""
-
-#: of paddle.v2.image.center_crop:3 paddle.v2.image.left_right_flip:4
-#: paddle.v2.image.load_and_transform:5 paddle.v2.image.load_image:3
-#: paddle.v2.image.load_image_bytes:3 paddle.v2.image.random_crop:3
-#: paddle.v2.image.resize_short:3 paddle.v2.image.simple_transform:4
-#: paddle.v2.image.to_chw:5
-msgid "Example usage:"
-msgstr ""
-
-#: of paddle.v2.image.batch_images_from_tar paddle.v2.image.center_crop
-#: paddle.v2.image.left_right_flip paddle.v2.image.load_and_transform
-#: paddle.v2.image.load_image paddle.v2.image.load_image_bytes
-#: paddle.v2.image.random_crop paddle.v2.image.resize_short
-#: paddle.v2.image.simple_transform paddle.v2.image.to_chw
-msgid "参数"
-msgstr ""
-
-#: of paddle.v2.image.load_image_bytes:10
-msgid "the input image bytes array."
-msgstr ""
-
-#: of paddle.v2.image.load_image:11 paddle.v2.image.load_image_bytes:12
-msgid ""
-"If set is_color True, it will load and return a color image. Otherwise, "
-"it will load and return a gray image."
-msgstr ""
-
-#: of paddle.v2.image.load_image:1
-msgid "Load an color or gray image from the file path."
-msgstr ""
-
-#: of paddle.v2.image.load_image:9
-msgid "the input image path."
-msgstr ""
-
-#: of paddle.v2.image.resize_short:1
-msgid "Resize an image so that the length of shorter edge is size."
-msgstr ""
-
-#: of paddle.v2.image.center_crop:9 paddle.v2.image.random_crop:9
-#: paddle.v2.image.resize_short:10 paddle.v2.image.to_chw:13
-msgid "the input image with HWC layout."
-msgstr ""
-
-#: of paddle.v2.image.resize_short:12
-msgid "the shorter edge size of image after resizing."
-msgstr ""
-
-#: of paddle.v2.image.to_chw:1
-msgid ""
-"Transpose the input image order. The image layout is HWC format opened by"
-" cv2 or PIL. Transpose the input image to CHW layout according the order "
-"(2,0,1)."
-msgstr ""
-
-#: of paddle.v2.image.to_chw:15
-msgid "the transposed order."
-msgstr ""
-
-#: of paddle.v2.image.center_crop:1
-msgid "Crop the center of image with size."
-msgstr ""
-
-#: of paddle.v2.image.center_crop:11 paddle.v2.image.random_crop:11
-msgid "the cropping size."
-msgstr ""
-
-#: of paddle.v2.image.center_crop:13 paddle.v2.image.load_and_transform:19
-#: paddle.v2.image.random_crop:13 paddle.v2.image.simple_transform:18
-msgid "whether the image is color or not."
-msgstr ""
-
-#: of paddle.v2.image.random_crop:1
-msgid "Randomly crop input image with size."
-msgstr ""
-
-#: of paddle.v2.image.left_right_flip:1
-msgid "Flip an image along the horizontal direction. Return the flipped image."
-msgstr ""
-
-#: of paddle.v2.image.left_right_flip:10
-msgid "input image with HWC layout or HW layout for gray image"
-msgstr ""
-
-#: of paddle.v2.image.left_right_flip:12
-msgid "whether input image is color or not"
-msgstr ""
-
-#: of paddle.v2.image.simple_transform:1
-msgid ""
-"Simply data argumentation for training. These operations include "
-"resizing, croping and flipping."
-msgstr ""
-
-#: of paddle.v2.image.simple_transform:10
-msgid "The input image with HWC layout."
-msgstr ""
-
-#: of paddle.v2.image.load_and_transform:13 paddle.v2.image.simple_transform:12
-msgid "The shorter edge length of the resized image."
-msgstr ""
-
-#: of paddle.v2.image.load_and_transform:15 paddle.v2.image.simple_transform:14
-msgid "The cropping size."
-msgstr ""
-
-#: of paddle.v2.image.load_and_transform:17 paddle.v2.image.simple_transform:16
-msgid "Whether it is training or not."
-msgstr ""
-
-#: of paddle.v2.image.load_and_transform:21 paddle.v2.image.simple_transform:20
-msgid ""
-"the mean values, which can be element-wise mean values or mean values per"
-" channel."
-msgstr ""
-
-#: of paddle.v2.image.load_and_transform:1
-msgid ""
-"Load image from the input file `filename` and transform image for data "
-"argumentation. Please refer to the `simple_transform` interface for the "
-"transform operations."
-msgstr ""
-
-#: of paddle.v2.image.load_and_transform:11
-msgid "The file name of input image."
-msgstr ""
-
-#: of paddle.v2.image.batch_images_from_tar:1
-msgid "Read images from tar file and batch them into batch file."
-msgstr ""
-
-#: of paddle.v2.image.batch_images_from_tar:3
-msgid "path of image tar file"
-msgstr ""
-
-#: of paddle.v2.image.batch_images_from_tar:5
-msgid "'train','test' or 'valid'"
-msgstr ""
-
-#: of paddle.v2.image.batch_images_from_tar:7
-msgid "a dic with image file name as key and image's label as value"
-msgstr ""
-
-#: of paddle.v2.image.batch_images_from_tar:10
-msgid "image number per batch file"
-msgstr ""
-
-#: of paddle.v2.image.batch_images_from_tar
-msgid "返回"
-msgstr ""
-
-#: of paddle.v2.image.batch_images_from_tar:12
-msgid "path of list file containing paths of batch file"
-msgstr ""
-
-#: of paddle.v2.image.batch_images_from_tar
-msgid "返回类型"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/data_feeder.po b/source/locale/en/LC_MESSAGES/api_reference/data_feeder.po
deleted file mode 100644
index c9862dbbe0fa185abdb9999e6a680bc62df68a3d..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/data_feeder.po
+++ /dev/null
@@ -1,31 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/data_feeder.rst:6
-msgid "fluid.data_feeder"
-msgstr ""
-
-#: ../../source/api_reference/data_feeder.rst:11
-msgid "DataFeeder"
-msgstr ""
-
-#~ msgid "data_feeder"
-#~ msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/executor.po b/source/locale/en/LC_MESSAGES/api_reference/executor.po
deleted file mode 100644
index b38fa6f9e99c152462a64b7d4da6c26e6a76acc1..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/executor.po
+++ /dev/null
@@ -1,122 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/executor.rst:6
-msgid "fluid.executor"
-msgstr ""
-
-#: ../../source/api_reference/executor.rst:11
-msgid "Executor"
-msgstr ""
-
-#: of paddle.fluid.executor.Executor.run:1
-msgid ""
-"Run program by this Executor. Feed data by feed map, fetch result by "
-"fetch_list."
-msgstr ""
-
-#: of paddle.fluid.executor.Executor.run:3
-msgid ""
-"Python executor takes a program, add feed operators and fetch operators "
-"to this program according to feed map and fetch_list. Feed map provides "
-"input data for the program. fetch_list provides the variables(or names) "
-"that user want to get after program run. Note: the executor will run all "
-"operators in the program but not only the operators dependent by the "
-"fetch_list"
-msgstr ""
-
-#: of paddle.fluid.executor.Executor.run paddle.fluid.executor.fetch_var
-msgid "参数"
-msgstr ""
-
-#: of paddle.fluid.executor.Executor.run:8
-msgid ""
-"the program that need to run, if not provied, then default_main_program "
-"will be used."
-msgstr ""
-
-#: of paddle.fluid.executor.Executor.run:9
-msgid "feed variable map, e.g. {\"image\": ImageData, \"label\": LableData}"
-msgstr ""
-
-#: of paddle.fluid.executor.Executor.run:10
-msgid ""
-"a list of variable or variable names that user want to get, run will "
-"return them according"
-msgstr ""
-
-#: of paddle.fluid.executor.Executor.run:11
-msgid ""
-"to this list. :param feed_var_name: the name for the input variable of "
-"feed Operator. :param fetch_var_name: the name for the output variable of"
-" feed Operator. :param scope: the scope used to run this program, you can"
-" switch it to different scope. default is global_scope :param "
-"return_numpy: if convert the fetched tensor to numpy :param "
-"use_program_cache: set use_program_cache to true if program not changed "
-"compare to the last step. :return: result according to fetch_list."
-msgstr ""
-
-#: ../../source/api_reference/executor.rst:20
-msgid "global_scope"
-msgstr ""
-
-#: ../../source/api_reference/executor.rst:28
-msgid "scope_guard"
-msgstr ""
-
-#: ../../source/api_reference/executor.rst:36
-msgid "switch_scope"
-msgstr ""
-
-#: ../../source/api_reference/executor.rst:44
-msgid "fetch_var"
-msgstr ""
-
-#: of paddle.fluid.executor.fetch_var:1
-msgid ""
-"Fetch the value of the variable with the given name from the given scope "
-":param name: name of the variable. Typically, only persistable variables"
-msgstr ""
-
-#: of paddle.fluid.executor.fetch_var:3
-msgid "can be found in the scope used for running the program."
-msgstr ""
-
-#: of paddle.fluid.executor.fetch_var:5
-msgid ""
-"scope object. It should be the scope where you pass to Executor.run() "
-"when running your program. If None, global_scope() will be used."
-msgstr ""
-
-#: of paddle.fluid.executor.fetch_var:9
-msgid "whether convert the tensor to numpy.ndarray"
-msgstr ""
-
-#: of paddle.fluid.executor.fetch_var
-msgid "返回"
-msgstr ""
-
-#: of paddle.fluid.executor.fetch_var:12
-msgid "LodTensor|numpy.ndarray"
-msgstr ""
-
-#~ msgid "executor"
-#~ msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/fluid.po b/source/locale/en/LC_MESSAGES/api_reference/fluid.po
deleted file mode 100644
index 207ae536f512543e48001c40f39702a9b1f86ccb..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/fluid.po
+++ /dev/null
@@ -1,1005 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/fluid.rst:6
-msgid "fluid"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:11
-msgid "Block"
-msgstr ""
-
-#: of paddle.fluid.Block.clone_variable:1
-msgid ""
-"Clone a variable into current block. :param var: the variable to be "
-"cloned."
-msgstr ""
-
-#: of paddle.fluid.Block.clone_variable paddle.fluid.Block.copy_param_info_from
-#: paddle.fluid.Program.copy_data_info_from
-#: paddle.fluid.Program.copy_param_info_from paddle.fluid.Variable.astype
-#: paddle.fluid.channel_recv paddle.fluid.channel_send
-#: paddle.fluid.create_lod_tensor paddle.fluid.create_random_int_lodtensor
-#: paddle.fluid.default_main_program paddle.fluid.default_startup_program
-#: paddle.fluid.fetch_var paddle.fluid.get_var paddle.fluid.make_channel
-#: paddle.fluid.memory_optimize paddle.fluid.program_guard
-#: paddle.fluid.switch_main_program paddle.fluid.switch_startup_program
-msgid "返回"
-msgstr ""
-
-#: of paddle.fluid.Block.clone_variable:4
-msgid "The new  variable cloned from 'var' in current block."
-msgstr ""
-
-#: of paddle.fluid.Block.copy_param_info_from:1
-msgid ""
-"Copy the information of parameters from the other block :param other: the"
-" other block :type other: Block"
-msgstr ""
-
-#: of paddle.fluid.Block.copy_param_info_from:5
-#: paddle.fluid.Program.copy_data_info_from:5
-#: paddle.fluid.Program.copy_param_info_from:5 paddle.fluid.program_guard:15
-msgid "None"
-msgstr ""
-
-#: of paddle.fluid.Block.rename_var:1
-msgid "Rename variable in vars and ops' inputs and outputs"
-msgstr ""
-
-#: of paddle.fluid.Block.sync_with_cpp:1
-msgid "Sync from the desc on the c++ end."
-msgstr ""
-
-#: of paddle.fluid.Block.sync_with_cpp:3
-msgid ""
-"This method is used to synchronize the c++ desc instance generated by "
-"backward."
-msgstr ""
-
-#: of paddle.fluid.Block.to_string:1 paddle.fluid.Operator.to_string:1
-#: paddle.fluid.Program.to_string:1
-msgid ""
-"To debug string. :param throw_on_error: raise exception when self is not "
-"initialized"
-msgstr ""
-
-#: of paddle.fluid.Block.to_string:3 paddle.fluid.Operator.to_string:3
-#: paddle.fluid.Program.to_string:3
-msgid "when throw_on_error is True"
-msgstr ""
-
-#: of paddle.fluid.Block.to_string paddle.fluid.Executor.run
-#: paddle.fluid.Inferencer.infer paddle.fluid.ParallelExecutor.run
-#: paddle.fluid.Program.clone paddle.fluid.Program.to_string
-#: paddle.fluid.Trainer paddle.fluid.Trainer.test paddle.fluid.Trainer.train
-#: paddle.fluid.Variable paddle.fluid.Variable.to_string
-#: paddle.fluid.channel_close paddle.fluid.channel_recv
-#: paddle.fluid.channel_send paddle.fluid.create_lod_tensor
-#: paddle.fluid.create_random_int_lodtensor paddle.fluid.fetch_var
-#: paddle.fluid.make_channel paddle.fluid.memory_optimize
-#: paddle.fluid.program_guard paddle.fluid.switch_main_program
-msgid "参数"
-msgstr ""
-
-#: of paddle.fluid.Block.to_string:5 paddle.fluid.Program.to_string:5
-#: paddle.fluid.Variable.to_string:6
-msgid ""
-"more details about variables and parameters (e.g. trainable, "
-"optimize_attr, ...) will be printed when with_details is True"
-msgstr ""
-
-#: of paddle.fluid.Block.to_string:9 paddle.fluid.Operator.to_string:6
-#: paddle.fluid.Program.to_string:9 paddle.fluid.Variable.to_string:10
-msgid "Returns(str): The debug string."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:20 of paddle.fluid.get_var:8
-msgid "Variable"
-msgstr ""
-
-#: of paddle.fluid.Variable:1
-msgid ""
-"Python variable. Every input and output of an operator is a variable. "
-"Every variable belongs to a block. The variable has a name and two "
-"variables in different blocks could have the same name."
-msgstr ""
-
-#: of paddle.fluid.Variable:5
-msgid ""
-"There are many kinds of variables. Please reference the framework.proto "
-"for details."
-msgstr ""
-
-#: of paddle.fluid.Variable:8
-msgid ""
-"Notes: The constructor of Variable should not be invoked directly. Please"
-" use `Block.create_var` to create a variable."
-msgstr ""
-
-#: of paddle.fluid.Variable:16
-msgid ""
-"The associated block. It will be passed by `Block.create_var` "
-"automatically."
-msgstr ""
-
-#: of paddle.fluid.Variable:19
-msgid "Variable type. Please reference the framework.proto for details."
-msgstr ""
-
-#: of paddle.fluid.Variable:22
-msgid ""
-"The shape of variable. -1 means the batch size. Some kinds of variable do"
-" not contain shape, just set it to None."
-msgstr ""
-
-#: of paddle.fluid.Variable:25
-msgid "The data type of variable."
-msgstr ""
-
-#: of paddle.fluid.Variable:27
-msgid "The level of lod tensor. 0 means it is not a time series data."
-msgstr ""
-
-#: of paddle.fluid.Variable:30
-msgid "The capacity of Channel variable. Ignored for other types."
-msgstr ""
-
-#: of paddle.fluid.Variable:33
-msgid "True if the variable should be saved as check point. Defaults to False."
-msgstr ""
-
-#: of paddle.fluid.Variable:36
-msgid ""
-"True if the variable will stop to calculate gradients when backward. "
-"Defaults to False."
-msgstr ""
-
-#: of paddle.fluid.Variable.astype:1
-msgid ""
-"Cast a variable to a specified data type. NOTE: The variable must be a "
-"Tensor :param self: The source variable :type self: Variable :param "
-"dtype: The target dtype"
-msgstr ""
-
-#: of paddle.fluid.Variable.astype:7
-msgid "Variable with new dtype"
-msgstr ""
-
-#: of paddle.fluid.Variable.to_string:1
-msgid "Get debug string."
-msgstr ""
-
-#: of paddle.fluid.Variable.to_string:3
-msgid "True if raise an exception when self is not intialized."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:29
-msgid "Program"
-msgstr ""
-
-#: of paddle.fluid.Program.clone:1
-msgid "Clone the Program object"
-msgstr ""
-
-#: of paddle.fluid.Program.clone:3
-msgid ""
-"Set for_test to False when we want to clone the program for training. Set"
-" for_test to True when we want to clone the program for testing."
-msgstr ""
-
-#: of paddle.fluid.Program.clone:6
-msgid ""
-"Some operators, such as batch_norm and drop_out ops, behave differently "
-"in training and testing. If for_test is True, the is_test attributes in "
-"these operators will be set to True for testing purposes, otherwise, they"
-" remain unchanged."
-msgstr ""
-
-#: of paddle.fluid.Program.clone:12
-msgid "Returns(Program):"
-msgstr ""
-
-#: of paddle.fluid.Program.clone:13
-msgid "The cloned Program object."
-msgstr ""
-
-#: of paddle.fluid.Program.copy_data_info_from:1
-msgid ""
-"Copy the information of data variables from other program. :param other: "
-"Other program :type other: Program"
-msgstr ""
-
-#: of paddle.fluid.Program.copy_param_info_from:1
-msgid ""
-"Copy the information of parameters from other program. :param other: "
-"Other program :type other: Program"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:38
-msgid "Operator"
-msgstr ""
-
-#: of paddle.fluid.Operator:1
-msgid ""
-"Python Operator class. The operator represents the build in instructions "
-"in a Block. Users can use the build in instructions to describe their "
-"neural network."
-msgstr ""
-
-#: of paddle.fluid.Operator.all_attrs:1
-msgid "Get the attribute dict Returns(dict): The Operator's attribute dict"
-msgstr ""
-
-#: of paddle.fluid.Operator.attr:1
-msgid "Get attribute by name :param name: the attribute name :type name: str"
-msgstr ""
-
-#: of paddle.fluid.Operator.attr:5
-msgid "Returns(bool|int|str|float|list): The attribute value. The return value"
-msgstr ""
-
-#: of paddle.fluid.Operator.attr:6
-msgid "can be any valid attribute type."
-msgstr ""
-
-#: of paddle.fluid.Operator.attr_names:1
-msgid "Get all attribute names Returns(list): The list of attribute name"
-msgstr ""
-
-#: of paddle.fluid.Operator.attr_type:1
-msgid ""
-"Get the type of attribute by attribute name :param name: the attribute "
-"name :type name: str"
-msgstr ""
-
-#: of paddle.fluid.Operator.attr_type:5
-msgid "Returns(core.AttrType): the attribute type"
-msgstr ""
-
-#: of paddle.fluid.Operator.block_attr:1
-msgid ""
-"Get the block attribute by name :param name: the attribute name :type "
-"name: str"
-msgstr ""
-
-#: of paddle.fluid.Operator.block_attr:5
-msgid "Returns(int): the block index"
-msgstr ""
-
-#: of paddle.fluid.Operator.has_attr:1
-msgid ""
-"operator has the attribute with name or not. :param name: the attribute "
-"name :type name: str"
-msgstr ""
-
-#: of paddle.fluid.Operator.has_attr:5
-msgid "Returns(bool): True if has this attribute."
-msgstr ""
-
-#: of paddle.fluid.Operator.idx:1
-msgid ""
-"Return the array index of current operator. Returns(int): The array index"
-" in block.ops array :raises: :exc:`ValueError` -- when the operator is "
-"not found."
-msgstr ""
-
-#: of paddle.fluid.Operator.input:1
-msgid ""
-"Get input arguments by the input parameter name :param name: The input "
-"parameter name :type name: str"
-msgstr ""
-
-#: of paddle.fluid.Operator.input:5 paddle.fluid.Operator.output:5
-msgid "Returns(list): return the list of argument names associated with the"
-msgstr ""
-
-#: of paddle.fluid.Operator.input:6 paddle.fluid.Operator.output:6
-msgid "specific parameter name."
-msgstr ""
-
-#: of paddle.fluid.Operator.input_names:1
-msgid ""
-"Get all input parameter names Returns(list): return a list of input "
-"parameter names"
-msgstr ""
-
-#: of paddle.fluid.Operator.output:1
-msgid ""
-"Get output arguments by the output parameter name :param name: The output"
-" parameter name :type name: str"
-msgstr ""
-
-#: of paddle.fluid.Operator.output_names:1
-msgid ""
-"Get all output parameter names Returns(list): return a list of output "
-"parameter names"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:47
-msgid "default_startup_program"
-msgstr ""
-
-#: of paddle.fluid.default_startup_program:1
-msgid ""
-"Get default startup program. In startup program, Paddle will initialize "
-"parameters, initialize nccl handle, etc."
-msgstr ""
-
-#: of paddle.fluid.default_startup_program:4
-msgid "startup program"
-msgstr ""
-
-#: of paddle.fluid.channel_recv paddle.fluid.channel_send
-#: paddle.fluid.default_main_program paddle.fluid.default_startup_program
-#: paddle.fluid.make_channel paddle.fluid.switch_main_program
-#: paddle.fluid.switch_startup_program
-msgid "返回类型"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:55
-msgid "default_main_program"
-msgstr ""
-
-#: of paddle.fluid.default_main_program:1
-msgid ""
-"Get default main program. The main program is used for training or "
-"testing."
-msgstr ""
-
-#: of paddle.fluid.default_main_program:3
-msgid "main program"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:63
-msgid "program_guard"
-msgstr ""
-
-#: of paddle.fluid.program_guard:1
-msgid "Switch program with `with` statement"
-msgstr ""
-
-#: of paddle.fluid.channel_close:7 paddle.fluid.channel_recv:17
-#: paddle.fluid.channel_send:21 paddle.fluid.make_channel:29
-#: paddle.fluid.program_guard:3
-msgid "Examples"
-msgstr ""
-
-#: of paddle.fluid.program_guard:9
-msgid "New main program inside `with` statement"
-msgstr ""
-
-#: of paddle.fluid.program_guard:11
-msgid ""
-"New startup program inside `with` statement. None means do not change "
-"startup program."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:71
-msgid "switch_startup_program"
-msgstr ""
-
-#: of paddle.fluid.switch_startup_program:1
-msgid ""
-"Switch the startup program to a new program :param program: The new "
-"startup program :type program: Program"
-msgstr ""
-
-#: of paddle.fluid.switch_startup_program:5
-msgid "The previous startup program"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:79
-msgid "switch_main_program"
-msgstr ""
-
-#: of paddle.fluid.switch_main_program:1
-msgid "Switch the main program to a new program."
-msgstr ""
-
-#: of paddle.fluid.switch_main_program:3
-msgid "The new main program"
-msgstr ""
-
-#: of paddle.fluid.switch_main_program:6
-msgid "The previous main program"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:87
-msgid "get_var"
-msgstr ""
-
-#: of paddle.fluid.get_var:1
-msgid ""
-"Get a variable by name from the global block of a program :param name: "
-"name of the variable :type name: str :param program: program object."
-msgstr ""
-
-#: of paddle.fluid.get_var:5
-msgid "If None, default_global_program() will be used."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:95
-msgid "Executor"
-msgstr ""
-
-#: of paddle.fluid.Executor.run:1
-msgid ""
-"Run program by this Executor. Feed data by feed map, fetch result by "
-"fetch_list."
-msgstr ""
-
-#: of paddle.fluid.Executor.run:3
-msgid ""
-"Python executor takes a program, add feed operators and fetch operators "
-"to this program according to feed map and fetch_list. Feed map provides "
-"input data for the program. fetch_list provides the variables(or names) "
-"that user want to get after program run. Note: the executor will run all "
-"operators in the program but not only the operators dependent by the "
-"fetch_list"
-msgstr ""
-
-#: of paddle.fluid.Executor.run:8
-msgid ""
-"the program that need to run, if not provied, then default_main_program "
-"will be used."
-msgstr ""
-
-#: of paddle.fluid.Executor.run:9
-msgid "feed variable map, e.g. {\"image\": ImageData, \"label\": LableData}"
-msgstr ""
-
-#: of paddle.fluid.Executor.run:10
-msgid ""
-"a list of variable or variable names that user want to get, run will "
-"return them according"
-msgstr ""
-
-#: of paddle.fluid.Executor.run:11
-msgid ""
-"to this list. :param feed_var_name: the name for the input variable of "
-"feed Operator. :param fetch_var_name: the name for the output variable of"
-" feed Operator. :param scope: the scope used to run this program, you can"
-" switch it to different scope. default is global_scope :param "
-"return_numpy: if convert the fetched tensor to numpy :param "
-"use_program_cache: set use_program_cache to true if program not changed "
-"compare to the last step. :return: result according to fetch_list."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:104
-msgid "global_scope"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:112
-msgid "scope_guard"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:120
-msgid "switch_scope"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:128
-msgid "fetch_var"
-msgstr ""
-
-#: of paddle.fluid.fetch_var:1
-msgid ""
-"Fetch the value of the variable with the given name from the given scope "
-":param name: name of the variable. Typically, only persistable variables"
-msgstr ""
-
-#: of paddle.fluid.fetch_var:3
-msgid "can be found in the scope used for running the program."
-msgstr ""
-
-#: of paddle.fluid.fetch_var:5
-msgid ""
-"scope object. It should be the scope where you pass to Executor.run() "
-"when running your program. If None, global_scope() will be used."
-msgstr ""
-
-#: of paddle.fluid.fetch_var:9
-msgid "whether convert the tensor to numpy.ndarray"
-msgstr ""
-
-#: of paddle.fluid.fetch_var:12
-msgid "LodTensor|numpy.ndarray"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:136
-msgid "Go"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:145
-msgid "make_channel"
-msgstr ""
-
-#: of paddle.fluid.make_channel:1
-msgid ""
-"Helps implementation of a concurrent program by creating a \"channel\" of"
-" a defined data type. Channels allow for the passing of data in "
-"concurrent scenarios - such as when using threads to divide computation. "
-"Channels can be used to \"send\" and \"receive\" such data concurrently."
-msgstr ""
-
-#: of paddle.fluid.make_channel:6
-msgid ""
-"There are two kinds of channels: unbuffered and buffered. Unbuffered "
-"channels have no capacity - and thus, block on send and only unblock only"
-" once what they have sent has been received."
-msgstr ""
-
-#: of paddle.fluid.make_channel:10
-msgid ""
-"On the other hand, buffered channels are initialized with a capacity - "
-"and do not block on sends."
-msgstr ""
-
-#: of paddle.fluid.make_channel:13
-msgid ""
-"Use this method in combination with `channel_send`, `channel_recv`, "
-"`channel_close`, and `Go` to design a concurrent Paddle program."
-msgstr ""
-
-#: of paddle.fluid.make_channel:16
-msgid "Data type of the data sent in the channel."
-msgstr ""
-
-#: of paddle.fluid.make_channel:19
-msgid "Size of the channel. Defaults to 0 for"
-msgstr ""
-
-#: of paddle.fluid.make_channel:23
-msgid ""
-"The channel variable that can be used to send an receive data           "
-"of the defined dtype."
-msgstr ""
-
-#: of paddle.fluid.make_channel:25
-msgid "The channel variable that can be used to send an receive data"
-msgstr ""
-
-#: of paddle.fluid.make_channel:26
-msgid "of the defined dtype."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:153
-msgid "channel_send"
-msgstr ""
-
-#: of paddle.fluid.channel_send:1
-msgid ""
-"Sends a value through a channel variable. Used by an unbuffered or "
-"buffered channel to pass data from within or to a concurrent Go block, "
-"where `channel_recv` to used to get the passed value."
-msgstr ""
-
-#: of paddle.fluid.channel_close:3 paddle.fluid.channel_recv:6
-#: paddle.fluid.channel_send:5
-msgid "Channel variable created using"
-msgstr ""
-
-#: of paddle.fluid.channel_send:8
-msgid "Value to send to channel"
-msgstr ""
-
-#: of paddle.fluid.channel_send:10
-msgid "Copy data while channel send. If False, then data"
-msgstr ""
-
-#: of paddle.fluid.channel_send:15
-msgid ""
-"The boolean status on whether or not the channel           successfully "
-"sent the passed value."
-msgstr ""
-
-#: of paddle.fluid.channel_send:17
-msgid "The boolean status on whether or not the channel"
-msgstr ""
-
-#: of paddle.fluid.channel_send:18
-msgid "successfully sent the passed value."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:161
-msgid "channel_recv"
-msgstr ""
-
-#: of paddle.fluid.channel_recv:1
-msgid ""
-"Receives a value through a channel variable. Used by an unbuffered or "
-"buffered channel within a concurrent Go block to get data from originally"
-" sent using `channel_send`, or from outside such a block where "
-"`channel_send` is used to send the value."
-msgstr ""
-
-#: of paddle.fluid.channel_recv:9
-msgid "Variable to set as a result of running channel_recv_op"
-msgstr ""
-
-#: of paddle.fluid.channel_recv:12
-msgid ""
-"The received value from the channel. Variable: The boolean status on "
-"whether or not the channel           successfully received the passed "
-"value."
-msgstr ""
-
-#: of paddle.fluid.channel_recv:12
-msgid ""
-"The received value from the channel. Variable: The boolean status on "
-"whether or not the channel"
-msgstr ""
-
-#: of paddle.fluid.channel_recv:14
-msgid "successfully received the passed value."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:169
-msgid "channel_close"
-msgstr ""
-
-#: of paddle.fluid.channel_close:1
-msgid "Closes a channel created using `make_channel`."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:177
-msgid "Select"
-msgstr ""
-
-#: of paddle.fluid.Select.case:1
-msgid "Create a new block for this condition."
-msgstr ""
-
-#: of paddle.fluid.Select.default:1
-msgid "Create a default case block for this condition."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:186
-msgid "Trainer"
-msgstr ""
-
-#: of paddle.fluid.Trainer:1
-msgid "A function which will return loss. The loss must be a scalar."
-msgstr ""
-
-#: of paddle.fluid.Trainer:3
-msgid "A function that returns an Optimizer object."
-msgstr ""
-
-#: of paddle.fluid.Trainer:5
-msgid "The device place of this trainer."
-msgstr ""
-
-#: of paddle.fluid.Trainer.stop:1
-msgid "stop training"
-msgstr ""
-
-#: of paddle.fluid.Trainer.test:1
-msgid "Test the model on given test data"
-msgstr ""
-
-#: of paddle.fluid.Trainer.test:3
-msgid "The reader that yields test data."
-msgstr ""
-
-#: of paddle.fluid.Trainer.test:4 paddle.fluid.Trainer.train:6
-msgid "Feeding order of reader. None will following the defining order in program"
-msgstr ""
-
-#: of paddle.fluid.Trainer.train:1
-msgid "Train the model."
-msgstr ""
-
-#: of paddle.fluid.Trainer.train:3
-msgid "The number of epoch. An epoch will process all data in reader"
-msgstr ""
-
-#: of paddle.fluid.Trainer.train:4
-msgid "The event handler. A function with type (ev:Event)->void"
-msgstr ""
-
-#: of paddle.fluid.Trainer.train:9
-msgid "Returns:"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:195
-msgid "BeginEpochEvent"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:204
-msgid "EndEpochEvent"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:213
-msgid "BeginStepEvent"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:222
-msgid "EndStepEvent"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:231
-msgid "CheckpointConfig"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:240
-msgid "Inferencer"
-msgstr ""
-
-#: of paddle.fluid.Inferencer.infer:1
-msgid ""
-"a map of {\"input_name\": input_var} that will be feed into the inference"
-" program"
-msgstr ""
-
-#: of paddle.fluid.Inferencer.infer:2
-msgid "to get the predict value :return: the predict value of the inference model"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:249
-msgid "memory_optimize"
-msgstr ""
-
-#: of paddle.fluid.memory_optimize:1
-msgid "Optimize memory by reusing var memory."
-msgstr ""
-
-#: of paddle.fluid.memory_optimize:3
-msgid "Note: it doesn't not support subblock nested in subblock."
-msgstr ""
-
-#: of paddle.fluid.memory_optimize:5
-msgid "Input Program"
-msgstr ""
-
-#: of paddle.fluid.memory_optimize:6
-msgid "whether to print debug log."
-msgstr ""
-
-#: of paddle.fluid.memory_optimize:7
-msgid "If level=0, reuse if the shape is completely equal, o"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:257
-msgid "release_memory"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:265
-msgid "ParallelExecutor"
-msgstr ""
-
-#: of paddle.fluid.ParallelExecutor.run:1
-msgid "Run a parallel executor with fetch_list."
-msgstr ""
-
-#: of paddle.fluid.ParallelExecutor.run:3
-msgid ""
-"The feed parameter can be a dict or a list. If feed is a dict, the feed "
-"data will be split into multiple devices. If feed is a list, we assume "
-"the data has been splitted into multiple devices, the each element in the"
-" list will be copied to each device directly."
-msgstr ""
-
-#: of paddle.fluid.ParallelExecutor.run:8
-msgid ""
-"For example, if the feed is a dict: >>> exe = ParallelExecutor() >>> # "
-"the image will be splitted into devices. If there is two devices >>> # "
-"each device will process an image with shape (24, 1, 28, 28) >>> "
-"exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})"
-msgstr ""
-
-#: of paddle.fluid.ParallelExecutor.run:14
-msgid ""
-"For example, if the feed is a list: >>> exe = ParallelExecutor() >>> # "
-"each device will process each element in the list. >>> # the 1st device "
-"will process an image with shape (48, 1, 28, 28) >>> # the 2nd device "
-"will process an image with shape (32, 1, 28, 28) >>> # >>> # you can use "
-"exe.device_count to get the device number. >>> exe.run(feed=[{\"image\": "
-"numpy.random.random(size=(48, 1, 28, 28))}, >>>               {\"image\":"
-" numpy.random.random(size=(32, 1, 28, 28))}, >>>              ])"
-msgstr ""
-
-#: of paddle.fluid.ParallelExecutor.run:26
-msgid "The fetched variable names"
-msgstr ""
-
-#: of paddle.fluid.ParallelExecutor.run:28
-msgid ""
-"The feed variables. If the feed is a dict, tensors in that dict will be "
-"splitted into each devices. If the feed is a list, each element of the "
-"list will be copied to each device."
-msgstr ""
-
-#: of paddle.fluid.ParallelExecutor.run:33
-msgid ""
-"Alias for feed parameter, for backward compatibility. This parameter is "
-"deprecated."
-msgstr ""
-
-#: of paddle.fluid.ParallelExecutor.run:36
-msgid "Returns: fetched result list."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:274
-msgid "ExecutionStrategy"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:283
-msgid "BuildStrategy"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:292
-msgid "create_lod_tensor"
-msgstr ""
-
-#: of paddle.fluid.create_lod_tensor:1
-msgid "Create a lod tensor from a numpy array, a list, or an existing lod tensor."
-msgstr ""
-
-#: of paddle.fluid.create_lod_tensor:3
-msgid ""
-"Create a lod tensor by doing the following: 1. Check that the length-"
-"based input lod is valid. 2. Convert the length-based lod to a offset-"
-"based LoD. 3. Copy the data from a numpy array, a list or a existing lod "
-"tensor to"
-msgstr ""
-
-#: of paddle.fluid.create_lod_tensor:7
-msgid "CPU or GPU device (based on input place)."
-msgstr ""
-
-#: of paddle.fluid.create_lod_tensor:8
-msgid "Set the level of detail (LoD) using the offset-based LoD."
-msgstr ""
-
-#: of paddle.fluid.create_lod_tensor:10
-msgid ""
-"Use example: Suppose we want LoDTensor to hold data for sequences of "
-"word, where each word is represented by an integer. If we want to create "
-"a LoDTensor to represent two sentences, one of 2 words, and one of 3 "
-"words."
-msgstr ""
-
-#: of paddle.fluid.create_lod_tensor:15
-msgid ""
-"Then 'data' can be a numpy array of integers with shape (5, 1). 'lod' "
-"will be [[2, 3]], indicating the length(# of words) in each sentence. "
-"This length-based input lod [[2, 3]] will be converted to offset-based "
-"lod [[0, 2, 5]] inside the function call."
-msgstr ""
-
-#: of paddle.fluid.create_lod_tensor:20
-msgid ""
-"Please refer to "
-"github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md"
-" for more details regarding LoD."
-msgstr ""
-
-#: of paddle.fluid.create_lod_tensor:24
-msgid "a numpy array or a LoDTensor or a list holding the data to be copied."
-msgstr ""
-
-#: of paddle.fluid.create_lod_tensor:25
-#: paddle.fluid.create_random_int_lodtensor:20
-msgid ""
-"a list of lists indicating the length-based LoD info specified by the "
-"user."
-msgstr ""
-
-#: of paddle.fluid.create_lod_tensor:26
-#: paddle.fluid.create_random_int_lodtensor:22
-msgid ""
-"CPU or GPU place indicating where the data in the new LoDTensor will be "
-"stored."
-msgstr ""
-
-#: of paddle.fluid.create_lod_tensor:28
-#: paddle.fluid.create_random_int_lodtensor:26
-msgid "A fluid LoDTensor object with tensor data and lod info."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:300
-msgid "create_random_int_lodtensor"
-msgstr ""
-
-#: of paddle.fluid.create_random_int_lodtensor:1
-msgid "Create a LoDTensor containing random integers."
-msgstr ""
-
-#: of paddle.fluid.create_random_int_lodtensor:3
-msgid ""
-"This function is frequently used in the book examples. So we revised it "
-"based on the new create_lod_tensor API and put it here in the lod_tensor "
-"module to simplify the code."
-msgstr ""
-
-#: of paddle.fluid.create_random_int_lodtensor:7
-msgid ""
-"The function does the following: 1. Calculate the overall shape of the "
-"LoDTensor based on the length-based 'lod' input and the shape of the "
-"basic element in 'base_shape'. 2. Create a numpy array of this shape. 3. "
-"Create the LoDTensor using create_lod_tensor API."
-msgstr ""
-
-#: of paddle.fluid.create_random_int_lodtensor:13
-msgid ""
-"Suppose we want LoDTensor to hold data for sequences of word, where each "
-"word is represented by an integer. If we want to create a LoDTensor to "
-"represent two sentences, one of 2 words, and one of 3 words. Then "
-"'base_shape' is [1], input length-based 'lod' is [[2, 3]]. Then the "
-"overall shape of the LoDTensor would be [5, 1], holding 5 words for two "
-"sentences."
-msgstr ""
-
-#: of paddle.fluid.create_random_int_lodtensor:19
-msgid "a numpy array or a LoDTensor holding the data to be copied."
-msgstr ""
-
-#: of paddle.fluid.create_random_int_lodtensor:21
-msgid "the shape of the basic element to be held by the LoDTensor."
-msgstr ""
-
-#: of paddle.fluid.create_random_int_lodtensor:23
-msgid "the lower bound of the random integers."
-msgstr ""
-
-#: of paddle.fluid.create_random_int_lodtensor:24
-msgid "the upper bound of the random integers."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:308
-msgid "LoDTensor"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:317
-msgid "CPUPlace"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:326
-msgid "CUDAPlace"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:335
-msgid "CUDAPinnedPlace"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:344
-msgid "Tensor"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:353
-msgid "ParamAttr"
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:362
-msgid "WeightNormParamAttr"
-msgstr ""
-
-#: of paddle.fluid.WeightNormParamAttr:1
-msgid ""
-"Used for weight normalization. Any field in ParamAttr can also be set "
-"here. Besides, an extra field dim can be set to indicate the dimension "
-"except which to normalize."
-msgstr ""
-
-#: ../../source/api_reference/fluid.rst:371
-msgid "DataFeeder"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/index.po b/source/locale/en/LC_MESSAGES/api_reference/index.po
deleted file mode 100644
index cf6e3912e55140876f17ebb6395b754523d919cd..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/index.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/index.rst:3
-msgid "API Reference"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/initializer.po b/source/locale/en/LC_MESSAGES/api_reference/initializer.po
deleted file mode 100644
index ff18a675736f6fad2a45cc70638e5467a921038a..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/initializer.po
+++ /dev/null
@@ -1,121 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/initializer.rst:6
-msgid "fluid.initializer"
-msgstr ""
-
-#: ../../source/api_reference/initializer.rst:11
-msgid "Constant"
-msgstr ""
-
-#: ../../source/api_reference/initializer.rst:20
-msgid "Uniform"
-msgstr ""
-
-#: ../../source/api_reference/initializer.rst:29
-msgid "Normal"
-msgstr ""
-
-#: ../../source/api_reference/initializer.rst:38
-msgid "Xavier"
-msgstr ""
-
-#: ../../source/api_reference/initializer.rst:47
-msgid "force_init_on_cpu"
-msgstr ""
-
-#: ../../source/api_reference/initializer.rst:55
-msgid "init_on_cpu"
-msgstr ""
-
-#: of paddle.fluid.initializer.init_on_cpu:1
-msgid "Switch program with `with` statement"
-msgstr ""
-
-#: of paddle.fluid.initializer.init_on_cpu:3
-msgid "Examples"
-msgstr ""
-
-#: ../../source/api_reference/initializer.rst:63
-msgid "ConstantInitializer"
-msgstr ""
-
-#: of paddle.fluid.initializer.ConstantInitializer:1
-msgid "Implements the constant initializer"
-msgstr ""
-
-#: ../../source/api_reference/initializer.rst:72
-msgid "UniformInitializer"
-msgstr ""
-
-#: of paddle.fluid.initializer.UniformInitializer:1
-msgid "Implements the random uniform distribution initializer"
-msgstr ""
-
-#: ../../source/api_reference/initializer.rst:81
-msgid "NormalInitializer"
-msgstr ""
-
-#: of paddle.fluid.initializer.NormalInitializer:1
-msgid "Implements the  random Normal(Gaussian) distribution initializer"
-msgstr ""
-
-#: ../../source/api_reference/initializer.rst:90
-msgid "XavierInitializer"
-msgstr ""
-
-#: of paddle.fluid.initializer.XavierInitializer:1
-msgid "Implements the Xavier initializer"
-msgstr ""
-
-#: of paddle.fluid.initializer.XavierInitializer:3
-msgid ""
-"This class implements the Xavier weight initializer from the paper "
-"Understanding the difficulty of training deep feedforward neural "
-"networks[1] by Xavier Glorot and Yoshua Bengio."
-msgstr ""
-
-#: of paddle.fluid.initializer.XavierInitializer:7
-msgid ""
-"This initializer is designed to keep the scale of the gradients "
-"approximately same in all the layers. In case of Uniform distribution, "
-"the range is [-x, x], where x = sqrt(6 / (fan_in + fan_out)). In case of "
-"Normal distribution, the mean is 0 and the standard deviation is sqrt(2/ "
-"(fan_in + fan_out))."
-msgstr ""
-
-#: of paddle.fluid.initializer.XavierInitializer:14
-msgid "References"
-msgstr ""
-
-#: of paddle.fluid.initializer.XavierInitializer:17
-msgid "[1] Understanding the difficulty of training deep feedforward neural"
-msgstr ""
-
-#: of paddle.fluid.initializer.XavierInitializer:16
-msgid ""
-"networks. International conference on artificial intelligence and "
-"statistics. (http://proceedings.mlr.press/v9/glorot10a.html)"
-msgstr ""
-
-#~ msgid "initializer"
-#~ msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/io.po b/source/locale/en/LC_MESSAGES/api_reference/io.po
deleted file mode 100644
index 8d7833855b4d97b9d068084f1ab2c50fb6bd3014..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/io.po
+++ /dev/null
@@ -1,317 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/io.rst:6
-msgid "fluid.io"
-msgstr ""
-
-#: ../../source/api_reference/io.rst:11
-msgid "save_vars"
-msgstr ""
-
-#: of paddle.fluid.io.save_vars:1
-msgid "Save variables to directory by executor."
-msgstr ""
-
-#: of paddle.fluid.io.load_inference_model paddle.fluid.io.load_vars
-#: paddle.fluid.io.save_inference_model paddle.fluid.io.save_vars
-msgid "参数"
-msgstr ""
-
-#: of paddle.fluid.io.save_vars:3
-msgid "executor that save variable"
-msgstr ""
-
-#: of paddle.fluid.io.load_inference_model:3 paddle.fluid.io.load_vars:4
-#: paddle.fluid.io.save_inference_model:4 paddle.fluid.io.save_vars:4
-msgid "directory path"
-msgstr ""
-
-#: of paddle.fluid.io.load_vars:5 paddle.fluid.io.save_vars:5
-msgid "program. If vars is None, then filter all variables in this"
-msgstr ""
-
-#: of paddle.fluid.io.save_vars:6
-msgid ""
-"program which fit `predicate`. Default default_main_program. :param "
-"predicate: The Predicate describes a callable that returns a variable as "
-"a bool. If it returns true, the corresponding input variable will be "
-"saved. :param vars: variables need to be saved. If vars is specified, "
-"program & predicate will be ignored :param filename: The name of a single"
-" file that all vars are saved to."
-msgstr ""
-
-#: of paddle.fluid.io.save_vars:12
-msgid "If it is None, save variables to separate files."
-msgstr ""
-
-#: of paddle.fluid.io.load_inference_model paddle.fluid.io.load_vars
-#: paddle.fluid.io.save_inference_model paddle.fluid.io.save_vars
-msgid "返回"
-msgstr ""
-
-#: of paddle.fluid.io.load_vars:14 paddle.fluid.io.save_inference_model:16
-#: paddle.fluid.io.save_vars:14
-msgid "None"
-msgstr ""
-
-#: ../../source/api_reference/io.rst:19
-msgid "save_params"
-msgstr ""
-
-#: of paddle.fluid.io.save_params:1
-msgid "Save all parameters to directory with executor."
-msgstr ""
-
-#: ../../source/api_reference/io.rst:27
-msgid "save_persistables"
-msgstr ""
-
-#: of paddle.fluid.io.save_persistables:1
-msgid "Save all persistables to directory with executor."
-msgstr ""
-
-#: ../../source/api_reference/io.rst:35
-msgid "load_vars"
-msgstr ""
-
-#: of paddle.fluid.io.load_vars:1
-msgid "Load variables from directory by executor."
-msgstr ""
-
-#: of paddle.fluid.io.load_vars:3
-msgid "executor that load variable"
-msgstr ""
-
-#: of paddle.fluid.io.load_vars:6
-msgid ""
-"program which fit `predicate`. Default default_main_program(). :param "
-"predicate: The Predicate describes a callable that returns a variable as "
-"a bool. If it returns true, the corresponding input variable will be "
-"loaded. :param vars: variables need to be loaded. If vars is specified, "
-"program & predicate will be ignored :param filename: The name of the "
-"single file that all vars are loaded from."
-msgstr ""
-
-#: of paddle.fluid.io.load_vars:12
-msgid "If it is None, load variables from separate files."
-msgstr ""
-
-#: ../../source/api_reference/io.rst:43
-msgid "load_params"
-msgstr ""
-
-#: of paddle.fluid.io.load_params:1
-msgid "load all parameters from directory by executor."
-msgstr ""
-
-#: ../../source/api_reference/io.rst:51
-msgid "load_persistables"
-msgstr ""
-
-#: of paddle.fluid.io.load_persistables:1
-msgid "load all persistables from directory by executor."
-msgstr ""
-
-#: ../../source/api_reference/io.rst:59
-msgid "save_inference_model"
-msgstr ""
-
-#: of paddle.fluid.io.save_inference_model:1
-msgid ""
-"Build a model especially for inference, and save it to directory by the "
-"executor."
-msgstr ""
-
-#: of paddle.fluid.io.save_inference_model:5
-msgid "Names of variables that need to be feeded data during inference"
-msgstr ""
-
-#: of paddle.fluid.io.save_inference_model:6
-msgid "Variables from which we can get inference results."
-msgstr ""
-
-#: of paddle.fluid.io.save_inference_model:7
-msgid "executor that save inference model"
-msgstr ""
-
-#: of paddle.fluid.io.save_inference_model:8
-msgid ""
-"original program, which will be pruned to build the inference model. "
-"Default default_main_program()."
-msgstr ""
-
-#: of paddle.fluid.io.save_inference_model:10
-msgid ""
-"The name of file to save inference program. If not specified, default "
-"filename `__model__` will be used."
-msgstr ""
-
-#: of paddle.fluid.io.save_inference_model:12
-msgid ""
-"The name of file to save parameters. It is used for the case that all "
-"parameters are saved in a single binary file. If not specified, "
-"parameters are considered saved in separate files."
-msgstr ""
-
-#: ../../source/api_reference/io.rst:67
-msgid "load_inference_model"
-msgstr ""
-
-#: of paddle.fluid.io.load_inference_model:1
-msgid "Load inference model from a directory"
-msgstr ""
-
-#: of paddle.fluid.io.load_inference_model:4
-msgid "executor that load inference model"
-msgstr ""
-
-#: of paddle.fluid.io.load_inference_model:5
-msgid ""
-"The name of file to load inference program. If not specified, default "
-"filename `__model__` will be used."
-msgstr ""
-
-#: of paddle.fluid.io.load_inference_model:7
-msgid ""
-"The name of file to load parameters. It is used for the case that all "
-"parameters are saved in a single binary file. If not specified, "
-"parameters are considered saved in separate files."
-msgstr ""
-
-#: of paddle.fluid.io.load_inference_model:11
-msgid ""
-"[program, feed_target_names, fetch_targets] program: program especially "
-"for inference. feed_target_names: Names of variables that need to feed "
-"data fetch_targets: Variables from which we can get inference results."
-msgstr ""
-
-#: ../../source/api_reference/io.rst:75
-msgid "get_inference_program"
-msgstr ""
-
-#: ../../source/api_reference/io.rst:83
-msgid "save_checkpoint"
-msgstr ""
-
-#: of paddle.fluid.io.save_checkpoint:1
-msgid ""
-"Save Checkpoint will save persistable LodTensor variables from "
-"main_program in checkpoint directory, the directory named by serial "
-"number from 0 to (n -1), save_checkpoint use LRU strategy to keep numbers"
-" of checkpoint directory,  the numbers of checkpoint directory are "
-"max_num_checkpoints at most, The interval between two saved checkpoints "
-"must greater than save_interval_secs."
-msgstr ""
-
-#: of paddle.fluid.io.save_checkpoint:6
-msgid ""
-":param executor executor for save the value :param checkpoint_dir the "
-"checkpoint directory :param trainer_id currect trainer id, if id is equal"
-" to 0, the trainer is chief :param main_program   will save all variables"
-" in program :param max_num_checkpoints will keep numbers of checkpoint "
-"serials not bigger than max_num_checkpoints"
-msgstr ""
-
-#: ../../source/api_reference/io.rst:91
-msgid "load_checkpoint"
-msgstr ""
-
-#: of paddle.fluid.io.load_checkpoint:1
-msgid ""
-"Load checkpoint from a directory by executor, it will find  the most "
-"recent saved checkpoint file and load it auto."
-msgstr ""
-
-#: of paddle.fluid.io.load_checkpoint:4
-msgid ""
-":param executor executor for load the value :param checkpoint_dir  the "
-"checkpoint directory :param serial the serial folder in checkpoint "
-"directory will be load :param main_program  will load all variables in "
-"program"
-msgstr ""
-
-#: ../../source/api_reference/io.rst:99
-msgid "clean_checkpoint"
-msgstr ""
-
-#: of paddle.fluid.io.clean_checkpoint:1
-msgid ""
-"clean the checkpoint dir, when the train exits normally, the trainer will"
-" call clean_checkpoint to delete checkpoint directory saved before. "
-"delete_dir only works when the directory is empty, otherwise, OSError is "
-"raised."
-msgstr ""
-
-#: of paddle.fluid.io.clean_checkpoint:4
-msgid ":param checkpoint_dir :param delete_dir"
-msgstr ""
-
-#: ../../source/api_reference/io.rst:107
-msgid "load_persist_vars_without_grad"
-msgstr ""
-
-#: of paddle.fluid.io.load_persist_vars_without_grad:1
-msgid ""
-"load_persist_vars_without_grad will load variables from a directory by an"
-" executor, the variable named end with \"@GRAD\" will not be loaded."
-msgstr ""
-
-#: of paddle.fluid.io.load_persist_vars_without_grad:4
-msgid ""
-":param executor  executor for load the value :param dirname the "
-"checkpoint directory :param program   will load all variables in program "
-":param has_model_dir if has_model_dir is True, will load variables from  "
-"sub directory named __model__"
-msgstr ""
-
-#: ../../source/api_reference/io.rst:115
-msgid "save_persist_vars_without_grad"
-msgstr ""
-
-#: of paddle.fluid.io.save_persist_vars_without_grad:1
-msgid ""
-"save_persist_vars_without_grad  will save variables to a directory by an "
-"executor, the variable named end with \"@GRAD\" will not be saved."
-msgstr ""
-
-#: of paddle.fluid.io.save_persist_vars_without_grad:4
-msgid ""
-":param executor  executor for load the value :param dirname the "
-"checkpoint directory :param program   will load all variables in program"
-msgstr ""
-
-#: ../../source/api_reference/io.rst:123
-msgid "get_latest_checkpoint_serial"
-msgstr ""
-
-#: of paddle.fluid.io.get_latest_checkpoint_serial:1
-msgid ""
-"get the latest file in checkpoint directory, the _SUCCESS file must exist"
-" in the directory"
-msgstr ""
-
-#: of paddle.fluid.io.get_latest_checkpoint_serial:3
-msgid ":param checkpoint_dir"
-msgstr ""
-
-#~ msgid "io"
-#~ msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/layers.po b/source/locale/en/LC_MESSAGES/api_reference/layers.po
deleted file mode 100644
index 2c86984ba85df6755a3160118176b86b2adadfaa..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/layers.po
+++ /dev/null
@@ -1,8463 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/layers.rst:6
-msgid "fluid.layers"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:9
-msgid "control_flow"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:14
-msgid "split_lod_tensor"
-msgstr ""
-
-#: of paddle.fluid.layers.split_lod_tensor:1
-msgid "**split_lod_tensor**"
-msgstr ""
-
-#: of paddle.fluid.layers.split_lod_tensor:3
-msgid ""
-"This function takes in an input that contains the complete lod "
-"information, and takes in a mask which is used to mask certain parts of "
-"the input. The output is the true branch and the false branch with the "
-"mask applied to the input at a certain level in the tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.memory
-#: paddle.fluid.layers.DynamicRNN.output paddle.fluid.layers.IfElse
-#: paddle.fluid.layers.Preprocessor paddle.fluid.layers.Print
-#: paddle.fluid.layers.Send paddle.fluid.layers.StaticRNN.memory
-#: paddle.fluid.layers.StaticRNNMemoryLink paddle.fluid.layers.While
-#: paddle.fluid.layers.abs paddle.fluid.layers.argmax
-#: paddle.fluid.layers.argmin paddle.fluid.layers.array_length
-#: paddle.fluid.layers.array_to_lod_tensor paddle.fluid.layers.array_write
-#: paddle.fluid.layers.assign paddle.fluid.layers.autoincreased_step_counter
-#: paddle.fluid.layers.batch_norm paddle.fluid.layers.beam_search
-#: paddle.fluid.layers.beam_search_decode paddle.fluid.layers.bipartite_match
-#: paddle.fluid.layers.box_coder paddle.fluid.layers.brelu
-#: paddle.fluid.layers.ceil paddle.fluid.layers.chunk_eval
-#: paddle.fluid.layers.clip paddle.fluid.layers.clip_by_norm
-#: paddle.fluid.layers.concat paddle.fluid.layers.conv2d
-#: paddle.fluid.layers.conv2d_transpose paddle.fluid.layers.conv3d
-#: paddle.fluid.layers.conv3d_transpose paddle.fluid.layers.cos
-#: paddle.fluid.layers.cos_sim paddle.fluid.layers.create_array
-#: paddle.fluid.layers.create_parameter paddle.fluid.layers.crf_decoding
-#: paddle.fluid.layers.cross_entropy paddle.fluid.layers.ctc_greedy_decoder
-#: paddle.fluid.layers.cumsum paddle.fluid.layers.data
-#: paddle.fluid.layers.detection_map paddle.fluid.layers.detection_output
-#: paddle.fluid.layers.dice_loss paddle.fluid.layers.double_buffer
-#: paddle.fluid.layers.dropout paddle.fluid.layers.dynamic_gru
-#: paddle.fluid.layers.dynamic_lstm paddle.fluid.layers.dynamic_lstmp
-#: paddle.fluid.layers.edit_distance paddle.fluid.layers.elementwise_add
-#: paddle.fluid.layers.elementwise_div paddle.fluid.layers.elementwise_max
-#: paddle.fluid.layers.elementwise_min paddle.fluid.layers.elementwise_mul
-#: paddle.fluid.layers.elementwise_pow paddle.fluid.layers.elementwise_sub
-#: paddle.fluid.layers.elu paddle.fluid.layers.embedding
-#: paddle.fluid.layers.equal paddle.fluid.layers.exp
-#: paddle.fluid.layers.exponential_decay paddle.fluid.layers.fc
-#: paddle.fluid.layers.fill_constant
-#: paddle.fluid.layers.fill_constant_batch_size_like paddle.fluid.layers.floor
-#: paddle.fluid.layers.gather paddle.fluid.layers.gaussian_random
-#: paddle.fluid.layers.gaussian_random_batch_size_like
-#: paddle.fluid.layers.get_places paddle.fluid.layers.gru_unit
-#: paddle.fluid.layers.hard_shrink paddle.fluid.layers.hard_sigmoid
-#: paddle.fluid.layers.im2sequence paddle.fluid.layers.image_resize
-#: paddle.fluid.layers.image_resize_short paddle.fluid.layers.increment
-#: paddle.fluid.layers.inverse_time_decay paddle.fluid.layers.iou_similarity
-#: paddle.fluid.layers.is_empty paddle.fluid.layers.label_smooth
-#: paddle.fluid.layers.layer_norm paddle.fluid.layers.leaky_relu
-#: paddle.fluid.layers.less_than paddle.fluid.layers.linear_chain_crf
-#: paddle.fluid.layers.load paddle.fluid.layers.lod_rank_table
-#: paddle.fluid.layers.lod_reset paddle.fluid.layers.lod_tensor_to_array
-#: paddle.fluid.layers.log paddle.fluid.layers.logical_and
-#: paddle.fluid.layers.logical_not paddle.fluid.layers.logical_or
-#: paddle.fluid.layers.logical_xor paddle.fluid.layers.logsigmoid
-#: paddle.fluid.layers.lstm_unit paddle.fluid.layers.matmul
-#: paddle.fluid.layers.max_sequence_len paddle.fluid.layers.maxout
-#: paddle.fluid.layers.mean paddle.fluid.layers.mean_iou
-#: paddle.fluid.layers.merge_lod_tensor paddle.fluid.layers.mul
-#: paddle.fluid.layers.multi_box_head paddle.fluid.layers.multiplex
-#: paddle.fluid.layers.natural_exp_decay paddle.fluid.layers.nce
-#: paddle.fluid.layers.noam_decay paddle.fluid.layers.one_hot
-#: paddle.fluid.layers.ones paddle.fluid.layers.open_files
-#: paddle.fluid.layers.open_recordio_file paddle.fluid.layers.pad
-#: paddle.fluid.layers.polygon_box_transform paddle.fluid.layers.pool2d
-#: paddle.fluid.layers.pool3d paddle.fluid.layers.pow
-#: paddle.fluid.layers.prior_box paddle.fluid.layers.random_crop
-#: paddle.fluid.layers.random_data_generator paddle.fluid.layers.reciprocal
-#: paddle.fluid.layers.reduce_max paddle.fluid.layers.reduce_mean
-#: paddle.fluid.layers.reduce_min paddle.fluid.layers.reduce_prod
-#: paddle.fluid.layers.reduce_sum paddle.fluid.layers.relu
-#: paddle.fluid.layers.relu6 paddle.fluid.layers.reorder_lod_tensor_by_rank
-#: paddle.fluid.layers.reshape paddle.fluid.layers.resize_bilinear
-#: paddle.fluid.layers.roi_pool paddle.fluid.layers.round
-#: paddle.fluid.layers.row_conv paddle.fluid.layers.scale
-#: paddle.fluid.layers.scatter paddle.fluid.layers.sequence_conv
-#: paddle.fluid.layers.sequence_expand paddle.fluid.layers.sequence_first_step
-#: paddle.fluid.layers.sequence_last_step paddle.fluid.layers.sequence_pool
-#: paddle.fluid.layers.sequence_reshape paddle.fluid.layers.shape
-#: paddle.fluid.layers.shrink_memory paddle.fluid.layers.sigmoid
-#: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
-#: paddle.fluid.layers.sin paddle.fluid.layers.slice
-#: paddle.fluid.layers.smooth_l1 paddle.fluid.layers.soft_relu
-#: paddle.fluid.layers.softmax_with_cross_entropy paddle.fluid.layers.softplus
-#: paddle.fluid.layers.softshrink paddle.fluid.layers.softsign
-#: paddle.fluid.layers.split paddle.fluid.layers.split_lod_tensor
-#: paddle.fluid.layers.sqrt paddle.fluid.layers.square
-#: paddle.fluid.layers.square_error_cost paddle.fluid.layers.ssd_loss
-#: paddle.fluid.layers.stanh paddle.fluid.layers.sum paddle.fluid.layers.sums
-#: paddle.fluid.layers.swish paddle.fluid.layers.tanh
-#: paddle.fluid.layers.tanh_shrink paddle.fluid.layers.target_assign
-#: paddle.fluid.layers.thresholded_relu paddle.fluid.layers.topk
-#: paddle.fluid.layers.transpose paddle.fluid.layers.uniform_random
-#: paddle.fluid.layers.uniform_random_batch_size_like
-#: paddle.fluid.layers.warpctc paddle.fluid.layers.zeros
-msgid "参数"
-msgstr ""
-
-#: of paddle.fluid.layers.merge_lod_tensor:12
-#: paddle.fluid.layers.split_lod_tensor:8
-msgid ""
-"The input tensor that contains complete lod information needed to "
-"construct the output."
-msgstr ""
-
-#: of paddle.fluid.layers.merge_lod_tensor:15
-#: paddle.fluid.layers.split_lod_tensor:11
-msgid "A bool column vector which masks the input."
-msgstr ""
-
-#: of paddle.fluid.layers.merge_lod_tensor:17
-#: paddle.fluid.layers.split_lod_tensor:13
-msgid "The specific lod level to rank."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.memory
-#: paddle.fluid.layers.DynamicRNN.output
-#: paddle.fluid.layers.DynamicRNN.static_input
-#: paddle.fluid.layers.DynamicRNN.step_input
-#: paddle.fluid.layers.DynamicRNN.update_memory paddle.fluid.layers.Print
-#: paddle.fluid.layers.abs paddle.fluid.layers.argmax
-#: paddle.fluid.layers.argmin paddle.fluid.layers.array_length
-#: paddle.fluid.layers.array_read paddle.fluid.layers.array_to_lod_tensor
-#: paddle.fluid.layers.array_write paddle.fluid.layers.assign
-#: paddle.fluid.layers.autoincreased_step_counter
-#: paddle.fluid.layers.batch_norm paddle.fluid.layers.beam_search
-#: paddle.fluid.layers.beam_search_decode paddle.fluid.layers.bipartite_match
-#: paddle.fluid.layers.box_coder paddle.fluid.layers.brelu
-#: paddle.fluid.layers.ceil paddle.fluid.layers.chunk_eval
-#: paddle.fluid.layers.clip paddle.fluid.layers.clip_by_norm
-#: paddle.fluid.layers.concat paddle.fluid.layers.conv2d
-#: paddle.fluid.layers.conv2d_transpose paddle.fluid.layers.conv3d
-#: paddle.fluid.layers.conv3d_transpose paddle.fluid.layers.cos
-#: paddle.fluid.layers.cos_sim paddle.fluid.layers.create_array
-#: paddle.fluid.layers.create_global_var paddle.fluid.layers.create_parameter
-#: paddle.fluid.layers.crf_decoding paddle.fluid.layers.cross_entropy
-#: paddle.fluid.layers.ctc_greedy_decoder paddle.fluid.layers.cumsum
-#: paddle.fluid.layers.data paddle.fluid.layers.detection_map
-#: paddle.fluid.layers.detection_output paddle.fluid.layers.dice_loss
-#: paddle.fluid.layers.double_buffer paddle.fluid.layers.dropout
-#: paddle.fluid.layers.dynamic_gru paddle.fluid.layers.dynamic_lstm
-#: paddle.fluid.layers.dynamic_lstmp paddle.fluid.layers.edit_distance
-#: paddle.fluid.layers.elementwise_add paddle.fluid.layers.elementwise_div
-#: paddle.fluid.layers.elementwise_max paddle.fluid.layers.elementwise_min
-#: paddle.fluid.layers.elementwise_mul paddle.fluid.layers.elementwise_pow
-#: paddle.fluid.layers.elementwise_sub paddle.fluid.layers.elu
-#: paddle.fluid.layers.embedding paddle.fluid.layers.equal
-#: paddle.fluid.layers.exp paddle.fluid.layers.exponential_decay
-#: paddle.fluid.layers.fc paddle.fluid.layers.fill_constant
-#: paddle.fluid.layers.fill_constant_batch_size_like paddle.fluid.layers.floor
-#: paddle.fluid.layers.gather paddle.fluid.layers.gaussian_random
-#: paddle.fluid.layers.gaussian_random_batch_size_like
-#: paddle.fluid.layers.get_places paddle.fluid.layers.gru_unit
-#: paddle.fluid.layers.hard_shrink paddle.fluid.layers.hard_sigmoid
-#: paddle.fluid.layers.im2sequence paddle.fluid.layers.image_resize
-#: paddle.fluid.layers.image_resize_short paddle.fluid.layers.increment
-#: paddle.fluid.layers.inverse_time_decay paddle.fluid.layers.iou_similarity
-#: paddle.fluid.layers.is_empty paddle.fluid.layers.label_smooth
-#: paddle.fluid.layers.layer_norm paddle.fluid.layers.leaky_relu
-#: paddle.fluid.layers.less_than paddle.fluid.layers.linear_chain_crf
-#: paddle.fluid.layers.load paddle.fluid.layers.lod_rank_table
-#: paddle.fluid.layers.lod_reset paddle.fluid.layers.lod_tensor_to_array
-#: paddle.fluid.layers.log paddle.fluid.layers.logical_and
-#: paddle.fluid.layers.logical_not paddle.fluid.layers.logical_or
-#: paddle.fluid.layers.logical_xor paddle.fluid.layers.logsigmoid
-#: paddle.fluid.layers.lstm_unit paddle.fluid.layers.matmul
-#: paddle.fluid.layers.max_sequence_len paddle.fluid.layers.maxout
-#: paddle.fluid.layers.mean paddle.fluid.layers.mean_iou
-#: paddle.fluid.layers.merge_lod_tensor paddle.fluid.layers.mul
-#: paddle.fluid.layers.multi_box_head paddle.fluid.layers.multiplex
-#: paddle.fluid.layers.natural_exp_decay paddle.fluid.layers.nce
-#: paddle.fluid.layers.noam_decay paddle.fluid.layers.one_hot
-#: paddle.fluid.layers.ones paddle.fluid.layers.open_files
-#: paddle.fluid.layers.open_recordio_file paddle.fluid.layers.pad
-#: paddle.fluid.layers.polygon_box_transform
-#: paddle.fluid.layers.polynomial_decay paddle.fluid.layers.pool2d
-#: paddle.fluid.layers.pool3d paddle.fluid.layers.pow
-#: paddle.fluid.layers.prior_box paddle.fluid.layers.random_crop
-#: paddle.fluid.layers.random_data_generator paddle.fluid.layers.reciprocal
-#: paddle.fluid.layers.reduce_max paddle.fluid.layers.reduce_mean
-#: paddle.fluid.layers.reduce_min paddle.fluid.layers.reduce_prod
-#: paddle.fluid.layers.reduce_sum paddle.fluid.layers.relu
-#: paddle.fluid.layers.relu6 paddle.fluid.layers.reorder_lod_tensor_by_rank
-#: paddle.fluid.layers.reshape paddle.fluid.layers.resize_bilinear
-#: paddle.fluid.layers.roi_pool paddle.fluid.layers.round
-#: paddle.fluid.layers.row_conv paddle.fluid.layers.scale
-#: paddle.fluid.layers.scatter paddle.fluid.layers.sequence_conv
-#: paddle.fluid.layers.sequence_expand paddle.fluid.layers.sequence_first_step
-#: paddle.fluid.layers.sequence_last_step paddle.fluid.layers.sequence_pool
-#: paddle.fluid.layers.sequence_reshape paddle.fluid.layers.shape
-#: paddle.fluid.layers.shrink_memory paddle.fluid.layers.sigmoid
-#: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
-#: paddle.fluid.layers.sin paddle.fluid.layers.slice
-#: paddle.fluid.layers.smooth_l1 paddle.fluid.layers.soft_relu
-#: paddle.fluid.layers.softmax_with_cross_entropy paddle.fluid.layers.softplus
-#: paddle.fluid.layers.softshrink paddle.fluid.layers.softsign
-#: paddle.fluid.layers.split paddle.fluid.layers.split_lod_tensor
-#: paddle.fluid.layers.sqrt paddle.fluid.layers.square
-#: paddle.fluid.layers.square_error_cost paddle.fluid.layers.ssd_loss
-#: paddle.fluid.layers.stanh paddle.fluid.layers.sum paddle.fluid.layers.sums
-#: paddle.fluid.layers.swish paddle.fluid.layers.tanh
-#: paddle.fluid.layers.tanh_shrink paddle.fluid.layers.target_assign
-#: paddle.fluid.layers.thresholded_relu paddle.fluid.layers.topk
-#: paddle.fluid.layers.transpose paddle.fluid.layers.uniform_random
-#: paddle.fluid.layers.uniform_random_batch_size_like
-#: paddle.fluid.layers.warpctc paddle.fluid.layers.zeros
-msgid "返回"
-msgstr ""
-
-#: of paddle.fluid.layers.split_lod_tensor:16
-msgid ""
-"The true branch of tensor as per the mask applied to input. Variable: The"
-" false branch of tensor as per the mask applied to input."
-msgstr ""
-
-#: of paddle.fluid.layers.Print paddle.fluid.layers.argmax
-#: paddle.fluid.layers.argmin paddle.fluid.layers.array_length
-#: paddle.fluid.layers.array_read paddle.fluid.layers.array_to_lod_tensor
-#: paddle.fluid.layers.array_write paddle.fluid.layers.assign
-#: paddle.fluid.layers.autoincreased_step_counter
-#: paddle.fluid.layers.batch_norm paddle.fluid.layers.beam_search
-#: paddle.fluid.layers.beam_search_decode paddle.fluid.layers.bipartite_match
-#: paddle.fluid.layers.chunk_eval paddle.fluid.layers.concat
-#: paddle.fluid.layers.conv2d paddle.fluid.layers.conv2d_transpose
-#: paddle.fluid.layers.conv3d paddle.fluid.layers.conv3d_transpose
-#: paddle.fluid.layers.cos_sim paddle.fluid.layers.create_array
-#: paddle.fluid.layers.create_global_var paddle.fluid.layers.ctc_greedy_decoder
-#: paddle.fluid.layers.data paddle.fluid.layers.detection_output
-#: paddle.fluid.layers.dice_loss paddle.fluid.layers.dropout
-#: paddle.fluid.layers.dynamic_gru paddle.fluid.layers.dynamic_lstm
-#: paddle.fluid.layers.dynamic_lstmp paddle.fluid.layers.edit_distance
-#: paddle.fluid.layers.embedding paddle.fluid.layers.equal
-#: paddle.fluid.layers.fill_constant paddle.fluid.layers.gather
-#: paddle.fluid.layers.gru_unit paddle.fluid.layers.im2sequence
-#: paddle.fluid.layers.image_resize paddle.fluid.layers.image_resize_short
-#: paddle.fluid.layers.increment paddle.fluid.layers.is_empty
-#: paddle.fluid.layers.label_smooth paddle.fluid.layers.linear_chain_crf
-#: paddle.fluid.layers.lod_rank_table paddle.fluid.layers.lod_reset
-#: paddle.fluid.layers.lod_tensor_to_array paddle.fluid.layers.lstm_unit
-#: paddle.fluid.layers.matmul paddle.fluid.layers.mean_iou
-#: paddle.fluid.layers.merge_lod_tensor paddle.fluid.layers.multi_box_head
-#: paddle.fluid.layers.nce paddle.fluid.layers.ones
-#: paddle.fluid.layers.open_files paddle.fluid.layers.open_recordio_file
-#: paddle.fluid.layers.pad paddle.fluid.layers.pool2d
-#: paddle.fluid.layers.pool3d paddle.fluid.layers.prior_box
-#: paddle.fluid.layers.random_data_generator paddle.fluid.layers.reduce_max
-#: paddle.fluid.layers.reduce_mean paddle.fluid.layers.reduce_min
-#: paddle.fluid.layers.reduce_prod paddle.fluid.layers.reduce_sum
-#: paddle.fluid.layers.reshape paddle.fluid.layers.roi_pool
-#: paddle.fluid.layers.sequence_conv paddle.fluid.layers.sequence_expand
-#: paddle.fluid.layers.sequence_reshape paddle.fluid.layers.smooth_l1
-#: paddle.fluid.layers.softmax_with_cross_entropy paddle.fluid.layers.split
-#: paddle.fluid.layers.split_lod_tensor paddle.fluid.layers.square_error_cost
-#: paddle.fluid.layers.sums paddle.fluid.layers.target_assign
-#: paddle.fluid.layers.topk paddle.fluid.layers.transpose
-#: paddle.fluid.layers.warpctc paddle.fluid.layers.zeros
-msgid "返回类型"
-msgstr ""
-
-#: of paddle.fluid.layers.IfElse:8 paddle.fluid.layers.Preprocessor:8
-#: paddle.fluid.layers.Print:34 paddle.fluid.layers.While:8
-#: paddle.fluid.layers.argmax:15 paddle.fluid.layers.argmin:15
-#: paddle.fluid.layers.array_length:11 paddle.fluid.layers.array_read:12
-#: paddle.fluid.layers.array_to_lod_tensor:16
-#: paddle.fluid.layers.array_write:21 paddle.fluid.layers.assign:13
-#: paddle.fluid.layers.bipartite_match:60 paddle.fluid.layers.concat:17
-#: paddle.fluid.layers.conv2d:99 paddle.fluid.layers.conv2d_transpose:98
-#: paddle.fluid.layers.conv3d:94 paddle.fluid.layers.conv3d_transpose:99
-#: paddle.fluid.layers.create_array:10 paddle.fluid.layers.create_parameter:23
-#: paddle.fluid.layers.cross_entropy:55
-#: paddle.fluid.layers.ctc_greedy_decoder:51 paddle.fluid.layers.cumsum:16
-#: paddle.fluid.layers.data:29 paddle.fluid.layers.detection_output:65
-#: paddle.fluid.layers.dice_loss:26 paddle.fluid.layers.double_buffer:15
-#: paddle.fluid.layers.dropout:26 paddle.fluid.layers.dynamic_gru:63
-#: paddle.fluid.layers.dynamic_lstm:104 paddle.fluid.layers.dynamic_lstmp:113
-#: paddle.fluid.layers.edit_distance:38 paddle.fluid.layers.embedding:34
-#: paddle.fluid.layers.equal:15 paddle.fluid.layers.fc:62
-#: paddle.fluid.layers.fill_constant:22 paddle.fluid.layers.gather:33
-#: paddle.fluid.layers.gru_unit:44 paddle.fluid.layers.hard_shrink:18
-#: paddle.fluid.layers.image_resize:35 paddle.fluid.layers.increment:18
-#: paddle.fluid.layers.is_empty:17 paddle.fluid.layers.label_smooth:42
-#: paddle.fluid.layers.layer_norm:53 paddle.fluid.layers.lod_rank_table:40
-#: paddle.fluid.layers.lod_reset:70 paddle.fluid.layers.lod_tensor_to_array:16
-#: paddle.fluid.layers.lstm_unit:64 paddle.fluid.layers.matmul:43
-#: paddle.fluid.layers.mean_iou:25 paddle.fluid.layers.merge_lod_tensor:23
-#: paddle.fluid.layers.multi_box_head:81 paddle.fluid.layers.one_hot:12
-#: paddle.fluid.layers.ones:16 paddle.fluid.layers.open_files:28
-#: paddle.fluid.layers.open_recordio_file:24 paddle.fluid.layers.pad:42
-#: paddle.fluid.layers.prior_box:53 paddle.fluid.layers.random_crop:5
-#: paddle.fluid.layers.random_data_generator:24
-#: paddle.fluid.layers.reduce_max:22 paddle.fluid.layers.reduce_mean:22
-#: paddle.fluid.layers.reduce_min:22 paddle.fluid.layers.reduce_prod:22
-#: paddle.fluid.layers.reduce_sum:22 paddle.fluid.layers.reshape:61
-#: paddle.fluid.layers.roi_pool:34 paddle.fluid.layers.row_conv:40
-#: paddle.fluid.layers.sequence_expand:53
-#: paddle.fluid.layers.sequence_first_step:20
-#: paddle.fluid.layers.sequence_last_step:20
-#: paddle.fluid.layers.sequence_pool:40 paddle.fluid.layers.sequence_reshape:37
-#: paddle.fluid.layers.shrink_memory:19 paddle.fluid.layers.smooth_l1:34
-#: paddle.fluid.layers.softmax_with_cross_entropy:47
-#: paddle.fluid.layers.split:22 paddle.fluid.layers.split_lod_tensor:20
-#: paddle.fluid.layers.square_error_cost:26 paddle.fluid.layers.ssd_loss:95
-#: paddle.fluid.layers.sums:14 paddle.fluid.layers.thresholded_relu:17
-#: paddle.fluid.layers.topk:28 paddle.fluid.layers.transpose:18
-#: paddle.fluid.layers.uniform_random:18 paddle.fluid.layers.warpctc:34
-#: paddle.fluid.layers.zeros:16
-msgid "Examples"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:22
-msgid "merge_lod_tensor"
-msgstr ""
-
-#: of paddle.fluid.layers.merge_lod_tensor:1
-msgid "**merge_lod_tensor**"
-msgstr ""
-
-#: of paddle.fluid.layers.merge_lod_tensor:3
-msgid ""
-"This function takes in an input :math:`x`, the True branch, the False "
-"branch and a binary :math:`mask`. Using this information, this function "
-"merges the True and False branches of the tensor into a single Output at "
-"a certain lod level indiacted by :math:`level`."
-msgstr ""
-
-#: of paddle.fluid.layers.merge_lod_tensor:8
-msgid "The True branch to be merged."
-msgstr ""
-
-#: of paddle.fluid.layers.merge_lod_tensor:10
-msgid "The False branch to be merged."
-msgstr ""
-
-#: of paddle.fluid.layers.merge_lod_tensor:20
-msgid "The merged output tensor."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:30
-msgid "BlockGuard"
-msgstr ""
-
-#: of paddle.fluid.layers.BlockGuard:1
-msgid "BlockGuard class."
-msgstr ""
-
-#: of paddle.fluid.layers.BlockGuard:3
-msgid ""
-"BlockGuard class is used to create a sub-block in a program by using the "
-"Python `with` keyword."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:39
-msgid "BlockGuardWithCompletion"
-msgstr ""
-
-#: of paddle.fluid.layers.BlockGuardWithCompletion:1
-msgid "BlockGuardWithCompletion class."
-msgstr ""
-
-#: of paddle.fluid.layers.BlockGuardWithCompletion:3
-msgid ""
-"BlockGuardWithCompletion class is used to create an op with a block in a "
-"program."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:48
-msgid "StaticRNNMemoryLink"
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNNMemoryLink:1
-msgid "StaticRNNMemoryLink class."
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNNMemoryLink:3
-msgid "the initial variable for Memory"
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNNMemoryLink:4
-#: paddle.fluid.layers.StaticRNNMemoryLink:6
-#: paddle.fluid.layers.StaticRNNMemoryLink:8
-msgid "Variable"
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNNMemoryLink:5
-msgid "the memory variable in previous time step"
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNNMemoryLink:7
-msgid "the memory variable in current time step"
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNNMemoryLink:10
-msgid ""
-"StaticRNNMemoryLink class is used to create a link between two memory "
-"cells of a StaticRNN."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:57
-msgid "WhileGuard"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:66
-msgid "While"
-msgstr ""
-
-#: of paddle.fluid.layers.While:1
-msgid "while loop control flow."
-msgstr ""
-
-#: of paddle.fluid.layers.IfElse:3 paddle.fluid.layers.While:3
-msgid "condition used to compare."
-msgstr ""
-
-#: of paddle.fluid.layers.IfElse:5 paddle.fluid.layers.While:5
-#: paddle.fluid.layers.fc:55
-msgid "The name of this layer."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:75
-msgid "Switch"
-msgstr ""
-
-#: of paddle.fluid.layers.Switch.case:1
-msgid "create a new block for this condition"
-msgstr ""
-
-#: of paddle.fluid.layers.Switch.default:1
-msgid "create a default case for this switch"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:84
-msgid "lod_rank_table"
-msgstr ""
-
-#: of paddle.fluid.layers.lod_rank_table:1
-msgid ""
-"LoD Rank Table Operator. Given an input variable **x** and a level number"
-" of LoD, this layer creates a LodRankTable object. A LoDRankTable object "
-"contains a list of bi-element tuples. Each tuple consists of an index and"
-" a length, both of which are int type. Refering to specified level of "
-"LoD, the index is the sequence index number and the length representes "
-"the sequence length. Please note that the list is ranked in descending "
-"order by the length. The following is an example:"
-msgstr ""
-
-#: of paddle.fluid.layers.lod_rank_table:30
-msgid "Input variable, a LoDTensor based which to create the lod rank table."
-msgstr ""
-
-#: of paddle.fluid.layers.lod_rank_table:33
-msgid "Specify the LoD level, on which to create the lod rank table."
-msgstr ""
-
-#: of paddle.fluid.layers.lod_rank_table:37
-msgid "The created LoDRankTable object."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:92
-msgid "max_sequence_len"
-msgstr ""
-
-#: of paddle.fluid.layers.max_sequence_len:1
-msgid ""
-"Given a LoDRankTable object, this layer returns the max length of a batch"
-" of sequences. In fact, a LoDRankTable object contains a list of "
-"tuples(<sequence index, sequence length>) and the list is already sorted "
-"by sequence length in descending order, so the operator just returns the "
-"sequence length of the first tuple element"
-msgstr ""
-
-#: of paddle.fluid.layers.max_sequence_len:11
-msgid "Input variable which is a LoDRankTable object."
-msgstr ""
-
-#: of paddle.fluid.layers.max_sequence_len:14
-msgid "The max sequence length."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:100
-msgid "lod_tensor_to_array"
-msgstr ""
-
-#: of paddle.fluid.layers.lod_tensor_to_array:1
-msgid "Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY."
-msgstr ""
-
-#: of paddle.fluid.layers.lod_tensor_to_array:3
-msgid "The LOD tensor to be converted to a LOD tensor array."
-msgstr ""
-
-#: of paddle.fluid.layers.array_to_lod_tensor:5
-#: paddle.fluid.layers.lod_tensor_to_array:5
-msgid ""
-"The variable that stores the level of lod which is ordered by sequence "
-"length in descending order."
-msgstr ""
-
-#: of paddle.fluid.layers.lod_tensor_to_array:10
-msgid ""
-"The variable of type array that has been converted from a           "
-"tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.lod_tensor_to_array:12
-msgid "The variable of type array that has been converted from a"
-msgstr ""
-
-#: of paddle.fluid.layers.lod_tensor_to_array:13
-msgid "tensor."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:108
-msgid "array_to_lod_tensor"
-msgstr ""
-
-#: of paddle.fluid.layers.array_to_lod_tensor:1
-msgid "Convert a LoD_Tensor_Aarry to an LoDTensor."
-msgstr ""
-
-#: of paddle.fluid.layers.array_to_lod_tensor:3
-msgid "The lod tensor array to be converted to a tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.array_to_lod_tensor:10
-msgid ""
-"The variable of type tensor that has been converted           from an "
-"array."
-msgstr ""
-
-#: of paddle.fluid.layers.array_to_lod_tensor:12
-msgid "The variable of type tensor that has been converted"
-msgstr ""
-
-#: of paddle.fluid.layers.array_to_lod_tensor:13
-msgid "from an array."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:116
-msgid "increment"
-msgstr ""
-
-#: of paddle.fluid.layers.increment:1
-msgid ""
-"This function performs an operation that increments each value in the "
-"input :math:`x` by an amount: :math:`value` as mentioned in the input "
-"parameter. This operation is performed in-place by default."
-msgstr ""
-
-#: of paddle.fluid.layers.increment:5
-msgid "The tensor that has the input values."
-msgstr ""
-
-#: of paddle.fluid.layers.increment:7
-msgid "The amount by which the values should be incremented."
-msgstr ""
-
-#: of paddle.fluid.layers.increment:9
-msgid "If the increment should be performed in-place."
-msgstr ""
-
-#: of paddle.fluid.layers.increment:12
-msgid ""
-"The tensor variable storing the transformation of           element-wise "
-"increment of each value in the input."
-msgstr ""
-
-#: of paddle.fluid.layers.increment:14
-msgid "The tensor variable storing the transformation of"
-msgstr ""
-
-#: of paddle.fluid.layers.increment:15
-msgid "element-wise increment of each value in the input."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:124
-msgid "array_write"
-msgstr ""
-
-#: of paddle.fluid.layers.array_write:1
-msgid ""
-"This function writes the given input variable to the specified position "
-"indicating by the arrary index to an output LOD_TENSOR_ARRAY. If the "
-"output LOD_TENSOR_ARRAY is not given(None), a new one will be created and"
-" returned."
-msgstr ""
-
-#: of paddle.fluid.layers.array_write:6
-msgid "The input tensor from which the data will be read."
-msgstr ""
-
-#: of paddle.fluid.layers.array_write:8
-msgid ""
-"The index of the output LOD_TENSOR_ARRAY, pointing to the position to "
-"which the input tensor will be written."
-msgstr ""
-
-#: of paddle.fluid.layers.array_write:12
-msgid ""
-"The output LOD_TENSOR_ARRAY to which the input tensor will be written. If"
-" this parameter is NONE, a new LOD_TENSOR_ARRAY will be created and "
-"returned."
-msgstr ""
-
-#: of paddle.fluid.layers.array_write:18
-msgid "The output LOD_TENSOR_ARRAY where the input tensor is written."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:132
-msgid "create_array"
-msgstr ""
-
-#: of paddle.fluid.layers.create_array:1
-msgid ""
-"This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the"
-" LayerHelper."
-msgstr ""
-
-#: of paddle.fluid.layers.create_array:4
-msgid "The data type of the elements in the array."
-msgstr ""
-
-#: of paddle.fluid.layers.create_array:7
-msgid "The tensor variable storing the elements of data type."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:140
-msgid "less_than"
-msgstr ""
-
-#: of paddle.fluid.layers.less_than:1
-msgid ""
-"It operates element-wise on X and Y, and returns the Out. Each of them is"
-" a N-dim tensor. X and Y could be any type.  The each element of the Out "
-"tensor is calculated by :math:`Out = X < Y`"
-msgstr ""
-
-#: of paddle.fluid.layers.less_than:8
-msgid "the left hand operand of less_than operator."
-msgstr ""
-
-#: of paddle.fluid.layers.less_than:10
-msgid "the right hand operand of less_than operator."
-msgstr ""
-
-#: of paddle.fluid.layers.less_than:12
-msgid ""
-"Force fill output variable to cpu memory. Otherwise, fill output variable"
-" to the running device [default true]."
-msgstr ""
-
-#: of paddle.fluid.layers.less_than:14
-msgid "Optional output variable to store the result of *less_than*"
-msgstr ""
-
-#: of paddle.fluid.layers.less_than:17
-msgid "n-dim bool tensor. Each element is Out = X < Y."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:148
-msgid "equal"
-msgstr ""
-
-#: of paddle.fluid.layers.equal:1
-msgid "**equal**"
-msgstr ""
-
-#: of paddle.fluid.layers.equal:3
-msgid "This layer returns the truth value of :math:`x == y` elementwise."
-msgstr ""
-
-#: of paddle.fluid.layers.equal:5
-msgid "First operand of *equal*"
-msgstr ""
-
-#: of paddle.fluid.layers.equal:7
-msgid "Second operand of *equal*"
-msgstr ""
-
-#: of paddle.fluid.layers.equal:9
-msgid "Optional output variable to store the result of *equal*"
-msgstr ""
-
-#: of paddle.fluid.layers.equal:12
-msgid "The tensor variable storing the output of *equal*."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:156
-msgid "array_read"
-msgstr ""
-
-#: of paddle.fluid.layers.array_read:1
-msgid ""
-"This function performs the operation to read the data in as an "
-"LOD_TENSOR_ARRAY. :param array: The input tensor that will be written to "
-"an array. :type array: Variable|list :param i: The subscript index in "
-"tensor array, that points the"
-msgstr ""
-
-#: of paddle.fluid.layers.array_read:6
-msgid "place where data will be written to."
-msgstr ""
-
-#: of paddle.fluid.layers.array_read:9
-msgid "The tensor type variable that has the data written to it."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:164
-msgid "shrink_memory"
-msgstr ""
-
-#: of paddle.fluid.layers.shrink_memory:1
-msgid ""
-"This function creates an operator to shrink rnn memory using the "
-"RankTable as mentioned in the input parameter."
-msgstr ""
-
-#: of paddle.fluid.layers.shrink_memory:4
-msgid "NOTE: This API is very low-level API. It is used by DynamicRNN only."
-msgstr ""
-
-#: of paddle.fluid.layers.shrink_memory:6
-msgid ""
-"Since the Dynamic RNN uses no-padding way to implement RNN. The sequence "
-"will be sorted by order, and the length of valid memory will be shrink "
-"after each time step."
-msgstr ""
-
-#: of paddle.fluid.layers.shrink_memory:10
-msgid "The memory object in the previous time step."
-msgstr ""
-
-#: of paddle.fluid.layers.shrink_memory:12
-msgid "The step count variable. A int scalar as LoDTensor."
-msgstr ""
-
-#: of paddle.fluid.layers.shrink_memory:14
-msgid "The RNNRankTable object."
-msgstr ""
-
-#: of paddle.fluid.layers.shrink_memory:17
-msgid "the memory variable after shrink."
-msgstr ""
-
-#: of paddle.fluid.layers.shrink_memory:21
-msgid ""
-"Since this API is very low level API. The example is not provided. Please"
-" reference the implementation of class DynamicRNN for detail usage."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:172
-msgid "array_length"
-msgstr ""
-
-#: of paddle.fluid.layers.array_length:1
-msgid ""
-"This function performs the operation to find the length of the input "
-"LOD_TENSOR_ARRAY."
-msgstr ""
-
-#: of paddle.fluid.layers.array_length:4
-msgid "The input array that will be used to compute the length."
-msgstr ""
-
-#: of paddle.fluid.layers.array_length:8
-msgid "The length of the input LoDTensorArray."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:180
-msgid "IfElse"
-msgstr ""
-
-#: of paddle.fluid.layers.IfElse:1
-msgid "if-else control flow."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:189
-msgid "DynamicRNN"
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN:1
-msgid ""
-"The dynamic RNN can process a batch of sequence data. The length of each "
-"sample sequence can be different. This API automatically process them in "
-"batch."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN:5
-msgid "The input lod must be set. Please reference `lod_tensor`"
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN:23
-msgid ""
-"The dynamic RNN will unfold sequence into timesteps. Users need to define"
-" how to process each time step during the :code:`with` block."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN:26
-msgid ""
-"The `memory` is used staging data cross time step. The initial value of "
-"memory can be zero or another variable."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN:29
-msgid ""
-"The dynamic RNN can mark multiple variables as its output. Use `drnn()` "
-"to get the output sequence."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.block:1
-msgid ""
-"The block for user to define operators in RNN. See the class docstring "
-"for more details."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.memory:1
-msgid "Create a memory variable for dynamic rnn."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.memory:3
-msgid ""
-"If the :code:`init` is not None, :code:`memory` will be initialized by "
-"this variable. The :code:`need_reorder` is used to reorder the memory as "
-"the input variable. It should be set to true when the initialized memory "
-"depends on the input sample."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.memory:8
-#: paddle.fluid.layers.DynamicRNN.memory:30
-msgid "For example,"
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.memory:27
-msgid ""
-"Otherwise, if :code:`shape`, :code:`value`, :code:`dtype` are set, the "
-":code:`memory` will be initialized by this :code:`value`."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.memory:47
-msgid "The initialized variable."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.memory:49
-msgid "The memory shape. NOTE the shape does not contain"
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.memory:52
-msgid "the initalized value."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.memory:54
-msgid "True if the initialized memory depends on the"
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.memory:57
-msgid "The data type of the initialized memory."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.memory:60
-msgid "the memory variable."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.output:1
-msgid "mark the RNN output variables."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.output:3
-msgid "The output variables."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.output:5
-#: paddle.fluid.layers.DynamicRNN.update_memory:8 paddle.fluid.layers.load:14
-msgid "None"
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.static_input:1
-msgid ""
-"Mark a variable as a RNN input. The input will not be scattered into time"
-" steps. :param x: The input variable. :type x: Variable"
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.static_input:6
-msgid "The input variable that can access in RNN."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.step_input:1
-msgid ""
-"Mark a sequence as a dynamic RNN input. :param x: The input sequence. "
-":type x: Variable"
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.step_input:5
-msgid "The current timestep in the input sequence."
-msgstr ""
-
-#: of paddle.fluid.layers.DynamicRNN.update_memory:1
-msgid ""
-"Update the memory from ex_mem to new_mem. NOTE that the shape and data "
-"type of :code:`ex_mem` and :code:`new_mem` must be same. :param ex_mem: "
-"the memory variable. :type ex_mem: Variable :param new_mem: the plain "
-"variable generated in RNN block. :type new_mem: Variable"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:198
-msgid "ConditionalBlock"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:207
-msgid "StaticRNN"
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNN:1
-msgid "StaticRNN class."
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNN:3
-msgid ""
-"StaticRNN class is used to create a StaticRNN. The RNN will have its own "
-"parameters like inputs, outputs, memories, status and length."
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNN.memory:1
-msgid "boot memory, if not set, a shape, batch_ref must be provided"
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNN.memory:2
-msgid "shape of the boot memory"
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNN.memory:3
-msgid "batch size reference variable"
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNN.memory:4
-msgid "the init value of boot memory"
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNN.memory:5
-msgid "the index of batch size in init's dimension"
-msgstr ""
-
-#: of paddle.fluid.layers.StaticRNN.memory:6
-msgid "the index of batch size in batch_ref's dimension"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:216
-msgid "reorder_lod_tensor_by_rank"
-msgstr ""
-
-#: of paddle.fluid.layers.reorder_lod_tensor_by_rank:1
-msgid "ReorderLoDTensorByRankTable operator."
-msgstr ""
-
-#: of paddle.fluid.layers.reorder_lod_tensor_by_rank:3
-msgid ""
-"Input(X) is a batch of sequences. Input(RankTable) stores new orders of "
-"the input sequence batch. The reorder_lod_tensor_by_rank operator "
-"reorders the Input(X) according to the information provided by "
-"Input(RankTable)."
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:23
-#: paddle.fluid.layers.elementwise_div:23
-#: paddle.fluid.layers.elementwise_max:23
-#: paddle.fluid.layers.elementwise_min:23
-#: paddle.fluid.layers.elementwise_mul:23
-#: paddle.fluid.layers.elementwise_pow:23
-#: paddle.fluid.layers.elementwise_sub:23
-#: paddle.fluid.layers.reorder_lod_tensor_by_rank:7
-msgid "For example:"
-msgstr ""
-
-#: of paddle.fluid.layers.reorder_lod_tensor_by_rank:9
-msgid ""
-"If the indices stored in the Input(RankTable) are [3, 0, 2, 1], the "
-"Input(X) will be reordered that the fourth sequence in Input(X) will "
-"become the first one, and then followed by the original first, third, and"
-" the second one."
-msgstr ""
-
-#: of paddle.fluid.layers.reorder_lod_tensor_by_rank:13
-msgid ""
-"This is: X = [Seq0, Seq1, Seq2, Seq3]. The indices in RankTable are [3, "
-"0, 2, 1]. Out =  [Seq3, Seq0, Seq2, Seq1] with a new LoD information."
-msgstr ""
-
-#: of paddle.fluid.layers.reorder_lod_tensor_by_rank:17
-msgid ""
-"If the LoD information of Input(X) is empty, this means Input(X) is not "
-"sequence data. This is also identical to a batch of sequences where each "
-"sequence has a fixed length 1. In this case, the "
-"reorder_lod_tensor_by_rank operator reorders each slice of Input(X) along"
-" the first axis according to Input(RankTable)."
-msgstr ""
-
-#: of paddle.fluid.layers.reorder_lod_tensor_by_rank:22
-msgid ""
-"This is: X = [Slice0, Slice1, Slice2, Slice3] and its LoD information is "
-"empty. The indices in RankTable are [3, 0, 2, 1]. Out = [Slice3, Slice0, "
-"Slice2, Slice1] with no LoD information is appended."
-msgstr ""
-
-#: of paddle.fluid.layers.reorder_lod_tensor_by_rank:27
-msgid ""
-"NOTE: This operator sorts Input(X) according to a given LoDRankTable "
-"which does not need to be calculated according to Input(X). It can be "
-"calculated according to another different sequence, and then this "
-"operator sorts Input(X) according to the given LoDRankTable."
-msgstr ""
-
-#: of paddle.fluid.layers.reorder_lod_tensor_by_rank:33
-msgid ""
-"(LoDTensor), the input lod tensor to be reordered according to "
-"Input(RankTable). Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.reorder_lod_tensor_by_rank:35
-msgid ""
-"(LoDRankTable), the rank table according to which Input(X) is reordered. "
-"Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.reorder_lod_tensor_by_rank:38
-msgid "(LoDTensor), the reordered lod tensor."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:224
-msgid "ParallelDo"
-msgstr ""
-
-#: of paddle.fluid.layers.ParallelDo:1
-msgid "ParallelDo class."
-msgstr ""
-
-#: of paddle.fluid.layers.ParallelDo:3
-msgid "ParallelDo class is used to create a ParallelDo."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:233
-msgid "Print"
-msgstr ""
-
-#: of paddle.fluid.layers.Print:1
-msgid "**Print operator**"
-msgstr ""
-
-#: of paddle.fluid.layers.Print:3
-msgid "This creates a print op that will print when a tensor is accessed."
-msgstr ""
-
-#: of paddle.fluid.layers.Print:5
-msgid ""
-"Wraps the tensor passed in so that whenever that a tensor is accessed, "
-"the message `message` is printed, along with the current value of the "
-"tensor `t`."
-msgstr ""
-
-#: of paddle.fluid.layers.Print:9
-msgid "A Tensor to print."
-msgstr ""
-
-#: of paddle.fluid.layers.Print:11
-msgid ""
-"Print this number of elements in the tensor, will print all if left is "
-"negative."
-msgstr ""
-
-#: of paddle.fluid.layers.Print:14
-msgid "A string message to print as a prefix."
-msgstr ""
-
-#: of paddle.fluid.layers.Print:16
-msgid "Only log `first_n` number of times."
-msgstr ""
-
-#: of paddle.fluid.layers.Print:18
-msgid "Print the tensor name."
-msgstr ""
-
-#: of paddle.fluid.layers.Print:20
-msgid "Print the tensor type."
-msgstr ""
-
-#: of paddle.fluid.layers.Print:22
-msgid "Print the tensor shape."
-msgstr ""
-
-#: of paddle.fluid.layers.Print:24
-msgid "Print the tensor lod."
-msgstr ""
-
-#: of paddle.fluid.layers.Print:26
-msgid ""
-"Which phase to displace, including 'forward', 'backward' and 'both'. If "
-"set to 'backward' or 'both', will print the gradients of input tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.Print:31
-msgid "Output tensor, same data with input tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.Print:38
-msgid "value = some_layer(...) Print(value, summarize=10,"
-msgstr ""
-
-#: of paddle.fluid.layers.Print:40
-msgid "message=\"The content of some_layer: \")"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:241
-msgid "is_empty"
-msgstr ""
-
-#: of paddle.fluid.layers.is_empty:1
-msgid "**Is Empty**"
-msgstr ""
-
-#: of paddle.fluid.layers.is_empty:3
-msgid "This layer returns the truth value of whether the variable is empty."
-msgstr ""
-
-#: of paddle.fluid.layers.is_empty:5
-msgid "Operand of *is_empty*"
-msgstr ""
-
-#: of paddle.fluid.layers.is_empty:7
-msgid "Optional output variable to store the result of *is_empty*"
-msgstr ""
-
-#: of paddle.fluid.layers.is_empty:11
-msgid "The tensor variable storing the output of *is_empty*."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d paddle.fluid.layers.conv2d_transpose
-#: paddle.fluid.layers.conv3d paddle.fluid.layers.conv3d_transpose
-#: paddle.fluid.layers.cross_entropy paddle.fluid.layers.fc
-#: paddle.fluid.layers.is_empty paddle.fluid.layers.lod_reset
-#: paddle.fluid.layers.lstm_unit paddle.fluid.layers.ssd_loss
-msgid "raises"
-msgstr ""
-
-#: of paddle.fluid.layers.is_empty:14
-msgid ""
-":exc:`TypeError` -- If input cond is not a variable, or cond's dtype is "
-"not bool"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:247
-msgid "device"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:252
-msgid "get_places"
-msgstr ""
-
-#: of paddle.fluid.layers.get_places:1
-msgid ""
-"Returns a list of places based on arguments. The list will be used for "
-"parallel execution."
-msgstr ""
-
-#: of paddle.fluid.layers.get_places:4
-msgid "device count"
-msgstr ""
-
-#: of paddle.fluid.layers.get_places:6
-msgid "device type"
-msgstr ""
-
-#: of paddle.fluid.layers.get_places:9
-msgid "vector of Place"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:258
-msgid "io"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:263
-msgid "data"
-msgstr ""
-
-#: of paddle.fluid.layers.data:1
-msgid "**Data Layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.data:3
-msgid ""
-"This function takes in the input and based on whether data has to be "
-"returned back as a minibatch, it creates the global variable by using the"
-" helper functions. The global variables can be accessed by all the "
-"following operators in the graph."
-msgstr ""
-
-#: of paddle.fluid.layers.data:8
-msgid ""
-"All the input variables of this function are passed in as local variables"
-" to the LayerHelper constructor."
-msgstr ""
-
-#: of paddle.fluid.layers.data:11
-msgid "The name/alias of the function"
-msgstr ""
-
-#: of paddle.fluid.layers.data:13
-msgid "Tuple declaring the shape."
-msgstr ""
-
-#: of paddle.fluid.layers.data:15
-msgid "Whether or not to append the data as a batch."
-msgstr ""
-
-#: of paddle.fluid.layers.data:17 paddle.fluid.layers.embedding:28
-msgid "The type of data : float32, float_16, int etc"
-msgstr ""
-
-#: of paddle.fluid.layers.data:19
-msgid "The output type. By default it is LOD_TENSOR."
-msgstr ""
-
-#: of paddle.fluid.layers.data:21
-msgid "The LoD Level. 0 means the input data is not a sequence."
-msgstr ""
-
-#: of paddle.fluid.layers.data:23
-msgid "A boolean that mentions whether gradient should flow."
-msgstr ""
-
-#: of paddle.fluid.layers.data:26
-msgid "The global variable that gives access to the data."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:271
-msgid "BlockGuardServ"
-msgstr ""
-
-#: of paddle.fluid.layers.BlockGuardServ:1
-msgid "BlockGuardServ class."
-msgstr ""
-
-#: of paddle.fluid.layers.BlockGuardServ:3
-msgid "BlockGuardServ class is used to create an op with a block in a program."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:280
-msgid "ListenAndServ"
-msgstr ""
-
-#: of paddle.fluid.layers.ListenAndServ:1
-msgid "ListenAndServ class."
-msgstr ""
-
-#: of paddle.fluid.layers.ListenAndServ:3
-msgid ""
-"ListenAndServ class is used to wrap listen_and_serv op to create a server"
-" which can receive variables from clients and run a block."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:289
-msgid "Send"
-msgstr ""
-
-#: of paddle.fluid.layers.Send:1
-msgid "Send layer"
-msgstr ""
-
-#: of paddle.fluid.layers.Send:3
-msgid "comma seperated IP:PORT pairs in the order of send_vars to send"
-msgstr ""
-
-#: of paddle.fluid.layers.Send:5
-msgid "vars to send"
-msgstr ""
-
-#: of paddle.fluid.layers.Send:6
-msgid "vars to get from server after send completes."
-msgstr ""
-
-#: of paddle.fluid.layers.Send:8
-msgid ""
-"Send variables to the server side, and get vars from server side when "
-"server have finished running server side program."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:297
-msgid "open_recordio_file"
-msgstr ""
-
-#: of paddle.fluid.layers.open_recordio_file:1
-msgid ""
-"Open a recordio file and return the reader object. The returned reader "
-"object is thread-safe."
-msgstr ""
-
-#: of paddle.fluid.layers.open_recordio_file:3
-msgid ""
-"NOTE: This is a very low-level API. It is used for debugging data file or"
-" training. Please use `open_files` instead of this API for production "
-"usage."
-msgstr ""
-
-#: of paddle.fluid.layers.open_recordio_file:7
-msgid "The filename of record file. This file will given to reader."
-msgstr ""
-
-#: of paddle.fluid.layers.open_files:9 paddle.fluid.layers.open_recordio_file:9
-#: paddle.fluid.layers.random_data_generator:13
-msgid "List of tuples which declaring data shapes."
-msgstr ""
-
-#: of paddle.fluid.layers.open_recordio_file:11
-msgid "The LoD levels of each data."
-msgstr ""
-
-#: of paddle.fluid.layers.open_files:13
-#: paddle.fluid.layers.open_recordio_file:13
-msgid "List of strs which declaring data type."
-msgstr ""
-
-#: of paddle.fluid.layers.open_files:19
-#: paddle.fluid.layers.open_recordio_file:15
-msgid "Number of passes to run."
-msgstr ""
-
-#: of paddle.fluid.layers.open_files:21
-#: paddle.fluid.layers.open_recordio_file:17
-#: paddle.fluid.layers.random_data_generator:17
-msgid "Set it as True if you are going to run subsequent operators in parallel."
-msgstr ""
-
-#: of paddle.fluid.layers.open_recordio_file:21
-msgid "The created random reader."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:305
-msgid "open_files"
-msgstr ""
-
-#: of paddle.fluid.layers.open_files:1
-msgid "Open files"
-msgstr ""
-
-#: of paddle.fluid.layers.open_files:3
-msgid ""
-"This layer takes a list of files to read from and returns a Reader "
-"Variable. Via the Reader Variable, we can get data from given files. All "
-"files must have name suffixs to indicate their formats, e.g., "
-"'*.recordio'."
-msgstr ""
-
-#: of paddle.fluid.layers.open_files:7
-msgid "The list of file names."
-msgstr ""
-
-#: of paddle.fluid.layers.open_files:11
-#: paddle.fluid.layers.random_data_generator:15
-msgid "List of ints which declaring data lod_level."
-msgstr ""
-
-#: of paddle.fluid.layers.open_files:15
-msgid "The maximal concurrent prefetch thread number."
-msgstr ""
-
-#: of paddle.fluid.layers.open_files:17
-msgid "The size of prefetch buffer."
-msgstr ""
-
-#: of paddle.fluid.layers.open_files:25
-msgid "A Reader Variable via which we can get file data."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:313
-msgid "read_file"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:321
-msgid "shuffle"
-msgstr ""
-
-#: of paddle.fluid.layers.shuffle:1
-msgid "Shuffle the reader."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:329
-msgid "batch"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:337
-msgid "double_buffer"
-msgstr ""
-
-#: of paddle.fluid.layers.double_buffer:1
-msgid ""
-"Wrap a double buffer reader. The data will copy to target place with a "
-"double buffer queue. If the target place is None, the place that executor"
-" perform on will be used."
-msgstr ""
-
-#: of paddle.fluid.layers.double_buffer:5
-msgid "the reader variable need to be wrapped."
-msgstr ""
-
-#: of paddle.fluid.layers.double_buffer:7
-msgid "the place of target data. Default is the sample place of executor perform."
-msgstr ""
-
-#: of paddle.fluid.layers.double_buffer:10
-msgid "Variable name. None if the user does not care."
-msgstr ""
-
-#: of paddle.fluid.layers.double_buffer:13
-msgid "wrapped reader with double buffer."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:345
-msgid "random_data_generator"
-msgstr ""
-
-#: of paddle.fluid.layers.random_data_generator:1
-msgid "Create a uniform random data generator"
-msgstr ""
-
-#: of paddle.fluid.layers.random_data_generator:3
-msgid ""
-"This layer returns a Reader Variable. Instead of opening a file and "
-"reading data from it, this Reader Variable generates float uniform random"
-" data by itself. It can be used as a dummy reader to test a network "
-"without opening a real file."
-msgstr ""
-
-#: of paddle.fluid.layers.random_data_generator:9
-msgid "The lower bound of data's uniform distribution."
-msgstr ""
-
-#: of paddle.fluid.layers.random_data_generator:11
-msgid "The upper bound of data's uniform distribution."
-msgstr ""
-
-#: of paddle.fluid.layers.random_data_generator:21
-msgid "A Reader Variable from which we can get random data."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:353
-msgid "Preprocessor"
-msgstr ""
-
-#: of paddle.fluid.layers.Preprocessor:1
-msgid "A block for data pre-processing in reader."
-msgstr ""
-
-#: of paddle.fluid.layers.Preprocessor:3
-msgid "A reader variable."
-msgstr ""
-
-#: of paddle.fluid.layers.Preprocessor:5
-msgid "The name of the reader."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:362
-msgid "load"
-msgstr ""
-
-#: of paddle.fluid.layers.load:1
-msgid "Load operator will load a tensor variable from disk file."
-msgstr ""
-
-#: of paddle.fluid.layers.load:7
-msgid "The tensor need to be loaded."
-msgstr ""
-
-#: of paddle.fluid.layers.load:9
-msgid "Variable will be loaded from \"file_path\"."
-msgstr ""
-
-#: of paddle.fluid.layers.load:11
-msgid ""
-"If true, the tensor will be first loaded and then converted to float16 "
-"data type. Otherwise, the tensor will be directly loaded without data "
-"type conversion. Default is false."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:368
-msgid "nn"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:373
-msgid "fc"
-msgstr ""
-
-#: of paddle.fluid.layers.fc:1
-msgid "**Fully Connected Layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.fc:3
-msgid ""
-"The fully connected layer can take multiple tensors as its inputs. It "
-"creates a variable called weights for each input tensor, which represents"
-" a fully connected weight matrix from each input unit to each output "
-"unit. The fully connected layer multiplies each input tensor with its "
-"coresponding weight to produce an output Tensor. If multiple input "
-"tensors are given, the results of multiple multiplications will be sumed "
-"up. If bias_attr is not None, a bias variable will be created and added "
-"to the output. Finally, if activation is not None, it will be applied to "
-"the output as well."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:12
-msgid "This process can be formulated as follows:"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:18 paddle.fluid.layers.conv3d:18
-#: paddle.fluid.layers.conv3d_transpose:18 paddle.fluid.layers.fc:18
-#: paddle.fluid.layers.lrn:13 paddle.fluid.layers.row_conv:11
-#: paddle.fluid.layers.square_error_cost:12
-msgid "In the above equation:"
-msgstr ""
-
-#: of paddle.fluid.layers.fc:20
-msgid ":math:`N`: Number of the input."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:21
-msgid ":math:`X_i`: The input tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:22
-msgid ":math:`W`: The weights created by this layer."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:23
-msgid ":math:`b`: The bias parameter created by this layer (if needed)."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:24
-msgid ":math:`Act`: The activation function."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:25
-msgid ":math:`Out`: The output tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:27
-msgid ""
-"The input tensor(s) of this layer, and the dimension of the input "
-"tensor(s) is at least 2."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:30
-msgid "The number of output units in this layer."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:32
-msgid ""
-"The fc layer can accept an input tensor with more than two dimensions. If"
-" this happens, the multidimensional tensor will first be flattened into a"
-" 2-dimensional matrix. The parameter `num_flatten_dims` determines how "
-"the input tensor is flattened: the first `num_flatten_dims` (inclusive, "
-"index starts from 1) dimensions will be flatten to form the first "
-"dimension of the final matrix (height of the matrix), and the rest "
-"`rank(X) - num_flatten_dims` dimensions are flattened to form the second "
-"dimension of the final matrix (width of the matrix). For example, suppose"
-" `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and "
-"`num_flatten_dims` = 3. Then, the flattened matrix will have a shape [2 x"
-" 3 x 4, 5 x 6] = [24, 30]."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:42
-msgid "The parameter attribute for learnable parameters/weights of this layer."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:45
-msgid ""
-"The parameter attribute for the bias of this layer. If it is set to None,"
-" no bias will be added to the output units."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:48
-msgid "Activation to be applied to the output of this layer."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:50
-msgid "A flag indicating whether execution is in test phase."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:52
-msgid ""
-"Use mkldnn kernel or not, it is valid only when the mkldnn library is "
-"installed. Default: False"
-msgstr ""
-
-#: of paddle.fluid.layers.fc:58 paddle.fluid.layers.lrn:35
-msgid "A tensor variable storing the transformation result."
-msgstr ""
-
-#: of paddle.fluid.layers.fc:60
-msgid ":exc:`ValueError` -- If rank of the input tensor is less than 2."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:381
-msgid "embedding"
-msgstr ""
-
-#: of paddle.fluid.layers.embedding:1
-msgid "**Embedding Layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.embedding:3
-msgid ""
-"This layer is used to lookup embeddings of IDs, provided by "
-":attr:`input`, in a lookup table. The result of this lookup is the "
-"embedding of each ID in the :attr:`input`."
-msgstr ""
-
-#: of paddle.fluid.layers.embedding:7
-msgid ""
-"All the input variables are passed in as local variables to the "
-"LayerHelper constructor."
-msgstr ""
-
-#: of paddle.fluid.layers.embedding:10
-msgid "The tensor variable containing the IDs."
-msgstr ""
-
-#: of paddle.fluid.layers.embedding:12
-msgid ""
-"The shape of the look up table parameter. It should have two elements "
-"which indicate the size of the dictionary of embeddings and the size of "
-"each embedding vector respectively."
-msgstr ""
-
-#: of paddle.fluid.layers.embedding:16
-msgid "The flag indicating whether to use sparse update."
-msgstr ""
-
-#: of paddle.fluid.layers.embedding:18
-msgid "Whether to run lookup table from remote parameter server."
-msgstr ""
-
-#: of paddle.fluid.layers.embedding:20
-msgid ""
-"If :attr:`None`, it makes no effect to lookup. Otherwise the given "
-":attr:`padding_idx` indicates padding the output with zeros whenever "
-"lookup encounters it in :attr:`input`. If :math:`padding_idx < 0`, the "
-":attr:`padding_idx` to use in lookup is :math:`size[0] + dim`."
-msgstr ""
-
-#: of paddle.fluid.layers.embedding:26
-msgid "Parameters for this layer"
-msgstr ""
-
-#: of paddle.fluid.layers.embedding:31
-msgid ""
-"The tensor variable storing the embeddings of the                   "
-"supplied inputs."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:389
-msgid "dynamic_lstm"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:1
-msgid "**Dynamic LSTM Layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:3
-msgid ""
-"The defalut implementation is diagonal/peephole connection "
-"(https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:20
-msgid ""
-"where the :math:`W` terms denote weight matrices (e.g. :math:`W_{xi}` is "
-"the matrix of weights from the input gate to the input), :math:`W_{ic},"
-"     W_{fc}, W_{oc}` are diagonal weight matrices for peephole "
-"connections. In our implementation, we use vectors to reprenset these "
-"diagonal weight matrices. The :math:`b` terms denote bias vectors "
-"(:math:`b_i` is the input gate bias vector), :math:`\\sigma` is the non-"
-"linear activations, such as logistic sigmoid function, and :math:`i, f, "
-"o` and :math:`c` are the input gate, forget gate, output gate, and cell "
-"activation vectors, respectively, all of which have the same size as the "
-"cell output activation vector :math:`h`."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:29
-msgid ""
-"The :math:`\\odot` is the element-wise product of the vectors. "
-":math:`act_g` and :math:`act_h` are the cell input and cell output "
-"activation functions and `tanh` is usually used for them. "
-":math:`\\tilde{c_t}` is also called candidate hidden state, which is "
-"computed based on the current input and the previous hidden state."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:35 paddle.fluid.layers.dynamic_lstmp:42
-msgid ""
-"Set `use_peepholes` to `False` to disable peephole connection. The "
-"formula is omitted here, please refer to the paper "
-"http://www.bioinf.jku.at/publications/older/2604.pdf for details."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:39
-msgid ""
-"Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, "
-"W_{xo}x_{t}` operations on the input :math:`x_{t}` are NOT included in "
-"this operator. Users can choose to use fully-connect layer before LSTM "
-"layer."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:43
-msgid ""
-"The input of dynamic_lstm layer, which supports variable-time length "
-"input sequence. The underlying tensor in this Variable is a matrix with "
-"shape (T X 4D), where T is the total time steps in this mini-batch, D is "
-"the hidden size."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:49 paddle.fluid.layers.dynamic_lstmp:56
-msgid "4 * hidden size."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:51
-msgid ""
-"The initial hidden state is an optional input, default is zero. This is a"
-" tensor with shape (N x D), where N is the batch size and D is the hidden"
-" size."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:55
-msgid ""
-"The initial cell state is an optional input, default is zero. This is a "
-"tensor with shape (N x D), where N is the batch size. `h_0` and `c_0` can"
-" be NULL but only at the same time."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:59
-msgid ""
-"The parameter attribute for the learnable hidden-hidden weights.  - "
-"Weights = {:math:`W_{ch}, W_{ih},"
-"                                                 W_{fh}, W_{oh}`} - The "
-"shape is (D x 4D), where D is the hidden   size."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:59
-msgid "The parameter attribute for the learnable hidden-hidden weights."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:62
-msgid ""
-"Weights = {:math:`W_{ch}, W_{ih},"
-"                                                 W_{fh}, W_{oh}`}"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:63
-msgid "The shape is (D x 4D), where D is the hidden size."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:66 paddle.fluid.layers.dynamic_lstmp:70
-msgid ""
-"The bias attribute for the learnable bias weights, which contains two "
-"parts, input-hidden bias weights and peephole connections weights if "
-"setting `use_peepholes` to `True`.  1. `use_peepholes = False`   - Biases"
-" = {:math:`b_c, b_i, b_f, b_o`}.   - The shape is (1 x 4D). 2. "
-"`use_peepholes = True`   - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic},"
-"                                                  W_{fc}, W_{oc}`}.   - "
-"The shape is (1 x 7D)."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:66 paddle.fluid.layers.dynamic_lstmp:70
-msgid ""
-"The bias attribute for the learnable bias weights, which contains two "
-"parts, input-hidden bias weights and peephole connections weights if "
-"setting `use_peepholes` to `True`."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:71 paddle.fluid.layers.dynamic_lstmp:75
-msgid "`use_peepholes = False`"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:72 paddle.fluid.layers.dynamic_lstmp:76
-msgid "Biases = {:math:`b_c, b_i, b_f, b_o`}."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:73 paddle.fluid.layers.dynamic_lstmp:77
-msgid "The shape is (1 x 4D)."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:74 paddle.fluid.layers.dynamic_lstmp:78
-msgid "`use_peepholes = True`"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:75 paddle.fluid.layers.dynamic_lstmp:79
-msgid ""
-"Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic},"
-"                                                  W_{fc}, W_{oc}`}."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:76 paddle.fluid.layers.dynamic_lstmp:80
-msgid "The shape is (1 x 7D)."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:78 paddle.fluid.layers.dynamic_lstmp:82
-msgid "Whether to enable diagonal/peephole connections, default `True`."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:81 paddle.fluid.layers.dynamic_lstmp:85
-msgid "Whether to compute reversed LSTM, default `False`."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:83 paddle.fluid.layers.dynamic_lstmp:87
-msgid ""
-"The activation for input gate, forget gate and output gate. Choices = "
-"[\"sigmoid\", \"tanh\", \"relu\", \"identity\"], default \"sigmoid\"."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:87 paddle.fluid.layers.dynamic_lstmp:91
-msgid ""
-"The activation for cell output. Choices = [\"sigmoid\", \"tanh\", "
-"\"relu\", \"identity\"], default \"tanh\"."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:90 paddle.fluid.layers.dynamic_lstmp:94
-msgid ""
-"The activation for candidate hidden state. Choices = [\"sigmoid\", "
-"\"tanh\",     \"relu\", \"identity\"], default \"tanh\"."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:90 paddle.fluid.layers.dynamic_lstmp:94
-msgid ""
-"The activation for candidate hidden state. Choices = [\"sigmoid\", "
-"\"tanh\","
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:92 paddle.fluid.layers.dynamic_lstmp:96
-#: paddle.fluid.layers.dynamic_lstmp:101
-msgid "\"relu\", \"identity\"],"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:93 paddle.fluid.layers.dynamic_lstmp:97
-#: paddle.fluid.layers.dynamic_lstmp:102
-msgid "default \"tanh\"."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:95 paddle.fluid.layers.dynamic_lstmp:104
-msgid "Data type. Choices = [\"float32\", \"float64\"], default \"float32\"."
-msgstr ""
-
-#: of paddle.fluid.layers.concat:10 paddle.fluid.layers.conv2d:89
-#: paddle.fluid.layers.conv2d_transpose:88 paddle.fluid.layers.conv3d:84
-#: paddle.fluid.layers.conv3d_transpose:89 paddle.fluid.layers.dropout:19
-#: paddle.fluid.layers.dynamic_lstm:97 paddle.fluid.layers.dynamic_lstmp:106
-#: paddle.fluid.layers.image_resize:22 paddle.fluid.layers.label_smooth:35
-#: paddle.fluid.layers.lstm_unit:52 paddle.fluid.layers.matmul:36
-#: paddle.fluid.layers.pad:35 paddle.fluid.layers.pool2d:22
-#: paddle.fluid.layers.pool3d:22 paddle.fluid.layers.reduce_max:15
-#: paddle.fluid.layers.reduce_mean:15 paddle.fluid.layers.reduce_min:15
-#: paddle.fluid.layers.reduce_prod:15 paddle.fluid.layers.reduce_sum:15
-#: paddle.fluid.layers.sequence_expand:46 paddle.fluid.layers.split:15
-#: paddle.fluid.layers.topk:16
-msgid ""
-"A name for this layer(optional). If set None, the layer will be named "
-"automatically."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstm:101
-msgid ""
-"The hidden state, and cell state of LSTM. The shape of both         is (T"
-" x D), and lod is the same with the `input`."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:397
-msgid "dynamic_lstmp"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:1
-msgid "**Dynamic LSTMP Layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:3
-msgid ""
-"LSTMP (LSTM with recurrent projection) layer has a separate projection "
-"layer after the LSTM layer, projecting the original hidden state to a "
-"lower-dimensional one, which is proposed to reduce the number of total "
-"parameters and furthermore computational complexity for the LSTM, "
-"espeacially for the case that the size of output units is relative large "
-"(https://research.google.com/pubs/archive/43905.pdf)."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:6 paddle.fluid.layers.dynamic_lstmp:10
-#: paddle.fluid.layers.layer_norm:7 paddle.fluid.layers.lrn:4
-msgid "The formula is as follows:"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:28
-msgid "In the above formula:"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:30
-msgid ""
-":math:`W`: Denotes weight matrices (e.g. :math:`W_{xi}` is           the "
-"matrix of weights from the input gate to the input)."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:31
-msgid ""
-":math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight"
-"           matrices for peephole connections. In our implementation,"
-"           we use vectors to reprenset these diagonal weight matrices."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:32
-msgid ""
-":math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate"
-"           bias vector)."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:33
-msgid ":math:`\\sigma`: The activation, such as logistic sigmoid function."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:34
-msgid ""
-":math:`i, f, o` and :math:`c`: The input gate, forget gate, output"
-"           gate, and cell activation vectors, respectively, all of which "
-"have           the same size as the cell output activation vector "
-":math:`h`."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:35
-msgid ":math:`h`: The hidden state."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:36
-msgid ":math:`r`: The recurrent projection of the hidden state."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:37
-msgid ""
-":math:`\\tilde{c_t}`: The candidate hidden state, whose           "
-"computation is based on the current input and previous hidden state."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:38
-msgid ":math:`\\odot`: The element-wise product of the vectors."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:39
-msgid ""
-":math:`act_g` and :math:`act_h`: The cell input and cell output"
-"           activation functions and `tanh` is usually used for them."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:40
-msgid ""
-":math:`\\overline{act_h}`: The activation function for the projection"
-"           output, usually using `identity` or same as :math:`act_h`."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:46
-msgid ""
-"Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, "
-"W_{xo}x_{t}` operations on the input :math:`x_{t}` are NOT included in "
-"this operator. Users can choose to use fully-connected layer before LSTMP"
-" layer."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:50
-msgid ""
-"The input of dynamic_lstmp layer, which supports variable-time length "
-"input sequence. The underlying tensor in this Variable is a matrix with "
-"shape (T X 4D), where T is the total time steps in this mini-batch, D is "
-"the hidden size."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:58
-msgid "The size of projection output."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:60
-msgid ""
-"The parameter attribute for the learnable hidden-hidden weight and "
-"projection weight.  - Hidden-hidden weight = {:math:`W_{ch}, W_{ih},"
-"                                                 W_{fh}, W_{oh}`}. - The "
-"shape of hidden-hidden weight is (P x 4D),   where P is the projection "
-"size and D the hidden   size. - Projection weight = {:math:`W_{rh}`}. - "
-"The shape of projection weight is (D x P)."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:60
-msgid ""
-"The parameter attribute for the learnable hidden-hidden weight and "
-"projection weight."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:63
-msgid ""
-"Hidden-hidden weight = {:math:`W_{ch}, W_{ih},"
-"                                                 W_{fh}, W_{oh}`}."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:64
-msgid ""
-"The shape of hidden-hidden weight is (P x 4D), where P is the projection "
-"size and D the hidden size."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:67
-msgid "Projection weight = {:math:`W_{rh}`}."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:68
-msgid "The shape of projection weight is (D x P)."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:99
-msgid ""
-"The activation for projection output. Choices = [\"sigmoid\", \"tanh\","
-"     \"relu\", \"identity\"], default \"tanh\"."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:99
-msgid "The activation for projection output. Choices = [\"sigmoid\", \"tanh\","
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_lstmp:110
-msgid ""
-"The projection of hidden state, and cell state of LSTMP. The"
-"                shape of projection is (T x P), for the cell state which "
-"is                (T x D), and both LoD is the same with the `input`."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:405
-msgid "dynamic_gru"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:1
-msgid "**Dynamic GRU Layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:3
-msgid ""
-"Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on "
-"Sequence Modeling <https://arxiv.org/abs/1412.3555>`_"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:18
-msgid ""
-"The :math:`\\odot` is the element-wise product of the vectors. "
-":math:`act_g` is the update gate and reset gate activation function and "
-":math:`sigmoid` is usually used for it. :math:`act_c` is the activation "
-"function for candidate hidden state and :math:`tanh` is usually used for "
-"it."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:23
-msgid ""
-"Note that these :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` operations "
-"on the input :math:`x_{t}` are NOT included in this operator. Users can "
-"choose to use fully-connect layer before GRU layer."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:27
-msgid ""
-"The input of dynamic_gru layer, which supports variable-time length input"
-" sequence. The underlying tensor in this Variable is a matrix with shape "
-":math:`(T \\times 3D)`, where :math:`T` is the total time steps in this "
-"mini-batch, :math:`D` is the hidden size."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:33
-msgid "The dimension of the gru cell."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:35
-msgid ""
-"The parameter attribute for the learnable hidden-hidden weight matrix. "
-"Note:  - The shape of the weight matrix is :math:`(T \\times 3D)`, where"
-"   :math:`D` is the hidden size. - All elements in the weight matrix can "
-"be divided into two parts.   The first part are weights of the update "
-"gate and reset gate with   shape :math:`(D \\times 2D)`, and the second "
-"part are weights for   candidate hidden state with shape :math:`(D "
-"\\times D)`."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:35
-msgid ""
-"The parameter attribute for the learnable hidden-hidden weight matrix. "
-"Note:"
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:38
-msgid ""
-"The shape of the weight matrix is :math:`(T \\times 3D)`, where :math:`D`"
-" is the hidden size."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:40
-msgid ""
-"All elements in the weight matrix can be divided into two parts. The "
-"first part are weights of the update gate and reset gate with shape "
-":math:`(D \\times 2D)`, and the second part are weights for candidate "
-"hidden state with shape :math:`(D \\times D)`."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:45
-msgid "The parameter attribute for learnable the hidden-hidden bias."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:48
-msgid "Whether to compute reversed GRU, default :attr:`False`."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:51
-msgid ""
-"The activation for update gate and reset gate. Choices = [\"sigmoid\", "
-"\"tanh\", \"relu\", \"identity\"], default \"sigmoid\"."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:54
-msgid ""
-"The activation for candidate hidden state. Choices = [\"sigmoid\", "
-"\"tanh\", \"relu\", \"identity\"], default \"tanh\"."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:57
-msgid "The hidden output of the first time step."
-msgstr ""
-
-#: of paddle.fluid.layers.dynamic_gru:60
-msgid ""
-"The hidden state of GRU. The shape is :math:`(T \\times D)`,             "
-"and lod is the same with the input."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:413
-msgid "gru_unit"
-msgstr ""
-
-#: of paddle.fluid.layers.gru_unit:1
-msgid "GRU unit layer. The equation of a gru step is:"
-msgstr ""
-
-#: of paddle.fluid.layers.gru_unit:12
-msgid ""
-"The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms of"
-" the equation above, the :math:`z_t` is split into 3 parts - "
-":math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to "
-"implement a full GRU unit operator for an input, a fully connected layer "
-"has to be applied, such that :math:`z_t = W_{fc}x_t`."
-msgstr ""
-
-#: of paddle.fluid.layers.gru_unit:18
-msgid ""
-"The terms :math:`u_t` and :math:`r_t` represent the update and reset "
-"gates of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, "
-"there is an intermediate candidate hidden output, which is denoted by "
-":math:`m_t`. This layer has three outputs :math:`h_t`, :math:`dot(r_t, "
-"h_{t-1})` and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`."
-msgstr ""
-
-#: of paddle.fluid.layers.gru_unit:24
-msgid "The fc transformed input value of current step."
-msgstr ""
-
-#: of paddle.fluid.layers.gru_unit:26
-msgid "The hidden value of lstm unit from previous step."
-msgstr ""
-
-#: of paddle.fluid.layers.gru_unit:28
-msgid "The input dimension value."
-msgstr ""
-
-#: of paddle.fluid.layers.gru_unit:30
-msgid "The weight parameters for gru unit. Default: None"
-msgstr ""
-
-#: of paddle.fluid.layers.gru_unit:32
-msgid "The bias parameters for gru unit. Default: None"
-msgstr ""
-
-#: of paddle.fluid.layers.gru_unit:34
-msgid "The activation type for cell (actNode). Default: 'tanh'"
-msgstr ""
-
-#: of paddle.fluid.layers.gru_unit:37
-msgid "The activation type for gates (actGate). Default: 'sigmoid'"
-msgstr ""
-
-#: of paddle.fluid.layers.gru_unit:41
-msgid "The hidden value, reset-hidden value and gate values."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:421
-msgid "linear_chain_crf"
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:1
-msgid "Linear Chain CRF."
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:3
-msgid ""
-"Conditional Random Field defines an undirected probabilistic graph with "
-"nodes denoting random variables and edges denoting dependencies between "
-"these variables. CRF learns the conditional probability :math:`P(Y|X)`, "
-"where :math:`X = (x_1, x_2, ... , x_n)` are structured inputs and "
-":math:`Y = (y_1, y_2, ... , y_n)` are labels for the inputs."
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:5
-msgid ""
-"Linear chain CRF is a special case of CRF that is useful for sequence "
-"labeling task. Sequence labeling tasks do not assume a lot of conditional"
-" independences among inputs. The only constraint they impose is that the "
-"input and output must be linear sequences. Thus, the graph of such a CRF "
-"is a simple chain or a line, which results in the linear chain CRF."
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:7
-msgid ""
-"This operator implements the Forward-Backward algorithm for the linear "
-"chain CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf "
-"and http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for "
-"details."
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:9
-msgid ""
-"Equation: 1. Denote Input(Emission) to this operator as :math:`x` here. "
-"2. The first D values of Input(Transition) to this operator are for "
-"starting weights, denoted as :math:`a` here. 3. The next D values of "
-"Input(Transition) of this operator are for ending weights, denoted as "
-":math:`b` here. 4. The remaning values of Input(Transition) are for "
-"transition weights, denoted as :math:`w` here. 5. Denote Input(Label) as "
-":math:`s` here."
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:11
-msgid ""
-"The probability of a sequence :math:`s` of length :math:`L` is defined "
-"as: $$P(s) = (1/Z) \\exp(a_{s_1} + b_{s_L} + \\sum_{l=1}^L x_{s_l} + "
-"\\sum_{l=2}^L w_{s_{l-1},s_l})$$"
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:13
-msgid ""
-"where :math:`Z` is a normalization value so that the sum of :math:`P(s)` "
-"over all possible sequences is 1, and :math:`x` is the emission feature "
-"weight to the linear chain CRF."
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:15
-msgid ""
-"Finally, the linear chain CRF operator outputs the logarithm of the "
-"conditional likelihood of each training sample in a mini-batch."
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:17
-msgid ""
-"NOTE: 1. The feature function for a CRF is made up of the emission "
-"features and the transition features. The emission feature weights are "
-"NOT computed in this operator. They MUST be computed first before this "
-"operator is called."
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:19
-msgid ""
-"Because this operator performs global normalization over all possible "
-"sequences internally, it expects UNSCALED emission feature weights. "
-"Please do not call this op with the emission feature being output of any "
-"nonlinear activation."
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:21
-msgid "The 2nd dimension of Input(Emission) MUST be equal to the tag number."
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:27
-msgid ""
-"(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor with shape [N x D],"
-" where N is the size of the mini-batch and D is the total tag number. The"
-" unscaled emission weight matrix for the linear chain CRF."
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:29
-msgid ""
-"(LoDTensor, default LoDTensor<int64_t>) A LoDTensor with shape [N x 1], "
-"where N is the total element number in a mini-batch. The ground truth"
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:31
-msgid "The attribute of the learnable parameter."
-msgstr ""
-
-#: of paddle.fluid.layers.linear_chain_crf:34
-msgid ""
-"S is equal to the sequence number in a mini-batch. The output is no "
-"longer a LoDTensor"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:429
-msgid "crf_decoding"
-msgstr ""
-
-#: of paddle.fluid.layers.crf_decoding:1
-msgid ""
-"The crf_decoding operator reads the emission feature weights and the "
-"transition feature weights learned by the linear_chain_crf operator. It "
-"implements the Viterbi algorithm which is a dynamic programming algorithm"
-" for finding the most likely sequence of hidden states, called the "
-"Viterbi path, that results in a sequence of observed tags."
-msgstr ""
-
-#: of paddle.fluid.layers.crf_decoding:3
-msgid ""
-"The output of this operator changes according to whether Input(Label) is "
-"given:"
-msgstr ""
-
-#: of paddle.fluid.layers.crf_decoding:5
-msgid "Input(Label) is given:"
-msgstr ""
-
-#: of paddle.fluid.layers.crf_decoding:7
-msgid ""
-"This happens in training. This operator is used to co-work with the "
-"chunk_eval operator."
-msgstr ""
-
-#: of paddle.fluid.layers.crf_decoding:9
-msgid ""
-"When Input(Label) is given, the crf_decoding operator returns a row "
-"vector with shape [N x 1] whose values are fixed to be 0, indicating an "
-"incorrect prediction, or 1 indicating a tag is correctly predicted. Such "
-"an output is the input to chunk_eval operator."
-msgstr ""
-
-#: of paddle.fluid.layers.crf_decoding:11
-msgid "Input(Label) is not given:"
-msgstr ""
-
-#: of paddle.fluid.layers.crf_decoding:13
-msgid "This is the standard decoding process."
-msgstr ""
-
-#: of paddle.fluid.layers.crf_decoding:15
-msgid ""
-"The crf_decoding operator returns a row vector with shape [N x 1] whose "
-"values range from 0 to maximum tag number - 1. Each element indicates an "
-"index of a predicted tag."
-msgstr ""
-
-#: of paddle.fluid.layers.crf_decoding:19
-msgid ""
-"(LoDTensor, default: LoDTensor<float>). A LoDTensor with shape [N x D] "
-"where N is the size of the mini-batch and D is the total tag number. This"
-" input is the unscaled emission weight matrix of the linear_chain_crf "
-"operator"
-msgstr ""
-
-#: of paddle.fluid.layers.crf_decoding:21
-msgid "The parameter attribute for training."
-msgstr ""
-
-#: of paddle.fluid.layers.crf_decoding:23
-msgid ""
-"(LoDTensor,  LoDTensor<int64_t>). The ground truth with shape [N x 1]. "
-"This input is optional. See more details in the operator's comments"
-msgstr ""
-
-#: of paddle.fluid.layers.crf_decoding:26
-msgid ""
-"(LoDTensor, LoDTensor<int64_t>). The decoding results. What to return "
-"changes depending on whether the Input(Label) (the ground truth) is "
-"given. See more details in the operator's comment"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:437
-msgid "cos_sim"
-msgstr ""
-
-#: of paddle.fluid.layers.cos_sim:1
-msgid ""
-"This function performs the cosine similarity between two tensors X and Y "
-"and returns that as the output."
-msgstr ""
-
-#: of paddle.fluid.layers.cos_sim:4
-msgid "The input X."
-msgstr ""
-
-#: of paddle.fluid.layers.cos_sim:6
-msgid "The input Y."
-msgstr ""
-
-#: of paddle.fluid.layers.cos_sim:9
-msgid "the output of cosine(X, Y)."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:445
-msgid "cross_entropy"
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:1
-msgid "**Cross Entropy Layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:3
-msgid ""
-"This layer computes the cross entropy between `input` and `label`. It "
-"supports both standard cross-entropy and soft-label cross-entropy loss "
-"computation."
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:12
-msgid "One-hot cross-entropy:"
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:8
-msgid ""
-"`soft_label = False`, `Label[i, 0]` indicates the class index for sample "
-"i:"
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:20
-msgid "Soft-label cross-entropy:"
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:15
-msgid ""
-"`soft_label = True`, `Label[i, j]` indicates the soft label of class j "
-"for sample i:"
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:22
-msgid ""
-"Please make sure that in this case the summation of each row of `label` "
-"equals one."
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:28
-msgid "One-hot cross-entropy with vecterized `label`:"
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:26
-msgid ""
-"As a special case of 2), when each row of 'label' has only one non-zero "
-"element which is equal to 1, soft-label cross-entropy degenerates to a "
-"one-hot cross-entropy with one-hot label representation."
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:30
-msgid ""
-"a 2-D tensor with shape [N x D], where N is the batch size and D is the "
-"number of classes. This input is a probability computed by the previous "
-"operator, which is almost always the result of a softmax operator."
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:36
-msgid ""
-"the ground truth which is a 2-D tensor. When `soft_label` is set to "
-"`False`, `label` is a tensor<int64> with shape [N x 1]. When `soft_label`"
-" is set to `True`, `label` is a tensor<float/double> with shape [N x D]."
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:42
-msgid ""
-"a flag indicating whether to interpretate the given labels as soft "
-"labels, default `False`."
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:47
-msgid "A 2-D tensor with shape [N x 1], the cross entropy loss."
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:49
-msgid ""
-"`ValueError` -- 1) the 1st dimension of `input` and `label` are not "
-"equal. 2) when `soft_label == True`, and the 2nd dimension of"
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:51
-msgid "`input` and `label` are not equal."
-msgstr ""
-
-#: of paddle.fluid.layers.cross_entropy:52
-msgid "when `soft_label == False`, and the 2nd dimension of `label` is not 1."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:453
-msgid "square_error_cost"
-msgstr ""
-
-#: of paddle.fluid.layers.square_error_cost:1
-msgid "**Square error cost layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.square_error_cost:3
-msgid ""
-"This layer accepts input predictions and target label and returns the "
-"squared error cost."
-msgstr ""
-
-#: of paddle.fluid.layers.square_error_cost:6
-msgid "For predictions, :math:`X`, and target labels, :math:`Y`, the equation is:"
-msgstr ""
-
-#: of paddle.fluid.layers.square_error_cost:14
-msgid ":math:`X`: Input predictions, a tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.square_error_cost:15
-msgid ":math:`Y`: Input labels, a tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.square_error_cost:16
-msgid ":math:`Out`: Output value, same shape with :math:`X`."
-msgstr ""
-
-#: of paddle.fluid.layers.square_error_cost:18
-msgid "Input tensor, has predictions."
-msgstr ""
-
-#: of paddle.fluid.layers.square_error_cost:20
-msgid "Label tensor, has target labels."
-msgstr ""
-
-#: of paddle.fluid.layers.square_error_cost:23
-msgid ""
-"The tensor variable storing the element-wise squared error"
-"                   difference of input and label."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:461
-msgid "chunk_eval"
-msgstr ""
-
-#: of paddle.fluid.layers.chunk_eval:1
-msgid ""
-"This function computes and outputs the precision, recall and F1-score of "
-"chunk detection."
-msgstr ""
-
-#: of paddle.fluid.layers.chunk_eval:4
-msgid "prediction output of the network."
-msgstr ""
-
-#: of paddle.fluid.layers.chunk_eval:6
-msgid "label of the test data set."
-msgstr ""
-
-#: of paddle.fluid.layers.chunk_eval:8
-msgid ""
-"(string, default IOB). The labeling scheme indicating how to encode the "
-"chunks. Must be IOB, IOE, IOBES or plain. See below for details"
-msgstr ""
-
-#: of paddle.fluid.layers.chunk_eval:10
-msgid "(int). The number of chunk type. See below for details"
-msgstr ""
-
-#: of paddle.fluid.layers.chunk_eval:12
-msgid ""
-"(list<int>) A list including chunk type ids indicating chunk types that "
-"are not counted. See below for details"
-msgstr ""
-
-#: of paddle.fluid.layers.chunk_eval:15
-msgid ""
-"tuple containing: (precision, recall, f1_score,        num_infer_chunks, "
-"num_label_chunks,        num_correct_chunks)"
-msgstr ""
-
-#: of paddle.fluid.layers.chunk_eval:18
-msgid "tuple containing: (precision, recall, f1_score,"
-msgstr ""
-
-#: of paddle.fluid.layers.chunk_eval:18
-msgid "num_infer_chunks, num_label_chunks, num_correct_chunks)"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:469
-msgid "sequence_conv"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_conv:1
-msgid ""
-"This function creates the op for sequence_conv, using the inputs and "
-"other convolutional configurations for the filters and stride as given in"
-" the input parameters to the function."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_conv:5
-msgid ""
-"(LoDTensor) the input(X) is a LodTensor, which supports variable-time "
-"length input sequence. The underlying tensor in this LoDTensor is a "
-"matrix with shape (T, N), where T is the total time steps in this mini-"
-"batch and N is the input_hidden_size"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_conv:7
-msgid "number of filters."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_conv:9
-msgid "the filter size (H and W)."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_conv:11
-msgid "stride of the filter."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_conv:13
-msgid "if True, add paddings."
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:16 paddle.fluid.layers.nce:15
-#: paddle.fluid.layers.sequence_conv:15
-msgid "attributes for bias"
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:14 paddle.fluid.layers.nce:13
-#: paddle.fluid.layers.sequence_conv:17
-msgid "attributes for parameter"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_conv:19
-msgid "the activation type"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_conv:22
-msgid "output of sequence_conv"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:477
-msgid "conv2d"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:1
-msgid ""
-"The convolution2D layer calculates the output based on the input, filter "
-"and strides, paddings, dilations, groups parameters. Input and Output are"
-" in NCHW format, where N is batch size, C is the number of channels, H is"
-" the height of the feature, and W is the width of the feature. Filter is "
-"in MCHW format, where M is the number of output image channels, C is the "
-"number of input image channels, H is the height of the filter, and W is "
-"the width of the filter. If the groups is greater than 1, C will equal "
-"the number of input image channels divided by the groups. Please refer to"
-" UFLDL's `convolution "
-"<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_"
-" for more detials. If bias attribution and activation type are provided, "
-"bias is added to the output of the convolution, and the corresponding "
-"activation function is applied to the final result."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:16 paddle.fluid.layers.conv2d_transpose:12
-#: paddle.fluid.layers.conv3d:12 paddle.fluid.layers.conv3d_transpose:12
-msgid "For each input :math:`X`, the equation is:"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:22
-msgid "Where:"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:24 paddle.fluid.layers.conv2d_transpose:20
-msgid ":math:`X`: Input value, a tensor with NCHW format."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:25 paddle.fluid.layers.conv2d_transpose:21
-msgid ":math:`W`: Filter value, a tensor with MCHW format."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:26 paddle.fluid.layers.conv3d:22
-msgid ":math:`\\ast`: Convolution operation."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:27 paddle.fluid.layers.conv3d:23
-msgid ":math:`b`: Bias value, a 2-D tensor with shape [M, 1]."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:28 paddle.fluid.layers.conv3d:24
-msgid ":math:`\\sigma`: Activation function."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:29
-msgid ""
-":math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be "
-"different."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:31 paddle.fluid.layers.conv2d_transpose:26
-#: paddle.fluid.layers.conv3d:28 paddle.fluid.layers.conv3d_transpose:26
-msgid "Example"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:33 paddle.fluid.layers.conv2d_transpose:28
-#: paddle.fluid.layers.conv3d:30 paddle.fluid.layers.conv3d_transpose:28
-msgid "Input:"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:35
-msgid "Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:37
-msgid "Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:39 paddle.fluid.layers.conv2d_transpose:34
-#: paddle.fluid.layers.conv3d_transpose:34
-msgid "Output:"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:41
-msgid "Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:43 paddle.fluid.layers.conv2d_transpose:38
-#: paddle.fluid.layers.conv3d:39 paddle.fluid.layers.conv3d_transpose:38
-msgid "Where"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:50 paddle.fluid.layers.conv2d_transpose:45
-msgid "The input image with [N, C, H, W] format."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:52
-msgid "The number of filter. It is as same as the output image channel."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:55 paddle.fluid.layers.im2sequence:17
-msgid ""
-"The filter size. If filter_size is a tuple, it must contain two integers,"
-" (filter_size_H, filter_size_W). Otherwise, the filter will be a square."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:59 paddle.fluid.layers.conv2d_transpose:63
-#: paddle.fluid.layers.im2sequence:21
-msgid ""
-"The stride size. If stride is a tuple, it must contain two integers, "
-"(stride_H, stride_W). Otherwise, the stride_H = stride_W = stride. "
-"Default: stride = 1."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:63 paddle.fluid.layers.conv2d_transpose:59
-msgid ""
-"The padding size. If padding is a tuple, it must contain two integers, "
-"(padding_H, padding_W). Otherwise, the padding_H = padding_W = padding. "
-"Default: padding = 0."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:67 paddle.fluid.layers.conv2d_transpose:67
-msgid ""
-"The dilation size. If dilation is a tuple, it must contain two integers, "
-"(dilation_H, dilation_W). Otherwise, the dilation_H = dilation_W = "
-"dilation. Default: dilation = 1."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:71
-msgid ""
-"The groups number of the Conv2d Layer. According to grouped convolution "
-"in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the "
-"filters is only connected to the first half of the input channels, while "
-"the second half of the filters is only connected to the second half of "
-"the input channels. Default: groups=1"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:77
-msgid "The parameters to the Conv2d Layer. Default: None"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:79 paddle.fluid.layers.conv2d_transpose:81
-msgid "Bias parameter for the Conv2d layer. Default: None"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:81 paddle.fluid.layers.conv2d_transpose:83
-#: paddle.fluid.layers.conv3d:77 paddle.fluid.layers.conv3d_transpose:84
-msgid ""
-"Use cudnn kernel or not, it is valid only when the cudnn library is "
-"installed. Default: True"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:84
-msgid ""
-"Use mkldnn kernels or not, it is valid only when compiled with mkldnn "
-"library. Default: False"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:87 paddle.fluid.layers.conv2d_transpose:86
-#: paddle.fluid.layers.conv3d:82 paddle.fluid.layers.conv3d_transpose:87
-msgid "Activation type. Default: None"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:93 paddle.fluid.layers.conv3d:88
-msgid ""
-"The tensor variable storing the convolution and                   non-"
-"linearity activation result."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d:96 paddle.fluid.layers.conv2d_transpose:95
-#: paddle.fluid.layers.conv3d:91 paddle.fluid.layers.conv3d_transpose:96
-msgid ""
-":exc:`ValueError` -- If the shapes of input, filter_size, stride, padding"
-" and groups mismatch."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:485
-msgid "conv3d"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:1
-msgid "**Convlution3D Layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:3
-msgid ""
-"The convolution3D layer calculates the output based on the input, filter "
-"and strides, paddings, dilations, groups parameters. Input(Input) and "
-"Output(Output) are in NCDHW format. Where N is batch size C is the number"
-" of channels, D is the depth of the feature, H is the height of the "
-"feature, and W is the width of the feature. Convlution3D is similar with "
-"Convlution2D but adds one dimension(depth). If bias attribution and "
-"activation type are provided, bias is added to the output of the "
-"convolution, and the corresponding activation function is applied to the "
-"final result."
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:20 paddle.fluid.layers.conv3d_transpose:20
-msgid ":math:`X`: Input value, a tensor with NCDHW format."
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:21 paddle.fluid.layers.conv3d_transpose:21
-msgid ":math:`W`: Filter value, a tensor with MCDHW format."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:24 paddle.fluid.layers.conv3d:26
-#: paddle.fluid.layers.conv3d_transpose:24
-msgid ":math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:24 paddle.fluid.layers.conv3d:26
-#: paddle.fluid.layers.conv3d_transpose:24
-msgid "different."
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:32
-msgid "Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:34
-msgid "Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:36
-msgid "Output: Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:47
-msgid ""
-"The input image with [N, C, D, H, W] format. num_filters(int): The number"
-" of filter. It is as same as the output image channel."
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:51
-msgid ""
-"The filter size. If filter_size is a tuple, it must contain three "
-"integers, (filter_size_D, filter_size_H, filter_size_W). Otherwise, the "
-"filter will be a square."
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:55 paddle.fluid.layers.conv3d_transpose:64
-msgid ""
-"The stride size. If stride is a tuple, it must contain three integers, "
-"(stride_D, stride_H, stride_W). Otherwise, the stride_D = stride_H = "
-"stride_W = stride. Default: stride = 1."
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:59 paddle.fluid.layers.conv3d_transpose:60
-msgid ""
-"The padding size. If padding is a tuple, it must contain three integers, "
-"(padding_D, padding_H, padding_W). Otherwise, the padding_D = padding_H ="
-" padding_W = padding. Default: padding = 0."
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:63 paddle.fluid.layers.conv3d_transpose:68
-msgid ""
-"The dilation size. If dilation is a tuple, it must contain three "
-"integers, (dilation_D, dilation_H, dilation_W). Otherwise, the dilation_D"
-" = dilation_H = dilation_W = dilation. Default: dilation = 1."
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:67
-msgid ""
-"The groups number of the Conv3d Layer. According to grouped convolution "
-"in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the "
-"filters is only connected to the first half of the input channels, while "
-"the second half of the filters is only connected to the second half of "
-"the input channels. Default: groups=1"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:73
-msgid "The parameters to the Conv3d Layer. Default: None"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:75 paddle.fluid.layers.conv3d_transpose:82
-msgid "Bias parameter for the Conv3d layer. Default: None"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d:80
-msgid "Use mkldnn kernels or not."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:493
-msgid "sequence_pool"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_pool:1
-msgid ""
-"This function add the operator for sequence pooling. It pools features of"
-" all time-steps of each instance, and is applied on top of the input "
-"using pool_type mentioned in the parameters."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_pool:5
-msgid "It supports four pool_type:"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_pool:7
-msgid "average: :math:`Out[i] = \\frac{\\sum_i X_i}{N}`"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_pool:8
-msgid "sum:     :math:`Out[i] = \\sum_jX_{ij}`"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_pool:9
-msgid "sqrt:    :math:`Out[i] = \\frac{\\sum_jX_{ij}}{\\sqrt{len(X_i)}}`"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_pool:10
-msgid "max:     :math:`Out[i] = max(X_i)`"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_expand:41
-#: paddle.fluid.layers.sequence_first_step:15
-#: paddle.fluid.layers.sequence_last_step:15
-#: paddle.fluid.layers.sequence_pool:32
-msgid "The input variable which is a LoDTensor."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_pool:34
-msgid "The pooling type of sequence_pool. It supports average, sum, sqrt and max."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_pool:38
-msgid "The sequence pooling variable which is a Tensor."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:501
-msgid "sequence_softmax"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:509
-msgid "softmax"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:517
-msgid "pool2d"
-msgstr ""
-
-#: of paddle.fluid.layers.pool2d:1
-msgid ""
-"This function adds the operator for pooling in 2 dimensions, using the "
-"pooling configurations mentioned in input parameters."
-msgstr ""
-
-#: of paddle.fluid.layers.pool2d:4 paddle.fluid.layers.pool3d:4
-msgid "${input_comment}"
-msgstr ""
-
-#: of paddle.fluid.layers.pool2d:6 paddle.fluid.layers.pool3d:6
-msgid "${ksize_comment}"
-msgstr ""
-
-#: of paddle.fluid.layers.pool2d:8 paddle.fluid.layers.pool3d:8
-msgid "${pooling_type_comment}"
-msgstr ""
-
-#: of paddle.fluid.layers.pool2d:10 paddle.fluid.layers.pool3d:10
-msgid "stride of the pooling layer."
-msgstr ""
-
-#: of paddle.fluid.layers.pool2d:12 paddle.fluid.layers.pool3d:12
-msgid "padding size."
-msgstr ""
-
-#: of paddle.fluid.layers.pool2d:14 paddle.fluid.layers.pool3d:14
-msgid "${global_pooling_comment}"
-msgstr ""
-
-#: of paddle.fluid.layers.pool2d:16 paddle.fluid.layers.pool3d:16
-msgid "${use_cudnn_comment}"
-msgstr ""
-
-#: of paddle.fluid.layers.pool2d:18 paddle.fluid.layers.pool3d:18
-msgid "${ceil_mode_comment}"
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:22 paddle.fluid.layers.pool2d:20
-#: paddle.fluid.layers.pool3d:20
-msgid "${use_mkldnn_comment}"
-msgstr ""
-
-#: of paddle.fluid.layers.pool2d:26
-msgid "output of pool2d layer."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:525
-msgid "pool3d"
-msgstr ""
-
-#: of paddle.fluid.layers.pool3d:1
-msgid ""
-"This function adds the operator for pooling in 3-dimensions, using the "
-"pooling configurations mentioned in input parameters."
-msgstr ""
-
-#: of paddle.fluid.layers.pool3d:26
-msgid "output of pool3d layer."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:533
-msgid "batch_norm"
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:1
-msgid ""
-"This function helps create an operator to implement the BatchNorm layer "
-"using the configurations from the input parameters."
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:4
-msgid "the input variable."
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:6
-msgid "activation type"
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:8
-msgid "whether to run batch_norm as test mode."
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:10
-msgid "momentum"
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:12
-msgid "epsilon, default 1e-05"
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:18
-msgid "data layout, default NCHW"
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:20
-msgid "if True, do not create tmp variable"
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:24
-#: paddle.fluid.layers.beam_search_decode:7
-#: paddle.fluid.layers.ctc_greedy_decoder:44
-#: paddle.fluid.layers.edit_distance:32 paddle.fluid.layers.im2sequence:34
-#: paddle.fluid.layers.layer_norm:48 paddle.fluid.layers.reshape:55
-#: paddle.fluid.layers.transpose:12
-msgid "The name of this layer. It is optional."
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:26
-msgid "The name of moving mean variable name, optional."
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:28
-msgid "The name of moving variance name, optional."
-msgstr ""
-
-#: of paddle.fluid.layers.batch_norm:33
-msgid "output of batch_norm layer."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:541
-msgid "beam_search_decode"
-msgstr ""
-
-#: of paddle.fluid.layers.beam_search_decode:1
-msgid "${beam_search_decode}"
-msgstr ""
-
-#: of paddle.fluid.layers.beam_search:5
-#: paddle.fluid.layers.beam_search_decode:3
-msgid "${ids_comment}"
-msgstr ""
-
-#: of paddle.fluid.layers.beam_search:7
-#: paddle.fluid.layers.beam_search_decode:5
-msgid "${scores_comment}"
-msgstr ""
-
-#: of paddle.fluid.layers.beam_search_decode:10
-msgid "a tuple of two output variable: sentence_ids, sentence_scores"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:549
-msgid "conv2d_transpose"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:1
-msgid "**Convlution2D transpose layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:3
-msgid ""
-"The convolution2D transpose layer calculates the output based on the "
-"input, filter, and dilations, strides, paddings. Input(Input) and "
-"output(Output) are in NCHW format. Where N is batch size, C is the number"
-" of channels, H is the height of the feature, and W is the width of the "
-"feature. Parameters(dilations, strides, paddings) are two elements. These"
-" two elements represent height and width, respectively. The details of "
-"convolution transpose layer, please refer to the following explanation "
-"and references `therein <http://www.matthewzeiler.com/wp-"
-"content/uploads/2017/07/cvpr2010.pdf>`_."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:22
-#: paddle.fluid.layers.conv3d_transpose:22
-msgid ":math:`\\ast` : Convolution transpose operation."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:30
-msgid "Input shape: $(N, C_{in}, H_{in}, W_{in})$"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:32
-msgid "Filter shape: $(C_{in}, C_{out}, H_f, W_f)$"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:36
-msgid "Output shape: $(N, C_{out}, H_{out}, W_{out})$"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:47
-#: paddle.fluid.layers.conv3d_transpose:48
-msgid "The number of the filter. It is as same as the output image channel."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:50
-msgid ""
-"The output image size. If output size is a tuple, it must contain two "
-"integers, (image_H, image_W). This parameter only works when filter_size "
-"is None."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:54
-msgid ""
-"The filter size. If filter_size is a tuple, it must contain two integers,"
-" (filter_size_H, filter_size_W). Otherwise, the filter will be a square. "
-"None if use output size to calculate filter_size."
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:71
-msgid ""
-"The groups number of the Conv2d transpose layer. Inspired by grouped "
-"convolution in Alex Krizhevsky's Deep CNN paper, in which when group=2, "
-"the first half of the filters is only connected to the first half of the "
-"input channels, while the second half of the filters is only connected to"
-" the second half of the input channels. Default: groups=1"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:78
-msgid "The parameters to the Conv2d_transpose Layer. Default: None"
-msgstr ""
-
-#: of paddle.fluid.layers.conv2d_transpose:92
-#: paddle.fluid.layers.conv3d_transpose:93
-msgid "The tensor variable storing the convolution transpose result."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:557
-msgid "conv3d_transpose"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d_transpose:1
-msgid "**Convlution3D transpose layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d_transpose:3
-msgid ""
-"The convolution3D transpose layer calculates the output based on the "
-"input, filter, and dilations, strides, paddings. Input(Input) and "
-"output(Output) are in NCDHW format. Where N is batch size, C is the "
-"number of channels, D is the depth of the feature, H is the height of the"
-" feature, and W is the width of the feature. Parameters(dilations, "
-"strides, paddings) are two elements. These two elements represent height "
-"and width, respectively. The details of convolution transpose layer, "
-"please refer to the following explanation and references `therein "
-"<http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_."
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d_transpose:30
-msgid "Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d_transpose:32
-msgid "Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d_transpose:36
-msgid "Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d_transpose:46
-msgid "The input image with [N, C, D, H, W] format."
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d_transpose:51
-msgid ""
-"The output image size. If output size is a tuple, it must contain three "
-"integers, (image_D, image_H, image_W). This parameter only works when "
-"filter_size is None."
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d_transpose:55
-msgid ""
-"The filter size. If filter_size is a tuple, it must contain three "
-"integers, (filter_size_D, filter_size_H, filter_size_W). Otherwise, the "
-"filter will be a square. None if use output size to calculate "
-"filter_size."
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d_transpose:72
-msgid ""
-"The groups number of the Conv3d transpose layer. Inspired by grouped "
-"convolution in Alex Krizhevsky's Deep CNN paper, in which when group=2, "
-"the first half of the filters is only connected to the first half of the "
-"input channels, while the second half of the filters is only connected to"
-" the second half of the input channels. Default: groups=1"
-msgstr ""
-
-#: of paddle.fluid.layers.conv3d_transpose:79
-msgid "The parameters to the Conv3d_transpose Layer. Default: None"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:565
-msgid "sequence_expand"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_expand:1
-msgid ""
-"Sequence Expand Layer. This layer will expand the input variable **x** "
-"according to specified level lod of **y**. Please note that lod level of "
-"**x** is at most 1 and rank of **x** is at least 2. When rank of **x** is"
-" greater than 2, then it would be viewed as a 2-D tensor. Following "
-"examples will explain how sequence_expand works:"
-msgstr ""
-
-#: of paddle.fluid.layers.matmul:28 paddle.fluid.layers.matmul:30
-#: paddle.fluid.layers.reduce_max:3 paddle.fluid.layers.reduce_mean:3
-#: paddle.fluid.layers.reduce_min:3 paddle.fluid.layers.reduce_prod:3
-#: paddle.fluid.layers.reduce_sum:3 paddle.fluid.layers.sequence_expand:39
-#: paddle.fluid.layers.split:3
-msgid "The input variable which is a Tensor or LoDTensor."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_expand:43
-msgid ""
-"Lod level of `y` to be referred by `x`. If set to -1, refer the last "
-"level of lod."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_expand:50
-msgid "The expanded variable which is a LoDTensor."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:573
-msgid "lstm_unit"
-msgstr ""
-
-#: of paddle.fluid.layers.lstm_unit:1
-msgid "Lstm unit layer. The equation of a lstm step is:"
-msgstr ""
-
-#: of paddle.fluid.layers.lstm_unit:15
-msgid ""
-"The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and "
-":math:`c_{t-1}`. The 2nd dimensions of :math:`h_{t-1}` and "
-":math:`c_{t-1}` should be same. The implementation separates the linear "
-"transformation and non-linear transformation apart. Here, we take "
-":math:`i_t` as an example. The linear transformation is applied by "
-"calling a `fc` layer and the equation is:"
-msgstr ""
-
-#: of paddle.fluid.layers.lstm_unit:26
-msgid ""
-"The non-linear transformation is applied by calling `lstm_unit_op` and "
-"the equation is:"
-msgstr ""
-
-#: of paddle.fluid.layers.lstm_unit:33
-msgid "This layer has two outputs including :math:`h_t` and :math:`o_t`."
-msgstr ""
-
-#: of paddle.fluid.layers.lstm_unit:35
-msgid ""
-"The input value of current step, a 2-D tensor with shape M x N, M for "
-"batch size and N for input size."
-msgstr ""
-
-#: of paddle.fluid.layers.lstm_unit:38
-msgid ""
-"The hidden value of lstm unit, a 2-D tensor with shape M x S, M for batch"
-" size and S for size of lstm unit."
-msgstr ""
-
-#: of paddle.fluid.layers.lstm_unit:41
-msgid ""
-"The cell value of lstm unit, a 2-D tensor with shape M x S, M for batch "
-"size and S for size of lstm unit."
-msgstr ""
-
-#: of paddle.fluid.layers.lstm_unit:44
-msgid "The forget bias of lstm unit."
-msgstr ""
-
-#: of paddle.fluid.layers.lstm_unit:46
-msgid "The attributes of parameter weights, used to set initializer, name etc."
-msgstr ""
-
-#: of paddle.fluid.layers.lstm_unit:49
-msgid ""
-"The attributes of bias weights, if not False, bias weights will be "
-"created and be set to default value."
-msgstr ""
-
-#: of paddle.fluid.layers.lstm_unit:56
-msgid "The hidden value and cell value of lstm unit."
-msgstr ""
-
-#: of paddle.fluid.layers.lstm_unit:59
-msgid ""
-":exc:`ValueError` -- The ranks of **x_t**, **hidden_t_prev** and "
-"**cell_t_prev** not be 2 or the 1st dimensions of **x_t**, "
-"**hidden_t_prev** and **cell_t_prev** not be the same or the 2nd "
-"dimensions of **hidden_t_prev** and **cell_t_prev** not be the same."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:581
-msgid "reduce_sum"
-msgstr ""
-
-#: of paddle.fluid.layers.reduce_sum:1
-msgid "Computes the sum of tensor elements over the given dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.reduce_sum:5
-msgid ""
-"The dimensions along which the sum is performed. If :attr:`None`, sum all"
-" elements of :attr:`input` and return a Tensor variable with a single "
-"element, otherwise must be in the range :math:`[-rank(input), "
-"rank(input))`. If :math:`dim[i] < 0`, the dimension to reduce is "
-":math:`rank + dim[i]`."
-msgstr ""
-
-#: of paddle.fluid.layers.reduce_max:11 paddle.fluid.layers.reduce_mean:11
-#: paddle.fluid.layers.reduce_min:11 paddle.fluid.layers.reduce_prod:11
-#: paddle.fluid.layers.reduce_sum:11
-msgid ""
-"Whether to reserve the reduced dimension in the output Tensor. The result"
-" tensor will have one fewer dimension than the :attr:`input` unless "
-":attr:`keep_dim` is true."
-msgstr ""
-
-#: of paddle.fluid.layers.reduce_max:19 paddle.fluid.layers.reduce_mean:19
-#: paddle.fluid.layers.reduce_min:19 paddle.fluid.layers.reduce_prod:19
-#: paddle.fluid.layers.reduce_sum:19
-msgid "The reduced Tensor variable."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:589
-msgid "reduce_mean"
-msgstr ""
-
-#: of paddle.fluid.layers.reduce_mean:1
-msgid "Computes the mean of tensor elements over the given dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.reduce_mean:5
-msgid ""
-"The dimensions along which the mean is computed. If :attr:`None`, compute"
-" the mean over all elements of :attr:`input` and return a Tensor variable"
-" with a single element, otherwise must be in the range "
-":math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`, the dimension"
-" to reduce is :math:`rank + dim[i]`."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:597
-msgid "reduce_max"
-msgstr ""
-
-#: of paddle.fluid.layers.reduce_max:1
-msgid "Computes the maximum of tensor elements over the given dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.reduce_max:5
-msgid ""
-"The dimension along which the maximum is computed. If :attr:`None`, "
-"compute the maximum over all elements of :attr:`input` and return a "
-"Tensor variable with a single element, otherwise must be in the range "
-":math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`, the dimension"
-" to reduce is :math:`rank + dim[i]`."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:605
-msgid "reduce_min"
-msgstr ""
-
-#: of paddle.fluid.layers.reduce_min:1
-msgid "Computes the minimum of tensor elements over the given dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.reduce_min:5
-msgid ""
-"The dimensions along which the minimum is computed. If :attr:`None`, "
-"compute the minimum over all elements of :attr:`input` and return a "
-"Tensor variable with a single element, otherwise must be in the range "
-":math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`, the dimension"
-" to reduce is :math:`rank + dim[i]`."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:613
-msgid "reduce_prod"
-msgstr ""
-
-#: of paddle.fluid.layers.reduce_prod:1
-msgid "Computes the product of tensor elements over the given dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.reduce_prod:5
-msgid ""
-"The dimensions along which the product is performed. If :attr:`None`, "
-"multipy all elements of :attr:`input` and return a Tensor variable with a"
-" single element, otherwise must be in the range :math:`[-rank(input), "
-"rank(input))`. If :math:`dim[i] < 0`, the dimension to reduce is "
-":math:`rank + dim[i]`."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:621
-msgid "sequence_first_step"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_first_step:1
-msgid "This function gets the first step of sequence."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_first_step:18
-msgid "The sequence's first step variable which is a Tensor."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:629
-msgid "sequence_last_step"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_last_step:1
-msgid "This function gets the last step of sequence."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_last_step:18
-msgid "The sequence's last step variable which is a Tensor."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:637
-msgid "dropout"
-msgstr ""
-
-#: of paddle.fluid.layers.dropout:1
-msgid "Computes dropout."
-msgstr ""
-
-#: of paddle.fluid.layers.dropout:3
-msgid ""
-"Drop or keep each element of `x` independently. Dropout is a "
-"regularization technique for reducing overfitting by preventing neuron "
-"co-adaption during training. The dropout operator randomly set (according"
-" to the given dropout probability) the outputs of some units to zero, "
-"while others are remain unchanged."
-msgstr ""
-
-#: of paddle.fluid.layers.dropout:9
-msgid ""
-"The input tensor. dropout_prob (float): Probability of setting units to "
-"zero."
-msgstr ""
-
-#: of paddle.fluid.layers.dropout:12
-msgid "A flag indicating whether it is in test phrase or not."
-msgstr ""
-
-#: of paddle.fluid.layers.dropout:14
-msgid ""
-"A Python integer used to create random seeds. If this parameter is set to"
-" None, a random seed is used. NOTE: If an integer seed is given, always "
-"the same output units will be dropped. DO NOT use a fixed seed in "
-"training."
-msgstr ""
-
-#: of paddle.fluid.layers.dropout:23
-msgid "A tensor variable."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:645
-msgid "split"
-msgstr ""
-
-#: of paddle.fluid.layers.split:1
-msgid "Split the input tensor into multiple sub-tensors."
-msgstr ""
-
-#: of paddle.fluid.layers.split:5
-msgid ""
-"If :attr:`num_or_sections` is an integer, then the integer indicates the "
-"number of equal sized sub-tensors that the tensor will be divided into. "
-"If :attr:`num_or_sections` is a list of integers, the length of list "
-"indicates the number of sub-tensors and the integers indicate the sizes "
-"of sub-tensors' :attr:`dim` dimension orderly."
-msgstr ""
-
-#: of paddle.fluid.layers.split:12
-msgid ""
-"The dimension along which to split. If :math:`dim < 0`, the dimension to "
-"split along is :math:`rank(input) + dim`."
-msgstr ""
-
-#: of paddle.fluid.layers.split:19
-msgid "The list of segmented tensor variables."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:653
-msgid "ctc_greedy_decoder"
-msgstr ""
-
-#: of paddle.fluid.layers.ctc_greedy_decoder:1
-msgid ""
-"This op is used to decode sequences by greedy policy by below steps: 1. "
-"Get the indexes of max value for each row in input. a.k.a."
-msgstr ""
-
-#: of paddle.fluid.layers.ctc_greedy_decoder:3
-msgid "numpy.argmax(input, axis=0)."
-msgstr ""
-
-#: of paddle.fluid.layers.ctc_greedy_decoder:4
-msgid ""
-"For each sequence in result of step1, merge repeated tokens between two "
-"blanks and delete all blanks."
-msgstr ""
-
-#: of paddle.fluid.layers.ctc_greedy_decoder:7
-msgid "A simple example as below:"
-msgstr ""
-
-#: of paddle.fluid.layers.ctc_greedy_decoder:33
-msgid ""
-"(LoDTensor<float>), the probabilities of variable-length sequences, which"
-" is a 2-D Tensor with LoD information. It's shape is [Lp, num_classes + "
-"1], where Lp is the sum of all input sequences' length and num_classes is"
-" the true number of classes. (not including the blank label)."
-msgstr ""
-
-#: of paddle.fluid.layers.ctc_greedy_decoder:40
-msgid ""
-"the blank label index of Connectionist Temporal Classification (CTC) "
-"loss, which is in thehalf-opened interval [0, num_classes + 1)."
-msgstr ""
-
-#: of paddle.fluid.layers.ctc_greedy_decoder:47
-msgid ""
-"CTC greedy decode result. If all the sequences in result were empty, the "
-"result LoDTensor will be [-1] with LoD [[0]] and dims [1, 1]."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:661
-msgid "edit_distance"
-msgstr ""
-
-#: of paddle.fluid.layers.edit_distance:1
-msgid ""
-"EditDistance operator computes the edit distances between a batch of "
-"hypothesis strings and their references. Edit distance, also called "
-"Levenshtein distance, measures how dissimilar two strings are by counting"
-" the minimum number of operations to transform one string into anthor. "
-"Here the operations include insertion, deletion, and substitution."
-msgstr ""
-
-#: of paddle.fluid.layers.edit_distance:7
-msgid ""
-"For example, given hypothesis string A = \"kitten\" and reference B = "
-"\"sitting\", the edit distance is 3 for A will be transformed into B at "
-"least after two substitutions and one insertion:"
-msgstr ""
-
-#: of paddle.fluid.layers.edit_distance:11
-msgid "\"kitten\" -> \"sitten\" -> \"sittin\" -> \"sitting\""
-msgstr ""
-
-#: of paddle.fluid.layers.edit_distance:13
-msgid ""
-"Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with "
-"the total number denoted by `batch_size`, and the separation is specified"
-" by the LoD information. And the `batch_size` reference strings are "
-"arranged in order in the same way in the LoDTensor Input(Refs)."
-msgstr ""
-
-#: of paddle.fluid.layers.edit_distance:18
-msgid ""
-"Output(Out) contains the `batch_size` results and each stands for the "
-"edit distance for a pair of strings respectively. If Attr(normalized) is "
-"true, the edit distance will be divided by the length of reference "
-"string."
-msgstr ""
-
-#: of paddle.fluid.layers.edit_distance:22
-msgid "The indices for hypothesis strings."
-msgstr ""
-
-#: of paddle.fluid.layers.edit_distance:24
-msgid "The indices for reference strings."
-msgstr ""
-
-#: of paddle.fluid.layers.edit_distance:26
-msgid ""
-"Indicated whether to normalize the edit distance by the length of "
-"reference string."
-msgstr ""
-
-#: of paddle.fluid.layers.edit_distance:29
-msgid "Tokens that should be removed before calculating edit distance."
-msgstr ""
-
-#: of paddle.fluid.layers.edit_distance:35
-msgid "sequence-to-sequence edit distance in shape [batch_size, 1]."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:669
-msgid "l2_normalize"
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:1
-msgid "**L2 normalize Layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:3
-msgid ""
-"The l2 normalize layer normalizes `x` along dimension `axis` using an L2 "
-"norm. For a 1-D tensor (`dim` is fixed to 0), this layer computes"
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:7
-msgid "y ="
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:8
-msgid "rac{x}{ \\sqrt{\\sum {x^2} + epsion }}"
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:10
-msgid ""
-"For `x` with more dimensions, this layer independently normalizes each "
-"1-D slice along dimension `axis`."
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:22 paddle.fluid.layers.lrn:29
-msgid "Args:"
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:14
-msgid ""
-"x(Variable|list): The input tensor to l2_normalize layer. axis(int): The "
-"axis on which to apply normalization. If `axis < 0`,"
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:16
-msgid ""
-"the dimension to normalization is rank(X) + axis. -1 is the last "
-"dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:18
-msgid "epsilon(float): The epsilon value is used to avoid division by zero,"
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:19
-msgid "the defalut value is 1e-10."
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:22
-msgid "name(str|None): A name for this layer(optional). If set None, the layer"
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:21
-msgid "will be named automatically."
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:25 paddle.fluid.layers.lrn:35
-msgid "Returns:"
-msgstr ""
-
-#: of paddle.fluid.layers.l2_normalize:25
-msgid "Variable: The output tensor variable."
-msgstr ""
-
-#: of paddle.fluid.layers.im2sequence:44 paddle.fluid.layers.l2_normalize:32
-#: paddle.fluid.layers.lrn:41
-msgid "Examples:"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:677
-msgid "matmul"
-msgstr ""
-
-#: of paddle.fluid.layers.matmul:1
-msgid "Applies matrix multiplication to two tensors."
-msgstr ""
-
-#: of paddle.fluid.layers.matmul:3
-msgid ""
-"Currently, the input tensors' rank can be any, but when the rank of any "
-"inputs is bigger than 3, this two inputs' rank should be equal."
-msgstr ""
-
-#: of paddle.fluid.layers.matmul:6
-msgid ""
-"The actual behavior depends on the shapes of :math:`x`, :math:`y` and the"
-" flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:"
-msgstr ""
-
-#: of paddle.fluid.layers.matmul:9
-msgid ""
-"If a transpose flag is specified, the last two dimensions of the tensor "
-"are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for "
-":math:`x` it is treated as :math:`[1, D]` in nontransposed form and as "
-":math:`[D, 1]` in transposed form, whereas for :math:`y` it is the "
-"opposite: It is treated as :math:`[D, 1]` in nontransposed form and as "
-":math:`[1, D]` in transposed form."
-msgstr ""
-
-#: of paddle.fluid.layers.matmul:16
-msgid ""
-"After transpose, the two tensors are 2-D or n-D and matrix multiplication"
-" performs in the following way."
-msgstr ""
-
-#: of paddle.fluid.layers.matmul:19
-msgid "If both are 2-D, they are multiplied like conventional matrices."
-msgstr ""
-
-#: of paddle.fluid.layers.matmul:20
-msgid ""
-"If either is n-D, it is treated as a stack of matrices residing in the "
-"last two dimensions and a batched matrix multiply supporting broadcast "
-"applies on the two tensors."
-msgstr ""
-
-#: of paddle.fluid.layers.matmul:24
-msgid ""
-"Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and "
-"nontransposed, the prepended or appended dimension :math:`1` will be "
-"removed after matrix multiplication."
-msgstr ""
-
-#: of paddle.fluid.layers.matmul:32
-msgid "Whether to transpose :math:`x` before multiplication."
-msgstr ""
-
-#: of paddle.fluid.layers.matmul:34
-msgid "Whether to transpose :math:`y` before multiplication."
-msgstr ""
-
-#: of paddle.fluid.layers.matmul:40
-msgid "The product Tensor variable."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:685
-msgid "topk"
-msgstr ""
-
-#: of paddle.fluid.layers.topk:1
-msgid ""
-"This operator is used to find values and indices of the k largest entries"
-" for the last dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.topk:4
-msgid ""
-"If the input is a vector (rank=1), finds the k largest entries in the "
-"vector and outputs their values and indices as vectors. Thus values[j] is"
-" the j-th largest entry in input, and its index is indices[j]."
-msgstr ""
-
-#: of paddle.fluid.layers.topk:8
-msgid ""
-"If the input is a Tensor with higher rank, this operator computes the top"
-" k entries along the last dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.topk:11
-msgid "The input variable which can be a vector or Tensor with higher rank."
-msgstr ""
-
-#: of paddle.fluid.layers.topk:14
-msgid "An integer value to specify the top k largest elements."
-msgstr ""
-
-#: of paddle.fluid.layers.topk:20
-msgid ""
-"The k largest elements along each last dimensional     slice. "
-"indices(Variable): The indices of values within the last dimension of"
-"     input."
-msgstr ""
-
-#: of paddle.fluid.layers.topk:22
-msgid "The k largest elements along each last dimensional"
-msgstr ""
-
-#: of paddle.fluid.layers.topk:23
-msgid "slice."
-msgstr ""
-
-#: of paddle.fluid.layers.topk:24
-msgid "indices(Variable): The indices of values within the last dimension of"
-msgstr ""
-
-#: of paddle.fluid.layers.topk:25
-msgid "input."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:693
-msgid "warpctc"
-msgstr ""
-
-#: of paddle.fluid.layers.warpctc:1
-msgid ""
-"An operator integrating the open source Warp-CTC library "
-"(https://github.com/baidu-research/warp-ctc) to compute Connectionist "
-"Temporal Classification (CTC) loss. It can be aliased as softmax with "
-"CTC, since a native softmax activation is interated to the Warp-CTC "
-"library, to to normlize values for each row of the input tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.warpctc:8
-msgid ""
-"(LodTensor, default: LoDTensor<float>), the unscaled probabilities of "
-"variable-length sequences, which is a 2-D Tensor with LoD information. "
-"It's shape is [Lp, num_classes + 1], where Lp is the sum of all input "
-"sequences' length and num_classes is the true number of classes. (not "
-"including the blank label)."
-msgstr ""
-
-#: of paddle.fluid.layers.warpctc:15
-msgid ""
-"(LodTensor, default: LoDTensor<int>), the ground truth of variable-length"
-" sequence, which is a 2-D Tensor with LoD information. It is of the shape"
-" [Lg, 1], where Lg is th sum of all labels' length."
-msgstr ""
-
-#: of paddle.fluid.layers.warpctc:20
-msgid ""
-"default 0, the blank label index of Connectionist Temporal Classification"
-" (CTC) loss, which is in the half-opened interval [0, num_classes + 1)."
-msgstr ""
-
-#: of paddle.fluid.layers.warpctc:24
-msgid ""
-"default false, whether to normalize the gradients by the number of time-"
-"step, which is also the sequence's length. There is no need to normalize "
-"the gradients if warpctc layer was follewed by a mean_op."
-msgstr ""
-
-#: of paddle.fluid.layers.warpctc:30
-msgid ""
-"The Connectionist Temporal Classification (CTC) loss, which is a 2-D "
-"Tensor of the shape [batch_size, 1]."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:701
-msgid "sequence_reshape"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_reshape:1
-msgid "**Sequence Reshape Layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_reshape:3
-msgid ""
-"This layer will rearrange the input sequences. The new dimension is set "
-"by user. Length of each sequence is computed according to original "
-"length, original dimension and new dimension. The following example will "
-"help to illustrate the function of this layer:"
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_reshape:24
-msgid ""
-"Currently, only 1-level LoDTensor is supported and please make sure "
-"(original length * original dimension) can be divided by new dimension "
-"with no remainder for each sequence."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_reshape:28
-msgid ""
-"(LodTensor, default: LoDTensor<float>), a 2-D LoDTensor with shape being "
-"[N, M] where M for dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_reshape:31
-msgid "New dimension which the input LoDTensor is reshaped to."
-msgstr ""
-
-#: of paddle.fluid.layers.sequence_reshape:34
-msgid "Reshaped LoDTensor according to new dimension."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:709
-msgid "transpose"
-msgstr ""
-
-#: of paddle.fluid.layers.transpose:1
-msgid "**transpose Layer**"
-msgstr ""
-
-#: of paddle.fluid.layers.transpose:3
-msgid "Permute the dimensions of `input` according to `perm`."
-msgstr ""
-
-#: of paddle.fluid.layers.transpose:5
-msgid ""
-"The `i`-th dimension  of the returned tensor will correspond to the "
-"perm[i]-th dimension of `input`."
-msgstr ""
-
-#: of paddle.fluid.layers.transpose:8
-msgid "The input Tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.transpose:10
-msgid "A permutation of the dimensions of `input`."
-msgstr ""
-
-#: of paddle.fluid.layers.transpose:15
-msgid "A transposed Tensor."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:717
-msgid "im2sequence"
-msgstr ""
-
-#: of paddle.fluid.layers.im2sequence:1
-msgid ""
-"Extracts image patches from the input tensor to form a tensor of shape "
-"{input.batch_size * output_height * output_width, filter_size_H * "
-"filter_size_W * input.channels} which is similar with im2col. This op use"
-" filter / kernel to scan images and convert these images to sequences. "
-"After expanding, the number of time step are output_height * output_width"
-" for an image, in which output_height and output_width are calculated by "
-"below equation:"
-msgstr ""
-
-#: of paddle.fluid.layers.im2sequence:13
-msgid "And the dimension of each time step is block_y * block_x * input.channels."
-msgstr ""
-
-#: of paddle.fluid.layers.im2sequence:15
-msgid "The input should be a tensor in NCHW format."
-msgstr ""
-
-#: of paddle.fluid.layers.im2sequence:25
-msgid ""
-"The padding size. If padding is a tuple, it can contain two integers like"
-" (padding_H, padding_W) which means padding_up = padding_down = padding_H"
-" and padding_left = padding_right = padding_W. Or it can use (padding_up,"
-" padding_left, padding_down, padding_right) to indicate paddings of four "
-"direction. Otherwise, a scalar padding means padding_up = padding_down = "
-"padding_left = padding_right = padding Default: padding = 0."
-msgstr ""
-
-#: of paddle.fluid.layers.im2sequence:37
-msgid ""
-"The output is a LoDTensor with shape {input.batch_size * output_height * "
-"output_width, filter_size_H * filter_size_W * input.channels}. If we "
-"regard output as a matrix, each row of this matrix is a step of a "
-"sequence."
-msgstr ""
-
-#: of paddle.fluid.layers.im2sequence:46
-msgid "As an example:"
-msgstr ""
-
-#: of paddle.fluid.layers.im2sequence:91
-msgid "The simple usage is:"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:725
-msgid "nce"
-msgstr ""
-
-#: of paddle.fluid.layers.nce:1
-msgid ""
-"Compute and return the noise-contrastive estimation training loss. See "
-"[Noise-contrastive estimation: A new estimation principle for "
-"unnormalized statistical "
-"models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf)."
-" By default this operator uses a uniform distribution for sampling."
-msgstr ""
-
-#: of paddle.fluid.layers.nce:5
-msgid "input variable."
-msgstr ""
-
-#: of paddle.fluid.layers.nce:7
-msgid "label."
-msgstr ""
-
-#: of paddle.fluid.layers.nce:9
-msgid "Total number of classes in all samples"
-msgstr ""
-
-#: of paddle.fluid.layers.nce:11
-msgid ""
-"(Tensor) A tensor of shape [batch_size, 1] storing a weight for each "
-"sample. And it is a dispensable input. The default value of sample is 1"
-msgstr ""
-
-#: of paddle.fluid.layers.nce:17
-msgid "The number of negative classes. The default value is 10"
-msgstr ""
-
-#: of paddle.fluid.layers.nce:20
-msgid "output of nce layer."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:733
-msgid "beam_search"
-msgstr ""
-
-#: of paddle.fluid.layers.beam_search:1
-msgid "This function implements the beam search algorithm."
-msgstr ""
-
-#: of paddle.fluid.layers.beam_search:3
-msgid "${pre_ids_comment}"
-msgstr ""
-
-#: of paddle.fluid.layers.beam_search:9
-msgid "${beam_size_comment}"
-msgstr ""
-
-#: of paddle.fluid.layers.beam_search:11
-msgid "${end_id_comment}"
-msgstr ""
-
-#: of paddle.fluid.layers.beam_search:13
-msgid "${level_comment}"
-msgstr ""
-
-#: of paddle.fluid.layers.beam_search:16
-msgid "a tuple of beam_search output variables: selected_ids, selected_scores"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:741
-msgid "row_conv"
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:1
-msgid ":strong:`Row-convolution operator`"
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:3
-msgid ""
-"The row convolution is called lookahead convolution.  This operator was "
-"introduced in the following paper for DeepSpeech2: "
-"http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf"
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:5
-msgid ""
-"The main motivation is that a bidirectional RNN, useful in DeepSpeech "
-"like speech models, learns representation for a sequence by performing a "
-"forward and a backward pass through the entire sequence. However, unlike "
-"unidirectional RNNs, bidirectional RNNs are challenging to deploy in an "
-"online and low-latency setting. The lookahead convolution incorporates "
-"information from future subsequences in a computationally efficient "
-"manner to improve unidirectional recurrent neural networks. The row "
-"convolution operator is different from the 1D sequence convolution, and "
-"is computed as follows:"
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:7
-msgid ""
-"Given an input sequence :math:`in` of length :math:`t` and input "
-"dimension :math:`d`, and a filter (:math:`W`) of size :math:`context "
-"\\times d`, the output sequence is convolved as:"
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:9
-msgid ""
-"$$ out_{i, :} = \\\\sum_{j=i}^{i + context} in_{j,:} \\\\cdot W_{i-j, :} "
-"$$"
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:13
-msgid ":math:`Out_{i}`: The i-th row of output variable with shape [1, D]."
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:15
-msgid ":math:`\\\\tau`: Future context size."
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:17
-msgid ":math:`X_{j}`: The j-th row of input variable with shape [1, D]."
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:19
-msgid ":math:`W_{i-j}`: The (i-j)-th row of parameters with shape [1, D]."
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:21
-msgid ""
-"More details about row_conv please refer to the design document "
-"https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645"
-" ."
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:27
-msgid ""
-"the input(X) is a LodTensor, which supports variable time-length input "
-"sequences. The underlying tensor in this LoDTensor is a matrix with shape"
-" (T x N), where T is the total time steps in this mini-batch and N is the"
-" input data dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:29
-msgid ""
-"Future context size. Please note, the shape of convolution kernel is "
-"[future_context_size + 1, D]."
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:32
-msgid "Attributes of parameters, including name, initializer etc."
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:35
-msgid "Non-linear activation to be applied to output variable."
-msgstr ""
-
-#: of paddle.fluid.layers.row_conv:38
-msgid ""
-"the output(Out) is a LodTensor, which supports variable time-length input"
-" sequences. The underlying tensor in this LodTensor is a matrix with "
-"shape T x N, i.e., the same shape as X."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:749
-msgid "multiplex"
-msgstr ""
-
-#: of paddle.fluid.layers.multiplex:1
-msgid ""
-"Referring to the given index variable, this layer selects rows from the "
-"input variables to construct a multiplex variable. Assuming that there "
-"are :math:`m` input variables and :math:`I_i` represents the i-th input "
-"variable and :math:`i` is in [0, :math:`m`). All input variables are "
-"tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`]. "
-"Please note that rank of the input tensor should be at least 2. Each "
-"input variable will be treated as a 2-D matrix with shape [:math:`M`, "
-":math:`N`] where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` "
-"* :math:`d_2` * ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of "
-"the i-th input variable. The given index variable should be a 2-D tensor "
-"with shape [:math:`M`, 1]. Let `ID[i]` be the i-th index value of the "
-"index variable. Then the output variable will be a tensor with shape "
-"[:math:`d_0`, :math:`d_1`, ..., :math:`d_R`]. If we treat the output "
-"tensor as a 2-D matrix with shape [:math:`M`, :math:`N`] and let "
-":math:`O[i]` be the i-th row of the matrix, then `O[i]` is equal to "
-":math:`I_{ID[i]}[i]`."
-msgstr ""
-
-#: of paddle.fluid.layers.multiplex:3
-msgid "Ids: the index tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.multiplex:5
-msgid "X[0 : N - 1]: the candidate tensors for output (N >= 2)."
-msgstr ""
-
-#: of paddle.fluid.layers.multiplex:7
-msgid ""
-"For each index i from 0 to batchSize - 1, the output is the i-th row of "
-"the the (Ids[i])-th tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.multiplex:9
-msgid "For i-th row of the output tensor:"
-msgstr ""
-
-#: of paddle.fluid.layers.multiplex:11
-msgid "$$ y[i] = x_{k}[i] $$"
-msgstr ""
-
-#: of paddle.fluid.layers.multiplex:13
-msgid ""
-"where :math:`y` is the output tensor, :math:`x_{k}` is the k-th input "
-"tensor, and :math:`k = Ids[i]`."
-msgstr ""
-
-#: of paddle.fluid.layers.multiplex:25
-msgid ""
-"A list of variables to gather from. All variables have the same shape and"
-" the rank is at least 2."
-msgstr ""
-
-#: of paddle.fluid.layers.multiplex:27
-msgid ""
-"Tensor<int32>, index variable which is a 2-D tensor with shape [M, 1] "
-"where M is the batch size."
-msgstr ""
-
-#: of paddle.fluid.layers.multiplex:30
-msgid "The output tensor of multiplex operator."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:757
-msgid "layer_norm"
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:1
-msgid ""
-"Assume feature vectors exist on dimensions :attr:`begin_norm_axis ... "
-"rank(input)` and calculate the moment statistics along these dimensions "
-"for each feature vector :math:`a` with size :math:`H`, then normalize "
-"each feature vector using the corresponding statistics. After that, apply"
-" learnable gain and bias on the normalized tensor to scale and shift if "
-":attr:`scale` and :attr:`shift` are set."
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:3
-msgid "Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_"
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:17
-msgid ":math:`a`: the vector representation of the summed inputs to the neurons"
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:18
-msgid "in that layer."
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:20
-msgid ":math:`H`: the number of hidden units in a layers"
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:22
-msgid ":math:`g`: the trainable scale parameter."
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:24
-msgid ":math:`b`: the trainable bias parameter."
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:26 paddle.fluid.layers.pad:26
-msgid "The input tensor variable."
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:28
-msgid "Whether to learn the adaptive gain :math:`g` after normalization."
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:31
-msgid "Whether to learn the adaptive bias :math:`b` after normalization."
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:34
-msgid ""
-"The normalization will be performed along dimensions from "
-":attr:`begin_norm_axis` to :attr:`rank(input)`."
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:37
-msgid "The small value added to the variance to prevent division by zero."
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:40
-msgid "The parameter attribute for the learnable gain :math:`g`."
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:43
-msgid "The parameter attribute for the learnable bias :math:`b`."
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:46
-msgid "Activation to be applied to the output of layer normalizaiton."
-msgstr ""
-
-#: of paddle.fluid.layers.layer_norm:51
-msgid "Result after normalization"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:765
-msgid "softmax_with_cross_entropy"
-msgstr ""
-
-#: of paddle.fluid.layers.softmax_with_cross_entropy:1
-msgid "**Softmax With Cross Entropy Operator.**"
-msgstr ""
-
-#: of paddle.fluid.layers.softmax_with_cross_entropy:3
-msgid ""
-"Cross entropy loss with softmax is used as the output layer extensively. "
-"This operator computes the softmax normalized values for each row of the "
-"input tensor, after which cross-entropy loss is computed. This provides a"
-" more numerically stable gradient."
-msgstr ""
-
-#: of paddle.fluid.layers.softmax_with_cross_entropy:8
-msgid ""
-"Because this operator performs a softmax on logits internally, it expects"
-" unscaled logits. This operator should not be used with the output of "
-"softmax operator since that would produce incorrect results."
-msgstr ""
-
-#: of paddle.fluid.layers.softmax_with_cross_entropy:12
-msgid ""
-"When the attribute soft_label is set false, this operators expects "
-"mutually exclusive hard labels, each sample in a batch is in exactly one "
-"class with a probability of 1.0. Each sample in the batch will have a "
-"single label."
-msgstr ""
-
-#: of paddle.fluid.layers.softmax_with_cross_entropy:16
-msgid "The equation is as follows:"
-msgstr ""
-
-#: of paddle.fluid.layers.softmax_with_cross_entropy:18
-msgid "Hard label (one-hot label, so every sample has exactly one class)"
-msgstr ""
-
-#: of paddle.fluid.layers.softmax_with_cross_entropy:25
-msgid "Soft label (each sample can have a distribution over all classes)"
-msgstr ""
-
-#: of paddle.fluid.layers.softmax_with_cross_entropy:33
-msgid ""
-"The unscaled log probabilities, which is a 2-D tensor with shape [N x K]."
-" N is the batch_size, and K is the class number."
-msgstr ""
-
-#: of paddle.fluid.layers.softmax_with_cross_entropy:36
-msgid ""
-"The ground truth which is a 2-D tensor. If soft_label is set to false, "
-"Label is a Tensor<int64> with shape [N x 1]. If soft_label is set to "
-"true, Label is a Tensor<float/double> with"
-msgstr ""
-
-#: of paddle.fluid.layers.softmax_with_cross_entropy:40
-msgid ""
-"A flag to indicate whether to interpretate the given labels as soft "
-"labels. By default, `soft_label` is set to False."
-msgstr ""
-
-#: of paddle.fluid.layers.softmax_with_cross_entropy:44
-msgid "The cross entropy loss is a 2-D tensor with shape [N x 1]."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:773
-msgid "smooth_l1"
-msgstr ""
-
-#: of paddle.fluid.layers.smooth_l1:1
-msgid "**Smooth L1 Loss Operator. **"
-msgstr ""
-
-#: of paddle.fluid.layers.smooth_l1:3
-msgid ""
-"This operator computes the smooth L1 loss for X and Y. The operator takes"
-" the first dimension of X and Y as batch size. For each instance, it "
-"computes the smooth L1 loss element by element first and then sums all "
-"the losses. So the shape of Out is [batch_size, 1]."
-msgstr ""
-
-#: of paddle.fluid.layers.smooth_l1:8
-msgid ""
-"A tensor with rank at least 2. The input value of smooth L1 loss op with "
-"shape [batch_size, dim1, ..., dimN]."
-msgstr ""
-
-#: of paddle.fluid.layers.smooth_l1:11
-msgid ""
-"A tensor with rank at least 2. The target value of smooth L1 loss op with"
-" same shape as x."
-msgstr ""
-
-#: of paddle.fluid.layers.smooth_l1:14
-msgid ""
-"A tensor with rank at least 2. This input is optional and should have "
-"same shape with x. If provided, the result of (x - y) will be multiplied "
-"by this tensor element by element."
-msgstr ""
-
-#: of paddle.fluid.layers.smooth_l1:19
-msgid ""
-"A tensor with rank at least 2. This input is optional and should have "
-"same shape with x. If provided, the out smooth L1 loss will be multiplied"
-" by this tensor element by element."
-msgstr ""
-
-#: of paddle.fluid.layers.smooth_l1:24
-msgid ""
-"Hyper parameter of smooth L1 loss op. A float scalar with default value "
-"1.0."
-msgstr ""
-
-#: of paddle.fluid.layers.smooth_l1:28
-msgid ""
-"A tensor with rank be 2. The output smooth L1 loss with     shape "
-"[batch_size, 1]."
-msgstr ""
-
-#: of paddle.fluid.layers.smooth_l1:30
-msgid "A tensor with rank be 2. The output smooth L1 loss with"
-msgstr ""
-
-#: of paddle.fluid.layers.smooth_l1:31
-msgid "shape [batch_size, 1]."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:781
-msgid "one_hot"
-msgstr ""
-
-#: of paddle.fluid.layers.one_hot:1
-msgid ""
-"One Hot Operator. This operator creates the one-hot representations for "
-"input index values. The following example will help to explain the "
-"function of this operator."
-msgstr ""
-
-#: of paddle.fluid.layers.one_hot:5
-msgid "A Tensor/LodTensor of indices, last dimension must be 1."
-msgstr ""
-
-#: of paddle.fluid.layers.one_hot:7
-msgid "an interger defining the depth of the one hot dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.one_hot:10
-msgid "The one-hot tensor or LodTensor, same as input."
-msgstr ""
-
-#: of paddle.fluid.layers.one_hot:18
-msgid "X is a LoDTensor:"
-msgstr ""
-
-#: of paddle.fluid.layers.one_hot:17
-msgid "X.lod = [[0, 1, 4]] X.shape = [4, 1] X.data = [[1], [1], [3], [0]]"
-msgstr ""
-
-#: of paddle.fluid.layers.one_hot:20
-msgid "set depth = 4 Out is a LoDTensor:"
-msgstr ""
-
-#: of paddle.fluid.layers.one_hot:22
-msgid "Out.lod = [[0, 1, 4]] Out.shape = [4, 4] Out.data = [[0., 1., 0., 0.],"
-msgstr ""
-
-#: of paddle.fluid.layers.one_hot:25
-msgid "[0., 1., 0., 0.], [0., 0., 0., 1.], [1., 0., 0., 0.]]"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:789
-msgid "autoincreased_step_counter"
-msgstr ""
-
-#: of paddle.fluid.layers.autoincreased_step_counter:1
-msgid ""
-"NOTE: The counter will be automatically increased by 1 every mini-batch "
-"Return the run counter of the main program, which is started with 1."
-msgstr ""
-
-#: of paddle.fluid.layers.autoincreased_step_counter:4
-msgid "The counter name, default is '@STEP_COUNTER@'."
-msgstr ""
-
-#: of paddle.fluid.layers.autoincreased_step_counter:6
-msgid "The first value of this counter."
-msgstr ""
-
-#: of paddle.fluid.layers.autoincreased_step_counter:8
-msgid "The increment step between each execution."
-msgstr ""
-
-#: of paddle.fluid.layers.autoincreased_step_counter:11
-msgid "The global run counter."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:797
-msgid "reshape"
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:1
-msgid "Gives a new shape to the input Tensor without changing its data."
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:3
-msgid ""
-"The target shape can be given by :attr:`shape` or :attr:`actual_shape`. "
-":attr:`shape` is a list of integer while :attr:`actual_shape` is a tensor"
-" variable. :attr:`actual_shape` has a higher priority than :attr:`shape` "
-"if it is provided, while :attr:`shape` still should be set correctly to "
-"gurantee shape inference in compile-time."
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:9
-msgid "Some tricks exist when specifying the target shape."
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:11
-msgid ""
-"1. -1 means the value of this dimension is inferred from the total "
-"element number of x and remaining dimensions. Thus one and only one "
-"dimension can be set -1."
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:15
-msgid ""
-"2. 0 means the actual dimension value is going to be copied from the "
-"corresponding dimension of x. The indice of 0s in shape can not exceed "
-"Rank(X)."
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:19
-msgid "Here are some examples to explain it."
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:21
-msgid ""
-"1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape is "
-"[6, 8], the reshape operator will transform x into a 2-D tensor with "
-"shape [6, 8] and leaving x's data unchanged."
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:25
-msgid ""
-"2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape "
-"specified is [2, 3, -1, 2], the reshape operator will transform x into a "
-"4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In "
-"this case, one dimension of the target shape is set to -1, the value of "
-"this dimension is inferred from the total element number of x and "
-"remaining dimensions."
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:32
-msgid ""
-"3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape is "
-"[-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor "
-"with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case, "
-"besides -1, 0 means the actual dimension value is going to be copied from"
-" the corresponding dimension of x."
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:38
-msgid "The input tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:40
-msgid "The new shape. At most one dimension of the new shape can be -1."
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:43
-msgid ""
-"An optional input. If provided, reshape according to this given shape "
-"rather than :attr:`shape` specifying shape. That is to say "
-":attr:`actual_shape` has a higher priority than :attr:`shape`."
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:49
-msgid "The non-linear activation to be applied to output variable."
-msgstr ""
-
-#: of paddle.fluid.layers.reshape:51
-msgid ""
-"If this flag is set true, a new output tensor is created whose data is "
-"copied from input x, otherwise the output shares data with input without "
-"copying."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant:14 paddle.fluid.layers.reshape:58
-msgid "The output tensor."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:805
-msgid "lod_reset"
-msgstr ""
-
-#: of paddle.fluid.layers.lod_reset:1
-msgid ""
-"LoD Reset Operator. Set LoD of **x** to a new one specified by **y** or "
-"**target_lod**. When **y** provided, **y.lod** would be considered as "
-"target LoD first, otherwise **y.data** would be considered as target LoD."
-" If **y** is not provided, target LoD should be specified by "
-"**target_lod**. If target LoD is specified by **Y.data** or "
-"**target_lod**, only one level LoD is supported."
-msgstr ""
-
-#: of paddle.fluid.layers.lod_reset:57
-msgid "Input variable which could be a Tensor or LodTensor."
-msgstr ""
-
-#: of paddle.fluid.layers.lod_reset:59
-msgid "If provided, output's LoD would be derived from y."
-msgstr ""
-
-#: of paddle.fluid.layers.lod_reset:61
-msgid ""
-"One level LoD which should be considered as target LoD when y not "
-"provided."
-msgstr ""
-
-#: of paddle.fluid.layers.lod_reset:65
-msgid "Output variable with LoD specified by this operator."
-msgstr ""
-
-#: of paddle.fluid.layers.lod_reset:68
-msgid ":exc:`ValueError` -- If y and target_lod are both None."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:813
-msgid "lrn"
-msgstr ""
-
-#: of paddle.fluid.layers.lrn:1
-msgid ""
-"Local Response Normalization Layer. This layer performs a type of "
-"\"lateral inhibition\" by normalizing over local input regions."
-msgstr ""
-
-#: of paddle.fluid.layers.lrn:11
-msgid "ight)^{eta}"
-msgstr ""
-
-#: of paddle.fluid.layers.lrn:15
-msgid ":math:`n`: The number of channels to sum over."
-msgstr ""
-
-#: of paddle.fluid.layers.lrn:16
-msgid ":math:`k`: The offset (avoid being divided by 0)."
-msgstr ""
-
-#: of paddle.fluid.layers.lrn:17
-msgid ":math:`alpha`: The scaling parameter."
-msgstr ""
-
-#: of paddle.fluid.layers.lrn:18
-msgid ":math:`beta`: The exponent parameter."
-msgstr ""
-
-#: of paddle.fluid.layers.lrn:20
-msgid ""
-"Refer to `ImageNet Classification with Deep Convolutional Neural Networks"
-" <https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-"
-"convolutional-neural-networks.pdf>`_"
-msgstr ""
-
-#: of paddle.fluid.layers.lrn:24
-msgid ""
-"input (Variable): The input tensor of this layer, and the dimension of "
-"input tensor must be 4. n (int, default 5): The number of channels to sum"
-" over. k (float, default 1.0): An offset (usually positive to avoid "
-"dividing by 0). alpha (float, default 1e-4): The scaling parameter. beta "
-"(float, default 0.75): The exponent. name (str, default None): A name for"
-" this operation."
-msgstr ""
-
-#: of paddle.fluid.layers.lrn:32
-msgid "Raises:"
-msgstr ""
-
-#: of paddle.fluid.layers.lrn:32
-msgid "ValueError: If rank of the input tensor is not 4."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:821
-msgid "pad"
-msgstr ""
-
-#: of paddle.fluid.layers.pad:1
-msgid ""
-"Pads a tensor with a constant value given by :attr:`pad_value`, and the "
-"padded width is specified by :attr:`paddings`."
-msgstr ""
-
-#: of paddle.fluid.layers.pad:4
-msgid ""
-"Specifically, the number of values padded before the contents of "
-":attr:`x` in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and"
-" the number of values padded after the contents of :attr:`x` in dimension"
-" :attr:`i` is indicated by :attr:`paddings[i+1]`."
-msgstr ""
-
-#: of paddle.fluid.layers.pad:9
-msgid "See below for an example."
-msgstr ""
-
-#: of paddle.fluid.layers.pad:28
-msgid ""
-"A list of integers. Its elements specify the padded width before and "
-"after for each dimension in turn. The length of :attr:paddings must be "
-":math:`rank(x) \\times 2`."
-msgstr ""
-
-#: of paddle.fluid.layers.pad:33
-msgid "The constant value used to pad."
-msgstr ""
-
-#: of paddle.fluid.layers.pad:39
-msgid "The padded tensor variable."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:829
-msgid "label_smooth"
-msgstr ""
-
-#: of paddle.fluid.layers.label_smooth:1
-msgid ""
-"Label smoothing is a mechanism to regularize the classifier layer and is "
-"called label-smoothing regularization (LSR)."
-msgstr ""
-
-#: of paddle.fluid.layers.label_smooth:4
-msgid ""
-"Label smoothing is proposed to encourage the model to be less confident, "
-"since optimizing the log-likelihood of the correct label directly may "
-"cause overfitting and reduce the ability of the model to adapt. Label "
-"smoothing replaces the ground-truth label :math:`y` with the weighted sum"
-" of itself and some fixed distribution :math:`\\mu`. For class :math:`k`,"
-" i.e."
-msgstr ""
-
-#: of paddle.fluid.layers.label_smooth:15
-msgid ""
-"where :math:`1 - \\epsilon` and :math:`\\epsilon` are the weights "
-"respectively, and :math:`\\tilde{y}_k` is the smoothed label. Usually "
-"uniform distribution is used for :math:`\\mu`."
-msgstr ""
-
-#: of paddle.fluid.layers.label_smooth:19
-msgid ""
-"See more details about label smoothing in "
-"https://arxiv.org/abs/1512.00567."
-msgstr ""
-
-#: of paddle.fluid.layers.label_smooth:21
-msgid ""
-"The input variable containing the label data. The label data should use "
-"one-hot representation."
-msgstr ""
-
-#: of paddle.fluid.layers.label_smooth:24
-msgid ""
-"The prior distribution to be used to smooth labels. If not provided, an "
-"uniform distribution is used. The shape of :attr:`prior_dist` should be "
-":math:`(1, class\\_num)`."
-msgstr ""
-
-#: of paddle.fluid.layers.label_smooth:29
-msgid ""
-"The weight used to mix up the original ground-truth distribution and the "
-"fixed distribution."
-msgstr ""
-
-#: of paddle.fluid.layers.label_smooth:32
-msgid "The type of data : float32, float_64, int etc."
-msgstr ""
-
-#: of paddle.fluid.layers.label_smooth:39
-msgid "The tensor variable containing the smoothed labels."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:837
-msgid "roi_pool"
-msgstr ""
-
-#: of paddle.fluid.layers.roi_pool:2
-msgid "Region of interest pooling (also known as RoI pooling) is to perform"
-msgstr ""
-
-#: of paddle.fluid.layers.roi_pool:2
-msgid ""
-"is to perform max pooling on inputs of nonuniform sizes to obtain fixed-"
-"size feature maps (e.g. 7*7)."
-msgstr ""
-
-#: of paddle.fluid.layers.roi_pool:8
-msgid "The operator has three steps:"
-msgstr ""
-
-#: of paddle.fluid.layers.roi_pool:5
-msgid ""
-"Dividing each region proposal into equal-sized sections with the "
-"pooled_width and pooled_height"
-msgstr ""
-
-#: of paddle.fluid.layers.roi_pool:7
-msgid "Finding the largest value in each section"
-msgstr ""
-
-#: of paddle.fluid.layers.roi_pool:8
-msgid "Copying these max values to the output buffer"
-msgstr ""
-
-#: of paddle.fluid.layers.roi_pool:10
-msgid "The input for ROI pooling."
-msgstr ""
-
-#: of paddle.fluid.layers.roi_pool:12
-msgid ""
-"ROIs (Regions of Interest) to pool over. It should be a 2-D one level "
-"LoTensor of shape [num_rois, 4]. The layout is [x1, y1, x2, y2], where "
-"(x1, y1) is the top left coordinates, and (x2, y2) is the bottom right "
-"coordinates. The num_rois is the total number of ROIs in this batch data."
-msgstr ""
-
-#: of paddle.fluid.layers.roi_pool:19
-msgid "The pooled output height. Default: 1"
-msgstr ""
-
-#: of paddle.fluid.layers.roi_pool:21
-msgid "The pooled output width. Default: 1"
-msgstr ""
-
-#: of paddle.fluid.layers.roi_pool:23
-msgid ""
-"Multiplicative spatial scale factor. To translate ROI coords from their "
-"input scale to the scale used when pooling. Default: 1.0"
-msgstr ""
-
-#: of paddle.fluid.layers.roi_pool:28
-msgid ""
-"The output is a 4-D tensor of the shape                      (num_rois, "
-"channels, pooled_h, pooled_w)."
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize:31
-#: paddle.fluid.layers.image_resize_short:17 paddle.fluid.layers.roi_pool:30
-msgid "The output is a 4-D tensor of the shape"
-msgstr ""
-
-#: of paddle.fluid.layers.roi_pool:31
-msgid "(num_rois, channels, pooled_h, pooled_w)."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:845
-msgid "dice_loss"
-msgstr ""
-
-#: of paddle.fluid.layers.dice_loss:1
-msgid ""
-"Dice loss for comparing the similarity of two batch of data, usually is "
-"used for binary image segmentation i.e. labels are binary. The dice loss "
-"can be defined as below equation:"
-msgstr ""
-
-#: of paddle.fluid.layers.dice_loss:12
-msgid ""
-"The predictions with rank>=2. The first dimension is batch size, and the "
-"last dimension is class number."
-msgstr ""
-
-#: of paddle.fluid.layers.dice_loss:15
-msgid ""
-"The groud truth with the same rank with input. The first dimension is "
-"batch size, and the last dimension is 1."
-msgstr ""
-
-#: of paddle.fluid.layers.dice_loss:18
-msgid ""
-"The epsilon will be added to the numerator and denominator. If both input"
-" and label are empty, it makes sure dice is 1. Default: 0.00001"
-msgstr ""
-
-#: of paddle.fluid.layers.dice_loss:23
-msgid "The dice loss with shape [1]."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:853
-msgid "image_resize"
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize:1
-msgid "Resize a batch of images."
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize:3
-msgid ""
-"The input must be a tensor of the shape (num_batches, channels, in_h, "
-"in_w), and the resizing only applies on the last two dimensions(hight and"
-" width)."
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize:7
-msgid "Supporting resample methods:"
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize:7
-msgid "'BILINEAR' : Bilinear interpolation"
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize:9
-#: paddle.fluid.layers.image_resize_short:6
-msgid ""
-"The input tensor of image resize layer, This is a 4-D tensor of the shape"
-" (num_batches, channels, in_h, in_w)."
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize:13
-msgid ""
-"Output shape of image resize layer, the shape is (out_h, out_w). Default:"
-" None"
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize:17
-msgid ""
-"The multiplier for the input height or width. At least one of out_shape "
-"or scale must be set. And out_shape has a higher priority than scale. "
-"Default: None"
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize:25
-msgid ""
-"The resample method. It can only be 'BILINEAR' currently. Default: "
-"'BILINEAR'"
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize:29
-#: paddle.fluid.layers.image_resize_short:15
-msgid ""
-"The output is a 4-D tensor of the shape                 (num_batches, "
-"channls, out_h, out_w)."
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize:32
-#: paddle.fluid.layers.image_resize_short:18
-msgid "(num_batches, channls, out_h, out_w)."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:861
-msgid "image_resize_short"
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize_short:1
-msgid ""
-"Resize a batch of images. The short edge of input images will be resized "
-"to the given 'out_short_len'. The long edge of input images will be "
-"resized proportionately to make images' length-width ratio constant."
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize_short:10
-msgid "The length of output images' short edge."
-msgstr ""
-
-#: of paddle.fluid.layers.image_resize_short:12
-msgid "resample method, default: BILINEAR."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:869
-msgid "resize_bilinear"
-msgstr ""
-
-#: of paddle.fluid.layers.resize_bilinear:1
-msgid ""
-"Bilinear interpolation is an extension of linear interpolation for "
-"interpolating functions of two variables (e.g. H-direction and "
-"W-direction in this op) on a rectilinear 2D grid."
-msgstr ""
-
-#: of paddle.fluid.layers.resize_bilinear:3
-msgid ""
-"The key idea is to perform linear interpolation first in one direction, "
-"and then again in the other direction."
-msgstr ""
-
-#: of paddle.fluid.layers.resize_bilinear:5
-msgid ""
-"For details, please refer to Wikipedia: "
-"https://en.wikipedia.org/wiki/Bilinear_interpolation"
-msgstr ""
-
-#: of paddle.fluid.layers.resize_bilinear:9
-msgid ""
-"The input tensor of bilinear interpolation, This is a 4-D tensor with "
-"shape of (N x C x h x w)."
-msgstr ""
-
-#: of paddle.fluid.layers.resize_bilinear:11
-msgid ""
-"This is a 1-D tensor with two number. The first number is height and the "
-"second number is width."
-msgstr ""
-
-#: of paddle.fluid.layers.resize_bilinear:13
-msgid ""
-"The multiplier for the input height or width. At least one of out_shape "
-"or scale must be set. And out_shape has a higher priority than scale. "
-"Default: None."
-msgstr ""
-
-#: of paddle.fluid.layers.resize_bilinear:17
-msgid "The output variable name."
-msgstr ""
-
-#: of paddle.fluid.layers.resize_bilinear:20
-msgid "The dimension of output is (N x C x out_h x out_w)."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:877
-msgid "gather"
-msgstr ""
-
-#: of paddle.fluid.layers.gather:1
-msgid ""
-"Output is obtained by gathering entries of the outer-most dimension of X "
-"indexed by `index` and concatenate them together."
-msgstr ""
-
-#: of paddle.fluid.layers.gather:25
-msgid "The source input with rank>=1."
-msgstr ""
-
-#: of paddle.fluid.layers.gather:27
-msgid "The index input with rank=1."
-msgstr ""
-
-#: of paddle.fluid.layers.gather:30
-msgid "The output is a tensor with the same rank as input."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:885
-msgid "random_crop"
-msgstr ""
-
-#: of paddle.fluid.layers.random_crop:1
-msgid ""
-"This operator takes a batch of instance, and do random cropping on each "
-"instance. It means that cropping positions differs on each instance, "
-"which is determined by an uniform random generator. All cropped instances"
-" have the same shape, which is determined by the operator's attribute "
-"'shape'."
-msgstr ""
-
-#: of paddle.fluid.layers.random_crop:10
-msgid "A batch of instances to random crop"
-msgstr ""
-
-#: of paddle.fluid.layers.random_crop:12
-msgid "The shape of a cropped instance"
-msgstr ""
-
-#: of paddle.fluid.layers.random_crop:14
-msgid ""
-"The random seed By default, the seed will get from "
-"`random.randint(-65536, 65535)`."
-msgstr ""
-
-#: of paddle.fluid.layers.random_crop:18
-msgid "The cropped instance batch"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:893
-msgid "mean_iou"
-msgstr ""
-
-#: of paddle.fluid.layers.mean_iou:1
-msgid ""
-"Mean Intersection-Over-Union is a common evaluation metric for semantic "
-"image segmentation, which first computes the IOU for each semantic class "
-"and then computes the average over classes. IOU is defined as follows:"
-msgstr ""
-
-#: of paddle.fluid.layers.mean_iou:10
-msgid ""
-"The predictions are accumulated in a confusion matrix and mean-IOU is "
-"then calculated from it."
-msgstr ""
-
-#: of paddle.fluid.layers.mean_iou:14
-msgid ""
-"A Tensor of prediction results for semantic labels with type int32 or "
-"int64."
-msgstr ""
-
-#: of paddle.fluid.layers.mean_iou:16
-msgid ""
-"A Tensor of ground truth labels with type int32 or int64. Its shape "
-"should be the same as input."
-msgstr ""
-
-#: of paddle.fluid.layers.mean_iou:20
-msgid ""
-"A Tensor representing the mean intersection-over-union with shape [1]. "
-"out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers"
-" of each class. out_correct(Variable): A Tensor with shape [num_classes]."
-" The correct numbers of each class."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:899
-msgid "ops"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:904
-msgid "mean"
-msgstr ""
-
-#: of paddle.fluid.layers.mean:1
-msgid "Mean Operator calculates the mean of all elements in X."
-msgstr ""
-
-#: of paddle.fluid.layers.mean:4
-msgid "(Tensor) The input of mean op Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.mean:7
-msgid "(Tensor) The output of mean op"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:912
-msgid "mul"
-msgstr ""
-
-#: of paddle.fluid.layers.mul:1
-msgid "Mul Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.mul:3
-msgid ""
-"This operator is used to perform matrix multiplication for input $X$ and "
-"$Y$."
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:3
-#: paddle.fluid.layers.elementwise_div:3 paddle.fluid.layers.elementwise_max:3
-#: paddle.fluid.layers.elementwise_min:3 paddle.fluid.layers.elementwise_mul:3
-#: paddle.fluid.layers.elementwise_pow:3 paddle.fluid.layers.elementwise_sub:3
-#: paddle.fluid.layers.mul:5
-msgid "The equation is:"
-msgstr ""
-
-#: of paddle.fluid.layers.mul:7
-msgid "$$Out = X * Y$$"
-msgstr ""
-
-#: of paddle.fluid.layers.mul:9
-msgid ""
-"Both the input $X$ and $Y$ can carry the LoD (Level of Details) "
-"information, or not. But the output only shares the LoD information with "
-"input $X$."
-msgstr ""
-
-#: of paddle.fluid.layers.mul:13
-msgid ""
-"(Tensor), The first input tensor of mul op. Duplicable: False  Optional: "
-"False"
-msgstr ""
-
-#: of paddle.fluid.layers.mul:15
-msgid ""
-"(Tensor), The second input tensor of mul op. Duplicable: False  Optional:"
-" False"
-msgstr ""
-
-#: of paddle.fluid.layers.mul:17
-msgid ""
-"(int, default 1), The mul_op can take tensors with more than two "
-"dimensions as its inputs. If the input $X$ is a tensor with more than two"
-" dimensions, $X$ will be flattened into a two-dimensional matrix first. "
-"The flattening rule is: the first `num_col_dims` will be flattened to "
-"form the first dimension of the final matrix (the height of the matrix), "
-"and the rest `rank(X) - num_col_dims` dimensions are flattened to form "
-"the second dimension of the final matrix (the width of the matrix). As a "
-"result, height of the flattened matrix is equal to the product of $X$'s "
-"first `x_num_col_dims` dimensions' sizes, and width of the flattened "
-"matrix is equal to the product of $X$'s last `rank(x) - num_col_dims` "
-"dimensions' size. For example, suppose $X$ is a 6-dimensional tensor with"
-" the shape [2, 3, 4, 5, 6], and `x_num_col_dims` = 3. Thus, the flattened"
-" matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]."
-msgstr ""
-
-#: of paddle.fluid.layers.mul:33
-msgid ""
-"(int, default 1), The mul_op can take tensors with more than two, "
-"dimensions as its inputs. If the input $Y$ is a tensor with more than two"
-" dimensions, $Y$ will be flattened into a two-dimensional matrix first. "
-"The attribute `y_num_col_dims` determines how $Y$ is flattened. See "
-"comments of `x_num_col_dims` for more details."
-msgstr ""
-
-#: of paddle.fluid.layers.mul:40
-msgid "(Tensor), The output tensor of mul op."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:920
-msgid "scale"
-msgstr ""
-
-#: of paddle.fluid.layers.scale:1
-msgid "Scale operator"
-msgstr ""
-
-#: of paddle.fluid.layers.scale:3
-msgid "$$Out = scale*X$$"
-msgstr ""
-
-#: of paddle.fluid.layers.scale:5
-msgid ""
-"(Tensor) Input tensor of scale operator. Duplicable: False  Optional: "
-"False"
-msgstr ""
-
-#: of paddle.fluid.layers.scale:7
-msgid "(float, default 1.0)The scaling factor of the scale operator."
-msgstr ""
-
-#: of paddle.fluid.layers.scale:10
-msgid "(Tensor) Output tensor of scale operator."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:928
-msgid "sigmoid_cross_entropy_with_logits"
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid_cross_entropy_with_logits:1
-msgid "SigmoidCrossEntropyWithLogits Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid_cross_entropy_with_logits:3
-msgid ""
-"This measures the element-wise probability error in classification tasks "
-"in which each class is independent. This can be thought of as predicting "
-"labels for a data-point, where labels are not mutually exclusive. For "
-"example, a news article can be about politics, technology or sports at "
-"the same time or none of these."
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid_cross_entropy_with_logits:9
-msgid "The logistic loss is given as follows:"
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid_cross_entropy_with_logits:11
-msgid ""
-"$$loss = -Labels * \\log(\\sigma(X)) - (1 - Labels) * \\log(1 - "
-"\\sigma(X))$$"
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid_cross_entropy_with_logits:13
-msgid ""
-"We know that $$\\sigma(X) = (1 / (1 + \\exp(-X)))$$. By substituting this"
-" we get:"
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid_cross_entropy_with_logits:15
-msgid "$$loss = X - X * Labels + \\log(1 + \\exp(-X))$$"
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid_cross_entropy_with_logits:17
-msgid ""
-"For stability and to prevent overflow of $$\\exp(-X)$$ when X < 0, we "
-"reformulate the loss as follows:"
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid_cross_entropy_with_logits:20
-msgid "$$loss = \\max(X, 0) - X * Labels + \\log(1 + \\exp(-|X|))$$"
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid_cross_entropy_with_logits:22
-msgid ""
-"Both the input `X` and `Labels` can carry the LoD (Level of Details) "
-"information. However the output only shares the LoD with input `X`."
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid_cross_entropy_with_logits:26
-msgid ""
-"(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, where N "
-"is the batch size and D is the number of classes. This input is a tensor "
-"of logits computed by the previous  operator. Logits are unscaled log "
-"probabilities given as log(p/(1-p)). Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid_cross_entropy_with_logits:28
-msgid ""
-"(Tensor, default Tensor<float>), a 2-D tensor of the same type and shape "
-"as X. This input is a tensor of probabalistic labels for each logit "
-"Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid_cross_entropy_with_logits:31
-msgid ""
-"(Tensor, default Tensor<float>), a 2-D tensor with shape N x D  of "
-"elementwise logistic losses."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:936
-msgid "elementwise_add"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:1
-msgid "Limited Elementwise Add Operator"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:5
-msgid "$$Out = X + Y$$"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:7
-#: paddle.fluid.layers.elementwise_div:7 paddle.fluid.layers.elementwise_max:7
-#: paddle.fluid.layers.elementwise_min:7 paddle.fluid.layers.elementwise_mul:7
-#: paddle.fluid.layers.elementwise_pow:7 paddle.fluid.layers.elementwise_sub:7
-msgid "$X$: a tensor of any dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:8
-#: paddle.fluid.layers.elementwise_div:8 paddle.fluid.layers.elementwise_max:8
-#: paddle.fluid.layers.elementwise_min:8 paddle.fluid.layers.elementwise_mul:8
-#: paddle.fluid.layers.elementwise_pow:8 paddle.fluid.layers.elementwise_sub:8
-msgid ""
-"$Y$: a tensor whose dimensions must be less than or equal to the "
-"dimensions of $X$."
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:10
-#: paddle.fluid.layers.elementwise_div:10
-#: paddle.fluid.layers.elementwise_max:10
-#: paddle.fluid.layers.elementwise_min:10
-#: paddle.fluid.layers.elementwise_mul:10
-#: paddle.fluid.layers.elementwise_pow:10
-#: paddle.fluid.layers.elementwise_sub:10
-msgid "There are two cases for this operator:"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:12
-#: paddle.fluid.layers.elementwise_div:12
-#: paddle.fluid.layers.elementwise_max:12
-#: paddle.fluid.layers.elementwise_min:12
-#: paddle.fluid.layers.elementwise_mul:12
-#: paddle.fluid.layers.elementwise_pow:12
-#: paddle.fluid.layers.elementwise_sub:12
-msgid "The shape of $Y$ is the same with $X$."
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:13
-#: paddle.fluid.layers.elementwise_div:13
-#: paddle.fluid.layers.elementwise_max:13
-#: paddle.fluid.layers.elementwise_min:13
-#: paddle.fluid.layers.elementwise_mul:13
-#: paddle.fluid.layers.elementwise_pow:13
-#: paddle.fluid.layers.elementwise_sub:13
-msgid "The shape of $Y$ is a continuous subsequence of $X$."
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:15
-#: paddle.fluid.layers.elementwise_div:15
-#: paddle.fluid.layers.elementwise_max:15
-#: paddle.fluid.layers.elementwise_min:15
-#: paddle.fluid.layers.elementwise_mul:15
-#: paddle.fluid.layers.elementwise_pow:15
-#: paddle.fluid.layers.elementwise_sub:15
-msgid "For case 2:"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:17
-#: paddle.fluid.layers.elementwise_div:17
-#: paddle.fluid.layers.elementwise_max:17
-#: paddle.fluid.layers.elementwise_min:17
-#: paddle.fluid.layers.elementwise_mul:17
-#: paddle.fluid.layers.elementwise_pow:17
-#: paddle.fluid.layers.elementwise_sub:17
-msgid ""
-"Broadcast $Y$ to match the shape of $X$, where $axis$ is the start "
-"dimension index for broadcasting $Y$ onto $X$."
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:19
-#: paddle.fluid.layers.elementwise_div:19
-#: paddle.fluid.layers.elementwise_max:19
-#: paddle.fluid.layers.elementwise_min:19
-#: paddle.fluid.layers.elementwise_mul:19
-#: paddle.fluid.layers.elementwise_pow:19
-#: paddle.fluid.layers.elementwise_sub:19
-msgid "If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$."
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:20
-#: paddle.fluid.layers.elementwise_div:20
-#: paddle.fluid.layers.elementwise_max:20
-#: paddle.fluid.layers.elementwise_min:20
-#: paddle.fluid.layers.elementwise_mul:20
-#: paddle.fluid.layers.elementwise_pow:20
-#: paddle.fluid.layers.elementwise_sub:20
-msgid ""
-"The trailing dimensions of size 1 for $Y$ will be ignored for the "
-"consideration of subsequence, such as shape(Y) = (2, 1) => (2)."
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:34
-#: paddle.fluid.layers.elementwise_div:34
-#: paddle.fluid.layers.elementwise_max:34
-#: paddle.fluid.layers.elementwise_min:34
-#: paddle.fluid.layers.elementwise_mul:34
-#: paddle.fluid.layers.elementwise_pow:34
-#: paddle.fluid.layers.elementwise_sub:34
-msgid ""
-"The inputs $X$ and $Y$ can carry the different LoD information. But the "
-"output only shares the LoD information with the input $X$."
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:38
-#: paddle.fluid.layers.elementwise_div:38
-#: paddle.fluid.layers.elementwise_max:38
-#: paddle.fluid.layers.elementwise_min:38
-#: paddle.fluid.layers.elementwise_mul:38
-#: paddle.fluid.layers.elementwise_pow:38
-#: paddle.fluid.layers.elementwise_sub:38
-msgid ""
-"(Tensor), The first input tensor of elementwise op. Duplicable: False  "
-"Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:40
-#: paddle.fluid.layers.elementwise_div:40
-#: paddle.fluid.layers.elementwise_max:40
-#: paddle.fluid.layers.elementwise_min:40
-#: paddle.fluid.layers.elementwise_mul:40
-#: paddle.fluid.layers.elementwise_pow:40
-#: paddle.fluid.layers.elementwise_sub:40
-msgid ""
-"(Tensor), The second input tensor of elementwise op. Duplicable: False  "
-"Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:42
-#: paddle.fluid.layers.elementwise_div:42
-#: paddle.fluid.layers.elementwise_max:42
-#: paddle.fluid.layers.elementwise_min:42
-#: paddle.fluid.layers.elementwise_mul:42
-#: paddle.fluid.layers.elementwise_pow:42
-#: paddle.fluid.layers.elementwise_sub:42
-msgid "(int, default -1). The start dimension index for broadcasting Y onto X."
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_add:45
-#: paddle.fluid.layers.elementwise_div:45
-#: paddle.fluid.layers.elementwise_max:45
-#: paddle.fluid.layers.elementwise_min:45
-#: paddle.fluid.layers.elementwise_mul:45
-#: paddle.fluid.layers.elementwise_pow:45
-#: paddle.fluid.layers.elementwise_sub:45
-msgid "The output of elementwise op."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:944
-msgid "elementwise_div"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_div:1
-msgid "Limited Elementwise Div Operator"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_div:5
-msgid "$$Out = X / Y$$"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:952
-msgid "elementwise_sub"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_sub:1
-msgid "Limited Elementwise Sub Operator"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_sub:5
-msgid "$$Out = X - Y$$"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:960
-msgid "elementwise_mul"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_mul:1
-msgid "Limited Elementwise Mul Operator"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_mul:5
-msgid "$$Out = X \\odot\\ Y$$"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:968
-msgid "elementwise_max"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_max:1
-msgid "Limited Elementwise Max Operator"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_max:5
-msgid "$$Out = max(X, Y)$$"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:976
-msgid "elementwise_min"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_min:1
-msgid "Limited Elementwise Min Operator"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_min:5
-msgid "$$Out = min(X, Y)$$"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:984
-msgid "elementwise_pow"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_pow:1
-msgid "Limited Elementwise Pow Operator"
-msgstr ""
-
-#: of paddle.fluid.layers.elementwise_pow:5
-msgid "$$Out = X ^ Y$$"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:992
-msgid "clip"
-msgstr ""
-
-#: of paddle.fluid.layers.clip:1
-msgid "Clip Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.clip:3
-msgid ""
-"The clip operator limits the value of given input within an interval. The"
-" interval is specified with arguments 'min' and 'max':"
-msgstr ""
-
-#: of paddle.fluid.layers.clip:6
-msgid "$$ Out = \\min(\\max(X, min), max) $$"
-msgstr ""
-
-#: of paddle.fluid.layers.clip:11
-msgid ""
-"(Tensor)The input of clip op.The number of dimensions must be between [1,"
-" 9]. Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.clip:13
-msgid "(float)Minimum value, under which element is replaced by min."
-msgstr ""
-
-#: of paddle.fluid.layers.clip:15
-msgid "(float)Maximum value, above which element is replaced by max"
-msgstr ""
-
-#: of paddle.fluid.layers.clip:18
-msgid "(Tensor)The output of clip op with shape as input(X)"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1000
-msgid "clip_by_norm"
-msgstr ""
-
-#: of paddle.fluid.layers.clip_by_norm:1
-msgid "ClipByNorm Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.clip_by_norm:3
-msgid ""
-"This operator limits the L2 norm of the input $X$ within $max\\_norm$. If"
-" the L2 norm of $X$ is less than or equal to $max\\_norm$, $Out$ will be "
-"the same as $X$. If the L2 norm of $X$ is greater than $max\\_norm$, $X$ "
-"will be linearly scaled to make the L2 norm of $Out$ equal to "
-"$max\\_norm$, as shown in the following formula:"
-msgstr ""
-
-#: of paddle.fluid.layers.clip_by_norm:9
-msgid "$$ Out = \\frac{max\\_norm * X}{norm(X)}, $$"
-msgstr ""
-
-#: of paddle.fluid.layers.clip_by_norm:13
-msgid "where $norm(X)$ represents the L2 norm of $X$."
-msgstr ""
-
-#: of paddle.fluid.layers.clip_by_norm:15
-msgid ""
-"(Tensor) The input of clip_by_norm op.The number of dimensions must be "
-"between [1, 9]. Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.clip_by_norm:17
-msgid "(float) The maximum norm value."
-msgstr ""
-
-#: of paddle.fluid.layers.clip_by_norm:20
-msgid "(Tensor) The output of clip_by_norm op with shape as input(X)"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1008
-msgid "logical_and"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_and:1
-msgid "logical_and Operator"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_and:3
-msgid ""
-"It operates element-wise on X and Y, and returns the Out. X, Y and Out "
-"are N-dim boolean tensors. Each element of Out is calculated by $$Out = X"
-" \\&\\& Y$$"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_and:6
-msgid ""
-"(LoDTensor) Left hand operand of logical_and operator Duplicable: False  "
-"Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_and:8
-msgid ""
-"(LoDTensor) Right hand operand of logical_and operator Duplicable: False"
-"  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_and:11
-msgid "(LoDTensor) n-dim bool tensor. Each element is $$Out = X \\&\\& Y$$"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1016
-msgid "logical_or"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_or:1
-msgid "logical_or Operator"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_or:3
-msgid ""
-"It operates element-wise on X and Y, and returns the Out. X, Y and Out "
-"are N-dim boolean tensors. Each element of Out is calculated by $$Out = X"
-" || Y$$"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_or:6
-msgid ""
-"(LoDTensor) Left hand operand of logical_or operator Duplicable: False  "
-"Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_or:8
-msgid ""
-"(LoDTensor) Right hand operand of logical_or operator Duplicable: False  "
-"Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_or:11
-msgid "(LoDTensor) n-dim bool tensor. Each element is $$Out = X || Y$$"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1024
-msgid "logical_xor"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_xor:1
-msgid "logical_xor Operator"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_xor:3
-msgid ""
-"It operates element-wise on X and Y, and returns the Out. X, Y and Out "
-"are N-dim boolean tensors. Each element of Out is calculated by $$Out = "
-"(X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_xor:6
-msgid ""
-"(LoDTensor) Left hand operand of logical_xor operator Duplicable: False  "
-"Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_xor:8
-msgid ""
-"(LoDTensor) Right hand operand of logical_xor operator Duplicable: False"
-"  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_xor:11
-msgid ""
-"(LoDTensor) n-dim bool tensor. Each element is $$Out = (X || Y) \\, "
-"\\&\\& \\, !(X \\&\\& Y)$$"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1032
-msgid "logical_not"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_not:1
-msgid "logical_not Operator"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_not:3
-msgid ""
-"It operates element-wise on X, and returns the Out. X and Out are N-dim "
-"boolean tensors. Each element of Out is calculated by $$Out = !X$$"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_not:6
-msgid ""
-"(LoDTensor) Operand of logical_not operator Duplicable: False  Optional: "
-"False"
-msgstr ""
-
-#: of paddle.fluid.layers.logical_not:9
-msgid "(LoDTensor) n-dim bool tensor. Each element is $$Out = !X$$"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1040
-msgid "uniform_random_batch_size_like"
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random_batch_size_like:1
-msgid "Uniform random operator"
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random_batch_size_like:5
-msgid ""
-"This operator initializes a tensor with the same batch_size as the Input "
-"tensor"
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random_batch_size_like:4
-msgid "with random values sampled from a uniform distribution."
-msgstr ""
-
-#: of paddle.fluid.layers.gaussian_random_batch_size_like:5
-#: paddle.fluid.layers.uniform_random_batch_size_like:7
-msgid ""
-"Tensor whose input_dim_idx'th dimension specifies the batch_size "
-"Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.gaussian_random_batch_size_like:7
-#: paddle.fluid.layers.uniform_random_batch_size_like:9
-msgid "The shape of the output"
-msgstr ""
-
-#: of paddle.fluid.layers.gaussian_random_batch_size_like:9
-#: paddle.fluid.layers.uniform_random_batch_size_like:11
-msgid "default 0. The index of input's batch size dimension"
-msgstr ""
-
-#: of paddle.fluid.layers.gaussian_random_batch_size_like:11
-#: paddle.fluid.layers.uniform_random_batch_size_like:13
-msgid "default 0. The index of output's batch size dimension"
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random_batch_size_like:15
-msgid "(float, default -1.0) Minimum value of uniform random"
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random_batch_size_like:17
-msgid "(float, default 1.0) Maximun value of uniform random"
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random_batch_size_like:19
-msgid ""
-"(int, default 0) Random seed used for generating samples. 0 means use a "
-"seed generated by the system.Note that if seed is not 0, this operator "
-"will always generate the same random numbers every time."
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random_batch_size_like:21
-msgid "(int, default 5(FP32)) Output tensor data type"
-msgstr ""
-
-#: of paddle.fluid.layers.gaussian_random_batch_size_like:22
-#: paddle.fluid.layers.uniform_random_batch_size_like:24
-msgid "Tensor of specified shape will be filled with the specified value"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1048
-msgid "gaussian_random"
-msgstr ""
-
-#: of paddle.fluid.layers.gaussian_random:1
-#: paddle.fluid.layers.gaussian_random_batch_size_like:1
-msgid "GaussianRandom Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.gaussian_random:3
-#: paddle.fluid.layers.gaussian_random_batch_size_like:3
-msgid "Used to initialize tensors with gaussian random generator."
-msgstr ""
-
-#: of paddle.fluid.layers.gaussian_random:6
-msgid "(vector<int>) The dimension of random tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.gaussian_random:8
-#: paddle.fluid.layers.gaussian_random_batch_size_like:13
-msgid "(float, default 0.0) mean of random tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.gaussian_random:10
-#: paddle.fluid.layers.gaussian_random_batch_size_like:15
-msgid "(float, default 1.0) std of random tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.gaussian_random:12
-#: paddle.fluid.layers.gaussian_random_batch_size_like:17
-msgid ""
-"(int, default 0) Random seed of generator.0 means use system wide "
-"seed.Note that if seed is not 0, this operator will always generate the "
-"same random numbers every time."
-msgstr ""
-
-#: of paddle.fluid.layers.gaussian_random:14
-#: paddle.fluid.layers.gaussian_random_batch_size_like:19
-msgid "(int, default 5(FP32)) Output data type."
-msgstr ""
-
-#: of paddle.fluid.layers.gaussian_random:17
-msgid "Output matrix of gaussian random op"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1056
-msgid "gaussian_random_batch_size_like"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1064
-msgid "scatter"
-msgstr ""
-
-#: of paddle.fluid.layers.scatter:1
-msgid "Scatter Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.scatter:3
-msgid ""
-"This operator obtains output by updating the input on selected indices on"
-" the first axis:"
-msgstr ""
-
-#: of paddle.fluid.layers.scatter:5
-msgid "$$ Out = X \\\\ Out[Ids] = X[Ids] + Updates $$"
-msgstr ""
-
-#: of paddle.fluid.layers.scatter:11
-msgid "The source input of scatter op Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.scatter:13
-msgid ""
-"The index input of scatter op where X will be updated Duplicable: False  "
-"Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.scatter:15
-msgid "The updated value of updates op Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.scatter:18
-msgid "The output of add op"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1072
-msgid "sum"
-msgstr ""
-
-#: of paddle.fluid.layers.sum:1
-msgid "Sum operator."
-msgstr ""
-
-#: of paddle.fluid.layers.sum:3
-msgid ""
-"This operators sums the input tensors. All the inputs can carry the LoD "
-"(Level of Details) information. However, the output only shares the LoD "
-"information with the first input."
-msgstr ""
-
-#: of paddle.fluid.layers.sum:7
-msgid ""
-"(vector<Tensor>) The input tensors of sum operator. Duplicable: True  "
-"Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.sum:10
-msgid "(Tensor) The output tensor of sum operator."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1080
-msgid "slice"
-msgstr ""
-
-#: of paddle.fluid.layers.slice:1
-msgid "Slice Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.slice:3
-msgid ""
-"Produces a slice of the input tensor along multiple axes. Similar to "
-"numpy: https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html "
-"Slice uses `axes`, `starts` and `ends` attributes to specify the start "
-"and end dimension for each axis in the list of axes, it uses this "
-"information to slice the input data tensor. If a negative value is passed"
-" for any of the start or end indices, it represents number of elements "
-"before the end of that dimension. If the value passed to start or end is "
-"larger than the n (the number of elements in this dimension), it "
-"represents n. For slicing to the end of a dimension with unknown size, it"
-" is recommended to pass in INT_MAX. If axes are omitted, they are set to "
-"[0, ..., ndim-1]."
-msgstr ""
-
-#: of paddle.fluid.layers.slice:14
-msgid "Example 1: Given:"
-msgstr ""
-
-#: of paddle.fluid.layers.slice:16
-msgid ""
-"data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] axes = [0, 1] starts = [1, 0] ends"
-" = [2, 3]"
-msgstr ""
-
-#: of paddle.fluid.layers.slice:21 paddle.fluid.layers.slice:29
-msgid "Then:"
-msgstr ""
-
-#: of paddle.fluid.layers.slice:21
-msgid "result = [ [5, 6, 7], ]"
-msgstr ""
-
-#: of paddle.fluid.layers.slice:23
-msgid "Example 2: Given:"
-msgstr ""
-
-#: of paddle.fluid.layers.slice:25
-msgid "data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] starts = [0, 1] ends = [-1, 1000]"
-msgstr ""
-
-#: of paddle.fluid.layers.slice:29
-msgid "result = [ [2, 3, 4], ]"
-msgstr ""
-
-#: of paddle.fluid.layers.slice:31
-msgid "Tensor of data to extract slices from. Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.slice:33
-msgid ""
-"(list<int>) Axes that `starts` and `ends` apply to. It's optional.If not "
-"present, will be treated as [0, 1, ..., len(`starts`) - 1]."
-msgstr ""
-
-#: of paddle.fluid.layers.slice:35
-msgid "(list<int>) Starting indices of corresponding axis in `axes`"
-msgstr ""
-
-#: of paddle.fluid.layers.slice:37
-msgid "(list<int>) Starting indices of corresponding axis in `axes`."
-msgstr ""
-
-#: of paddle.fluid.layers.slice:40
-msgid "Sliced data tensor."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1088
-msgid "polygon_box_transform"
-msgstr ""
-
-#: of paddle.fluid.layers.polygon_box_transform:1
-msgid ""
-"PolygonBoxTransform Operator. The input is the final geometry output in "
-"detection network. We use 2*n numbers to denote the coordinate shift from"
-" n corner vertices of the polygon_box to the pixel location. As each "
-"distance offset contains two numbers (xi, yi), the geometry output "
-"contains 2*n channels. PolygonBoxTransform Operator is used to transform "
-"the coordinate shift to the real coordinate."
-msgstr ""
-
-#: of paddle.fluid.layers.polygon_box_transform:8
-msgid ""
-"The input with shape [batch_size, geometry_channels, height, width] "
-"Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.polygon_box_transform:11
-msgid "The output with the same shape as input"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1096
-msgid "shape"
-msgstr ""
-
-#: of paddle.fluid.layers.shape:1
-msgid "Shape Operator. Get the shape of input tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.shape:4
-msgid "(Tensor), The input tensor. Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.shape:7
-msgid "(Tensor), The shape of input tensor."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1104
-msgid "maxout"
-msgstr ""
-
-#: of paddle.fluid.layers.maxout:1
-msgid "MaxOut Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.maxout:3
-msgid ""
-"Assumed the input shape is (N, Ci, H, W). The output shape is (N, Co, H, "
-"W). Then $Co = Ci / groups$ and the operator formula is as follows:"
-msgstr ""
-
-#: of paddle.fluid.layers.maxout:7
-msgid ""
-"$$ y_{si+j} = \\max_k x_{gsi + sk + j} \\\\ g = groups \\\\ s = "
-"\\frac{input.size}{num\\_channels} \\\\ 0 \\le i < "
-"\\frac{num\\_channels}{groups} \\\\ 0 \\le j < s \\\\ 0 \\le k < groups "
-"$$"
-msgstr ""
-
-#: of paddle.fluid.layers.maxout:21
-msgid "Please refer to Paper:"
-msgstr ""
-
-#: of paddle.fluid.layers.maxout:17
-msgid ""
-"Maxout Networks: "
-"http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf"
-msgstr ""
-
-#: of paddle.fluid.layers.maxout:18
-msgid ""
-"Multi-digit Number Recognition from Street View \\ Imagery using Deep "
-"Convolutional Neural Networks: \\ https://arxiv.org/pdf/1312.6082v4.pdf"
-msgstr ""
-
-#: of paddle.fluid.layers.maxout:23
-msgid ""
-"(Tensor) The input tensor of maxout operator. The format of input tensor "
-"is NCHW. Where N is batch size, C is the number of channels, H and W is "
-"the height and width of feature. Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.maxout:25
-msgid ""
-"\"Specifies how many groups the input tensor will be split\" \"in the "
-"channel dimension. And the number of output channel is \" \"the number of"
-" channels divided by groups..\""
-msgstr ""
-
-#: of paddle.fluid.layers.maxout:30
-msgid ""
-"(Tensor) The output tensor of maxout operator.The format of output tensor"
-" is also NCHW.Where N is batch size, C is the number of channels, H and W"
-" is the height and width of feature."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1112
-msgid "sigmoid"
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid:1
-msgid "Sigmoid Activation Operator"
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid:3
-msgid "$$out = \\frac{1}{1 + e^{-x}}$$"
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid:6
-msgid "Input of Sigmoid operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.abs:8 paddle.fluid.layers.ceil:8
-#: paddle.fluid.layers.cos:8 paddle.fluid.layers.exp:8
-#: paddle.fluid.layers.floor:8 paddle.fluid.layers.log:10
-#: paddle.fluid.layers.logsigmoid:8 paddle.fluid.layers.reciprocal:8
-#: paddle.fluid.layers.relu:8 paddle.fluid.layers.round:8
-#: paddle.fluid.layers.sigmoid:8 paddle.fluid.layers.sin:8
-#: paddle.fluid.layers.softplus:8 paddle.fluid.layers.softsign:8
-#: paddle.fluid.layers.sqrt:8 paddle.fluid.layers.square:8
-#: paddle.fluid.layers.tanh:8 paddle.fluid.layers.tanh_shrink:8
-msgid "(bool, default false) Only used in mkldnn kernel"
-msgstr ""
-
-#: of paddle.fluid.layers.sigmoid:11
-msgid "Output of Sigmoid operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1120
-msgid "logsigmoid"
-msgstr ""
-
-#: of paddle.fluid.layers.logsigmoid:1
-msgid "Logsigmoid Activation Operator"
-msgstr ""
-
-#: of paddle.fluid.layers.logsigmoid:3
-msgid "$$out = \\log \\frac{1}{1 + e^{-x}}$$"
-msgstr ""
-
-#: of paddle.fluid.layers.logsigmoid:6
-msgid "Input of LogSigmoid operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.logsigmoid:11
-msgid "Output of LogSigmoid operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1128
-msgid "exp"
-msgstr ""
-
-#: of paddle.fluid.layers.exp:1
-msgid "Exp Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.exp:3
-msgid "$out = e^x$"
-msgstr ""
-
-#: of paddle.fluid.layers.exp:6
-msgid "Input of Exp operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.exp:11
-msgid "Output of Exp operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1136
-msgid "relu"
-msgstr ""
-
-#: of paddle.fluid.layers.relu:1
-msgid "Relu Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.relu:3
-msgid "$out = \\max(x, 0)$"
-msgstr ""
-
-#: of paddle.fluid.layers.relu:6
-msgid "Input of Relu operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.relu:11
-msgid "Output of Relu operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1144
-msgid "tanh"
-msgstr ""
-
-#: of paddle.fluid.layers.tanh:1
-msgid "Tanh Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.tanh:3
-msgid "$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$"
-msgstr ""
-
-#: of paddle.fluid.layers.tanh:6
-msgid "Input of Tanh operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.tanh:11
-msgid "Output of Tanh operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1152
-msgid "tanh_shrink"
-msgstr ""
-
-#: of paddle.fluid.layers.tanh_shrink:1
-msgid "TanhShrink Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.tanh_shrink:3
-msgid "$$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$"
-msgstr ""
-
-#: of paddle.fluid.layers.tanh_shrink:6
-msgid "Input of TanhShrink operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.tanh_shrink:11
-msgid "Output of TanhShrink operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1160
-msgid "softshrink"
-msgstr ""
-
-#: of paddle.fluid.layers.softshrink:1
-msgid ":strong:`Softshrink Activation Operator`"
-msgstr ""
-
-#: of paddle.fluid.layers.softshrink:11
-msgid "Input of Softshrink operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.softshrink:13
-msgid "non-negative offset"
-msgstr ""
-
-#: of paddle.fluid.layers.softshrink:16
-msgid "Output of Softshrink operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1168
-msgid "sqrt"
-msgstr ""
-
-#: of paddle.fluid.layers.sqrt:1
-msgid "Sqrt Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.sqrt:3
-msgid "$out = \\sqrt{x}$"
-msgstr ""
-
-#: of paddle.fluid.layers.sqrt:6
-msgid "Input of Sqrt operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.sqrt:11
-msgid "Output of Sqrt operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1176
-msgid "abs"
-msgstr ""
-
-#: of paddle.fluid.layers.abs:1
-msgid "Abs Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.abs:3
-msgid "$out = |x|$"
-msgstr ""
-
-#: of paddle.fluid.layers.abs:6
-msgid "Input of Abs operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.abs:11
-msgid "Output of Abs operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1184
-msgid "ceil"
-msgstr ""
-
-#: of paddle.fluid.layers.ceil:1
-msgid "Ceil Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.ceil:3
-msgid "$out = ceil(x)$"
-msgstr ""
-
-#: of paddle.fluid.layers.ceil:6
-msgid "Input of Ceil operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.ceil:11
-msgid "Output of Ceil operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1192
-msgid "floor"
-msgstr ""
-
-#: of paddle.fluid.layers.floor:1
-msgid "Floor Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.floor:3
-msgid "$out = floor(x)$"
-msgstr ""
-
-#: of paddle.fluid.layers.floor:6
-msgid "Input of Floor operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.floor:11
-msgid "Output of Floor operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1200
-msgid "cos"
-msgstr ""
-
-#: of paddle.fluid.layers.cos:1
-msgid "Cosine Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.cos:3
-msgid "$out = cos(x)$"
-msgstr ""
-
-#: of paddle.fluid.layers.cos:6
-msgid "Input of Cos operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.cos:11
-msgid "Output of Cos operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1208
-msgid "sin"
-msgstr ""
-
-#: of paddle.fluid.layers.sin:1
-msgid "Sine Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.sin:3
-msgid "$out = sin(x)$"
-msgstr ""
-
-#: of paddle.fluid.layers.sin:6
-msgid "Input of Sin operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.sin:11
-msgid "Output of Sin operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1216
-msgid "round"
-msgstr ""
-
-#: of paddle.fluid.layers.round:1
-msgid "Round Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.round:3
-msgid "$out = [x]$"
-msgstr ""
-
-#: of paddle.fluid.layers.round:6
-msgid "Input of Round operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.round:11
-msgid "Output of Round operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1224
-msgid "reciprocal"
-msgstr ""
-
-#: of paddle.fluid.layers.reciprocal:1
-msgid "Reciprocal Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.reciprocal:3
-msgid "$$out = \\frac{1}{x}$$"
-msgstr ""
-
-#: of paddle.fluid.layers.reciprocal:6
-msgid "Input of Reciprocal operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.reciprocal:11
-msgid "Output of Reciprocal operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1232
-msgid "log"
-msgstr ""
-
-#: of paddle.fluid.layers.log:1
-msgid "Log Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.log:3
-msgid "$out = \\ln(x)$"
-msgstr ""
-
-#: of paddle.fluid.layers.log:5
-msgid "Natural logarithm of x."
-msgstr ""
-
-#: of paddle.fluid.layers.log:8
-msgid "Input of Log operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.log:13
-msgid "Output of Log operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1240
-msgid "square"
-msgstr ""
-
-#: of paddle.fluid.layers.square:1
-msgid "Square Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.square:3
-msgid "$out = x^2$"
-msgstr ""
-
-#: of paddle.fluid.layers.square:6
-msgid "Input of Square operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.square:11
-msgid "Output of Square operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1248
-msgid "softplus"
-msgstr ""
-
-#: of paddle.fluid.layers.softplus:1
-msgid "Softplus Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.softplus:3
-msgid "$out = \\ln(1 + e^{x})$"
-msgstr ""
-
-#: of paddle.fluid.layers.softplus:6
-msgid "Input of Softplus operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.softplus:11
-msgid "Output of Softplus operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1256
-msgid "softsign"
-msgstr ""
-
-#: of paddle.fluid.layers.softsign:1
-msgid "Softsign Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.softsign:3
-msgid "$$out = \\frac{x}{1 + |x|}$$"
-msgstr ""
-
-#: of paddle.fluid.layers.softsign:6
-msgid "Input of Softsign operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.softsign:11
-msgid "Output of Softsign operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1264
-msgid "brelu"
-msgstr ""
-
-#: of paddle.fluid.layers.brelu:1
-msgid "BRelu Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.brelu:3
-msgid "$out = \\max(\\min(x, t_{min}), t_{max})$"
-msgstr ""
-
-#: of paddle.fluid.layers.brelu:6
-msgid "Input of BRelu operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.brelu:8
-msgid "The min marginal value of BRelu"
-msgstr ""
-
-#: of paddle.fluid.layers.brelu:10
-msgid "The max marginal value of BRelu"
-msgstr ""
-
-#: of paddle.fluid.layers.brelu:13
-msgid "Output of BRelu operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1272
-msgid "leaky_relu"
-msgstr ""
-
-#: of paddle.fluid.layers.leaky_relu:1
-msgid "LeakyRelu Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.leaky_relu:3
-msgid "$out = \\max(x, \\alpha * x)$"
-msgstr ""
-
-#: of paddle.fluid.layers.leaky_relu:6
-msgid "Input of LeakyRelu operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.leaky_relu:8
-msgid "The small negative slope"
-msgstr ""
-
-#: of paddle.fluid.layers.leaky_relu:11
-msgid "Output of LeakyRelu operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1280
-msgid "soft_relu"
-msgstr ""
-
-#: of paddle.fluid.layers.soft_relu:1
-msgid "SoftRelu Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.soft_relu:3
-msgid "$out = \\ln(1 + \\exp(\\max(\\min(x, threshold), threshold))$"
-msgstr ""
-
-#: of paddle.fluid.layers.soft_relu:6
-msgid "Input of SoftRelu operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.soft_relu:8
-msgid "The threshold value of SoftRelu"
-msgstr ""
-
-#: of paddle.fluid.layers.soft_relu:11
-msgid "Output of SoftRelu operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1288
-msgid "elu"
-msgstr ""
-
-#: of paddle.fluid.layers.elu:1
-msgid "ELU Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.elu:3
-msgid ""
-"Applies the following element-wise computation on the input according to "
-"https://arxiv.org/abs/1511.07289."
-msgstr ""
-
-#: of paddle.fluid.layers.elu:6
-msgid "$out = \\max(0, x) + \\min(0, \\alpha * (e^x - 1))$"
-msgstr ""
-
-#: of paddle.fluid.layers.elu:9
-msgid "Input of ELU operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.elu:11
-msgid "The alpha value of ELU"
-msgstr ""
-
-#: of paddle.fluid.layers.elu:14
-msgid "Output of ELU operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1296
-msgid "relu6"
-msgstr ""
-
-#: of paddle.fluid.layers.relu6:1
-msgid "Relu6 Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.relu6:3
-msgid "$out = \\min(\\max(0, x), 6)$"
-msgstr ""
-
-#: of paddle.fluid.layers.relu6:6
-msgid "Input of Relu6 operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.relu6:8
-msgid "The threshold value of Relu6"
-msgstr ""
-
-#: of paddle.fluid.layers.relu6:11
-msgid "Output of Relu6 operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1304
-msgid "pow"
-msgstr ""
-
-#: of paddle.fluid.layers.pow:1
-msgid "Pow Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.pow:3
-msgid "$out = x^{factor}$"
-msgstr ""
-
-#: of paddle.fluid.layers.pow:6
-msgid "Input of Pow operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.pow:8
-msgid "The exponential factor of Pow"
-msgstr ""
-
-#: of paddle.fluid.layers.pow:11
-msgid "Output of Pow operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1312
-msgid "stanh"
-msgstr ""
-
-#: of paddle.fluid.layers.stanh:1
-msgid "STanh Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.stanh:3
-msgid "$$out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$"
-msgstr ""
-
-#: of paddle.fluid.layers.stanh:6
-msgid "Input of STanh operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.stanh:8
-msgid "The scale parameter of a for the input"
-msgstr ""
-
-#: of paddle.fluid.layers.stanh:10
-msgid "The scale parameter of b for the input"
-msgstr ""
-
-#: of paddle.fluid.layers.stanh:13
-msgid "Output of STanh operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1320
-msgid "hard_sigmoid"
-msgstr ""
-
-#: of paddle.fluid.layers.hard_sigmoid:1
-msgid "HardSigmoid Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.hard_sigmoid:3
-msgid ""
-"Segment-wise linear approximation of "
-"sigmoid(https://arxiv.org/abs/1603.00391), which is much faster than "
-"sigmoid."
-msgstr ""
-
-#: of paddle.fluid.layers.hard_sigmoid:6
-msgid "$out = \\max(0, \\min(1, slope * x + shift))$"
-msgstr ""
-
-#: of paddle.fluid.layers.hard_sigmoid:8
-msgid ""
-"The slope should be positive. The offset can be either positive or "
-"negative. The default slope and shift are set according to the above "
-"reference. It is recommended to use the defaults for this activation."
-msgstr ""
-
-#: of paddle.fluid.layers.hard_sigmoid:13
-msgid "Input of HardSigmoid operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.hard_sigmoid:15
-msgid "Slope for linear approximation of sigmoid"
-msgstr ""
-
-#: of paddle.fluid.layers.hard_sigmoid:17
-msgid "Offset for linear approximation of sigmoid"
-msgstr ""
-
-#: of paddle.fluid.layers.hard_sigmoid:20
-msgid "Output of HardSigmoid operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1328
-msgid "swish"
-msgstr ""
-
-#: of paddle.fluid.layers.swish:1
-msgid "Swish Activation Operator."
-msgstr ""
-
-#: of paddle.fluid.layers.swish:3
-msgid "$$out = \\frac{x}{1 + e^{- \\beta x}}$$"
-msgstr ""
-
-#: of paddle.fluid.layers.swish:6
-msgid "Input of Swish operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.swish:8
-msgid "Constant beta of swish operator"
-msgstr ""
-
-#: of paddle.fluid.layers.swish:11
-msgid "Output of Swish operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1336
-msgid "uniform_random"
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random:1
-msgid ""
-"This operator initializes a tensor with random values sampled from a "
-"uniform distribution. The random result is in set [min, max]."
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random:5
-msgid "The shape of the output tensor"
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random:7
-msgid "Minimum value of uniform random. [default -1.0]."
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random:9
-msgid "Maximun value of uniform random. [default 1.0]."
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random:11
-msgid ""
-"Random seed used for generating samples. 0 means use a seed generated by "
-"the system.Note that if seed is not 0, this operator will always generate"
-" the same random numbers every time. [default 0]."
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random:13
-msgid "Output tensor data type. [default 5(FP32)]."
-msgstr ""
-
-#: of paddle.fluid.layers.uniform_random:16
-msgid "The output tensor of uniform random op"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1344
-msgid "hard_shrink"
-msgstr ""
-
-#: of paddle.fluid.layers.hard_shrink:1
-msgid ":strong:`HardShrink activation operator`"
-msgstr ""
-
-#: of paddle.fluid.layers.hard_shrink:11
-msgid "Input of HardShrink operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.hard_shrink:13
-msgid "The value of threshold for HardShrink. [default: 0.5]"
-msgstr ""
-
-#: of paddle.fluid.layers.hard_shrink:16
-msgid "Output of HardShrink operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1352
-msgid "cumsum"
-msgstr ""
-
-#: of paddle.fluid.layers.cumsum:1
-msgid ""
-"The cumulative sum of the elements along a given axis. By default, the "
-"first element of the result is the same of the first element of the "
-"input. If exlusive is true, the first element of the result is 0."
-msgstr ""
-
-#: of paddle.fluid.layers.cumsum:5
-msgid "Input of cumsum operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.cumsum:7
-msgid ""
-"The dimenstion to accumulate along. -1 means the last dimenstion [default"
-" -1]."
-msgstr ""
-
-#: of paddle.fluid.layers.cumsum:9
-msgid "Whether to perform exclusive cumsum. [default false]."
-msgstr ""
-
-#: of paddle.fluid.layers.cumsum:11
-msgid ""
-"If true, the cumsum is performed in the reversed direction. [default "
-"false]."
-msgstr ""
-
-#: of paddle.fluid.layers.cumsum:14
-msgid "Output of cumsum operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1360
-msgid "thresholded_relu"
-msgstr ""
-
-#: of paddle.fluid.layers.thresholded_relu:1
-msgid ":strong:`ThresholdedRelu activation operator`"
-msgstr ""
-
-#: of paddle.fluid.layers.thresholded_relu:10
-msgid "Input of ThresholdedRelu operator Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.thresholded_relu:12
-msgid "The threshold location of activation. [default 1.0]."
-msgstr ""
-
-#: of paddle.fluid.layers.thresholded_relu:15
-msgid "Output of ThresholdedRelu operator"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1366
-#: ../../source/api_reference/layers.rst:1618
-msgid "tensor"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1371
-#: ../../source/api_reference/layers.rst:1623
-msgid "create_tensor"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1379
-#: ../../source/api_reference/layers.rst:1631
-msgid "create_parameter"
-msgstr ""
-
-#: of paddle.fluid.layers.create_parameter:1
-msgid ""
-"Create a parameter. The parameter is a learnable variable, which can have"
-" gradient, and can be optimized."
-msgstr ""
-
-#: of paddle.fluid.layers.create_parameter:4
-msgid ""
-"NOTE: this is a very low-level API. This API is useful when you create "
-"operator by your self. instead of using layers."
-msgstr ""
-
-#: of paddle.fluid.layers.create_parameter:7
-msgid "shape of the parameter"
-msgstr ""
-
-#: of paddle.fluid.layers.create_parameter:9
-msgid "element type of the parameter"
-msgstr ""
-
-#: of paddle.fluid.layers.create_parameter:11
-msgid "attributes of the parameter"
-msgstr ""
-
-#: of paddle.fluid.layers.create_parameter:13
-msgid ""
-"This can affect which default initializer is chosen when "
-"default_initializer is None. If is_bias, initializer.Constant(0.0) will "
-"be used. Otherwise, Xavier() will be used."
-msgstr ""
-
-#: of paddle.fluid.layers.create_parameter:18
-msgid "initializer for the parameter"
-msgstr ""
-
-#: of paddle.fluid.layers.create_parameter:21
-msgid "the created parameter."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1387
-#: ../../source/api_reference/layers.rst:1639
-msgid "create_global_var"
-msgstr ""
-
-#: of paddle.fluid.layers.create_global_var:1
-msgid ""
-"Create a global variable. such as global_step :param shape: shape of the "
-"variable :type shape: list[int] :param value: the value of the variable "
-":type value: float :param dtype: element type of the parameter :type "
-"dtype: string :param persistable: if this variable is persistable :type "
-"persistable: bool :param force_cpu: force this variable to be on CPU "
-":type force_cpu: bool"
-msgstr ""
-
-#: of paddle.fluid.layers.create_global_var:13
-msgid "the created Variable"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1395
-#: ../../source/api_reference/layers.rst:1647
-msgid "cast"
-msgstr ""
-
-#: of paddle.fluid.layers.cast:1
-msgid ""
-"This function takes in the input with input_dtype and casts it to the "
-"output_dtype as the output."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1403
-#: ../../source/api_reference/layers.rst:1655
-msgid "concat"
-msgstr ""
-
-#: of paddle.fluid.layers.concat:1
-msgid "**Concat**"
-msgstr ""
-
-#: of paddle.fluid.layers.concat:3
-msgid ""
-"This function concatenates the input along the axis mentioned and returns"
-" that as the output."
-msgstr ""
-
-#: of paddle.fluid.layers.concat:6
-msgid "List of tensors to be concatenated"
-msgstr ""
-
-#: of paddle.fluid.layers.concat:8
-msgid "Integer axis along which the tensors will be concatenated"
-msgstr ""
-
-#: of paddle.fluid.layers.concat:14
-msgid "Output variable of the concatenation"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1411
-#: ../../source/api_reference/layers.rst:1663
-msgid "sums"
-msgstr ""
-
-#: of paddle.fluid.layers.sums:1
-msgid ""
-"This function performs the sum operation on the input and returns the "
-"result as the output."
-msgstr ""
-
-#: of paddle.fluid.layers.sums:4
-msgid "The input tensor that has the elements that need to be summed up."
-msgstr ""
-
-#: of paddle.fluid.layers.sums:8
-msgid ""
-"The tensor type variable that has the sum of input           written to "
-"it."
-msgstr ""
-
-#: of paddle.fluid.layers.sums:10
-msgid "The tensor type variable that has the sum of input"
-msgstr ""
-
-#: of paddle.fluid.layers.sums:11
-msgid "written to it."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1419
-#: ../../source/api_reference/layers.rst:1671
-msgid "assign"
-msgstr ""
-
-#: of paddle.fluid.layers.assign:1
-msgid "**Assign**"
-msgstr ""
-
-#: of paddle.fluid.layers.assign:3
-msgid "This function copies the *input* Variable to the *output* Variable."
-msgstr ""
-
-#: of paddle.fluid.layers.assign:5
-msgid "The source variable"
-msgstr ""
-
-#: of paddle.fluid.layers.assign:7
-msgid "The destination variable"
-msgstr ""
-
-#: of paddle.fluid.layers.assign:10
-msgid "The destination variable that was supplied as the *output*."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1427
-#: ../../source/api_reference/layers.rst:1679
-msgid "fill_constant_batch_size_like"
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant_batch_size_like:1
-msgid ""
-"This function creates a tensor of specified *shape*, *dtype* and batch "
-"size, and initializes this with a constant supplied in *value*. The batch"
-" size is obtained from the `input` tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant_batch_size_like:7
-#: paddle.fluid.layers.ones:6 paddle.fluid.layers.zeros:6
-msgid "It also sets *stop_gradient* to True."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant_batch_size_like:12
-msgid "Tensor whose input_dim_idx'th dimension specifies the batch_size."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant_batch_size_like:14
-msgid "The shape of the output."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant_batch_size_like:16
-msgid "It could be numpy.dtype. Output data type. Default is float32."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant_batch_size_like:18
-msgid "default 0. The value to be filled."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant_batch_size_like:20
-msgid "default 0. The index of input's batch size dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant_batch_size_like:22
-msgid "default 0. The index of output's batch size dimension."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant_batch_size_like:25
-msgid "Tensor of specified shape will be filled with the specified value."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1435
-#: ../../source/api_reference/layers.rst:1687
-msgid "fill_constant"
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant:1
-msgid "**fill_constant**"
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant:3
-msgid ""
-"This function creates a tensor with specified `shape` and `dtype`, and "
-"initializes it with a constant specifed by `value`."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant:6
-msgid "The attribute `stop_gradient` of the created tensor is set to True."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant:8
-msgid "Shape of the output tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant:10
-msgid "Data type of the output tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant:12
-msgid "The constant value used to initialize the output tensor."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant:16
-msgid "data should be on CPU if set true."
-msgstr ""
-
-#: of paddle.fluid.layers.fill_constant:19
-msgid "The tensor variable storing the output."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1443
-#: ../../source/api_reference/layers.rst:1695
-msgid "argmin"
-msgstr ""
-
-#: of paddle.fluid.layers.argmin:1
-msgid "**argmin**"
-msgstr ""
-
-#: of paddle.fluid.layers.argmin:3
-msgid ""
-"This function computes the indices of the min elements of the input "
-"tensor's element along the provided axis."
-msgstr ""
-
-#: of paddle.fluid.layers.argmin:6
-msgid "The input to compute the indices of the min elements."
-msgstr ""
-
-#: of paddle.fluid.layers.argmax:9 paddle.fluid.layers.argmin:9
-msgid "Axis to compute indices along."
-msgstr ""
-
-#: of paddle.fluid.layers.argmax:12 paddle.fluid.layers.argmin:12
-#: paddle.fluid.layers.ones:13 paddle.fluid.layers.zeros:13
-msgid "The tensor variable storing the output"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1451
-#: ../../source/api_reference/layers.rst:1703
-msgid "argmax"
-msgstr ""
-
-#: of paddle.fluid.layers.argmax:1
-msgid "**argmax**"
-msgstr ""
-
-#: of paddle.fluid.layers.argmax:3
-msgid ""
-"This function computes the indices of the max elements of the input "
-"tensor's element along the provided axis."
-msgstr ""
-
-#: of paddle.fluid.layers.argmax:6
-msgid "The input to compute the indices of the max elements."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1459
-#: ../../source/api_reference/layers.rst:1711
-msgid "ones"
-msgstr ""
-
-#: of paddle.fluid.layers.ones:1
-msgid "**ones**"
-msgstr ""
-
-#: of paddle.fluid.layers.ones:3
-msgid ""
-"This function creates a tensor of specified *shape* and *dtype*, and "
-"initializes this with 1."
-msgstr ""
-
-#: of paddle.fluid.layers.ones:8 paddle.fluid.layers.zeros:8
-msgid "Shape of output tensor"
-msgstr ""
-
-#: of paddle.fluid.layers.ones:10 paddle.fluid.layers.zeros:10
-msgid "Data type of output tensor"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1467
-#: ../../source/api_reference/layers.rst:1719
-msgid "zeros"
-msgstr ""
-
-#: of paddle.fluid.layers.zeros:1
-msgid "**zeros**"
-msgstr ""
-
-#: of paddle.fluid.layers.zeros:3
-msgid ""
-"This function creates a tensor of specified *shape* and *dtype*, and "
-"initializes this with 0."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1473
-msgid "learning_rate_scheduler"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1478
-msgid "exponential_decay"
-msgstr ""
-
-#: of paddle.fluid.layers.exponential_decay:1
-msgid "Applies exponential decay to the learning rate."
-msgstr ""
-
-#: of paddle.fluid.layers.exponential_decay:3
-msgid "```python decayed_learning_rate = learning_rate *"
-msgstr ""
-
-#: of paddle.fluid.layers.exponential_decay:5
-msgid "decay_rate ^ (global_step / decay_steps)"
-msgstr ""
-
-#: of paddle.fluid.layers.exponential_decay:6
-msgid "``` :param learning_rate: A scalar float32 value or a Variable. This"
-msgstr ""
-
-#: of paddle.fluid.layers.exponential_decay:8
-msgid "will be the initial learning rate during training"
-msgstr ""
-
-#: of paddle.fluid.layers.exponential_decay:9
-#: paddle.fluid.layers.inverse_time_decay:10
-#: paddle.fluid.layers.natural_exp_decay:10
-msgid "A Python `int32` number."
-msgstr ""
-
-#: of paddle.fluid.layers.exponential_decay:10
-#: paddle.fluid.layers.inverse_time_decay:11
-#: paddle.fluid.layers.natural_exp_decay:11
-msgid "A Python `float` number."
-msgstr ""
-
-#: of paddle.fluid.layers.exponential_decay:11
-#: paddle.fluid.layers.inverse_time_decay:12
-#: paddle.fluid.layers.natural_exp_decay:12
-msgid "Boolean. If set true, decay the learning rate every decay_steps."
-msgstr ""
-
-#: of paddle.fluid.layers.exponential_decay:13
-#: paddle.fluid.layers.inverse_time_decay:14
-#: paddle.fluid.layers.natural_exp_decay:14
-#: paddle.fluid.layers.polynomial_decay:17
-msgid "The decayed learning rate"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1486
-msgid "natural_exp_decay"
-msgstr ""
-
-#: of paddle.fluid.layers.natural_exp_decay:1
-msgid "Applies natural exponential decay to the initial learning rate."
-msgstr ""
-
-#: of paddle.fluid.layers.natural_exp_decay:8
-msgid ""
-"A scalar float32 value or a Variable. This will be the initial learning "
-"rate during training"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1494
-msgid "inverse_time_decay"
-msgstr ""
-
-#: of paddle.fluid.layers.inverse_time_decay:1
-msgid "Applies inverse time decay to the initial learning rate."
-msgstr ""
-
-#: of paddle.fluid.layers.inverse_time_decay:8
-msgid ""
-"A scalar float32 value or a Variable. This will be the initial learning "
-"rate during training."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1502
-msgid "polynomial_decay"
-msgstr ""
-
-#: of paddle.fluid.layers.polynomial_decay:1
-msgid "Applies polynomial decay to the initial learning rate."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1510
-msgid "piecewise_decay"
-msgstr ""
-
-#: of paddle.fluid.layers.piecewise_decay:1
-msgid "Applies piecewise decay to the initial learning rate."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1518
-msgid "noam_decay"
-msgstr ""
-
-#: of paddle.fluid.layers.noam_decay:1
-msgid "Noam decay method. The numpy implementation of noam decay as follows."
-msgstr ""
-
-#: of paddle.fluid.layers.noam_decay:8
-msgid ""
-"Please reference `attention is all you need "
-"<https://arxiv.org/pdf/1706.03762.pdf>`_."
-msgstr ""
-
-#: of paddle.fluid.layers.noam_decay:11
-msgid "The dimensionality of input and output of model."
-msgstr ""
-
-#: of paddle.fluid.layers.noam_decay:13
-msgid "A super parameter."
-msgstr ""
-
-#: of paddle.fluid.layers.noam_decay:16
-msgid "The decayed learning rate."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1524
-msgid "detection"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1529
-msgid "prior_box"
-msgstr ""
-
-#: of paddle.fluid.layers.prior_box:1
-msgid "**Prior box operator**"
-msgstr ""
-
-#: of paddle.fluid.layers.prior_box:3
-msgid ""
-"Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. "
-"Each position of the input produce N prior boxes, N is determined by the "
-"count of min_sizes, max_sizes and aspect_ratios, The size of the box is "
-"in range(min_size, max_size) interval, which is generated in sequence "
-"according to the aspect_ratios."
-msgstr ""
-
-#: of paddle.fluid.layers.prior_box:9
-msgid "The Input Variables, the format is NCHW."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:11 paddle.fluid.layers.prior_box:11
-msgid "The input image data of PriorBoxOp, the layout is NCHW."
-msgstr ""
-
-#: of paddle.fluid.layers.prior_box:14
-msgid "min sizes of generated prior boxes."
-msgstr ""
-
-#: of paddle.fluid.layers.prior_box:16
-msgid "max sizes of generated prior boxes. Default: None."
-msgstr ""
-
-#: of paddle.fluid.layers.prior_box:19
-msgid "the aspect ratios of generated prior boxes. Default: [1.]."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:48 paddle.fluid.layers.prior_box:22
-msgid "the variances to be encoded in prior boxes. Default:[0.1, 0.1, 0.2, 0.2]."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:51 paddle.fluid.layers.prior_box:25
-msgid "Whether to flip aspect ratios. Default:False."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:53 paddle.fluid.layers.prior_box:27
-msgid "Whether to clip out-of-boundary boxes. Default: False."
-msgstr ""
-
-#: of paddle.fluid.layers.prior_box:29
-msgid ""
-"Prior boxes step across width and height, If step[0] == 0.0/step[1] == "
-"0.0, the prior boxes step across height/weight of the input will be "
-"automatically calculated. Default: [0., 0.]"
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:46 paddle.fluid.layers.prior_box:34
-msgid "Prior boxes center offset. Default: 0.5"
-msgstr ""
-
-#: of paddle.fluid.layers.prior_box:36
-msgid "Name of the prior box op. Default: None."
-msgstr ""
-
-#: of paddle.fluid.layers.prior_box:39
-msgid ""
-"the output prior boxes of PriorBox.      The layout is [H, W, num_priors,"
-" 4].      H is the height of input, W is the width of input,      "
-"num_priors is the total      box count of each position of input. "
-"Variances(Variable): the expanded variances of PriorBox.      The layout "
-"is [H, W, num_priors, 4].      H is the height of input, W is the width "
-"of input      num_priors is the total      box count of each position of "
-"input"
-msgstr ""
-
-#: of paddle.fluid.layers.prior_box:44
-msgid "the output prior boxes of PriorBox."
-msgstr ""
-
-#: of paddle.fluid.layers.prior_box:42
-msgid ""
-"The layout is [H, W, num_priors, 4]. H is the height of input, W is the "
-"width of input, num_priors is the total box count of each position of "
-"input."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:77 paddle.fluid.layers.prior_box:49
-msgid "Variances(Variable): the expanded variances of PriorBox."
-msgstr ""
-
-#: of paddle.fluid.layers.prior_box:47
-msgid ""
-"The layout is [H, W, num_priors, 4]. H is the height of input, W is the "
-"width of input num_priors is the total box count of each position of "
-"input"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1537
-msgid "multi_box_head"
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:1
-msgid "**Prior_boxes**"
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:3
-msgid ""
-"Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. "
-"The details of this algorithm, please refer the section 2.2 of SSD paper "
-"(SSD: Single Shot MultiBox Detector) <https://arxiv.org/abs/1512.02325>`_"
-" ."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:8
-msgid "The list of input Variables, the format of all Variables is NCHW."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:14
-msgid ""
-"the base_size is used to get min_size and max_size according to min_ratio"
-" and max_ratio."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:17
-msgid "The number of classes."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:19
-msgid ""
-"the aspect ratios of generated prior boxes. The length of input and "
-"aspect_ratios must be equal."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:22
-msgid "the min ratio of generated prior boxes."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:24
-msgid "the max ratio of generated prior boxes."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:26
-msgid ""
-"If `len(inputs) <=2`, min_sizes must be set up, and the length of "
-"min_sizes should equal to the length of inputs. Default: None."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:30
-msgid ""
-"If `len(inputs) <=2`, max_sizes must be set up, and the length of "
-"min_sizes should equal to the length of inputs. Default: None."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:34
-msgid ""
-"If step_w and step_h are the same, step_w and step_h can be replaced by "
-"steps."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:37
-msgid ""
-"Prior boxes step across width. If step_w[i] == 0.0, the prior boxes step "
-"across width of the inputs[i] will be automatically calculated. Default: "
-"None."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:42
-msgid ""
-"Prior boxes step across height, If step_h[i] == 0.0, the prior boxes step"
-" across height of the inputs[i] will be automatically calculated. "
-"Default: None."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:55
-msgid "The kernel size of conv2d. Default: 1."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:57
-msgid "The padding of conv2d. Default:0."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:59
-msgid "The stride of conv2d. Default:1,"
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:61
-msgid "Name of the prior box layer. Default: None."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:64
-msgid ""
-"The predicted boxes' location of the inputs.      The layout is [N, "
-"H*W*Priors, 4]. where Priors      is the number of predicted boxes each "
-"position of each input. mbox_conf(Variable): The predicted boxes' "
-"confidence of the inputs.      The layout is [N, H*W*Priors, C]. where "
-"Priors      is the number of predicted boxes each position of each input"
-"      and C is the number of Classes. boxes(Variable): the output prior "
-"boxes of PriorBox.      The layout is [num_priors, 4]. num_priors is the "
-"total      box count of each position of inputs. Variances(Variable): the"
-" expanded variances of PriorBox.      The layout is [num_priors, 4]. "
-"num_priors is the total      box count of each position of inputs"
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:67
-msgid "The predicted boxes' location of the inputs."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:67
-msgid ""
-"The layout is [N, H*W*Priors, 4]. where Priors is the number of predicted"
-" boxes each position of each input."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:71
-msgid "mbox_conf(Variable): The predicted boxes' confidence of the inputs."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:70
-msgid ""
-"The layout is [N, H*W*Priors, C]. where Priors is the number of predicted"
-" boxes each position of each input and C is the number of Classes."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:74
-msgid "boxes(Variable): the output prior boxes of PriorBox."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:74
-msgid ""
-"The layout is [num_priors, 4]. num_priors is the total box count of each "
-"position of inputs."
-msgstr ""
-
-#: of paddle.fluid.layers.multi_box_head:77
-msgid ""
-"The layout is [num_priors, 4]. num_priors is the total box count of each "
-"position of inputs"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1545
-msgid "bipartite_match"
-msgstr ""
-
-#: of paddle.fluid.layers.bipartite_match:1
-msgid ""
-"This operator implements a greedy bipartite matching algorithm, which is "
-"used to obtain the matching with the maximum distance based on the input "
-"distance matrix. For input 2D matrix, the bipartite matching algorithm "
-"can find the matched column for each row (matched means the largest "
-"distance), also can find the matched row for each column. And this "
-"operator only calculate matched indices from column to row. For each "
-"instance, the number of matched indices is the column number of the input"
-" distance matrix."
-msgstr ""
-
-#: of paddle.fluid.layers.bipartite_match:10
-msgid ""
-"There are two outputs, matched indices and distance. A simple "
-"description, this algorithm matched the best (maximum distance) row "
-"entity to the column entity and the matched indices are not duplicated in"
-" each row of ColToRowMatchIndices. If the column entity is not matched "
-"any row entity, set -1 in ColToRowMatchIndices."
-msgstr ""
-
-#: of paddle.fluid.layers.bipartite_match:16
-msgid ""
-"NOTE: the input DistMat can be LoDTensor (with LoD) or Tensor. If "
-"LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. If "
-"Tensor, the height of ColToRowMatchIndices is 1."
-msgstr ""
-
-#: of paddle.fluid.layers.bipartite_match:20
-msgid ""
-"NOTE: This API is a very low level API. It is used by :code:`ssd_loss` "
-"layer. Please consider to use :code:`ssd_loss` instead."
-msgstr ""
-
-#: of paddle.fluid.layers.bipartite_match:23
-msgid ""
-"This input is a 2-D LoDTensor with shape [K, M]. It is pair-wise distance"
-" matrix between the entities represented by each row and each column. For"
-" example, assumed one entity is A with shape [K], another entity is B "
-"with shape [M]. The dist_matrix[i][j] is the distance between A[i] and "
-"B[j]. The bigger the distance is, the better matching the pairs are.  "
-"NOTE: This tensor can contain LoD information to represent a batch of "
-"inputs. One instance of this batch can contain different numbers of "
-"entities."
-msgstr ""
-
-#: of paddle.fluid.layers.bipartite_match:23
-msgid ""
-"This input is a 2-D LoDTensor with shape [K, M]. It is pair-wise distance"
-" matrix between the entities represented by each row and each column. For"
-" example, assumed one entity is A with shape [K], another entity is B "
-"with shape [M]. The dist_matrix[i][j] is the distance between A[i] and "
-"B[j]. The bigger the distance is, the better matching the pairs are."
-msgstr ""
-
-#: of paddle.fluid.layers.bipartite_match:30
-msgid ""
-"NOTE: This tensor can contain LoD information to represent a batch of "
-"inputs. One instance of this batch can contain different numbers of "
-"entities."
-msgstr ""
-
-#: of paddle.fluid.layers.bipartite_match:34
-msgid ""
-"The type of matching method, should be 'bipartite' or 'per_prediction'. "
-"[default 'bipartite']."
-msgstr ""
-
-#: of paddle.fluid.layers.bipartite_match:37
-msgid ""
-"If `match_type` is 'per_prediction', this threshold is to determine the "
-"extra matching bboxes based on the maximum distance, 0.5 by default."
-msgstr ""
-
-#: of paddle.fluid.layers.bipartite_match:42
-msgid ""
-"a tuple with two elements is returned. The first is matched_indices, the "
-"second is matched_distance.  The matched_indices is a 2-D Tensor with "
-"shape [N, M] in int type. N is the batch size. If match_indices[i][j] is "
-"-1, it means B[j] does not match any entity in i-th instance. Otherwise, "
-"it means B[j] is matched to row match_indices[i][j] in i-th instance. The"
-" row number of i-th instance is saved in match_indices[i][j].  The "
-"matched_distance is a 2-D Tensor with shape [N, M] in float type . N is "
-"batch size. If match_indices[i][j] is -1, match_distance[i][j] is also "
-"-1.0. Otherwise, assumed match_distance[i][j] = d, and the row offsets of"
-" each instance are called LoD. Then match_distance[i][j] = "
-"dist_matrix[d+LoD[i]][j]."
-msgstr ""
-
-#: of paddle.fluid.layers.bipartite_match:42
-msgid ""
-"a tuple with two elements is returned. The first is matched_indices, the "
-"second is matched_distance."
-msgstr ""
-
-#: of paddle.fluid.layers.bipartite_match:45
-msgid ""
-"The matched_indices is a 2-D Tensor with shape [N, M] in int type. N is "
-"the batch size. If match_indices[i][j] is -1, it means B[j] does not "
-"match any entity in i-th instance. Otherwise, it means B[j] is matched to"
-" row match_indices[i][j] in i-th instance. The row number of i-th "
-"instance is saved in match_indices[i][j]."
-msgstr ""
-
-#: of paddle.fluid.layers.bipartite_match:52
-msgid ""
-"The matched_distance is a 2-D Tensor with shape [N, M] in float type . N "
-"is batch size. If match_indices[i][j] is -1, match_distance[i][j] is also"
-" -1.0. Otherwise, assumed match_distance[i][j] = d, and the row offsets "
-"of each instance are called LoD. Then match_distance[i][j] = "
-"dist_matrix[d+LoD[i]][j]."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1553
-msgid "target_assign"
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:1
-msgid "**Target assigner operator**"
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:3
-msgid ""
-"This operator can be, for given the target bounding boxes or labels, to "
-"assign classification and regression targets to each prediction as well "
-"as weights to prediction. The weights is used to specify which prediction"
-" would not contribute to training loss."
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:8
-msgid ""
-"For each instance, the output `out` and`out_weight` are assigned based on"
-" `match_indices` and `negative_indices`. Assumed that the row offset for "
-"each instance in `input` is called lod, this operator assigns "
-"classification/regression targets by performing the following steps:"
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:14
-msgid "Assigning all outpts based on `match_indices`:"
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:16
-msgid "If id = match_indices[i][j] > 0,"
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:18
-msgid "out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K] out_weight[i][j] = 1."
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:21
-msgid "Otherwise,"
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:23
-msgid ""
-"out[j][j][0 : K] = {mismatch_value, mismatch_value, ...} out_weight[i][j]"
-" = 0."
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:26
-msgid "Assigning out_weight based on `neg_indices` if `neg_indices` is provided:"
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:28
-msgid ""
-"Assumed that the row offset for each instance in `neg_indices` is called "
-"neg_lod, for i-th instance and each `id` of neg_indices in this instance:"
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:31
-msgid ""
-"out[i][id][0 : K] = {mismatch_value, mismatch_value, ...} "
-"out_weight[i][id] = 1.0"
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:34
-msgid "This input is a 3D LoDTensor with shape [M, P, K]."
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:36
-msgid ""
-"Tensor<int>), The input matched indices is 2D Tenosr<int32> with shape "
-"[N, P], If MatchIndices[i][j] is -1, the j-th entity of column is not "
-"matched to any entity of row in i-th instance."
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:41
-msgid ""
-"The input negative example indices are an optional input with shape [Neg,"
-" 1] and int32 type, where Neg is the total number of negative example "
-"indices."
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:45
-msgid "Fill this value to the mismatched location."
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:48
-msgid ""
-"The output is a 3D Tensor with shape [N, P, K],     N and P is the same "
-"as they are in `neg_indices`, K is the     same as it in input of X. If "
-"`match_indices[i][j]`. out_weight (Variable): The weight for output with "
-"the shape of [N, P, 1]."
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:51
-msgid "The output is a 3D Tensor with shape [N, P, K],"
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:51
-msgid ""
-"N and P is the same as they are in `neg_indices`, K is the same as it in "
-"input of X. If `match_indices[i][j]`."
-msgstr ""
-
-#: of paddle.fluid.layers.target_assign:53
-msgid "out_weight (Variable): The weight for output with the shape of [N, P, 1]."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1561
-msgid "detection_output"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:1
-msgid "**Detection Output Layer for Single Shot Multibox Detector (SSD).**"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:3
-msgid ""
-"This operation is to get the detection results by performing following "
-"two steps:"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:6
-msgid "Decode input bounding box predictions according to the prior boxes."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:7
-msgid ""
-"Get the final detection results by applying multi-class non maximum "
-"suppression (NMS)."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:10
-msgid ""
-"Please note, this operation doesn't clip the final output bounding boxes "
-"to the image window."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:13
-msgid ""
-"A 3-D Tensor with shape [N, M, 4] represents the predicted locations of M"
-" bounding bboxes. N is the batch size, and each bounding box has four "
-"coordinate values and the layout is [xmin, ymin, xmax, ymax]."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:18
-msgid ""
-"A 3-D Tensor with shape [N, M, C] represents the predicted confidence "
-"predictions. N is the batch size, C is the class number, M is number of "
-"bounding boxes. For each category there are total M scores which "
-"corresponding M bounding boxes."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:23
-msgid ""
-"A 2-D Tensor with shape [M, 4] holds M boxes, each box is represented as "
-"[xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate of the "
-"anchor box, if the input is image feature map, they are close to the "
-"origin of the coordinate system. [xmax, ymax] is the right bottom "
-"coordinate of the anchor box."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:30
-msgid "A 2-D Tensor with shape [M, 4] holds M group of variance."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:33
-msgid ""
-"The index of background label, the background label will be ignored. If "
-"set to -1, then all categories will be considered."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:37
-msgid "The threshold to be used in NMS."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:39
-msgid ""
-"Maximum number of detections to be kept according to the confidences "
-"aftern the filtering detections based on score_threshold."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:43
-msgid ""
-"Number of total bboxes to be kept per image after NMS step. -1 means "
-"keeping all bboxes after NMS step."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:46
-msgid ""
-"Threshold to filter out bounding boxes with low confidence score. If not "
-"provided, consider all boxes."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:49
-msgid "The parameter for adaptive NMS."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:52
-msgid ""
-"The detection outputs is a LoDTensor with shape [No, 6].     Each row has"
-" six values: [label, confidence, xmin, ymin, xmax, ymax].     `No` is the"
-" total number of detections in this mini-batch. For each     instance, "
-"the offsets in first dimension are called LoD, the offset     number is N"
-" + 1, N is the batch size. The i-th image has     `LoD[i + 1] - LoD[i]` "
-"detected results, if it is 0, the i-th image     has no detected results."
-" If all images have not detected results,     all the elements in LoD are"
-" 0, and output tensor only contains one     value, which is -1."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:61
-msgid "The detection outputs is a LoDTensor with shape [No, 6]."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:55
-msgid ""
-"Each row has six values: [label, confidence, xmin, ymin, xmax, ymax]. "
-"`No` is the total number of detections in this mini-batch. For each "
-"instance, the offsets in first dimension are called LoD, the offset "
-"number is N + 1, N is the batch size. The i-th image has `LoD[i + 1] - "
-"LoD[i]` detected results, if it is 0, the i-th image has no detected "
-"results. If all images have not detected results, all the elements in LoD"
-" are 0, and output tensor only contains one value, which is -1."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:69
-msgid "pb = layers.data(name='prior_box', shape=[10, 4],"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:70
-#: paddle.fluid.layers.detection_output:72
-#: paddle.fluid.layers.detection_output:74
-#: paddle.fluid.layers.detection_output:76
-msgid "append_batch_size=False, dtype='float32')"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:71
-msgid "pbv = layers.data(name='prior_box_var', shape=[10, 4],"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:73
-msgid "loc = layers.data(name='target_box', shape=[2, 21, 4],"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:75
-msgid "scores = layers.data(name='scores', shape=[2, 21, 10],"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:79
-msgid "nmsed_outs = fluid.layers.detection_output(scores=scores,"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_output:78
-msgid "loc=loc, prior_box=pb, prior_box_var=pbv)"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1569
-msgid "ssd_loss"
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:1
-msgid "**Multi-box loss layer for object detection algorithm of SSD**"
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:3
-msgid ""
-"This layer is to compute dection loss for SSD given the location offset "
-"predictions, confidence predictions, prior boxes and ground-truth "
-"boudding boxes and labels, and the type of hard example mining. The "
-"returned loss is a weighted sum of the localization loss (or regression "
-"loss) and confidence loss (or classification loss) by performing the "
-"following steps:"
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:9
-msgid "Find matched bounding box by bipartite matching algorithm."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:11
-msgid "1.1 Compute IOU similarity between ground-truth boxes and prior boxes."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:13
-msgid "1.2 Compute matched boundding box by bipartite matching algorithm."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:15
-msgid "Compute confidence for mining hard examples"
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:17
-msgid "2.1. Get the target label based on matched indices."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:19
-msgid "2.2. Compute confidence loss."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:21
-msgid ""
-"Apply hard example mining to get the negative example indices and update "
-"the matched indices."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:24
-msgid "Assign classification and regression targets"
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:26
-msgid "4.1. Encoded bbox according to the prior boxes."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:28
-msgid "4.2. Assign regression targets."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:30
-msgid "4.3. Assign classification targets."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:32
-msgid "Compute the overall objective loss."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:34
-msgid "5.1 Compute confidence loss."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:36
-msgid "5.1 Compute localization loss."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:38
-msgid "5.3 Compute the overall weighted loss."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:40
-msgid ""
-"The location predictions are a 3D Tensor with shape [N, Np, 4], N is the "
-"batch size, Np is total number of predictions for each instance. 4 is the"
-" number of coordinate values, the layout is [xmin, ymin, xmax, ymax]."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:45
-msgid ""
-"The confidence predictions are a 3D Tensor with shape [N, Np, C], N and "
-"Np are the same as they are in `location`, C is the class number."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:49
-msgid ""
-"The ground-truth boudding boxes (bboxes) are a 2D LoDTensor with shape "
-"[Ng, 4], Ng is the total number of ground-truth bboxes of mini-batch "
-"input."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:53
-msgid "The ground-truth labels are a 2D LoDTensor with shape [Ng, 1]."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:56
-msgid "The prior boxes are a 2D Tensor with shape [Np, 4]."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:58
-msgid "The variance of prior boxes are a 2D Tensor with shape [Np, 4]."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:61
-msgid "The index of background label, 0 by default."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:63
-msgid ""
-"If match_type is 'per_prediction', use `overlap_threshold` to determine "
-"the extra matching bboxes when  finding matched boxes. 0.5 by default."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:63
-msgid ""
-"If match_type is 'per_prediction', use `overlap_threshold` to determine "
-"the extra matching bboxes when"
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:65
-msgid "finding matched boxes. 0.5 by default."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:67
-msgid ""
-"The ratio of the negative boxes to the positive boxes, used only when "
-"mining_type is 'max_negative', 3.0 by defalut."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:70
-msgid ""
-"The negative overlap upper bound for the unmatched predictions. Use only "
-"when mining_type is 'max_negative', 0.5 by default."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:74
-msgid "Weight for localization loss, 1.0 by default."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:76
-msgid "Weight for confidence loss, 1.0 by default."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:78
-msgid ""
-"The type of matching method during training, should be 'bipartite' or "
-"'per_prediction', 'per_prediction' by defalut."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:81
-msgid ""
-"The hard example mining type, should be 'hard_example' or 'max_negative',"
-" now only support `max_negative`."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:84
-msgid ""
-"Whether to normalize the SSD loss by the total number of output "
-"locations, True by default."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:87
-msgid ""
-"The max sample size of negative box, used only when mining_type is "
-"'hard_example'."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:91
-msgid ""
-"The weighted sum of the localization loss and confidence loss, with"
-"         shape [N * Np, 1], N and Np are the same as they are in "
-"`location`."
-msgstr ""
-
-#: of paddle.fluid.layers.ssd_loss:93
-msgid ""
-":exc:`ValueError` -- If mining_type is 'hard_example', now only support "
-"mining         type of `max_negative`."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1577
-msgid "detection_map"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:1
-msgid ""
-"Detection mAP evaluate operator. The general steps are as follows. First,"
-" calculate the true positive and"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:3
-msgid ""
-"false positive according to the input of detection and labels, then "
-"calculate the mAP evaluate value. Supporting '11 point' and 'integral' "
-"mAP algorithm. Please get more information from the following articles: "
-"https://sanchom.wordpress.com/tag/average-precision/ "
-"https://arxiv.org/abs/1512.02325"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:11
-msgid ""
-"(LoDTensor) A 2-D LoDTensor with shape [M, 6] represents the detections. "
-"Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax], M is "
-"the total number of detect results in this mini-batch. For each instance,"
-" the offsets in first dimension are called LoD, the number of offset is N"
-" + 1, if LoD[i + 1] - LoD[i] == 0, means there is no detected data. "
-"Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:13
-msgid ""
-"(LoDTensor) A 2-D LoDTensor represents theLabeled ground-truth data. Each"
-" row has 6 values: [label, xmin, ymin, xmax, ymax, is_difficult] or 5 "
-"values: [label, xmin, ymin, xmax, ymax], where N is the total number of "
-"ground-truth data in this mini-batch. For each instance, the offsets in "
-"first dimension are called LoD, the number of offset is N + 1, if LoD[i +"
-" 1] - LoD[i] == 0, means there is no ground-truth data. Duplicable: False"
-"  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:15
-msgid ""
-"(Tensor<int>) A tensor with shape [1], 0 means ignoring input states, "
-"which including PosCount, TruePos, FalsePos. Duplicable: False  Optional:"
-" True"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:17
-msgid ""
-"(Tensor) A tensor with shape [Ncls, 1], store the input positive example "
-"count of each class, Ncls is the count of input classification. This "
-"input is used to pass the AccumPosCount generated by the previous mini-"
-"batch when the multi mini-batches cumulative calculation carried out. "
-"When the input(PosCount) is empty, the cumulative calculation is not "
-"carried out, and only the results of the current mini-batch are "
-"calculated. Duplicable: False  Optional: True"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:19
-msgid ""
-"(LoDTensor) A 2-D LoDTensor with shape [Ntp, 2], store the input true "
-"positive example of each class.This input is used to pass the "
-"AccumTruePos generated by the previous mini-batch when the multi mini-"
-"batches cumulative calculation carried out. Duplicable: False  Optional: "
-"True"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:21
-msgid ""
-"(LoDTensor) A 2-D LoDTensor with shape [Nfp, 2], store the input false "
-"positive example of each class.This input is used to pass the "
-"AccumFalsePos generated by the previous mini-batch when the multi mini-"
-"batches cumulative calculation carried out. Duplicable: False  Optional: "
-"True"
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:23
-msgid "(int) The class number."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:25
-msgid ""
-"(int, defalut: 0) The index of background label, the background label "
-"will be ignored. If set to -1, then all categories will be considered."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:27
-msgid ""
-"(float) The lower bound jaccard overlap threshold of detection output and"
-" ground-truth data."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:29
-msgid ""
-"(bool, default true) Switch to control whether the difficult data is "
-"evaluated."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:31
-msgid ""
-"(string, default 'integral') The AP algorithm type, 'integral' or "
-"'11point'."
-msgstr ""
-
-#: of paddle.fluid.layers.detection_map:34
-msgid ""
-"(Tensor) A tensor with shape [Ncls, 1], store the positive example count "
-"of each class. It combines the input input(PosCount) and the positive "
-"example count computed from input(Detection) and input(Label)."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1585
-msgid "iou_similarity"
-msgstr ""
-
-#: of paddle.fluid.layers.iou_similarity:1
-msgid ""
-"IOU Similarity Operator. Computes intersection-over-union (IOU) between "
-"two box lists."
-msgstr ""
-
-#: of paddle.fluid.layers.iou_similarity:3
-msgid ""
-"Box list 'X' should be a LoDTensor and 'Y' is a common Tensor, boxes in "
-"'Y' are shared by all instance of the batched inputs of X. Given two "
-"boxes A and B, the calculation of IOU is as follows:"
-msgstr ""
-
-#: of paddle.fluid.layers.iou_similarity:7
-msgid "$$ IOU(A, B) = \\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)} $$"
-msgstr ""
-
-#: of paddle.fluid.layers.iou_similarity:13
-msgid ""
-"(LoDTensor, default LoDTensor<float>) Box list X is a 2-D LoDTensor with "
-"shape [N, 4] holds N boxes, each box is represented as [xmin, ymin, xmax,"
-" ymax], the shape of X is [N, 4]. [xmin, ymin] is the left top coordinate"
-" of the box if the input is image feature map, they are close to the "
-"origin of the coordinate system. [xmax, ymax] is the right bottom "
-"coordinate of the box. This tensor can contain LoD information to "
-"represent a batch of inputs. One instance of this batch can contain "
-"different numbers of entities. Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.iou_similarity:15
-msgid ""
-"(Tensor, default Tensor<float>) Box list Y holds M boxes, each box is "
-"represented as [xmin, ymin, xmax, ymax], the shape of X is [N, 4]. [xmin,"
-" ymin] is the left top coordinate of the box if the input is image "
-"feature map, and [xmax, ymax] is the right bottom coordinate of the box. "
-"Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.iou_similarity:18
-msgid ""
-"(LoDTensor, the lod is same as input X) The output of iou_similarity op, "
-"a tensor with shape [N, M] representing pairwise iou scores."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1593
-msgid "box_coder"
-msgstr ""
-
-#: of paddle.fluid.layers.box_coder:1
-msgid ""
-"Bounding Box Coder Operator. Encode/Decode the target bounding box with "
-"the priorbox information. The Encoding schema described below: ox = (tx -"
-" px) / pw / pxv oy = (ty - py) / ph / pyv ow = log(abs(tw / pw)) / pwv oh"
-" = log(abs(th / ph)) / phv The Decoding schema described below: ox = (pw "
-"* pxv * tx * + px) - tw / 2 oy = (ph * pyv * ty * + py) - th / 2 ow = "
-"exp(pwv * tw) * pw + tw / 2 oh = exp(phv * th) * ph + th / 2 where tx, "
-"ty, tw, th denote the target box's center coordinates, width and height "
-"respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor) "
-"center coordinates, width and height. pxv, pyv, pwv, phv denote the "
-"variance of the priorbox and ox, oy, ow, oh denote the encoded/decoded "
-"coordinates, width and height."
-msgstr ""
-
-#: of paddle.fluid.layers.box_coder:19
-msgid ""
-"(Tensor, default Tensor<float>) Box list PriorBox is a 2-D Tensor with "
-"shape [M, 4] holds M boxes, each box is represented as [xmin, ymin, xmax,"
-" ymax], [xmin, ymin] is the left top coordinate of the anchor box, if the"
-" input is image feature map, they are close to the origin of the "
-"coordinate system. [xmax, ymax] is the right bottom coordinate of the "
-"anchor box. Duplicable: False  Optional: False"
-msgstr ""
-
-#: of paddle.fluid.layers.box_coder:21
-msgid ""
-"(Tensor, default Tensor<float>, optional) PriorBoxVar is a 2-D Tensor "
-"with shape [M, 4] holds M group of variance. PriorBoxVar will set all "
-"elements to 1 by default. Duplicable: False  Optional: True"
-msgstr ""
-
-#: of paddle.fluid.layers.box_coder:23
-msgid ""
-"(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape [N, 4]"
-" when code_type is 'encode_center_size'. This input also can be a 3-D "
-"Tensor with shape [N, M, 4] when code_type is 'decode_center_size'. [N, "
-"4], each box is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is "
-"the left top coordinate of the box if the input is image feature map, "
-"they are close to the origin of the coordinate system. [xmax, ymax] is "
-"the right bottom coordinate of the box. This tensor can contain LoD "
-"information to represent a batch of inputs. One instance of this batch "
-"can contain different numbers of entities. Duplicable: False  Optional: "
-"False"
-msgstr ""
-
-#: of paddle.fluid.layers.box_coder:25
-msgid ""
-"(string, default encode_center_size) the code type used with the target "
-"box"
-msgstr ""
-
-#: of paddle.fluid.layers.box_coder:27
-msgid "(bool, default true) whether treat the priorbox as a noramlized box"
-msgstr ""
-
-#: of paddle.fluid.layers.box_coder:30
-msgid ""
-"(LoDTensor or Tensor) When code_type is 'encode_center_size', the output "
-"tensor of box_coder_op with shape [N, M, 4] representing the result of N "
-"target boxes encoded with M Prior boxes and variances. When code_type is "
-"'decode_center_size', N represents the batch size and M represents the "
-"number of deocded boxes."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1599
-msgid "metric"
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1604
-msgid "accuracy"
-msgstr ""
-
-#: of paddle.fluid.layers.accuracy:1
-msgid ""
-"This function computes the accuracy using the input and label. The output"
-" is the top k inputs and their indices."
-msgstr ""
-
-#: ../../source/api_reference/layers.rst:1612
-msgid "auc"
-msgstr ""
-
-#~ msgid "layers"
-#~ msgstr ""
-
-#~ msgid ""
-#~ "Returns a list of places based on"
-#~ " flags. The list will be used "
-#~ "for parallel execution."
-#~ msgstr ""
-
-#~ msgid ""
-#~ "If :attr:`None`, it makes no effect "
-#~ "to lookup. Otherwise the given "
-#~ ":attr:`padding_idx` indicates padding the "
-#~ "output with zeros whenever lookup "
-#~ "encounters it in :attr:`input`. If "
-#~ ":math:`padding_idx < 0`, the padding_idx "
-#~ "to use in lookup is :math:`size[0] "
-#~ "+ dim`."
-#~ msgstr ""
-
-#~ msgid "**Convlution2D Layer**"
-#~ msgstr ""
-
-#~ msgid ""
-#~ "The convolution2D layer calculates the "
-#~ "output based on the input, filter "
-#~ "and strides, paddings, dilations, groups "
-#~ "parameters. Input(Input) and Output(Output) "
-#~ "are in NCHW format. Where N is "
-#~ "batch size, C is the number of "
-#~ "channels, H is the height of the"
-#~ " feature, and W is the width of"
-#~ " the feature. The details of "
-#~ "convolution layer, please refer UFLDL's "
-#~ "`convolution, "
-#~ "<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_"
-#~ " . If bias attribution and activation"
-#~ " type are provided, bias is added "
-#~ "to the output of the convolution, "
-#~ "and the corresponding activation function "
-#~ "is applied to the final result."
-#~ msgstr ""
-
-#~ msgid "Output: Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`"
-#~ msgstr ""
-
-#~ msgid ""
-#~ "The input image with [N, C, H, "
-#~ "W] format. num_filters(int): The number "
-#~ "of filter. It is as same as "
-#~ "the output image channel."
-#~ msgstr ""
-
-#~ msgid "Mean Operator."
-#~ msgstr ""
-
-#~ msgid "Out is a scalar which is the mean of all elements in X."
-#~ msgstr ""
-
-#~ msgid "The input of mean op Duplicable: False  Optional: False"
-#~ msgstr ""
-
-#~ msgid "The output of mean op"
-#~ msgstr ""
-
-#~ msgid "Softshrink Activation Operator."
-#~ msgstr ""
-
-#~ msgid "$$ out = \\begin{cases}"
-#~ msgstr ""
-
-#~ msgid ""
-#~ "x - \\lambda, \\text{if } x > "
-#~ "\\lambda \\\\ x + \\lambda, \\text{if"
-#~ " } x < -\\lambda \\\\ 0,  "
-#~ "\\text{otherwise} \\end{cases}"
-#~ msgstr ""
-
-#~ msgid "$$"
-#~ msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/metrics.po b/source/locale/en/LC_MESSAGES/api_reference/metrics.po
deleted file mode 100644
index 0b9c8a26f53c859201ea94c7c881dfa98d7afbdd..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/metrics.po
+++ /dev/null
@@ -1,248 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/metrics.rst:6
-msgid "fluid.metrics"
-msgstr ""
-
-#: ../../source/api_reference/metrics.rst:11
-msgid "MetricBase"
-msgstr ""
-
-#: of paddle.fluid.metrics.MetricBase:1
-msgid "Base Class for all evaluators"
-msgstr ""
-
-#: of paddle.fluid.metrics.Accuracy paddle.fluid.metrics.Auc
-#: paddle.fluid.metrics.EditDistance paddle.fluid.metrics.MetricBase
-msgid "参数"
-msgstr ""
-
-#: of paddle.fluid.metrics.MetricBase:3
-msgid ""
-"The name of evaluator. such as, \"accuracy\". Used for generate temporary"
-" variable name."
-msgstr ""
-
-#: of paddle.fluid.metrics.MetricBase:15
-msgid "Interface:"
-msgstr ""
-
-#: of paddle.fluid.metrics.MetricBase:8
-msgid "Note(*) : the states is the attributes who not has _ prefix."
-msgstr ""
-
-#: of paddle.fluid.metrics.MetricBase:10
-msgid ""
-"get_config(): print current states and configuration reset(): clear the "
-"states. If the Metrics states type is not (int, float, np.ndarray),"
-msgstr ""
-
-#: of paddle.fluid.metrics.MetricBase:12
-msgid "Please override this method."
-msgstr ""
-
-#: of paddle.fluid.metrics.MetricBase:13
-msgid ""
-"update(): update states at every minibatch eval(): get metric evaluation "
-"in numpy type."
-msgstr ""
-
-#: of paddle.fluid.metrics.MetricBase.reset:1
-msgid ""
-"states is the attributes who not has _ prefix. reset the states of "
-"metrics."
-msgstr ""
-
-#: ../../source/api_reference/metrics.rst:20
-msgid "CompositeMetric"
-msgstr ""
-
-#: of paddle.fluid.metrics.CompositeMetric:1
-msgid ""
-"Compute multiple metrics in each minibatch. for example, merge F1, "
-"accuracy, recall into one Metric."
-msgstr ""
-
-#: ../../source/api_reference/metrics.rst:29
-msgid "Accuracy"
-msgstr ""
-
-#: of paddle.fluid.metrics.Accuracy:1
-msgid ""
-"Accumulate the accuracy from minibatches and compute the average accuracy"
-" for every pass."
-msgstr ""
-
-#: of paddle.fluid.metrics.Accuracy:4 paddle.fluid.metrics.EditDistance:4
-msgid "the metrics name"
-msgstr ""
-
-#: of paddle.fluid.metrics.Accuracy:6 paddle.fluid.metrics.EditDistance:6
-msgid "Example"
-msgstr ""
-
-#: of paddle.fluid.metrics.Accuracy:8
-msgid ""
-"minibatch_accuracy = fluid.layers.accuracy(pred, label) "
-"accuracy_evaluator = fluid.metrics.Accuracy() for epoch in PASS_NUM:"
-msgstr ""
-
-#: of paddle.fluid.metrics.Accuracy:11
-msgid "accuracy_evaluator.reset() for data in batches:"
-msgstr ""
-
-#: of paddle.fluid.metrics.Accuracy:13
-msgid "loss = exe.run(fetch_list=[cost, minibatch_accuracy])"
-msgstr ""
-
-#: of paddle.fluid.metrics.Accuracy:14
-msgid ""
-"accuracy_evaluator.update(value=minibatch_accuracy, weight=batches) "
-"accuracy = accuracy_evaluator.eval()"
-msgstr ""
-
-#: ../../source/api_reference/metrics.rst:38
-msgid "ChunkEvaluator"
-msgstr ""
-
-#: of paddle.fluid.metrics.ChunkEvaluator:1
-msgid ""
-"Accumulate counter numbers output by chunk_eval from mini-batches and "
-"compute the precision recall and F1-score using the accumulated counter "
-"numbers."
-msgstr ""
-
-#: ../../source/api_reference/metrics.rst:47
-msgid "EditDistance"
-msgstr ""
-
-#: of paddle.fluid.metrics.EditDistance:1
-msgid ""
-"Accumulate edit distance sum and sequence number from mini-batches and "
-"compute the average edit_distance and instance error of all batches."
-msgstr ""
-
-#: of paddle.fluid.metrics.EditDistance:8
-msgid ""
-"edit_distance_metrics = fluid.layers.edit_distance(input, label) "
-"distance_evaluator = fluid.metrics.EditDistance() for epoch in PASS_NUM:"
-msgstr ""
-
-#: of paddle.fluid.metrics.EditDistance:11
-msgid "distance_evaluator.reset() for data in batches:"
-msgstr ""
-
-#: of paddle.fluid.metrics.EditDistance:13
-msgid "loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics))"
-msgstr ""
-
-#: of paddle.fluid.metrics.EditDistance:14
-msgid ""
-"distance_evaluator.update(*edit_distance_metrics) distance, "
-"instance_error = distance_evaluator.eval()"
-msgstr ""
-
-#: of paddle.fluid.metrics.EditDistance:17
-msgid ""
-"In the above example: 'distance' is the average of the edit distance in a"
-" pass. 'instance_error' is the instance error rate in a pass."
-msgstr ""
-
-#: ../../source/api_reference/metrics.rst:56
-msgid "DetectionMAP"
-msgstr ""
-
-#: of paddle.fluid.metrics.DetectionMAP:1
-msgid "Calculate the detection mean average precision (mAP)."
-msgstr ""
-
-#: of paddle.fluid.metrics.DetectionMAP:3
-msgid ""
-"TODO (Dang Qingqing): update the following doc. The general steps are as "
-"follows: 1. calculate the true positive and false positive according to "
-"the input"
-msgstr ""
-
-#: of paddle.fluid.metrics.DetectionMAP:6
-msgid "of detection and labels."
-msgstr ""
-
-#: of paddle.fluid.metrics.DetectionMAP:7
-msgid "calculate mAP value, support two versions: '11 point' and 'integral'."
-msgstr ""
-
-#: of paddle.fluid.metrics.DetectionMAP:10
-msgid "Please get more information from the following articles:"
-msgstr ""
-
-#: of paddle.fluid.metrics.DetectionMAP:10
-msgid ""
-"https://sanchom.wordpress.com/tag/average-precision/ "
-"https://arxiv.org/abs/1512.02325"
-msgstr ""
-
-#: ../../source/api_reference/metrics.rst:65
-msgid "Auc"
-msgstr ""
-
-#: of paddle.fluid.metrics.Auc:1
-msgid ""
-"Auc Metrics which adapts to binary classification. Need to note that auc "
-"metrics compute the value via Python natively. If you concern the speed, "
-"please use the fluid.layers.auc instead."
-msgstr ""
-
-#: of paddle.fluid.metrics.Auc:11
-msgid "The `auc` function creates four local variables, `true_positives`,"
-msgstr ""
-
-#: of paddle.fluid.metrics.Auc:6
-msgid ""
-"`true_negatives`, `false_positives` and `false_negatives` that are used "
-"to compute the AUC. To discretize the AUC curve, a linearly spaced set of"
-" thresholds is used to compute pairs of recall and precision values. The "
-"area under the ROC-curve is therefore computed using the height of the "
-"recall values by the false positive rate, while the area under the PR-"
-"curve is the computed using the height of the precision values by the "
-"recall."
-msgstr ""
-
-#: of paddle.fluid.metrics.Auc:13
-msgid "metric name"
-msgstr ""
-
-#: of paddle.fluid.metrics.Auc:14
-msgid ""
-"Specifies the name of the curve to be computed, 'ROC' [default] or 'PR' "
-"for the Precision-Recall-curve."
-msgstr ""
-
-#: of paddle.fluid.metrics.Auc:16
-msgid "The number of thresholds to use when discretizing the roc curve."
-msgstr ""
-
-#: of paddle.fluid.metrics.Auc:19
-msgid "\"NOTE: only implement the ROC curve type via Python now.\""
-msgstr ""
-
-#~ msgid "metrics"
-#~ msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/nets.po b/source/locale/en/LC_MESSAGES/api_reference/nets.po
deleted file mode 100644
index 608622c3b86cea232ca0f0c8db013b6d9bbf6b75..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/nets.po
+++ /dev/null
@@ -1,157 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/nets.rst:6
-msgid "fluid.nets"
-msgstr ""
-
-#: ../../source/api_reference/nets.rst:11
-msgid "simple_img_conv_pool"
-msgstr ""
-
-#: ../../source/api_reference/nets.rst:19
-msgid "sequence_conv_pool"
-msgstr ""
-
-#: ../../source/api_reference/nets.rst:27
-msgid "glu"
-msgstr ""
-
-#: of paddle.fluid.nets.glu:1
-msgid ""
-"The gated linear unit composed by split, sigmoid activation and "
-"elementwise multiplication. Specifically, Split the input into two equal "
-"sized parts :math:`a` and :math:`b` along the given dimension and then "
-"compute as following:"
-msgstr ""
-
-#: of paddle.fluid.nets.glu:10
-msgid ""
-"Refer to `Language Modeling with Gated Convolutional Networks "
-"<https://arxiv.org/pdf/1612.08083.pdf>`_."
-msgstr ""
-
-#: of paddle.fluid.nets.glu paddle.fluid.nets.scaled_dot_product_attention
-msgid "参数"
-msgstr ""
-
-#: of paddle.fluid.nets.glu:13
-msgid "The input variable which is a Tensor or LoDTensor."
-msgstr ""
-
-#: of paddle.fluid.nets.glu:15
-msgid ""
-"The dimension along which to split. If :math:`dim < 0`, the dimension to "
-"split along is :math:`rank(input) + dim`."
-msgstr ""
-
-#: of paddle.fluid.nets.glu paddle.fluid.nets.scaled_dot_product_attention
-msgid "返回"
-msgstr ""
-
-#: of paddle.fluid.nets.glu:19
-msgid "The Tensor variable with half the size of input."
-msgstr ""
-
-#: of paddle.fluid.nets.glu paddle.fluid.nets.scaled_dot_product_attention
-msgid "返回类型"
-msgstr ""
-
-#: of paddle.fluid.nets.glu:22
-#: paddle.fluid.nets.scaled_dot_product_attention:46
-msgid "Examples"
-msgstr ""
-
-#: ../../source/api_reference/nets.rst:35
-msgid "scaled_dot_product_attention"
-msgstr ""
-
-#: of paddle.fluid.nets.scaled_dot_product_attention:1
-msgid "The dot-product attention."
-msgstr ""
-
-#: of paddle.fluid.nets.scaled_dot_product_attention:3
-msgid ""
-"Attention mechanism can be seen as mapping a query and a set of key-value"
-" pairs to an output. The output is computed as a weighted sum of the "
-"values, where the weight assigned to each value is computed by a "
-"compatibility function (dot-product here) of the query with the "
-"corresponding key."
-msgstr ""
-
-#: of paddle.fluid.nets.scaled_dot_product_attention:8
-msgid ""
-"The dot-product attention can be implemented through (batch) matrix "
-"multipication as follows:"
-msgstr ""
-
-#: of paddle.fluid.nets.scaled_dot_product_attention:15
-msgid ""
-"Refer to `Attention Is All You Need "
-"<https://arxiv.org/pdf/1706.03762.pdf>`_."
-msgstr ""
-
-#: of paddle.fluid.nets.scaled_dot_product_attention:18
-#: paddle.fluid.nets.scaled_dot_product_attention:20
-#: paddle.fluid.nets.scaled_dot_product_attention:22
-msgid "The input variable which should be a 3-D Tensor."
-msgstr ""
-
-#: of paddle.fluid.nets.scaled_dot_product_attention:24
-msgid ""
-"Head number to compute the scaled dot product attention. Default value is"
-" 1."
-msgstr ""
-
-#: of paddle.fluid.nets.scaled_dot_product_attention:27
-msgid "The dropout rate to drop the attention weight. Default value is 0."
-msgstr ""
-
-#: of paddle.fluid.nets.scaled_dot_product_attention:31
-msgid ""
-"A 3-D Tensor computed by multi-head scaled dot product                   "
-"attention."
-msgstr ""
-
-#: of paddle.fluid.nets.scaled_dot_product_attention
-msgid "raises"
-msgstr ""
-
-#: of paddle.fluid.nets.scaled_dot_product_attention:34
-msgid ":exc:`ValueError` -- If input queries, keys, values are not 3-D Tensors."
-msgstr ""
-
-#: of paddle.fluid.nets.scaled_dot_product_attention:38
-msgid ""
-"1. When num_heads > 1, three linear projections are learned respectively "
-"to map input queries, keys and values into queries', keys' and values'. "
-"queries', keys' and values' have the same shapes with queries, keys and "
-"values."
-msgstr ""
-
-#: of paddle.fluid.nets.scaled_dot_product_attention:43
-msgid ""
-"1. When num_heads == 1, scaled_dot_product_attention has no learnable "
-"parameters."
-msgstr ""
-
-#~ msgid "nets"
-#~ msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/optimizer.po b/source/locale/en/LC_MESSAGES/api_reference/optimizer.po
deleted file mode 100644
index 7d5dceb78cb8cb1cda17b7d34da705dbb1bb0ec7..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/optimizer.po
+++ /dev/null
@@ -1,329 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/optimizer.rst:6
-msgid "fluid.optimizer"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:11
-msgid "SGD"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:20
-msgid "Momentum"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:29
-msgid "Adagrad"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:38
-msgid "Adam"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:47
-msgid "Adamax"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:56
-msgid "DecayedAdagrad"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:65
-msgid "SGDOptimizer"
-msgstr ""
-
-#: of paddle.fluid.optimizer.SGDOptimizer:1
-msgid "Simple SGD optimizer without any state."
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:74
-msgid "MomentumOptimizer"
-msgstr ""
-
-#: of paddle.fluid.optimizer.MomentumOptimizer:1
-msgid "Simple Momentum optimizer with velocity state"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:83
-msgid "AdagradOptimizer"
-msgstr ""
-
-#: of paddle.fluid.optimizer.AdagradOptimizer:1
-msgid "Simple Adagrad optimizer with moment state"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:92
-msgid "AdamOptimizer"
-msgstr ""
-
-#: of paddle.fluid.optimizer.AdamOptimizer:1
-msgid "Implements the Adam Optimizer"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:101
-msgid "AdamaxOptimizer"
-msgstr ""
-
-#: of paddle.fluid.optimizer.AdamaxOptimizer:1
-msgid "Implements the Adamax Optimizer"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:110
-msgid "DecayedAdagradOptimizer"
-msgstr ""
-
-#: of paddle.fluid.optimizer.DecayedAdagradOptimizer:1
-msgid "Simple Decayed Adagrad optimizer with moment state"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:119
-msgid "RMSPropOptimizer"
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:1
-msgid ""
-"Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive "
-"learning rate method. The original slides proposed RMSProp: Slide 29 of "
-"http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf ."
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:5
-msgid "The original equation is as follows:"
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:13
-msgid ""
-"The first equation calculates moving average of the squared gradient for "
-"each weight. Then dividing the gradient by :math: `sqrt{v(w,t)}`."
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:16
-msgid ""
-"In some cases, adding a momentum term :math: `\\beta` is beneficial. In "
-"our implementation, Nesterov momentum is used:"
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:28
-msgid ""
-"where, :math: `\\rho` is a hyperparameter and typical values are 0.9, "
-"0.95 and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is"
-" a smoothing term to avoid division by zero, usually set somewhere in "
-"range from 1e-4 to 1e-8."
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage
-#: paddle.fluid.optimizer.Optimizer.create_optimization_pass
-#: paddle.fluid.optimizer.RMSPropOptimizer
-msgid "参数"
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:34
-msgid "global leraning rate."
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:36
-msgid "rho is :math: `\\rho` in equation, set 0.95 by default."
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:38
-msgid ""
-":math: `\\epsilon` in equation is smoothing term to avoid division by "
-"zero, set 1e-6 by default."
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer
-msgid "math"
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:38
-msgid "`\\epsilon` in equation is smoothing term to"
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:39
-msgid "avoid division by zero, set 1e-6 by default."
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:41
-msgid ":math: `\\beta` in equation is the momentum term, set 0.0 by default."
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:41
-msgid "`\\beta` in equation is the momentum term,"
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:42
-msgid "set 0.0 by default."
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer
-msgid "raises"
-msgstr ""
-
-#: of paddle.fluid.optimizer.RMSPropOptimizer:45
-msgid ":exc:`ValueError` -- If learning_rate, rho, epsilon, momentum are None."
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:14
-#: paddle.fluid.optimizer.RMSPropOptimizer:47
-msgid "Examples"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:128
-msgid "Adadelta"
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:137
-msgid "ModelAverage"
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:1
-msgid ""
-"Accumulate the average of parameters whtin sliding window. The average "
-"result will be saved in temporary variables which can be applied to "
-"parameter variables of current model by calling 'apply()' method. And the"
-" 'restore()' method is used to restored the parameter values of current "
-"model."
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:6
-msgid ""
-"The size of average window is determined by average_window_rate, "
-"min_average_window, max_average_window and current update times."
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:9
-msgid "The rate of average window."
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:10
-msgid "A list of parameter-grad variable pairs."
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:11
-msgid "The minimum size of average window."
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:12
-msgid "The maximum size of average window."
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:16
-msgid ""
-"... optimizer = fluid.optimizer.Momentum() _, params_grads = "
-"optimizer.minimize(cost) model_average = "
-"fluid.optimizer.ModelAverage(params_grads, 0.15,"
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:20
-msgid "min_average_window=10000, max_average_window=20000)"
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:27
-msgid "for pass_id in range(args.pass_num):"
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:24
-msgid "for data in train_reader():"
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:24
-msgid "exe.run(fluid.default_main_program()...)"
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:27
-msgid "with model_average.apply(exe):"
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:27
-msgid "for data in test_reader():"
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage:28
-msgid "exe.run(inference_program...)"
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage.apply:1
-msgid "Apply average values to parameters of current model."
-msgstr ""
-
-#: of paddle.fluid.optimizer.ModelAverage.restore:1
-msgid "Restore parameter values of current model."
-msgstr ""
-
-#: ../../source/api_reference/optimizer.rst:146
-msgid "Optimizer"
-msgstr ""
-
-#: of paddle.fluid.optimizer.Optimizer:1
-msgid "Optimizer Base class."
-msgstr ""
-
-#: of paddle.fluid.optimizer.Optimizer:3
-msgid ""
-"Define the common interface of an optimizer. User should not use this "
-"class directly, but need to use one of it's implementation."
-msgstr ""
-
-#: of paddle.fluid.optimizer.Optimizer.create_optimization_pass:1
-msgid "Add optimization operators to update gradients to variables."
-msgstr ""
-
-#: of paddle.fluid.optimizer.Optimizer.create_optimization_pass:3
-msgid "the target that this optimization is for."
-msgstr ""
-
-#: of paddle.fluid.optimizer.Optimizer.create_optimization_pass:4
-msgid "a list of (variable, gradient) pair to update."
-msgstr ""
-
-#: of paddle.fluid.optimizer.Optimizer.create_optimization_pass
-msgid "返回"
-msgstr ""
-
-#: of paddle.fluid.optimizer.Optimizer.create_optimization_pass:6
-msgid ""
-"a list of operators that will complete one step of optimization. This "
-"will include parameter update ops, global step update ops and any other "
-"custom ops required by subclasses to manage their internal state. :param "
-"startup_program:"
-msgstr ""
-
-#: of paddle.fluid.optimizer.Optimizer.create_optimization_pass
-msgid "返回类型"
-msgstr ""
-
-#: of paddle.fluid.optimizer.Optimizer.global_learning_rate:1
-msgid "get global decayed learning rate :return:"
-msgstr ""
-
-#: of paddle.fluid.optimizer.Optimizer.minimize:1
-msgid "Add operations to minimize `loss` by updating `parameter_list`."
-msgstr ""
-
-#: of paddle.fluid.optimizer.Optimizer.minimize:3
-msgid ""
-"This method combines interface `append_backward()` and "
-"`create_optimization_pass()` into one."
-msgstr ""
-
-#~ msgid "optimizer"
-#~ msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/param_attr.po b/source/locale/en/LC_MESSAGES/api_reference/param_attr.po
deleted file mode 100644
index b437c0ec96093160fed8989a0218b2dba0e99f29..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/param_attr.po
+++ /dev/null
@@ -1,42 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/param_attr.rst:6
-msgid "fluid.param_attr"
-msgstr ""
-
-#: ../../source/api_reference/param_attr.rst:11
-msgid "ParamAttr"
-msgstr ""
-
-#: ../../source/api_reference/param_attr.rst:20
-msgid "WeightNormParamAttr"
-msgstr ""
-
-#: of paddle.fluid.param_attr.WeightNormParamAttr:1
-msgid ""
-"Used for weight normalization. Any field in ParamAttr can also be set "
-"here. Besides, an extra field dim can be set to indicate the dimension "
-"except which to normalize."
-msgstr ""
-
-#~ msgid "param_attr"
-#~ msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/profiler.po b/source/locale/en/LC_MESSAGES/api_reference/profiler.po
deleted file mode 100644
index 65759d75bf4fb411fcd8a60b838c602ce11052a3..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/profiler.po
+++ /dev/null
@@ -1,129 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/profiler.rst:6
-msgid "fluid.profiler"
-msgstr ""
-
-#: ../../source/api_reference/profiler.rst:11
-msgid "cuda_profiler"
-msgstr ""
-
-#: of paddle.fluid.profiler.cuda_profiler:1
-msgid ""
-"The CUDA profiler. This fuctions is used to profile CUDA program by CUDA "
-"runtime application programming interface. The profiling result will be "
-"written into `output_file` with Key-Value pair format or Comma separated "
-"values format. The user can set the output mode by `output_mode` argument"
-" and set the counters/options for profiling by `config` argument. The "
-"default config is ['gpustarttimestamp', 'gpustarttimestamp', "
-"'gridsize3d', 'threadblocksize', 'streamid', 'enableonstart 0', "
-"'conckerneltrace']."
-msgstr ""
-
-#: of paddle.fluid.profiler.cuda_profiler paddle.fluid.profiler.profiler
-#: paddle.fluid.profiler.start_profiler paddle.fluid.profiler.stop_profiler
-msgid "参数"
-msgstr ""
-
-#: of paddle.fluid.profiler.cuda_profiler:10
-msgid "The output file name, the result will be written into this file."
-msgstr ""
-
-#: of paddle.fluid.profiler.cuda_profiler:13
-msgid ""
-"The output mode has Key-Value pair format and Comma separated values "
-"format. It should be 'kvp' or 'csv'."
-msgstr ""
-
-#: of paddle.fluid.profiler.cuda_profiler:16
-msgid ""
-"The profiler options and counters can refer to \"Compute Command Line "
-"Profiler User Guide\"."
-msgstr ""
-
-#: ../../source/api_reference/profiler.rst:19
-msgid "reset_profiler"
-msgstr ""
-
-#: of paddle.fluid.profiler.reset_profiler:1
-msgid ""
-"The profiler clear interface. reset_profiler will clear the previous time"
-" record."
-msgstr ""
-
-#: ../../source/api_reference/profiler.rst:27
-msgid "profiler"
-msgstr ""
-
-#: of paddle.fluid.profiler.profiler:1
-msgid ""
-"The profiler interface. Different from cuda_profiler, this profiler can "
-"be used to profile both CPU and GPU program. By defalut, it records the "
-"CPU and GPU operator kernels, if you want to profile other program, you "
-"can refer the profiling tutorial to add more records."
-msgstr ""
-
-#: of paddle.fluid.profiler.profiler:7
-msgid ""
-"The profiling state, which should be 'CPU' or 'GPU', telling the profiler"
-" to use CPU timer or GPU timer for profiling. Although users may have "
-"already specified the execution place (CPUPlace/CUDAPlace) in the "
-"begining, for flexibility the profiler would not inherit this place."
-msgstr ""
-
-#: of paddle.fluid.profiler.profiler:13 paddle.fluid.profiler.stop_profiler:3
-msgid ""
-"If None, the profiling results will be printed in the order of first end "
-"time of events. Otherwise, the profiling results will be sorted by the "
-"this flag. This flag should be one of 'calls', 'total', 'max', 'min' or "
-"'ave'. The `calls` means sorting by the number of calls. The `total` "
-"means sorting by the total execution time. The `max` means sorting by the"
-" maximum execution time. The `min` means sorting by the minimum execution"
-" time. The `ave` means sorting by the average execution time."
-msgstr ""
-
-#: of paddle.fluid.profiler.profiler:23 paddle.fluid.profiler.stop_profiler:13
-msgid "If state == 'All', it will write a profile proto output file."
-msgstr ""
-
-#: ../../source/api_reference/profiler.rst:35
-msgid "start_profiler"
-msgstr ""
-
-#: of paddle.fluid.profiler.start_profiler:1
-msgid "Enable the profiler."
-msgstr ""
-
-#: of paddle.fluid.profiler.start_profiler:3
-msgid ""
-"The profiling state, which should be 'CPU', 'GPU' or 'All'. 'CPU' means "
-"only profile CPU. 'GPU' means profiling GPU as well. 'All' also generates"
-" timeline."
-msgstr ""
-
-#: ../../source/api_reference/profiler.rst:43
-msgid "stop_profiler"
-msgstr ""
-
-#: of paddle.fluid.profiler.stop_profiler:1
-msgid "Stop the profiler."
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/api_reference/regularizer.po b/source/locale/en/LC_MESSAGES/api_reference/regularizer.po
deleted file mode 100644
index dae595231b78aee08f9077fd6e9cc3b07da50013..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/api_reference/regularizer.po
+++ /dev/null
@@ -1,114 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/api_reference/regularizer.rst:6
-msgid "fluid.regularizer"
-msgstr ""
-
-#: ../../source/api_reference/regularizer.rst:11
-msgid "append_regularization_ops"
-msgstr ""
-
-#: of paddle.fluid.regularizer.append_regularization_ops:1
-msgid "Create and add backward regularization Operators"
-msgstr ""
-
-#: of paddle.fluid.regularizer.append_regularization_ops:3
-msgid ""
-"Creates and adds backward regularization operators in the BlockDesc. This"
-" will add gradients of the regularizer function to the gradients of the "
-"parameters and return these modified gradients. This is the same as "
-"implementing weight decay in optimizers for regularization."
-msgstr ""
-
-#: of paddle.fluid.regularizer.append_regularization_ops
-msgid "参数"
-msgstr ""
-
-#: of paddle.fluid.regularizer.append_regularization_ops:8
-msgid "A list of (parameters, gradients) pairs that need to be regularized."
-msgstr ""
-
-#: of paddle.fluid.regularizer.append_regularization_ops:10
-msgid ""
-"A global regularizer. If the parameter is not set. It will be applied "
-"with regularizer."
-msgstr ""
-
-#: of paddle.fluid.regularizer.append_regularization_ops
-msgid "返回"
-msgstr ""
-
-#: of paddle.fluid.regularizer.append_regularization_ops:13
-msgid "list of (parameters, gradients) pair with the regularized gradient"
-msgstr ""
-
-#: of paddle.fluid.regularizer.append_regularization_ops
-msgid "raises"
-msgstr ""
-
-#: of paddle.fluid.regularizer.append_regularization_ops:15
-msgid ":exc:`Exception` -- Unknown regularization type"
-msgstr ""
-
-#: ../../source/api_reference/regularizer.rst:19
-msgid "WeightDecayRegularizer"
-msgstr ""
-
-#: of paddle.fluid.regularizer.WeightDecayRegularizer:1
-msgid "Base class for weight decay regularizers"
-msgstr ""
-
-#: of paddle.fluid.regularizer.WeightDecayRegularizer:3
-msgid ""
-"Defines the common interface of weight-decay regularizers. Weight-decay "
-"regularizers are added only during the backward pass for faster "
-"regularization. They add operations to the network that correspond to "
-"gradient of the regularization function. Users should not use this class "
-"directly, but need to use one of its implementations"
-msgstr ""
-
-#: ../../source/api_reference/regularizer.rst:28
-msgid "L1Decay"
-msgstr ""
-
-#: ../../source/api_reference/regularizer.rst:37
-msgid "L2Decay"
-msgstr ""
-
-#: ../../source/api_reference/regularizer.rst:46
-msgid "L1DecayRegularizer"
-msgstr ""
-
-#: of paddle.fluid.regularizer.L1DecayRegularizer:1
-msgid "Implements the L1 Weight Decay Regularization"
-msgstr ""
-
-#: ../../source/api_reference/regularizer.rst:55
-msgid "L2DecayRegularizer"
-msgstr ""
-
-#: of paddle.fluid.regularizer.L2DecayRegularizer:1
-msgid "Implements the L2 Weight Decay Regularization"
-msgstr ""
-
-#~ msgid "regularizer"
-#~ msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/faq.po b/source/locale/en/LC_MESSAGES/faq.po
deleted file mode 100644
index fbc933005b94175942aecf1afad3661fdb6d1493..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/faq.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/faq.rst:3
-msgid "FAQ"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/index.po b/source/locale/en/LC_MESSAGES/index.po
deleted file mode 100644
index 06ace94f6a47defdf53ae107c730b8ba62a0998f..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/index.po
+++ /dev/null
@@ -1,40 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: 2018-06-19 16:12+0800\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-"Last-Translator: \n"
-"Language-Team: \n"
-"Language: en\n"
-"X-Generator: Poedit 2.0.8\n"
-
-#: ../../source/index.rst:8
-msgid "欢迎来到 Fluid"
-msgstr "Welcome to Fluid documentation"
-
-#: ../../source/index.rst:25
-msgid "Indices and tables"
-msgstr ""
-
-#: ../../source/index.rst:27
-msgid ":ref:`genindex`"
-msgstr ""
-
-#: ../../source/index.rst:28
-msgid ":ref:`modindex`"
-msgstr ""
-
-#: ../../source/index.rst:29
-msgid ":ref:`search`"
-msgstr ""
diff --git a/source/locale/en/LC_MESSAGES/quick_start/fit_a_line/index.po b/source/locale/en/LC_MESSAGES/quick_start/fit_a_line/index.po
deleted file mode 100644
index 1b00a65082cbccc33a075da2d953a1bc992ed117..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/quick_start/fit_a_line/index.po
+++ /dev/null
@@ -1,291 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/quick_start/fit_a_line/index.md:1
-msgid "线性回归"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:2
-msgid ""
-"让我们从经典的线性回归（Linear Regression "
-"[1]）模型开始这份教程。在这一章里，你将使用真实的数据集建立起一个房价预测模型，并且了解到机器学习中的若干重要概念。"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:4
-msgid "本教程源代码目录在book/fit_a_line， 初次使用请参考PaddlePaddle安装教程，更多内容请参考本教程的视频课堂。"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:6
-msgid "背景介绍"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:7
-msgid ""
-"给定一个大小为$n$的数据集  ${{y_{i}, x_{i1}, ..., x_{id}}}{i=1}^{n}$，其中$x{i1}, "
-"\\ldots, "
-"x_{id}$是第$i$个样本$d$个属性上的取值，$y_i$是该样本待预测的目标。线性回归模型假设目标$y_i$可以被属性间的线性组合描述，即"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:9
-msgid ""
-"$$y_i = \\omega_1x_{i1} + \\omega_2x_{i2} + \\ldots + \\omega_dx_{id} + "
-"b,  i=1,\\ldots,n$$"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:11
-msgid ""
-"例如，在我们将要建模的房价预测问题里，$x_{ij}$是描述房子$i$的各种属性（比如房间的个数、周围学校和医院的个数、交通状况等），而 "
-"$y_i$是房屋的价格。"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:13
-msgid "初看起来，这个假设实在过于简单了，变量间的真实关系很难是线性的。但由于线性回归模型有形式简单和易于建模分析的优点，它在实际问题中得到了大量的应用。很多经典的统计学习、机器学习书籍[2,3,4]也选择对线性模型独立成章重点讲解。"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:15
-msgid "效果展示"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:16
-msgid ""
-"我们使用从UCI Housing Data "
-"Set获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中，每个点的横坐标表示同一类房屋真实价格的中位数，纵坐标表示线性回归模型根据特征预测的结果，当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确，则点离虚线越近。"
-" alt"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:22
-msgid "模型概览"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:24
-msgid "模型定义"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:26
-msgid ""
-"在波士顿房价数据集中，和房屋相关的值共有14个：前13个用来描述房屋相关的各种信息，即模型中的 "
-"$x_i$；最后一个值为我们要预测的该类房屋价格的中位数，即模型中的 $y_i$。因此，我们的模型就可以表示成："
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:28
-msgid ""
-"$$\\hat{Y} = \\omega_1X_{1} + \\omega_2X_{2} + \\ldots + "
-"\\omega_{13}X_{13} + b$$"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:30
-msgid ""
-"$\\hat{Y}$ 表示模型的预测结果，用来和真实值$Y$区分。模型要学习的参数即：$\\omega_1, \\ldots, "
-"\\omega_{13}, b$。"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:32
-msgid ""
-"建立模型后，我们需要给模型一个优化目标，使得学到的参数能够让预测值$\\hat{Y}$尽可能地接近真实值$Y$。这里我们引入损失函数（Loss "
-"Function，或Cost Function）这个概念。 "
-"输入任意一个数据样本的目标值$y_{i}$和模型给出的预测值$\\hat{y_{i}}$，损失函数输出一个非负的实值。这个实值通常用来反映模型误差的大小。"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:34
-msgid "对于线性回归模型来讲，最常见的损失函数就是均方误差（Mean Squared Error， MSE）了，它的形式是："
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:36
-msgid "$$MSE=\\frac{1}{n}\\sum_{i=1}^{n}{(\\hat{Y_i}-Y_i)}^2$$"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:38
-msgid "即对于一个大小为$n$的测试集，$MSE$是$n$个数据预测结果误差平方的均值。"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:40
-msgid "训练过程"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:42
-msgid "定义好模型结构之后，我们要通过以下几个步骤进行模型训练"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:48
-msgid "数据集"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:50
-msgid "数据集接口的封装"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:51
-msgid "首先加载需要的包"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:58
-msgid "我们通过uci_housing模块引入了数据集合UCI Housing Data Set"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:60
-msgid "其中，在uci_housing模块中封装了："
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:66
-msgid "数据集介绍"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:67
-msgid "这份数据集共506行，每行包含了波士顿郊区的一类房屋的相关信息及该类房屋价格的中位数。其各维属性的意义如下："
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:151
-msgid "数据预处理"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:152
-msgid "连续值与离散值"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:153
-msgid "观察一下数据，我们的第一个发现是：所有的13维属性中，有12维的连续值和1维的离散值（CHAS）。离散值虽然也常使用类似0、1、2这样的数字表示，但是其含义与连续值是不同的，因为这里的差值没有实际意义。例如，我们用0、1、2来分别表示红色、绿色和蓝色的话，我们并不能因此说“蓝色和红色”比“绿色和红色”的距离更远。所以通常对一个有$d$个可能取值的离散属性，我们会将它们转为$d$个取值为0或1的二值属性或者将每个可能取值映射为一个多维向量。不过就这里而言，因为CHAS本身就是一个二值属性，就省去了这个麻烦。"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:155
-msgid "属性的归一化"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:156
-msgid ""
-"另外一个稍加观察即可发现的事实是，各维属性的取值范围差别很大（如图2所示）。例如，属性B的取值范围是[0.32, "
-"396.90]，而属性NOX的取值范围是[0.3850, "
-"0.8170]。这里就要用到一个常见的操作-归一化（normalization）了。归一化的目标是把各位属性的取值范围放缩到差不多的区间，例如[-0.5,0.5]。这里我们使用一种很常见的操作方法：减掉均值，然后除以原取值范围。"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:158
-msgid "做归一化（或 Feature scaling）至少有以下3个理由："
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:163
-msgid "alt"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:168
-msgid "整理训练集与测试集"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:169
-msgid "我们将数据集分割为两份：一份用于调整模型的参数，即进行模型的训练，模型在这份数据集上的误差被称为训练误差；另外一份被用来测试，模型在这份数据集上的误差被称为测试误差。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据，所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素：更多的训练数据会降低参数估计的方差，从而得到更可信的模型；而更多的测试数据会降低测试误差的方差，从而得到更可信的测试误差。我们这个例子中设置的分割比例为$8:2$"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:172
-msgid "在更复杂的模型训练过程中，我们往往还会多使用一种数据集：验证集。因为复杂的模型中常常还有一些超参数（Hyperparameter）需要调节，所以我们会尝试多种超参数的组合来分别训练多个模型，然后对比它们在验证集上的表现选择相对最好的一组超参数，最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单，我们暂且忽略掉这个过程。"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:174
-msgid "训练"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:176
-msgid "fit_a_line/trainer.py演示了训练的整体过程。"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:178
-msgid "初始化PaddlePaddle"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:184
-msgid "模型配置"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:186
-msgid ""
-"线性回归的模型其实就是一个采用线性激活函数（linear activation，LinearActivation）的全连接层（fully-"
-"connected layer，fc_layer）："
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:197
-msgid "保存网络拓扑"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:206
-msgid "创建参数"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:212
-msgid "创建Trainer"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:222
-msgid "读取数据且打印训练的中间信息"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:224
-msgid ""
-"PaddlePaddle提供一个 reader机制 来读取数据。 Reader返回的数据可以包括多列，我们需要一个Python dict把列 "
-"序号映射到网络里的数据层。"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:233
-msgid "此外，我们还可以提供一个 event handler，来打印训练的进度："
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:285
-msgid "开始训练"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:298
-msgid "png"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:300
-msgid "应用模型"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:302
-msgid "1. 生成测试数据"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:316
-msgid "2. 推测 inference"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:331
-msgid "总结"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:332
-msgid "在这章里，我们借助波士顿房价这一数据集，介绍了线性回归模型的基本概念，以及如何使用PaddlePaddle实现训练和测试的过程。很多的模型和技巧都是从简单的线性回归模型演化而来，因此弄清楚线性模型的原理和局限非常重要。"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:335
-msgid "参考文献"
-msgstr ""
-
-#: ../../source/quick_start/fit_a_line/index.md:341
-msgid ""
-"<br/> <a rel=\"license\" href=\"http://creativecommons.org/licenses/by-"
-"sa/4.0/\"><img alt=\"知识共享许可协议\" style=\"border-width:0\" "
-"src=\"https://i.creativecommons.org/l/by-sa/4.0/88x31.png\" /></a><br "
-"/><span xmlns:dct=\"http://purl.org/dc/terms/\" "
-"href=\"http://purl.org/dc/dcmitype/Text\" property=\"dct:title\" "
-"rel=\"dct:type\">本教程</span> 由 <a "
-"xmlns:cc=\"http://creativecommons.org/ns#\" "
-"href=\"http://book.paddlepaddle.org\" property=\"cc:attributionName\" "
-"rel=\"cc:attributionURL\">PaddlePaddle</a> 创作，采用 <a rel=\"license\" "
-"href=\"http://creativecommons.org/licenses/by-sa/4.0/\">知识共享 署名-相同方式共享 "
-"4.0 国际 许可协议</a>进行许可。"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/quick_start/index.po b/source/locale/en/LC_MESSAGES/quick_start/index.po
deleted file mode 100644
index cfc97c167e85daf9762925e979f5752687497361..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/quick_start/index.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/quick_start/index.rst:3
-msgid "新手入门"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/quick_start/install/build_from_source_cn.po b/source/locale/en/LC_MESSAGES/quick_start/install/build_from_source_cn.po
deleted file mode 100644
index 77ea95aff360e9d161776295935708b6fa3e235d..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/quick_start/install/build_from_source_cn.po
+++ /dev/null
@@ -1,485 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:2
-msgid "从源码编译"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:7
-msgid "需要的软硬件"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:9
-msgid "为了编译PaddlePaddle，我们需要"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:11
-msgid "一台电脑，可以装的是 Linux, Windows 或者 MacOS 操作系统"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:12
-msgid "Docker"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:14
-msgid "不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker 镜像里。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:19
-msgid "编译方法"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:21
-msgid ""
-"PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像 可以在 `这里 "
-"<https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`__ "
-"找到，您也可以 在 `这里 "
-"<https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__"
-" 找到 paddle_manylinux_devel "
-"镜像的编译以及使用方法。或者参考下述可选步骤，从源码中构建用于编译PaddlePaddle的Docker镜像。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:26
-msgid "如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 :ref:`编译依赖 <_compile_deps>` 之后才能开始编译的步骤。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:28
-msgid "编译PaddlePaddle，需要执行："
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:42
-msgid "注：上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:44
-msgid "编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装："
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:50
-msgid "如果机器中已经安装过PaddlePaddle，有两种方法："
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:64
-msgid "执行单元测试"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:66
-msgid "如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法："
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:68
-msgid ""
-"设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。 开启 "
-":code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:75
-msgid "如果期望执行其中一个单元测试，（比如 :code:`test_sum_op` ）："
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:87
-msgid "常见问题"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:89
-msgid "什么是 Docker?"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:91
-msgid "如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:93
-msgid "Docker 还是虚拟机？"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:95
-msgid ""
-"有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container "
-"里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:97
-msgid "为什么用 Docker?"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:99
-msgid "把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:101
-msgid "另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:103
-msgid "我可以选择不用Docker吗？"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:105
-msgid ""
-"当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker "
-"的开发流程，是因为这个流程比其他方法都更简便。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:107
-msgid "学习 Docker 有多难？"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:109
-msgid ""
-"理解 Docker 并不难，大概花十分钟看一下 `如何使用Docker "
-"<https://zhuanlan.zhihu.com/p/19902938>`_ "
-"。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle "
-"更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:111
-msgid "我可以用 IDE 吗？"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:113
-msgid "当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:115
-msgid "很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:122
-msgid "就可以按 `Ctrl-C` 和 `c` 键来启动编译了。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:124
-msgid "可以并行编译吗？"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:126
-msgid ""
-"是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 "
-"<https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_"
-" 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:128
-msgid "Docker 需要 sudo"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:130
-msgid ""
-"如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 "
-"Docker。此外，PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术，比如 rkt。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:132
-msgid "在 Windows/MacOS 上编译很慢"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:134
-msgid ""
-"Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 "
-"CPU 和内存，以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 "
-"<https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:136
-msgid "磁盘不够"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:138
-msgid ""
-"本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用"
-" `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` "
-"命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `如何删除Docker Container "
-"<https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:144
-msgid "附录：编译依赖"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:146
-msgid "PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:148
-msgid "PaddlePaddle编译依赖"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "依赖"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "版本"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "说明"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "CMake"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid ">=3.2"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "GCC"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "4.8.2"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "推荐使用CentOS的devtools2"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "Python"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "2.7.x"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "依赖libpython2.7.so"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "pip"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid ">=9.0"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "numpy"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "SWIG"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid ">=2.0"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "Go"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid ">=1.8"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "可选"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:164
-msgid "附录：编译选项"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:166
-msgid ""
-"PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。 "
-"用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-"
-"tutorial>`_ 。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:170
-msgid "在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如："
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:176
-msgid "编译选项说明"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "选项"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "默认值"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "WITH_GPU"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "是否支持GPU"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "ON"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "WITH_C_API"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "是否仅编译CAPI"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "OFF"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "WITH_DOUBLE"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "是否使用双精度浮点数"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "WITH_DSO"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "WITH_AVX"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "是否编译含有AVX指令集的PaddlePaddle二进制文件"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "WITH_PYTHON"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "是否内嵌PYTHON解释器"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "WITH_STYLE_CHECK"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "是否编译时进行代码风格检查"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "WITH_TESTING"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "是否开启单元测试"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "WITH_DOC"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "是否编译中英文文档"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "WITH_SWIG_PY"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "Auto"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "WITH_GOLANG"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "是否编译go语言的可容错parameter server"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "WITH_MKL"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:1
-msgid "是否使用MKL数学库，如果为否则是用OpenBLAS"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:194
-msgid "BLAS"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:196
-msgid ""
-"PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和 "
-"`OpenBlAS <http://www.openblas.net/>`_ "
-"两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集， 还会下载MKL-DNN数学库，详细参考 `mkldnn设计文档 "
-"<https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_"
-" 。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:200
-msgid "如果关闭MKL，则会使用OpenBLAS作为BLAS库。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:203
-msgid "CUDA/cuDNN"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:205
-msgid ""
-"PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。 使用参数 "
-":code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构，加速编译。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:208
-msgid ""
-"PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cuDNN是同一个版本。 "
-"我们推荐使用最新版本的cuDNN。"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:212
-msgid "编译选项的设置"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:214
-msgid ""
-"PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ "
-":code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` "
-"命令可以设置，例如"
-msgstr ""
-
-#: ../../source/quick_start/install/build_from_source_cn.rst:220
-msgid ""
-"**注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（** :code:`rm -rf` "
-"）**后，再指定。**"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/quick_start/install/docker_install_cn.po b/source/locale/en/LC_MESSAGES/quick_start/install/docker_install_cn.po
deleted file mode 100644
index f3aef1964e4533cd92f48d1be9004ec8bd0846cc..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/quick_start/install/docker_install_cn.po
+++ /dev/null
@@ -1,158 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/quick_start/install/docker_install_cn.rst:2
-msgid "使用Docker安装运行"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:4
-msgid ""
-"使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。 您可以在 "
-"`Docker官网 <https://docs.docker.com/get-started/>`_ 获得基本的Docker安装和使用方法。"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:7
-msgid ""
-"如果您在使用Windows，可以参考 `这篇 "
-"<https://docs.docker.com/toolbox/toolbox_install_windows/>`_ "
-"教程，完成在Windows上安装和使用Docker。"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:11
-msgid "在了解Docker的基本使用方法之后，即可开始下面的步骤："
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:16
-msgid "获取PaddlePaddle的Docker镜像"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:18
-msgid "执行下面的命令获取最新的PaddlePaddle Docker镜像，版本为cpu_avx_mkl："
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:24
-msgid "对于国内用户，我们提供了加速访问的镜像源："
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:30
-msgid "下载GPU版本（cuda8.0_cudnn5_avx_mkl）的Docker镜像："
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:37
-msgid "选择下载使用不同的BLAS库的Docker镜像："
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:46
-msgid ""
-"下载指定版本的Docker镜像，可以从 `DockerHub网站 "
-"<https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 获取可选的tag，并执行下面的命令："
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:57
-msgid "在Docker中执行PaddlePaddle训练程序"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:59
-msgid ""
-"假设您已经在当前目录（比如在/home/work）编写了一个PaddlePaddle的程序 :code:`train.py` （可以参考 "
-"`PaddlePaddleBook "
-"<http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_"
-" 编写），就可以使用下面的命令开始执行训练："
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:68
-msgid ""
-"上述命令中， :code:`-it` 参数说明容器已交互式运行； :code:`-v $PWD:/work` "
-"指定将当前路径（Linux中$PWD变量会展开为当前路径的绝对路径）挂载到容器内部的 :code:`/work` 目录； "
-":code:`paddlepaddle/paddle` 指定需要使用的容器； 最后 :code:`/work/train.py` "
-"为容器内执行的命令，即运行训练程序。"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:73
-msgid "当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码："
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:81
-msgid ""
-"**注：PaddlePaddle Docker镜像为了减小体积，默认没有安装vim，您可以在容器中执行** :code:`apt-get "
-"install -y vim` **安装后，在容器中编辑代码。**"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:86
-msgid "使用Docker启动PaddlePaddle Book教程"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:88
-msgid ""
-"使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook，可以通过网页浏览。 "
-"PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 如果您想要更深入了解deep "
-"learning，PaddlePaddle Book一定是您最好的选择。 "
-"大家可以通过它阅读教程，或者制作和分享带有代码、公式、图表、文字的交互式文档。"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:93
-msgid "我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行："
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:99
-msgid "国内用户可以使用下面的镜像源来加速访问："
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:105
-msgid "然后在浏览器中输入以下网址："
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:111
-msgid "就这么简单，享受您的旅程！"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:116
-msgid "使用Docker执行GPU训练"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:118
-msgid ""
-"为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用 `nvidia-docker <https://github.com/NVIDIA"
-"/nvidia-docker>`_ 来运行镜像。 请不要忘记提前在物理机上安装GPU最新驱动。"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:126
-msgid "**注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:134
-msgid "**关于AVX：**"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:136
-msgid ""
-"AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认 "
-"是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独 `编译 <./build_from_source_cn.html>`_ "
-"PaddlePaddle为no-avx版本。"
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:140
-msgid "以下指令能检查Linux电脑是否支持AVX："
-msgstr ""
-
-#: ../../source/quick_start/install/docker_install_cn.rst:146
-msgid "如果输出是No，就需要选择使用no-AVX的镜像"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/quick_start/install/index.po b/source/locale/en/LC_MESSAGES/quick_start/install/index.po
deleted file mode 100644
index c23962cb04a9d75b16ad811b1000aa5908532ffe..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/quick_start/install/index.po
+++ /dev/null
@@ -1,88 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/quick_start/install/index.rst:2
-msgid "安装与编译"
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:6
-msgid "PaddlePaddle针对不同的用户群体提供了多种安装方式。"
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:9
-msgid "专注深度学习模型开发"
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:11
-msgid "PaddlePaddle提供了多种python wheel包，可通过pip一键安装："
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:18
-msgid "这是最便捷的安装方式，请根据机器配置和系统选择对应的安装包。"
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:21
-msgid "关注底层框架"
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:23
-msgid "PaddlePaddle提供了基于Docker的安装方式，请参照以下教程："
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:30
-msgid "我们推荐在Docker中运行PaddlePaddle，该方式具有以下优势："
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:32
-msgid "无需单独安装第三方依赖"
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:33
-msgid "方便分享运行时环境，易于问题的复现"
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:35
-msgid "对于有定制化二进制文件需求的用户，我们同样提供了从源码编译安装PaddlePaddle的方法："
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:44
-msgid "需要提醒的是，这种安装方式会涉及到一些第三方库的下载、编译及安装，整个安装过程耗时较长。"
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:48
-msgid "常见问题汇总"
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:50
-msgid "如果在安装过程中遇到了问题，请先尝试在下面的页面寻找答案："
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:52
-msgid ":ref:`常见问题解答 <install_faq>`"
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:54
-msgid "如果问题没有得到解决，欢迎向PaddlePaddle社区反馈问题："
-msgstr ""
-
-#: ../../source/quick_start/install/index.rst:56
-msgid "`创建issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/quick_start/install/pip_install_cn.po b/source/locale/en/LC_MESSAGES/quick_start/install/pip_install_cn.po
deleted file mode 100644
index a3a14016e80c15bcebaacb9ae0ce27f6c5b6310e..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/quick_start/install/pip_install_cn.po
+++ /dev/null
@@ -1,332 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/quick_start/install/pip_install_cn.rst:2
-#: ../../source/quick_start/install/pip_install_cn.rst:11
-msgid "使用pip安装"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:4
-msgid ""
-"PaddlePaddle可以使用常用的Python包管理工具 `pip "
-"<https://pip.pypa.io/en/stable/installing/>`_ "
-"完成安装，并可以在大多数主流的Linux操作系统以及MacOS上执行。"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:13
-msgid "执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:19
-msgid "当前的默认版本为0.12.0，cpu_avx_openblas，您可以通过指定版本号来安装其它版本，例如:"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:26
-msgid "如果需要安装支持GPU的版本（cuda8.0_cudnn5_avx_openblas），需要执行："
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:32
-msgid "当前的默认版本也是0.12.0，PaddlePaddle针对不同需求提供了更多版本的安装包，部分列表如下："
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:35
-msgid "版本号"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-#: ../../source/quick_start/install/pip_install_cn.rst:35
-msgid "版本说明"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:37
-msgid "paddlepaddle-gpu==0.12.0"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:37
-msgid "使用CUDA 8.0和cuDNN 5编译的0.12.0版本"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:38
-msgid "paddlepaddle-gpu==0.11.0.post87"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:38
-msgid "使用CUDA 8.0和cuDNN 7编译的0.11.0版本"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:39
-msgid "paddlepaddle-gpu==0.11.0.post8"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:39
-msgid "使用CUDA 8.0和cuDNN 5编译的0.11.0版本"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:40
-msgid "paddlepaddle-gpu==0.11.0"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:40
-msgid "使用CUDA 7.5和cuDNN 5编译的0.11.0版本"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:43
-msgid ""
-"您可以在 `Release History <https://pypi.org/project/paddlepaddle-"
-"gpu/#history>`_ 中找到paddlepaddle-gpu的各个发行版本。"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:45
-msgid ""
-"如果需要获取并安装最新的（开发分支）PaddlePaddle，可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装， "
-"您可以从下面的表格中找到需要的版本："
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:48
-msgid "如果在点击下面链接时出现如下登陆界面，点击“Log in as guest”即可开始下载："
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:54
-msgid "各个版本最新的whl包"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "cp27-cp27mu"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "cp27-cp27m"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "cpu_avx_mkl"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid ""
-"`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl "
-"<https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful"
-"/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid ""
-"`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl "
-"<https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful"
-"/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "cpu_avx_openblas"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid ""
-"`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl "
-"<https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful"
-"/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid ""
-"`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl "
-"<https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful"
-"/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "cpu_noavx_openblas"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid ""
-"`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl "
-"<https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful"
-"/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid ""
-"`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl "
-"<https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful"
-"/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "cuda8.0_cudnn5_avx_mkl"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid ""
-"`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl "
-"<https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful"
-"/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid ""
-"`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl "
-"<https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful"
-"/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "cuda8.0_cudnn7_avx_mkl"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid ""
-"`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl "
-"<https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful"
-"/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid ""
-"`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl "
-"<https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful"
-"/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:67
-msgid "运行环境依赖"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:69
-msgid ""
-"PaddlePaddle安装包由于不仅仅包含.py程序，而且包含了C++编写的部分，所以我们确保发布的二进制包可以支持主流的Linux操作系统，比如CentOS"
-" 6以上，Ubuntu 14.04以上，MacOS 10.12以上。"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:71
-msgid ""
-"PaddlePaddle发布的安装包会尽量对齐 `manylinux1 "
-"<https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_ "
-"标准，通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上，而且CentOS "
-"5即将停止维护，所以我们默认使用CentOS 6作为标准编译环境。"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:73
-msgid "PaddlePaddle环境依赖"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "依赖"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "版本"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "说明"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "操作系统"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "Linux, MacOS"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "Python"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "2.7.x"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "暂时不支持Python3"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "libc.so"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "GLIBC_2.7"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "glibc至少包含GLIBC_2.7以上的符号"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "libstdc++.so"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "GLIBCXX_3.4.11, CXXABI_1.3.3"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "libgcc_s.so"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "GCC_3.3"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:1
-msgid "至少包含GCC_3.3以上的符号"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:86
-msgid "安装常见问题和解决方法"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:88
-msgid "paddlepaddle*.whl is not a supported wheel on this platform."
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:90
-msgid ""
-"出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，需要使用最新的pip"
-" (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip："
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:96
-msgid "如果仍然存在问题，可以执行："
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:102
-msgid ""
-"获取当前系统支持的安装包格式，并检查和需安装的包是否匹配。pypi安装包可以在 `这个 "
-"<https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 链接中找到。"
-msgstr ""
-
-#: ../../source/quick_start/install/pip_install_cn.rst:104
-msgid ""
-"如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新； 如果系统支持 "
-"manylinux1_x86_64 而安装包（本地）是 linux_x86_64 ，可以重命名这个whl包为 manylinux1_x86_64 "
-"再安装。"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/quick_start/quick_start.po b/source/locale/en/LC_MESSAGES/quick_start/quick_start.po
deleted file mode 100644
index 3b4cded3d8628b6ff1bf09d399664484155af1f3..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/quick_start/quick_start.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/quick_start/quick_start.rst:3
-msgid "快速入门"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/quick_start/recognize_digits/index.po b/source/locale/en/LC_MESSAGES/quick_start/recognize_digits/index.po
deleted file mode 100644
index 781aa37d0ca7e81f267d55879fc5895fa486332a..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/quick_start/recognize_digits/index.po
+++ /dev/null
@@ -1,305 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-15 16:34+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/quick_start/recognize_digits/index.md:1
-msgid "识别数字"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:3
-msgid "本教程源代码目录在book/recognize_digits， 初次使用请参考PaddlePaddle安装教程，更多内容请参考本教程的视频课堂。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:5
-msgid "背景介绍"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:6
-msgid ""
-"当我们学习编程的时候，编写的第一个程序一般是实现打印\"Hello World\"。而机器学习（或深度学习）的入门教程，一般都是 MNIST "
-"数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题，比较简单，同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集，包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵，标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:9
-#: ../../source/quick_start/recognize_digits/index.md:49
-#: ../../source/quick_start/recognize_digits/index.md:65
-#: ../../source/quick_start/recognize_digits/index.md:74
-#: ../../source/quick_start/recognize_digits/index.md:83
-#: ../../source/quick_start/recognize_digits/index.md:100
-msgid "alt"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:15
-msgid ""
-"MNIST数据集是从 NIST 的Special Database 3（SD-3）和Special Database "
-"1（SD-1）构建而来。由于SD-3是由美国人口调查局的员工进行标注，SD-1是由美国高中生进行标注，因此SD-3比SD-1更干净也更容易识别。Yann"
-" "
-"LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集（60000条数据）和测试集（10000条数据），其中训练集来自250位不同的标注员，此外还保证了训练集和测试集的标注员是不完全相同的。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:17
-msgid ""
-"Yann LeCun早先在手写字符识别上做了很多研究，并在研究过程中提出了卷积神经网络（Convolutional Neural "
-"Network），大幅度地提高了手写字符的识别能力，也因此成为了深度学习领域的奠基人之一。如今的深度学习领域，卷积神经网络占据了至关重要的地位，从最早Yann"
-" LeCun提出的简单LeNet，到如今ImageNet大赛上的优胜模型VGGNet、GoogLeNet、ResNet等（请参见图像分类 "
-"教程），人们在图像分类领域，利用卷积神经网络得到了一系列惊人的结果。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:19
-msgid ""
-"有很多算法在MNIST上进行实验。1998年，LeCun分别用单层线性分类器、多层感知器（Multilayer Perceptron, "
-"MLP）和多层卷积神经网络LeNet进行实验，使得测试集上的误差不断下降（从12%下降到0.7%）[1]。此后，科学家们又基于K近邻（K-Nearest"
-" "
-"Neighbors）算法[2]、支持向量机（SVM）[3]、神经网络[4-7]和Boosting方法[8]等做了大量实验，并采用多种预处理方法（如去除歪曲、去噪、模糊等）来提高识别的准确率。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:21
-msgid "本教程中，我们从简单的模型Softmax回归开始，带大家入门手写字符识别，并逐步进行模型优化。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:24
-msgid "模型概览"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:26
-msgid "基于MNIST数据训练一个分类器，在介绍本教程使用的三个基本图像分类网络前，我们先给出一些定义："
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:31
-msgid "Softmax回归(Softmax Regression)"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:33
-msgid "最简单的Softmax回归模型是先将输入层经过一个全连接层得到的特征，然后直接通过softmax 函数进行多分类[9]。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:35
-msgid "输入层的数据$X$传到输出层，在激活操作之前，会乘以相应的权重 $W$ ，并加上偏置变量 $b$ ，具体如下："
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:37
-msgid "$$ y_i = \\text{softmax}(\\sum_j W_{i,j}x_j + b_i) $$"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:39
-msgid "其中 $ \\text{softmax}(x_i) = \\frac{e^{x_i}}{\\sum_j e^{x_j}} $"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:41
-msgid ""
-"对于有 $N$ 个类别的多分类问题，指定 $N$ 个输出节点，$N$ 维结果向量经过softmax将归一化为 $N$ "
-"个[0,1]范围内的实数值，分别表示该样本属于这 $N$ 个类别的概率。此处的 $y_i$ 即对应该图片为数字 $i$ 的预测概率。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:43
-msgid "在分类问题中，我们一般采用交叉熵代价损失函数（cross entropy），公式如下："
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:45
-msgid "$$  \\text{crossentropy}(label, y) = -\\sum_i label_ilog(y_i) $$"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:47
-msgid "图2为softmax回归的网络图，图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:54
-msgid "多层感知器(Multilayer Perceptron, MLP)"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:56
-msgid "Softmax回归模型采用了最简单的两层神经网络，即只有输入层和输出层，因此其拟合能力有限。为了达到更好的识别效果，我们考虑在输入层和输出层中间加上若干个隐藏层[10]。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:63
-msgid "图3为多层感知器的网络结构图，图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:70
-msgid "卷积神经网络(Convolutional Neural Network, CNN)"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:72
-msgid "在多层感知器模型中，将图像展开成一维向量输入到网络中，忽略了图像的位置和结构信息，而卷积神经网络能够更好的利用图像的结构信息。LeNet-5是一个较简单的卷积神经网络。图4显示了其结构：输入的二维图像，先经过两次卷积层到池化层，再经过全连接层，最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:79
-msgid "卷积层"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:81
-msgid "卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积，即离散二维滤波器（也称作卷积核）与二维图像做卷积操作，简单的讲是二维滤波器滑动到二维图像上所有位置，并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域，不同卷积核可以提取不同的特征，例如边沿、线性、角等特征。在深层卷积神经网络中，通过卷积操作可以提取出图像低级到复杂的特征。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:88
-msgid ""
-"图5给出一个卷积计算过程的示例图，输入图像大小为$H=5,W=5,D=3$，即$5 \\times "
-"5$大小的3通道（RGB，也称作深度）彩色图像。这个示例图中包含两（用$K$表示）组卷积核，即图中滤波器$W_0$和$W_1$。在卷积计算中，通常对不同的输入通道采用不同的卷积核，如图示例中每组卷积核包含（$D=3）$个$3"
-" \\times 3$（用$F \\times "
-"F$表示）大小的卷积核。另外，这个示例中卷积核在图像的水平方向（$W$方向）和垂直方向（$H$方向）的滑动步长为2（用$S$表示）；对输入图像周围各填充1（用$P$表示）个0，即图中输入层原始数据为蓝色部分，灰色部分是进行了大小为1的扩展，用0来进行扩展。经过卷积操作得到输出为$3"
-" \\times 3 \\times 2$（用$H_{o} \\times W_{o} \\times K$表示）大小的特征图，即$3 "
-"\\times 3$大小的2通道特征图，其中$H_o$计算公式为：$H_o = (H - F + 2 \\times P)/S + "
-"1$，$W_o$同理。 "
-"而输出特征图中的每个像素，是每组滤波器与输入图像每个特征图的内积再求和，再加上偏置$b_o$，偏置通常对于每个输出特征图是共享的。输出特征图$o[:,:,0]$中的最后一个$-2$计算如图5右下角公式所示。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:90
-msgid ""
-"在卷积操作中卷积核是可学习的参数，经过上面示例介绍，每层卷积的参数大小为$D \\times F \\times F \\times "
-"K$。在多层感知器模型中，神经元通常是全部连接，参数较多。而卷积层的参数较少，这也是由卷积层的主要特性即局部连接和共享权重所决定。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:96
-msgid ""
-"通过介绍卷积计算过程及其特性，可以看出卷积是线性操作，并具有平移不变性（shift-"
-"invariant），平移不变性即在图像每个位置执行相同的操作。卷积层的局部连接和权重共享使得需要学习的参数大大减小，这样也有利于训练较大卷积神经网络。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:98
-msgid "池化层"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:105
-msgid "池化是非线性下采样的一种形式，主要作用是通过减少网络的参数来减小计算量，并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域，对于每个矩形框的数取最大值作为输出层，如图6所示。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:107
-msgid "更详细的关于卷积神经网络的具体知识可以参考斯坦福大学公开课和图像分类教程。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:109
-msgid "常见激活函数介绍"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:118
-msgid "更详细的介绍请参考维基百科激活函数。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:120
-msgid "数据介绍"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:122
-msgid "PaddlePaddle在API中提供了自动加载MNIST数据的模块paddle.dataset.mnist。加载后的数据位于/home/username/.cache/paddle/dataset/mnist下："
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:125
-msgid ""
-"|    文件名称          |       说明              | "
-"|----------------------|-------------------------| |train-images-"
-"idx3-ubyte|  训练数据图片，60,000条数据 | |train-labels-idx1-ubyte|  "
-"训练数据标签，60,000条数据 | |t10k-images-idx3-ubyte |  测试数据图片，10,000条数据 | |t10k-"
-"labels-idx1-ubyte |  测试数据标签，10,000条数据 |"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:132
-msgid "配置说明"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:134
-msgid "首先，加载PaddlePaddle的V2 api包。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:139
-msgid "其次，定义三个不同的分类器："
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:195
-msgid "接着，通过layer.data调用来获取数据，然后调用分类器（这里我们提供了三个不同的分类器）得到分类结果。训练时，对该结果计算其损失函数，分类问题常常选择交叉熵损失函数。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:213
-msgid "然后，指定训练相关的参数。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:231
-msgid ""
-"下一步，我们开始训练过程。paddle.dataset.movielens.train()和paddle.dataset.movielens.test()分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数，每次调用的时候返回一个Python"
-" yield generator。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:233
-msgid ""
-"下面shuffle是一个reader decorator，它接受一个reader A，返回另一个reader B —— reader B "
-"每次读入buffer_size条训练数据到一个buffer里，然后随机打乱其顺序，并且逐条输出。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:235
-msgid ""
-"batch是一个特殊的decorator，它的输入是一个reader，输出是一个batched reader —— "
-"在PaddlePaddle里，一个reader每次yield一条训练数据，而一个batched reader每次yield一个minibatch。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:237
-msgid "event_handler_plot可以用来在训练过程中画图如下："
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:239
-msgid "png"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:268
-msgid "event_handler 用来在训练过程中输出训练结果"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:300
-msgid "训练过程是完全自动的，event_handler里打印的日志类似如下所示："
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:311
-msgid ""
-"训练之后，检查模型的预测准确度。用 MNIST 训练的时候，一般 softmax回归模型的分类准确率为约为 "
-"92.34%，多层感知器为97.66%，卷积神经网络可以达到 99.20%。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:314
-msgid "应用模型"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:316
-msgid "可以使用训练好的模型对手写体数字图片进行分类，下面程序展示了如何使用paddle.infer接口进行推断。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:339
-msgid "总结"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:341
-msgid "本教程的softmax回归、多层感知器和卷积神经网络是最基础的深度学习模型，后续章节中复杂的神经网络都是从它们衍生出来的，因此这几个模型对之后的学习大有裨益。同时，我们也观察到从最简单的softmax回归变换到稍复杂的卷积神经网络的时候，MNIST数据集上的识别准确率有了大幅度的提升，原因是卷积层具有局部连接和共享权重的特性。在之后学习新模型的时候，希望大家也要深入到新模型相比原模型带来效果提升的关键之处。此外，本教程还介绍了PaddlePaddle模型搭建的基本流程，从dataprovider的编写、网络层的构建，到最后的训练和预测。对这个流程熟悉以后，大家就可以用自己的数据，定义自己的网络模型，并完成自己的训练和预测任务了。"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:343
-msgid "参考文献"
-msgstr ""
-
-#: ../../source/quick_start/recognize_digits/index.md:356
-msgid ""
-"<br/> <a rel=\"license\" href=\"http://creativecommons.org/licenses/by-"
-"sa/4.0/\"><img alt=\"知识共享许可协议\" style=\"border-width:0\" "
-"src=\"https://i.creativecommons.org/l/by-sa/4.0/88x31.png\" /></a><br "
-"/><span xmlns:dct=\"http://purl.org/dc/terms/\" "
-"href=\"http://purl.org/dc/dcmitype/Text\" property=\"dct:title\" "
-"rel=\"dct:type\">本教程</span> 由 <a "
-"xmlns:cc=\"http://creativecommons.org/ns#\" "
-"href=\"http://book.paddlepaddle.org\" property=\"cc:attributionName\" "
-"rel=\"cc:attributionURL\">PaddlePaddle</a> 创作，采用 <a rel=\"license\" "
-"href=\"http://creativecommons.org/licenses/by-sa/4.0/\">知识共享 署名-相同方式共享 "
-"4.0 国际 许可协议</a>进行许可。"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/quick_start/theoretical_background.po b/source/locale/en/LC_MESSAGES/quick_start/theoretical_background.po
deleted file mode 100644
index 1bf28b046bdf2c1790fb448c8581a2395ce14e34..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/quick_start/theoretical_background.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/quick_start/theoretical_background.rst:3
-msgid "理论知识"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/user_guides/howto/index.po b/source/locale/en/LC_MESSAGES/user_guides/howto/index.po
deleted file mode 100644
index 11cfd42d4eda458ec4e31327ceaabad51b70a56b..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/user_guides/howto/index.po
+++ /dev/null
@@ -1,48 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/user_guides/howto/index.rst:3
-msgid "如何使用PaddlePaddle"
-msgstr ""
-
-#: ../../source/user_guides/howto/index.rst:7
-msgid "概述"
-msgstr ""
-
-#: ../../source/user_guides/howto/index.rst:12
-msgid "数据预处理"
-msgstr ""
-
-#: ../../source/user_guides/howto/index.rst:16
-msgid "配置简单的网络"
-msgstr ""
-
-#: ../../source/user_guides/howto/index.rst:20
-msgid "训练"
-msgstr ""
-
-#: ../../source/user_guides/howto/index.rst:25
-msgid "调试"
-msgstr ""
-
-#: ../../source/user_guides/howto/index.rst:28
-msgid "模型评估"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/user_guides/index.po b/source/locale/en/LC_MESSAGES/user_guides/index.po
deleted file mode 100644
index edac7027c52b965f363b5e1fd77801348513ad56..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/user_guides/index.po
+++ /dev/null
@@ -1,24 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/user_guides/index.rst:3
-msgid "使用指南"
-msgstr ""
-
diff --git a/source/locale/en/LC_MESSAGES/user_guides/model_bank/index.po b/source/locale/en/LC_MESSAGES/user_guides/model_bank/index.po
deleted file mode 100644
index 46a7408a77552e03be7cb5fafaf9a8e06ecb4d06..0000000000000000000000000000000000000000
--- a/source/locale/en/LC_MESSAGES/user_guides/model_bank/index.po
+++ /dev/null
@@ -1,40 +0,0 @@
-# SOME DESCRIPTIVE TITLE.
-# Copyright (C) 2018, paddle-dev@baidu.com
-# This file is distributed under the same license as the PaddlePaddle Fluid
-# package.
-# FIRST AUTHOR <EMAIL@ADDRESS>, 2018.
-#
-#, fuzzy
-msgid ""
-msgstr ""
-"Project-Id-Version: PaddlePaddle Fluid 0.13.0\n"
-"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2018-06-14 18:52+0800\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
-"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
-"Language-Team: LANGUAGE <LL@li.org>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=utf-8\n"
-"Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.6.0\n"
-
-#: ../../source/user_guides/model_bank/index.rst:3
-msgid "模型库"
-msgstr ""
-
-#: ../../source/user_guides/model_bank/index.rst:7
-msgid "图像"
-msgstr ""
-
-#: ../../source/user_guides/model_bank/index.rst:11
-msgid "NLP"
-msgstr ""
-
-#: ../../source/user_guides/model_bank/index.rst:15
-msgid "语音"
-msgstr ""
-
-#: ../../source/user_guides/model_bank/index.rst:18
-msgid "其他"
-msgstr ""
-
diff --git a/source/mobile/foo.rst b/source/mobile/foo.rst
deleted file mode 100644
index 9d43c91a8544c3b281b2e8d556cb8b8e069d7e0a..0000000000000000000000000000000000000000
--- a/source/mobile/foo.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-###
-FAQ
-###
diff --git a/source/user_guides/howto/configure_simple_model/index.rst b/source/user_guides/howto/configure_simple_model/index.rst
deleted file mode 100644
index 9bed6fb9fe5476a33a8614be93dc76806521ee73..0000000000000000000000000000000000000000
--- a/source/user_guides/howto/configure_simple_model/index.rst
+++ /dev/null
@@ -1,88 +0,0 @@
-..  _user_guide_configure_simple_model:
-
-##############
-配置简单的网络
-##############
-
-在解决实际问题时，可以先从逻辑层面对问题进行建模，明确模型所需要的 **输入数据类型**、**计算逻辑**、**求解目标** 以及 **优化算法**。PaddlePaddle提供了丰富的算子来实现模型逻辑。下面以一个简单回归任务举例说明如何使用PaddlePaddle构建模型。该例子完整代码参见 `fit_a_line <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py>`_。
-
-问题描述及定义
-##############
-
-问题描述: 给定一组数据 :math:`<X, Y>`，求解出函数 :math:`f`，使得 :math:`y=f(x)`，其中 :math:`x \in X` 表示一条样本的特征，为 :math:`13` 维的实数向量；:math:`y \in Y` 为一实数表示该样本对应的值。
-
-我们可以尝试用回归模型来对问题建模，回归问题的损失函数有很多，这里选择常用的均方误差。为简化问题，这里假定 :math:`f` 为简单的线性变换函数，同时选用随机梯度下降算法来求解模型。
-
-+----------------+----------------------------------------------+
-| 输入数据类型   |  样本特征: 13 维 实数                        |
-+                +----------------------------------------------+
-|                |  样本标签: 1 维 实数                         |
-+----------------+----------------------------------------------+
-| 计算逻辑       | 使用线性模型，产生 1维实数作为模型的预测输出 |
-+----------------+----------------------------------------------+
-| 求解目标       | 最小化模型预测输出与样本标签间的均方误差     |
-+----------------+----------------------------------------------+
-| 优化算法       | 随机梯度下降                                 |
-+----------------+----------------------------------------------+
-
-使用PaddlePaddle建模
-####################
-
-从逻辑层面明确了输入数据格式、模型结构、损失函数以及优化算法后，需要使用PaddlePaddle提供的API及算子来实现模型逻辑。一个典型的模型主要包含4个部分，分别是：输入数据格式定义，模型前向计算逻辑，损失函数以及优化算法。
-
-数据层
-------
-
-PaddlePaddle提供了 :ref:`api_fluid_layers_data` 算子来描述输入数据的格式。
-
-:ref:`api_fluid_layers_data` 算子的输出是一个Variable。这个Variable的实际类型是Tensor。Tensor具有强大的表征能力，可以表示多维数据。为了精确描述数据结构，通常需要指定数据shape以及数值类型type。其中shape为一个整数向量，type可以是一个字符串类型。目前支持的数据类型参考    :ref:`user_guide_paddle_support_data_types` 。 模型训练一般会使用batch的方式读取数据，而batch的size在训练过程中可能不固定。data算子会依据实际数据来推断batch size，所以这里提供shape时不用关心batch size，只需关心一条样本的shape即可，更高级用法请参考 :ref:`user_guide_customize_batch_size_rank`。从上知，:math:`x` 为 :math:`13` 维的实数向量，:math:`y` 为实数，可使用下面代码定义数据层：
-
-.. code-block:: python
-
-    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-
-该模型使用的数据比较简单，事实上data算子还可以描述变长的、嵌套的序列数据。也可以使用 :code:`open_files` 打开文件进行训练。更详细的文档可参照 :ref:`user_guide_prepare_data`。
-
-前向计算逻辑
-------------
-
-实现一个模型最重要的部分是实现计算逻辑，PaddlePaddle提供了丰富的算子。这些算子的封装粒度不同，通常对应一种或一组变换逻辑。算子输出即为对输入数据执行变换后的结果。用户可以灵活使用算子来完成复杂的模型逻辑。比如图像相关任务中会使用较多的卷积算子、序列任务中会使用LSTM/GRU等算子。复杂模型通常会组合多种算子，以完成复杂的变换。PaddlePaddle提供了非常自然的方式来组合算子，一般地可以使用下面的方式：
-
-.. code-block:: python
-
-    op_1_out = fluid.layers.op_1(input=op_1_in, ...)
-    op_2_out = fluid.layers.op_2(input=op_1_out, ...)
-    ...
-
-其中op_1和op_2表示算子类型，可以是fc来执行线性变换(全连接)，也可以是conv来执行卷积变换等。通过算子的输入输出的连接来定义算子的计算顺序以及数据流方向。上面的例子中，op_1的输出是op_2的输入，那么在执行计算时，会先计算op_1，然后计算op_2。更复杂的模型可能需要使用控制流算子，依据输入数据来动态执行，针对这种情况，PaddlePaddle提供了IfElseOp和WhileOp等。算子的文档可参考 :ref:`api_fluid_layers`。具体到这个任务, 我们使用一个fc算子：
-
-.. code-block:: python
-
-    y_predict = fluid.layers.fc(input=x, size=1, act=None)
-
-损失函数
---------
-
-损失函数对应求解目标，我们可以通过最小化损失来求解模型。大多数模型使用的损失函数，输出是一个实数值。但是PaddlePaddle提供的损失算子一般是针对一条样本计算。当输入一个batch的数据时，损失算子的输出有多个值，每个值对应一条样本的损失，所以通常会在损失算子后面使用mean等算子，来对损失做归约。模型在一次前向迭代后会得到一个损失值，PaddlePaddle会自动执行链式求导法则计算模型里面每个参数和变量对应的梯度值。这里使用均方误差损失：
-
-.. code-block:: python
-
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-
-优化方法
---------
-
-确定损失函数后，可以通过前向计算得到损失值，然后通过链式求导法则得到参数的梯度值。获取梯度值后需要更新参数，最简单的算法是随机梯度下降法：:math:`w=w - \eta \cdot g`。但是普通的随机梯度下降算法存在一些问题: 比如收敛不稳定等。为了改善模型的训练速度以及效果，学术界先后提出了很多优化算法，包括： :code:`Momentum`、:code:`RMSProp`、:code:`Adam` 等。这些优化算法采用不同的策略来更新模型参数，一般可以针对具体任务和具体模型来选择优化算法。不管使用何种优化算法，学习率一般是一个需要指定的比较重要的超参数，需要通过实验仔细调整。这里采用随机梯度下降算法：
-
-.. code-block:: python
-
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-
-更多优化算子可以参考 :ref:`api_fluid_optimizer` 。
-
-下一步做什么？
-##############
-
-使用PaddlePaddle实现模型时需要关注 **数据层**、**前向计算逻辑**、**损失函数** 和 **优化方法**。不同的任务需要的数据格式不同，涉及的计算逻辑不同，损失函数不同，优化方法也不同。PaddlePaddle提供了丰富的模型示例，可以以这些示例为参考来构建自己的模型结构。用户可以访问 `模型库 <https://github.com/PaddlePaddle/models/tree/develop/fluid>`_ 查看官方提供的示例。
diff --git a/source/user_guides/howto/modification/foo.rst b/source/user_guides/howto/modification/foo.rst
deleted file mode 100644
index 9d43c91a8544c3b281b2e8d556cb8b8e069d7e0a..0000000000000000000000000000000000000000
--- a/source/user_guides/howto/modification/foo.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-###
-FAQ
-###
diff --git a/source/user_guides/howto/prepare_data/feeding_data.rst b/source/user_guides/howto/prepare_data/feeding_data.rst
deleted file mode 100644
index 78f43338df02c503d6b46b93aaddb4d01a0f00ee..0000000000000000000000000000000000000000
--- a/source/user_guides/howto/prepare_data/feeding_data.rst
+++ /dev/null
@@ -1,169 +0,0 @@
-.. _user_guide_use_numpy_array_as_train_data:
-
-###########################
-使用Numpy Array作为训练数据
-###########################
-
-PaddlePaddle Fluid支持使用 :ref:`api_fluid_layers_data` 配置数据层；
-再使用 Numpy Array 或者直接使用Python创建C++的
-:ref:`api_guide_lod_tensor` , 通过 :code:`Executor.run(feed=...)` 传给
-:ref:`api_guide_executor` 或 :ref:`api_guide_parallel_executor` 。
-
-数据层配置
-##########
-
-通过 :ref:`api_fluid_layers_data` 可以配置神经网络中需要的数据层。具体方法为:
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-
-   image = fluid.layers.data(name="image", shape=[3, 224, 224])
-   label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-   # use image/label as layer input
-   prediction = fluid.layers.fc(input=image, size=1000, act="softmax")
-   loss = fluid.layers.cross_entropy(input=prediction, label=label)
-   ...
-
-上段代码中，:code:`image` 和 :code:`label` 是通过 :code:`fluid.layers.data`
-创建的两个输入数据层。其中 :code:`image` 是 :code:`[3, 224, 224]` 维度的浮点数据;
-:code:`label` 是 :code:`[1]` 维度的整数数据。这里需要注意的是:
-
-1. Fluid中默认使用 :code:`-1` 表示 batch size 维度，默认情况下会在 :code:`shape`
-   的第一个维度添加 :code:`-1` 。 所以 上段代码中， 我们可以接受将一个
-   :code:`[32, 3, 224, 224]` 的numpy array传给 :code:`image` 。 如果想自定义batch size
-   维度的位置的话，请设置 :code:`fluid.layers.data(append_batch_size=False)` 。
-   请参考进阶使用中的 :ref:`user_guide_customize_batch_size_rank` 。
-
-
-2. Fluid中用来做类别标签的数据类型是 :code:`int64`，并且标签从0开始。可用数据类型请参考 :ref:`user_guide_paddle_support_data_types`。
-
-.. _user_guide_feed_data_to_executor:
-
-传递训练数据给执行器
-####################
-
-:code:`Executor.run` 和 :code:`ParallelExecutor.run` 都接受一个 :code:`feed` 参数。
-这个参数是一个Python的字典。它的键是数据层的名字，例如上文代码中的 :code:`image`。
-它的值是对应的numpy array。
-
-例如:
-
-.. code-block:: python
-
-   exe = fluid.Executor(fluid.CPUPlace())
-   exe.run(feed={
-      "image": numpy.random.random(size=(32, 3, 224, 224)).astype('float32'),
-      "label": numpy.random.random(size=(32, 1)).astype('int64')
-   })
-
-进阶使用
-########
-
-如何传入序列数据
-----------------
-
-序列数据是PaddlePaddle Fluid支持的特殊数据类型，可以使用 :code:`LoDTensor` 作为
-输入数据类型。它需要用户: 1. 传入一个mini-batch需要被训练的所有数据;
-2.每个序列的长度信息。
-用户可以使用 :code:`fluid.create_lod_tensor` 来创建 :code:`LoDTensor`。
-
-传入序列信息的时候，需要设置序列嵌套深度，:code:`lod_level`。
-例如训练数据是词汇组成的句子，:code:`lod_level=1`；训练数据是 词汇先组成了句子，
-句子再组成了段落，那么 :code:`lod_level=2`。
-
-例如:
-
-.. code-block:: python
-
-   sentence = fluid.layers.data(name="sentence", dtype="int64", shape=[1], lod_level=1)
-
-   ...
-
-   exe.run(feed={
-     "sentence": create_lod_tensor(
-       data=numpy.array([1, 3, 4, 5, 3, 6, 8], dtype='int64').reshape(-1, 1),
-       lod=[4, 1, 2],
-       place=fluid.CPUPlace()
-     )
-   })
-
-训练数据 :code:`sentence` 包含三个样本，他们的长度分别是 :code:`4, 1, 2`。
-他们分别是 :code:`data[0:4]`， :code:`data[4:5]` 和 :code:`data[5:7]`。
-
-如何分别设置ParallelExecutor中每个设备的训练数据
-------------------------------------------------
-
-用户使用 :code:`ParallelExecutor.run(feed=...)` 传递数据时，
-可以显式指定每一个训练设备(例如GPU)上的数据。
-用户需要将一个列表传递给 :code:`feed` 参数，列表中的每一个元素都是一个字典。
-这个字典的键是数据层的名字，值是数据层的值。
-
-例如:
-
-.. code-block:: python
-
-   parallel_executor = fluid.ParallelExecutor()
-   parallel_executor.run(
-     feed=[
-        {
-          "image": numpy.random.random(size=(32, 3, 224, 224)).astype('float32'),
-          "label": numpy.random.random(size=(32, 1)).astype('int64')
-        },
-        {
-          "image": numpy.random.random(size=(16, 3, 224, 224)).astype('float32'),
-          "label": numpy.random.random(size=(16, 1)).astype('int64')
-        },
-     ]
-   )
-
-上述代码中，GPU0会训练 32 个样本，而 GPU1训练 16 个样本。
-
-
-.. _user_guide_customize_batch_size_rank:
-
-自定义BatchSize维度
--------------------
-
-PaddlePaddle Fluid默认batch size是数据的第一维度，以 :code:`-1` 表示。但是在高级
-使用中，batch_size 可以固定，也可以是其他维度或者多个维度来表示。这都需要设置
-:code:`fluid.layers.data(append_batch_size=False)` 来完成。
-
-1. 固定batch size维度
-
-  .. code-block:: python
-
-     image = fluid.layers.data(name="image", shape=[32, 784], append_batch_size=False)
-
-  这里，:code:`image` 永远是一个 :code:`[32, 784]` 大小的矩阵。
-
-2. 使用其他维度表示batch size
-
-  .. code-block:: python
-
-     sentence = fluid.layers.data(name="sentence",
-                                  shape=[80, -1, 1],
-                                  append_batch_size=False,
-                                  dtype="int64")
-
-  这里 :code:`sentence` 的中间维度是batch size。这种数据排布会用在定长的循环神经
-  网络中。
-
-
-.. _user_guide_paddle_support_data_types:
-
-Fluid目前支持的数据类型
------------------------
-
-PaddlePaddle Fluid目前支持的数据类型包括:
-
-   * float16： 部分操作支持
-   * float32:  主要实数类型
-   * float64:  次要实数类型，支持大部分操作
-   * int32:  次要标签类型
-   * int64: 主要标签类型
-   * uint64: 次要标签类型
-   * bool: 控制流数据类型
-   * int16: 次要标签类型
-   * uint8: 输入数据类型，可用于图像像素
\ No newline at end of file
diff --git a/source/user_guides/howto/prepare_data/index.rst b/source/user_guides/howto/prepare_data/index.rst
deleted file mode 100644
index 643702e95ef6c245524fb7c54efd8d120da8c629..0000000000000000000000000000000000000000
--- a/source/user_guides/howto/prepare_data/index.rst
+++ /dev/null
@@ -1,52 +0,0 @@
-..  _user_guide_prepare_data:
-
-########
-准备数据
-########
-
-PaddlePaddle Fluid支持两种传入数据的方式:
-
-1. 用户需要使用 :code:`fluid.layers.data`
-配置数据输入层，并在 :ref:`api_guide_executor` 或 :ref:`api_guide_parallel_executor`
-中，使用 :code:`executor.run(feed=...)` 传入训练数据。
-
-2. 用户需要先将训练数据
-转换成 Paddle 识别的 :ref:`api_guide_recordio_file_format` ， 再使用
-:code:`fluid.layers.open_files` 以及 :ref:`api_guide_reader` 配置数据读取。
-
-这两种准备数据方法的比较如下:
-
-.. _user_guide_prepare_data_comparision:
-
-+------------+----------------------------------+---------------------------------------+
-|            |        Feed数据                  |         使用Reader                    |
-+============+==================================+=======================================+
-| API接口    | :code:`executor.run(feed=...)`   |         :ref:`api_guide_reader`       |
-+------------+----------------------------------+---------------------------------------+
-| 数据格式   |           Numpy Array            | :ref:`api_guide_recordio_file_format` |
-+------------+----------------------------------+---------------------------------------+
-| 数据增强   | Python端使用其他库完成           | 使用Fluid中的Operator 完成            |
-+------------+----------------------------------+---------------------------------------+
-|   速度     |                 慢               |                 快                    |
-+------------+----------------------------------+---------------------------------------+
-| 推荐用途   |   调试模型                       |   工业训练                            |
-+------------+----------------------------------+---------------------------------------+
-
-这些准备数据的详细使用方法，请参考:
-
-.. toctree::
-   :maxdepth: 2
-
-   feeding_data
-   use_recordio_reader
-
-Python Reader
-#############
-
-为了方便用户在Python中定义数据处理流程，PaddlePaddle Fluid支持 Python Reader，
-具体请参考:
-
-.. toctree::
-   :maxdepth: 2
-
-   reader.md
diff --git a/source/user_guides/howto/prepare_data/use_recordio_reader.rst b/source/user_guides/howto/prepare_data/use_recordio_reader.rst
deleted file mode 100644
index 3121ae74c4380b1ddfc4258f2a4f6be8782b306e..0000000000000000000000000000000000000000
--- a/source/user_guides/howto/prepare_data/use_recordio_reader.rst
+++ /dev/null
@@ -1,167 +0,0 @@
-.. _user_guide_use_recordio_as_train_data:
-
-############################
-使用RecordIO文件作为训练数据
-############################
-
-相比于 :ref:`user_guide_use_numpy_array_as_train_data`，
-:ref:`user_guide_use_recordio_as_train_data` 的性能更好；
-但是用户需要先将训练数据集转换成RecordIO文件格式，再使用
-:ref:`api_fluid_layers_open_files` 层在神经网络配置中导入 RecordIO 文件。
-用户还可以使用 :ref:`api_fluid_layers_double_buffer` 加速数据从内存到显存的拷贝，
-使用 :ref:`api_fluid_layers_Preprocessor` 工具进行数据增强。
-
-将训练数据转换成RecordIO文件格式
-################################
-
-:ref:`api_guide_recordio_file_format` 中，每个记录都是一个
-:code:`vector<LoDTensor>`, 即一个支持序列信息的Tensor数组。这个数组包括训练所需
-的所有特征。例如对于图像分类来说，这个数组可以包含图片和分类标签。
-
-用户可以使用 :ref:`api_fluid_recordio_writer_convert_reader_to_recordio_file` 将
-:ref:`user_guide_reader` 转换成一个RecordIO文件。或者可以使用
-:ref:`api_fluid_recordio_writer_convert_reader_to_recordio_files` 将一个
-:ref:`user_guide_reader` 转换成多个RecordIO文件。
-
-具体使用方法为:
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   import numpy
-
-   def reader_creator():
-       def __impl__():
-           for i in range(1000):
-               yield [
-                        numpy.random.random(size=[3,224,224]).astype("float32"),
-                        numpy.random.random(size=[1]).astype("int64")
-                     ]
-       return __impl__
-
-   img = fluid.layers.data(name="image", shape=[3, 224, 224])
-   label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-   feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
-
-   BATCH_SIZE = 32
-   reader = paddle.batch(reader_creator(), batch_size=BATCH_SIZE)
-   fluid.recordio_writer.convert_reader_to_recordio_file(
-      "train.recordio", feeder=feeder, reader_creator=reader)
-
-其中 :code:`reader_creator` 创建了一个 :code:`Reader`。
-:ref:`api_fluid_data_feeder_DataFeeder`
-是将 :code:`Reader` 转换成 :code:`LoDTensor` 的工具。详细请参考
-:ref:`user_guide_reader` 。
-
-上述程序将 :code:`reader_creator` 的数据转换成了 :code:`train.recordio` 文件，
-其中每一个record 含有 32 条样本。如果batch size会在训练过程中调整，
-用户可以将每一个Record的样本数设置成1。并参考
-:ref:`user_guide_use_recordio_as_train_data_use_op_create_batch`。
-
-
-配置神经网络, 打开RecordIO文件
-##############################
-
-RecordIO文件转换好之后，用户可以使用 :ref:`api_fluid_layers_open_files`
-打开文件，并使用 :ref:`api_fluid_layers_read_file` 读取文件内容。
-简单使用方法如下:
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-
-   file_obj = fluid.layers.open_files(
-     filenames=["train.recordio"],
-     shape=[[3, 224, 224], [1]],
-     lod_levels=[0, 0],
-     dtypes=["float32", "int64"],
-     pass_num=100
-   )
-
-   image, label = fluid.layers.read_file(file_obj)
-
-其中如果设置了 :code:`pass_num` ，那么当所有数据读完后，会重新读取数据，
-直到读取了 :code:`pass_num` 遍。
-
-
-
-进阶使用
-########
-
-
-使用 :ref:`api_fluid_layers_double_buffer`
-------------------------------------------
-
-:code:`Double buffer` 使用双缓冲技术，将训练数据从内存中复制到显存中。配置双缓冲
-需要使用 :ref:`api_fluid_layers_double_buffer` 修饰文件对象。 例如:
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   file_obj = fluid.layers.open_files(...)
-   file_obj = fluid.layers.double_buffer(file_obj)
-
-   image, label = fluid.layers.read_file(file_obj)
-
-双缓冲技术可以参考
-`Multiple buffering <https://en.wikipedia.org/wiki/Multiple_buffering>`_ 。
-
-配置数据增强
-------------
-
-使用 :ref:`api_fluid_layers_Preprocessor` 可以配置文件的数据增强方法。例如
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   file_obj = fluid.layers.open_files(...)
-   preprocessor = fluid.layers.Preprocessor(reader=file_obj)
-   with preprocessor.block():
-       image, label = preprocessor.inputs()
-       image = image / 2
-       label = label + 1
-       preprocessor.outputs(image, label)
-
-如上代码所示，使用 :code:`Preprocessor` 定义了一个数据增强模块，并在
-:code:`with preprocessor.block()` 中定义了数据增强的具体操作。 用户通过配置
-:code:`preprocessor.inputs()` 获得数据文件中的各个字段。 并用
-:code:`preprocessor.outputs()` 标记预处理后的输出。
-
-.. _user_guide_use_recordio_as_train_data_use_op_create_batch:
-
-使用Op组batch
--------------
-
-使用 :ref:`api_fluid_layers_batch` 可以在训练的过程中动态的组batch。例如
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   file_obj = fluid.layers.open_files(...)
-   file_obj = fluid.layers.batch(file_obj, batch_size=32)
-
-   img, label = fluid.layers.read_file(file_obj)
-
-需要注意的是，如果数据集中的最后几个样本不能组成 :code:`batch_size` 大小的批量数据，
-那么这几个样本直接组成一个批量数据进行训练。
-
-读入数据的shuffle
------------------
-
-使用 :ref:`api_fluid_layers_shuffle` 可以在训练过程中动态重排训练数据。例如
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   file_obj = fluid.layers.open_files(...)
-   file_obj = fluid.layers.shuffle(file_obj, buffer_size=8192)
-
-   img, label = fluid.layers.read_file(file_obj)
-
-需要注意的是:
-
-1. :code:`shuffle` 实现方法是:
-先读入 :code:`buffer_size` 条样本，再随机的选出样本进行训练。
-
-2. :code:`shuffle` 中 :code:`buffer_size` 会占用训练内存，需要确定训练过程中内存
-足够支持缓存 :code:`buffer_size` 条数据。
diff --git a/source/user_guides/models/index.rst b/source/user_guides/models/index.rst
deleted file mode 100644
index 0eba1bcdd4c87d0e9e83eb0485fb2fe2febcc5f6..0000000000000000000000000000000000000000
--- a/source/user_guides/models/index.rst
+++ /dev/null
@@ -1,136 +0,0 @@
-Fluid 模型库
-============
-
-图像分类
---------
-
-图像分类是根据图像的语义信息对不同类别图像进行区分，是计算机视觉中重要的基础问题，是物体检测、图像分割、物体跟踪、行为分析、人脸识别等其他高层视觉任务的基础，在许多领域都有着广泛的应用。如：安防领域的人脸识别和智能视频分析等，交通领域的交通场景识别，互联网领域基于内容的图像检索和相册自动归类，医学领域的图像识别等。
-
-在深度学习时代，图像分类的准确率大幅度提升，在图像分类任务中，我们向大家介绍了如何在经典的数据集ImageNet上，训练常用的模型，包括AlexNet、VGG、GoogLeNet、ResNet、Inception-v4、MobileNet、DPN(Dual
-Path
-Network)、SE-ResNeXt模型，也开源了\ `训练的模型 <https://github.com/PaddlePaddle/models/blob/develop/fluid/image_classification/README_cn.md#已有模型及其性能>`__\ 方便用户下载使用。同时提供了能够将Caffe模型转换为PaddlePaddle
-Fluid模型配置和参数文件的工具。
-
--  `AlexNet <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
--  `VGG <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
--  `GoogleNet <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
--  `Residual
-   Network <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
--  `Inception-v4 <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
--  `MobileNet <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
--  `Dual Path
-   Network <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
--  `SE-ResNeXt <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models>`__
--  `Caffe模型转换为Paddle
-   Fluid配置和模型文件工具 <https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid>`__
-
-目标检测
---------
-
-目标检测任务的目标是给定一张图像或是一个视频帧，让计算机找出其中所有目标的位置，并给出每个目标的具体类别。对于人类来说，目标检测是一个非常简单的任务。然而，计算机能够“看到”的是图像被编码之后的数字，很难理解图像或是视频帧中出现了人或是物体这样的高层语义概念，也就更加难以定位目标出现在图像中哪个区域。与此同时，由于目标会出现在图像或是视频帧中的任何位置，目标的形态千变万化，图像或是视频帧的背景千差万别，诸多因素都使得目标检测对计算机来说是一个具有挑战性的问题。
-
-在目标检测任务中，我们介绍了如何基于\ `PASCAL
-VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`__\ 、\ `MS
-COCO <http://cocodataset.org/#home>`__\ 数据的训练目标检测算法SSD，SSD全称Single
-Shot MultiBox
-Detector，是目标检测领域较新且效果较好的检测算法之一，具有检测速度快且检测精度高的特点，并开源了训练好的\ `MobileNet-SSD模型 <https://github.com/PaddlePaddle/models/blob/develop/fluid/object_detection/README_cn.md#模型发布>`__\ 。
-
--  `Single Shot MultiBox
-   Detector <https://github.com/PaddlePaddle/models/blob/develop/fluid/object_detection/README_cn.md>`__
-
-图像语义分割
-------------
-
-图像语义分割顾名思义是将图像像素按照表达的语义含义的不同进行分组/分割，图像语义是指对图像内容的理解，例如，能够描绘出什么物体在哪里做了什么事情等，分割是指对图片中的每个像素点进行标注，标注属于哪一类别。近年来用在无人车驾驶技术中分割街景来避让行人和车辆、医疗影像分析中辅助诊断等。
-
-在图像语义分割任务中，我们介绍如何基于图像级联网络(Image Cascade
-Network,ICNet)进行语义分割，相比其他分割算法，ICNet兼顾了准确率和速度。
-
--  `ICNet <https://github.com/PaddlePaddle/models/tree/develop/fluid/icnet>`__
-
-场景文字识别
-------------
-
-许多场景图像中包含着丰富的文本信息，对理解图像信息有着重要作用，能够极大地帮助人们认知和理解场景图像的内容。场景文字识别是在图像背景复杂、分辨率低下、字体多样、分布随意等情况下，将图像信息转化为文字序列的过程，可认为是一种特别的翻译过程：将图像输入翻译为自然语言输出。场景图像文字识别技术的发展也促进了一些新型应用的产生，如通过自动识别路牌中的文字帮助街景应用获取更加准确的地址信息等。
-
-在场景文字识别任务中，我们介绍如何将基于CNN的图像特征提取和基于RNN的序列翻译技术结合，免除人工定义特征，避免字符分割，使用自动学习到的图像特征，完成端到端地无约束字符定位和识别。当前，介绍了CRNN-CTC模型，后续会引入基于注意力机制的序列到序列模型。
-
--  `CRNN-CTC模型 <https://github.com/PaddlePaddle/models/tree/develop/fluid/ocr_recognition>`__
-
-语音识别
---------
-
-自动语音识别（Automatic Speech Recognition,
-ASR）是将人类声音中的词汇内容转录成计算机可输入的文字的技术。语音识别的相关研究经历了漫长的探索过程，在HMM/GMM模型之后其发展一直较为缓慢，随着深度学习的兴起，其迎来了春天。在多种语言识别任务中，将深度神经网络(DNN)作为声学模型，取得了比GMM更好的性能，使得
-ASR
-成为深度学习应用最为成功的领域之一。而由于识别准确率的不断提高，有越来越多的语音技术产品得以落地，例如语音输入法、以智能音箱为代表的智能家居设备等
-—— 基于语音的交互方式正在深刻地改变人类的生活。
-
-与 `DeepSpeech <https://github.com/PaddlePaddle/DeepSpeech>`__
-中深度学习模型端到端直接预测字词的分布不同，本实例更接近传统的语音识别流程，以音素为建模单元，关注语音识别中声学模型的训练，利用\ `kaldi <http://www.kaldi-asr.org>`__\ 进行音频数据的特征提取和标签对齐，并集成
-kaldi 的解码器完成解码。
-
--  `DeepASR <https://github.com/PaddlePaddle/models/blob/develop/fluid/DeepASR/README_cn.md>`__
-
-机器翻译
---------
-
-机器翻译（Machine
-Translation）将一种自然语言(源语言)转换成另一种自然语言（目标语言），是自然语言处理中非常基础和重要的研究方向。在全球化的浪潮中，机器翻译在促进跨语言文明的交流中所起的重要作用是不言而喻的。其发展经历了统计机器翻译和基于神经网络的神经机器翻译(Neural
-Machine Translation, NMT)等阶段。在 NMT
-成熟后，机器翻译才真正得以大规模应用。而早阶段的 NMT
-主要是基于循环神经网络 RNN
-的，其训练过程中当前时间步依赖于前一个时间步的计算，时间步之间难以并行化以提高训练速度。因此，非
-RNN 结构的 NMT 得以应运而生，例如基于卷积神经网络 CNN
-的结构和基于自注意力机制（Self-Attention）的结构。
-
-本实例所实现的 Transformer
-就是一个基于自注意力机制的机器翻译模型，其中不再有RNN或CNN结构，而是完全利用
-Attention 学习语言中的上下文依赖。相较于RNN/CNN,
-这种结构在单层内计算复杂度更低、易于并行化、对长程依赖更易建模，最终在多种语言之间取得了最好的翻译效果。
-
--  `Transformer <https://github.com/PaddlePaddle/models/blob/develop/fluid/neural_machine_translation/transformer/README_cn.md>`__
-
-强化学习
---------
-
-强化学习是近年来一个愈发重要的机器学习方向，特别是与深度学习相结合而形成的深度强化学习(Deep
-Reinforcement Learning,
-DRL)，取得了很多令人惊异的成就。人们所熟知的战胜人类顶级围棋职业选手的
-AlphaGo 就是 DRL
-应用的一个典型例子，除游戏领域外，其它的应用还包括机器人、自然语言处理等。
-
-深度强化学习的开山之作是在Atari视频游戏中的成功应用，
-其可直接接受视频帧这种高维输入并根据图像内容端到端地预测下一步的动作，所用到的模型被称为深度Q网络(Deep
-Q-Network, DQN)。本实例就是利用PaddlePaddle Fluid这个灵活的框架，实现了
-DQN 及其变体，并测试了它们在 Atari 游戏中的表现。
-
--  `DeepQNetwork <https://github.com/PaddlePaddle/models/blob/develop/fluid/DeepQNetwork/README_cn.md>`__
-
-中文词法分析
-------------
-
-中文分词(Word Segmentation)是将连续的自然语言文本，切分出具有语义合理性和完整性的词汇序列的过程。因为在汉语中，词是承担语义的最基本单位，切词是文本分类、情感分析、信息检索等众多自然语言处理任务的基础。 词性标注（Part-of-speech Tagging）是为自然语言文本中的每一个词汇赋予一个词性的过程，这里的词性包括名词、动词、形容词、副词等等。 命名实体识别（Named Entity Recognition，NER）又称作“专名识别”，是指识别自然语言文本中具有特定意义的实体，主要包括人名、地名、机构名、专有名词等。 我们将这三个任务统一成一个联合任务，称为词法分析任务，基于深度神经网络，利用海量标注语料进行训练，提供了一个端到端的解决方案。
-
-我们把这个联合的中文词法分析解决方案命名为LAC。LAC既可以认为是Lexical Analysis of Chinese的首字母缩写，也可以认为是LAC Analyzes Chinese的递归缩写。
-
-- `LAC <https://github.com/baidu/lac/blob/master/README.md>`__
-
-情感倾向分析
-------------
-
-情感倾向分析针对带有主观描述的中文文本，可自动判断该文本的情感极性类别并给出相应的置信度。情感类型分为积极、消极、 中性。情感倾向分析能够帮助企业理解用户消费习惯、分析热点话题和危机舆情监控，为企业提供有力的决策支持。本次我们开放 AI开放平台中情感倾向分析采用的模型(http://ai.baidu.com/tech/nlp/sentiment_classify )， 提供给用户使用。
-
-- `Senta <https://github.com/baidu/Senta/blob/master/README.md>`__
-
-AnyQ
-----
-
-`AnyQ <https://github.com/baidu/AnyQ>`__\ (ANswer Your Questions)
-开源项目主要包含面向FAQ集合的问答系统框架、文本语义匹配工具SimNet。
-问答系统框架采用了配置化、插件化的设计，各功能均通过插件形式加入，当前共开放了20+种插件。开发者可以使用AnyQ系统快速构建和定制适用于特定业务场景的FAQ问答系统，并加速迭代和升级。
-
-SimNet是百度自然语言处理部于2013年自主研发的语义匹配框架，该框架在百度各产品上广泛应用，主要包括BOW、CNN、RNN、MM-DNN等核心网络结构形式，同时基于该框架也集成了学术界主流的语义匹配模型，如MatchPyramid、MV-LSTM、K-NRM等模型。使用SimNet构建出的模型可以便捷的加入AnyQ系统中，增强AnyQ系统的语义匹配能力。
-
--  `SimNet in PaddlePaddle
-   Fluid <https://github.com/baidu/AnyQ/blob/master/tools/simnet/train/paddle/README.md>`__