diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index 378cc88c9cc708fd797c97a91ca59f0d57bd570a..0000000000000000000000000000000000000000
--- a/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-.env
-.DS_Store
-._.DS_Store
-*.mo
diff --git a/.gitmodules b/.gitmodules
index fa45d0eaa75ca251ab9cbb410ec7e7a73936cab4..3bc190175db8837a22f2b255a00f66176415ec9c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,15 @@
-[submodule "paddle"]
- path = paddle
- url = https://github.com/PaddlePaddle/Paddle.git
-[submodule "book"]
- path = book
- url = https://github.com/PaddlePaddle/book.git
-[submodule "source/anakin"]
- path = source/anakin
+[submodule "external/Paddle"]
+ path = external/Paddle
+ url = https://github.com/PaddlePaddle/Paddle
+[submodule "external/book"]
+ path = external/book
+ url = https://github.com/PaddlePaddle/book
+[submodule "external/Anakin"]
+ path = external/Anakin
url = https://github.com/PaddlePaddle/Anakin
+[submodule "external/paddle-mobile"]
+ path = external/paddle-mobile
+ url = https://github.com/PaddlePaddle/paddle-mobile
+[submodule "external/models"]
+ path = external/models
+ url = https://github.com/PaddlePaddle/models
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000000000000000000000000000000000000..80180d4ae952ba673c2db1e3bad8e95db0e346d2
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,59 @@
+language: cpp
+cache:
+ bundler: true
+ directories:
+ - $HOME/.ccache
+ - $HOME/.cache/pip
+ - $HOME/docker
+ # - $TRAVIS_BUILD_DIR/external/
+ - $TRAVIS_BUILD_DIR/external/Paddle/build/third_party
+
+sudo: required
+dist: trusty
+services:
+ - docker
+os:
+ - linux
+env:
+ - JOB=doc
+ - JOB=lite_lib
+ - JOB=lite_lib2
+ - JOB=en_external_doc
+
+addons:
+ apt:
+ packages:
+ - git
+ - python
+ - python-pip
+ - python2.7-dev
+ ssh_known_hosts: 13.229.163.131
+before_install:
+ - sudo pip install pylint pytest astroid isort
+ # Load cached docker images
+ #- if [[ -d $HOME/docker ]]; then ls $HOME/docker/*.tar.gz | xargs -I {file} sh -c "zcat {file} | docker load"; fi
+
+script:
+ - |
+ if [ $JOB == "doc" ]; then scripts/deploy_docs.sh full
+ fi
+
+ if [ $JOB == "lite_lib" ]; then scripts/deploy_docs.sh pybind
+ fi
+
+ if [ $JOB == "lite_lib2" ]; then scripts/deploy_docs.sh proto
+ fi
+
+ if [ $JOB == "en_external_doc" ]; then scripts/deploy_en_external_docs.sh
+ fi
+
+#before_cache:
+# # Save tagged docker images
+# - >
+# mkdir -p $HOME/docker && docker images -a --filter='dangling=false' --format 'paddlepaddle/paddle:latest-dev {{.ID}}'
+# | xargs -n 2 -t sh -c 'test -e $HOME/docker/$1.tar.gz || docker save $0 | gzip -2 > $HOME/docker/$1.tar.gz'
+
+notifications:
+ email:
+ on_success: change
+ on_failure: always
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 36ee0a07dbab2ea7c9c1e7031ab3a871bcf1a008..0000000000000000000000000000000000000000
--- a/Makefile
+++ /dev/null
@@ -1,192 +0,0 @@
-# Makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS =
-SPHINXBUILD = sphinx-build
-PAPER =
-BUILDDIR = build
-
-# User-friendly check for sphinx-build
-ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
-$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
-endif
-
-# Internal variables.
-PAPEROPT_a4 = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
-# the i18n builder cannot share the environment and doctrees with the others
-I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
-
-.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
-
-help:
- @echo "Please use \`make ' where is one of"
- @echo " html to make standalone HTML files"
- @echo " dirhtml to make HTML files named index.html in directories"
- @echo " singlehtml to make a single large HTML file"
- @echo " pickle to make pickle files"
- @echo " json to make JSON files"
- @echo " htmlhelp to make HTML files and a HTML help project"
- @echo " qthelp to make HTML files and a qthelp project"
- @echo " applehelp to make an Apple Help Book"
- @echo " devhelp to make HTML files and a Devhelp project"
- @echo " epub to make an epub"
- @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
- @echo " latexpdf to make LaTeX files and run them through pdflatex"
- @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
- @echo " text to make text files"
- @echo " man to make manual pages"
- @echo " texinfo to make Texinfo files"
- @echo " info to make Texinfo files and run them through makeinfo"
- @echo " gettext to make PO message catalogs"
- @echo " changes to make an overview of all changed/added/deprecated items"
- @echo " xml to make Docutils-native XML files"
- @echo " pseudoxml to make pseudoxml-XML files for display purposes"
- @echo " linkcheck to check all external links for integrity"
- @echo " doctest to run all doctests embedded in the documentation (if enabled)"
- @echo " coverage to run coverage check of the documentation (if enabled)"
-
-clean:
- rm -rf $(BUILDDIR)/*
-
-html:
- $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
- @echo
- @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
-
-dirhtml:
- $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
- @echo
- @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
-
-singlehtml:
- $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
- @echo
- @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
-
-pickle:
- $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
- @echo
- @echo "Build finished; now you can process the pickle files."
-
-json:
- $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
- @echo
- @echo "Build finished; now you can process the JSON files."
-
-htmlhelp:
- $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
- @echo
- @echo "Build finished; now you can run HTML Help Workshop with the" \
- ".hhp project file in $(BUILDDIR)/htmlhelp."
-
-qthelp:
- $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
- @echo
- @echo "Build finished; now you can run "qcollectiongenerator" with the" \
- ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
- @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PaddlePaddleFluid.qhcp"
- @echo "To view the help file:"
- @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PaddlePaddleFluid.qhc"
-
-applehelp:
- $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
- @echo
- @echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
- @echo "N.B. You won't be able to view it unless you put it in" \
- "~/Library/Documentation/Help or install it in your application" \
- "bundle."
-
-devhelp:
- $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
- @echo
- @echo "Build finished."
- @echo "To view the help file:"
- @echo "# mkdir -p $$HOME/.local/share/devhelp/PaddlePaddleFluid"
- @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PaddlePaddleFluid"
- @echo "# devhelp"
-
-epub:
- $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
- @echo
- @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
-
-latex:
- $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
- @echo
- @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
- @echo "Run \`make' in that directory to run these through (pdf)latex" \
- "(use \`make latexpdf' here to do that automatically)."
-
-latexpdf:
- $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
- @echo "Running LaTeX files through pdflatex..."
- $(MAKE) -C $(BUILDDIR)/latex all-pdf
- @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-latexpdfja:
- $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
- @echo "Running LaTeX files through platex and dvipdfmx..."
- $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
- @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-text:
- $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
- @echo
- @echo "Build finished. The text files are in $(BUILDDIR)/text."
-
-man:
- $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
- @echo
- @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
-
-texinfo:
- $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
- @echo
- @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
- @echo "Run \`make' in that directory to run these through makeinfo" \
- "(use \`make info' here to do that automatically)."
-
-info:
- $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
- @echo "Running Texinfo files through makeinfo..."
- make -C $(BUILDDIR)/texinfo info
- @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
-
-gettext:
- $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
- @echo
- @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
-
-changes:
- $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
- @echo
- @echo "The overview file is in $(BUILDDIR)/changes."
-
-linkcheck:
- $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
- @echo
- @echo "Link check complete; look for any errors in the above output " \
- "or in $(BUILDDIR)/linkcheck/output.txt."
-
-doctest:
- $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
- @echo "Testing of doctests in the sources finished, look at the " \
- "results in $(BUILDDIR)/doctest/output.txt."
-
-coverage:
- $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
- @echo "Testing of coverage in the sources finished, look at the " \
- "results in $(BUILDDIR)/coverage/python.txt."
-
-xml:
- $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
- @echo
- @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
-
-pseudoxml:
- $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
- @echo
- @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/README.md b/README.md
index e744e193bd198d3f6f6b001f0a906c580173cd38..c3c671091254e84a798014fedeb211094c2b72c9 100644
--- a/README.md
+++ b/README.md
@@ -1,57 +1,25 @@
-# Fluid Documentation Skeleton
+# Introduction
+FluidDoc consolidates all the documentation related to Paddle. It supplies the content to PaddlePaddle.org via CI.
-## Build
+# Architecture
+FluidDoc includes Paddle, Book, Models, Mobile and Anakin as submodules under the `external` folder. All submodules should be put under `external` as standard practice.
-To build documentation, you need have a linux machine and have python2, virtualenv, gmake installed.
+FluidDoc then uses them as references to load up the documents. FluidDoc constructs the whole doc tree under the `FluidDoc/doc/fluid` folder. The entry points are `FluidDoc/doc/fluid/index_cn.rst` and `FluidDoc/doc/fluid/index_en.rst`.
-### Preparation
+When a release branch is pushed to GitHub, Travis CI starts automatically to compile the documents and deploy them to the server.
-You need to create a `virtualenv` instead of polute the global python library path
+## Note:
+FluidDoc needs the Paddle python module to compile the API documents. Unfortunately, compiling the Paddle python module takes longer than Travis CI permits, so the build usually fails because of the timeout. That's why there are three jobs on Travis; two of them build the libraries. Once the libraries are cached on Travis, the next build will be a lot faster.
-```bash
-virtualenv .env
-```
+## Preview with PPO
+To preview the documents constructed by FluidDoc, please follow the regular preview steps, but replace the path to Paddle with the path to FluidDoc:
+`./runserver --paddle `
-You can enter virtualenv by
+# Publish New release
+1. Check out a new release branch. The branch name should follow `release/`
+1. Update the documentation in the submodules or within FluidDoc
+1. Make sure all the submodules are ready for release. Paddle, book, model, mobile and Anakin should all have stable commits. Note: the Paddle repo should update the API RST files accordingly if Paddle changes the included modules/classes.
+1. Update the submodules under the `external` folder and commit the changes.
+1. Push the branch to GitHub. Travis CI will start several builds to publish the documents to the PaddlePaddle.org server.
+1. Please notify the PaddlePaddle.org team that the release content is ready. The PaddlePaddle.org team should enable the version and update the default version to the latest one. PaddlePaddle.org should also update the search index accordingly (until the search server is up).
-```bash
-source .env/bin/activate
-```
-
-You can exit virtualenv by
-
-```bash
-deactivate
-```
-
-### Install dependencies
-
-```bash
-# enter virtualenv
-source .env/bin/activate
-# install dependencies
-pip install -r requirements.txt
-```
-
-### Make HTML
-
-```bash
-# make clean # make clean to regenerate toctree. Just `make html` may have a cache.
-make html
-```
-and the html files will be generated to `build/html`. You can open `build/html/index.html` with your browser to see the documentation.
-
-## Edit
-
-### Edit documentation
-
-It is suggested to use `reStructuredText` because it is the only official markup language supportted by our documentation generating system, sphinx. `markdown` can also be used. However, since the `markdown` has so many dialects, there is no guarantee that the `markdown` source file can be rendered well.
-
-The `reStructuredText` cheatsheet is [here](http://docutils.sourceforge.net/docs/user/rst/quickref.html).
-
-
-### Edit structure
-
-The `sphinx` (our documentation generating system) uses `toctree` to organize documentation. `toctree` means `table of content tree`.
-
-Please see the [sphinx documentation](http://www.sphinx-doc.org/en/master/), especially [`toctree` directives](http://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html)
diff --git a/book b/book
deleted file mode 160000
index f4b5cc835ef77e55cfc001d51f8f77565475dc45..0000000000000000000000000000000000000000
--- a/book
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit f4b5cc835ef77e55cfc001d51f8f77565475dc45
diff --git a/build/.gitignore b/build/.gitignore
deleted file mode 100644
index 72e8ffc0db8aad71a934dd11e5968bd5109e54b4..0000000000000000000000000000000000000000
--- a/build/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*
diff --git a/doc/about/about_us.rst b/doc/about/about_us.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f67d8b8130030db8d7e7d10b30271a913bd6272a
--- /dev/null
+++ b/doc/about/about_us.rst
@@ -0,0 +1,53 @@
+=========
+关于我们
+=========
+
+什么是PaddlePaddle
+--------------------
+
+- PaddlePaddle是百度自主研发并开源的深度学习框架,它能够让开发者和企业安全、快速地实现自己的AI想法
+
+- 项目团队汇聚了全球顶级的深度学习科学家,致力于为开发者和企业提供最好的深度学习研发体验
+
+- 框架具有易学、易用、安全、高效四大特性,是最适合中国开发者和企业的深度学习工具
+
+PaddlePaddle的技术特色
+-------------------------
+
+- 新一代深度学习框架: PaddlePaddle是基于“深度学习编程语言”的新一代深度学习框架,在保证性能的同时,极大的提升了框架对模型的表达能力,能够描述任意潜在可能出现的模型
+
+- 对大规模计算更加友好:经过百度内多种大规模计算业务的打磨,PaddlePaddle在分布式计算上表现优异,基于EDL技术能够节约大量计算资源,同时也能支持大规模稀疏模型的训练
+
+- 提供可视化的深度学习:通过Visual DL可以帮助开发者方便的观测训练整体趋势、数据样本质量和中间结果、参数分布和变化趋势、以及模型的结构,帮助开发者更便捷的完成编程过程
+
+提供基于PaddlePaddle的教育体系
+--------------------------------
+
+- 深度学习课程:百度与中国市场顶级的教育、培训机构共同开发了深度学习精品课程以及学习教材,帮助开发者从零掌握深度学习
+
+- 深度学习实训:对于目的是科研和学习的用户,PaddlePaddle提供了无需安装、线上运行的开发环境,并提供算法、算力、数据支持
+
+- 线下培训:提供丰富、高质量的线下教育活动,如青年教师培训、线下实战营、沙龙等多种形式的培训和交流
+
+
+提供基于PaddlePaddle的AI服务
+------------------------------
+
+- EasyDL:可以帮助零算法基础的企业快速完成一个深度学习任务,只需少量的数据即可得到优质的模型
+
+- AI市场:提供标准化的AI 能力、产品的交易机制,帮助企业快速找到所需,有效开展AI业务
+
+- 深度学习竞赛: PaddlePaddle汇聚顶尖深度学习开发者,企业可以发布自己的商业问题,通过竞赛方式快速找到最优的解决方案
+
+你对PaddlePaddle有任何的问题都可以通过以下方式联系到我们
+-----------------------------------------------------------
+
+- 学习/使用问题:可以在 `PaddlePaddle开源社区 `_,以及 `PaddlePaddle中文社区 `_ 向我们反馈
+
+- 对PaddlePaddle框架发展的建议:可发送邮件至Paddle-better@baidu.com
+
+我们期待与你一起打造世界顶级深度学习框架,共同推动AI技术的进步
+
+
+
+PaddlePaddle团队
diff --git a/source/advanced_usage/benchmark.rst b/doc/fluid/advanced_usage/benchmark.rst
similarity index 100%
rename from source/advanced_usage/benchmark.rst
rename to doc/fluid/advanced_usage/benchmark.rst
diff --git a/doc/fluid/advanced_usage/deploy/anakin_arm_benchmark.md b/doc/fluid/advanced_usage/deploy/anakin_arm_benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..08ea379f81d16407ed5f82770b55a34bcf138da8
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/anakin_arm_benchmark.md
@@ -0,0 +1,56 @@
+# Anakin ARM 性能测试
+
+## 测试环境和参数:
++ 测试模型Mobilenetv1, mobilenetv2, mobilenet-ssd
++ 采用android ndk交叉编译,gcc 4.9,enable neon, ABI: armeabi-v7a with neon -mfloat-abi=softfp
++ 测试平台
+ - 荣耀v9(root): 处理器:麒麟960, 4 big cores in 2.36GHz, 4 little cores in 1.8GHz
+ - nubia z17:处理器:高通835, 4 big cores in 2.36GHz, 4 little cores in 1.9GHz
+ - 360 N5:处理器:高通653, 4 big cores in 1.8GHz, 4 little cores in 1.4GHz
++ 多线程:openmp
++ 时间:warmup10次,运行10次取均值
++ ncnn版本:来源于github的master branch中commits ID:307a77f04be29875f40d337cfff6df747df09de6(msg:convert LogisticRegressionOutput)版本
++ TFlite版本:来源于github的master branch中commits ID:65c05bc2ac19f51f7027e66350bc71652662125c(msg:Removed unneeded file copy that was causing failure in Pi builds)版本
+
+在BenchMark中本文将使用**`ncnn`**、**`TFlite`**和**`Anakin`**进行性能对比分析
+
+## BenchMark model
+
+> 注意在性能测试之前,请先将测试model通过[External Converter](#10003)转换为Anakin model
+> 对这些model,本文在ARM上进行多线程的单batch size测试。
+
+- [Mobilenet v1](#11) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载*
+- [Mobilenet v2](#22) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载*
+- [mobilenet-ssd](#33) *caffe model 可以在[这儿](https://github.com/chuanqi305/MobileNet-SSD)下载*
+
+### mobilenetv1
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |麒麟960|107.7ms|61.1ms|38.2ms|152.8ms|85.2ms|51.9ms|152.6ms|nan|nan|
+ |高通835|105.7ms|63.1ms|~~46.8ms~~|152.7ms|87.0ms|~~92.7ms~~|146.9ms|nan|nan|
+ |高通653|120.3ms|64.2ms|46.6ms|202.5ms|117.6ms|84.8ms|158.6ms|nan|nan|
+
+### mobilenetv2
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |麒麟960|93.1ms|53.9ms|34.8ms|144.4ms|84.3ms|55.3ms|100.6ms|nan|nan|
+ |高通835|93.0ms|55.6ms|41.1ms|139.1ms|88.4ms|58.1ms|95.2ms|nan|nan|
+ |高通653|106.6ms|64.2ms|48.0ms|199.9ms|125.1ms|98.9ms|108.5ms|nan|nan|
+
+### mobilenet-ssd
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |麒麟960|213.9ms|120.5ms|74.5ms|307.9ms|166.5ms|104.2ms|nan|nan|nan|
+ |高通835|213.0ms|125.7ms|~~98.4ms~~|292.9ms|177.9ms|~~167.8ms~~|nan|nan|nan|
+ |高通653|236.0ms|129.6ms|96.0ms|377.7ms|228.9ms|165.0ms|nan|nan|nan
+
+## How to run those Benchmark models?
+
+1. 首先, 使用[External Converter](../docs/Manual/Converter_en.md)对caffe model 进行转换
+2. 然后将转换后的Anakin model和编译好的benchmark_arm 二进制文件通过'adb push'命令上传至测试机
+3. 接着在测试机含有Anakin model的目录中运行'./benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1' 命令
+4. 最后,终端显示器上将会打印该模型的运行时间
+5. 其中运行命令的参数个数和含义可以通过运行'./benchmark_arm'看到
diff --git a/doc/fluid/advanced_usage/deploy/anakin_example.md b/doc/fluid/advanced_usage/deploy/anakin_example.md
new file mode 100644
index 0000000000000000000000000000000000000000..e6b9e18fe2d64b3fda6382bb23a6a818a3e17fbe
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/anakin_example.md
@@ -0,0 +1,28 @@
+# Example
+Anakin目前只支持NCHW的格式
+示例文件在test/framework/net下
+
+## 在NV的GPU上运行CNN模型
+示例文件为example_nv_cnn_net.cpp,整体流程如下:
+- 将模型的path设置为anakin模型的路径,初始化NV平台的图对象。 anakin模型可以通过转换器转化caffe或fluid的模型得到
+- 根据模型设置网络图的输入尺寸,进行图优化
+- 根据优化后的网络图初始化网络执行器
+- 取出网络的输入tensor,将数据拷贝到输入tensor
+- 运行推导
+- 取出网络的输出tensor
+
+以NV平台为例演示Anakin框架的使用方法,注意编译时需要打开GPU编译开关
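+
+上面的流程串起来大致如下(仅为骨架示意,模板参数、输入/输出结点名和错误处理请以 example_nv_cnn_net.cpp 的实际代码为准):
+
+```c++
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+graph->load("path/to/model.anakin.bin");        // anakin模型由转换器转化caffe或fluid模型得到
+graph->Reshape("input_0", {1, 3, 224, 224});    // 根据模型设置网络图的输入尺寸
+graph->Optimize();                              // 图优化
+
+Net<NV, AK_FLOAT, Precision::FP32> net(*graph); // 初始化网络执行器
+auto* d_in = net.get_in("input_0");             // 取出输入tensor
+// ... 将输入数据拷贝到 d_in ...
+net.prediction();                               // 运行推导
+auto* d_out = net.get_out("output_0");          // 取出输出tensor(结点名以dash board为准)
+```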
+
+## 在X86上运行RNN模型
+示例文件为example_x86_rnn_net.cpp
+整体流程与在NV的GPU上运行CNN模型相似,不同之处如下:
+- 使用X86标识初始化图对象和网络执行器对象
+- rnn模型的输入尺寸是可变的,初始化图时的输入维度是维度的最大值,输入维度N代表总的词的个数。还需要设置输入tensor的seq_offset来标示这些词是如何划分为句子的,如{0,5,12}表示共有12个词,其中第0到第4个词是第一句话,第5到第11个词是第二句话
+
+以X86平台为例演示Anakin框架的使用方法,注意编译时需要打开X86编译开关
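+
+针对上面提到的 seq_offset,给出一个设置示意(这里假设输入tensor提供 set_seq_offset 一类的接口,具体接口名与用法请以 example_x86_rnn_net.cpp 为准):
+
+```c++
+auto* d_in = net.get_in("input_0");
+// 共12个词,划分为两句话:第0到第4个词为第一句,第5到第11个词为第二句
+std::vector<int> seq_offset{0, 5, 12};
+d_in->set_seq_offset(seq_offset);
+// ... 再填充这12个词对应的输入数据 ...
+```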
+
+## 在NV的GPU上使用Anakin的线程池运行CNN模型
+示例文件为example_nv_cnn_net_multi_thread.cpp ,示例使用worker的同步预测接口
+整体流程与在NV的GPU上运行CNN模型相似,不同之处如下:
+- 用模型地址和线程池大小初始化worker对象
+- 将输入tensor注入任务队列,获得输出tensor
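+
+下面是一个极简示意(Worker 的模板参数与接口签名以 example_nv_cnn_net_multi_thread.cpp 为准,这里只表达"模型路径 + 线程池大小初始化、入队取结果"的用法):
+
+```c++
+Worker<NV, AK_FLOAT, Precision::FP32> workers("path/to/model.anakin.bin", 4 /*线程池大小*/);
+workers.launch();                                // 启动线程池
+// inputs: 事先准备好的一组host端输入tensor
+auto outputs = workers.sync_prediction(inputs);  // 同步预测接口:注入任务队列并等待输出tensor
+```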
diff --git a/doc/fluid/advanced_usage/deploy/anakin_gpu_benchmark.md b/doc/fluid/advanced_usage/deploy/anakin_gpu_benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..667f9396f1169a0d891b9e6b0e912aa5527ab0b8
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/anakin_gpu_benchmark.md
@@ -0,0 +1,170 @@
+# Anakin GPU Benchmark
+
+## Machine:
+
+> CPU: `12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz`
+> GPU: `Tesla P4`
+> cuDNN: `v7`
+
+
+## Counterpart of anakin :
+
+The counterpart of **`Anakin`** is the acknowledged high-performance inference engine **`NVIDIA TensorRT 3`**. For the models which TensorRT 3 doesn't support, we use custom plugins.
+
+## Benchmark Model
+
+The following convolutional neural networks are tested with both `Anakin` and `TensorRT 3`.
+ You can use a pretrained caffe model or a model trained by yourself.
+
+> Please note that you should transform caffe model or others into anakin model with the help of [`external converter ->`](../docs/Manual/Converter_en.md)
+
+
+- [Vgg16](#1) *caffe model can be found [here->](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)*
+- [Yolo](#2) *caffe model can be found [here->](https://github.com/hojel/caffe-yolo-model)*
+- [Resnet50](#3) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
+- [Resnet101](#4) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
+- [Mobilenet v1](#5) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [Mobilenet v2](#6) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [RNN](#7) *not support yet*
+
+We tested them on single-GPU with single-thread.
+
+### VGG16
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 8.8690 | 8.2815 |
+| 2 | 15.5344 | 13.9116 |
+| 4 | 26.6000 | 21.8747 |
+| 8 | 49.8279 | 40.4076 |
+| 32 | 188.6270 | 163.7660 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 963 | 997 |
+| 2 | 965 | 1039 |
+| 4 | 991 | 1115 |
+| 8 | 1067 | 1269 |
+| 32 | 1715 | 2193 |
+
+
+### Yolo
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 16.4596| 15.2124 |
+| 2 | 26.6347| 25.0442 |
+| 4 | 43.3695| 43.5017 |
+| 8 | 80.9139 | 80.9880 |
+| 32 | 293.8080| 310.8810 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 1569 | 1775 |
+| 2 | 1649 | 1815 |
+| 4 | 1709 | 1887 |
+| 8 | 1731 | 2031 |
+| 32 | 2253 | 2907 |
+
+### Resnet50
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 4.2459 | 4.1061 |
+| 2 | 6.2627 | 6.5159 |
+| 4 | 10.1277 | 11.3327 |
+| 8 | 17.8209 | 20.6680 |
+| 32 | 65.8582 | 77.8858 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 531 | 503 |
+| 2 | 543 | 517 |
+| 4 | 583 | 541 |
+| 8 | 611 | 589 |
+| 32 | 809 | 879 |
+
+### Resnet101
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 7.5562 | 7.0837 |
+| 2 | 11.6023 | 11.4079 |
+| 4 | 18.3650 | 20.0493 |
+| 8 | 32.7632 | 36.0648 |
+| 32 | 123.2550 | 135.4880 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 701 | 683 |
+| 2 | 713 | 697 |
+| 4 | 793 | 721 |
+| 8 | 819 | 769 |
+| 32 | 1043 | 1059 |
+
+### MobileNet V1
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 45.5156 | 1.3947 |
+| 2 | 46.5585 | 2.5483 |
+| 4 | 48.4242 | 4.3404 |
+| 8 | 52.7957 | 8.1513 |
+| 32 | 83.2519 | 31.3178 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 329 | 283 |
+| 2 | 345 | 289 |
+| 4 | 371 | 299 |
+| 8 | 393 | 319 |
+| 32 | 531 | 433 |
+
+### MobileNet V2
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 65.6861 | 2.9842 |
+| 2 | 66.6814 | 4.7472 |
+| 4 | 69.7114 | 7.4163 |
+| 8 | 76.1092 | 12.8779 |
+| 32 | 124.9810 | 47.2142 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 341 | 293 |
+| 2 | 353 | 301 |
+| 4 | 385 | 319 |
+| 8 | 421 | 351 |
+| 32 | 637 | 551 |
+
+## How to run those Benchmark models?
+
+> 1. At first, you should parse the caffe model with [`external converter`](https://github.com/PaddlePaddle/Anakin/blob/b95f31e19993a192e7428b4fcf852b9fe9860e5f/docs/Manual/Converter_en.md).
+> 2. Switch to the *source_root/benchmark/CNN* directory. Use 'mkdir ./models' to create ./models and put anakin models into this directory.
+> 3. Use command 'sh run.sh', we will create files in logs to save model log with different batch size. Finally, model latency summary will be displayed on the screen.
+> 4. If you want to get more detailed information with op time, you can modify CMakeLists.txt with setting `ENABLE_OP_TIMER` to `YES`, then recompile and run. You will find detailed information in model log file.
diff --git a/doc/fluid/advanced_usage/deploy/anakin_tutorial.md b/doc/fluid/advanced_usage/deploy/anakin_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..5efbc89abd469871b318c306e8cb03dd95f0c85b
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/anakin_tutorial.md
@@ -0,0 +1,639 @@
+# Anakin 使用教程 ##
+
+本教程将会简略的介绍Anakin的工作原理,一些基本的Anakin API,以及如何调用这些API。
+
+## 内容 ###
+
+- [Anakin的工作原理](#principle)
+- [Anakin APIs](#api)
+- [示例代码](#example)
+
+## Anakin的工作原理 ###
+
+
+
+用Anakin来进行前向计算主要分为三个步骤:
+
+- 将外部模型通过[Anakin Parser](Converter_ch.md)解析为Anakin模型
+ 在使用Anakin之前,用户必须将所有其他模型转换成Anakin模型,我们提供了转换脚本,用户可通过[Anakin Parser](Converter_ch.md)进行模型转换。
+- 生成Anakin计算图
+ 加载Anakin模型生成原始计算图,然后需要对原始计算图进行优化。你只需要调用相应的API优化即可。
+- 执行计算图
+ Anakin会选择不同硬件平台执行计算图。
+
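+上述三个步骤对应的最小调用序列大致如下(仅为示意,省略了模板参数的具体含义与错误处理,完整示例见文末[示例代码](#example)):
+
+```c++
+// 1. 加载由Anakin Parser转换得到的Anakin模型,生成原始计算图
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+graph->load("path/to/model.anakin.bin");
+
+// 2. 对原始计算图进行优化(第一次加载必须优化)
+graph->Optimize();
+
+// 3. 用优化后的图初始化执行器,并执行前向计算
+Net<NV, AK_FLOAT, Precision::FP32> executor(*graph);
+executor.prediction();
+```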
+
+## Anakin APIs ###
+### Tensor ####
+
+`Tensor`提供基础的数据操作和管理,为ops提供统一的数据接口。`Tensor`包含以下几个属性:
+
+- Buffer
+ 数据存储区
+- Shape
+ 数据的维度信息
+- Event
+ 用于异步计算的同步
+
+ `Tensor` 类包含三个`Shape`对象, 分别是`_shape`, `_valid_shape`和 `offset`。 `_shape`为`tensor`真正空间信息,`_valid_shape`表示当前`tensor`使用的空间信息, `_offset`表示当前`tensor`数据指针相对于真正数据空间的信息。 `Tensor`的不同维度分别与数学中的向量、矩阵等相对应,如下表所示。
+
+
+Dimensions | Math entity |
+ :----: | :----:
+1 | vector
+2 | matrix
+3 | 3-tensor
+n | n-tensor
+
+#### 声明tensor对象
+
+`Tensor`接受三个模板参数:
+
+
+```c++
+ template<typename TargetType, DataType datatype, typename LayOutType = NCHW>
+ class Tensor .../* Inherit other class */{
+ //some implements
+ ...
+ };
+```
+
+TargetType是平台类型,如X86,GPU等等,在Anakin内部有相应的标识与之对应;datatype是普通的数据类型,在Anakin内部也有相应的标志与之对应;[LayOutType](#layout)是数据分布类型,如batch x channel x height x width [NxCxHxW], 在Anakin内部用一个struct来标识。 Anakin中数据类型与基本数据类型的对应如下:
+
+1. TargetType
+
+ Anakin TargetType | platform
+ :----: | :----:|
+ NV | NVIDIA GPU
+ ARM | ARM
+ AMD | AMD GPU
+ X86 | X86
+ NVHX86 | NVIDIA GPU with Pinned Memory
+
+2. DataType
+
+Anakin DataType | C++ | Description
+:---: | :---: | :---: |
+AK_HALF | short | fp16
+AK_FLOAT | float | fp32
+AK_DOUBLE | double | fp64
+AK_INT8 | char | int8
+AK_INT16 | short | int16
+AK_INT32 | int | int32
+AK_INT64 | long | int64
+AK_UINT8 | unsigned char | uint8
+AK_UINT16 | unsigned short | uint8
+AK_UINT32 | unsigned int | uint32
+AK_STRING | std::string | /
+AK_BOOL | bool | /
+AK_SHAPE | / | Anakin Shape
+AK_TENSOR | / | Anakin Tensor
+
+
+3. LayOutType
+
+Anakin LayOutType ( Tensor LayOut ) | Tensor Dimention | Tensor Support | Op Support
+:---: | :---: | :---: | :---: |
+W | 1-D | YES | NO
+HW | 2-D | YES | NO
+WH | 2-D | YES | NO
+NW | 2-D | YES | YES
+NHW | 3-D | YES |YES
+NCHW ( default ) | 4-D | YES | YES
+NHWC | 4-D | YES | NO
+NCHW_C4 | 5-D | YES | YES
+
+
+理论上,Anakin支持申明1维以上的tensor,但是对于Anakin中的Op来说,只支持NW、NHW、NCHW、NCHW_C4这四种LayOut,其中NCHW是默认的LayOutType,NCHW_C4是专门针对于int8这种数据类型的。
+
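+结合上面三张表,一个具体的模板实例化示意如下(仅为说明参数取值,实际用法见下面的例子):
+
+```c++
+Tensor<NV, AK_FLOAT, NCHW> gpu_fp32_tensor;     // NVIDIA GPU上的fp32 tensor,默认NCHW布局
+Tensor<X86, AK_INT8, NCHW_C4> cpu_int8_tensor;  // 针对int8数据类型的NCHW_C4布局
+```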
+
+例子
+
+> 下面的代码将展示如何使用tensor, 我们建议先看看这些示例。
+
+> 要想获得更多关于tensor的信息, 请参考 *soure_path/core/tensor.h*
+
+> 1. 使用shape对象初始化tensor
+``` c++
+ //create a null tensor. A null tensor holds for nothing.
+ //tensor's buffer is resident at CPU and its datatype is AK_FLOAT.
+ //tensor's Layout is NCHW(default)
+ Tensor<X86, AK_FLOAT> mytensor;
+
+ //1. using shape object to create a tensor.
+ Shape shape1(NUM); //1-D shape. NUM is the number of dimension.
+ Tensor<X86, AK_FLOAT, W> mytensor1(shape1); //1-D tensor.
+
+ // A 4-D shape
+ Shape shape2(N, C, H, W); // batch x channel x height x width
+```
+
+>`注意:Shape的维度必须和tensor的`[LayoutType](#layout)`相同,比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW,否则会出错。如下列代码所示`
+
+
+```c++
+ // A 4-D tensor.
+ Tensor<X86, AK_FLOAT> mytensor2(shape2); //right
+
+ //A 4-D tensor which is resident at GPU and its datatype is AK_INT8
+ Tensor<NV, AK_INT8> mytensor3(shape2); //right
+
+ Tensor<X86, AK_FLOAT, W> mytensor4(shape2); //wrong!! shape's dimension must be equal to tensor's Layout.
+ Tensor<NV, AK_FLOAT, W> mytensor5(shape2); //wrong!!!!
+
+```
+
+> 2. 使用现有的数据和shape初始化tensor
+
+```c++
+
+ /**
+ * A construtor of Tensor.
+ * data_ptr is a pointer to any data type of data
+ * TargetType is type of a platform [Anakin TargetType]
+ * id : device id
+ * shape: a Anakin shape
+ */
+ Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape);
+
+ //using existing data feed to a tensor
+ Tensor mytensor(data_ptr, TargetType, device_id, shape); //shape must has dimention (N, C, H, W).
+
+```
+
+> 3. 使用tensor初始化tensor
+
+```c++
+ Tensor tensor(exist_tensor);
+```
+
+
+> 提示: 你可以用` typedef Tensor<X86, AK_FLOAT, NCHW> Tensor4d_X86 `方便定义tensor
+
+
+#### 填充tensor数据区
+
+
+填充数据区得看你申明tensor的方式, 下面展示了如何填充tensor的数据区。
+
+```c++
+首先来看看tensor的四种声明方式:
+
+1. Tensor mytensor;
+2. Tensor mytensor1(shape1);
+3. Tensor mytensor(data_ptr, TargetType, device_id, shape);
+4. Tensor tensor(exist_tensor);
+
+
+相关的声明方式的数据填充方法如下:
+
+1:声明一个空的tensor,此时没有为其分配内存,所以,我们需要手动的为其分配内存。
+
+ //parama shape
+ mytensor.re_alloc(Shape shape);
+
+ //Get writable pointer to mytensor.
+ //parama index (int): where you start to write.
+ //Dtype is your data type such int, float or double.
+ Dtype *p = mytensor.mutable_data(index/*=0*/);
+ //write data to mytensor
+ for(int i = 0; i < mytensor.size(); i++){
+ p[i] = 1.0f;
+ }
+ //do something ...
+
+2: 这种声明方式会自动分配内存
+
+ //Get writable pointer to mytensor.
+ //parama index (int): where you start to write.
+ //Dtype is your data type such int, float or double.
+ Dtype *p = mytensor1.mutable_data(index/*=0*/);
+ //write data to mytensor
+ for(int i = 0; i < mytensor.size(); i++){
+ p[i] = 1.0f;
+ }
+ //do something ...
+
+
+3:在该种声明方式中,我们仍不需要手动为其分配内存。但在构造函数内部是否为其分配内存,得依情况而定。如果data_ptr和申明的
+tensor都在同一个目标平台上,那么该tensor就会与data_ptr共享内存空间,相反,如果他们不在同一个平台上(如data_ptr在X86上,而
+tensor在GPU上),那么此时tensor就会开辟一个新的内存空间,并将data_ptr所指向的数据拷贝到tensor的buffer中。
+
+ //Get writable pointer to mytensor.
+ //parama index (int): where you start to write.
+ //Dtype is your data type such int, float or double.
+ Dtype *p = mytensor.mutable_data(index/*=0*/);
+ //write data to mytensor
+ for(int i = 0; i < mytensor.size(); i++){
+ p[i] = 1.0f;
+ }
+ //do something ...
+
+4:该种方式仍不需要手动分配内存
+
+ //Get writable pointer to mytensor.
+ //parama index (int): where you start to write.
+ //Dtype is your data type such int, float or double.
+ Dtype *p = mytensor.mutable_data(index/*=0*/);
+ //write data to mytensor
+ for(int i = 0; i < mytensor.size(); i++){
+ p[i] = 1.0f;
+ }
+ //do something ...
+
+
+另外,你还可以获取一个tensor的可读指针,示例如下:
+ //Get read-only pointer to mytensor.
+ //parama index (int): where you start to read.
+ //Dtype is your data type such int, float or double.
+ Dtype *p = mytensor.data(index/*=0*/);
+ //do something ...
+```
+
+如果想更详细的了解tensor,请查阅*soure_path/saber/core/tensor.h*
+
+#### 获取tensor的shape
+
+```c++
+//some declarations
+// ...
+Shape shape = mytensor.shape();
+
+//Get the first dimension size of the tensor, if it has one.
+int d1 = shape[0];
+
+//Get the second dimension size of the tensor, if it has one.
+int d2 = shape[1];
+
+...
+
+//Get the n-th dimension size of the tensor, if it has one.
+int dn = shape[n-1];
+
+
+//Get a tensor's number of dimensions.
+int dims = mytensor.dims();
+
+//Get the size of the tensor.
+//size = d1 x d2 x ... x dn.
+int size = mytensor.size();
+
+//Get the size of the tensor in the interval [Di, Dj),
+// from the i-th dimension to the j-th dimension, not including the j-th dimension,
+// which means di x (di+1) x ... x (dj - 1).
+int interval_size = mytensor.count(start, end);
+```
+
+#### 设置tensor的shape
+
+我们可以用tensor的成员函数set_shape来设置tensor的shape。 下面是set_shape的定义
+
+
+```c++
+/**
+ * \brief set a tensor's shape
+ * \param valid_shape [a Shape object]
+ * \param shape [a Shape object]
+ * \param offset [a Shape object]
+ * \return the status of this operation, that means whether it success * or not.
+ */
+SaberStatus set_shape(Shape valid_shape, Shape shape = Shape::zero(TensorAPI::layout_dims::value), Shape offset = Shape::minusone(TensorAPI::layout_dims::value));
+```
+
+这个成员函数只设置tensor的shape。这些shape对象(valid_shape, shape, offset)的[LayOutType](#layout)必须和当前的tensor的相应三个shape对象的LayOutType相同,如果不同就会出错,返回SaberInvalidValue。 如果相同,那么将成功设置tensor的shape。
+
+```c++
+
+// some declarations
+// ...
+//valid_shape, shape , offset are Shape object;
+//All these Shape object's LayOutType must be equal to mytensor's.
+mytensor.set_shape(valid_shape, shape, offset);
+
+```
+
+#### 重置 tensor的shape
+
+```c++
+//some declarations
+Shape shape, valid_shape, offset;
+
+//do some initializations
+...
+mytensor.reshape(valid_shape, shape, offset);
+```
+
+注意: Reshape操作仍然需要shape的[LayOutType](#layout) 与tensor的相同
+
+
+### Graph ###
+
+`Graph`类负责加载Anakin模型生成计算图、对图进行优化、存储模型等操作。
+
+#### 图的声明
+
+与`Tensor`一样,graph也接受三个模板参数。
+
+```c++
+
+template<typename TargetType, DataType Dtype, Precision Ptype>
+class Graph ... /* inherit other class*/{
+
+ //some implements
+ ...
+
+};
+```
+
+前面已经介绍过[TargetType](#target)和[DataType](#datatype)是Anakin内部自定义数据类型。[TargetType](#target)表示平台类型 (如NV、X86), [DataType](#datatype)是Anakin基本数据类型与C++/C中的基本数据类型相对应。 [Precision](#precision)为op所支持的精度类型, 稍后我们在介绍它。
+
+
+```c++
+
+//Create a empty graph object.
+Graph graph = Graph tmp();
+
+//Create a pointer to a empty graph.
+Graph *graph = new Graph();
+
+//Create a pointer to a empty graph.
+auto graph = new Graph();
+
+```
+
+#### 加载 Anakin 模型
+
+```c++
+//some declarations
+...
+auto graph = new Graph();
+std::string model_path = "the/path/to/where/your/models/are";
+const char *model_path1 = "the/path/to/where/your/models/are";
+
+//Loading Anakin model to generate a compute graph.
+auto status = graph->load(model_path);
+
+//Or this way.
+auto status = graph->load(model_path1);
+//Check whether load operation success.
+if(!status){
+ std::cout << "error" << endl;
+ //do something...
+}
+
+```
+
+#### 优化计算图
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+//According to the ops of loaded graph, optimize compute graph.
+graph->Optimize();
+
+```
+
+> 注意: 第一次加载原始图,必须要优化。
+
+#### 保存模型
+
+你可以在任何时候保存模型, 特别的, 你可以保存一个优化的模型,这样,下次再加载模型时,就不必进行优化操作。
+
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+// save a model
+//save_model_path: the path to where your model is.
+auto status = graph->save(save_model_path);
+
+//Checking
+if(!status){
+ cout << "error" << endl;
+ //do somethin...
+}
+```
+
+#### 重新设置计算图里的tensor的shape
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+std::vector<int> shape{10, 256, 256, 10};
+//input_name : std::string.
+//Reshape a tensor named input_name.
+graph->Reshape(input_name, shape);//Note: shape is a vector, not a Shape object.
+```
+
+#### 设置 batch size
+
+`Graph` 支持重新设置batch size的大小。
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+//input_name : std::string.
+//Reset a tensor named input_name.
+int new_batch_size = 4;
+graph->ResetBatchSize(input_name, new_batch_size);
+```
+
+### Net ###
+
+
+`Net` 是计算图的执行器。你可以通过Net对象获得输入和输出
+#### Creating a graph executor
+
+`Net`接受四个模板参数。
+
+
+```c++
+template<typename TargetType, DataType Dtype, Precision PType, OpRunType RunType = OpRunType::ASYNC>
+class Net{
+ //some implements
+ ...
+
+};
+```
+由于有些Op可能支持多种精度,我们可以通过Precision来指定。OpRunType表示同步或异步类型,异步是默认类型。OpRunType::SYNC表示同步,在GPU上只有单个流;OpRunType::ASYNC表示异步,在GPU上有多个流并以异步方式执行。实际上,Precision和OpRunType都是enum class, 详细设计请参考*source_root/framework/core/types.h*.
+
+
+1. Precision
+
+Precision | Op support
+:---: | :---:
+Precision::INT4 | NO
+Precision::INT8 | NO
+Precision::FP16 | NO
+Precision::FP32 | YES
+Precision::FP64 | NO
+
+现在Op的精度只支持FP32, 但在将来我们会支持剩下的Precision.
+
+
+
+2. OpRunType
+
+OpRunType | Sync/Aync |Description
+:---: | :---: | :---:
+OpRunType::SYNC | Synchronization | single-stream on GPU
+OpRunType::ASYNC | Asynchronization | multi-stream on GPU
+
+用graph对象创建一个执行器。
+```c++
+//some declarations
+...
+//Create a pointer to a graph.
+auto graph = new Graph();
+//do something...
+...
+
+//create a executor
+Net executor(*graph);
+
+```
+
+#### 获取输入输出tensor
+
+
+获取输入输出tensor,并填充输入tensor的buffer。如果想要获取输入和输出tensor,那么必须指定输入的名字,如"input_0", "input_1", "input_2", ..., 必须传入如上字符串才能够获得输入tensor。另外,如果想知道input_i对应哪个输入,你需要去dash board查看,如何使用dash board请看[Anakin Parser](Converter_ch.md)。请看如下示例代码
+
+```c++
+//some declaratinos
+...
+
+//create a executor
+//TargetType is NV [NVIDIA GPU]
+Net executor(*graph);
+
+//Get the first input tensor.
+//The following tensors(tensor_in0, tensor_in2 ...) are resident at GPU.
+//Note: Member function get_in returns an pointer to tensor.
+Tensor* tensor_in0 = executor.get_in("input_0");
+
+//If you have multiple input tensors
+//You just type this code below.
+Tensor* tensor_in1 = executor.get_in("input_1");
+...
+auto tensor_inn = executor.get_in("input_n");
+```
+
+当得到输入tensor之后,就可以填充它的数据区了。
+
+```c++
+//This tensor is resident at GPU.
+auto tensor_d_in = executor.get_in("input_0");
+
+//If we want to feed above tensor, we must feed the tensor which is resident at host. And then copy the host tensor to the device's one.
+
+//using Tensor4d = Tensor<X86, AK_FLOAT, NCHW>;
+Tensor4d tensor_h_in; //host tensor;
+//Tensor<X86, AK_FLOAT, NCHW> tensor_h_in;
+
+//Allocate memory for host tensor.
+tensor_h_in.re_alloc(tensor_d_in->valid_shape());
+//Get a writable pointer to tensor.
+float *h_data = tensor_h_in.mutable_data();
+
+//Feed your tensor.
+/** example
+for(int i = 0; i < tensor_h_in.size(); i++){
+ h_data[i] = 1.0f;
+}
+*/
+//Copy host tensor's data to device tensor.
+tensor_d_in->copy_from(tensor_h_in);
+
+// And then
+```
+
+
+类似的,我们可以利用成员函数get_out来获得输出tensor。但与获得输入tensor不同的是, 我们需要指定输出tensor结点的名字,这个可以从dash board中看到,请从[Anakin Parser](Converter_ch.md)中查看dash board的使用方法。假如有个输出结点叫pred_out, 那么我们可以通过如下代码获得相应的输出tensor:
+```c++
+//Note: this tensor are resident at GPU.
+Tensor* tensor_out_d = executor.get_out("pred_out");
+
+```
+
+
+#### Executing graph
+
+
+当一切准备就绪后,我们就可以执行真正的计算了!
+```c++
+executor.prediction();
+```
+
+## 示例代码 ##
+
+下面的例子展示了如何调用Anakin。
+
+在这儿之前, 请确保你已经有了Anakin模型。如果还没有,那么请使用[Anakin Parser](Converter_ch.md)转换你的模型。
+
+### Single-thread
+
+单线程例子在 *source_root/test/framework/net/net_exec_test.cpp*
+
+```c++
+
+std::string model_path = "your_Anakin_models/xxxxx.anakin.bin";
+// Create an empty graph object.
+auto graph = new Graph();
+// Load Anakin model.
+auto status = graph->load(model_path);
+if(!status ) {
+ LOG(FATAL) << " [ERROR] " << status.info();
+}
+// Reshape
+graph->Reshape("input_0", {10, 384, 960, 10});
+// You must optimize graph for the first time.
+graph->Optimize();
+// Create a executer.
+Net net_executer(*graph);
+
+//Get your input tensors through some specific string such as "input_0", "input_1", and
+//so on.
+//And then, feed the input tensor.
+//If you don't know Which input do these specific string ("input_0", "input_1") correspond with, you can launch dash board to find out.
+auto d_tensor_in_p = net_executer.get_in("input_0");
+Tensor4d h_tensor_in;
+auto valid_shape_in = d_tensor_in_p->valid_shape();
+h_tensor_in.re_alloc(valid_shape_in);
+float* h_data = h_tensor_in.mutable_data();
+for (int i = 0; i < h_tensor_in.size(); i++) {
+    h_data[i] = 1.0f; // feed the host tensor with your real input data here
+}
+d_tensor_in_p->copy_from(h_tensor_in);
+
+//Do inference.
+net_executer.prediction();
+
+//Get result tensor through the name of output node.
+//And also, you need to check the dash board again to find out how many output nodes there are and remember their names.
+
+//For example, you've got an output node named obj_pred_out.
+//Then, you can get an output tensor.
+auto d_tensor_out_0_p = net_executer.get_out("obj_pred_out"); //get_out returns a pointer to output tensor.
+auto d_tensor_out_1_p = net_executer.get_out("lc_pred_out"); //get_out returns a pointer to output tensor.
+//......
+// do something else ...
+//...
+//save model.
+//You might not optimize the graph when you load the saved model again.
+std::string save_model_path = model_path + std::string(".saved");
+status = graph->save(save_model_path);
+if (!status ) {
+ LOG(FATAL) << " [ERROR] " << status.info();
+}
+
+```
diff --git a/doc/fluid/advanced_usage/deploy/build_and_install_lib_cn.rst b/doc/fluid/advanced_usage/deploy/build_and_install_lib_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3884284ea020fe94ed9c03ec84c856ee44aa8c3f
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/build_and_install_lib_cn.rst
@@ -0,0 +1,99 @@
+.. _install_or_build_cpp_inference_lib:
+
+安装与编译C++预测库
+===========================
+
+直接下载安装
+-------------
+
+====================== ========================================
+版本说明 C++预测库
+====================== ========================================
+cpu_avx_mkl `fluid.tgz `_
+cpu_avx_openblas `fluid.tgz `_
+cpu_noavx_openblas `fluid.tgz `_
+cuda7.5_cudnn5_avx_mkl `fluid.tgz `_
+cuda8.0_cudnn5_avx_mkl `fluid.tgz `_
+cuda8.0_cudnn7_avx_mkl `fluid.tgz `_
+cuda9.0_cudnn7_avx_mkl `fluid.tgz `_
+====================== ========================================
+
+从源码编译
+----------
+用户也可以从 PaddlePaddle 核心代码编译C++预测库,只需在编译时配制下面这些编译选项:
+
+================= =========
+选项 值
+================= =========
+CMAKE_BUILD_TYPE Release
+FLUID_INSTALL_DIR 安装路径
+WITH_FLUID_ONLY ON(推荐)
+WITH_SWIG_PY OFF(推荐)
+WITH_PYTHON OFF(推荐)
+WITH_GPU ON/OFF
+WITH_MKL ON/OFF
+================= =========
+
+建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。
+
+下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径):
+
+ .. code-block:: bash
+
+ pip install paddlepaddle-gpu
+ PADDLE_ROOT=/path/of/capi
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ cd Paddle
+ mkdir build
+ cd build
+ cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DWITH_FLUID_ONLY=ON \
+ -DWITH_SWIG_PY=OFF \
+ -DWITH_PYTHON=OFF \
+ -DWITH_MKL=OFF \
+ -DWITH_GPU=OFF \
+ ..
+ make
+ make inference_lib_dist
+
+成功编译后,使用C++预测库所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件;(3)版本信息与编译选项信息)
+均会存放于PADDLE_ROOT目录中。目录结构如下:
+
+ .. code-block:: text
+
+ PaddleRoot/
+ ├── CMakeCache.txt
+ ├── paddle
+ │ └── fluid
+ │ ├── framework
+ │ ├── inference
+ │ ├── memory
+ │ ├── platform
+ │ ├── pybind
+ │ └── string
+ ├── third_party
+ │ ├── boost
+ │ │ └── boost
+ │ ├── eigen3
+ │ │ ├── Eigen
+ │ │ └── unsupported
+ │ └── install
+ │ ├── gflags
+ │ ├── glog
+ │ ├── mklml
+ │ ├── protobuf
+ │ ├── snappy
+ │ ├── snappystream
+ │ └── zlib
+ └── version.txt
+
+version.txt 中记录了该预测库的版本信息,包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号,如:
+
+ .. code-block:: text
+
+ GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
+ WITH_MKL: ON
+ WITH_GPU: ON
+ CUDA version: 8.0
+ CUDNN version: v5
diff --git a/doc/fluid/advanced_usage/deploy/convert_paddle_to_anakin.md b/doc/fluid/advanced_usage/deploy/convert_paddle_to_anakin.md
new file mode 100644
index 0000000000000000000000000000000000000000..56ca582b2b47f404ede777712830731ea7f4e9b5
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/convert_paddle_to_anakin.md
@@ -0,0 +1,73 @@
+# 模型转换指南
+
+Anakin 支持不同框架的模型预测。但由于格式的差别,Anakin 需要您预先转换模型。本文档介绍如何转换模型。
+
+## 简介
+
+Anakin 模型转换器输入支持 Caffe 和 Fluid 两种格式的预测模型,模型包含网络结构(model 或 prototxt)和权重参数(param 或 caffemodel)。
+
+模型转换的输出是一个 bin 文件,它作为 Anakin 框架的 graph 参数导入。
+
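+例如,转换得到的 bin 文件可以这样被 Anakin 加载(仅为示意,模板参数与路径请按实际情况替换,下面的 googlenet 文件名对应本文的配置示例):
+
+```c++
+auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
+auto status = graph->load("./output/googlenet.anakin.bin");
+```
+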
+您还可以使用模型转换器的 launch board 功能生成网络结构的 HTML 预览。
+
+
+## 系统要求
+
+- python 2.7+
+- pyyaml
+- flask
+- protobuf 3.5+
+
+
+## 用法
+
+### 1、环境
+转换器所需的依赖标注于 *系统要求* 一节。
+
+### 2、配置
+您需要对 *config.yaml* 文件进行修改以告知您的需求。工程中给出了 *config.yaml* 示例,下面作进一步说明。
+
+#### config.yaml
+```bash
+OPTIONS:
+ Framework: CAFFE # 依框架类型填写 CAFFE 或 FLUID
+ SavePath: ./output # 转换结束后模型的保存位置
+ ResultName: googlenet # 输出模型的名字
+ Config:
+ LaunchBoard: ON # 是否生成网络结构预览页面
+ Server:
+ ip: 0.0.0.0
+ port: 8888 # 从一个可用端口访问预览页面
+ OptimizedGraph: # 当您使用了 Anakin 框架的 Optimized 功能时,才应该打开此项
+ enable: OFF
+ path: /path/to/anakin_optimized_anakin_model/googlenet.anakin.bin.saved
+ LOGGER:
+ LogToPath: ./log/ # 生成日志的路径
+ WithColor: ON
+
+TARGET:
+ CAFFE:
+ # 当 Framework 为 CAFFE 时需填写
+ ProtoPaths:
+ - /path/to/caffe/src/caffe/proto/caffe.proto
+ PrototxtPath: /path/to/your/googlenet.prototxt
+ ModelPath: /path/to/your/googlenet.caffemodel
+
+ FLUID:
+ # 当 Framework 为 FLUID 时需填写
+ Debug: NULL
+ ProtoPaths:
+ - /
+ PrototxtPath: /path/to/fluid/inference_model
+ ModelPath: /path/to/fluid/inference_model
+ # ...
+```
+
+### 3、转换
+在完成配置文件的修改后,您只需执行 ```python converter.py``` 就可以进行模型转换了。
+
+
+### 4、预览
+最后一步,就是在浏览器中查看令人振奋的转换结果!网址是在 *config.yaml* 中配置的,例如 http://0.0.0.0:8888 。
+
+> 注意:若您使用了默认的 IP 地址 0.0.0.0,请在预览时使用真实的服务器地址 real_ip:port 替代它。
diff --git a/doc/fluid/advanced_usage/deploy/how_to_add_anakin_op.md b/doc/fluid/advanced_usage/deploy/how_to_add_anakin_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..f2783eb9f591a31443f2a692ce0eb1bcc9b1063a
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/how_to_add_anakin_op.md
@@ -0,0 +1,405 @@
+# 如何增加新的Operator
+
+## 基本概念
+
+简单介绍下几个同Operator相关的基本概念,详情请参考设计文档。
+
+```framework```: 上层的逻辑代码,负责从parser中获取参数及weights,添加op时主要修改framework/operator目录下的内容。
+
+```saber```: 底层的实现代码,Anakin通过saber封装了不同的backends,不同的实现(impl)分别特化出自己的实现,外层framework通过不同的template进入各自的impl完成调用。各个op的parameter放在saber/saber_funcs_param.h文件中,增加op主要修改saber/funcs下的内容。
+
+saber的文件结构:
+* saber/funcs下的是各个funcs的外部接口,这一层的op与具体的设备实现无关,只与各op完成的功能有关。由于跟实现(impl)无关,本层文件名均不带impl。
+* saber/funcs/impl下是各个op的impl声明,特定设备需要完成该层声明的特化版本,如saber/funcs/impl/x86实现了上一层impl声明的x86特化版本,saber/funcs/impl/cuda实现了上一层impl声明的NV特化版本。当增加新的backends时需要特化出新的实现。本层代码同实现相关,均带有```impl_```前缀。
+* saber/funcs/impl/cuda/base/cuda_c内有cuda```.cu```扩展名的文件,添加cuda的kernel需要在该文件目录下添加。
+* saber/funcs/impl/cuda/base/sass 内有不同架构的汇编代码编译的静态库。
+
+### 涉及到的基类及各个类之前的关系
+
+简单介绍相关的基类
+
+* ```anakin::Operator```: framework的operator基类,位于framework/core/operator/operator.h
+
+* ```anakin::saber::BaseFunc```: saber对外的op接口基类,提供统一的对外接口,位于saber/funcs/base.h。BaseFunc的```compute_output_shape```接口只根据input的shape和param的参数计算输出的shape,并通过```tensor```的```set_shape```接口(只设置shape,不分配空间)设置到output中。```operator()```接口为各个op的计算接口。
+
+* ```ankain::saber::ImplBase```: saber设备实现的op的接口,所有设备相关实现的基类。位于saber/funcs/impl/impl_base.h。实现版本中这里分为两类,一类以```vender_```为前缀,带有```vender_```代码意为使用第三方库来实现该op,如cudnn的conv,或mkl的conv等等,这类op的性能我们难以调优,因此单独列为一类。另一类是带有源码的saber实现,这些实现都带有```saber_```为前缀,此类实现带有源码,能够通过后续优化不断提升性能,实现起名时需要注意这一点。
+
+## 添加operator
+
+添加一个新的op需要以下几步:
+
+1. 添加saber的param
+2. 定义saber的Operator类
+3. 定义新的impl声明
+3. 完成新的impl实现
+4. 增加framework的实现或特化
+
+接下来就针对这几步,以一个简单例子为例介绍实现。
+
+例如我们要添加新的Mul op。给出计算公式如下:$$Out = \alpha \cdot X * Y$$
+
+### 为operator增加param
+
+涉及到的文件:```saber/saber_funcs_param.h```。如果之前已经存在需要添加的op的param,这一步可以跳过。
+这里```XXXParam```是一个```struct```。包含一个无参数的构造函数,含参数的构造函数,复制构造函数,```operator=()```及```operator==()```。
+```
+template <typename opTensor> // 能够获得target, datatype, layout
+struct MulParam{
+ MulParam()
+ : alpha(0)
+ {}
+ MulParam(float alpha_in)
+ : alpha(alpha_in)
+ {}
+ MulParam(const MulParam& right)
+ : alpha(right.alpha)
+ {}
+ MulParam &operator=(const MulParam &right) {
+ alpha = right.alpha;
+ }
+ bool operator==(const MulParam &right) {
+ return alpha == right.alpha;
+ }
+ float alpha;
+};
+```
+
+### 定义Operator类
+涉及到的文件:```saber/funcs/mul.h```。如果之前定义过该op的类,这里需要修改输入的impl定义头文件。
+下面给出一个相对完整的定义结构供参考。
+```
+//不同的设备需要包含对应的operator实现.[详见](#impl)
+#ifdef NVIDIA_GPU
+#include "saber/funcs/impl/cuda/saber_mul.h"
+#include "saber/funcs/impl/cuda/vender_mul.h"
+#endif
+//如果一个设备现在还没有对应的operator实现,需要包含声明。[详见](#declare)
+#ifdef USE_X86_PLACE
+#include "saber/funcs/impl/impl_mul.h"
+#endif
+namespace anakin {
+namespace saber {
+template
+class Mul : public BaseFunc<
+ Tensor,
+ Tensor,
+ Tensor,
+ ImplBase, MulParam> {
+public:
+ using BaseFunc<
+ Tensor,
+ Tensor,
+ Tensor,
+ ImplBase, MulParam>::BaseFunc;
+ Mul() = default;
+ typedef Tensor InDataTensor;
+ typedef Tensor OutDataTensor;
+ typedef Tensor OpTensor;
+ typedef MulParam Param_t;
+ typedef std::vector Input_v;
+ typedef std::vector Output_v;
+ typedef std::vector Shape_v;
+
+ virtual SaberStatus compute_output_shape(const Input_v &input,
+ Output_v &output, Param_t ¶m) override {
+ //计算输出的shape,
+ Shape output_shape = (input[0]->valid_shape());
+ /* code */
+ return output[0]->set_shape(output_shape);
+ }
+ virtual SaberStatus init_impl(ImplEnum implenum) override {
+ // 不同设备均使用此init_impl, 此接口创建对应impl的实现。
+ switch (implenum) {
+ case VENDER_IMPL:
+ this->_impl.push_back(new VenderMul );
+ return SaberSuccess;
+ case SABER_IMPL:
+ this->_impl.push_back(new SaberMul );
+ return SaberSuccess;
+ default:
+ return SaberUnImplError;
+ }
+ }
+private:
+ virtual void pick_best_static() override {
+ if (true) // some condition?
+ this->_best_impl = this->_impl[0];
+ }
+ virtual void pick_best_specify(ImplEnum implenum) override {
+ this->_best_impl = this->_impl[0];
+ }
+};
+} // namespace saber
+} // namespace anakin
+```
+
+### 为operator增加新的impl声明
+
+涉及的文件:```saber/funcs/impl/impl_mul.h```。不同的设备都特化同一个声明,特化版本放在对应的文件夹下,这里的声明就是给出所有设备的统一声明。下面给出一个参考。
+```
+#include "saber/funcs/impl/impl_macro.h"
+namespace anakin{
+namespace saber{
+DEFINE_OP_CLASS(Mul, MulParam); // 第一个参数是op的名字,第二个是对应param的名字
+}
+}
+```
+
+### 完成新的operator特定后端实现
+
+涉及的文件:```saber/funcs/impl/xxx/vender_mul.h```或```saber/funcs/impl/xxx/saber_mul.h```
+这里```xxx```指代特定的一种设备。```vender```是指的使用第三方库实现的op,```saber```指的源码实现的op。这里以cuda的vender实现为例,简单介绍一下特化出的函数的几个基本接口。
+
+```
+// include 对应的声明
+#include "saber/funcs/impl/impl_mul.h"
+
+namespace anakin{
+namespace saber{
+template
+class VenderMul :
+ public ImplBase<
+ Tensor,
+ Tensor,
+ Tensor,
+ MulParam > >
+{
+public:
+ typedef Tensor DataTensor_in;
+ typedef Tensor DataTensor_out;
+ typedef Tensor OpTensor;
+ typedef typename DataTensor_in::Dtype InDataType;
+ typedef typename DataTensor_out::Dtype OutDataType;
+ typedef typename OpTensor::Dtype OpDataType;
+ VenderMul(){}
+ ~VenderMul() {}
+
+ virtual SaberStatus init(const std::vector& inputs,
+ std::vector& outputs,
+ MulParam& param, Context& ctx) {
+ this->_ctx = ctx;
+ create(inputs, outputs, param, ctx);
+ }
+
+ virtual SaberStatus create(const std::vector& inputs,
+ std::vector& outputs,
+ MulParam& param, Context& ctx) {
+ // set内部参数
+ }
+
+ virtual SaberStatus dispatch(const std::vector& inputs,
+ std::vector& outputs,
+ MulParam& param) {
+ // dispatch kernel.
+ }
+
+private:
+};
+}
+}
+```
+```init```和```create```的区别:```init```接口是第一次初始化op的时候进入的接口,此函数只在第一次初始化op时调用,这个接口一般放一些只需要执行一次的代码,如malloc或者create之类的函数。```create```函数除了第一次init执行外,在输入发生变化或者param发生变化时会再次触发,create一般放置set函数,设置内部变量,当input发生变化时这里执行一些同input或weights直接相关的代码。但create因为触发位置在网络内,如果```create```函数执行了一些严重耗时的操作,这里会拖慢整个op的执行时间,需要慎重选择操作放置的位置。
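+
+三个接口的典型触发时序如下(仅为示意伪代码,实际由框架在运行期调用):
+
+```
+VenderMul<NV, ...> impl;
+impl.init(inputs, outputs, param, ctx);    // 仅在第一次初始化op时调用:malloc、create等一次性工作
+impl.create(inputs, outputs, param, ctx);  // 输入shape或param变化时再次触发:set内部变量
+impl.dispatch(inputs, outputs, param);     // 每次前向计算时调用:发射kernel
+```
+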
+### 添加framework的特化
+
+涉及的文件:```framework/operators/mul.h```和```framework/operators/mul.cpp```。
+这里简单介绍下如果添加或修改framework内的operator
+
+```
+#include "framework/core/base.h"
+#include "framework/core/data_types.h"
+#include "framework/core/operator/operator.h"
+#include "utils/logger/logger.h"
+#include "saber/funcs/mul.h" // 需要包对应的saber头文件
+namespace anakin {
+namespace ops {
+template
+class MulHelper;
+
+template
+class Mul : public Operator {
+public:
+ Mul() {}
+ /// forward impl
+ virtual void operator() (OpContext &ctx,
+ const std::vector >& ins,
+ std::vector >& outs) {
+ LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">";
+ }
+ friend class MulHelper;
+};
+template
+class MulHelper : public OperatorHelper {
+public:
+ MulHelper() = default;
+ ~MulHelper();
+ Status InitParam() override;
+
+ Status Init(OpContext &ctx,
+ const std::vector >& ins,
+ std::vector >& outs) override;
+ Status InferShape(const std::vector >& ins,
+ std::vector >& outs) override;
+
+public:
+ saber::MulParam> _param_mul;
+ saber::Mul _funcs_mul;
+};
+}
+} /* namespace anakin */
+```
+对应的```.cpp```文件如下:
+```
+#include "framework/operators/mul.h"
+
+namespace anakin {
+namespace ops {
+
+#ifdef USE_CUDA
+template<>
+void Mul::operator()(
+ OpContext& ctx,
+ const std::vector >& ins,
+ std::vector >& outs) {
+ auto* impl =
+ static_cast*>(this->_helper);
+ auto& param =
+ static_cast*>(this->_helper)->_param_mul;
+ impl->_funcs_mul(ins, outs, param, ctx);
+}
+#endif
+
+template
+Status MulHelper::InitParam() {
+ auto alpha = GET_PARAMETER(float, alpha);
+ MulParam> param_mul(alpha);
+ _param_mul = param_mul;
+ return Status::OK();
+}
+
+template
+Status MulHelper::Init(OpContext& ctx,
+ const std::vector >& ins,
+ std::vector >& outs) {
+
+ SABER_CHECK(_funcs_mul.init(ins, outs, _param_mul, SPECIFY, VENDER_IMPL, ctx));
+ return Status::OK();
+}
+
+template
+Status MulHelper::InferShape(const
+ std::vector >& ins,
+ std::vector >& outs) {
+ SABER_CHECK(_funcs_mul.compute_output_shape(ins, outs, _param_mul));
+ return Status::OK();
+}
+
+#ifdef USE_CUDA
+template class MulHelper;
+#endif
+#ifdef USE_ARM_PLACE
+template class MulHelper;
+#endif
+// register helper
+#ifdef USE_CUDA
+ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, NV, AK_FLOAT, Precision::FP32);
+#endif
+#ifdef USE_ARM_PLACE
+ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, ARM, AK_FLOAT, Precision::FP32);
+#endif
+//! register op
+ANAKIN_REGISTER_OP(Mul)
+.Doc("Mul operator")
+#ifdef USE_CUDA
+.__alias__("mul")
+#endif
+#ifdef USE_ARM_PLACE
+.__alias__("mul")
+#endif
+.num_in(1)
+.num_out(1)
+.Args("alpha", " alpha of Mul "); //注册
+
+} /* namespace ops */
+
+} /* namespace anakin */
+```
+
+## 实现单元测试
+涉及的文件:```test/saber/xxx/test_saber_funcs_mul_xxx.cpp```
+在对应的test下需要添加新的单元测试
+
+```
+TEST(TestSaberFuncNV, test_saber_mul) {
+
+ // init tensors and some param.
+
+ // start Reshape & doInfer
+ Context ctx1(0, 1, 1);
+
+ // create param
+ MulParam > param(alpha);
+
+ std::vector*> input;
+ std::vector*> output;
+
+ // create saber op
+ Mul mul;
+
+ // compute output shape
+ mul.compute_output_shape(input, output, param);
+
+ // re_alloc output tensors memory based on output shape
+ output[0]->re_alloc(output[0]->shape());
+
+ // init saber op(calling init and create)
+ mul.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+
+ // call operator()
+ mul(input, output, param, ctx1);
+
+ // cuda specified, record events
+ cudaStream_t cuda_stream = ctx1.get_compute_stream();
+ output[0]->record_event(cuda_stream);
+ output_dev.sync();
+
+ // param changed
+ param.alpha = 2.0;
+ // auto calling saber op(create and dispatch)
+ mul(input, output, param, ctx1);
+
+ cudaDeviceSynchronize();
+ CUDA_CHECK(cudaPeekAtLastError());
+}
+
+int main(int argc, const char** argv){
+ anakin::saber::Env::env_init();
+
+ // initial logger
+ //logger::init(argv[0]);
+ InitTest();
+ RUN_ALL_TESTS(argv[0]);
+ return 0;
+}
+
+```
+## 调试及注意事项
+
+一个op需要有对外的op接口和内部实现,由于存在saber/funcs/impl的非特化版本声明,当有op在某种设备下没有对应实现时,也能够编译,但此时是没有任何实现的空实现。
diff --git a/doc/fluid/advanced_usage/deploy/how_to_support_new_device_in_anakin.md b/doc/fluid/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
new file mode 100644
index 0000000000000000000000000000000000000000..a1f75f5e95cfb90f26d3782ba30a6d1887a70424
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
@@ -0,0 +1,459 @@
+# 如何支持一个新的设备
+
+## 概览
+
+添加一个新的设备需要以下3个步骤:
+
+* [在`CMakeList`中添加设备的支持](#0001)
+* [在`saber`中添加设备的实现](#0002)
+* [在`framework`中添加设备的具体化或实例化](#0003)
+
+假设新设备的名称为`TNEW`, 以下将以这个设备名称进行演示。
+
+## 在`CMakeList`中添加设备的支持 ##
+
+* 修改根目录`CMakeList.txt`
+```cmake
+#select the plantform to build
+anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." NO)
+anakin_option(USE_X86_PLACE "Select the build mode for X86 place." NO)
+anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO)
+anakin_option(USE_TNEW_PLACE "Select the build mode for TNEW place." YES)
+```
+
+* 修改`saber/CMakeList.txt`
+
+根据新增设备的目录完善`saber`目录下的`CMakeList.txt`。
+```cmake
+if(USE_TNEW_PLACE)
+ anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
+ anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
+endif()
+```
+
+* 修改`test/CMakeList.txt`
+
+新增设备的单测文件放在`test/saber/tnew`目录下,修改`test`目录下的`CMakeList.txt`。
+```cmake
+if(USE_TNEW_PLACE)
+ anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/tnew "cpp" ANAKIN_TEST_CASE_SRC)
+endif()
+```
+
+* 修改`cmake/anakin_config.h.in`
+```c++
+// plantform to use
+#cmakedefine USE_GPU_PLACE
+
+#cmakedefine USE_X86_PLACE
+
+#cmakedefine USE_ARM_PLACE
+
+#cmakedefine USE_TNEW_PLACE
+```
+
+* 其他依赖和编译选项
+修改`cmake`目录下的`compiler_options.cmake`和`find_modules.cmake`
+
+
+## 在`saber`中添加设备的实现 ##
+`saber`是`Anakin`的基础计算库,对外提供设备无关的统一的API,设备相关的实现都会封装到`TargetWrapper`中。
+
+### 在`saber/saber_types.h`中添加设备
+
+```c++
+enum TargetTypeEnum {
+ eINVALID = -1,
+ eNV = 1,
+ eAMD = 2,
+ eARM = 3,
+ eX86 = 4,
+ eNVHX86 = 5,
+ eTNEW = 6
+};
+
+typedef TargetType<eNV> NV;
+typedef TargetType<eARM> ARM;
+typedef TargetType<eAMD> AMD;
+typedef TargetType<eX86> X86;
+typedef TargetType<eTNEW> TNEW;
+
+```
+
+### 在`saber/core`中添加设备的实现
+
+1. 在`target_traits.h`中添加新设备
+
+* 增加设备类型
+```c++
+struct __cuda_device{};
+struct __arm_device{};
+struct __amd_device{};
+struct __x86_device{};
+struct __tnew_device{};
+```
+
+* `TargetTypeTraits`模板具体化
+```c++
+template <>
+struct TargetTypeTraits<TNEW> {
+ typedef __xxx_target target_category;//根据实际设备是host端还是device端进行选择
+ typedef __tnew_device target_type;
+};
+```
+
+2. 在`data_traits.h`中特化`DataTrait`模板类
+
+如果设备需要特殊的数据类型,则特化出设备的`DataTrait`类的实现,例如opencl数据类型的实现如下:
+```c++
+#ifdef USE_OPENCL
+struct ClMem{
+ ClMem(){
+ dmem = nullptr;
+ offset = 0;
+ }
+
+ ClMem(cl_mem* mem_in, int offset_in = 0) {
+ dmem = mem_in;
+ offset = offset_in;
+ }
+
+ ClMem(ClMem& right) {
+ dmem = right.dmem;
+ offset = right.offset;
+ }
+
+ ClMem& operator=(ClMem& right) {
+ this->dmem = right.dmem;
+ this->offset = right.offset;
+ return *this;
+ }
+
+ ClMem& operator+(int offset_in) {
+ this->offset += offset_in;
+ return *this;
+ }
+
+ int offset{0};
+ cl_mem* dmem;
+};
+
+template <>
+struct DataTrait<AMD, AK_FLOAT> {
+ typedef ClMem Dtype;
+ typedef float dtype;
+};
+
+template <>
+struct DataTrait<AMD, AK_DOUBLE> {
+ typedef ClMem Dtype;
+ typedef double dtype;
+};
+
+template <>
+struct DataTrait<AMD, AK_INT8> {
+ typedef ClMem Dtype;
+ typedef char dtype;
+};
+#endif //use_opencl
+```
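+
+若新设备`TNEW`也需要特殊的数据类型,可以按同样的方式特化出`DataTrait`。下面是一个极简示意(`TnewMem`为假设的设备侧句柄类型,模板参数请以`data_traits.h`中的非特化声明为准):
+```c++
+#ifdef USE_TNEW_PLACE
+// 假设 TNEW 的设备内存需要用"句柄 + 偏移"来描述
+struct TnewMem {
+    void* dmem{nullptr}; // 设备侧内存句柄(假设)
+    int offset{0};
+};
+
+template <>
+struct DataTrait<TNEW, AK_FLOAT> {
+    typedef TnewMem Dtype;
+    typedef float dtype;
+};
+#endif // USE_TNEW_PLACE
+```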
+
+3. 在`target_wrapper.h`中特化`TargetWrapper`模板类
+
+特化`TargetWrapper`模板类,在`target_wrapper.h`中声明函数,具体如下:
+```c++
+template <>
+struct TargetWrapper<TNEW, __xxx_target> { //根据TNEW是host端还是device端,把__xxx_target替换为__host_target或者__device_target
+
+ typedef xxx_event event_t; //根据设备实现xxx_event
+ typedef xxx_stream stream_t; //根据设备实现xxx_stream
+
+ static void get_device_count(int& count);
+
+ static void set_device(int id);
+
+ //We should add strategy to avoid malloc directly
+ static void mem_alloc(void** ptr, size_t n);
+
+ static void mem_free(void* ptr);
+
+ static void mem_set(void* ptr, int value, size_t n);
+
+ static void create_event(event_t& event, bool flag = false);
+
+ static void create_stream(stream_t& stream);
+
+ static void create_stream_with_flag(stream_t& stream, unsigned int flag);
+
+ static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority);
+
+ static void destroy_stream(stream_t& stream);
+
+ static void destroy_event(event_t& event);
+
+ static void record_event(event_t& event, stream_t stream);
+
+ static void query_event(event_t& event);
+
+ static void sync_event(event_t& event);
+
+ static void sync_stream(event_t& event, stream_t& stream);
+
+ static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, __DtoD);
+
+ static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, stream_t& stream, __DtoD);
+
+ static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, __HtoD);
+
+ static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, stream_t& stream, __HtoD);
+
+ static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, __DtoH);
+
+ static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, stream_t& stream, __DtoH);
+
+ static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+ int src_dev, size_t count);
+
+ static void async_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+ int src_dev, size_t count, stream_t& stream);
+
+ static int get_device_id();
+};
+
+```
+
+4. 在`impl/`目录下添加设备目录和实现
+
+在`saber/core/impl`目录下添加设备目录`tnew`。
+* 实现`TargetWrapper`结构体中各函数的定义。
+如果`TargetWrapper`的实现与默认的模板类一致,则不用特化出该类。
+
+```c++
+typedef TargetWrapper<TNEW> TNEW_API;
+void TNEW_API::get_device_count(int &count) {
+ // add implementation
+}
+
+void TNEW_API::set_device(int id){
+ // add implementation
+}
+
+void TNEW_API::mem_alloc(void** ptr, size_t n){
+ // add implementation
+}
+
+void TNEW_API::mem_free(void* ptr){
+ if(ptr != nullptr){
+ // add implementation
+ }
+}
+...
+
+```
+
+* 特化实现`device.h`中的`Device`
+
+```c++
+template <>
+void Device<TNEW>::create_stream() {
+ // add implementation
+}
+
+template <>
+void Device<TNEW>::get_info() {
+
+ // add implementation
+}
+
+```
+
+### 在`saber/funcs`中实现设备相关的op
+
+参考[如何增加新的Operator](addCustomOp.md)
+
+
+## 在`framework`中添加设备的具体化或实例化 ##
+
+### `framework/core`
+
+* `net.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class Net<TNEW, AK_FLOAT, Precision::FP32, OpRunType::ASYNC>;
+template class Net<TNEW, AK_FLOAT, Precision::FP32, OpRunType::SYNC>;
+#endif
+```
+
+* `operator_func.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class OperatorFunc<TNEW, AK_FLOAT, Precision::FP32>;
+#endif
+```
+
+* `worker.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class Worker<TNEW, AK_FLOAT, Precision::FP32, OpRunType::ASYNC>;
+template class Worker<TNEW, AK_FLOAT, Precision::FP32, OpRunType::SYNC>;
+#endif
+```
+
+* `operator_attr.cpp`中添加实例化
+
+```c++
+template
+OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::FP32>(const std::string& op_name);
+template
+OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::FP16>(const std::string& op_name);
+template
+OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::INT8>(const std::string& op_name);
+```
+
+* `parameter.h`中添加设备的实现
+
+```c++
+#ifdef USE_TNEW_PLACE
+template <typename Ttype, DataType Dtype>
+class PBlock {
+public:
+ typedef Tensor4d<typename target_host<Ttype>::type, Dtype> type;
+
+ PBlock() {
+ _inner_tensor = std::make_shared<type>();
+ }
+ ...
+}
+#endif //TNEW
+```
+
+* `type_traits_extend.h`中添加设备的实现
+
+```c++
+template<>
+struct target_host<saber::TNEW> {
+ typedef saber::X86 type; //根据TNEW选择正确的host type
+};
+```
+
+### `framework/graph`
+
+* `graph.cpp`中添加实例化
+
+```c++
+ #ifdef USE_TNEW_PLACE
+ template class Graph<TNEW, AK_FLOAT, Precision::FP32>;
+ template class Graph<TNEW, AK_FLOAT, Precision::FP16>;
+ template class Graph<TNEW, AK_FLOAT, Precision::INT8>;
+ #endif
+```
+
+### `framework/model_parser`
+
+* `parser.cpp`中添加实例化
+
+```c++
+ #ifdef USE_TNEW_PLACE
+ template
+ Status load<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+ const char* model_path);
+ template
+ Status load<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+ const char* model_path);
+ template
+ Status load<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+ const char* model_path);
+
+ template
+ Status save<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+ std::string& model_path);
+ template
+ Status save<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+ std::string& model_path);
+ template
+ Status save<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+ std::string& model_path);
+
+ template
+ Status load<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+ std::string& model_path);
+ template
+ Status load<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+ std::string& model_path);
+ template
+ Status load<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+ std::string& model_path);
+
+ template
+ Status save<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
+ const char* model_path);
+ template
+ Status save<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
+ const char* model_path);
+ template
+ Status save<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
+ const char* model_path);
+ #endif
+```
+
+* `model_io.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class NodeIO<TNEW, AK_FLOAT, Precision::FP32>;
+template class NodeIO<TNEW, AK_FLOAT, Precision::FP16>;
+template class NodeIO<TNEW, AK_FLOAT, Precision::INT8>;
+#endif
+```
+
+### `framework/operators`
+
+为`framework/operators`目录下所有op添加实例化或具体化
+以`activation.cpp`为例,实例化如下:
+
+```c++
+#ifdef USE_TNEW_PLACE
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8);
+template class ActivationHelper<TNEW, AK_FLOAT, Precision::FP32>;
+ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32);
+#endif
+```
+
+如果TNEW设备函数的实现与现有模板实现不一致,可以特化实现如下(以init()为例):
+```c++
+#ifdef USE_TNEW_PLACE
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16);
+INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8);
+template <>
+Status ActivationHelper<TNEW, AK_FLOAT, Precision::FP32>::Init(OpContext<TNEW>& ctx,\
+ const std::vector<Tensor4dPtr<TNEW, AK_FLOAT> >& ins, \
+ std::vector<Tensor4dPtr<TNEW, AK_FLOAT> >& outs) {
+ SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, SPECIFY, SABER_IMPL, ctx)); //在这里选择实现方式
+ return Status::OK();
+}
+ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32);
+#endif
+```
+
+在`ANAKIN_REGISTER_OP(Activation)`中添加TNEW的注册
+
+```c++
+#ifdef USE_TNEW_PLACE
+.__alias__("activation")
+#endif
+```
+
+## 注意事项
+不要修改`Tensor`/`Buffer`/`Env`/`Context`这些类函数的接口和实现
diff --git a/doc/fluid/advanced_usage/deploy/index_anakin.rst b/doc/fluid/advanced_usage/deploy/index_anakin.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e65dd941ea6e39e00014f2610b579f095e3ba774
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/index_anakin.rst
@@ -0,0 +1,26 @@
+服务器端部署 - Anakin
+#######################
+
+
+使用文档
+~~~~~~~
+
+.. toctree::
+ :maxdepth: 1
+
+ install_anakin.md
+ convert_paddle_to_anakin.md
+ run_anakin_on_arm.md
+ anakin_tutorial.md
+ anakin_example.md
+ anakin_gpu_benchmark.md
+ anakin_arm_benchmark.md
+
+开发文档
+~~~~~~~
+
+.. toctree::
+ :maxdepth: 1
+
+ how_to_add_anakin_op.md
+ how_to_support_new_device_in_anakin.md
diff --git a/doc/fluid/advanced_usage/deploy/index_mobile.rst b/doc/fluid/advanced_usage/deploy/index_mobile.rst
new file mode 100644
index 0000000000000000000000000000000000000000..47df6392c123d520c701089db6ee1ae72e4f8ea5
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/index_mobile.rst
@@ -0,0 +1,9 @@
+移动端部署
+##########
+
+.. toctree::
+ :maxdepth: 2
+
+ mobile_build.md
+ mobile_dev.md
+
diff --git a/doc/fluid/advanced_usage/deploy/install_anakin.md b/doc/fluid/advanced_usage/deploy/install_anakin.md
new file mode 100644
index 0000000000000000000000000000000000000000..bb7c1950308622e3de292268a718e6ec688e6ae6
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/install_anakin.md
@@ -0,0 +1,69 @@
+## 从源码编译安装Anakin ##
+
+我们已经在CentOS 7.3上成功地安装和测试了Anakin,对于其他操作系统,我们将很快支持。
+
+### 安装概览 ###
+
+* [在CentOS上安装 Anakin]()
+* [在Ubuntu上安装 Anakin]()
+* [在ARM上安装 Anakin](run_on_arm_ch.md)
+* [验证安装]()
+
+
+### 在CentOS上安装 Anakin ###
+#### 1. 系统要求 ####
+
+* make 3.82+
+* cmake 2.8.12+
+* gcc 4.8.2+
+* g++ 4.8.2+
+* 其他需要补充的。。。
+
+#### 2. 编译CPU版Anakin ####
+
+暂时不支持
+
+#### 3. 编译支持NVIDIA GPU的Anakin ####
+
+- 3.1. 安装依赖
+ - 3.1.1 protobuf
+ >$ git clone https://github.com/google/protobuf
+ >$ cd protobuf
+ >$ git submodule update --init --recursive
+ >$ ./autogen.sh
+ >$ ./configure --prefix=/path/to/your/install_dir
+ >$ make
+ >$ make check
+ >$ make install
+ >$ sudo ldconfig
+
+
+ 如安装protobuf遇到任何问题,请访问[这里](https://github.com/google/protobuf/blob/master/src/README.md)
+
+- 3.2 CUDA Toolkit
+ - [CUDA 8.0](https://developer.nvidia.com/cuda-zone) or higher. 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
+ - [cuDNN v7](https://developer.nvidia.com/cudnn). 具体信息参见[NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
+- 3.3 编译Anakin
+ >$ git clone https:/xxxxx
+ >$ cd anakin
+ >$ mkdir build
+ >$ cd build
+ >$ cmake ..
+ >$ make
+
+
+#### 4. 编译支持AMD GPU的Anakin ####
+
+暂时还不支持
+
+
+### 在Ubuntu上安装 Anakin ###
+
+暂时还不支持
+
+
+### 在ARM上安装 Anakin ###
+
+暂时还不支持
+
+### 验证安装 ###
+we are coming soon...
diff --git a/doc/fluid/advanced_usage/deploy/mobile_build.md b/doc/fluid/advanced_usage/deploy/mobile_build.md
new file mode 100644
index 0000000000000000000000000000000000000000..e51593164987d548e256ddebbc5fa8d960fb5255
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/mobile_build.md
@@ -0,0 +1,59 @@
+# 环境搭建
+## 使用 docker
+### 1. 安装 docker
+安装 docker 的方式,参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/)
+### 2. 使用 docker 搭建构建环境
+首先进入 paddle-mobile 的目录下,执行 `docker build`
+以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行)
+```
+$ docker build -t paddle-mobile:dev - < Dockerfile
+```
+使用 `docker images` 可以看到我们新建的 image
+```
+$ docker images
+REPOSITORY TAG IMAGE ID CREATED SIZE
+paddle-mobile dev 33b146787711 45 hours ago 372MB
+```
+### 3. 使用 docker 构建
+进入 paddle-mobile 目录,执行 docker run
+```
+$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
+root@5affd29d4fc5:/ # cd /paddle-mobile
+# 生成构建 android 产出的 Makefile
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
+# 生成构建 linux 产出的 Makefile
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
+```
+### 4. 设置编译选项
+可以通过 ccmake 设置编译选项
+```
+root@5affd29d4fc5:/ # ccmake .
+ Page 1 of 1
+ CMAKE_ASM_FLAGS
+ CMAKE_ASM_FLAGS_DEBUG
+ CMAKE_ASM_FLAGS_RELEASE
+ CMAKE_BUILD_TYPE
+ CMAKE_INSTALL_PREFIX /usr/local
+ CMAKE_TOOLCHAIN_FILE /paddle-mobile/tools/toolchains/arm-android-neon.cmake
+ CPU ON
+ DEBUGING ON
+ FPGA OFF
+ LOG_PROFILE ON
+ MALI_GPU OFF
+ NET googlenet
+ USE_EXCEPTION ON
+ USE_OPENMP OFF
+```
+修改选项后,按 `c`, `g` 更新 Makefile
+### 5. 构建
+使用 make 命令进行构建
+```
+root@5affd29d4fc5:/ # make
+```
+### 6. 查看构建产出
+构建产出可以在 host 机器上查看,位于 paddle-mobile 目录的 build 以及 test/build 下,可以使用 adb 指令或者 scp 传输到 device 上执行
+
+## 不使用 docker
+不使用 docker 的方法,可以直接用 cmake 生成 makefile 后构建。使用 ndk 构建 android 应用需要正确设置 NDK_ROOT。构建 linux 应用需要安装 arm-linux-gnueabi-gcc 或者类似的交叉编译工具,可能需要设置 CC,CXX 环境变量,或者在 tools/toolchains/ 中修改 arm-linux-gnueabi.cmake,或者增加自己需要的 toolchain file。
diff --git a/doc/fluid/advanced_usage/deploy/mobile_dev.md b/doc/fluid/advanced_usage/deploy/mobile_dev.md
new file mode 100644
index 0000000000000000000000000000000000000000..474380f9dbfd2fb8a06630cb1ca3ca5cd14ca9d9
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/mobile_dev.md
@@ -0,0 +1,72 @@
+# iOS开发文档
+
+## 编译
+
+### 一. 使用 build.sh 编译
+
+```sh
+sh build.sh ios
+
+# 如果只想编译某个特定模型的 op, 则需执行以下命令
+sh build.sh ios googlenet
+
+# 在这个文件夹下, 你可以拿到生成的 .a 库
+cd ../build/release/ios/build
+
+```
+
+### 二. 使用 xcode 编译
+
+我们提供了 iOS 开发者更为熟悉的 Xcode 编译环境:
+在 ios/ 目录下打开 PaddleMobile.xcworkspace 即可编译 PaddleMobile 或者运行 Demo
+
+### 三. 集成
+
+#### 如使用 c++ 接口
+将
+
+```
+libpaddle-mobile.a
+io.h
+program.h
+types.h
+lod_tensor.h
+tensor.h
+```
+拖入工程, io.h 为接口文件, 可在 [github](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/io.h)上查看接口注释
+
+#### 如使用 oc 接口
+将在xcode 编译生成的
+```
+libPaddleMobile.a
+PaddleMobile.h
+```
+拖入工程, 接口如下:
+
+```
+/*
+ 创建单例对象
+*/
++ (instancetype)sharedInstance;
+
+/*
+ load 模型, 开辟内存
+*/
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+
+/*
+ 进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
+*/
+- (NSArray *)predict:(CGImageRef)image means:(NSArray *)means scale:(float)scale;
+
+/*
+ 进行预测
+*/
+- (NSArray *)predict:(CGImageRef)image;
+
+/*
+ 清理内存
+*/
+- (void)clear;
+
+```
diff --git a/doc/fluid/advanced_usage/deploy/run_anakin_on_arm.md b/doc/fluid/advanced_usage/deploy/run_anakin_on_arm.md
new file mode 100644
index 0000000000000000000000000000000000000000..ebeb38f534ebfc8cb5a41d103abe3bb1de7e379a
--- /dev/null
+++ b/doc/fluid/advanced_usage/deploy/run_anakin_on_arm.md
@@ -0,0 +1,151 @@
+## 源码编译 Anakin ##
+
+目前Anakin支持ARM Android平台,采用Android NDK交叉编译工具链,已在macOS和CentOS上编译和测试通过。
+
+### 安装概览 ###
+
+* [系统需求](#0001)
+* [安装第三方依赖](#0002)
+* [Anakin源码编译](#0003)
+* [验证安装](#0004)
+
+
+### 1. 系统需求 ###
+
+* 宿主机: linux, mac
+* cmake 3.8.2+
+* Android NDK r14, Linux 版本[从这里下载](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip)
+
+### 2. 安装第三方依赖 ###
+
+- 2.1 protobuf3.4.0
+ 源码从这里[下载](https://github.com/google/protobuf/releases/tag/v3.4.0)
+ - 2.1.1 为宿主机编译protobuf
+ ```bash
+ $ tar -xzf protobuf-3.4.0.tar.gz
+ $ cd protobuf-3.4.0
+ $ ./autogen.sh
+ $ ./configure
+ $ make
+ $ make check
+ $ make install
+ ```
+ 上述 `make install` 执行后,可在 /usr/local/include/google 找到 libprotobuf 所需的头文件,将整个google文件夹拷贝至Anakin/third-party/arm-android/protobuf/下。
+ 如有问题,请点[这里](https://github.com/google/protobuf/blob/v3.4.0/src/README.md)。
+ 然后将已经生成的文件清除。
+ ```bash
+ $ make distclean
+ ```
+ - 2.1.2 交叉编译Android `armeabi-v7a` 的protobuf,注意设置ANDROID_NDK的路径,以及ARCH_ABI、HOSTOSN的值:
+ ```bash
+
+ $ export ANDROID_NDK=your_ndk_path
+ $ ARCH_ABI="arm-linux-androideabi-4.9"
+ $ HOSTOSN="darwin-x86_64"
+ $ export SYSROOT=$ANDROID_NDK/platforms/android-9/arch-arm
+ $ export PREBUILT=$ANDROID_NDK/toolchains/$ARCH_ABI
+ $ export LDFLAGS="--sysroot=$SYSROOT"
+ $ export LD="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/arm-linux-androideabi/bin/ld $LDFLAGS"
+ $ export LIBS="-llog $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/libgnustl_static.a"
+ $ export CPPFLAGS=""
+ $ export INCLUDES="-I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/include/ -I$ANDROID_NDK/platforms/android-9/arch-arm/usr/include/ -I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/include/"
+ $ export CXXFLAGS="-march=armv7-a -mfloat-abi=softfp -DGOOGLE_PROTOBUF_NO_RTTI --sysroot=$SYSROOT"
+ $ export CCFLAGS="$CXXFLAGS"
+ $ export CXX="$PREBUILT/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-g++ $CXXFLAGS"
+ $ export CC="$CXX"
+ $ export RANLIB="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-ranlib"
+ $ ./autogen.sh
+ $ ./configure --host=arm-linux-androideabi --with-sysroot=$SYSROOT --enable-cross-compile --with-protoc=protoc --disable-shared CXX="$CXX" CC="$CC" LD="$LD"
+ $ make
+ ```
+
+ 编译生成 *.a 静态库,若希望编译 *.so 动态链接库,请在./configure参数中将--disable-shared改为--disable-static --enable-shared。
+ 生成文件在src/.libs/下,将生成的文件拷贝至Anakin/third-party/arm-android/protobuf/lib下。
+ 在[cmake](../../cmake/find_modules.cmake)中更新`ARM_RPOTO_ROOT`的路径。
+ ```cmake
+ set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf")
+ ```
+
+- 2.2 opencv 2.4.3+(optional)
+ Anakin只在examples示例中使用opencv
+ Android系统的opencv从[这里下载](https://opencv.org/releases.html)
+ 解压后将 `3rdparty/libs/armeabi-v7a`中的库文件拷贝到`libs/armeabi-v7a`
+ 在[cmake](../../cmake/find_modules.cmake)中搜索`anakin_find_opencv`,
+ 并设置 `include_directories` 和 `LINK_DIRECTORIES`为自己安装的库的路径。
+ ```cmake
+ include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/)
+ LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/)
+ ```
+### 3. Anakin源码编译 ###
+
+#### 编译Android版本
+
+ 克隆[源码](https://github.com/PaddlePaddle/Anakin/tree/arm)
+```bash
+ cd your_dir
+ git clone https://github.com/PaddlePaddle/Anakin.git
+ cd Anakin
+ git fetch origin arm
+ git checkout arm
+ ```
+ 修改`android_build.sh`
+- 修改NDK路径
+ ```bash
+ #modify "your_ndk_path" to your NDK path
+ export ANDROID_NDK=your_ndk_path
+ ```
+- 修改ARM 处理器架构
+ 对于32位ARM处理器, 将ANDROID_ABI 设置为 `armeabi-v7a with NEON`,
+ 对于64位ARM处理器, 可以将ANDROID_ABI 设置为 `armeabi-v7a with NEON`或者`arm64-v8a`。
+ 目前我们只支持 `armeabi-v7a with NEON`;`arm64-v8a` 还在开发中。
+ ```bash
+ -DANDROID_ABI="armeabi-v7a with NEON"
+ ```
+- 设置Android API
+ 根据Android系统的版本设置API level, 例如API Level 21 -> Android 5.0.1
+ ```bash
+ -DANDROID_NATIVE_API_LEVEL=21
+ ```
+
+- 选择编译静态库或动态库
+ 设置`BUILD_SHARED=NO`编译静态库
+ 设置`BUILD_SHARED=YES`编译动态库
+ ```bash
+ -DBUILD_SHARED=NO
+ ```
+- OpenMP多线程支持
+ 设置`USE_OPENMP=YES`开启OpenMP多线程
+ ```bash
+ -DUSE_OPENMP=YES
+ ```
+
+- 编译单测文件
+ 设置`BUILD_WITH_UNIT_TEST=YES`将会编译单测文件
+ ```bash
+ -DBUILD_WITH_UNIT_TEST=YES
+ ```
+
+- 编译示例文件
+ 设置`BUILD_EXAMPLES=YES`将会编译示例文件
+ ```bash
+ -DBUILD_EXAMPLES=YES
+ ```
+
+- 开启opencv
+ 如果使用opencv,设置`USE_OPENCV=YES`
+ ```bash
+ -DUSE_OPENCV=YES
+ ```
+
+- 开始编译
+ 运行脚本 `android_build.sh` 将自动编译Anakin
+ ```bash
+ ./android_build.sh
+ ```
+
+### 4. 验证安装 ###
+ 编译好的库会放在目录`${Anakin_root}/output`下;
+ 编译好的单测文件会放在`${Anakin_root}/output/unit_test`目录下;
+ 编译好的示例文件会放在`${Anakin_root}/output/examples`目录下。
+
+ 对于Android系统,打开设备的调试模式,通过ADB可以访问的目录是`data/local/tmp`,通过ADB push将测试文件、模型和数据发送到设备目录, 运行测试文件。
diff --git a/doc/fluid/advanced_usage/development/contribute_to_paddle.md b/doc/fluid/advanced_usage/development/contribute_to_paddle.md
new file mode 100644
index 0000000000000000000000000000000000000000..3244eedf918b93f9351258f1218dfb2d507c1a9c
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/contribute_to_paddle.md
@@ -0,0 +1,243 @@
+# 如何贡献代码
+
+我们真诚地感谢您的贡献,欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
+
+## 代码要求
+- 代码注释请遵守 [Doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
+- 确保编译器选项 `WITH_STYLE_CHECK` 已打开,并且编译能通过代码样式检查。
+- 所有代码必须具有单元测试。
+- 通过所有单元测试。
+- 请遵守[提交代码的一些约定](#提交代码的一些约定)。
+
+以下教程将指导您提交代码。
+## [Fork](https://help.github.com/articles/fork-a-repo/)
+
+跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页,然后单击 `Fork` 按钮,生成自己目录下的仓库,比如 https://github.com/USERNAME/Paddle 。
+
+## 克隆(Clone)
+
+将远程仓库 clone 到本地:
+
+```bash
+➜ git clone https://github.com/USERNAME/Paddle
+➜ cd Paddle
+```
+
+
+## 创建本地分支
+
+Paddle 目前使用[Git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发,测试,发行和维护,具体请参考 [Paddle 分支规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范)。
+
+所有的 feature 和 bug fix 的开发工作都应该在一个新的分支上完成,一般从 `develop` 分支上创建新分支。
+
+使用 `git checkout -b` 创建并切换到新分支。
+
+```bash
+➜ git checkout -b my-cool-stuff
+```
+
+值得注意的是,在 checkout 之前,需要保持当前分支目录 clean,否则会把 untracked 的文件也带到新分支上,这可以通过 `git status` 查看。
+
+## 使用 `pre-commit` 钩子
+
+Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码(C++,Python),在提交(commit)前自动检查一些基本事宜(如每个文件只有一个 EOL,Git 中不要添加大文件等)。
+
+`pre-commit`测试是 Travis-CI 中单元测试的一部分,不满足钩子的 PR 不能被提交到 Paddle,首先安装并在当前目录运行它:
+
+```bash
+➜ pip install pre-commit
+➜ pre-commit install
+```
+
+Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式,请确保 `clang-format` 版本在 3.8 以上。
+
+注:通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同,Paddle 开发人员使用的是`pip install pre-commit`。
+
+## 开始开发
+
+在本例中,我删除了 README.md 中的一行,并创建了一个新文件。
+
+通过 `git status` 查看当前状态,这会提示当前目录的一些变化,同时也可以通过 `git diff` 查看文件具体被修改的内容。
+
+```bash
+➜ git status
+On branch test
+Changes not staged for commit:
+ (use "git add ..." to update what will be committed)
+ (use "git checkout -- ..." to discard changes in working directory)
+
+ modified: README.md
+
+Untracked files:
+ (use "git add ..." to include in what will be committed)
+
+ test
+
+no changes added to commit (use "git add" and/or "git commit -a")
+```
+
+## 构建和测试
+
+编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:latest-dev`来代替。
+
+如要build这个开发镜像,在源码目录树的根目录中运行:
+
+```bash
+➜ docker build -t paddle:latest-dev .
+```
+
+随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以:
+
+```bash
+➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev
+```
+
+这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`,并且输出一个 `./build/paddle.deb`文件之外,还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*(`paddle:prod`):
+
+```bash
+➜ docker build -t paddle:prod -f build/Dockerfile .
+```
+
+如果要运行所有的单元测试,可以用如下命令:
+
+```bash
+➜ docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
+```
+
+关于构建和测试的更多信息,请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。
+
+## 提交(commit)
+
+接下来我们取消对 README.md 文件的改变,然后提交新添加的 test 文件。
+
+```bash
+➜ git checkout -- README.md
+➜ git status
+On branch test
+Untracked files:
+ (use "git add ..." to include in what will be committed)
+
+ test
+
+nothing added to commit but untracked files present (use "git add" to track)
+➜ git add test
+```
+
+Git 每次提交代码,都需要写提交说明,这可以让其他人知道这次提交做了哪些改变,这可以通过`git commit` 完成。
+
+```bash
+➜ git commit
+CRLF end-lines remover...............................(no files to check)Skipped
+yapf.................................................(no files to check)Skipped
+Check for added large files..............................................Passed
+Check for merge conflicts................................................Passed
+Check for broken symlinks................................................Passed
+Detect Private Key...................................(no files to check)Skipped
+Fix End of Files.....................................(no files to check)Skipped
+clang-formater.......................................(no files to check)Skipped
+[my-cool-stuff c703c041] add test file
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 test
+```
+
+## 保持本地仓库最新
+
+在准备发起 Pull Request 之前,需要同步原仓库(https://github.com/PaddlePaddle/Paddle)最新的代码。
+
+首先通过 `git remote` 查看当前远程仓库的名字。
+
+```bash
+➜ git remote
+origin
+➜ git remote -v
+origin https://github.com/USERNAME/Paddle (fetch)
+origin https://github.com/USERNAME/Paddle (push)
+```
+
+这里 origin 是我们 clone 的远程仓库的名字,也就是自己用户名下的 Paddle,接下来我们创建一个原始 Paddle 仓库的远程主机,命名为 upstream。
+
+```bash
+➜ git remote add upstream https://github.com/PaddlePaddle/Paddle
+➜ git remote
+origin
+upstream
+```
+
+获取 upstream 的最新代码并更新当前分支。
+
+```bash
+➜ git fetch upstream
+➜ git pull upstream develop
+```
+
+## Push 到远程仓库
+
+将本地的修改推送到 GitHub 上,也就是 https://github.com/USERNAME/Paddle。
+
+```bash
+# 推送到远程仓库 origin 的 my-cool-stuff 分支上
+➜ git push origin my-cool-stuff
+```
+
+## 建立 Issue 并完成 Pull Request
+
+建立一个 Issue 描述问题,并记录它的编号。
+
+切换到所建分支,然后点击 `New pull request`。
+
+
+
+选择目标分支:
+
+
+
+在 PR 的描述说明中,填写 `resolve #Issue编号` 可以在这个 PR 被 merge 后,自动关闭对应的 Issue,具体请见 [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
+
+接下来等待 review,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。
+
+## 删除远程分支
+
+在 PR 被 merge 进主仓库后,我们可以在 PR 的页面删除远程仓库的分支。
+
+
+
+也可以使用 `git push origin :分支名` 删除远程分支,如:
+
+```bash
+➜ git push origin :my-cool-stuff
+```
+
+## 删除本地分支
+
+最后,删除本地分支。
+
+```bash
+# 切换到 develop 分支
+➜ git checkout develop
+
+# 删除 my-cool-stuff 分支
+➜ git branch -D my-cool-stuff
+```
+
+至此,我们就完成了一次代码贡献的过程。
+
+## 提交代码的一些约定
+
+为了使评审人在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定:
+
+1. 请保证Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,评审人一般不做评审。
+2. 提交Pull Request前:
+ - 请注意commit的数量:
+ - 原因:如果仅仅修改一个文件但提交了十几个commit,每个commit只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改,且不排除commit之间的修改存在相互覆盖的情况。
+ - 建议:每次提交时,保持尽量少的commit,可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit,可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。
+ - 请注意每个commit的名称:应能反映当前commit的内容,不能太随意。
+3. 如果解决了某个Issue的问题,请在该Pull Request的**第一个**评论框中加上:`fix #issue_number`,这样当该Pull Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
+
+此外,在回复评审人意见时,请您遵守以下约定:
+
+1. 评审人的每个意见都必须回复(这是开源社区的基本礼貌,别人帮了忙,应该说谢谢):
+ - 对评审意见同意且按其修改完的,给个简单的`Done`即可;
+ - 对评审意见不同意的,请给出您自己的反驳理由。
+2. 如果评审意见比较多:
+ - 请给出总体的修改情况。
+ - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复,而非直接回复的方式。原因是每个回复都会发送一封邮件,会造成邮件灾难。
diff --git a/doc/fluid/advanced_usage/development/cpu_profiling_cn.md b/doc/fluid/advanced_usage/development/cpu_profiling_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..198a05a79e19227e90eaafe116217a164cd51a7d
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/cpu_profiling_cn.md
@@ -0,0 +1,183 @@
+# CPU性能调优
+
+此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优(performance tuning)。
+
+Profiling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。
+
+PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用的是用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分:
+
+* Python 代码的性能分析
+* Python 与 C++ 混合代码的性能分析
+
+
+## Python代码的性能分析
+
+### 生成性能分析文件
+
+Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
+
+```bash
+python -m cProfile -o profile.out main.py
+```
+
+其中 `main.py` 是我们要分析的程序,`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印到标准输出。
+
+### 查看性能分析文件
+
+`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来:
+
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+
+其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
+
+用Web浏览器访问对应网址,即可显示性能分析的结果:
+
+```
+ ncalls tottime percall cumtime percall filename:lineno(function)
+ 1 0.284 0.284 29.514 29.514 main.py:1(<module>)
+ 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/executor.py:20(run)
+ 4696 12.040 0.003 12.040 0.003 {built-in method run}
+ 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+
+每一列的含义是:
+
+| 列名 | 含义 |
+| --- | --- |
+| ncalls | 函数的调用次数 |
+| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
+| percall | tottime的每次调用平均时间 |
+| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
+| percall | cumtime的每次调用平均时间 |
+| filename:lineno(function) | 文件名, 行号,函数名 |
+
+### 寻找性能瓶颈
+
+通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
+
+将性能分析结果按照tottime排序,效果如下:
+
+```text
+ 4696 12.040 0.003 12.040 0.003 {built-in method run}
+ 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+ 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__)
+ 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)
+ 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1(<module>)
+```
+
+可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。
+
+```text
+Called By:
+
+ Ordered by: internal time
+ List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+
+Function was called by...
+ ncalls tottime cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:487(clone)
+ 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:534(append_backward)
+
+
+Called:
+
+ Ordered by: internal time
+ List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+
+通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
+
+
+
+## Python与C++混合代码的性能分析
+
+### 生成性能分析文件
+
+C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了与`google-perftools`交互的方便方法。于是这里使用`yep`进行Python与C++混合代码的性能分析。
+
+使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为
+
+```bash
+apt update
+apt install libgoogle-perftools-dev
+pip install yep
+```
+
+安装完毕后,我们可以通过
+
+```bash
+python -m yep -v main.py
+```
+
+生成性能分析文件。生成的性能分析文件为`main.py.prof`。
+
+命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施:
+
+1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
+2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
+3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+
+### 查看性能分析文件
+
+在运行完性能分析后,会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。
+
+安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下:
+
+```bash
+go get github.com/google/pprof
+```
+
+进而我们可以使用如下命令开启一个HTTP服务:
+
+```bash
+pprof -http=0.0.0.0:3213 `which python` ./main.py.prof
+```
+
+这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
+
+访问对应的网址,我们可以查看性能分析的结果。结果如下图所示:
+
+
+
+
+### 寻找性能瓶颈
+
+与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
+
+例如下图中,
+
+
+
+在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。
+
+在`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。
diff --git a/doc/fluid/advanced_usage/development/gpu_profiling_cn.rst b/doc/fluid/advanced_usage/development/gpu_profiling_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f2396716bddd4810fa77c738d41f5482aa6d6055
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/gpu_profiling_cn.rst
@@ -0,0 +1,242 @@
+============
+GPU性能调优
+============
+
+.. contents::
+
+此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。
+
+- 什么是性能分析?
+- 为什么需要性能分析?
+- 如何进行性能分析?
+- 性能分析工具介绍
+- 详细教程
+- 性能分析小技巧
+
+什么是性能分析?
+================
+在软件工程的范畴里,性能分析(Profiling)是一个动态程序分析的术语,它可以指测量一个程序的空间(内存)复杂度或时间复杂度,
+也可以说是某些特定指令的使用情况,或者是函数调用的频率和耗时等。通常情况下,分析得到的信息用于协助进行程序的优化。
+
+简单来说,性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为,那程序分析工具是必不可少的利器。简单的性能分析,可以告诉您某个操作到底花了多长时间?而更深入的分析,甚至能解释为什么某个操作花了很长时间?
+
+为什么需要性能分析?
+============================
+训练好一个深层神经网络通常要耗费非常长的时间,所以性能也就逐步变成了深度学习领域最重要的指标。
+而优化性能的首要任务,是需要了解哪些步骤拖慢了整体。
+如果某一块根本就不怎么耗时,那也就不需要急着优化性能啦!
+
+如何进行性能分析?
+========================
+为了达到性能最优,您可以采用下面五个步骤:
+
+- 对代码进行性能分析
+- 找到运行慢的部分
+- 找到运行慢的原因
+- 修改成更快的版本
+- 再次对代码进行性能分析
+
+通常情况下,处理器有两个关键性能限制:一个是浮点计算量,另一个是内存操作量。
+GPU则还需要高并行性,才能发挥其全部能力。这正是它们速度快的原因。
+
+性能分析工具介绍
+======================
+就通常的GPU性能分析来说,市面上已经有NVIDIA或第三方提供的众多工具。
+
+**nvprof** 是Nvidia性能分析工具, **nvvp** 则是带GUI的Nvidia可视化性能分析工具。
+在这个教程中,我们主要会介绍nvprof和nvvp。
+
+:code:`paddle/legacy/math/tests` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
+
+.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+ :language: c++
+ :lines: 137-151
+ :linenos:
+
+上述的代码片段包含了两种方法,您可以任意使用一个或两个来对感兴趣的代码段做性能分析。
+
+1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装,可以用来计算CPU函数或cuda内核的时间消耗。
+
+2. :code:`REGISTER_GPU_PROFILER` 是一个封装对象,封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfilerStop` 两个操作;同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。
+
+您会在接下来的部分中获得更多的细节介绍。
+
+详细教程
+============
+
+内置定时器
+------------
+
+如果想要启用PaddlePaddle的内置定时器,您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。
+接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。
+下面举个简单的例子:
+
+1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数(如高亮部分)。
+
+ .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+ :language: c++
+ :lines: 137-151
+ :emphasize-lines: 8-12,14
+ :linenos:
+
+2. cmake配置中将 **WITH_TIMER** 打开,重新编译PaddlePaddle。
+
+ .. code-block:: bash
+
+ cmake .. -DWITH_TIMER=ON
+ make
+
+3. 执行您的代码,并观察结果(如高亮部分)。
+
+ .. code-block:: bash
+ :emphasize-lines: 1,12-15
+
+ > ./paddle/legacy/math/tests/test_GpuProfiler
+ I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler
+ I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+ I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+ [==========] Running 1 test from 1 test case.
+ [----------] Global test environment set-up.
+ [----------] 1 test from Profiler
+ [ RUN ] Profiler.BilinearFwdBwd
+ I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+ gSizeX = 64, imgSizeY = 64"
+ I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+ I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+ I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1
+ I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+ I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+ [ OK ] Profiler.BilinearFwdBwd (136 ms)
+ [----------] 1 test from Profiler (136 ms total)
+
+ [----------] Global test environment tear-down
+ [==========] 1 test from 1 test case ran. (136 ms total)
+ [ PASSED ] 1 test.
+
+nvprof 工具
+----------------
+
+要使用命令行分析工具 **nvprof**,您按如下步骤操作即可:
+
+1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中(参考强调部分)。
+
+ .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
+ :language: c++
+ :lines: 137-151
+ :emphasize-lines: 6-7
+ :linenos:
+
+2. cmake中将 **WITH_PROFILER** 配置打开,重新编译PaddlePaddle。
+
+ .. code-block:: bash
+
+ cmake .. -DWITH_PROFILER=ON
+ make
+
+3. 使用 **nvprof** 来分析执行文件。
+
+ .. code-block:: bash
+
+ nvprof ./paddle/legacy/math/tests/test_GpuProfiler
+
+然后,您就能获得如下的分析结果:
+
+.. code-block:: bash
+
+ ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler
+ ==78544== Profiling result:
+ Time(%) Time Calls Avg Min Max Name
+ 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD]
+ 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw
+ 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw
+ 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH]
+
+ ==78544== API calls:
+ Time(%) Time Calls Avg Min Max Name
+ 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags
+ 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree
+ 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate
+ 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy
+ 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize
+ 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc
+ 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc
+ 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice
+ 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags
+ 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute
+ 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount
+ 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties
+ 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch
+ 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName
+ 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem
+ 0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice
+ 0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate
+ 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute
+ 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart
+ 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall
+ 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError
+ 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument
+ 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet
+ 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount
+ 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion
+ 0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit
+ 0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion
+
+
+nvvp 工具
+--------------
+
+如果想使用可视化的分析器 **nvvp**,您可以导入 :code:`nvprof -o ...` 的输出,或者从工具的界面里运行您的应用。
+
+**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启)
+
+.. image:: nvvp1.png
+ :align: center
+ :scale: 33%
+
+从内核函数的角度, **nvvp** 可以精确说明一个长耗时操作的具体原因。
+同时,如下图所示, **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。
+
+
+.. image:: nvvp2.png
+ :align: center
+ :scale: 33%
+
+而从应用的角度, **nvvp** 可以帮您提供一些定位性能瓶颈的建议。
+例如,下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议,为您做性能调优提供了方向。
+
+.. image:: nvvp3.png
+ :align: center
+ :scale: 33%
+
+.. image:: nvvp4.png
+ :align: center
+ :scale: 33%
+
+性能分析小技巧
+==================
+
+- 开始阶段,从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。
+- 接下来可以考虑下时间线的分析。
+- 如果真想挖掘内核深处的某个秘密,您最好先确认:这一块的耗时比例真的太高,值得深入分析。
+- 可能的情况下,试着让输出的分析数据和理论值对应。
+
+ 1) 例如,如果我知道内核花了10ms来移动1GB数据,那我会期望分析工具统计到速度是100GB/s。
+ 2) 若有不一致之处,很有可能实际应用就是没有按照您的预期情况运行。
+- 了解您的硬件:如果您的GPU理论可以达到6 TFLOPs(6万亿次浮点运算每秒),而当前已经有5.5 TFLOPs了,那估计这里的潜力就没啥好挖的了……
+
+性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果!
+当然,具体情况因人而异。
+
+参考资料
+===========
+Jeremy Appleyard, `GPU Profiling for Deep Learning `_, 2015
diff --git a/doc/fluid/advanced_usage/development/host_memory_profiling_cn.md b/doc/fluid/advanced_usage/development/host_memory_profiling_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..7fb0883dd937465d15479b29df95078edb50e069
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/host_memory_profiling_cn.md
@@ -0,0 +1,89 @@
+# 堆内存分析和优化
+
+计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放。随着程序的运行,占用的内存越来越大:一方面会影响程序的稳定性,可能让运行速度越来越慢,或者造成OOM;另一方面甚至会影响运行该程序的机器的稳定性,造成宕机。
+
+
+目前有很多内存泄漏分析工具,比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro), [gperftools](https://gperftools.github.io/gperftools/)。
+
+因为Fluid是用Python驱动C++ core来运行,valgrind直接分析非常困难,需要自己编译debug版本的、带valgrind支持的专用Python版本,而且输出的信息中大部分是Python自己的符号和调用信息,分析起来很困难,另外使用valgrind会让程序运行速度变得非常慢,所以不建议使用。
+
+本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。
+
+gperftools主要支持以下四个功能:
+
+- thread-caching malloc
+- heap-checking using tcmalloc
+- heap-profiling using tcmalloc
+- CPU profiler
+
+Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。
+
+对于堆内存的分析,主要用到thread-caching malloc和heap-profiling using tcmalloc。
+
+## 环境
+
+本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev,基于Ubuntu 16.04.4 LTS环境。
+
+## 使用流程
+
+- 安装google-perftools
+
+```
+apt-get install libunwind-dev
+apt-get install google-perftools
+```
+
+- 安装pprof
+
+```
+go get -u github.com/google/pprof
+```
+
+- 设置运行环境
+
+```
+export PPROF_PATH=/root/gopath/bin/pprof
+export PPROF_BINARY_PATH=/root/gopath/bin/pprof
+export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
+```
+
+- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。
+
+```
+# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀
+# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少字节的存储就dump一次,默认1GB
+env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py
+```
+
+随着程序的运行,会在perf_log这个文件夹下生成很多文件,如下:
+
+```
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0001.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0002.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0003.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0004.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0005.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0006.heap
+```
+
+- 使用pprof对heap文件进行分析。分析有两种模式:
+ - 完整模式。会对当前heap做一个分析,显示目前分配内存一些调用路径。
+
+ ```
+ pprof --pdf python test.log.0012.heap
+ ```
+ 上述命令会生成一个profile00x.pdf的文件,可以直接打开,例如:[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出,在CPU版本fluid的运行过程中,分配存储最多的模块是CPUAllocator。而别的模块相对而言分配内存较少,所以被忽略了,这对于分析内存泄漏是很不方便的,因为泄漏是一个缓慢的过程,在这种图中是无法看到的。
+
+ 
+
+ - Diff模式。可以对两个时刻的heap做diff,把一些内存分配没有发生变化的模块去掉,而把增量部分显示出来。
+ ```
+ pprof --pdf --base test.log.0010.heap python test.log.1045.heap
+ ```
+ 生成的结果为:[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf)
+
+ 从图中可以看出:ProgramDesc这个结构,在两个版本之间增长了200MB+,所以这里有很大的内存泄漏的可能性,最终结果也确实证明是这里造成了泄漏。
+
+ 
+ 
+
diff --git a/doc/fluid/advanced_usage/development/new_op.md b/doc/fluid/advanced_usage/development/new_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..ff7408111fa20a7a6a3a2fe9f9ba20835918f399
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/new_op.md
@@ -0,0 +1,435 @@
+# 如何写新的Operator
+
+ - [概念简介](#概念简介)
+ - [实现C++类](#实现c类)
+ - [定义ProtoMaker类](#定义protomaker类)
+ - [定义Operator类](#定义operator类)
+ - [定义OpKernel类](#定义opkernel类)
+ - [注册Operator](#注册operator)
+ - [编译](#编译)
+ - [绑定Python](#绑定python)
+ - [实现单元测试](#实现单元测试)
+ - [前向Operator单测](#前向operator单测)
+ - [反向Operator单测](#反向operator单测)
+ - [编译和执行](#编译和执行)
+ - [注意事项](#注意事项)
+
+
+## 概念简介
+
+简单介绍需要用到的基类,详细介绍请参考设计文档。
+
+- `framework::OperatorBase`: Operator(简写,Op)基类。
+- `framework::OpKernel`: Op计算函数的基类,称作Kernel。
+- `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。
+- `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
+
+依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下:
+
+
+| 内容 | 定义位置 |
+| --- | --- |
+| OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake |
+| Op定义 | `.cc`文件 |
+| Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中。 |
+| 注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 |
+
+实现新的op都添加至目录[paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
+
+
+下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
+
+
+## 实现C++类
+
+
+### 定义ProtoMaker类
+
+矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。
+
+首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释:
+
+```cpp
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("X", "(Tensor), 2D tensor of size (M x K)");
+ AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
+ AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
+ AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+ }
+};
+```
+
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)继承自`framework::OpProtoAndCheckerMaker`,构造函数含有2个参数:
+
+ - `framework::OpProto` : 前者存储Op的输入输出和参数属性,将用于Python API接口的生成。
+ - `framework::OpAttrChecker` :后者用于检查参数属性的合法性。
+
+构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。
+
+上面的代码在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md)。
+
+
+再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55)为例:
+
+```cpp
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("X", "(Tensor) Input tensor of scale operator.");
+ AddOutput("Out", "(Tensor) Output tensor of scale operator.");
+ AddComment(R"DOC(
+Scale operator
+$$Out = scale*X$$
+)DOC");
+ AddAttr("scale",
+ "(float, default 1.0)"
+ "The scaling factor of the scale operator.")
+ .SetDefault(1.0);
+ }
+};
+```
+
+这个例子有`AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。
+
+### 定义GradProtoMaker类
+每个Op必须有一个对应的GradProtoMaker。若未定制对应前向Op的GradProtoMaker,fluid提供了DefaultGradProtoMaker,默认注册会使用全部输入输出,包括Input, Output, Output@Grad等,使用不需要的变量会造成显存浪费。
+下面示例定义了ScaleOp的GradProtoMaker。
+
+```cpp
+class ScaleGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+ using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ std::unique_ptr<framework::OpDesc> Apply() const override {
+ auto *grad_op = new framework::OpDesc();
+ grad_op->SetType("scale");
+ grad_op->SetInput("X", OutputGrad("Out"));
+ grad_op->SetOutput("Out", InputGrad("X"));
+ grad_op->SetAttr("scale", GetAttr("scale"));
+ return std::unique_ptr<framework::OpDesc>(grad_op);
+ }
+};
+```
+
+### 定义Operator类
+
+下面实现了MulOp的定义:
+
+```cpp
+class MulOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ void InferShape(const framework::InferShapeContext &ctx) const override {
+ auto dim0 = ctx.Input("X")->dims();
+ auto dim1 = ctx.Input("Y")->dims();
+ PADDLE_ENFORCE_EQ(dim0.size(), 2,
+ "input X(%s) should be a tensor with 2 dims, a matrix",
+ ctx.op_.Input("X"));
+ PADDLE_ENFORCE_EQ(dim1.size(), 2,
+ "input Y(%s) should be a tensor with 2 dims, a matrix",
+ ctx.op_.Input("Y"));
+ PADDLE_ENFORCE_EQ(
+ dim0[1], dim1[0],
+ "First matrix's width must be equal with second matrix's height.");
+ ctx.Output("Out")->Resize({dim0[0], dim1[1]});
+ }
+};
+```
+
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员:
+
+```cpp
+using framework::OperatorWithKernel::OperatorWithKernel;
+```
+
+这句表示使用基类`OperatorWithKernel`的构造函数,也可写成:
+
+```cpp
+MulOp(const std::string &type, const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : OperatorWithKernel(type, inputs, outputs, attrs) {}
+```
+
+还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是:
+
+ - 1). 做检查, 尽早报错:检查输入数据维度、类型等是否合法。
+ - 2). 设置输出Tensor的形状。
+
+通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`中
+
+### 定义OpKernel类
+
+`MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数:
+
+- `typename DeviceContext`: 表示设备类型,不同设备(CPU、CUDA)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
+
+- `typename T` : 表示数据类型,如`float`, `double`等。
+
+需要为`MulKernel`类重写`Compute`接口。
+- `Compute`接受一个输入参数:`const framework::ExecutionContext& context`。
+- 与`InferShapeContext`相比,`ExecutionContext`增加了设备类型,同样可获取到输入输出和属性参数。
+- `Compute`函数里实现`OpKernel`的具体计算逻辑。
+
+下面是 `MulKernel` `Compute`的实现:
+
+ ```cpp
+ template <typename DeviceContext, typename T>
+ class MulKernel : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext& context) const override {
+ auto* X = context.Input<Tensor>("X");
+ auto* Y = context.Input<Tensor>("Y");
+ auto* Z = context.Output<Tensor>("Out");
+ Z->mutable_data<T>(context.GetPlace());
+ auto& device_context = context.template device_context<DeviceContext>();
+ math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+ }
+ };
+ ```
+
+需要注意:**不同设备(CPU、CUDA)共享一个Op定义,是否共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。**
+
+`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
+
+为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。
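+
+下面给出一个借助Eigen实现`Compute`的简化示意(以scale计算为例,省略了头文件以及Op定义和注册等代码,具体写法请以上面链接的Eigen使用文档为准):
+
+```cpp
+template <typename DeviceContext, typename T>
+class ScaleKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+
+    auto scale = static_cast<T>(context.Attr<float>("scale"));
+    auto x = framework::EigenVector<T>::Flatten(*in);
+    auto y = framework::EigenVector<T>::Flatten(*out);
+    // eigen_device() 同时适配CPU和CUDA的DeviceContext,因此同一份代码可在两种设备上复用
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    y.device(place) = x * scale;
+  }
+};
+```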
+
+到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。
+反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。
+
+### 注册Operator
+
+- 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。
+
+ ```cpp
+ namespace ops = paddle::operators;
+ REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+ paddle::framework::DefaultGradOpDescMaker<true>)
+ REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
+ REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
+ REGISTER_OP_CPU_KERNEL(mul_grad,
+ ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
+ ```
+
+ 在上面的代码中:
+
+ - `REGISTER_OPERATOR` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad`。
+ - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUDeviceContext`和`float`类型,同理,注册`ops::MulGradKernel`类。
+
+
+- 在 `.cu`文件中注册CUDA Kernel。
+ - 请注意,如果CUDA Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下:
+
+ ```cpp
+ // if use Eigen unsupported module before include head files
+ #define EIGEN_USE_GPU
+
+ namespace ops = paddle::operators;
+ REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
+ REGISTER_OP_CUDA_KERNEL(mul_grad,
+ ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
+ ```
+
+### 编译
+
+运行下面命令可以进行编译:
+
+```
+make mul_op
+```
+
+## 绑定Python
+
+系统会对新增的op自动绑定Python,并链接到生成的lib库中。
+
+## 实现单元测试
+
+单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py)。
+
+### 前向Operator单测
+
+Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator,需要:
+
+1. 在`setUp`函数定义输入、输出,以及相关的属性参数。
+2. 生成随机的输入数据。
+3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。
+4. 反向计算已经自动集成进测试框架,直接调用相应接口即可。
+
+
+ ```python
+ import unittest
+ import numpy as np
+ from op_test import OpTest
+
+
+ class TestMulOp(OpTest):
+ def setUp(self):
+ self.op_type = "mul"
+ self.inputs = {
+ 'X': np.random.random((32, 84)).astype("float32"),
+ 'Y': np.random.random((84, 100)).astype("float32")
+ }
+ self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+
+ def test_check_output(self):
+ self.check_output()
+
+ def test_check_grad_normal(self):
+ self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+
+ def test_check_grad_ingore_x(self):
+ self.check_grad(
+ ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+ def test_check_grad_ingore_y(self):
+ self.check_grad(
+ ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+ ```
+
+上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释:
+
+- `self.op_type = "mul" ` : 定义类型,与operator注册时注册的类型一致。
+- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。
+- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。
+
+### 反向operator单测
+
+而反向测试中:
+- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
+ - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
+ - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
+ - 第三个参数`max_relative_error`:指定检测梯度时能容忍的最大错误值。
+- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
+
+
+### 编译和执行
+
+`python/paddle/fluid/tests/unittests/` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
+
+请注意,**不同于Op的编译测试,运行单元测试时需要编译整个工程**,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后,执行下面的命令来运行单元测试:
+
+```bash
+make test ARGS="-R test_mul_op -V"
+```
+
+或者:
+
+```bash
+ctest -R test_mul_op
+```
+
+## 注意事项
+
+- 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OPERATOR(B, ...)`等,这将会导致单元测试出错。
+- 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。
+- 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。
+
+### PADDLE_ENFORCE使用注意
+
+实现Op时检查数据的合法性需要使用PADDLE_ENFORCE以及PADDLE_ENFORCE_EQ等宏定义,基本格式如下:
+
+```
+PADDLE_ENFORCE(表达式, 错误提示信息)
+PADDLE_ENFORCE_EQ(比较对象A, 比较对象B, 错误提示信息)
+```
+
+如果表达式为真,或者比较对象A=B,则检查通过,否则会终止程序运行,向用户反馈相应的错误提示信息。
+为了确保提示友好易懂,开发者需要注意其使用方法。
+
+#### 总体原则
+
+任何使用了PADDLE_ENFORCE与PADDLE_ENFORCE_XX(如PADDLE_ENFORCE_EQ)检查的地方,必须有详略得当的备注解释!**错误提示信息不能为空!**
+
+#### 提示信息书写标准
+
+1. [required] 哪里错了?为什么错了?
+ - 例如:`ValueError: Mismatched label shape`
+2. [optional] 期望的输入是什么样的?实际的输入是怎样的?
+ - 例如:`Expected labels dimension=1. Received 4.`
+3. [optional] 能否给出修改意见?
+ - 例如:`Suggested Fix: If your classifier expects one-hot encoding label, check your n_classes argument to the estimator and/or the shape of your label. Otherwise, check the shape of your label.`
+
+如果并非必要,或者简洁的描述即可表达清楚以上要点,也可根据情况酌情书写。
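+
+下面是一个把上述要点组合起来的写法示意(变量名与报错文案均为示例):
+
+```cpp
+PADDLE_ENFORCE_EQ(label_dims.size(), 1,
+                  "Mismatched label shape: expected labels dimension=1, "
+                  "but received %d. Suggested Fix: if your classifier expects "
+                  "one-hot encoding label, check your n_classes argument to "
+                  "the estimator and/or the shape of your label.",
+                  label_dims.size());
+```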
+
+##### FAQ 典型问题
+
+1. 无报错信息或报错信息过于简单,不能给用户提供有效的提示!
+
+问题示例1 :未写提示信息
+```
+PADDLE_ENFORCE(ctx->HasInput("X"), "");
+```
+问题示例2 :提示信息过于简单
+```
+PADDLE_ENFORCE(i != nullptr, "i must be set"); // i是什么?
+```
+
+2. 在报错信息中使用开发人员定义的变量缩写,不易理解!
+
+问题示例:
+```
+PADDLE_ENFORCE(forward_pd != nullptr,
+ "Fail to find eltwise_fwd_pd in device context"); //eltwise_fwd_pd用户可能看不懂
+```
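+
+一种可能的改进写法是避免使用只有开发人员才懂的缩写,并补充排查建议(报错文案仅为示意):
+
+```cpp
+PADDLE_ENFORCE(forward_pd != nullptr,
+               "Cannot find the MKLDNN forward primitive descriptor of the "
+               "eltwise operator in the device context. Please make sure the "
+               "forward op has been run before its grad op.");
+```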
+
+3. OP内部调用非法接口:Op内部如果出现Output = ShareDataWith(Input)
+问题示例:
+```cpp
+auto *out = ctx.Output("Out");
+auto *in = ctx.Input("X");
+out->ShareDataWith(*in);
+```
+Op内部如果出现Output = ShareDataWith(Input),相当于operator图的中有一条隐藏边,连接了Input和Output,这条边无法在图分析中表达,引发基于图优化的错误。
+
+4. OP实现的性能实践
+调用了eigen的broadcast, chop等操作,性能会比手写CUDA kernel差几倍以上。此时CPU的实现可以复用eigen,GPU实现则可以手写CUDA kernel。
+
+
+#### OP InferShape检查提示信息特别说明
+
+- 检查输入输出变量,请统一遵循以下格式
+`Input(变量名) of OP名 operator should not be null.`
+
+正确示例:
+```
+PADDLE_ENFORCE(ctx->HasInput("Input"),
+ "Input(Input) of LSTMP operator should not be null.");
+```
+
+- 反向Op的输入输出检查,要写明反向Op的名字
+
+正确示例:
+```
+PADDLE_ENFORCE(ctx->HasInput("X"),
+ "Input(X) of LoDResetGrad opreator should not be null.");
+```
diff --git a/source/advanced_usage/development/nvvp1.png b/doc/fluid/advanced_usage/development/nvvp1.png
similarity index 100%
rename from source/advanced_usage/development/nvvp1.png
rename to doc/fluid/advanced_usage/development/nvvp1.png
diff --git a/source/advanced_usage/development/nvvp2.png b/doc/fluid/advanced_usage/development/nvvp2.png
similarity index 100%
rename from source/advanced_usage/development/nvvp2.png
rename to doc/fluid/advanced_usage/development/nvvp2.png
diff --git a/source/advanced_usage/development/nvvp3.png b/doc/fluid/advanced_usage/development/nvvp3.png
similarity index 100%
rename from source/advanced_usage/development/nvvp3.png
rename to doc/fluid/advanced_usage/development/nvvp3.png
diff --git a/source/advanced_usage/development/nvvp4.png b/doc/fluid/advanced_usage/development/nvvp4.png
similarity index 100%
rename from source/advanced_usage/development/nvvp4.png
rename to doc/fluid/advanced_usage/development/nvvp4.png
diff --git a/source/advanced_usage/development/pprof_1.png b/doc/fluid/advanced_usage/development/pprof_1.png
similarity index 100%
rename from source/advanced_usage/development/pprof_1.png
rename to doc/fluid/advanced_usage/development/pprof_1.png
diff --git a/source/advanced_usage/development/pprof_2.png b/doc/fluid/advanced_usage/development/pprof_2.png
similarity index 100%
rename from source/advanced_usage/development/pprof_2.png
rename to doc/fluid/advanced_usage/development/pprof_2.png
diff --git a/source/advanced_usage/development/timeline.jpeg b/doc/fluid/advanced_usage/development/timeline.jpeg
similarity index 100%
rename from source/advanced_usage/development/timeline.jpeg
rename to doc/fluid/advanced_usage/development/timeline.jpeg
diff --git a/doc/fluid/advanced_usage/development/timeline_cn.md b/doc/fluid/advanced_usage/development/timeline_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..faf39f276dbddcd4961407ba2d082c9826051cbe
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/timeline_cn.md
@@ -0,0 +1,32 @@
+# How to profile performance with the timeline tool
+
+1. Wrap the iterations you want to profile with `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` in the main training loop. After the program runs, a profile record file is written to `/tmp/profile`.
+
+   **Tip:**
+   Do not record too many iterations in the timeline, because the number of records in the timeline grows in proportion to the number of iterations.
+
+ ```python
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
+            ...
+ ```
+
+1. Run `python Paddle/tools/timeline.py` to process `/tmp/profile`. By default the script writes the result to `/tmp/timeline`; you can change this path via command-line arguments, see [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py).
+```bash
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
+1. Open the Chrome browser, go to <chrome://tracing/>, and use the `load` button to load the generated `timeline` file.
+
+ 
+
+1. The result looks like the figure below; you can zoom in to inspect the details of the timeline.
+
+ 
diff --git a/source/advanced_usage/development/tracing.jpeg b/doc/fluid/advanced_usage/development/tracing.jpeg
similarity index 100%
rename from source/advanced_usage/development/tracing.jpeg
rename to doc/fluid/advanced_usage/development/tracing.jpeg
diff --git a/doc/fluid/advanced_usage/development/write_docs.rst b/doc/fluid/advanced_usage/development/write_docs.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4231f2bb5cd800c0cd86835b5d07e491fcde4989
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/write_docs.rst
@@ -0,0 +1,136 @@
+###############################
+How to Contribute Documentation
+###############################
+
+PaddlePaddle's documentation comes in both Chinese and English. Both are built with ``sphinx`` driven by ``cmake``; the PaddlePaddle.org tool can run this build for us and provides a better preview experience.
+
+How to build the documentation
+==============================
+
+There are two ways to build the documentation: with the PaddlePaddle.org tool or without it. Each has its advantages: the former makes previewing easy, while the latter is convenient for developers who need to debug the build. Both ways can in turn be used either with or without Docker.
+
+We recommend building the documentation with the PaddlePaddle.org tool.
+
+Using the PaddlePaddle.org tool
+-------------------------------
+This is the currently recommended approach. Besides building the documentation automatically, it lets you preview the documentation directly in a web page. Note that the other approaches described later can also produce a preview, but its styling differs from the official website; only a build done with the PaddlePaddle.org tool produces a preview whose style matches the official documentation.
+
+The PaddlePaddle.org tool can be used together with Docker, which requires the Docker toolkit to be installed on your system first; see the `Docker website `_ for installation instructions. Once Docker is installed, start the tool with the following commands:
+
+.. code-block:: bash
+
+ mkdir paddlepaddle # Create paddlepaddle working directory
+ cd paddlepaddle
+
+ # Clone the content repositories
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ git clone https://github.com/PaddlePaddle/book.git
+ git clone https://github.com/PaddlePaddle/models.git
+ git clone https://github.com/PaddlePaddle/Mobile.git
+
+ # Please specify the working directory through -v
+ docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+Note: PaddlePaddle.org runs its commands against the content repositories specified with -v (volume).
+Then point a browser at http://localhost:8000 to generate the documentation you need in the web page.
+The built files are stored under the working directory /.ppo_workspace/content.
+
+If you prefer not to use Docker, you can also start the tool's server directly through its Django application. Run it with the following commands:
+
+.. code-block:: bash
+
+ mkdir paddlepaddle # Create paddlepaddle working directory
+ cd paddlepaddle
+
+ # Clone the content repositories and PaddlePaddle.org
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ git clone https://github.com/PaddlePaddle/book.git
+ git clone https://github.com/PaddlePaddle/models.git
+ git clone https://github.com/PaddlePaddle/Mobile.git
+ git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+ # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+ export CONTENT_DIR=
+ export ENV=''
+ cd PaddlePaddle.org/portal/
+ pip install -r requirements.txt
+ python manage.py runserver
+
+The tool's server reads the CONTENT_DIR environment variable to locate the repositories, so assign your PaddlePaddle working directory to CONTENT_DIR.
+Then point a browser at http://localhost:8000 to generate the documentation you need in the web page.
+The built files are stored under the working directory /.ppo_workspace/content.
+
+For more details about the PaddlePaddle.org tool, see `this page `_.
+
+Without the PaddlePaddle.org tool
+---------------------------------
+
+Building the PaddlePaddle documentation with Docker also requires the Docker toolkit to be installed on your system first; see the `Docker website `_ for installation instructions. The procedure is similar to `building PaddlePaddle from source `_: build and run a Docker image from the source tree that can compile the documentation, then use the script shipped with the source inside the container to build the docs. The concrete steps are:
+
+.. code-block:: bash
+
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ cd Paddle
+
+ # Build a Docker image from source that can compile the PaddlePaddle documentation
+ docker build -t paddle:dev .
+ docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+ # Inside the Docker container, build the PaddlePaddle documentation with the build.sh script
+ bash -x /paddle/paddle/scripts/docker/build.sh
+
+Note: the commands above map the current directory (the source root) to the :code:`/paddle` directory inside the container.
+
+After the build finishes, two directories, ``doc/v2`` and ``doc/fluid``, are produced. Each contains three subdirectories: ``cn/html/``, ``en/html`` and ``api/en/html``. Enter each of these subdirectories and run:
+
+.. code-block:: bash
+
+ python -m SimpleHTTPServer 8088
+
+Then open http://localhost:8088 in a browser to see the generated Chinese/English documentation pages for both the ``v2`` and ``fluid`` versions, as well as the English API pages.
+
+If you prefer not to use Docker, you can also build the PaddlePaddle documentation directly with the following commands:
+
+.. code-block:: bash
+
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ cd Paddle
+ mkdir -p build
+ cd build
+ cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+
+ # To build only the user documentation, run
+ make -j $processors paddle_docs
+
+ # To build only the API documentation, run
+ make -j $processors paddle_apis
+
+Here $processors means launching as many parallel build processes as there are CPU cores; set it according to the number of cores on your machine.
+
+After the build finishes, the same ``doc/v2`` and ``doc/fluid`` directories are produced. If you built the user documentation, each of them contains the ``cn/html/`` and ``en/html`` subdirectories; if you built the API documentation, each contains an ``api/en/html`` directory. Enter each of these subdirectories and run:
+
+.. code-block:: bash
+
+ python -m SimpleHTTPServer 8088
+
+Then open http://localhost:8088 in a browser to see the generated Chinese/English documentation pages for both the ``v2`` and ``fluid`` versions, as well as the English API pages. The figure below shows the home page of the generated ``v2`` English documentation as an example. Note that because the example uses sphinx's default theme, the page style differs from the official website, but this does not affect debugging.
+
+.. image:: src/doc_en.png
+ :align: center
+ :scale: 60 %
+
+How to write documentation
+==========================
+
+PaddlePaddle's documentation is generated automatically with `sphinx`_; refer to the sphinx tutorials when writing documentation.
+
+How to update www.paddlepaddle.org
+==================================
+
+Documentation updates are submitted to GitHub as pull requests; see `How to contribute documentation `_ for the submission process.
+The documentation of PaddlePaddle's develop branch is rebuilt automatically, and the latest `Chinese documentation `_ and
+`English documentation `_ can be viewed online.
+
+
+.. _cmake: https://cmake.org/
+.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/fluid/advanced_usage/index.rst b/doc/fluid/advanced_usage/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..89166573eebca045e948046c69f3b7a3e0031d58
--- /dev/null
+++ b/doc/fluid/advanced_usage/index.rst
@@ -0,0 +1,22 @@
+##############
+Advanced Usage
+##############
+
+
+.. todo::
+
+ Complete this guide
+
+.. toctree::
+ :maxdepth: 2
+
+ deploy/index_anakin.rst
+ deploy/index_mobile.rst
+ development/contribute_to_paddle.md
+ development/write_docs.rst
+ development/new_op.md
+ development/cpu_profiling_cn.md
+ development/gpu_profiling_cn.rst
+ development/host_memory_profiling_cn.md
+ development/timeline_cn.md
+ benchmark.rst
diff --git a/doc/fluid/advanced_usage/pics/anakin_fm_ch.png b/doc/fluid/advanced_usage/pics/anakin_fm_ch.png
new file mode 100644
index 0000000000000000000000000000000000000000..52d4992a22397119af949aa7c11a9ea6365c167c
Binary files /dev/null and b/doc/fluid/advanced_usage/pics/anakin_fm_ch.png differ
diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..435d6e10fb02e9b2a8147f37da33e8848cc9b98a
--- /dev/null
+++ b/doc/fluid/api/CMakeLists.txt
@@ -0,0 +1,25 @@
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+set(IMPORT_PADDLE_STRING "import paddle")
+set(IMPORT_PADDLEV2_STRING "import paddle.v2")
+
+configure_file(
+ "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
+ "${BINARY_BUILD_DIR_EN}/conf.py"
+ @ONLY)
+
+sphinx_add_target(paddle_fluid_apis
+ html
+ ${BINARY_BUILD_DIR_EN}
+ ${SPHINX_CACHE_DIR_EN}
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
diff --git a/doc/fluid/api/average.rst b/doc/fluid/api/average.rst
new file mode 100644
index 0000000000000000000000000000000000000000..496f5b29875443f0c44f50fcb3ca837f4e7bcd12
--- /dev/null
+++ b/doc/fluid/api/average.rst
@@ -0,0 +1,16 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+=============
+fluid.average
+=============
+
+.. _api_fluid_average_WeightedAverage:
+
+WeightedAverage
+---------------
+
+.. autoclass:: paddle.fluid.average.WeightedAverage
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/backward.rst b/doc/fluid/api/backward.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0076394543c2f87e90fa1ea989d7b5cbf468a6f7
--- /dev/null
+++ b/doc/fluid/api/backward.rst
@@ -0,0 +1,15 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+==============
+fluid.backward
+==============
+
+.. _api_fluid_backward_append_backward:
+
+append_backward
+---------------
+
+.. autofunction:: paddle.fluid.backward.append_backward
+ :noindex:
+
diff --git a/doc/fluid/api/clip.rst b/doc/fluid/api/clip.rst
new file mode 100644
index 0000000000000000000000000000000000000000..aeefbb95a46e5d5ed46375e388a720fad2711779
--- /dev/null
+++ b/doc/fluid/api/clip.rst
@@ -0,0 +1,43 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+fluid.clip
+==========
+
+.. _api_fluid_clip_ErrorClipByValue:
+
+ErrorClipByValue
+----------------
+
+.. autoclass:: paddle.fluid.clip.ErrorClipByValue
+ :members:
+ :noindex:
+
+.. _api_fluid_clip_GradientClipByValue:
+
+GradientClipByValue
+-------------------
+
+.. autoclass:: paddle.fluid.clip.GradientClipByValue
+ :members:
+ :noindex:
+
+.. _api_fluid_clip_GradientClipByNorm:
+
+GradientClipByNorm
+------------------
+
+.. autoclass:: paddle.fluid.clip.GradientClipByNorm
+ :members:
+ :noindex:
+
+.. _api_fluid_clip_GradientClipByGlobalNorm:
+
+GradientClipByGlobalNorm
+------------------------
+
+.. autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/data/data_reader.rst b/doc/fluid/api/data/data_reader.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1a35d0bbc8f9d751f49c7e1fc26feb1bcb3ae7f0
--- /dev/null
+++ b/doc/fluid/api/data/data_reader.rst
@@ -0,0 +1,72 @@
+=====================
+Data Reader Interface
+=====================
+
+
+DataTypes
+=========
+
+.. autofunction:: paddle.v2.data_type.dense_array
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.integer_value
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.integer_value_sequence
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_binary_vector
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_float_vector
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_non_value_slot
+ :noindex:
+
+.. autofunction:: paddle.v2.data_type.sparse_value_slot
+ :noindex:
+
+.. autoclass:: paddle.v2.data_type.InputType
+ :members:
+ :noindex:
+
+DataFeeder
+==========
+
+.. automodule:: paddle.v2.data_feeder
+ :members:
+ :noindex:
+
+Reader
+======
+
+.. automodule:: paddle.reader
+ :members:
+ :noindex:
+
+.. automodule:: paddle.reader.creator
+ :members:
+ :noindex:
+
+minibatch
+=========
+
+.. automodule:: paddle.v2.minibatch
+ :members:
+ :noindex:
diff --git a/doc/fluid/api/data/dataset.rst b/doc/fluid/api/data/dataset.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e7c8be4452bf55e0967d750c2e624e8e316e9330
--- /dev/null
+++ b/doc/fluid/api/data/dataset.rst
@@ -0,0 +1,82 @@
+Dataset
+=======
+
+.. automodule:: paddle.dataset
+ :members:
+ :noindex:
+
+mnist
++++++
+
+.. automodule:: paddle.dataset.mnist
+ :members:
+ :noindex:
+
+cifar
++++++
+
+.. automodule:: paddle.dataset.cifar
+ :members:
+ :noindex:
+
+conll05
++++++++
+
+.. automodule:: paddle.dataset.conll05
+ :members: get_dict,get_embedding,test
+ :noindex:
+
+imdb
+++++
+
+.. automodule:: paddle.dataset.imdb
+ :members:
+ :noindex:
+
+imikolov
+++++++++
+
+.. automodule:: paddle.dataset.imikolov
+ :members:
+ :noindex:
+
+movielens
++++++++++
+
+.. automodule:: paddle.dataset.movielens
+ :members:
+ :noindex:
+
+.. autoclass:: paddle.dataset.movielens.MovieInfo
+ :noindex:
+
+.. autoclass:: paddle.dataset.movielens.UserInfo
+ :noindex:
+
+sentiment
++++++++++
+
+.. automodule:: paddle.dataset.sentiment
+ :members:
+ :noindex:
+
+uci_housing
++++++++++++
+
+.. automodule:: paddle.dataset.uci_housing
+ :members:
+ :noindex:
+
+wmt14
++++++
+
+.. automodule:: paddle.dataset.wmt14
+ :members:
+ :noindex:
+
+wmt16
++++++
+
+.. automodule:: paddle.dataset.wmt16
+ :members:
+ :noindex:
diff --git a/doc/fluid/api/data/image.rst b/doc/fluid/api/data/image.rst
new file mode 100644
index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49
--- /dev/null
+++ b/doc/fluid/api/data/image.rst
@@ -0,0 +1,5 @@
+Image Interface
+===============
+
+.. automodule:: paddle.v2.image
+ :members:
diff --git a/doc/fluid/api/data_feeder.rst b/doc/fluid/api/data_feeder.rst
new file mode 100644
index 0000000000000000000000000000000000000000..11d2890f5b3446e37c3ef31e5a17ebebe169dbc8
--- /dev/null
+++ b/doc/fluid/api/data_feeder.rst
@@ -0,0 +1,16 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+=================
+fluid.data_feeder
+=================
+
+.. _api_fluid_data_feeder_DataFeeder:
+
+DataFeeder
+----------
+
+.. autoclass:: paddle.fluid.data_feeder.DataFeeder
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f23ecc1f80030f20359ce9675130a167722606c9
--- /dev/null
+++ b/doc/fluid/api/executor.rst
@@ -0,0 +1,40 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+==============
+fluid.executor
+==============
+
+.. _api_fluid_executor_Executor:
+
+Executor
+--------
+
+.. autoclass:: paddle.fluid.executor.Executor
+ :members:
+ :noindex:
+
+.. _api_fluid_executor_global_scope:
+
+global_scope
+------------
+
+.. autofunction:: paddle.fluid.executor.global_scope
+ :noindex:
+
+.. _api_fluid_executor_scope_guard:
+
+scope_guard
+-----------
+
+.. autofunction:: paddle.fluid.executor.scope_guard
+ :noindex:
+
+.. _api_fluid_executor__switch_scope:
+
+_switch_scope
+-------------
+
+.. autofunction:: paddle.fluid.executor._switch_scope
+ :noindex:
+
diff --git a/doc/fluid/api/fluid.rst b/doc/fluid/api/fluid.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f76c7aab7be0b9703642bbf9de26cc298c849fb3
--- /dev/null
+++ b/doc/fluid/api/fluid.rst
@@ -0,0 +1,338 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+=====
+fluid
+=====
+
+.. _api_fluid_Program:
+
+Program
+-------
+
+.. autoclass:: paddle.fluid.Program
+ :members:
+ :noindex:
+
+.. _api_fluid_Operator:
+
+Operator
+--------
+
+.. autoclass:: paddle.fluid.Operator
+ :members:
+ :noindex:
+
+.. _api_fluid_Parameter:
+
+Parameter
+---------
+
+.. autoclass:: paddle.fluid.Parameter
+ :members:
+ :noindex:
+
+.. _api_fluid_default_startup_program:
+
+default_startup_program
+-----------------------
+
+.. autofunction:: paddle.fluid.default_startup_program
+ :noindex:
+
+.. _api_fluid_default_main_program:
+
+default_main_program
+--------------------
+
+.. autofunction:: paddle.fluid.default_main_program
+ :noindex:
+
+.. _api_fluid_program_guard:
+
+program_guard
+-------------
+
+.. autofunction:: paddle.fluid.program_guard
+ :noindex:
+
+.. _api_fluid_get_var:
+
+get_var
+-------
+
+.. autofunction:: paddle.fluid.get_var
+ :noindex:
+
+.. _api_fluid_Executor:
+
+Executor
+--------
+
+.. autoclass:: paddle.fluid.Executor
+ :members:
+ :noindex:
+
+.. _api_fluid_global_scope:
+
+global_scope
+------------
+
+.. autofunction:: paddle.fluid.global_scope
+ :noindex:
+
+.. _api_fluid_scope_guard:
+
+scope_guard
+-----------
+
+.. autofunction:: paddle.fluid.scope_guard
+ :noindex:
+
+.. _api_fluid__switch_scope:
+
+_switch_scope
+-------------
+
+.. autofunction:: paddle.fluid._switch_scope
+ :noindex:
+
+.. _api_fluid_Trainer:
+
+Trainer
+-------
+
+.. autoclass:: paddle.fluid.Trainer
+ :members:
+ :noindex:
+
+.. _api_fluid_BeginEpochEvent:
+
+BeginEpochEvent
+---------------
+
+.. autoclass:: paddle.fluid.BeginEpochEvent
+ :members:
+ :noindex:
+
+.. _api_fluid_EndEpochEvent:
+
+EndEpochEvent
+-------------
+
+.. autoclass:: paddle.fluid.EndEpochEvent
+ :members:
+ :noindex:
+
+.. _api_fluid_BeginStepEvent:
+
+BeginStepEvent
+--------------
+
+.. autoclass:: paddle.fluid.BeginStepEvent
+ :members:
+ :noindex:
+
+.. _api_fluid_EndStepEvent:
+
+EndStepEvent
+------------
+
+.. autoclass:: paddle.fluid.EndStepEvent
+ :members:
+ :noindex:
+
+.. _api_fluid_CheckpointConfig:
+
+CheckpointConfig
+----------------
+
+.. autoclass:: paddle.fluid.CheckpointConfig
+ :members:
+ :noindex:
+
+.. _api_fluid_Inferencer:
+
+Inferencer
+----------
+
+.. autoclass:: paddle.fluid.Inferencer
+ :members:
+ :noindex:
+
+.. _api_fluid_DistributeTranspiler:
+
+DistributeTranspiler
+--------------------
+
+.. autoclass:: paddle.fluid.DistributeTranspiler
+ :members:
+ :noindex:
+
+.. _api_fluid_InferenceTranspiler:
+
+InferenceTranspiler
+-------------------
+
+.. autoclass:: paddle.fluid.InferenceTranspiler
+ :members:
+ :noindex:
+
+.. _api_fluid_memory_optimize:
+
+memory_optimize
+---------------
+
+.. autofunction:: paddle.fluid.memory_optimize
+ :noindex:
+
+.. _api_fluid_release_memory:
+
+release_memory
+--------------
+
+.. autofunction:: paddle.fluid.release_memory
+ :noindex:
+
+.. _api_fluid_DistributeTranspilerConfig:
+
+DistributeTranspilerConfig
+--------------------------
+
+.. autoclass:: paddle.fluid.DistributeTranspilerConfig
+ :members:
+ :noindex:
+
+.. _api_fluid_ParallelExecutor:
+
+ParallelExecutor
+----------------
+
+.. autoclass:: paddle.fluid.ParallelExecutor
+ :members:
+ :noindex:
+
+.. _api_fluid_ExecutionStrategy:
+
+ExecutionStrategy
+-----------------
+
+.. autoclass:: paddle.fluid.ExecutionStrategy
+ :members:
+ :noindex:
+
+.. _api_fluid_BuildStrategy:
+
+BuildStrategy
+-------------
+
+.. autoclass:: paddle.fluid.BuildStrategy
+ :members:
+ :noindex:
+
+.. _api_fluid_create_lod_tensor:
+
+create_lod_tensor
+-----------------
+
+.. autofunction:: paddle.fluid.create_lod_tensor
+ :noindex:
+
+.. _api_fluid_create_random_int_lodtensor:
+
+create_random_int_lodtensor
+---------------------------
+
+.. autofunction:: paddle.fluid.create_random_int_lodtensor
+ :noindex:
+
+.. _api_fluid_LoDTensor:
+
+LoDTensor
+---------
+
+.. autoclass:: paddle.fluid.LoDTensor
+ :members:
+ :noindex:
+
+.. _api_fluid_LoDTensorArray:
+
+LoDTensorArray
+--------------
+
+.. autoclass:: paddle.fluid.LoDTensorArray
+ :members:
+ :noindex:
+
+.. _api_fluid_CPUPlace:
+
+CPUPlace
+--------
+
+.. autoclass:: paddle.fluid.CPUPlace
+ :members:
+ :noindex:
+
+.. _api_fluid_CUDAPlace:
+
+CUDAPlace
+---------
+
+.. autoclass:: paddle.fluid.CUDAPlace
+ :members:
+ :noindex:
+
+.. _api_fluid_CUDAPinnedPlace:
+
+CUDAPinnedPlace
+---------------
+
+.. autoclass:: paddle.fluid.CUDAPinnedPlace
+ :members:
+ :noindex:
+
+.. _api_fluid_Tensor:
+
+Tensor
+------
+
+.. autoclass:: paddle.fluid.Tensor
+ :members:
+ :noindex:
+
+.. _api_fluid_ParamAttr:
+
+ParamAttr
+---------
+
+.. autoclass:: paddle.fluid.ParamAttr
+ :members:
+ :noindex:
+
+.. _api_fluid_WeightNormParamAttr:
+
+WeightNormParamAttr
+-------------------
+
+.. autoclass:: paddle.fluid.WeightNormParamAttr
+ :members:
+ :noindex:
+
+.. _api_fluid_DataFeeder:
+
+DataFeeder
+----------
+
+.. autoclass:: paddle.fluid.DataFeeder
+ :members:
+ :noindex:
+
+.. _api_fluid_Scope:
+
+Scope
+-----
+
+.. autoclass:: paddle.fluid.Scope
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/gen_doc.py b/doc/fluid/api/gen_doc.py
new file mode 100644
index 0000000000000000000000000000000000000000..02efce2bf8392c62a7600c272bedcadc6563f927
--- /dev/null
+++ b/doc/fluid/api/gen_doc.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import argparse
+import sys
+import types
+
+import paddle.fluid as fluid
+
+
+def parse_arg():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--submodules', nargs="*")
+ parser.add_argument(
+ 'module', type=str, help='Generate the documentation of which module')
+ return parser.parse_args()
+
+
+class DocGenerator(object):
+ def __init__(self, module_name=None, stream=sys.stdout):
+ if module_name == "":
+ module_name = None
+ self.stream = stream
+ if module_name is None:
+ self.module_name = "fluid"
+ else:
+ self.module_name = "fluid." + module_name
+ if module_name is None:
+ self.module = fluid
+ else:
+ if not hasattr(fluid, module_name):
+ raise ValueError("Cannot find fluid.{0}".format(module_name))
+ else:
+ self.module = getattr(fluid, module_name)
+ self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+''')
+
+ self._print_header_(self.module_name, dot='=', is_title=True)
+
+ def print_submodule(self, submodule_name):
+ submodule = getattr(self.module, submodule_name)
+ if submodule is None:
+ raise ValueError("Cannot find submodule {0}".format(submodule_name))
+ self.print_section(submodule_name)
+
+ for item in submodule.__all__:
+ self.print_item(item)
+
+ def print_current_module(self):
+ for item in self.module.__all__:
+ self.print_item(item)
+
+ def print_section(self, name):
+ self._print_header_(name, dot='=', is_title=False)
+
+ def print_item(self, name):
+ item = getattr(self.module, name, None)
+ if item is None:
+ return
+ if isinstance(item, types.TypeType):
+ self.print_class(name)
+ elif isinstance(item, types.FunctionType):
+ self.print_method(name)
+ else:
+ pass
+
+ def print_class(self, name):
+ self._print_ref_(name)
+ self._print_header_(name, dot='-', is_title=False)
+ self.stream.write('''.. autoclass:: paddle.{0}.{1}
+ :members:
+ :noindex:
+
+'''.format(self.module_name, name))
+
+ def print_method(self, name):
+ self._print_ref_(name)
+ self._print_header_(name, dot='-', is_title=False)
+ self.stream.write('''.. autofunction:: paddle.{0}.{1}
+ :noindex:
+
+'''.format(self.module_name, name))
+
+ def _print_header_(self, name, dot, is_title):
+ dot_line = dot * len(name)
+ if is_title:
+ self.stream.write(dot_line)
+ self.stream.write('\n')
+ self.stream.write(name)
+ self.stream.write('\n')
+ self.stream.write(dot_line)
+ self.stream.write('\n')
+ self.stream.write('\n')
+
+ def _print_ref_(self, name):
+ self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join(
+ self.module_name.split(".")), name))
+
+
+def main():
+ args = parse_arg()
+ gen = DocGenerator(args.module)
+ if args.submodules is None:
+ gen.print_current_module()
+ else:
+ for submodule_name in args.submodules:
+ gen.print_submodule(submodule_name)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/source/api_reference/gen_doc.sh b/doc/fluid/api/gen_doc.sh
similarity index 100%
rename from source/api_reference/gen_doc.sh
rename to doc/fluid/api/gen_doc.sh
diff --git a/source/api_reference/index.rst b/doc/fluid/api/index_en.rst
similarity index 100%
rename from source/api_reference/index.rst
rename to doc/fluid/api/index_en.rst
diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dc0b52b14fd242dfaded1cb9a8e0ab9eb66b0607
--- /dev/null
+++ b/doc/fluid/api/initializer.rst
@@ -0,0 +1,131 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+=================
+fluid.initializer
+=================
+
+.. _api_fluid_initializer_Constant:
+
+Constant
+--------
+
+.. autoclass:: paddle.fluid.initializer.Constant
+ :members:
+ :noindex:
+
+.. _api_fluid_initializer_Uniform:
+
+Uniform
+-------
+
+.. autoclass:: paddle.fluid.initializer.Uniform
+ :members:
+ :noindex:
+
+.. _api_fluid_initializer_Normal:
+
+Normal
+------
+
+.. autoclass:: paddle.fluid.initializer.Normal
+ :members:
+ :noindex:
+
+.. _api_fluid_initializer_Xavier:
+
+Xavier
+------
+
+.. autoclass:: paddle.fluid.initializer.Xavier
+ :members:
+ :noindex:
+
+.. _api_fluid_initializer_Bilinear:
+
+Bilinear
+--------
+
+.. autoclass:: paddle.fluid.initializer.Bilinear
+ :members:
+ :noindex:
+
+.. _api_fluid_initializer_MSRA:
+
+MSRA
+----
+
+.. autoclass:: paddle.fluid.initializer.MSRA
+ :members:
+ :noindex:
+
+.. _api_fluid_initializer_force_init_on_cpu:
+
+force_init_on_cpu
+-----------------
+
+.. autofunction:: paddle.fluid.initializer.force_init_on_cpu
+ :noindex:
+
+.. _api_fluid_initializer_init_on_cpu:
+
+init_on_cpu
+-----------
+
+.. autofunction:: paddle.fluid.initializer.init_on_cpu
+ :noindex:
+
+.. _api_fluid_initializer_ConstantInitializer:
+
+ConstantInitializer
+-------------------
+
+.. autoclass:: paddle.fluid.initializer.ConstantInitializer
+ :members:
+ :noindex:
+
+.. _api_fluid_initializer_UniformInitializer:
+
+UniformInitializer
+------------------
+
+.. autoclass:: paddle.fluid.initializer.UniformInitializer
+ :members:
+ :noindex:
+
+.. _api_fluid_initializer_NormalInitializer:
+
+NormalInitializer
+-----------------
+
+.. autoclass:: paddle.fluid.initializer.NormalInitializer
+ :members:
+ :noindex:
+
+.. _api_fluid_initializer_XavierInitializer:
+
+XavierInitializer
+-----------------
+
+.. autoclass:: paddle.fluid.initializer.XavierInitializer
+ :members:
+ :noindex:
+
+.. _api_fluid_initializer_BilinearInitializer:
+
+BilinearInitializer
+-------------------
+
+.. autoclass:: paddle.fluid.initializer.BilinearInitializer
+ :members:
+ :noindex:
+
+.. _api_fluid_initializer_MSRAInitializer:
+
+MSRAInitializer
+---------------
+
+.. autoclass:: paddle.fluid.initializer.MSRAInitializer
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a31930af8552a0fea51235f5e44d39e44d42d7f9
--- /dev/null
+++ b/doc/fluid/api/io.rst
@@ -0,0 +1,79 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+========
+fluid.io
+========
+
+.. _api_fluid_io_save_vars:
+
+save_vars
+---------
+
+.. autofunction:: paddle.fluid.io.save_vars
+ :noindex:
+
+.. _api_fluid_io_save_params:
+
+save_params
+-----------
+
+.. autofunction:: paddle.fluid.io.save_params
+ :noindex:
+
+.. _api_fluid_io_save_persistables:
+
+save_persistables
+-----------------
+
+.. autofunction:: paddle.fluid.io.save_persistables
+ :noindex:
+
+.. _api_fluid_io_load_vars:
+
+load_vars
+---------
+
+.. autofunction:: paddle.fluid.io.load_vars
+ :noindex:
+
+.. _api_fluid_io_load_params:
+
+load_params
+-----------
+
+.. autofunction:: paddle.fluid.io.load_params
+ :noindex:
+
+.. _api_fluid_io_load_persistables:
+
+load_persistables
+-----------------
+
+.. autofunction:: paddle.fluid.io.load_persistables
+ :noindex:
+
+.. _api_fluid_io_save_inference_model:
+
+save_inference_model
+--------------------
+
+.. autofunction:: paddle.fluid.io.save_inference_model
+ :noindex:
+
+.. _api_fluid_io_load_inference_model:
+
+load_inference_model
+--------------------
+
+.. autofunction:: paddle.fluid.io.load_inference_model
+ :noindex:
+
+.. _api_fluid_io_get_inference_program:
+
+get_inference_program
+---------------------
+
+.. autofunction:: paddle.fluid.io.get_inference_program
+ :noindex:
+
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed0777c6ff82d58e174c12c0c6bc1c716b6e7a59
--- /dev/null
+++ b/doc/fluid/api/layers.rst
@@ -0,0 +1,1700 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+============
+fluid.layers
+============
+
+control_flow
+============
+
+.. _api_fluid_layers_While:
+
+While
+-----
+
+.. autoclass:: paddle.fluid.layers.While
+ :members:
+ :noindex:
+
+.. _api_fluid_layers_Switch:
+
+Switch
+------
+
+.. autoclass:: paddle.fluid.layers.Switch
+ :members:
+ :noindex:
+
+.. _api_fluid_layers_increment:
+
+increment
+---------
+
+.. autofunction:: paddle.fluid.layers.increment
+ :noindex:
+
+.. _api_fluid_layers_array_write:
+
+array_write
+-----------
+
+.. autofunction:: paddle.fluid.layers.array_write
+ :noindex:
+
+.. _api_fluid_layers_create_array:
+
+create_array
+------------
+
+.. autofunction:: paddle.fluid.layers.create_array
+ :noindex:
+
+.. _api_fluid_layers_less_than:
+
+less_than
+---------
+
+.. autofunction:: paddle.fluid.layers.less_than
+ :noindex:
+
+.. _api_fluid_layers_equal:
+
+equal
+-----
+
+.. autofunction:: paddle.fluid.layers.equal
+ :noindex:
+
+.. _api_fluid_layers_array_read:
+
+array_read
+----------
+
+.. autofunction:: paddle.fluid.layers.array_read
+ :noindex:
+
+.. _api_fluid_layers_array_length:
+
+array_length
+------------
+
+.. autofunction:: paddle.fluid.layers.array_length
+ :noindex:
+
+.. _api_fluid_layers_IfElse:
+
+IfElse
+------
+
+.. autoclass:: paddle.fluid.layers.IfElse
+ :members:
+ :noindex:
+
+.. _api_fluid_layers_DynamicRNN:
+
+DynamicRNN
+----------
+
+.. autoclass:: paddle.fluid.layers.DynamicRNN
+ :members:
+ :noindex:
+
+.. _api_fluid_layers_StaticRNN:
+
+StaticRNN
+---------
+
+.. autoclass:: paddle.fluid.layers.StaticRNN
+ :members:
+ :noindex:
+
+.. _api_fluid_layers_reorder_lod_tensor_by_rank:
+
+reorder_lod_tensor_by_rank
+--------------------------
+
+.. autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank
+ :noindex:
+
+.. _api_fluid_layers_ParallelDo:
+
+ParallelDo
+----------
+
+.. autoclass:: paddle.fluid.layers.ParallelDo
+ :members:
+ :noindex:
+
+.. _api_fluid_layers_Print:
+
+Print
+-----
+
+.. autofunction:: paddle.fluid.layers.Print
+ :noindex:
+
+.. _api_fluid_layers_is_empty:
+
+is_empty
+--------
+
+.. autofunction:: paddle.fluid.layers.is_empty
+ :noindex:
+
+device
+======
+
+io
+==
+
+.. _api_fluid_layers_data:
+
+data
+----
+
+.. autofunction:: paddle.fluid.layers.data
+ :noindex:
+
+.. _api_fluid_layers_open_recordio_file:
+
+open_recordio_file
+------------------
+
+.. autofunction:: paddle.fluid.layers.open_recordio_file
+ :noindex:
+
+.. _api_fluid_layers_open_files:
+
+open_files
+----------
+
+.. autofunction:: paddle.fluid.layers.open_files
+ :noindex:
+
+.. _api_fluid_layers_read_file:
+
+read_file
+---------
+
+.. autofunction:: paddle.fluid.layers.read_file
+ :noindex:
+
+.. _api_fluid_layers_shuffle:
+
+shuffle
+-------
+
+.. autofunction:: paddle.fluid.layers.shuffle
+ :noindex:
+
+.. _api_fluid_layers_batch:
+
+batch
+-----
+
+.. autofunction:: paddle.fluid.layers.batch
+ :noindex:
+
+.. _api_fluid_layers_double_buffer:
+
+double_buffer
+-------------
+
+.. autofunction:: paddle.fluid.layers.double_buffer
+ :noindex:
+
+.. _api_fluid_layers_random_data_generator:
+
+random_data_generator
+---------------------
+
+.. autofunction:: paddle.fluid.layers.random_data_generator
+ :noindex:
+
+.. _api_fluid_layers_py_reader:
+
+py_reader
+---------
+
+.. autofunction:: paddle.fluid.layers.py_reader
+ :noindex:
+
+.. _api_fluid_layers_Preprocessor:
+
+Preprocessor
+------------
+
+.. autoclass:: paddle.fluid.layers.Preprocessor
+ :members:
+ :noindex:
+
+.. _api_fluid_layers_load:
+
+load
+----
+
+.. autofunction:: paddle.fluid.layers.load
+ :noindex:
+
+nn
+==
+
+.. _api_fluid_layers_fc:
+
+fc
+--
+
+.. autofunction:: paddle.fluid.layers.fc
+ :noindex:
+
+.. _api_fluid_layers_embedding:
+
+embedding
+---------
+
+.. autofunction:: paddle.fluid.layers.embedding
+ :noindex:
+
+.. _api_fluid_layers_dynamic_lstm:
+
+dynamic_lstm
+------------
+
+.. autofunction:: paddle.fluid.layers.dynamic_lstm
+ :noindex:
+
+.. _api_fluid_layers_dynamic_lstmp:
+
+dynamic_lstmp
+-------------
+
+.. autofunction:: paddle.fluid.layers.dynamic_lstmp
+ :noindex:
+
+.. _api_fluid_layers_dynamic_gru:
+
+dynamic_gru
+-----------
+
+.. autofunction:: paddle.fluid.layers.dynamic_gru
+ :noindex:
+
+.. _api_fluid_layers_gru_unit:
+
+gru_unit
+--------
+
+.. autofunction:: paddle.fluid.layers.gru_unit
+ :noindex:
+
+.. _api_fluid_layers_linear_chain_crf:
+
+linear_chain_crf
+----------------
+
+.. autofunction:: paddle.fluid.layers.linear_chain_crf
+ :noindex:
+
+.. _api_fluid_layers_crf_decoding:
+
+crf_decoding
+------------
+
+.. autofunction:: paddle.fluid.layers.crf_decoding
+ :noindex:
+
+.. _api_fluid_layers_cos_sim:
+
+cos_sim
+-------
+
+.. autofunction:: paddle.fluid.layers.cos_sim
+ :noindex:
+
+.. _api_fluid_layers_cross_entropy:
+
+cross_entropy
+-------------
+
+.. autofunction:: paddle.fluid.layers.cross_entropy
+ :noindex:
+
+.. _api_fluid_layers_square_error_cost:
+
+square_error_cost
+-----------------
+
+.. autofunction:: paddle.fluid.layers.square_error_cost
+ :noindex:
+
+.. _api_fluid_layers_chunk_eval:
+
+chunk_eval
+----------
+
+.. autofunction:: paddle.fluid.layers.chunk_eval
+ :noindex:
+
+.. _api_fluid_layers_sequence_conv:
+
+sequence_conv
+-------------
+
+.. autofunction:: paddle.fluid.layers.sequence_conv
+ :noindex:
+
+.. _api_fluid_layers_conv2d:
+
+conv2d
+------
+
+.. autofunction:: paddle.fluid.layers.conv2d
+ :noindex:
+
+.. _api_fluid_layers_conv3d:
+
+conv3d
+------
+
+.. autofunction:: paddle.fluid.layers.conv3d
+ :noindex:
+
+.. _api_fluid_layers_sequence_pool:
+
+sequence_pool
+-------------
+
+.. autofunction:: paddle.fluid.layers.sequence_pool
+ :noindex:
+
+.. _api_fluid_layers_sequence_softmax:
+
+sequence_softmax
+----------------
+
+.. autofunction:: paddle.fluid.layers.sequence_softmax
+ :noindex:
+
+.. _api_fluid_layers_softmax:
+
+softmax
+-------
+
+.. autofunction:: paddle.fluid.layers.softmax
+ :noindex:
+
+.. _api_fluid_layers_pool2d:
+
+pool2d
+------
+
+.. autofunction:: paddle.fluid.layers.pool2d
+ :noindex:
+
+.. _api_fluid_layers_pool3d:
+
+pool3d
+------
+
+.. autofunction:: paddle.fluid.layers.pool3d
+ :noindex:
+
+.. _api_fluid_layers_batch_norm:
+
+batch_norm
+----------
+
+.. autofunction:: paddle.fluid.layers.batch_norm
+ :noindex:
+
+.. _api_fluid_layers_beam_search_decode:
+
+beam_search_decode
+------------------
+
+.. autofunction:: paddle.fluid.layers.beam_search_decode
+ :noindex:
+
+.. _api_fluid_layers_conv2d_transpose:
+
+conv2d_transpose
+----------------
+
+.. autofunction:: paddle.fluid.layers.conv2d_transpose
+ :noindex:
+
+.. _api_fluid_layers_conv3d_transpose:
+
+conv3d_transpose
+----------------
+
+.. autofunction:: paddle.fluid.layers.conv3d_transpose
+ :noindex:
+
+.. _api_fluid_layers_sequence_expand:
+
+sequence_expand
+---------------
+
+.. autofunction:: paddle.fluid.layers.sequence_expand
+ :noindex:
+
+.. _api_fluid_layers_sequence_pad:
+
+sequence_pad
+------------
+
+.. autofunction:: paddle.fluid.layers.sequence_pad
+ :noindex:
+
+.. _api_fluid_layers_lstm_unit:
+
+lstm_unit
+---------
+
+.. autofunction:: paddle.fluid.layers.lstm_unit
+ :noindex:
+
+.. _api_fluid_layers_reduce_sum:
+
+reduce_sum
+----------
+
+.. autofunction:: paddle.fluid.layers.reduce_sum
+ :noindex:
+
+.. _api_fluid_layers_reduce_mean:
+
+reduce_mean
+-----------
+
+.. autofunction:: paddle.fluid.layers.reduce_mean
+ :noindex:
+
+.. _api_fluid_layers_reduce_max:
+
+reduce_max
+----------
+
+.. autofunction:: paddle.fluid.layers.reduce_max
+ :noindex:
+
+.. _api_fluid_layers_reduce_min:
+
+reduce_min
+----------
+
+.. autofunction:: paddle.fluid.layers.reduce_min
+ :noindex:
+
+.. _api_fluid_layers_reduce_prod:
+
+reduce_prod
+-----------
+
+.. autofunction:: paddle.fluid.layers.reduce_prod
+ :noindex:
+
+.. _api_fluid_layers_sequence_first_step:
+
+sequence_first_step
+-------------------
+
+.. autofunction:: paddle.fluid.layers.sequence_first_step
+ :noindex:
+
+.. _api_fluid_layers_sequence_last_step:
+
+sequence_last_step
+------------------
+
+.. autofunction:: paddle.fluid.layers.sequence_last_step
+ :noindex:
+
+.. _api_fluid_layers_dropout:
+
+dropout
+-------
+
+.. autofunction:: paddle.fluid.layers.dropout
+ :noindex:
+
+.. _api_fluid_layers_split:
+
+split
+-----
+
+.. autofunction:: paddle.fluid.layers.split
+ :noindex:
+
+.. _api_fluid_layers_ctc_greedy_decoder:
+
+ctc_greedy_decoder
+------------------
+
+.. autofunction:: paddle.fluid.layers.ctc_greedy_decoder
+ :noindex:
+
+.. _api_fluid_layers_edit_distance:
+
+edit_distance
+-------------
+
+.. autofunction:: paddle.fluid.layers.edit_distance
+ :noindex:
+
+.. _api_fluid_layers_l2_normalize:
+
+l2_normalize
+------------
+
+.. autofunction:: paddle.fluid.layers.l2_normalize
+ :noindex:
+
+.. _api_fluid_layers_matmul:
+
+matmul
+------
+
+.. autofunction:: paddle.fluid.layers.matmul
+ :noindex:
+
+.. _api_fluid_layers_topk:
+
+topk
+----
+
+.. autofunction:: paddle.fluid.layers.topk
+ :noindex:
+
+.. _api_fluid_layers_warpctc:
+
+warpctc
+-------
+
+.. autofunction:: paddle.fluid.layers.warpctc
+ :noindex:
+
+.. _api_fluid_layers_sequence_reshape:
+
+sequence_reshape
+----------------
+
+.. autofunction:: paddle.fluid.layers.sequence_reshape
+ :noindex:
+
+.. _api_fluid_layers_transpose:
+
+transpose
+---------
+
+.. autofunction:: paddle.fluid.layers.transpose
+ :noindex:
+
+.. _api_fluid_layers_im2sequence:
+
+im2sequence
+-----------
+
+.. autofunction:: paddle.fluid.layers.im2sequence
+ :noindex:
+
+.. _api_fluid_layers_nce:
+
+nce
+---
+
+.. autofunction:: paddle.fluid.layers.nce
+ :noindex:
+
+.. _api_fluid_layers_hsigmoid:
+
+hsigmoid
+--------
+
+.. autofunction:: paddle.fluid.layers.hsigmoid
+ :noindex:
+
+.. _api_fluid_layers_beam_search:
+
+beam_search
+-----------
+
+.. autofunction:: paddle.fluid.layers.beam_search
+ :noindex:
+
+.. _api_fluid_layers_row_conv:
+
+row_conv
+--------
+
+.. autofunction:: paddle.fluid.layers.row_conv
+ :noindex:
+
+.. _api_fluid_layers_multiplex:
+
+multiplex
+---------
+
+.. autofunction:: paddle.fluid.layers.multiplex
+ :noindex:
+
+.. _api_fluid_layers_layer_norm:
+
+layer_norm
+----------
+
+.. autofunction:: paddle.fluid.layers.layer_norm
+ :noindex:
+
+.. _api_fluid_layers_softmax_with_cross_entropy:
+
+softmax_with_cross_entropy
+--------------------------
+
+.. autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
+ :noindex:
+
+.. _api_fluid_layers_smooth_l1:
+
+smooth_l1
+---------
+
+.. autofunction:: paddle.fluid.layers.smooth_l1
+ :noindex:
+
+.. _api_fluid_layers_one_hot:
+
+one_hot
+-------
+
+.. autofunction:: paddle.fluid.layers.one_hot
+ :noindex:
+
+.. _api_fluid_layers_autoincreased_step_counter:
+
+autoincreased_step_counter
+--------------------------
+
+.. autofunction:: paddle.fluid.layers.autoincreased_step_counter
+ :noindex:
+
+.. _api_fluid_layers_reshape:
+
+reshape
+-------
+
+.. autofunction:: paddle.fluid.layers.reshape
+ :noindex:
+
+.. _api_fluid_layers_lod_reset:
+
+lod_reset
+---------
+
+.. autofunction:: paddle.fluid.layers.lod_reset
+ :noindex:
+
+.. _api_fluid_layers_lrn:
+
+lrn
+---
+
+.. autofunction:: paddle.fluid.layers.lrn
+ :noindex:
+
+.. _api_fluid_layers_pad:
+
+pad
+---
+
+.. autofunction:: paddle.fluid.layers.pad
+ :noindex:
+
+.. _api_fluid_layers_label_smooth:
+
+label_smooth
+------------
+
+.. autofunction:: paddle.fluid.layers.label_smooth
+ :noindex:
+
+.. _api_fluid_layers_roi_pool:
+
+roi_pool
+--------
+
+.. autofunction:: paddle.fluid.layers.roi_pool
+ :noindex:
+
+.. _api_fluid_layers_dice_loss:
+
+dice_loss
+---------
+
+.. autofunction:: paddle.fluid.layers.dice_loss
+ :noindex:
+
+.. _api_fluid_layers_image_resize:
+
+image_resize
+------------
+
+.. autofunction:: paddle.fluid.layers.image_resize
+ :noindex:
+
+.. _api_fluid_layers_image_resize_short:
+
+image_resize_short
+------------------
+
+.. autofunction:: paddle.fluid.layers.image_resize_short
+ :noindex:
+
+.. _api_fluid_layers_resize_bilinear:
+
+resize_bilinear
+---------------
+
+.. autofunction:: paddle.fluid.layers.resize_bilinear
+ :noindex:
+
+.. _api_fluid_layers_gather:
+
+gather
+------
+
+.. autofunction:: paddle.fluid.layers.gather
+ :noindex:
+
+.. _api_fluid_layers_random_crop:
+
+random_crop
+-----------
+
+.. autofunction:: paddle.fluid.layers.random_crop
+ :noindex:
+
+.. _api_fluid_layers_mean_iou:
+
+mean_iou
+--------
+
+.. autofunction:: paddle.fluid.layers.mean_iou
+ :noindex:
+
+.. _api_fluid_layers_relu:
+
+relu
+----
+
+.. autofunction:: paddle.fluid.layers.relu
+ :noindex:
+
+.. _api_fluid_layers_log:
+
+log
+---
+
+.. autofunction:: paddle.fluid.layers.log
+ :noindex:
+
+.. _api_fluid_layers_crop:
+
+crop
+----
+
+.. autofunction:: paddle.fluid.layers.crop
+ :noindex:
+
+.. _api_fluid_layers_rank_loss:
+
+rank_loss
+---------
+
+.. autofunction:: paddle.fluid.layers.rank_loss
+ :noindex:
+
+.. _api_fluid_layers_prelu:
+
+prelu
+-----
+
+.. autofunction:: paddle.fluid.layers.prelu
+ :noindex:
+
+.. _api_fluid_layers_flatten:
+
+flatten
+-------
+
+.. autofunction:: paddle.fluid.layers.flatten
+ :noindex:
+
+ops
+===
+
+.. _api_fluid_layers_mean:
+
+mean
+----
+
+.. autofunction:: paddle.fluid.layers.mean
+ :noindex:
+
+.. _api_fluid_layers_mul:
+
+mul
+---
+
+.. autofunction:: paddle.fluid.layers.mul
+ :noindex:
+
+.. _api_fluid_layers_scale:
+
+scale
+-----
+
+.. autofunction:: paddle.fluid.layers.scale
+ :noindex:
+
+.. _api_fluid_layers_sigmoid_cross_entropy_with_logits:
+
+sigmoid_cross_entropy_with_logits
+---------------------------------
+
+.. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
+ :noindex:
+
+.. _api_fluid_layers_elementwise_add:
+
+elementwise_add
+---------------
+
+.. autofunction:: paddle.fluid.layers.elementwise_add
+ :noindex:
+
+.. _api_fluid_layers_elementwise_div:
+
+elementwise_div
+---------------
+
+.. autofunction:: paddle.fluid.layers.elementwise_div
+ :noindex:
+
+.. _api_fluid_layers_elementwise_sub:
+
+elementwise_sub
+---------------
+
+.. autofunction:: paddle.fluid.layers.elementwise_sub
+ :noindex:
+
+.. _api_fluid_layers_elementwise_mul:
+
+elementwise_mul
+---------------
+
+.. autofunction:: paddle.fluid.layers.elementwise_mul
+ :noindex:
+
+.. _api_fluid_layers_elementwise_max:
+
+elementwise_max
+---------------
+
+.. autofunction:: paddle.fluid.layers.elementwise_max
+ :noindex:
+
+.. _api_fluid_layers_elementwise_min:
+
+elementwise_min
+---------------
+
+.. autofunction:: paddle.fluid.layers.elementwise_min
+ :noindex:
+
+.. _api_fluid_layers_elementwise_pow:
+
+elementwise_pow
+---------------
+
+.. autofunction:: paddle.fluid.layers.elementwise_pow
+ :noindex:
+
+.. _api_fluid_layers_clip:
+
+clip
+----
+
+.. autofunction:: paddle.fluid.layers.clip
+ :noindex:
+
+.. _api_fluid_layers_clip_by_norm:
+
+clip_by_norm
+------------
+
+.. autofunction:: paddle.fluid.layers.clip_by_norm
+ :noindex:
+
+.. _api_fluid_layers_logical_and:
+
+logical_and
+-----------
+
+.. autofunction:: paddle.fluid.layers.logical_and
+ :noindex:
+
+.. _api_fluid_layers_logical_or:
+
+logical_or
+----------
+
+.. autofunction:: paddle.fluid.layers.logical_or
+ :noindex:
+
+.. _api_fluid_layers_logical_xor:
+
+logical_xor
+-----------
+
+.. autofunction:: paddle.fluid.layers.logical_xor
+ :noindex:
+
+.. _api_fluid_layers_logical_not:
+
+logical_not
+-----------
+
+.. autofunction:: paddle.fluid.layers.logical_not
+ :noindex:
+
+.. _api_fluid_layers_uniform_random_batch_size_like:
+
+uniform_random_batch_size_like
+------------------------------
+
+.. autofunction:: paddle.fluid.layers.uniform_random_batch_size_like
+ :noindex:
+
+.. _api_fluid_layers_gaussian_random:
+
+gaussian_random
+---------------
+
+.. autofunction:: paddle.fluid.layers.gaussian_random
+ :noindex:
+
+.. _api_fluid_layers_gaussian_random_batch_size_like:
+
+gaussian_random_batch_size_like
+-------------------------------
+
+.. autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
+ :noindex:
+
+.. _api_fluid_layers_scatter:
+
+scatter
+-------
+
+.. autofunction:: paddle.fluid.layers.scatter
+ :noindex:
+
+.. _api_fluid_layers_sum:
+
+sum
+---
+
+.. autofunction:: paddle.fluid.layers.sum
+ :noindex:
+
+.. _api_fluid_layers_slice:
+
+slice
+-----
+
+.. autofunction:: paddle.fluid.layers.slice
+ :noindex:
+
+.. _api_fluid_layers_shape:
+
+shape
+-----
+
+.. autofunction:: paddle.fluid.layers.shape
+ :noindex:
+
+.. _api_fluid_layers_maxout:
+
+maxout
+------
+
+.. autofunction:: paddle.fluid.layers.maxout
+ :noindex:
+
+.. _api_fluid_layers_sigmoid:
+
+sigmoid
+-------
+
+.. autofunction:: paddle.fluid.layers.sigmoid
+ :noindex:
+
+.. _api_fluid_layers_logsigmoid:
+
+logsigmoid
+----------
+
+.. autofunction:: paddle.fluid.layers.logsigmoid
+ :noindex:
+
+.. _api_fluid_layers_exp:
+
+exp
+---
+
+.. autofunction:: paddle.fluid.layers.exp
+ :noindex:
+
+.. _api_fluid_layers_tanh:
+
+tanh
+----
+
+.. autofunction:: paddle.fluid.layers.tanh
+ :noindex:
+
+.. _api_fluid_layers_tanh_shrink:
+
+tanh_shrink
+-----------
+
+.. autofunction:: paddle.fluid.layers.tanh_shrink
+ :noindex:
+
+.. _api_fluid_layers_softshrink:
+
+softshrink
+----------
+
+.. autofunction:: paddle.fluid.layers.softshrink
+ :noindex:
+
+.. _api_fluid_layers_sqrt:
+
+sqrt
+----
+
+.. autofunction:: paddle.fluid.layers.sqrt
+ :noindex:
+
+.. _api_fluid_layers_abs:
+
+abs
+---
+
+.. autofunction:: paddle.fluid.layers.abs
+ :noindex:
+
+.. _api_fluid_layers_ceil:
+
+ceil
+----
+
+.. autofunction:: paddle.fluid.layers.ceil
+ :noindex:
+
+.. _api_fluid_layers_floor:
+
+floor
+-----
+
+.. autofunction:: paddle.fluid.layers.floor
+ :noindex:
+
+.. _api_fluid_layers_cos:
+
+cos
+---
+
+.. autofunction:: paddle.fluid.layers.cos
+ :noindex:
+
+.. _api_fluid_layers_sin:
+
+sin
+---
+
+.. autofunction:: paddle.fluid.layers.sin
+ :noindex:
+
+.. _api_fluid_layers_round:
+
+round
+-----
+
+.. autofunction:: paddle.fluid.layers.round
+ :noindex:
+
+.. _api_fluid_layers_reciprocal:
+
+reciprocal
+----------
+
+.. autofunction:: paddle.fluid.layers.reciprocal
+ :noindex:
+
+.. _api_fluid_layers_square:
+
+square
+------
+
+.. autofunction:: paddle.fluid.layers.square
+ :noindex:
+
+.. _api_fluid_layers_softplus:
+
+softplus
+--------
+
+.. autofunction:: paddle.fluid.layers.softplus
+ :noindex:
+
+.. _api_fluid_layers_softsign:
+
+softsign
+--------
+
+.. autofunction:: paddle.fluid.layers.softsign
+ :noindex:
+
+.. _api_fluid_layers_brelu:
+
+brelu
+-----
+
+.. autofunction:: paddle.fluid.layers.brelu
+ :noindex:
+
+.. _api_fluid_layers_leaky_relu:
+
+leaky_relu
+----------
+
+.. autofunction:: paddle.fluid.layers.leaky_relu
+ :noindex:
+
+.. _api_fluid_layers_soft_relu:
+
+soft_relu
+---------
+
+.. autofunction:: paddle.fluid.layers.soft_relu
+ :noindex:
+
+.. _api_fluid_layers_elu:
+
+elu
+---
+
+.. autofunction:: paddle.fluid.layers.elu
+ :noindex:
+
+.. _api_fluid_layers_relu6:
+
+relu6
+-----
+
+.. autofunction:: paddle.fluid.layers.relu6
+ :noindex:
+
+.. _api_fluid_layers_pow:
+
+pow
+---
+
+.. autofunction:: paddle.fluid.layers.pow
+ :noindex:
+
+.. _api_fluid_layers_stanh:
+
+stanh
+-----
+
+.. autofunction:: paddle.fluid.layers.stanh
+ :noindex:
+
+.. _api_fluid_layers_hard_sigmoid:
+
+hard_sigmoid
+------------
+
+.. autofunction:: paddle.fluid.layers.hard_sigmoid
+ :noindex:
+
+.. _api_fluid_layers_swish:
+
+swish
+-----
+
+.. autofunction:: paddle.fluid.layers.swish
+ :noindex:
+
+.. _api_fluid_layers_uniform_random:
+
+uniform_random
+--------------
+
+.. autofunction:: paddle.fluid.layers.uniform_random
+ :noindex:
+
+.. _api_fluid_layers_hard_shrink:
+
+hard_shrink
+-----------
+
+.. autofunction:: paddle.fluid.layers.hard_shrink
+ :noindex:
+
+.. _api_fluid_layers_cumsum:
+
+cumsum
+------
+
+.. autofunction:: paddle.fluid.layers.cumsum
+ :noindex:
+
+.. _api_fluid_layers_thresholded_relu:
+
+thresholded_relu
+----------------
+
+.. autofunction:: paddle.fluid.layers.thresholded_relu
+ :noindex:
+
+tensor
+======
+
+.. _api_fluid_layers_create_tensor:
+
+create_tensor
+-------------
+
+.. autofunction:: paddle.fluid.layers.create_tensor
+ :noindex:
+
+.. _api_fluid_layers_create_parameter:
+
+create_parameter
+----------------
+
+.. autofunction:: paddle.fluid.layers.create_parameter
+ :noindex:
+
+.. _api_fluid_layers_create_global_var:
+
+create_global_var
+-----------------
+
+.. autofunction:: paddle.fluid.layers.create_global_var
+ :noindex:
+
+.. _api_fluid_layers_cast:
+
+cast
+----
+
+.. autofunction:: paddle.fluid.layers.cast
+ :noindex:
+
+.. _api_fluid_layers_concat:
+
+concat
+------
+
+.. autofunction:: paddle.fluid.layers.concat
+ :noindex:
+
+.. _api_fluid_layers_sums:
+
+sums
+----
+
+.. autofunction:: paddle.fluid.layers.sums
+ :noindex:
+
+.. _api_fluid_layers_assign:
+
+assign
+------
+
+.. autofunction:: paddle.fluid.layers.assign
+ :noindex:
+
+.. _api_fluid_layers_fill_constant_batch_size_like:
+
+fill_constant_batch_size_like
+-----------------------------
+
+.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
+ :noindex:
+
+.. _api_fluid_layers_fill_constant:
+
+fill_constant
+-------------
+
+.. autofunction:: paddle.fluid.layers.fill_constant
+ :noindex:
+
+.. _api_fluid_layers_argmin:
+
+argmin
+------
+
+.. autofunction:: paddle.fluid.layers.argmin
+ :noindex:
+
+.. _api_fluid_layers_argmax:
+
+argmax
+------
+
+.. autofunction:: paddle.fluid.layers.argmax
+ :noindex:
+
+.. _api_fluid_layers_argsort:
+
+argsort
+-------
+
+.. autofunction:: paddle.fluid.layers.argsort
+ :noindex:
+
+.. _api_fluid_layers_ones:
+
+ones
+----
+
+.. autofunction:: paddle.fluid.layers.ones
+ :noindex:
+
+.. _api_fluid_layers_zeros:
+
+zeros
+-----
+
+.. autofunction:: paddle.fluid.layers.zeros
+ :noindex:
+
+.. _api_fluid_layers_reverse:
+
+reverse
+-------
+
+.. autofunction:: paddle.fluid.layers.reverse
+ :noindex:
+
+learning_rate_scheduler
+=======================
+
+.. _api_fluid_layers_exponential_decay:
+
+exponential_decay
+-----------------
+
+.. autofunction:: paddle.fluid.layers.exponential_decay
+ :noindex:
+
+.. _api_fluid_layers_natural_exp_decay:
+
+natural_exp_decay
+-----------------
+
+.. autofunction:: paddle.fluid.layers.natural_exp_decay
+ :noindex:
+
+.. _api_fluid_layers_inverse_time_decay:
+
+inverse_time_decay
+------------------
+
+.. autofunction:: paddle.fluid.layers.inverse_time_decay
+ :noindex:
+
+.. _api_fluid_layers_polynomial_decay:
+
+polynomial_decay
+----------------
+
+.. autofunction:: paddle.fluid.layers.polynomial_decay
+ :noindex:
+
+.. _api_fluid_layers_piecewise_decay:
+
+piecewise_decay
+---------------
+
+.. autofunction:: paddle.fluid.layers.piecewise_decay
+ :noindex:
+
+.. _api_fluid_layers_noam_decay:
+
+noam_decay
+----------
+
+.. autofunction:: paddle.fluid.layers.noam_decay
+ :noindex:
+
+.. _api_fluid_layers_append_LARS:
+
+append_LARS
+-----------
+
+.. autofunction:: paddle.fluid.layers.append_LARS
+ :noindex:
+
+detection
+=========
+
+.. _api_fluid_layers_prior_box:
+
+prior_box
+---------
+
+.. autofunction:: paddle.fluid.layers.prior_box
+ :noindex:
+
+.. _api_fluid_layers_multi_box_head:
+
+multi_box_head
+--------------
+
+.. autofunction:: paddle.fluid.layers.multi_box_head
+ :noindex:
+
+.. _api_fluid_layers_bipartite_match:
+
+bipartite_match
+---------------
+
+.. autofunction:: paddle.fluid.layers.bipartite_match
+ :noindex:
+
+.. _api_fluid_layers_target_assign:
+
+target_assign
+-------------
+
+.. autofunction:: paddle.fluid.layers.target_assign
+ :noindex:
+
+.. _api_fluid_layers_detection_output:
+
+detection_output
+----------------
+
+.. autofunction:: paddle.fluid.layers.detection_output
+ :noindex:
+
+.. _api_fluid_layers_ssd_loss:
+
+ssd_loss
+--------
+
+.. autofunction:: paddle.fluid.layers.ssd_loss
+ :noindex:
+
+.. _api_fluid_layers_detection_map:
+
+detection_map
+-------------
+
+.. autofunction:: paddle.fluid.layers.detection_map
+ :noindex:
+
+.. _api_fluid_layers_rpn_target_assign:
+
+rpn_target_assign
+-----------------
+
+.. autofunction:: paddle.fluid.layers.rpn_target_assign
+ :noindex:
+
+.. _api_fluid_layers_anchor_generator:
+
+anchor_generator
+----------------
+
+.. autofunction:: paddle.fluid.layers.anchor_generator
+ :noindex:
+
+.. _api_fluid_layers_iou_similarity:
+
+iou_similarity
+--------------
+
+.. autofunction:: paddle.fluid.layers.iou_similarity
+ :noindex:
+
+.. _api_fluid_layers_box_coder:
+
+box_coder
+---------
+
+.. autofunction:: paddle.fluid.layers.box_coder
+ :noindex:
+
+.. _api_fluid_layers_polygon_box_transform:
+
+polygon_box_transform
+---------------------
+
+.. autofunction:: paddle.fluid.layers.polygon_box_transform
+ :noindex:
+
+metric_op
+=========
+
+.. _api_fluid_layers_accuracy:
+
+accuracy
+--------
+
+.. autofunction:: paddle.fluid.layers.accuracy
+ :noindex:
+
+.. _api_fluid_layers_auc:
+
+auc
+---
+
+.. autofunction:: paddle.fluid.layers.auc
+ :noindex:
+
diff --git a/doc/fluid/api/metrics.rst b/doc/fluid/api/metrics.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0f54b2e2eb7ead353215c5dbd529293794e37123
--- /dev/null
+++ b/doc/fluid/api/metrics.rst
@@ -0,0 +1,88 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+=============
+fluid.metrics
+=============
+
+.. _api_fluid_metrics_MetricBase:
+
+MetricBase
+----------
+
+.. autoclass:: paddle.fluid.metrics.MetricBase
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_CompositeMetric:
+
+CompositeMetric
+---------------
+
+.. autoclass:: paddle.fluid.metrics.CompositeMetric
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_Precision:
+
+Precision
+---------
+
+.. autoclass:: paddle.fluid.metrics.Precision
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_Recall:
+
+Recall
+------
+
+.. autoclass:: paddle.fluid.metrics.Recall
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_Accuracy:
+
+Accuracy
+--------
+
+.. autoclass:: paddle.fluid.metrics.Accuracy
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_ChunkEvaluator:
+
+ChunkEvaluator
+--------------
+
+.. autoclass:: paddle.fluid.metrics.ChunkEvaluator
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_EditDistance:
+
+EditDistance
+------------
+
+.. autoclass:: paddle.fluid.metrics.EditDistance
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_DetectionMAP:
+
+DetectionMAP
+------------
+
+.. autoclass:: paddle.fluid.metrics.DetectionMAP
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_Auc:
+
+Auc
+---
+
+.. autoclass:: paddle.fluid.metrics.Auc
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/nets.rst b/doc/fluid/api/nets.rst
new file mode 100644
index 0000000000000000000000000000000000000000..059733af18517257b6821d95fd628a9e13e6e98e
--- /dev/null
+++ b/doc/fluid/api/nets.rst
@@ -0,0 +1,39 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+fluid.nets
+==========
+
+.. _api_fluid_nets_simple_img_conv_pool:
+
+simple_img_conv_pool
+--------------------
+
+.. autofunction:: paddle.fluid.nets.simple_img_conv_pool
+ :noindex:
+
+.. _api_fluid_nets_sequence_conv_pool:
+
+sequence_conv_pool
+------------------
+
+.. autofunction:: paddle.fluid.nets.sequence_conv_pool
+ :noindex:
+
+.. _api_fluid_nets_glu:
+
+glu
+---
+
+.. autofunction:: paddle.fluid.nets.glu
+ :noindex:
+
+.. _api_fluid_nets_scaled_dot_product_attention:
+
+scaled_dot_product_attention
+----------------------------
+
+.. autofunction:: paddle.fluid.nets.scaled_dot_product_attention
+ :noindex:
+
diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dfd587afd704ecbd17fb14d1fef0752d9313048b
--- /dev/null
+++ b/doc/fluid/api/optimizer.rst
@@ -0,0 +1,169 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+===============
+fluid.optimizer
+===============
+
+.. _api_fluid_optimizer_SGD:
+
+SGD
+---
+
+.. autoclass:: paddle.fluid.optimizer.SGD
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_Momentum:
+
+Momentum
+--------
+
+.. autoclass:: paddle.fluid.optimizer.Momentum
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_Adagrad:
+
+Adagrad
+-------
+
+.. autoclass:: paddle.fluid.optimizer.Adagrad
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_Adam:
+
+Adam
+----
+
+.. autoclass:: paddle.fluid.optimizer.Adam
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_Adamax:
+
+Adamax
+------
+
+.. autoclass:: paddle.fluid.optimizer.Adamax
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_DecayedAdagrad:
+
+DecayedAdagrad
+--------------
+
+.. autoclass:: paddle.fluid.optimizer.DecayedAdagrad
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_Ftrl:
+
+Ftrl
+----
+
+.. autoclass:: paddle.fluid.optimizer.Ftrl
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_SGDOptimizer:
+
+SGDOptimizer
+------------
+
+.. autoclass:: paddle.fluid.optimizer.SGDOptimizer
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_MomentumOptimizer:
+
+MomentumOptimizer
+-----------------
+
+.. autoclass:: paddle.fluid.optimizer.MomentumOptimizer
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_AdagradOptimizer:
+
+AdagradOptimizer
+----------------
+
+.. autoclass:: paddle.fluid.optimizer.AdagradOptimizer
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_AdamOptimizer:
+
+AdamOptimizer
+-------------
+
+.. autoclass:: paddle.fluid.optimizer.AdamOptimizer
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_AdamaxOptimizer:
+
+AdamaxOptimizer
+---------------
+
+.. autoclass:: paddle.fluid.optimizer.AdamaxOptimizer
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_DecayedAdagradOptimizer:
+
+DecayedAdagradOptimizer
+-----------------------
+
+.. autoclass:: paddle.fluid.optimizer.DecayedAdagradOptimizer
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_RMSPropOptimizer:
+
+RMSPropOptimizer
+----------------
+
+.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_FtrlOptimizer:
+
+FtrlOptimizer
+-------------
+
+.. autoclass:: paddle.fluid.optimizer.FtrlOptimizer
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_Adadelta:
+
+Adadelta
+--------
+
+.. autoclass:: paddle.fluid.optimizer.Adadelta
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_ModelAverage:
+
+ModelAverage
+------------
+
+.. autoclass:: paddle.fluid.optimizer.ModelAverage
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/param_attr.rst b/doc/fluid/api/param_attr.rst
new file mode 100644
index 0000000000000000000000000000000000000000..33035bbc7ca5c8d000adeaf1cb79806a3ea64604
--- /dev/null
+++ b/doc/fluid/api/param_attr.rst
@@ -0,0 +1,25 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+================
+fluid.param_attr
+================
+
+.. _api_fluid_param_attr_ParamAttr:
+
+ParamAttr
+---------
+
+.. autoclass:: paddle.fluid.param_attr.ParamAttr
+ :members:
+ :noindex:
+
+.. _api_fluid_param_attr_WeightNormParamAttr:
+
+WeightNormParamAttr
+-------------------
+
+.. autoclass:: paddle.fluid.param_attr.WeightNormParamAttr
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c750a2d588df56728ac7f73051ab7a9e44dee232
--- /dev/null
+++ b/doc/fluid/api/profiler.rst
@@ -0,0 +1,47 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+==============
+fluid.profiler
+==============
+
+.. _api_fluid_profiler_cuda_profiler:
+
+cuda_profiler
+-------------
+
+.. autofunction:: paddle.fluid.profiler.cuda_profiler
+ :noindex:
+
+.. _api_fluid_profiler_reset_profiler:
+
+reset_profiler
+--------------
+
+.. autofunction:: paddle.fluid.profiler.reset_profiler
+ :noindex:
+
+.. _api_fluid_profiler_profiler:
+
+profiler
+--------
+
+.. autofunction:: paddle.fluid.profiler.profiler
+ :noindex:
+
+.. _api_fluid_profiler_start_profiler:
+
+start_profiler
+--------------
+
+.. autofunction:: paddle.fluid.profiler.start_profiler
+ :noindex:
+
+.. _api_fluid_profiler_stop_profiler:
+
+stop_profiler
+-------------
+
+.. autofunction:: paddle.fluid.profiler.stop_profiler
+ :noindex:
+
diff --git a/doc/fluid/api/recordio_writer.rst b/doc/fluid/api/recordio_writer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f0c12fd115478a29fbd178b533b7490b2f663717
--- /dev/null
+++ b/doc/fluid/api/recordio_writer.rst
@@ -0,0 +1,23 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+=====================
+fluid.recordio_writer
+=====================
+
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_file:
+
+convert_reader_to_recordio_file
+-------------------------------
+
+.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
+ :noindex:
+
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_files:
+
+convert_reader_to_recordio_files
+--------------------------------
+
+.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
+ :noindex:
+
diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5b3004a783930cc1ccac1c4db30603eee6e52769
--- /dev/null
+++ b/doc/fluid/api/regularizer.rst
@@ -0,0 +1,43 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+=================
+fluid.regularizer
+=================
+
+.. _api_fluid_regularizer_L1Decay:
+
+L1Decay
+-------
+
+.. autoclass:: paddle.fluid.regularizer.L1Decay
+ :members:
+ :noindex:
+
+.. _api_fluid_regularizer_L2Decay:
+
+L2Decay
+-------
+
+.. autoclass:: paddle.fluid.regularizer.L2Decay
+ :members:
+ :noindex:
+
+.. _api_fluid_regularizer_L1DecayRegularizer:
+
+L1DecayRegularizer
+------------------
+
+.. autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
+ :members:
+ :noindex:
+
+.. _api_fluid_regularizer_L2DecayRegularizer:
+
+L2DecayRegularizer
+------------------
+
+.. autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b6e169ff5d1bed9338745874fbc570e5be5f316b
--- /dev/null
+++ b/doc/fluid/api/transpiler.rst
@@ -0,0 +1,68 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+================
+fluid.transpiler
+================
+
+.. _api_fluid_transpiler_DistributeTranspiler:
+
+DistributeTranspiler
+--------------------
+
+.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler
+ :members:
+ :noindex:
+
+.. _api_fluid_transpiler_InferenceTranspiler:
+
+InferenceTranspiler
+-------------------
+
+.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler
+ :members:
+ :noindex:
+
+.. _api_fluid_transpiler_memory_optimize:
+
+memory_optimize
+---------------
+
+.. autofunction:: paddle.fluid.transpiler.memory_optimize
+ :noindex:
+
+.. _api_fluid_transpiler_release_memory:
+
+release_memory
+--------------
+
+.. autofunction:: paddle.fluid.transpiler.release_memory
+ :noindex:
+
+.. _api_fluid_transpiler_HashName:
+
+HashName
+--------
+
+.. autoclass:: paddle.fluid.transpiler.HashName
+ :members:
+ :noindex:
+
+.. _api_fluid_transpiler_RoundRobin:
+
+RoundRobin
+----------
+
+.. autoclass:: paddle.fluid.transpiler.RoundRobin
+ :members:
+ :noindex:
+
+.. _api_fluid_transpiler_DistributeTranspilerConfig:
+
+DistributeTranspilerConfig
+--------------------------
+
+.. autoclass:: paddle.fluid.transpiler.DistributeTranspilerConfig
+ :members:
+ :noindex:
+
diff --git a/source/beginners_guide/basics/image_classification/.gitignore b/doc/fluid/beginners_guide/basics/image_classification/.gitignore
similarity index 100%
rename from source/beginners_guide/basics/image_classification/.gitignore
rename to doc/fluid/beginners_guide/basics/image_classification/.gitignore
diff --git a/doc/fluid/beginners_guide/basics/image_classification/image b/doc/fluid/beginners_guide/basics/image_classification/image
new file mode 120000
index 0000000000000000000000000000000000000000..557ef69573cf03e06c1053970cf22695f3674ace
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/image_classification/image
@@ -0,0 +1 @@
+../../../../../external/book/03.image_classification/image
\ No newline at end of file
diff --git a/doc/fluid/beginners_guide/basics/image_classification/index.md b/doc/fluid/beginners_guide/basics/image_classification/index.md
new file mode 120000
index 0000000000000000000000000000000000000000..18ab749ec38e835f14299c09c03192919bda41bb
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/image_classification/index.md
@@ -0,0 +1 @@
+../../../../../external/book/03.image_classification/README.cn.md
\ No newline at end of file
diff --git a/source/beginners_guide/basics/index.rst b/doc/fluid/beginners_guide/basics/index.rst
similarity index 100%
rename from source/beginners_guide/basics/index.rst
rename to doc/fluid/beginners_guide/basics/index.rst
diff --git a/source/beginners_guide/basics/label_semantic_roles/.gitignore b/doc/fluid/beginners_guide/basics/label_semantic_roles/.gitignore
similarity index 100%
rename from source/beginners_guide/basics/label_semantic_roles/.gitignore
rename to doc/fluid/beginners_guide/basics/label_semantic_roles/.gitignore
diff --git a/doc/fluid/beginners_guide/basics/label_semantic_roles/image b/doc/fluid/beginners_guide/basics/label_semantic_roles/image
new file mode 120000
index 0000000000000000000000000000000000000000..524699b463e79ccef236b5b82e75520411f19f3c
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/label_semantic_roles/image
@@ -0,0 +1 @@
+../../../../../external/book/07.label_semantic_roles/image
\ No newline at end of file
diff --git a/doc/fluid/beginners_guide/basics/label_semantic_roles/index.md b/doc/fluid/beginners_guide/basics/label_semantic_roles/index.md
new file mode 120000
index 0000000000000000000000000000000000000000..8e482e13129cade3153b79fc4c334a8bff858af5
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/label_semantic_roles/index.md
@@ -0,0 +1 @@
+../../../../../external/book/07.label_semantic_roles/README.cn.md
\ No newline at end of file
diff --git a/doc/fluid/beginners_guide/basics/learning_materials.md b/doc/fluid/beginners_guide/basics/learning_materials.md
new file mode 100644
index 0000000000000000000000000000000000000000..a27499c6ed8d1149c6d519006086febbcae943fa
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/learning_materials.md
@@ -0,0 +1,54 @@
+# 学习资料
+
+## 要读的第一本书
+基础理论习得的最直接来源就是书本。按机器学习理论、深度学习理论、编程语言三方面划分,这里推荐如下书籍辅助您。
+
+
+### 机器学习理论
+
+在开启深度学习之前,您需要先行掌握机器学习的理论。深度学习是机器学习中的一个分支,两者内在的理论基础存在强关联。
+机器学习理论的书籍教材比较多,这里推荐一本易懂易学的书籍,可以重点关注神经网络部分。
+
+书名:《机器学习》(周志华著,清华大学出版社,2016年版)
+
+### 深度学习理论
+
+打好机器学习的理论功底后,您可以开始钻研深度学习的理论。通常深度学习理论会给人留下抽象难懂的印象,且和数学结合紧密。
+为了让您能够顺利入门,这里推荐一份易学易用的教材,深度学习理论和相关数学理论一本即可搞定。
+
+书名:《Deep Learning(深度学习)》(Goodfellow, Bengio, Courville合著,赵申剑、黎彧君、符天凡和李凯合译,人民邮电出版社,2017年版)
+此书电子版在Github上已经开源,详情可参考此链接 [《深度学习》](https://github.com/exacity/deeplearningbook-chinese)
+
+### 编程语言
+
+Python方向:这里推荐您学习Python,一方面各大主流深度学习框架的主力支撑编程语言均为Python;另一方面,对比其他语言,Python较为简单易学。
+Python的教材种类较多,这里推荐一本实操和理论都兼顾的教材,只要完成书中52个习题,运行代码、发现问题并解决问题,就能逐步上手。
+
+书名:《“笨办法”学Python》(Zed Shaw著,王巍巍译,人民邮电出版社,2014年11月版)
+
+
+C++方向:C++语言在底层框架中使用较多,您逐步掌握开源框架的基本操作后,在更高阶的框架应用中会用到这个技能点。
+同前面提到的Python一样,学习C++时需要多上手操作。这里推荐迅速上手C++的书籍,不但能够学习功能和结构,还提供了解决方案的示例。
+
+书名:《Essential C++》【美】李普曼(Lippman,S.B.)著,侯捷译,电子工业出版社2013年8月版
+
+
+
+## 要看的视频公开课
+
+在学习一门新技术的同时,除了看书,如果有老师面对面教授,可以更快更好的学会知识。相比于线下授课,视频公开课能够在省钱省力的同时,达到易学易掌握的效果。
+目前深度学习的课程多是公开免费的,通过学习您可以更轻松的理解深度学习中的抽象理论,并在实操方面不绕弯路。
+综合课程生动性、可操作性、紧凑性、连续性这些特点,这里推荐如下课程,同步附上网址,便于您查找学习。
+
+### 理论知识详解视频课
+[机器学习](http://open.163.com/special/opencourse/machinelearning.html) 斯坦福大学教授吴恩达公开课程,包含相关算法的详细讲解。
+
+[AI技术](https://ai.baidu.com/paddlepaddle/player?id=13) 百度推出的“AI核心技术掌握”课程,每节课在20-30分钟左右,从AI技术到深度学习进行全面细致的解读。
+
+[深度学习](http://speech.ee.ntu.edu.tw/~tlkagk/courses_ML17_2.html) 台湾李宏毅教授的在线课程,以英文授课,会结合国外的科研成果,也适合新手入门和理解深度学习。
+
+[编程语言](https://ai.baidu.com/paddlepaddle/openCourses) Python操作课程,从基础到进阶操作都提供详细说明,每节课时长20分钟左右。
+
+### PaddlePaddle实操视频课
+掌握好理论基础,具备编程能力后,您可以开始使用PaddlePaddle Fluid进行实操,从初阶开始学习,向着中高阶努力。
+目前已有PaddlePaddle官方视频公开课在官网呈现,内含PaddlePaddle实战、PaddlePaddle应用场景和机器学习模型讲解课程,帮助开发者从零开始使用PaddlePaddle,从简单场景逐步过渡到工业级应用。[点击这里](http://ai.baidu.com/paddlepaddle/openCourses)您即可开始视频课的学习之旅。
diff --git a/source/beginners_guide/basics/machine_translation/.gitignore b/doc/fluid/beginners_guide/basics/machine_translation/.gitignore
similarity index 100%
rename from source/beginners_guide/basics/machine_translation/.gitignore
rename to doc/fluid/beginners_guide/basics/machine_translation/.gitignore
diff --git a/doc/fluid/beginners_guide/basics/machine_translation/image b/doc/fluid/beginners_guide/basics/machine_translation/image
new file mode 120000
index 0000000000000000000000000000000000000000..0101c21f5870c3a796cda5f1eaaaa61855a7442f
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/machine_translation/image
@@ -0,0 +1 @@
+../../../../../external/book/08.machine_translation/image
\ No newline at end of file
diff --git a/doc/fluid/beginners_guide/basics/machine_translation/index.md b/doc/fluid/beginners_guide/basics/machine_translation/index.md
new file mode 120000
index 0000000000000000000000000000000000000000..fad1225ac49b1084e9d9a6e8e1df9367053c346b
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/machine_translation/index.md
@@ -0,0 +1 @@
+../../../../../external/book/08.machine_translation/README.cn.md
\ No newline at end of file
diff --git a/source/beginners_guide/basics/recommender_system/.gitignore b/doc/fluid/beginners_guide/basics/recommender_system/.gitignore
similarity index 100%
rename from source/beginners_guide/basics/recommender_system/.gitignore
rename to doc/fluid/beginners_guide/basics/recommender_system/.gitignore
diff --git a/doc/fluid/beginners_guide/basics/recommender_system/image b/doc/fluid/beginners_guide/basics/recommender_system/image
new file mode 120000
index 0000000000000000000000000000000000000000..af4f41218de1544bcbb7709e44146e615e4f9804
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/recommender_system/image
@@ -0,0 +1 @@
+../../../../../external/book/05.recommender_system/image
\ No newline at end of file
diff --git a/doc/fluid/beginners_guide/basics/recommender_system/index.md b/doc/fluid/beginners_guide/basics/recommender_system/index.md
new file mode 120000
index 0000000000000000000000000000000000000000..2bbbdc54e0b27d2a437530b255091312390371d0
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/recommender_system/index.md
@@ -0,0 +1 @@
+../../../../../external/book/05.recommender_system/README.cn.md
\ No newline at end of file
diff --git a/source/beginners_guide/basics/understand_sentiment/.gitignore b/doc/fluid/beginners_guide/basics/understand_sentiment/.gitignore
similarity index 100%
rename from source/beginners_guide/basics/understand_sentiment/.gitignore
rename to doc/fluid/beginners_guide/basics/understand_sentiment/.gitignore
diff --git a/doc/fluid/beginners_guide/basics/understand_sentiment/image b/doc/fluid/beginners_guide/basics/understand_sentiment/image
new file mode 120000
index 0000000000000000000000000000000000000000..13bacf9fb90da1516d1f8163e3705458966c284a
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/understand_sentiment/image
@@ -0,0 +1 @@
+../../../../../external/book/06.understand_sentiment/image
\ No newline at end of file
diff --git a/doc/fluid/beginners_guide/basics/understand_sentiment/index.md b/doc/fluid/beginners_guide/basics/understand_sentiment/index.md
new file mode 120000
index 0000000000000000000000000000000000000000..db728d7ba2f547d759dd9854546cb818974920d5
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/understand_sentiment/index.md
@@ -0,0 +1 @@
+../../../../../external/book/06.understand_sentiment/README.cn.md
\ No newline at end of file
diff --git a/source/beginners_guide/basics/word2vec/.gitignore b/doc/fluid/beginners_guide/basics/word2vec/.gitignore
similarity index 100%
rename from source/beginners_guide/basics/word2vec/.gitignore
rename to doc/fluid/beginners_guide/basics/word2vec/.gitignore
diff --git a/doc/fluid/beginners_guide/basics/word2vec/image b/doc/fluid/beginners_guide/basics/word2vec/image
new file mode 120000
index 0000000000000000000000000000000000000000..fe0098012579714af6fa6fdf27afd370021cd29d
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/word2vec/image
@@ -0,0 +1 @@
+../../../../../external/book/04.word2vec/image
\ No newline at end of file
diff --git a/doc/fluid/beginners_guide/basics/word2vec/index.md b/doc/fluid/beginners_guide/basics/word2vec/index.md
new file mode 120000
index 0000000000000000000000000000000000000000..19186f4fee4a763bc1e4efcfa812694ca3975372
--- /dev/null
+++ b/doc/fluid/beginners_guide/basics/word2vec/index.md
@@ -0,0 +1 @@
+../../../../../external/book/04.word2vec/README.cn.md
\ No newline at end of file
diff --git a/doc/fluid/beginners_guide/index.rst b/doc/fluid/beginners_guide/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b15106692b45e8cd476763add88c0b71c8b96871
--- /dev/null
+++ b/doc/fluid/beginners_guide/index.rst
@@ -0,0 +1,15 @@
+########
+新手入门
+########
+
+.. todo::
+
+ 新手入门的导引文字,需要完善。
+
+.. toctree::
+ :maxdepth: 2
+
+ install/install_doc.md
+ quick_start/index.rst
+ basics/index.rst
+ basics/learning_materials.md
diff --git a/doc/fluid/beginners_guide/install/install_doc.md b/doc/fluid/beginners_guide/install/install_doc.md
new file mode 100644
index 0000000000000000000000000000000000000000..f034904683c908225e3b5cc08866c7145b414285
--- /dev/null
+++ b/doc/fluid/beginners_guide/install/install_doc.md
@@ -0,0 +1,1503 @@
+# **安装说明**
+本说明将指导您在*64位台式机或笔记本电脑*上编译和安装PaddlePaddle,目前PaddlePaddle支持以下环境:
+
+* *Ubuntu 14.04 /16.04 /18.04*
+* *CentOS 7 / 6*
+* *MacOS 10.12 / 10.13*
+* *Windows 7 / 8 / 10(专业版/企业版)*
+
+请确保您的环境满足以上条件。
+如在安装或编译过程中遇到问题,请参见[FAQ](#FAQ)。
+
+
+## **安装PaddlePaddle**
+
+* Ubuntu下安装PaddlePaddle
+* CentOS下安装PaddlePaddle
+* MacOS下安装PaddlePaddle
+* Windows下安装PaddlePaddle
+
+***
+### **Ubuntu下安装PaddlePaddle**
+
+本说明将介绍如何在*64位台式机或笔记本电脑*以及Ubuntu系统下安装PaddlePaddle,我们支持的Ubuntu系统需满足以下要求:
+
+请注意:在其他系统上的尝试可能会导致安装失败。
+
+* *Ubuntu 14.04 /16.04 /18.04*
+
+#### 确定要安装的PaddlePaddle版本
+
+* 仅支持CPU的PaddlePaddle。如果您的计算机没有 NVIDIA® GPU,则只能安装此版本。如果您的计算机有GPU,
+也推荐您先安装CPU版本的PaddlePaddle,来检测您本地的环境是否适合。
+
+* 支持GPU的PaddlePaddle。为了使PaddlePaddle程序运行更加迅速,我们通过GPU对PaddlePaddle程序进行加速,但安装GPU版本的PaddlePaddle需要先拥有满足以下条件的NVIDIA® GPU(具体安装流程和配置请务必参见NVIDIA官方文档:[For CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/),[For cuDNN](https://docs.nvidia.com/deeplearning/sdk/cudnn-install/))
+ * *CUDA 工具包9.0配合cuDNN v7*
+ * *CUDA 工具包8.0配合cuDNN v7*
+ * *GPU运算能力超过1.0的硬件设备*
+
+
+
+#### 选择如何安装PaddlePaddle
+在Ubuntu的系统下我们提供4种安装方式:
+
+* Docker安装
+* pip安装
+* 源码编译安装
+* Docker源码编译安装
+
+
+我们更加推荐**使用Docker进行安装**,因为我们在把工具和配置都安装在一个 Docker image 里,这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
+
+
+**使用pip安装**,我们为您提供pip安装方法,但它更依赖您的本机环境,可能会出现和您本机环境相关的一些问题。
+
+
+
+[**源码编译安装**](#ubt_source)以及[**使用Docker进行源码编译安装**](#ubt_docker),是先将PaddlePaddle源代码编译成二进制文件,然后再安装这个二进制文件的过程。相比使用我们为您编译好并已通过测试的二进制安装包,手动编译更为复杂,我们将在本说明的最后为您详细解答。
+
+##### ***使用Docker进行安装***
+
+
+
+为了更好的使用Docker并避免发生问题,我们推荐使用**最高版本的Docker**,关于**安装和使用Docker**的细节请参阅Docker[官方文档](https://docs.docker.com/install/)。
+
+
+
+> 请注意,要安装和使用支持 GPU 的PaddlePaddle版本,您必须先安装[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
+
+
+
+如果已经**正确安装Docker**,即可以开始**使用Docker安装PaddlePaddle**
+
+1. 使用以下指令拉取我们为您预安装好PaddlePaddle的镜像:
+
+
+ * 对于需要**CPU版本的PaddlePaddle**的用户请使用以下指令拉取我们为您预安装好*PaddlePaddle For CPU*的镜像:
+
+ `docker pull hub.baidubce.com/paddlepaddle/paddle:0.15.0`
+
+
+ * 对于需要**GPU版本的PaddlePaddle**的用户请使用以下指令拉取我们为您预安装好*PaddlePaddle For GPU*的镜像:
+
+ `docker pull hub.baidubce.com/paddlepaddle/paddle:0.15.0-gpu-cuda9.0-cudnn7`
+
+
+ * 您也可以通过以下指令拉取任意的我们提供的Docker镜像:
+
+ `docker pull hub.baidubce.com/paddlepaddle/paddle:[tag]`
+ > (请把[tag]替换为[镜像表](#dockers)中的内容)
+
+2. 使用以下指令用已经拉取的镜像构建并进入Docker容器:
+
+ `docker run --name [Name of container] -it -v $PWD:/paddle [imagename] /bin/bash`
+
+ > 上述命令中,--name [Name of container] 设定Docker容器的名称;-it 参数说明容器已和本机交互式运行;-v $PWD:/paddle 指定将当前路径(Linux中PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 /paddle 目录;`[imagename]` 指定需要使用的image名称,如果您需要使用我们的镜像请使用`hub.baidubce.com/paddlepaddle/paddle:[tag]`,注:tag的意义同第一步;/bin/bash是在Docker中要执行的命令。
+
+3. (可选:当您需要第二次进入Docker容器中)使用如下命令使用PaddlePaddle:
+
+ `docker start [Name of container]`
+ > 启动之前创建的容器。
+
+ `docker attach [Name of container]`
+ > 进入启动的容器。
+
+至此您已经成功使用Docker安装PaddlePaddle,您只需要进入Docker容器后运行PaddlePaddle即可,更多Docker使用请参见[Docker官方文档](https://docs.docker.com)。
+
+> 注:PaddlePaddle Docker镜像为了减小体积,默认没有安装`vim`,您可以在容器中执行 `apt-get install -y vim` 安装后,在容器中编辑代码。
+
+
+
+##### ***使用pip安装PaddlePaddle***
+
+您可以直接粘贴以下命令到命令行来安装PaddlePaddle(适用于ubuntu16.04及以上安装CPU-ONLY的版本),如果出现问题,您可以参照后面的解释对命令作出适应您系统的更改:
+
+ apt update && apt install -y python-dev python-pip && pip install paddlepaddle
+
+
+首先,我们使用以下指令来**检测本机的环境**是否适合安装PaddlePaddle:
+
+`uname -m && cat /etc/*release`
+> 上面的命令将会显示本机的操作系统和位数信息,请确保您的计算机和本教程的要求一致。
+
+
+其次,您的电脑需要满足以下要求:
+
+* Python2.7.x (dev)
+* Pip >= 9.0.1
+
+ > 您的Ubuntu上可能已经安装了pip,请使用 pip -V 来确认,我们建议使用pip 9.0.1或更高版本来安装。
+
+ 更新apt的源: `apt update`
+
+ 使用以下命令安装或升级Python和pip到需要的版本: `sudo apt install python-dev python-pip`
+ > 即使您的环境中已经有Python2.7也需要安装Python dev。
+
+现在,让我们来安装PaddlePaddle:
+
+1. 使用pip install来安装PaddlePaddle
+
+ * 对于需要**CPU版本PaddlePaddle**的用户:`pip install paddlepaddle`
+
+
+ * 对于需要**GPU版本PaddlePaddle**的用户:`pip install paddlepaddle-gpu`
+ > 1. 为防止出现nccl.h找不到的问题请首先按照以下命令安装nccl2(这里提供的是ubuntu 16.04,CUDA8,cuDNN v7下nccl2的安装指令),更多版本的安装信息请参考NVIDIA[官方网站](https://developer.nvidia.com/nccl/nccl-download):
+ a. `wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb`
+ b. `sudo apt-get install libnccl2=2.2.13-1+cuda8.0 libnccl-dev=2.2.13-1+cuda8.0`
+ > 2. 如果您不规定pypi包版本号,我们默认为您提供支持Cuda 8/cuDNN v7的PaddlePaddle版本。
+
+
+ 对于出现`Cannot uninstall 'six'.`问题的用户,可能是由于您的系统中已有的Python安装问题造成的,请使用`pip install paddlepaddle --ignore-installed six`(CPU)或`pip install paddlepaddle-gpu --ignore-installed six`(GPU)解决。
+
+ * 对于有**其他要求**的用户:`pip install paddlepaddle==[版本号]`
+ > `版本号`参见[安装包列表](#whls)或者您如果需要获取并安装**最新的PaddlePaddle开发分支**,可以从[多版本whl包列表](#ciwhls)或者我们的[CI系统](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) 中下载最新的whl安装包和c-api开发包并安装。如需登录,请点击“Log in as guest”。
+
+
+
+
+现在您已经完成使用`pip install` 来安装的PaddlePaddle的过程。
+
+
+##### ***验证安装***
+安装完成后您可以使用:`python` 进入python解释器,然后使用`import paddle.fluid` 验证是否安装成功。
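+
+如果希望进一步确认,也可以运行下面这段最小示例脚本(仅作演示的草稿,假定安装的是 Fluid 形态的 PaddlePaddle,`fill_constant`、`Executor` 均为 fluid 的公开接口),输出中包含 5 即说明安装可用:
+
+    import paddle.fluid as fluid
+
+    # 构造一个常量张量,并在 CPU 上执行一次前向计算
+    data = fluid.layers.fill_constant(shape=[1], dtype='int64', value=5)
+    exe = fluid.Executor(fluid.CPUPlace())
+    exe.run(fluid.default_startup_program())
+    result = exe.run(fluid.default_main_program(), fetch_list=[data])
+    print(result)   # 预期输出类似 [array([5])]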
+
+
+##### ***如何卸载PaddlePaddle***
+请使用以下命令卸载PaddlePaddle:
+
+* ***CPU版本的PaddlePaddle***: `pip uninstall PaddlePaddle`
+
+* ***GPU版本的PaddlePaddle***: `pip uninstall PaddlePaddle-gpu`
+
+
+### **CentOS下安装PaddlePaddle**
+
+本说明将介绍如何在*64位台式机或笔记本电脑*以及CentOS系统下安装PaddlePaddle,我们支持的CentOS系统需满足以下要求:
+
+
+请注意:在其他系统上的尝试可能会导致安装失败。
+
+* *CentOS 6 / 7*
+
+#### 确定要安装的PaddlePaddle版本
+* 仅支持CPU的PaddlePaddle。如果您的计算机没有 NVIDIA® GPU,则只能安装此版本。如果您的计算机有GPU,
+推荐您先安装CPU版本的PaddlePaddle,来检测您本地的环境是否适合。
+
+* 支持GPU的PaddlePaddle,为了使PaddlePaddle程序运行的更加迅速,我们通过GPU对PaddlePaddle程序进行加速,但安装GPU版本的PaddlePaddle需要先拥有满足以下条件的NVIDIA® GPU(具体安装流程和配置请务必参见NVIDIA官方文档:[For CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/),[For cuDNN](https://docs.nvidia.com/deeplearning/sdk/cudnn-install/))
+ * *CUDA 工具包9.0配合cuDNN v7*
+ * *CUDA 工具包8.0配合cuDNN v7*
+ * *GPU运算能力超过1.0的硬件设备*
+
+
+
+#### 选择如何安装PaddlePaddle
+在CentOS的系统下我们提供4种安装方式:
+
+* Docker安装(不支持GPU版本)
+* pip安装
+* 源码编译安装(不支持CentOS 6的所有版本以及CentOS 7的GPU版本)
+* Docker源码编译安装(不支持GPU版本)
+
+
+我们更加推荐**使用Docker进行安装**,因为我们在把工具和配置都安装在一个 Docker image 里,这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
+
+
+
+**使用pip安装**,我们为您提供pip安装方法,但它更依赖您的本机环境,可能会出现和您本机环境相关的一些问题。
+
+从[**源码编译安装**](#ct_source)以及[**使用Docker进行源码编译安装**](#ct_docker),这是一种通过将PaddlePaddle源代码编译成为二进制文件,然后在安装这个二进制文件的过程,相比使用我们为您编译过的已经通过测试的二进制文件形式的PaddlePaddle,手动编译更为复杂,我们将在说明的最后详细为您解答。
+
+##### ***使用Docker进行安装***
+
+
+
+为了更好的使用Docker并避免发生问题,我们推荐使用**最高版本的Docker**,关于**安装和使用Docker**的细节请参阅Docker[官方文档](https://docs.docker.com/install/)
+
+
+> 请注意,要安装和使用支持 GPU 的PaddlePaddle版本,您必须先安装[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
+
+
+
+当您已经**正确安装Docker**后你就可以开始**使用Docker安装PaddlePaddle**
+
+1. 使用以下指令拉取我们为您预安装好PaddlePaddle的镜像:
+
+
+ * 对于需要**CPU版本的PaddlePaddle**的用户请使用以下指令拉取我们为您预安装好*PaddlePaddle For CPU*的镜像:
+
+ `docker pull hub.baidubce.com/paddlepaddle/paddle:0.15.0`
+
+
+
+
+ * 您也可以通过以下指令拉取任意的我们提供的Docker镜像:
+
+ `docker pull hub.baidubce.com/paddlepaddle/paddle:[tag]`
+ > (请把[tag]替换为[镜像表](#dockers)中的内容)
+
+
+2. 使用以下指令用已经拉取的镜像构建并进入Docker容器:
+
+ `docker run --name [Name of container] -it -v $PWD:/paddle [imagename] /bin/bash`
+
+ > 上述命令中,--name [Name of container] 设定Docker容器的名称;-it 参数说明容器已和本机交互式运行;-v $PWD:/paddle 指定将当前路径(Linux中PWD变量会展开为当前路径的[绝对路径](https://baike.baidu.com/item/绝对路径/481185))挂载到容器内部的 /paddle 目录;`[imagename]` 指定需要使用的image名称,如果您需要使用我们的镜像请使用`hub.baidubce.com/paddlepaddle/paddle:[tag]`,注:tag的意义同第一步;/bin/bash是在Docker中要执行的命令。
+
+3. (可选:当您需要第二次进入Docker容器中)使用如下命令使用PaddlePaddle:
+
+ `docker start [Name of container]`
+ > 启动之前创建的容器。
+
+ `docker attach [Name of container]`
+ > 进入启动的容器。
+
+至此您已经成功使用Docker安装PaddlePaddle,您只需要进入Docker容器后运行PaddlePaddle即可,更多Docker使用请参见[Docker官方文档](https://docs.docker.com)。
+> 注:PaddlePaddle Docker镜像为了减小体积,默认没有安装`vim`,您可以在容器中执行 `apt-get install -y vim` 安装后,在容器中编辑代码。
+
+
+
+##### ***使用pip安装PaddlePaddle***
+
+您可以直接粘贴以下命令到命令行来安装PaddlePaddle(适用于CentOS7安装CPU-ONLY的版本),如果出现问题,您可以参照后面的解释对命令作出适应您系统的更改:
+
+ yum update && yum install -y epel-release && yum install -y python-devel python-pip && pip install paddlepaddle && export LD_LIBRARY_PATH=/usr/lib:$LD_LIBRARY_PATH
+
+首先,我们使用以下指令来**检测本机的环境**是否适合安装PaddlePaddle:
+
+`uname -m && cat /etc/*release`
+> 上面的命令将会显示本机的操作系统和位数信息,请确保您的计算机和本教程的要求一致。
+
+
+其次,您的计算机需要满足以下要求:
+
+* Python2.7.x (devel)
+
+ > CentOS6需要编译Python2.7成[共享库](#FAQ)。
+
+
+* Pip >= 9.0.1
+
+ > 您的CentOS上可能已经安装了pip,请使用 pip -V 来确认,我们建议使用pip 9.0.1或更高版本来安装。
+
+ 更新yum的源: `yum update` 并安装拓展源以安装pip: `yum install -y epel-release`
+
+ 使用以下命令安装或升级Python和pip到需要的版本: `sudo yum install python-devel python-pip`
+ > 即使您的环境中已经有`Python2.7`也需要安装`python devel`。
+
+下面将说明如何安装PaddlePaddle:
+
+1. 使用pip install来安装PaddlePaddle:
+
+ * 对于需要**CPU版本PaddlePaddle**的用户:`pip install paddlepaddle`
+
+
+ * 对于需要**GPU版本PaddlePaddle**的用户: `pip install paddlepaddle-gpu`
+ > 1. 为防止出现nccl.h找不到的问题请首先按照NVIDIA[官方网站](https://developer.nvidia.com/nccl/nccl-download)的指示正确安装nccl2
+ > 2. 如果您不规定pypi包版本号,我们默认为您提供支持Cuda 8/cuDNN v7的PaddlePaddle版本。
+
+ 对于出现`Cannot uninstall 'six'.`问题的用户,可能是由于您的系统中已有的Python安装问题造成的,请使用`pip install paddlepaddle --ignore-installed six`(CPU)或`pip install paddlepaddle-gpu --ignore-installed six`(GPU)解决。
+
+ * 对于有**其他要求**的用户:`pip install paddlepaddle==[版本号]`
+ > `版本号`参见[安装包列表](#whls)或者您如果需要获取并安装**最新的PaddlePaddle开发分支**,可以从我们的[CI系统](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) 中下载最新的whl安装包和c-api开发包并安装。如需登录,请点击“Log in as guest”。
+
+
+
+
+现在您已经完成通过`pip install` 来安装的PaddlePaddle的过程。
+
+
+
+##### ***验证安装***
+安装完成后您可以使用:`python` 进入Python解释器,然后使用`import paddle.fluid` 验证是否安装成功。
+
+
+##### ***如何卸载PaddlePaddle***
+请使用以下命令卸载PaddlePaddle:
+
+* ***CPU版本的PaddlePaddle***: `pip uninstall PaddlePaddle`
+
+* ***GPU版本的PaddlePaddle***: `pip uninstall PaddlePaddle-gpu`
+
+
+
+
+
+### **MacOS下安装PaddlePaddle**
+
+本说明将介绍如何在*64位台式机或笔记本电脑*以及MacOS系统下安装PaddlePaddle,我们支持的MacOS系统需满足以下要求。
+
+请注意:在其他系统上的尝试可能会导致安装失败。
+
+* *MacOS 10.12/10.13*
+
+#### 确定要安装的PaddlePaddle版本
+
+* 仅支持CPU的PaddlePaddle。
+
+
+
+#### 选择如何安装PaddlePaddle
+在MacOS的系统下我们提供2种安装方式:
+
+* Docker安装(不支持GPU版本)
+* Docker源码编译安装(不支持GPU版本)
+
+
+我们更加推荐**使用Docker进行安装**,因为我们在把工具和配置都安装在一个 Docker image 里,这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
+
+
+
+
+
+
+##### ***使用Docker进行安装***
+
+
+
+为了更好的使用Docker并避免发生问题,我们推荐使用**最高版本的Docker**,关于**安装和使用Docker**的细节请参阅Docker[官方文档](https://docs.docker.com/install/)。
+> 请注意,在MacOS系统下使用Docker时需要使用您的dockerID进行登录,否则将出现`Authenticate Failed`错误。
+
+如果已经**正确安装Docker**,即可以开始**使用Docker安装PaddlePaddle**
+
+1. 使用以下指令拉取我们为您预安装好PaddlePaddle的镜像:
+
+
+ * 对于需要**CPU版本的PaddlePaddle**的用户请使用以下指令拉取我们为您预安装好*PaddlePaddle For CPU*的镜像:
+
+ `docker pull hub.baidubce.com/paddlepaddle/paddle:0.15.0`
+
+
+ * 您也可以通过以下指令拉取任意的我们提供的Docker镜像:
+
+ `docker pull hub.baidubce.com/paddlepaddle/paddle:[tag]`
+ > (请把[tag]替换为[镜像表](#dockers)中的内容)
+
+2. 使用以下指令用已经拉取的镜像构建并进入Docker容器:
+
+ `docker run --name [Name of container] -it -v $PWD:/paddle [imagename] /bin/bash`
+
+ > 上述命令中,--name [Name of container] 设定Docker容器的名称;-it 参数说明容器已和本机交互式运行;-v $PWD:/paddle 指定将当前路径(Linux中PWD变量会展开为当前路径的[绝对路径](https://baike.baidu.com/item/绝对路径/481185))挂载到容器内部的 /paddle 目录;`[imagename]` 指定需要使用的image名称,如果您需要使用我们的镜像请使用`hub.baidubce.com/paddlepaddle/paddle:[tag]`,注:tag的意义同第一步;/bin/bash是在Docker中要执行的命令。
+
+3. (可选:当您需要第二次进入Docker容器中)使用如下命令使用PaddlePaddle:
+
+ `docker start [Name of container]`
+ > 启动之前创建的容器。
+
+ `docker attach [Name of container]`
+ > 进入启动的容器。
+
+
+至此您已经成功使用Docker安装PaddlePaddle,您只需要进入Docker容器后运行PaddlePaddle即可,更多Docker使用请参见[Docker官方文档](https://docs.docker.com)。
+
+> 注:PaddlePaddle Docker镜像为了减小体积,默认没有安装`vim`,您可以在容器中执行 `apt-get install -y vim` 安装后,在容器中编辑代码。
+
+
+
+
+
+##### ***验证安装***
+安装完成后您可以使用:`python` 进入python解释器,然后使用`import paddle.fluid` 验证是否安装成功。
+
+
+##### ***如何卸载PaddlePaddle***
+请使用以下命令卸载PaddlePaddle:
+
+* ***CPU版本的PaddlePaddle***: `pip uninstall PaddlePaddle`
+
+
+
+
+
+### **Windows下安装PaddlePaddle**
+
+本说明将介绍如何在*64位台式机或笔记本电脑*以及Windows系统下安装PaddlePaddle,我们支持的Windows系统需满足以下要求。
+
+请注意:在其他系统上的尝试可能会导致安装失败。
+
+* *Windows 7 / 8 以及 Windows 10 专业版/企业版*
+
+#### 确定要安装的PaddlePaddle版本
+
+* Windows下我们目前仅提供支持CPU的PaddlePaddle。
+
+
+#### 选择如何安装PaddlePaddle
+在Windows系统下请使用我们为您提供的[一键安装包](http://paddle-windows-0150.bj.bcebos.com/PaddlePaddle-windows-0.15.0.zip)进行安装
+
+> 我们提供的一键安装包将基于Docker为您进行便捷的安装流程
+
+
+我们之所以使用**基于Docker的安装方式**,是因为我们在把工具和配置都安装在一个 Docker image 里,这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
+
+
+
+
+
+
+
+
+
+##### ***验证安装***
+安装完成后您可以使用:`python` 进入python解释器,然后使用`import paddle.fluid` 验证是否安装成功。
+
+
+##### ***如何卸载PaddlePaddle***
+请使用以下命令卸载PaddlePaddle:
+
+* ***CPU版本的PaddlePaddle***: `pip uninstall PaddlePaddle`
+
+
+
+
+
+
+
+## **从源码编译PaddlePaddle**
+我们也为您提供了从源码编译的方式,但不推荐您使用这种方式,这是因为您的本机环境多种多样,在编译源码时容易出现本说明未覆盖的复杂问题而造成安装失败。
+
+***
+### **Ubuntu下从源码编译PaddlePaddle**
+
+本说明将介绍如何在*64位台式机或笔记本电脑*以及Ubuntu系统下编译PaddlePaddle,我们支持的Ubuntu系统需满足以下要求:
+
+* Ubuntu 14.04/16.04/18.04(这涉及到相关工具是否能被正常安装)
+
+#### 确定要编译的PaddlePaddle版本
+* **仅支持CPU的PaddlePaddle**,如果您的系统没有 NVIDIA® GPU,则必须安装此版本。而此版本较GPU版本更加容易安装,
+因此即使您的计算机上拥有GPU我们也推荐您先安装CPU版本的PaddlePaddle来检测您本地的环境是否适合。
+
+* **支持GPU的PaddlePaddle**,为了使得PaddlePaddle程序运行的更加迅速,我们通常使用GPU对PaddlePaddle程序进行加速,但安装GPU版本的PaddlePaddle需要先拥有满足以下条件的NVIDIA® GPU(具体安装流程和配置请务必参见NVIDIA官方文档:[For CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/),[For cuDNN](https://docs.nvidia.com/deeplearning/sdk/cudnn-install/))
+ * *CUDA 工具包9.0配合cuDNN v7*
+ * *CUDA 工具包8.0配合cuDNN v7*
+ * *GPU运算能力超过1.0的硬件设备*
+
+#### 选择如何编译PaddlePaddle
+在Ubuntu的系统下我们提供2种编译方式:
+
+* Docker源码编译
+* 直接本机源码编译
+
+我们更加推荐**使用Docker进行编译**,因为我们在把工具和配置都安装在一个 Docker image 里。这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
+
+
+
+我们也提供了可以从**本机直接源码编译**的方法,但是由于在本机上的情况更加复杂,我们只对特定系统提供了支持。
+
+
+
+
+##### ***使用Docker进行编译***
+为了更好的使用Docker并避免发生问题,我们推荐使用**最高版本的Docker**,关于**安装和使用Docker**的细节请参阅Docker[官方文档](https://docs.docker.com/install/)
+
+
+> 请注意,要安装和使用支持 GPU 的PaddlePaddle版本,您必须先安装[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
+
+
+
+当您已经**正确安装Docker**后你就可以开始**使用Docker编译PaddlePaddle**:
+
+1. 请首先选择您希望储存PaddlePaddle的路径,然后在该路径下使用以下命令将PaddlePaddle的源码从github克隆到本地当前目录下名为Paddle的文件夹中:
+
+ `git clone https://github.com/PaddlePaddle/Paddle.git`
+
+2. 进入Paddle目录下: `cd Paddle`
+
+3. 利用我们提供的镜像(使用该命令您可以不必提前下载镜像):
+
+ `docker run --name paddle-test -v $PWD:/paddle --network=host -it hub.baidubce.com/paddlepaddle/paddle:latest-dev /bin/bash`
+ > --name paddle-test为您创建的Docker容器命名为paddle-test,-v $PWD:/paddle 将当前目录挂载到Docker容器中的/paddle目录下(Linux中PWD变量会展开为当前路径的[绝对路径](https://baike.baidu.com/item/绝对路径/481185)),-it 与宿主机保持交互状态,`hub.baidubce.com/paddlepaddle/paddle:latest-dev` 使用名为`hub.baidubce.com/paddlepaddle/paddle:latest-dev`的镜像创建Docker容器,/bin/bash 进入容器后启动/bin/bash命令。
+
+4. 进入Docker后进入paddle目录下:`cd paddle`
+
+5. 切换到较稳定release分支下进行编译:
+
+ `git checkout release/0.15.0`
+
+6. 创建并进入/paddle/build路径下:
+
+ `mkdir -p /paddle/build && cd /paddle/build`
+
+7. 使用以下命令安装相关依赖:
+
+ `pip install protobuf==3.1.0`
+ > 安装protobuf 3.1.0。
+
+ `apt install patchelf`
+ > 安装patchelf,PatchELF 是一个小而实用的程序,用于修改ELF可执行文件的动态链接器和RPATH。
+
+8. 执行cmake:
+
+ >具体编译选项含义请参见[编译选项表](#Compile)
+
+
+ * 对于需要编译**CPU版本PaddlePaddle**的用户:
+
+ `cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF`
+
+
+ * 对于需要编译**GPU版本PaddlePaddle**的用户:
+
+ `cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=ON -DWITH_TESTING=OFF`
+
+
+9. 执行编译:
+
+ `make -j$(nproc)`
+ > 使用多核编译
+
+10. 编译成功后进入`/paddle/build/python/dist`目录下找到生成的`.whl`包: `cd /paddle/build/python/dist`
+
+11. 在当前机器或目标机器安装编译好的`.whl`包:
+
+ `pip install (whl包的名字)`
+
+至此您已经成功使用Docker安装PaddlePaddle,您只需要进入Docker容器后运行PaddlePaddle即可,更多Docker使用请参见[Docker官方文档](https://docs.docker.com)。
+
+> 注:PaddlePaddle Docker镜像为了减小体积,默认没有安装`vim`,您可以在容器中执行 `apt-get install -y vim` 安装后,在容器中编辑代码。
+
+恭喜您,现在您已经完成使用Docker编译PaddlePaddle的过程。
+
+
+
+
+##### ***本机编译***
+
+1. 检查您的计算机和操作系统是否符合我们支持的编译标准: `uname -m && cat /etc/*release`
+
+2. 更新`apt`的源: `apt update`
+
+2. 我们支持使用virtualenv进行编译安装,首先请使用以下命令创建一个名为`paddle-venv`的虚环境:
+
+ * 安装Python-dev: `apt install python-dev`
+
+ * 安装pip: `apt install python-pip` (请保证拥有9.0.1及以上版本的pip)
+
+ * 安装虚环境`virtualenv`以及`virtualenvwrapper`并创建名为`paddle-venv`的虚环境:
+
+ 1. `apt install virtualenv` 或 `pip install virtualenv`
+ 2. `apt install virtualenvwrapper` 或 `pip install virtualenvwrapper`
+ 3. 找到`virtualenvwrapper.sh`: `find / -name virtualenvwrapper.sh`
+ 4. 查看`virtualenvwrapper.sh`中的安装方法: `cat virtualenvwrapper.sh`
+ 5. 按照`virtualenvwrapper.sh`中的安装方法安装`virtualenvwrapper`
+ 6. 创建名为`paddle-venv`的虚环境: `mkvirtualenv paddle-venv`
+
+
+3. 进入虚环境:`workon paddle-venv`
+
+
+4. **执行编译前**请您确认在虚环境中安装有[编译依赖表](#third_party)中提到的相关依赖:
+
+ * 这里特别提供`patchELF`的安装方法,其他的依赖可以使用`apt install`或者`pip install` 后跟依赖名称和版本安装:
+
+ `apt install patchelf`
+ > 不能使用apt安装的用户请参见patchElF github[官方文档](https://gist.github.com/ruario/80fefd174b3395d34c14)
+
+5. 将PaddlePaddle的源码clone到当前目录下的Paddle文件夹中,并进入Paddle目录下:
+
+ - `git clone https://github.com/PaddlePaddle/Paddle.git`
+
+ - `cd Paddle`
+
+6. 切换到较稳定release分支下进行编译:
+
+ `git checkout release/0.15.0`
+
+7. 并且请创建并进入一个叫build的目录下:
+
+ `mkdir build && cd build`
+
+8. 执行cmake:
+
+ >具体编译选项含义请参见[编译选项表](#Compile)
+
+
+ * 对于需要编译**CPU版本PaddlePaddle**的用户:
+
+ `cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF`.
+
+
+ * 对于需要编译**GPU版本PaddlePaddle**的用户:(*仅支持ubuntu16.04/14.04*)
+
+ 1. 请确保您已经正确安装nccl2,或者按照以下指令安装nccl2(这里提供的是ubuntu 16.04,CUDA8,cuDNN7下nccl2的安装指令),更多版本的安装信息请参考NVIDIA[官方网站](https://developer.nvidia.com/nccl/nccl-download):
+ i. `wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb`
+ ii. `sudo apt-get install libnccl2=2.2.13-1+cuda8.0 libnccl-dev=2.2.13-1+cuda8.0`
+
+ 2. 如果您已经正确安装了`nccl2`,就可以开始cmake了:
+
+ `cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=ON -DWITH_TESTING=OFF`
+
+9. 使用以下命令来编译:
+
+ `make -j$(nproc)`
+
+10. 编译成功后进入`/paddle/build/python/dist`目录下找到生成的`.whl`包: `cd /paddle/build/python/dist`
+
+11. 在当前机器或目标机器安装编译好的`.whl`包:
+
+ `pip install (whl包的名字)`
+
+恭喜您,现在您已经完成使本机编译PaddlePaddle的过程了。
+
+
+##### ***验证安装***
+安装完成后您可以使用:`python` 进入Python解释器,然后使用`import paddle.fluid` 验证是否安装成功。
+
+
+##### ***如何卸载PaddlePaddle***
+请使用以下命令卸载PaddlePaddle:
+
+* ***CPU版本的PaddlePaddle***: `pip uninstall PaddlePaddle`
+
+* ***GPU版本的PaddlePaddle***: `pip uninstall PaddlePaddle-gpu`
+
+
+
+### **CentOS下从源码编译PaddlePaddle**
+
+本说明将介绍如何在*64位台式机或笔记本电脑*以及CentOS系统下编译PaddlePaddle,我们支持的CentOS系统需满足以下要求:
+
+* CentOS 7 / 6(这涉及到相关工具是否能被正常安装)
+
+#### 确定要编译的PaddlePaddle版本
+* **仅支持CPU的PaddlePaddle**。
+
+
+
+#### 选择如何编译PaddlePaddle
+我们在CentOS的系统下提供2种编译方式:
+
+* Docker源码编译(不支持CentOS 6 / 7的GPU版本)
+* 直接本机源码编译(不支持CentOS 6的全部版本以及CentOS 7的GPU版本)
+
+我们更加推荐**使用Docker进行编译**,因为我们在把工具和配置都安装在一个 Docker image 里。这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
+
+
+
+同样对于那些出于各种原因不能够安装Docker的用户我们也提供了可以从**本机直接源码编译**的方法,但是由于在本机上的情况更加复杂,因此我们只支持特定的系统。
+
+
+
+
+
+##### ***使用Docker进行编译***
+
+为了更好的使用Docker并避免发生问题,我们推荐使用**最高版本的Docker**,关于**安装和使用Docker**的细节请参阅Docker[官方文档](https://docs.docker.com/install/)。
+
+
+
+
+当您已经**正确安装Docker**后你就可以开始**使用Docker编译PaddlePaddle**啦:
+
+1. 请首先选择您希望储存PaddlePaddle的路径,然后在该路径下使用以下命令将PaddlePaddle的源码从github克隆到本地当前目录下名为Paddle的文件夹中:
+
+ `git clone https://github.com/PaddlePaddle/Paddle.git`
+
+2. 进入Paddle目录下: `cd Paddle`
+
+3. 利用我们提供的镜像(使用该命令您可以不必提前下载镜像):
+
+ `docker run --name paddle-test -v $PWD:/paddle --network=host -it hub.baidubce.com/paddlepaddle/paddle:latest-dev /bin/bash`
+ > --name paddle-test为您创建的Docker容器命名为paddle-test,-v $PWD:/paddle 将当前目录挂载到Docker容器中的/paddle目录下(Linux中PWD变量会展开为当前路径的[绝对路径](https://baike.baidu.com/item/绝对路径/481185)),-it 与宿主机保持交互状态,`hub.baidubce.com/paddlepaddle/paddle:latest-dev` 使用名为`hub.baidubce.com/paddlepaddle/paddle:latest-dev`的镜像创建Docker容器,/bin/bash 进入容器后启动/bin/bash命令。
+
+4. 进入Docker后进入paddle目录下:`cd paddle`
+
+5. 切换到较稳定release分支下进行编译:
+
+ `git checkout release/0.15.0`
+
+6. 创建并进入/paddle/build路径下:
+
+ `mkdir -p /paddle/build && cd /paddle/build`
+
+7. 使用以下命令安装相关依赖:
+
+ `pip install protobuf==3.1.0`
+ > 安装protobuf 3.1.0。
+
+ `apt install patchelf`
+ > 安装patchelf,PatchELF 是一个小而实用的程序,用于修改ELF可执行文件的动态链接器和RPATH。
+
+8. 执行cmake:
+
+ >具体编译选项含义请参见[编译选项表](#Compile)
+
+
+ * 对于需要编译**CPU版本PaddlePaddle**的用户:
+
+
+ `cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF`
+
+
+ > 我们目前不支持CentOS下GPU版本PaddlePaddle的编译
+
+9. 执行编译:
+
+ `make -j$(nproc)`
+ > 使用多核编译
+
+10. 编译成功后进入`/paddle/build/python/dist`目录下找到生成的`.whl`包: `cd /paddle/build/python/dist`
+
+11. 在当前机器或目标机器安装编译好的`.whl`包:
+
+ `pip install (whl包的名字)`
+
+至此您已经成功使用Docker安装PaddlePaddle,您只需要进入Docker容器后运行PaddlePaddle即可,更多Docker使用请参见[Docker官方文档](https://docs.docker.com)。
+
+> 注:PaddlePaddle Docker镜像为了减小体积,默认没有安装`vim`,您可以在容器中执行 `apt-get install -y vim` 安装后,在容器中编辑代码。
+
+恭喜您,现在您已经完成使用Docker编译PaddlePaddle的过程。
+
+
+
+
+
+
+
+
+
+##### ***本机编译***
+
+1. 检查您的计算机和操作系统是否符合我们支持的编译标准: `uname -m && cat /etc/*release`
+
+2. 更新`yum`的源: `yum update`, 并添加必要的yum源:`yum install -y epel-release`
+
+3. 安装必要的工具`bzip2`以及`make`: `yum install -y bzip2` , `yum install -y make`
+
+2. 我们支持使用virtualenv进行编译安装,首先请使用以下命令创建一个名为`paddle-venv`的虚环境:
+
+ * 安装Python-dev: `yum install python-devel`
+
+ * 安装pip: `yum install python-pip` (请保证拥有9.0.1及以上的pip版本)
+
+ * 安装虚环境`virtualenv`以及`virtualenvwrapper`并创建名为`paddle-venv`的虚环境:
+
+ 1. `pip install virtualenv`
+ 2. `pip install virtualenvwrapper`
+ 3. 找到`virtualenvwrapper.sh`: `find / -name virtualenvwrapper.sh`
+ 4. 查看`virtualenvwrapper.sh`中的安装方法: `cat virtualenvwrapper.sh`
+ 5. 按照`virtualenvwrapper.sh`中的安装方法安装`virtualenvwrapper`
+ 6. 创建名为`paddle-venv`的虚环境: `mkvirtualenv paddle-venv`
+
+
+3. 进入虚环境:`workon paddle-venv`
+
+
+4. **执行编译前**请您确认在虚环境中安装有[编译依赖表](#third_party)中提到的相关依赖:
+
+ * 这里特别提供`patchELF`的安装方法,其他的依赖可以使用`yum install`或者`pip install` 后跟依赖名称和版本安装:
+
+ `yum install patchelf`
+ > 不能使用yum安装的用户请参见patchELF github[官方文档](https://gist.github.com/ruario/80fefd174b3395d34c14)
+
+5. 将PaddlePaddle的源码clone到当前目录下的Paddle文件夹中,并进入Paddle目录下:
+
+ - `git clone https://github.com/PaddlePaddle/Paddle.git`
+
+ - `cd Paddle`
+
+6. 切换到较稳定release分支下进行编译:
+
+ `git checkout release/0.15.0`
+
+7. 并且请创建并进入一个叫build的目录下:
+
+ `mkdir build && cd build`
+
+8. 执行cmake:
+
+ >具体编译选项含义请参见[编译选项表](#Compile)
+
+
+ * 对于需要编译**CPU版本PaddlePaddle**的用户:
+
+ `cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF`.
+
+
+
+
+9. 使用以下命令来编译:
+
+ `make -j$(nproc)`
+
+10. 编译成功后进入`/paddle/build/python/dist`目录下找到生成的`.whl`包: `cd /paddle/build/python/dist`
+
+11. 在当前机器或目标机器安装编译好的`.whl`包:
+
+ `pip install (whl包的名字)`
+
+恭喜您,现在您已经完成使本机编译PaddlePaddle的过程了。
+
+
+
+
+##### ***验证安装***
+安装完成后您可以使用:`python` 进入Python解释器,然后使用`import paddle.fluid` 验证是否安装成功。
+
+
+##### ***如何卸载PaddlePaddle***
+请使用以下命令卸载PaddlePaddle:
+
+* ***CPU版本的PaddlePaddle***: `pip uninstall PaddlePaddle`
+
+
+
+
+
+### **MacOS下从源码编译PaddlePaddle**
+
+本说明将介绍如何在*64位台式机或笔记本电脑*以及MacOS系统下编译PaddlePaddle,我们支持的MacOS系统需满足以下要求:
+
+* MacOS 10.12/10.13(这涉及到相关工具是否能被正常安装)
+
+#### 确定要编译的PaddlePaddle版本
+* **仅支持CPU的PaddlePaddle**。
+
+
+
+#### 选择如何编译PaddlePaddle
+在MacOS 10.12/10.13的系统下我们提供1种编译方式:
+
+
+* Docker源码编译
+
+
+
+
+
+我们更加推荐**使用Docker进行编译**,因为我们在把工具和配置都安装在一个 Docker image 里。这样如果遇到问题,其他人可以复现问题以便帮助。另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
+
+
+
+
+
+
+
+
+
+
+
+##### ***使用Docker进行编译***
+
+为了更好的使用Docker并避免发生问题,我们推荐使用**最高版本的Docker**,关于**安装和使用Docker**的细节请参阅Docker[官方文档](https://docs.docker.com/install/)。
+> 请注意,在MacOS系统下使用Docker时需要使用您的dockerID进行登录,否则将出现`Authenticate Failed`错误。
+
+
+当您已经**正确安装Docker**后你就可以开始**使用Docker编译PaddlePaddle**啦:
+
+1. 进入Mac的终端
+
+2. 请选择您希望储存PaddlePaddle的路径,然后在该路径下使用以下命令将PaddlePaddle的源码从github克隆到本地当前目录下名为Paddle的文件夹中:
+
+ `git clone https://github.com/PaddlePaddle/Paddle.git`
+
+3. 进入Paddle目录下: `cd Paddle`
+
+4. 利用我们提供的镜像(使用该命令您可以不必提前下载镜像):
+
+ `docker run --name paddle-test -v $PWD:/paddle --network=host -it hub.baidubce.com/paddlepaddle/paddle:latest-dev /bin/bash`
+ > --name paddle-test为您创建的Docker容器命名为paddle-test,-v $PWD:/paddle 将当前目录挂载到Docker容器中的/paddle目录下(Linux中PWD变量会展开为当前路径的[绝对路径](https://baike.baidu.com/item/绝对路径/481185)),-it 与宿主机保持交互状态,`hub.baidubce.com/paddlepaddle/paddle:latest-dev` 使用名为`hub.baidubce.com/paddlepaddle/paddle:latest-dev`的镜像创建Docker容器,/bin/bash 进入容器后启动/bin/bash命令。
+
+5. 进入Docker后进入paddle目录下:`cd paddle`
+
+7. 切换到较稳定release分支下进行编译:
+
+ `git checkout release/0.15.0`
+
+8. 创建并进入/paddle/build路径下:
+
+ `mkdir -p /paddle/build && cd /paddle/build`
+
+9. 使用以下命令安装相关依赖:
+
+ `pip install protobuf==3.1.0`
+ > 安装protobuf 3.1.0。
+
+ `apt install patchelf`
+ > 安装patchelf,PatchELF 是一个小而实用的程序,用于修改ELF可执行文件的动态链接器和RPATH。
+
+10. 执行cmake:
+
+ >具体编译选项含义请参见[编译选项表](#Compile)
+
+
+ * 对于需要编译**CPU版本PaddlePaddle**的用户:
+
+ `cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF`
+ > 我们目前不支持MacOS下GPU版本PaddlePaddle的编译
+
+
+
+
+11. 执行编译:
+
+ `make -j$(nproc)`
+ > 使用多核编译
+
+12. 编译成功后进入`/paddle/build/python/dist`目录下找到生成的`.whl`包: `cd /paddle/build/python/dist`
+
+13. 在当前机器或目标机器安装编译好的`.whl`包:
+
+ `pip install (whl包的名字)`
+
+至此您已经成功使用Docker安装PaddlePaddle,您只需要进入Docker容器后运行PaddlePaddle即可,更多Docker使用请参见[Docker官方文档](https://docs.docker.com)。
+
+> 注:PaddlePaddle Docker镜像为了减小体积,默认没有安装`vim`,您可以在容器中执行 `apt-get install -y vim` 安装后,在容器中编辑代码。
+
+恭喜您,现在您已经完成使用Docker编译PaddlePaddle的过程。
+
+
+
+
+
+##### ***验证安装***
+安装完成后您可以使用:`python` 进入Python解释器,然后使用`import paddle.fluid` 验证是否安装成功。
+
+
+##### ***如何卸载PaddlePaddle***
+请使用以下命令卸载PaddlePaddle:
+
+* ***CPU版本的PaddlePaddle***: `pip uninstall PaddlePaddle`
+
+
+
+
+
+
+
+## **FAQ**
+- CentOS6下如何编译python2.7为共享库?
+
+ > 使用以下指令:
+
+ ./configure --prefix=/usr/local/python2.7 --enable-shared
+ make && make install
+
+
+
+- Ubuntu18.04下libidn11找不到?
+
+ > 使用以下指令:
+
+ apt install libidn11
+
+- Ubuntu编译时出现大量的代码段不能识别?
+
+ > 这可能是由于gcc版本不匹配造成的,请在gcc的安装目录下使用以下指令切换到gcc 4.8:
+
+ apt install gcc-4.8 g++-4.8
+ cp gcc gcc.bak
+ cp g++ g++.bak
+ rm gcc
+ rm g++
+ ln -s gcc-4.8 gcc
+ ln -s g++-4.8 g++
+
+
+
+
+
+
+- 遇到paddlepaddle*.whl is not a supported wheel on this platform?
+ > 出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。 请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准, 需要使用最新的pip (>9.0.0) 才可以安装。您可以执行以下指令更新您的pip:
+
+ pip install --upgrade pip
+
+ > 如果仍有问题,可以执行以下命令,查看当前pip支持的安装包格式:
+
+ python -c "import pip; print(pip.pep425tags.get_supported())"
+
+ > 如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包 (本地)是 linux_x86_64, 可以重命名这个whl包为 manylinux1_x86_64 再安装。
+
+- 使用Docker编译出现问题?
+
+ > 请参照GitHub上[Issue12079](https://github.com/PaddlePaddle/Paddle/issues/12079)
+
+- 什么是 Docker?
+
+ 如果您没有听说 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。
+
+- Docker 还是虚拟机?
+
+ 有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
+
+- 为什么用 Docker?
+
+ 把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。
+
+ 另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。
+
+- 可以选择不用Docker吗?
+
+ 当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。
+
+- 学习 Docker 有多难?
+
+ 理解 Docker 并不难,大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。
+ 这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+
+- 可以用 IDE 吗?
+
+ 当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
+
+ 很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
+
+ (global-set-key "\C-cc" 'compile)
+ (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+
+ 就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
+
+- 可以并行编译吗?
+
+ 是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/paddle/scripts/paddle_build.sh)。这个脚本调用`make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+
+- Docker 需要 sudo?
+
+ 如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术,比如 rkt。
+
+- 在 Windows/MacOS 上编译很慢?
+
+ Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考[issue627](https://github.com/PaddlePaddle/Paddle/issues/627)。
+
+- 磁盘不够?
+
+ 本文中的例子里, `docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。 `docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 [这篇文章](https://zaiste.net/posts/removing_docker_containers) 来清理这些内容。
+
+- 在DockerToolbox下使用book时`http://localhost:8888/`无法打开?
+
+ 需要将localhost替换成虚拟机ip,一般需要在浏览器中输入:`http://192.168.99.100:8888/`
+
+- pip install gpu版本的PaddlePaddle后运行出现SegmentFault如下:
+
+ @ 0x7f6c8d214436 paddle::platform::EnforceNotMet::EnforceNotMet()
+
+ @ 0x7f6c8dfed666 paddle::platform::GetCUDADeviceCount()
+
+ @ 0x7f6c8d2b93b6 paddle::framework::InitDevices()
+
+ 出现这个问题原因主要是由于您的显卡驱动低于对应CUDA版本的要求,请保证您的显卡驱动支持所使用的CUDA版本
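+
+ 排查时,您也可以先运行下面这段最小的GPU检测脚本(仅作演示的草稿,假定已安装 paddlepaddle-gpu,使用 `fluid.CUDAPlace` 等公开接口):如果脚本在创建 Executor 或执行时报出同样的错误,通常就是显卡驱动或CUDA版本的问题。
+
+     import paddle.fluid as fluid
+
+     # 在 0 号 GPU 上执行一个常量算子,验证驱动 / CUDA 环境是否可用
+     data = fluid.layers.fill_constant(shape=[1], dtype='int64', value=1)
+     exe = fluid.Executor(fluid.CUDAPlace(0))
+     exe.run(fluid.default_startup_program())
+     print(exe.run(fluid.default_main_program(), fetch_list=[data]))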
+
+
+
+## 附录
+
+### **编译依赖表**
+
+
+| 依赖包名称 | 版本 | 说明 | 安装命令 |
+|------------|------|------|----------|
+| CMake | 3.4 | | |
+| GCC | 4.8 / 5.4 | 推荐使用CentOS的devtools2 | |
+| Python | 2.7.x | 依赖libpython2.7.so | `apt install python-dev` 或 `yum install python-devel` |
+| SWIG | 最低 2.0 | | `apt install swig` 或 `yum install swig` |
+| wget | any | | `apt install wget` 或 `yum install wget` |
+| openblas | any | | |
+| pip | 最低9.0.1 | | `apt install python-pip` 或 `yum install python-pip` |
+| numpy | >=1.12.0 | | `pip install numpy==1.14.0` |
+| protobuf | 3.1.0 | | `pip install protobuf==3.1.0` |
+| wheel | any | | `pip install wheel` |
+| patchELF | any | | `apt install patchelf` 或参见github patchELF 官方文档 |
+| go | >=1.8 | 可选 | |
+
+
+
+***
+
+
+### **编译选项表**
+
+
+
+| 选项 | 说明 | 默认值 |
+|------|------|--------|
+| WITH_GPU | 是否支持GPU | ON |
+| WITH_C_API | 是否仅编译CAPI | OFF |
+| WITH_DOUBLE | 是否使用双精度浮点数 | OFF |
+| WITH_DSO | 是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库 | ON |
+| WITH_AVX | 是否编译含有AVX指令集的PaddlePaddle二进制文件 | ON |
+| WITH_PYTHON | 是否内嵌PYTHON解释器 | ON |
+| WITH_STYLE_CHECK | 是否编译时进行代码风格检查 | ON |
+| WITH_TESTING | 是否开启单元测试 | OFF |
+| WITH_DOC | 是否编译中英文文档 | OFF |
+| WITH_SWIG_PY | 是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练 | Auto |
+| WITH_GOLANG | 是否编译go语言的可容错parameter server | OFF |
+| WITH_MKL | 是否使用MKL数学库,如果为否则使用OpenBLAS | ON |
+
+
+**BLAS**
+
+PaddlePaddle支持 [MKL](https://software.intel.com/en-us/mkl) 和 [OpenBLAS](http://www.openblas.net) 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集,还会下载MKL-DNN数学库,详细参考[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake)。
+
+如果关闭MKL,则会使用OpenBLAS作为BLAS库。
+
+**CUDA/cuDNN**
+
+PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。 使用参数 `-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。
+
+PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。 我们推荐使用最新版本的cuDNN。
+
+**编译选项的设置**
+
+PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径(`/usr/lib` 和 `/usr/local/lib`)中搜索这几个库,同时也会读取相关路径变量来进行搜索。通过使用`-D`命令可以设置,例如:
+
+> `cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5`
+
+**注意**:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录( rm -rf )后,再指定。
+
+
+***
+
+
+### **安装包列表**
+
+
+
+| 版本号 | 版本说明 |
+|--------|----------|
+| paddlepaddle-gpu==0.15.0 | 使用CUDA 9.0和cuDNN 7编译的0.15.0版本 |
+| paddlepaddle-gpu==0.15.0.post87 | 使用CUDA 8.0和cuDNN 7编译的0.15.0版本 |
+| paddlepaddle-gpu==0.15.0.post85 | 使用CUDA 8.0和cuDNN 5编译的0.15.0版本 |
+| paddlepaddle-gpu==0.13.0 | 使用CUDA 9.0和cuDNN 7编译的0.13.0版本 |
+| paddlepaddle-gpu==0.12.0 | 使用CUDA 8.0和cuDNN 5编译的0.12.0版本 |
+| paddlepaddle-gpu==0.11.0.post87 | 使用CUDA 8.0和cuDNN 7编译的0.11.0版本 |
+| paddlepaddle-gpu==0.11.0.post85 | 使用CUDA 8.0和cuDNN 5编译的0.11.0版本 |
+| paddlepaddle-gpu==0.11.0 | 使用CUDA 7.5和cuDNN 5编译的0.11.0版本 |
+
+您可以在 [Release History](https://pypi.org/project/paddlepaddle-gpu/#history) 中找到PaddlePaddle-gpu的各个发行版本。
+
+***
+
+
+### **安装镜像表及简介**
+
+
+| 版本号 | 版本说明 |
+|--------|----------|
+| hub.baidubce.com/paddlepaddle/paddle:latest | 最新的预先安装好PaddlePaddle CPU版本的镜像 |
+| hub.baidubce.com/paddlepaddle/paddle:latest-dev | 最新的PaddlePaddle的开发环境 |
+| hub.baidubce.com/paddlepaddle/paddle:[Version] | 将version换成具体的版本,历史版本的预安装好PaddlePaddle的镜像 |
+| hub.baidubce.com/paddlepaddle/paddle:latest-gpu | 最新的预先安装好PaddlePaddle GPU版本的镜像 |
+
+
+您可以在 [DockerHub](https://hub.docker.com/r/paddlepaddle/paddle/tags/) 中找到PaddlePaddle的各个发行的版本的docker镜像。
+
+
+
+
+***
+
+
+### **多版本whl包列表**
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+### 在Docker中执行PaddlePaddle训练程序
+
+***
+
+假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序: `train.py` (可以参考
+[PaddlePaddleBook](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html)
+编写),就可以使用下面的命令开始执行训练:
+
+ cd /home/work
+ docker run -it -v $PWD:/work hub.baidubce.com/paddlepaddle/paddle /work/train.py
+
+上述命令中,`-it` 参数说明容器已和本机交互式运行;`-v $PWD:/work`
+指定将当前路径(Linux中PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 `/work`
+目录;`hub.baidubce.com/paddlepaddle/paddle` 指定需要使用的镜像;最后`/work/train.py`为容器内执行的命令,即运行训练程序。
+
+当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码:
+
+ docker run -it -v $PWD:/work hub.baidubce.com/paddlepaddle/paddle /bin/bash
+ cd /work
+ python train.py
+
+**注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** `apt-get install -y vim` **安装后,在容器中编辑代码。**
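+
+如果您还没有现成的 `train.py`,下面是一个极简的线性回归示例草稿,仅用于演示上述挂载与运行流程(假定使用 Fluid 的 `layers.data`、`layers.fc`、`square_error_cost`、`optimizer.SGD` 等公开接口,训练数据为随机生成,并非 PaddlePaddleBook 中的完整示例)。将其保存为 `/home/work/train.py` 即可用上面的命令运行:
+
+    import numpy
+    import paddle.fluid as fluid
+
+    # 定义一个简单的线性回归网络:13 维输入 -> 1 维输出
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+    fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_cost)
+
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    # 用随机数据跑一个迭代,验证训练流程能否走通
+    x_data = numpy.random.random((8, 13)).astype('float32')
+    y_data = numpy.random.random((8, 1)).astype('float32')
+    loss, = exe.run(fluid.default_main_program(),
+                    feed={'x': x_data, 'y': y_data},
+                    fetch_list=[avg_cost])
+    print(loss)  # 输出包含平均损失的 numpy 数组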
+
+
+
+### 使用Docker启动PaddlePaddle Book教程
+
+***
+
+使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。
+PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
+如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。
+大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。
+
+我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行:
+
+`docker run -p 8888:8888 paddlepaddle/book`
+
+国内用户可以使用下面的镜像源来加速访问:
+
+`docker run -p 8888:8888 hub.baidubce.com/paddlepaddle/book`
+
+然后在浏览器中输入以下网址:
+
+`http://localhost:8888/`
+
+就这么简单,享受您的旅程!如有其他问题请参见[FAQ](#FAQ)
+
+
+### 使用Docker执行GPU训练
+
+***
+
+为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用
+[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。
+请不要忘记提前在物理机上安装GPU最新驱动。
+
+`nvidia-docker run -it -v $PWD:/work hub.baidubce.com/paddlepaddle/paddle:latest-gpu /bin/bash`
+
+**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:**
+
+ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') \
+ $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+ docker run ${CUDA_SO} \
+ ${DEVICES} -it hub.baidubce.com/paddlepaddle/paddle:latest-gpu
+
+
+**关于AVX:**
+
+AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
+是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独[编译](/build_from_source_cn.html) PaddlePaddle为no-avx版本。
+
+以下指令能检查Linux电脑是否支持AVX:
+
+`if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi`
+
+如果输出是No,就需要选择使用no-AVX的镜像
diff --git a/doc/fluid/beginners_guide/quick_start/fit_a_line/README.cn.md b/doc/fluid/beginners_guide/quick_start/fit_a_line/README.cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..0074b2df726b61a02f9a8e98116b639ab7e562e4
--- /dev/null
+++ b/doc/fluid/beginners_guide/quick_start/fit_a_line/README.cn.md
@@ -0,0 +1 @@
+../../../../../external/book/01.fit_a_line/README.cn.md
\ No newline at end of file
diff --git a/doc/fluid/beginners_guide/quick_start/fit_a_line/image b/doc/fluid/beginners_guide/quick_start/fit_a_line/image
new file mode 120000
index 0000000000000000000000000000000000000000..ae7c57fe36c2e50f67f81b6797af80df03455c12
--- /dev/null
+++ b/doc/fluid/beginners_guide/quick_start/fit_a_line/image
@@ -0,0 +1 @@
+../../../../../external/book/01.fit_a_line/image
\ No newline at end of file
diff --git a/source/beginners_guide/quick_start/index.rst b/doc/fluid/beginners_guide/quick_start/index.rst
similarity index 100%
rename from source/beginners_guide/quick_start/index.rst
rename to doc/fluid/beginners_guide/quick_start/index.rst
diff --git a/doc/fluid/beginners_guide/quick_start/recognize_digits/README.cn.md b/doc/fluid/beginners_guide/quick_start/recognize_digits/README.cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..c8b9a16180e19dabfebdbc07f8145e7e4c873a63
--- /dev/null
+++ b/doc/fluid/beginners_guide/quick_start/recognize_digits/README.cn.md
@@ -0,0 +1 @@
+../../../../../external/book/02.recognize_digits/README.cn.md
\ No newline at end of file
diff --git a/doc/fluid/beginners_guide/quick_start/recognize_digits/image b/doc/fluid/beginners_guide/quick_start/recognize_digits/image
new file mode 120000
index 0000000000000000000000000000000000000000..2343a4bf23c308fcd0fe7fad0894f8c346aef07c
--- /dev/null
+++ b/doc/fluid/beginners_guide/quick_start/recognize_digits/image
@@ -0,0 +1 @@
+../../../../../external/book/02.recognize_digits/image
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/build_from_source_cn.rst b/doc/fluid/build_and_install/build_from_source_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d0dacb104f148c2aeb323365cbd6f014ae00ed5a
--- /dev/null
+++ b/doc/fluid/build_and_install/build_from_source_cn.rst
@@ -0,0 +1,225 @@
+从源码编译
+======================
+
+.. _requirements:
+
+需要的软硬件
+----------------
+
+为了编译PaddlePaddle,我们需要
+
+1. 一台电脑,可以装的是 Linux, Windows 或者 MacOS 操作系统
+2. Docker
+
+不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker 镜像里。
+
+.. _build_step:
+
+编译方法
+----------------
+
+PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像
+可以在 `这里 `__ 找到,您也可以
+在 `这里 `__ 找到 paddle_manylinux_devel
+镜像的编译以及使用方法。或者参考下述可选步骤,从源码中构建用于编译PaddlePaddle的Docker镜像。
+
+如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 :ref:`编译依赖 <compile_deps>` 之后,才能开始编译。
+
+编译PaddlePaddle,需要执行:
+
+.. code-block:: bash
+
+ # 1. 获取源码
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ cd Paddle
+ # 2. 可选步骤:源码中构建用于编译PaddlePaddle的Docker镜像
+ docker build -t paddle:dev .
+ # 3. 执行下面的命令编译CPU-Only的二进制
+ docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
+ # 4. 或者也可以使用为上述可选步骤构建的镜像(必须先执行第2步)
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
+
+注:
+
+- 上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。
+
+- 如果您使用的是 manylinux 的镜像进行编译, 那么您需要通过环境变量 :code:`PYTHON_ABI` 来指定一个 `Python ABI `__.
+  PaddlePaddle目前支持的 Python ABI 有 :code:`cp27-cp27m` 和 :code:`cp27-cp27mu`.
+
+编译完成后会在build/python/dist目录下生成输出的whl包,可以选择在当前机器安装,也可以拷贝到目标机器安装:
+
+.. code-block:: bash
+
+ pip install build/python/dist/*.whl
+
+如果机器中已经安装过PaddlePaddle,有两种方法:
+
+.. code-block:: bash
+
+    # 1. 先卸载之前的版本,再重新安装
+    pip uninstall paddlepaddle
+    pip install build/python/dist/*.whl
+
+    # 2. 直接升级到更新的版本
+    pip install build/python/dist/*.whl -U
+
+.. _run_test:
+
+执行单元测试
+----------------
+
+如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法:
+
+设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。
+开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
+
+.. code-block:: bash
+
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
+
+如果期望只执行其中一个单元测试(比如 :code:`test_sum_op` ):
+
+.. code-block:: bash
+
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+ ./paddle/scripts/paddle_build.sh build
+ cd build
+ ctest -R test_sum_op -V
+
+.. _faq_docker:
+
+常见问题
+----------------
+
+- 什么是 Docker?
+
+  如果您没有听说过 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。
+
+- Docker 还是虚拟机?
+
+ 有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
+
+- 为什么用 Docker?
+
+ 把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。
+
+ 另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。
+
+- 我可以选择不用Docker吗?
+
+  当然可以。大家可以用与把开发工具安装进 Docker image 相同的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。
+
+- 学习 Docker 有多难?
+
+ 理解 Docker 并不难,大概花十分钟看一下 `如何使用Docker `_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+
+- 我可以用 IDE 吗?
+
+ 当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
+
+ 很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
+
+ .. code-block:: emacs
+
+ (global-set-key "\C-cc" 'compile)
+ (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+
+ 就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
+
+- 可以并行编译吗?
+
+ 是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 `_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+
+- Docker 需要 sudo
+
+  如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目正在努力支持其他不需要 sudo 的容器技术,比如 rkt。
+
+- 在 Windows/MacOS 上编译很慢
+
+ Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 `_ 。
+
+- 磁盘不够
+
+ 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `如何删除Docker Container `_ 来清理这些内容。
+
+
+.. _compile_deps:
+
+附录:编译依赖
+----------------
+
+PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。
+
+.. csv-table:: PaddlePaddle编译依赖
+ :header: "依赖", "版本", "说明"
+ :widths: 10, 15, 30
+
+ "CMake", ">=3.2", ""
+ "GCC", "4.8.2", "推荐使用CentOS的devtools2"
+ "Python", "2.7.x", "依赖libpython2.7.so"
+ "pip", ">=9.0", ""
+ "numpy", "", ""
+ "SWIG", ">=2.0", ""
+ "Go", ">=1.8", "可选"
+
+
+.. _build_options:
+
+附录:编译选项
+----------------
+
+PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。
+用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考
+`官方文档 `_ 。
+
+在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如:
+
+.. code-block:: bash
+
+ cmake .. -DWITH_GPU=OFF
+
+.. csv-table:: 编译选项说明
+ :header: "选项", "说明", "默认值"
+ :widths: 1, 7, 2
+
+ "WITH_GPU", "是否支持GPU", "ON"
+ "WITH_C_API", "是否仅编译CAPI", "OFF"
+ "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
+ "WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON"
+ "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
+ "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
+ "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
+ "WITH_TESTING", "是否开启单元测试", "OFF"
+ "WITH_DOC", "是否编译中英文文档", "OFF"
+ "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto"
+ "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF"
+ "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON"
+
+BLAS
++++++
+
+PaddlePaddle支持 `MKL `_ 和
+`OpenBLAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集,
+还会下载MKL-DNN数学库,详细参考 `mkldnn设计文档 `_ 。
+
+如果关闭MKL,则会使用OpenBLAS作为BLAS库。
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
+使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。
+
+PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。
+我们推荐使用最新版本的cuDNN。
+
+编译选项的设置
+++++++++++++++
+
+PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。通过使用 ``-D`` 参数可以设置,例如
+
+.. code-block:: bash
+
+ cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(** :code:`rm -rf` **)后,再指定。**
diff --git a/doc/fluid/build_and_install/build_from_source_en.rst b/doc/fluid/build_and_install/build_from_source_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..664b68da8b7dd3e005ebf3ec34de77729e5ab355
--- /dev/null
+++ b/doc/fluid/build_and_install/build_from_source_en.rst
@@ -0,0 +1,237 @@
+Build from Sources
+==========================
+
+.. _requirements:
+
+Requirements
+----------------
+
+To build PaddlePaddle, you need
+
+1. A computer -- Linux, Windows, MacOS.
+2. Docker.
+
+Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image.
+We run all the tools by running this image.
+
+.. _build_step:
+
+How To Build
+----------------
+
+You need to use Docker to build PaddlePaddle
+to avoid installing dependencies by yourself. We have several pre-built
+Docker images `here `_ ,
+and you can also find how to build and use the paddle_manylinux_devel Docker image
+`here `__ .
+Alternatively, you can build your own image from source as the optional step below:
+
+If you don't wish to use Docker, you need to manually install the compile dependencies listed in :ref:`Compile Dependencies <compile_deps>` before starting the build.
+
+.. code-block:: bash
+
+ # 1. clone the source code
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ cd Paddle
+ # 2. Optional: build development docker image from source
+ docker build -t paddle:dev .
+ # 3. Run the following command to build a CPU-Only binaries
+ docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
+ # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
+
+NOTE:
+
+- The above command tries to mount the current working directory (the root directory of the source code)
+  into the :code:`/paddle` directory inside the Docker container.
+
+- You need to pass in the required environment variable :code:`PYTHON_ABI` to specify a `Python ABI `__.
+  Currently, the Python ABIs supported by PaddlePaddle are :code:`cp27-cp27m` and :code:`cp27-cp27mu` .
+
+When the build finishes, you can find the output whl package under
+build/python/dist. You can then choose to install the whl on the local
+machine or copy it to the target machine.
+
+.. code-block:: bash
+
+ pip install build/python/dist/*.whl
+
+If the machine has installed PaddlePaddle before, there are two methods:
+
+.. code-block:: bash
+
+    # 1. uninstall and reinstall
+    pip uninstall paddlepaddle
+    pip install build/python/dist/*.whl
+
+    # 2. upgrade directly
+    pip install build/python/dist/*.whl -U
+
+.. _run_test:
+
+Run Tests
+----------------
+
+If you wish to run the tests, you may follow the steps below:
+
+When using Docker, setting :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run the tests immediately after the build.
+Setting :code:`WITH_GPU=ON` will also run the tests on GPU.
+
+.. code-block:: bash
+
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
+
+If you wish to run only one unit test, like :code:`test_sum_op`:
+
+.. code-block:: bash
+
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+ ./paddle/scripts/paddle_build.sh build
+ cd build
+ ctest -R test_sum_op -V
+
+.. _faq_docker:
+
+Frequently Asked Questions
+---------------------------
+
+- What is Docker?
+
+ If you haven't heard of it, consider it something like Python's virtualenv.
+
+- Docker or virtual machine?
+
+  Some people compare Docker with VMs, but Docker doesn't virtualize any hardware nor run a guest OS, which means there is no compromise on performance.
+
+- Why Docker?
+
+ Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
+
+ Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
+
+- Can I choose not to use Docker?
+
+ Sure, you don't have to install build tools into a Docker image; instead, you can install them on your local computer. This document exists because Docker would make the development way easier.
+
+- How difficult is it to learn Docker?
+
+  It takes about ten minutes to read `an introductory article `_ , and it saves you more than an hour of installing and configuring all the required build tools, especially when new versions of PaddlePaddle require new tools. Not to mention the time saved when other people try to reproduce an issue you have.
+
+- Can I use my favorite IDE?
+
+ Yes, of course. The source code resides on your local computer, and you can edit it using whatever editor you like.
+
+ Many PaddlePaddle developers are using Emacs. They add the following few lines into their `~/.emacs` configure file:
+
+ .. code-block:: emacs
+
+ (global-set-key "\C-cc" 'compile)
+ (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+
+ so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
+
+- Does Docker do parallel building?
+
+  Our build Docker image runs a `Bash script `_ , which calls `make -j$(nproc)` to start as many processes as the number of your CPU cores.
+
+- Docker requires sudo
+
+ An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly. If you use a shared computer for development, please ask the administrator to install and configure Docker. We will do our best to support rkt, another container technology that doesn't require sudo.
+
+- Docker on Windows/MacOS builds slowly
+
+ On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so to make the building efficient. Please refer to `this issue `_ for details.
+
+- Not enough disk space
+
+  Examples in this article use the `--rm` option with the `docker run` command. This option ensures that stopped containers do not remain on disk. We can use `docker ps -a` to list all containers, including stopped ones. Sometimes `docker build` generates intermediate dangling images, which also take disk space. To clean them up, please refer to `this article `_ .
+
+.. _compile_deps:
+
+Appendix: Compile Dependencies
+-------------------------------
+
+PaddlePaddle needs the following dependencies when compiling; other dependencies
+will be downloaded automatically.
+
+.. csv-table:: PaddlePaddle Compile Dependencies
+ :header: "Dependency", "Version", "Description"
+ :widths: 10, 15, 30
+
+ "CMake", ">=3.2", ""
+ "GCC", "4.8.2", "Recommend devtools2 for CentOS"
+ "Python", "2.7.x", "Need libpython2.7.so"
+ "pip", ">=9.0", ""
+ "numpy", "", ""
+ "SWIG", ">=2.0", ""
+ "Go", ">=1.8", "Optional"
+
+
+.. _build_options:
+
+Appendix: Build Options
+-------------------------
+
+Build options include whether to build binaries for CPU or GPU, which BLAS
+library to use, etc. You may pass these settings when running cmake.
+For a detailed cmake tutorial please refer to `here `__ .
+
+
+You can add :code:`-D` argument to pass such options, like:
+
+.. code-block:: bash
+
+ cmake .. -DWITH_GPU=OFF
+
+.. csv-table:: Bool Type Options
+ :header: "Option", "Description", "Default"
+ :widths: 1, 7, 2
+
+ "WITH_GPU", "Build with GPU support", "ON"
+ "WITH_C_API", "Build only CAPI", "OFF"
+ "WITH_DOUBLE", "Build with double precision", "OFF"
+ "WITH_DSO", "Dynamically load CUDA libraries", "ON"
+ "WITH_AVX", "Build with AVX support", "ON"
+ "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
+ "WITH_STYLE_CHECK", "Check code style when building", "ON"
+ "WITH_TESTING", "Build unit tests", "OFF"
+ "WITH_DOC", "Build documentations", "OFF"
+ "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
+ "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "OFF"
+ "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
+
+
+BLAS
++++++
+
+PaddlePaddle supports `MKL `_ and
+`OpenBLAS `_ as BLAS libraries. By default it uses MKL.
+If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded
+and used; see the `details `_ for more information.
+
+If you choose not to use MKL, then OpenBLAS will be used.
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle will automatically find CUDA and cuDNN when compiling and running.
+The parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect the SM architecture
+automatically in order to speed up the build.
+
+PaddlePaddle can be built with any cuDNN version later than v5.1, and we intend to
+keep up with the latest cuDNN versions. Be sure to run with the same version of cuDNN
+that you built with.
+
+Pass Compile Options
+++++++++++++++++++++++
+
+You can pass compile options to use the intended BLAS/CUDA/cuDNN libraries.
+When running the cmake command, it will search system paths like
+:code:`/usr/lib:/usr/local/lib` and then search the paths that you
+passed to cmake, for example:
+
+.. code-block:: bash
+
+ cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.**
diff --git a/doc/fluid/build_and_install/docker_install_cn.rst b/doc/fluid/build_and_install/docker_install_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..106c86bace075764c84bc2a7f7cb09d466fa8794
--- /dev/null
+++ b/doc/fluid/build_and_install/docker_install_cn.rst
@@ -0,0 +1,146 @@
+使用Docker安装运行
+================================
+
+使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。
+您可以在 `Docker官网 `_ 获得基本的Docker安装和使用方法。
+
+如果您在使用Windows,可以参考
+`这篇 `_
+教程,完成在Windows上安装和使用Docker。
+
+在了解Docker的基本使用方法之后,即可开始下面的步骤:
+
+.. _docker_pull:
+
+获取PaddlePaddle的Docker镜像
+------------------------------
+
+执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl:
+
+ .. code-block:: bash
+
+ docker pull paddlepaddle/paddle
+
+对于国内用户,我们提供了加速访问的镜像源:
+
+ .. code-block:: bash
+
+ docker pull docker.paddlepaddlehub.com/paddle
+
+下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像:
+
+ .. code-block:: bash
+
+ docker pull paddlepaddle/paddle:latest-gpu
+ docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
+
+选择下载使用不同的BLAS库的Docker镜像:
+
+ .. code-block:: bash
+
+ # 默认是使用MKL的镜像
+ docker pull paddlepaddle/paddle
+ # 使用OpenBLAS的镜像
+ docker pull paddlepaddle/paddle:latest-openblas
+
+下载指定版本的Docker镜像,可以从 `DockerHub网站 `_ 获取可选的tag,并执行下面的命令:
+
+ .. code-block:: bash
+
+ docker pull paddlepaddle/paddle:[tag]
+ # 比如:
+ docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
+
+.. _docker_run:
+
+在Docker中执行PaddlePaddle训练程序
+----------------------------------
+
+假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考
+`PaddlePaddleBook `_
+编写),就可以使用下面的命令开始执行训练:
+
+ .. code-block:: bash
+
+ cd /home/work
+ docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+
+上述命令中, :code:`-it` 参数说明容器以交互式方式运行; :code:`-v $PWD:/work`
+指定将当前路径(Linux中$PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 :code:`/work`
+目录; :code:`paddlepaddle/paddle` 指定需要使用的容器; 最后 :code:`/work/train.py`
+为容器内执行的命令,即运行训练程序。
+
+当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码:
+
+ .. code-block:: bash
+
+ docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+ cd /work
+ python train.py
+
+**注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** :code:`apt-get install -y vim` **安装后,在容器中编辑代码。**
+
+.. _docker_run_book:
+
+使用Docker启动PaddlePaddle Book教程
+-----------------------------------
+
+使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。
+PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
+如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。
+大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。
+
+我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行:
+
+ .. code-block:: bash
+
+ docker run -p 8888:8888 paddlepaddle/book
+
+国内用户可以使用下面的镜像源来加速访问:
+
+ .. code-block:: bash
+
+ docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
+然后在浏览器中输入以下网址:
+
+ .. code-block:: text
+
+ http://localhost:8888/
+
+就这么简单,享受您的旅程!
+
+.. _docker_run_gpu:
+
+使用Docker执行GPU训练
+------------------------------
+
+为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用
+`nvidia-docker `_ 来运行镜像。
+请不要忘记提前在物理机上安装GPU最新驱动。
+
+ .. code-block:: bash
+
+ nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
+
+**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:**
+
+ .. code-block:: bash
+
+ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+**关于AVX:**
+
+AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
+是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独
+`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。
+
+以下指令能检查Linux电脑是否支持AVX:
+
+ .. code-block:: bash
+
+ if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+
+如果输出是No,就需要选择使用no-AVX的镜像。
diff --git a/doc/fluid/build_and_install/docker_install_en.rst b/doc/fluid/build_and_install/docker_install_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..25aecb8d0da9feb00006da6259b529b7011d91cb
--- /dev/null
+++ b/doc/fluid/build_and_install/docker_install_en.rst
@@ -0,0 +1,153 @@
+Run in Docker Containers
+=================================
+
+Running PaddlePaddle in a Docker container means that you don't need to care about
+runtime dependencies, and you can also run it under Windows. You can find
+Docker tutorials `here `_ .
+
+If you are using Windows, please refer to
+`this `_
+tutorial to start running Docker under Windows.
+
+After you've read the above tutorials, you may proceed with the following steps.
+
+.. _docker_pull:
+
+Pull PaddlePaddle Docker Image
+------------------------------
+
+Run the following command to download the latest Docker images, the version is cpu_avx_mkl:
+
+ .. code-block:: bash
+
+ docker pull paddlepaddle/paddle
+
+For users in China, we provide a faster mirror:
+
+ .. code-block:: bash
+
+ docker pull docker.paddlepaddlehub.com/paddle
+
+Download GPU version (cuda8.0_cudnn5_avx_mkl) images:
+
+ .. code-block:: bash
+
+ docker pull paddlepaddle/paddle:latest-gpu
+ docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
+
+Choose between different BLAS version:
+
+ .. code-block:: bash
+
+ # image using MKL by default
+ docker pull paddlepaddle/paddle
+ # image using OpenBLAS
+ docker pull paddlepaddle/paddle:latest-openblas
+
+
+If you want to use legacy versions, choose a tag from
+`DockerHub `_
+and run:
+
+ .. code-block:: bash
+
+ docker pull paddlepaddle/paddle:[tag]
+ # i.e.
+ docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
+
+.. _docker_run:
+
+Launch your training program in Docker
+--------------------------------------
+
+Assume that you have already written a PaddlePaddle program
+named :code:`train.py` under directory :code:`/home/work` (refer to
+`PaddlePaddleBook `_
+for more samples), then run the following command:
+
+ .. code-block:: bash
+
+ cd /home/work
+ docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+
+In the above command, :code:`-it` means run the container interactively;
+:code:`-v $PWD:/work` mounts the current directory ($PWD expands to the
+current absolute path in Linux) under :code:`/work` in the container;
+:code:`paddlepaddle/paddle` specifies the image to use; finally,
+:code:`/work/train.py` is the command to run inside the container.
+
+Also, you can go into the container shell, run or debug your code
+interactively:
+
+ .. code-block:: bash
+
+ docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+ cd /work
+ python train.py
+
+**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.**
+
+.. _docker_run_book:
+
+PaddlePaddle Book
+------------------
+
+With Docker, you can create a container serving the PaddlePaddle Book as a Jupyter
+Notebook in one minute. PaddlePaddle Book is an interactive Jupyter Notebook
+for users and developers. If you want to
+dig deeper into deep learning, PaddlePaddle Book is definitely your best choice.
+
+We provide a packaged book image, simply issue the command:
+
+ .. code-block:: bash
+
+ docker run -p 8888:8888 paddlepaddle/book
+
+For users in China, we provide a faster mirror:
+
+ .. code-block:: bash
+
+ docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
+Then open the following address in your local browser:
+
+ .. code-block:: text
+
+ http://localhost:8888/
+
+That's all. Enjoy your journey!
+
+.. _docker_run_gpu:
+
+Train with Docker with GPU
+------------------------------
+
+We recommend using
+`nvidia-docker `_
+to run GPU training jobs. Please ensure you have the latest
+GPU driver installed before moving on.
+
+ .. code-block:: bash
+
+ nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
+
+**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
+
+ .. code-block:: bash
+
+ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+**About AVX:**
+
+AVX is a CPU instruction set extension that can accelerate PaddlePaddle's calculations.
+The latest PaddlePaddle Docker image turns AVX on by default, so if your
+computer doesn't support AVX, you'll probably need to
+`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`.
+
+The following command will tell you whether your computer supports AVX.
+
+ .. code-block:: bash
+
+ if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1a9305ac4b6578c14a962f223c647a71e3b8a72b
--- /dev/null
+++ b/doc/fluid/build_and_install/index_cn.rst
@@ -0,0 +1,56 @@
+安装与编译
+==========
+
+.. _install_steps:
+
+PaddlePaddle针对不同的用户群体提供了多种安装方式。
+
+专注深度学习模型开发
+--------------------
+
+PaddlePaddle提供了多种python wheel包,可通过pip一键安装:
+
+.. toctree::
+ :maxdepth: 1
+
+ pip_install_cn.rst
+
+这是最便捷的安装方式,请根据机器配置和系统选择对应的安装包。
+
+关注底层框架
+-------------
+
+PaddlePaddle提供了基于Docker的安装方式,请参照以下教程:
+
+.. toctree::
+ :maxdepth: 1
+
+ docker_install_cn.rst
+
+我们推荐在Docker中运行PaddlePaddle,该方式具有以下优势:
+
+- 无需单独安装第三方依赖
+- 方便分享运行时环境,易于问题的复现
+
+对于有定制化二进制文件需求的用户,我们同样提供了从源码编译安装PaddlePaddle的方法:
+
+.. toctree::
+ :maxdepth: 1
+
+ build_from_source_cn.rst
+
+.. warning::
+
+ 需要提醒的是,这种安装方式会涉及到一些第三方库的下载、编译及安装,整个安装过程耗时较长。
+
+
+常见问题汇总
+--------------
+
+如果在安装过程中遇到了问题,请先尝试在下面的页面寻找答案:
+
+:ref:`常见问题解答 `
+
+如果问题没有得到解决,欢迎向PaddlePaddle社区反馈问题:
+
+`创建issue `_
diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7990bacbd6966e88e8763e9c5709e410f7e9fed4
--- /dev/null
+++ b/doc/fluid/build_and_install/index_en.rst
@@ -0,0 +1,56 @@
+Install and Compile
+======================
+
+.. _install_steps:
+
+PaddlePaddle provides various installation methods for different kinds of users.
+
+Focus on Deep Learning Model Development
+----------------------------------------
+
+PaddlePaddle provides Python wheel packages that can be installed with pip:
+
+.. toctree::
+ :maxdepth: 1
+
+ pip_install_en.rst
+
+This is the most convenient installation method. Please choose the installation package that matches your machine configuration and operating system.
+
+Focus on the Underlying Framework
+----------------------------------
+
+PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
+
+.. toctree::
+ :maxdepth: 1
+
+ docker_install_en.rst
+
+We recommend running PaddlePaddle in Docker. This method has the following advantages:
+
+- Does not require installation of third-party dependencies.
+- Easy to share runtime environment.
+
+Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below:
+
+.. toctree::
+ :maxdepth: 1
+
+ build_from_source_en.rst
+
+.. warning::
+
+ One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus this process of installation is more time consuming.
+
+
+FAQ
+-----------
+
+For any problems during installation, please refer to the page below for answers:
+
+:ref:`FAQ `
+
+If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community:
+
+`Create an issue `_
diff --git a/source/beginners_guide/install/paddleci.png b/doc/fluid/build_and_install/paddleci.png
similarity index 100%
rename from source/beginners_guide/install/paddleci.png
rename to doc/fluid/build_and_install/paddleci.png
diff --git a/doc/fluid/build_and_install/pip_install_cn.rst b/doc/fluid/build_and_install/pip_install_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..095da19cd41d29bfa72ab23abd24bec45f925a86
--- /dev/null
+++ b/doc/fluid/build_and_install/pip_install_cn.rst
@@ -0,0 +1,105 @@
+使用pip安装
+================================
+
+PaddlePaddle可以使用常用的Python包管理工具
+`pip `_
+完成安装,并可以在大多数主流的Linux操作系统以及MacOS上执行。
+
+.. _pip_install:
+
+使用pip安装
+------------------------------
+
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。
+
+ .. code-block:: bash
+
+ pip install paddlepaddle
+
+当前的默认版本为0.12.0,cpu_avx_openblas,您可以通过指定版本号来安装其它版本,例如:
+
+ .. code-block:: bash
+
+ pip install paddlepaddle==0.11.0
+
+
+如果需要安装支持GPU的版本(cuda8.0_cudnn5_avx_openblas),需要执行:
+
+ .. code-block:: bash
+
+ pip install paddlepaddle-gpu
+
+当前的默认版本也是0.12.0,PaddlePaddle针对不同需求提供了更多版本的安装包,部分列表如下:
+
+================================= ========================================
+版本号 版本说明
+================================= ========================================
+paddlepaddle-gpu==0.12.0 使用CUDA 8.0和cuDNN 5编译的0.12.0版本
+paddlepaddle-gpu==0.11.0.post87 使用CUDA 8.0和cuDNN 7编译的0.11.0版本
+paddlepaddle-gpu==0.11.0.post8 使用CUDA 8.0和cuDNN 5编译的0.11.0版本
+paddlepaddle-gpu==0.11.0 使用CUDA 7.5和cuDNN 5编译的0.11.0版本
+================================= ========================================
+
+您可以在 `Release History `_ 中找到paddlepaddle-gpu的各个发行版本。
+
+如果需要获取并安装最新的(开发分支)PaddlePaddle,可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装,
+您可以从下面的表格中找到需要的版本:
+
+如果在点击下面链接时出现如下登录界面,点击“Log in as guest”即可开始下载:
+
+.. image:: paddleci.png
+ :scale: 50 %
+ :align: center
+
+.. csv-table:: 各个版本最新的whl包
+ :header: "版本说明", "cp27-cp27mu", "cp27-cp27m"
+ :widths: 1, 3, 3
+
+ "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__"
+ "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__"
+ "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_"
+ "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__"
+ "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__"
+ "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__"
+
+.. _pip_dependency:
+
+运行环境依赖
+------------------------------
+
+PaddlePaddle安装包由于不仅仅包含.py程序,而且包含了C++编写的部分,所以我们确保发布的二进制包可以支持主流的Linux操作系统,比如CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上。
+
+PaddlePaddle发布的安装包会尽量对齐 `manylinux1 `_ 标准,通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上,而且CentOS 5即将停止维护,所以我们默认使用CentOS 6作为标准编译环境。
+
+.. csv-table:: PaddlePaddle环境依赖
+ :header: "依赖", "版本", "说明"
+ :widths: 10, 15, 30
+
+ "操作系统", "Linux, MacOS", "CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上"
+ "Python", "2.7.x", "暂时不支持Python3"
+ "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号"
+ "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号"
+ "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号"
+
+.. _pip_faq:
+
+安装常见问题和解决方法
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+
+ 出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准,需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip:
+
+ .. code-block:: bash
+
+ pip install --upgrade pip
+
+ 如果仍然存在问题,可以执行:
+
+ .. code-block:: bash
+
+ python -c "import pip; print(pip.pep425tags.get_supported())"
+
+ 获取当前系统支持的安装包格式,并检查和需安装的包是否匹配。pypi安装包可以在 `这个 `_ 链接中找到。
+
+ 如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。
diff --git a/doc/fluid/build_and_install/pip_install_en.rst b/doc/fluid/build_and_install/pip_install_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8406e4aa1fbb953c3b615b10d1bcb2c45974dde0
--- /dev/null
+++ b/doc/fluid/build_and_install/pip_install_en.rst
@@ -0,0 +1,123 @@
+Install using pip
+================================
+
+You can use current widely used Python package management
+tool `pip `_
+to install PaddlePaddle. This method can be used in
+most of current Linux systems or MacOS.
+
+.. _pip_install:
+
+Install using pip
+------------------------------
+
+Run the following command to install PaddlePaddle on the current
+machine; it will also download and install the required dependencies.
+
+ .. code-block:: bash
+
+ pip install paddlepaddle
+
+The default version is 0.12.0 (cpu_avx_openblas); you can specify another version to suit your needs, for example:
+
+ .. code-block:: bash
+
+ pip install paddlepaddle==0.11.0
+
+If you need to install a GPU-enabled version (cuda8.0_cudnn5_avx_openblas), you need to run:
+
+ .. code-block:: bash
+
+ pip install paddlepaddle-gpu
+
+The default version is also 0.12.0. PaddlePaddle provides several package versions for different needs, as shown in the table:
+
+================================= ========================================
+Version                           Description
+================================= ========================================
+paddlepaddle-gpu==0.12.0 0.12.0 built with CUDA 8.0 and cuDNN 5
+paddlepaddle-gpu==0.11.0.post87 0.11.0 built with CUDA 8.0 and cuDNN 7
+paddlepaddle-gpu==0.11.0.post8 0.11.0 built with CUDA 8.0 and cuDNN 5
+paddlepaddle-gpu==0.11.0 0.11.0 built with CUDA 7.5 and cuDNN 5
+================================= ========================================
+
+You can find all released versions of paddlepaddle-gpu in the `Release History `_ .
+
+If you wish to install the latest PaddlePaddle from the develop branch,
+you can download the latest whl package from our CI system. Access
+the links below, log in as guest, then click the "Artifact"
+tab; there you'll find the download links of the whl packages.
+
+If the links below show a login form, just click "Log in as guest" to start the download:
+
+.. image:: paddleci.png
+ :scale: 50 %
+ :align: center
+
+.. csv-table:: whl package of each version
+ :header: "version", "cp27-cp27mu", "cp27-cp27m"
+ :widths: 1, 3, 3
+
+ "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__"
+ "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__"
+ "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__"
+ "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__"
+ "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__"
+ "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__"
+
+.. _pip_dependency:
+
+Runtime Dependency
+------------------------------
+
+PaddlePaddle installation packages (whl) contain not only .py files,
+but also binaries built from C++ code. We ensure that PaddlePaddle can
+run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04
+and MacOS 10.12.
+
+PaddlePaddle whl packages try to satisfy the
+`manylinux1 `_
+standard, which uses CentOS 5 as the default build environment. However, CUDA libraries
+require at least CentOS 6 to run, and CentOS 5 is about to reach its end of life,
+so we use CentOS 6 as the default build environment.
+
+.. csv-table:: PaddlePaddle Runtime Deps
+ :header: "Dependency", "version", "description"
+ :widths: 10, 15, 30
+
+ "OS", "Linux, MacOS", "CentOS 6 or later,Ubuntu 14.04 or later,MacOS 10.12 or later"
+ "Python", "2.7.x", "Currently Python3 is not supported"
+ "libc.so", "GLIBC_2.7", "glibc at least include GLIBC_2.7 symbols"
+ "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols"
+ "libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols"
+
+.. _pip_faq:
+
+FAQ
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+
+  The main cause of this issue is that there is no PaddlePaddle installation package matching your
+  current platform. Please check that you are using the Python 2.7 series.
+  Besides, the packages on PyPI follow the manylinux1 standard, so you'll need to
+  upgrade pip to a version newer than 9.0.0. To do so, run the command below:
+
+ .. code-block:: bash
+
+ pip install --upgrade pip
+
+ If the problem still exists, run the following command:
+
+ .. code-block:: bash
+
+ python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  This prints the package tags supported on the current system; check whether they match
+  the file name of the whl package. The default whl packages can be found
+  `here `_ .
+
+  If your system supports linux_x86_64 but the whl package is manylinux1_x86_64,
+  you'll need to update pip to the latest version; if your system supports
+  manylinux1_x86_64 but the whl package is linux_x86_64, you can rename the
+  file to use the manylinux1_x86_64 suffix and then install it.
diff --git a/doc/fluid/design/algorithm/images/asgd.gif b/doc/fluid/design/algorithm/images/asgd.gif
new file mode 100644
index 0000000000000000000000000000000000000000..4a0da7bf6df9326a2aab1638b77c5455c18b8c4e
Binary files /dev/null and b/doc/fluid/design/algorithm/images/asgd.gif differ
diff --git a/doc/fluid/design/algorithm/images/theta_star.gif b/doc/fluid/design/algorithm/images/theta_star.gif
new file mode 100644
index 0000000000000000000000000000000000000000..dd24d33e124396be3fc410c9b12f33148f64efe2
Binary files /dev/null and b/doc/fluid/design/algorithm/images/theta_star.gif differ
diff --git a/doc/fluid/design/algorithm/index_cn.rst b/doc/fluid/design/algorithm/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0883a9dc9c457f393ac1bdc930cb47ebcb0a25d9
--- /dev/null
+++ b/doc/fluid/design/algorithm/index_cn.rst
@@ -0,0 +1,7 @@
+梯度更新算法
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ parameter_average.md
diff --git a/doc/fluid/design/algorithm/index_en.rst b/doc/fluid/design/algorithm/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..59fe68dcf79ce2ef90b9adc829a0db45a4f0b3dc
--- /dev/null
+++ b/doc/fluid/design/algorithm/index_en.rst
@@ -0,0 +1,7 @@
+Gradient Update Algorithm
+--------------------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ parameter_average.md
diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md
new file mode 100644
index 0000000000000000000000000000000000000000..28ad6495d97515442eb8af2050158829814acd33
--- /dev/null
+++ b/doc/fluid/design/algorithm/parameter_average.md
@@ -0,0 +1,74 @@
+# Averaging Parameter in PaddlePaddle
+
+## Why Averaging
+In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible.
+
+Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
+
+Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of the parameters obtained by SGD is used as the estimator for the optimal parameter value <img src="./images/theta_star.gif"/>. The averaging is done as follows:
+
+<p align="center">
+<img src="./images/asgd.gif"/><br/>
+</p>
+
+We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
+
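+As a quick sanity check of this claim, the following minimal sketch (illustrative only, not PaddlePaddle code; it assumes a one-dimensional quadratic objective) compares the last SGD iterate with the running (Polyak) average of the iterates under noisy gradients:
+
+```python
+import random
+
+random.seed(0)
+theta_star = 3.0      # optimum of f(theta) = 0.5 * (theta - theta_star)^2
+theta = 0.0           # current SGD iterate
+running_avg = 0.0     # Polyak average of all iterates so far
+lr = 0.1
+
+for t in range(1, 1001):
+    noisy_grad = (theta - theta_star) + random.gauss(0, 1.0)   # stochastic gradient
+    theta -= lr * noisy_grad
+    running_avg += (theta - running_avg) / t                   # incremental mean
+
+print("error of last iterate    :", abs(theta - theta_star))
+print("error of averaged iterate:", abs(running_avg - theta_star))
+```
+
+The averaged estimate typically lands much closer to the optimum than the final noisy iterate, which is exactly the effect the ASGD scheme exploits.
+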
+### How to perform Parameter Averaging in PaddlePaddle
+
+Parameter Averaging in PaddlePaddle works in the following way during training :
+1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer
+2. The optimizer itself is responsible for updating the parameters.
+3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
+ 1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches.
+ 2. However, saving all N instances of the parameters in memory is not feasible.
+    3. Therefore, an approximation algorithm is used (one possible scheme is sketched just below this list).
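+
+A minimal sketch of one possible approximation (hypothetical; it is not the actual PaddlePaddle kernel, and the class name `WindowedAverage` is made up for illustration) keeps only a running sum for the current window plus a single buffered average of completed windows, instead of N full copies:
+
+```python
+import numpy as np
+
+class WindowedAverage(object):
+    def __init__(self, shape, window_size):
+        self.window_size = window_size
+        self.window_sum = np.zeros(shape)   # sum of parameter values in the current window
+        self.window_count = 0
+        self.buffer = np.zeros(shape)       # mean over all completed windows
+        self.num_windows = 0
+
+    def accumulate(self, param):
+        self.window_sum += param
+        self.window_count += 1
+        if self.window_count == self.window_size:
+            window_avg = self.window_sum / self.window_size
+            self.num_windows += 1
+            # fold the finished window into the buffer as an incremental mean
+            self.buffer += (window_avg - self.buffer) / self.num_windows
+            self.window_sum[:] = 0.0
+            self.window_count = 0
+
+    def average(self):
+        if self.num_windows == 0:           # window not yet full: fall back to a partial mean
+            return self.window_sum / max(self.window_count, 1)
+        return self.buffer
+```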
+
+Hence, overall we have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved.
+
+During the testing/saving phase of the model, we perform the following steps (a short sketch follows the list):
+1. Perform the delayed operations.
+2. Save current values of the parameters to a temporary variable.
+3. Replace the values of the parameters with the averaged values.
+4. Perform testing and/or save the parameters.
+5. Restore the values of the parameters once done.
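+
+The swap/restore protocol above can be illustrated with the following hypothetical sketch (plain NumPy, not the actual PaddlePaddle API):
+
+```python
+import numpy as np
+
+def evaluate_with_averaged_params(params, averaged, evaluate):
+    """params, averaged: dicts mapping name -> np.ndarray; evaluate: a callable that reads params."""
+    backup = {name: value.copy() for name, value in params.items()}   # step 2: save current values
+    try:
+        for name in params:                                           # step 3: swap in averaged values
+            params[name][...] = averaged[name]
+        return evaluate()                                             # step 4: test and/or save
+    finally:
+        for name in params:                                           # step 5: restore original values
+            params[name][...] = backup[name]
+```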
+
+### How to implement Averaging of Parameter in PaddlePaddle
+
+We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training.
+
+ **Advantages**:
+ - Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op.
+ - Makes it easy for the users to customize and extend the framework.
+
+ **Disadvantages**:
+ - Implementation requires re-writing the averaging methodology in Python.
+
+### Low-Level implementation
+
+In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input:
+- the optimizer
+- the window_size to keep the updates
+
+The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
+
+The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
+
+### Python API implementation for ParameterAverageOptimizer
+
+Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following:
+- Any optimizer (RMSProp , AdaGrad etc.)
+- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. Move the averaged value to a buffer when window is full to avoid loss of precision.
+
+Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
+We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.cc)
+
+#### Creation of the ParameterAverageOptimizer operator
+There are two ways for creating the ParameterAverageOptimizer op:
+1. We create the op immediately while building the computation graph.
+2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
+
+The proposal is to add the op immediately while building the computation graph.
+
+#### High-level API
+
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8ded0ad22f4013a521bf3bee260565dc5cf855ae
--- /dev/null
+++ b/doc/fluid/design/concepts/README.md
@@ -0,0 +1,174 @@
+A few months ago when we were trying to replace CMake with Bazel, @emailweixu suggested that we rewrite those handy Bazel functions using CMake. Now it seems that it's the right time to get this done, as we are facing problems from the porting of Majel and the development of the new parameter server using Go and C++.
+
+Here are some initial thoughts. Your comments are welcome!
+
+# Required CMake Function
+
+I think we need only the following few CMake functions to make a project description mean and clean:
+
+| C++        | CUDA C++   | Go         |
+|------------|------------|------------|
+| cc_library | nv_library | go_library |
+| cc_binary  | nv_binary  | go_binary  |
+| cc_test    | nv_test    | go_test    |
+
+- The `_library` functions generate .a files from source code.
+- The `_binary` functions generate executable binary files.
+- The `_test` functions generate executable unit test files. They work like `_binary` but links `-lgtest` and `-lgtest_main`.
+
+The difference between `nv_` functions and `cc_` functions is that the former use `nvcc` instead of the system-default C++ compiler.
+
+Both `nv_` and `cc_` functions enable C++11 (-std=c++11).
+
+Also,
+
+- to describe external dependencies, we need `external_library`.
+- to build shared libraries, we need `shared_library`.
+
+## An Example Project
+
+Suppose that we have aforementioned functions defined in our `/cmake` directory. The following example `CMakeLists.txt` describes a project including the following source files:
+
+- tensor.h
+- tensor.cc
+- tensor_test.cc
+- ops.h
+- ops.cu
+- ops_test.cu
+- api.go
+- api_test.go
+
+Suppose that ops.cu depends on CUDNN.
+
+```cmake
+# cc_binary parses tensor.cc and figures out that target also depend
+# on tensor.h.
+cc_binary(tensor
+ SRCS
+ tensor.cc)
+
+# The dependency to target tensor implies that if any of
+# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built.
+cc_test(tensor_test
+ SRCS
+ tensor_test.cc
+ DEPS
+ tensor)
+
+# I don't have a clear idea what parameters external_library need to
+# have. @gangliao as a CMake expert would have better ideas.
+external_library(cudnn
+ ....)
+
+# Suppose that ops.cu depends on external target CUDNN. Also, ops.cu
+# include global functions that take Tensor as their parameters, so
+# ops depend on tensor. This implies that if any of tensor.{h.cc},
+# ops.{h,cu} is changed, ops need to be re-built.
+nv_library(ops
+ SRCS
+ ops.cu
+ DEPS
+ tensor
+ cudnn) # cudnn is defined later.
+
+nv_test(ops_test
+ SRCS
+ ops_test.cu
+ DEPS
+ ops)
+
+# Because api.go defines a GO wrapper to ops and tensor, it depends on
+# both. This implies that if any of tensor.{h,cc}, ops.{h,cu}, or
+# api.go is changed, api need to be re-built.
+go_library(api
+ SRCS
+ api.go
+ DEPS
+ tensor # Because ops depend on tensor, this line is optional.
+ ops)
+
+go_test(api_test
+ SRCS
+ api_test.go
+ DEPS
+ api)
+
+
+# This builds libapi.so. shared_library might use CMake target
+# api_shared so to distinguish it from above target api.
+shared_library(api
+ DEPS
+ api)
+
+```
+
+## Implementation
+
+As the above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also uses this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
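+
+A toy sketch of this idea (illustrative only; the real functions are implemented in CMake itself, and the helper names here are made up) records each invocation as a node and then flushes the graph into plain CMake commands:
+
+```python
+targets = {}  # name -> {"kind": ..., "srcs": [...], "deps": [...]}
+
+def cc_library(name, srcs=(), deps=()):
+    targets[name] = {"kind": "library", "srcs": list(srcs), "deps": list(deps)}
+
+def cc_test(name, srcs=(), deps=()):
+    targets[name] = {"kind": "test", "srcs": list(srcs),
+                     "deps": list(deps) + ["gtest", "gtest_main"]}
+
+def generate_cmake():
+    lines = []
+    for name, t in targets.items():
+        adder = "add_library" if t["kind"] == "library" else "add_executable"
+        lines.append("%s(%s %s)" % (adder, name, " ".join(t["srcs"])))
+        if t["deps"]:
+            lines.append("target_link_libraries(%s %s)" % (name, " ".join(t["deps"])))
+            lines.append("add_dependencies(%s %s)" % (name, " ".join(t["deps"])))
+        if t["kind"] == "test":
+            lines.append("add_test(NAME %s COMMAND %s)" % (name, name))
+    return "\n".join(lines)
+
+cc_library("tensor", srcs=["tensor.cc"])
+cc_test("tensor_test", srcs=["tensor_test.cc"], deps=["tensor"])
+print(generate_cmake())
+```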
+
+## Using Package Manager For Go
+
+Building Go binaries and libraries requires satisfying their dependencies; generally
+we can do `go get ./...` to download and compile all external dependencies. The
+problems are:
+
+1. `go get` will always get the latest code from the default branch of the
+   remote repo, so changes in dependencies might break the build. This is very
+   different from what we already have in `cmake/external`, which downloads a
+   specific version or commit id of each dependency.
+1. Some locations cannot access external dependencies through the internet, as mentioned
+   in https://github.com/PaddlePaddle/Paddle/issues/2605. Using package management
+   tools can package the dependencies as a "vendor" package, which can be mirrored
+   on many cloud file hosting services, so users who want to compile Paddle by themselves can
+   download this "vendor" package from a mirror site.
+
+### Choose A Suitable Tool
+
+As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools)
+lists dozens of Go package managers. We choose the tool using the following principles:
+
+- Most "active" projects with more stars, more pull requests or commits
+- Widely used project
+
+After comparing all these projects, we shall choose between the most popular
+tools: Godep and Glide.
+
+Here's a brief comparison between Godep and Glide:
+https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are
+also many complaints about using `Godep`. A new "official" package
+management tool has been started at https://github.com/golang/dep to resolve
+such problems, but it's currently at the Alpha stage. So the best choice now is
+obviously Glide.
+
+### Manage Go Packages
+
+- Dependencies: `go/glide.yaml` will store the dependencies and their versions that
+  are directly imported by Paddle. `go/glide.lock` will store all dependencies recursively
+  with their commit ids. Builds will "lock" to these packages if we don't `glide up`
+  them.
+- Vendor package: the `go/vendor` directory will be generated when running the `cmake` command. `cmake`
+  will download the code corresponding to `go/glide.lock`. If we put a vendor folder
+  under `go/`, cmake will just check the commit ids of the packages under the folder;
+  if the commit ids match, there will be no download at all.
diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md
new file mode 100644
index 0000000000000000000000000000000000000000..3757cd055c818be1e63ee8c0f000f4dd299b59f4
--- /dev/null
+++ b/doc/fluid/design/concepts/block.md
@@ -0,0 +1,375 @@
+# Design Doc: Block and Scope
+
+## The Representation of Computation
+
+Both deep learning systems and programming languages help users describe computation procedures. These systems use various representations of computation:
+
+- Caffe, Torch, and Paddle: sequences of layers.
+- TensorFlow, Caffe2, Mxnet: graph of operators.
+- PaddlePaddle: nested blocks, like C++ and Java programs.
+
+## Block in Programming Languages and Deep Learning
+
+In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators.
+
+Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
+
+| programming languages | PaddlePaddle         |
+|-----------------------|----------------------|
+| for, while loop       | RNN, WhileOp         |
+| if, if-else, switch   | IfElseOp, SwitchOp   |
+| sequential execution  | a sequence of layers |
+
+A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.
+
+## Stack Frames and the Scope Hierarchy
+
+The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
+
+| programming languages  | PaddlePaddle                     |
+|------------------------|----------------------------------|
+| stack                  | scope hierarchy                  |
+| stack frame            | scope                            |
+| push at entering block | push at entering block           |
+| pop at leaving block   | destroy when minibatch completes |
+
+1. In traditional programs:
+
+ - When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables.
+ - After the execution leaves the right curly brace, the runtime pops the frame.
+ - The maximum number of frames in the stack is the maximum depth of nested blocks.
+
+1. In PaddlePaddle
+
+ - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables.
+ - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass. So it has a stack forest known as a *scope hierarchy*.
+ - The height of the highest tree is the maximum depth of nested blocks.
+ - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy.
+
+## Use Blocks in C++ and PaddlePaddle Programs
+
+Let us consolidate the discussion by presenting some examples.
+
+### Blocks with `if-else` and `IfElseOp`
+
+The following C++ program shows how blocks are used with the `if-else` structure:
+
+```c++
+namespace pd = paddle;
+
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+ int z = x + y;
+ o1 = z;
+ o2 = pd::layer::softmax(z);
+} else {
+ int d = pd::layer::fc(z);
+ o1 = d;
+ o2 = d+1;
+}
+
+```
+
+An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](../execution/if_else_op.md) is as follows:
+
+```python
+import paddle as pd
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():
+ d = pd.layer.add_scalar(x, y)
+ ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
+ d = pd.layer.fc(z)
+ ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+In both examples, the left branch computes `x+y` and `softmax(x+y)`, while the right branch computes `fc(z)` and `fc(z)+1` .
+
+The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.
+
+
+### Blocks with `for` and `RNNOp`
+
+The following RNN model in PaddlePaddle from the [RNN design doc](../dynamic_rnn/rnn.md) :
+
+```python
+x = sequence([10, 20, 30]) # shape=[None, 1]
+m = var(0) # shape=[1]
+W = var(0.314, param=true) # shape=[1]
+U = var(0.375, param=true) # shape=[1]
+
+rnn = pd.rnn()
+with rnn.step():
+ h = rnn.memory(init = m)
+ h_prev = rnn.previous_memory(h)
+ a = layer.fc(W, x)
+ b = layer.fc(U, h_prev)
+ s = pd.add(a, b)
+ act = pd.sigmoid(s)
+ rnn.update_memory(h, act)
+ rnn.output(a, b)
+o1, o2 = rnn()
+```
+has its equivalent C++ program as follows
+
+```c++
+float x[] = {10, 20, 30};
+float m = 0;
+float W = 0.314;
+float U = 0.375;
+
+float mem[sizeof(x) / sizeof(x[0]) + 1];
+float o1[sizeof(x) / sizeof(x[0]) + 1];
+float o2[sizeof(x) / sizeof(x[0]) + 1];
+for (int i = 1; i <= sizeof(x) / sizeof(x[0]); ++i) {
+  if (i == 1) mem[0] = m;
+  float a = W * x[i - 1];      // fc(W, x)
+  float b = U * mem[i - 1];    // fc(U, h_prev)
+  float s = a + b;
+  float act = sigmoid(s);      // sigmoid is assumed to be defined elsewhere
+  mem[i] = act;                // rnn.update_memory(h, act)
+  o1[i] = a;                   // rnn.output(a, b)
+  o2[i] = b;
+}
+```
+
+## Compilation and Execution
+
+Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference.
+
+The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file.
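+
+The split can be illustrated with a toy interpreter (illustrative only; the real message is the protobuf `BlockDesc`/`OpDesc` described below, not a Python dict): the "compiled" program is plain data, and a separate executor walks it at run time.
+
+```python
+program_desc = {
+    "vars": {"x": 3.0, "y": 4.0, "out": None},
+    "ops": [{"type": "mul", "inputs": ["x", "y"], "output": "out"}],
+}
+
+def execute(desc):
+    scope = dict(desc["vars"])      # runtime values live in a scope, not in the description
+    for op in desc["ops"]:
+        if op["type"] == "mul":
+            a, b = (scope[name] for name in op["inputs"])
+            scope[op["output"]] = a * b
+    return scope
+
+print(execute(program_desc)["out"])   # 12.0
+```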
+
+## The "Binary Executable File Format"
+
+The definition of the protobuf message is as follows:
+
+```protobuf
+message BlockDesc {
+ repeated VarDesc vars = 1;
+ repeated OpDesc ops = 2;
+}
+```
+
+The step net in the above RNN example would look like:
+
+```
+BlockDesc {
+ vars = {
+ VarDesc {...} // x
+ VarDesc {...} // h
+ VarDesc {...} // fc_out
+ VarDesc {...} // hidden_out
+ VarDesc {...} // sum
+ VarDesc {...} // act
+ }
+ ops = {
+ OpDesc {...} // matmul
+ OpDesc {...} // add_two
+ OpDesc {...} // sigmoid
+ }
+};
+```
+
+Also, the RNN operator in above example is serialized into a protobuf message of type `OpDesc` and would look like:
+
+```
+OpDesc {
+ inputs = {0} // the index of x in vars of BlockDesc above
+ outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
+ attrs {
+ "states" : {1} // the index of h
+ "step_net" :
+ }
+};
+```
+
+This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block.
+
+
+## The Compilation of Blocks
+
+During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
+
+The VarDescs in a block should have their own name scope to avoid local variables affecting the parent block's name scope.
+A child block's name scope should inherit the parent's so that an OpDesc in the child block can reference a VarDesc stored in the parent block. For example:
+
+```python
+a = pd.Variable(shape=[20, 20])
+b = pd.fc(a, params=["fc.w", "fc.b"])
+
+rnn = pd.create_rnn()
+with rnn.stepnet():
+ x = a.as_step_input()
+ # reuse fc's parameter
+ fc_without_b = pd.get_variable("fc.w")
+ rnn.output(fc_without_b)
+
+out = rnn()
+```
+The method `pd.get_variable` retrieves a Variable by its name. The Variable may be stored in a parent block but retrieved in a child block, so a block should have a variable scope that supports inheritance.
+
+In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc.
+
+To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers.
+
+`SymbolTable` can do the following:
+
+- store the definitions (some names and attributes) of variables and operators,
+- verify if a variable was declared,
+- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
+
+
+```c++
+// Information in SymbolTable is enough to trace the dependency graph, so maybe
+// it is enough for the Eval() interface to take a SymbolTable.
+class SymbolTable {
+ public:
+ SymbolTable(SymbolTable* parent) : parent_(parent) {}
+
+ OpDesc* NewOp(const string& name="");
+
+ // TODO determine whether name is generated by python or C++.
+ // Currently assume that a unique name will be generated by C++ if the
+ // argument name is left default.
+ VarDesc* Var(const string& name="");
+
+ // find a VarDesc by name, if recursive is true, find parent's SymbolTable
+ // recursively.
+ // this interface is introduced to support InferShape, find protobuf messages
+ // of variables and operators, pass pointers into InferShape.
+ //
+ // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should
+ // be proposed and embedded into pybind to enable python operation on C++ pointers.
+ VarDesc* FindVar(const string& name, bool recursive=true);
+
+ OpDesc* FindOp(const string& name);
+
+ BlockDesc Compile() const;
+
+ private:
+ SymbolTable* parent_;
+
+  map<string, OpDesc> ops_;
+  map<string, VarDesc> vars_;
+};
+```
+
+After all the description of variables and operators is added into SymbolTable,
+the block has enough information to run.
+
+The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions.
+
+
+```c++
+namespace {
+
+class Block : public OperatorBase {
+ public:
+  Block(const BlockDesc& desc) : desc_(desc) {}
+
+ void InferShape(const framework::Scope& scope) const override {
+ if (!symbols_ready_) {
+ CreateVariables(scope);
+ CreateOperators();
+ }
+ // should run InferShape first.
+ for (auto& op : runtime_table_.ops()) {
+ op->InferShape(scope);
+ }
+ }
+
+ void Run(const framework::Scope& scope,
+ const platform::Place& place) const override {
+ PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
+ for (auto& op : runtime_table_.ops()) {
+ op->Run(scope, place);
+ }
+ }
+
+ void CreateVariables(const framework::Scope& scope);
+ void CreateOperators();
+
+ // some other necessary interfaces of NetOp are listed below
+ // ...
+
+private:
+ BlockDesc desc_;
+ bool symbols_ready_{false};
+};
+
+}  // namespace
+```
+
+## The Execution of Blocks
+
+Block inherits from OperatorBase, which has a Run method.
+Block's Run method will run its operators sequentially.
+
+There is another important interface called `Eval`, which takes some arguments called targets. It generates a minimal dependency graph that treats the targets as end points, and creates a new Block from it. After running this block, `Eval` fetches the latest values of the targets and returns them.
+
+The definition of Eval is as follows:
+
+```c++
+// clean a block description by targets using the corresponding dependency graph.
+// return a new BlockDesc with minimal number of operators.
+// NOTE: The return type is not a Block but the block's description so that this can be distributed
+// to a cluster.
+BlockDesc Prune(const BlockDesc& desc, vector<string> targets);
+
+void Block::Eval(const vector<string>& targets,
+ const framework::Scope& scope,
+ const platform::DeviceContext& dev_ctx) {
+ BlockDesc min_desc = Prune(desc_, targets);
+ Block min_block(min_desc);
+ min_block.Run(scope, dev_ctx);
+}
+```
diff --git a/doc/fluid/design/concepts/cpp_data_feeding.md b/doc/fluid/design/concepts/cpp_data_feeding.md
new file mode 100644
index 0000000000000000000000000000000000000000..aabc1ba75a67c5767d409bd6e7e6240dec86b16c
--- /dev/null
+++ b/doc/fluid/design/concepts/cpp_data_feeding.md
@@ -0,0 +1,204 @@
+# C++ Data Feeding
+
+While using the Paddle V2 API for training, data feeding depends completely on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required.
+
+In this document, we show the fundamental design of a C++ data feeding process, which includes data reading, shuffling and batching.
+
+## Overview
+
+
+
+## Reader
+
+To handle the above-mentioned problem, a new concept called 'Reader' is introduced. `Reader` is a hierarchy of classes which can be held by a `Variable` and are used to read or process file data.
+
+
+### ReaderBase
+
+`ReaderBase` is the abstract base class for all readers. It defines the interface for all readers.
+
+```cpp
+class ReaderBase {
+ public:
+  // Reads the next batch of data. (A 'batch' can be only one instance.)
+  // If the next batch doesn't exist, it throws an exception.
+  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+
+  // Checks whether the next instance exists.
+  virtual bool HasNext() = 0;
+
+  // Reinitializes the reader and reads the file from the beginning.
+  virtual void ReInit() = 0;
+
+ virtual ~ReaderBase();
+};
+```
+
+### FileReader
+
+`FileReader` is derived from the `ReaderBase`. It is still an abstract class and will further be derived by Readers of respective specific format.
+
+```cpp
+class FileReader : public ReaderBase {
+ public:
+  explicit FileReader(const std::vector<DDim>& dims);
+
+  void ReadNext(std::vector<LoDTensor>* out) override;
+
+ protected:
+  virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
+
+ private:
+  std::vector<DDim> dims_;
+};
+```
+
+A file reader binds to a single file and reads one data instance at a time. Each type of file reader shall implement its own `ReadNextImpl()`, `HasNext()` and `ReInit()`.
+
+The `ReadNextImpl()` is invoked by `ReadNext()`. Besides invoking `ReadNextImpl()`, `ReadNext()` is also responsible for checking the output, making sure that each shape of `LoDTensor` in `*out` is consistent with the one in `dims_`.
+
+### DecoratedReader
+
+A decorated reader takes another reader (either a file reader or another decorated reader) as its 'underlying reader'. It gets data from its underlying reader, does some processing on it (shuffling, batching, or something else), and then yields the processed data. The output of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
+
+```cpp
+class DecoratedReader : public ReaderBase {
+ public:
+ explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
+ PADDLE_ENFORCE_NOT_NULL(reader_);
+ }
+
+ void ReInit() override { reader_->ReInit(); }
+
+  bool HasNext() override { return reader_->HasNext(); }
+
+ protected:
+ ReaderBase* reader_;
+};
+```
+
+Both `FileReader` and `DecoratedReader` share exactly the same interface defined in `ReaderBase`, so they can be decorated multiple times: we can **shuffle** a reader's outputs and then **batch** the shuffled outputs, as the sketch below shows. The interface consistency also allows related ops to use readers without knowing their underlying type.
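+
+In this sketch, the `RecordIOFileReader` class, the `dims` variable and the constructor signatures of `ShuffleReader` and `BatchReader` are assumptions made for illustration only; the calls themselves use just the `ReaderBase` interface above.
+
+```cpp
+// Illustrative sketch only: stack decorators on top of a file reader.
+// RecordIOFileReader and the constructor parameters are assumed names.
+ReaderBase* reader = new RecordIOFileReader("train.recordio", dims);
+reader = new ShuffleReader(reader, /*buffer_size=*/1024);  // shuffle instances
+reader = new BatchReader(reader, /*batch_size=*/32);       // pack into mini-batches
+
+std::vector<LoDTensor> batch;
+while (reader->HasNext()) {
+  reader->ReadNext(&batch);  // each call yields one shuffled mini-batch
+}
+```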
+
+### MultipleReader
+
+Each `FileReader` binds to a single file and is single-threaded. However, sometimes we need to read data from more than one file. In this case, `FileReader` and `DecoratedReader` alone are not enough.
+
+So `MultipleReader` is introduced. It is also derived from `ReaderBase`. A `MultipleReader` holds several prefetching `FileReaders`, and these readers run concurrently. Another pivotal part of a `MultipleReader` is a buffer channel. The channel collects the data yielded by all prefetching readers and lets subsequent ops or decorated readers fetch data without worrying about how the multiple readers are scheduled.
+
+
+![](images/multiple_reader.png)
+This graph shows how a `MultipleReader` works with three prefetching file readers and two GPUs. There is a queue of files waiting to be read. Whenever a prefetching file reader becomes free (i.e., it has finished reading one file), it fetches a new file from the queue. Each prefetching file reader runs in a separate prefetch thread and dumps its output to the same channel.
+
+To the subsequent two decorated readers, the `MultipleReader` is **a single reader**. They don't need to care about how the prefetching readers are scheduled; they only need to invoke `MultipleReader::ReadNext()` to get the next data from the buffer channel.
+
+### ReaderHolder
+
+Different readers belong to different class types. This leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code:
+
+```cpp
+var->Get<ReaderBase>("batch_reader");
+```
+
+We would have to write:
+
+```cpp
+var->Get<BatchReader>("batch_reader");
+```
+
+This means that, to get a reader from a variable, we must know the reader's exact type every time, which is nearly impossible.
+
+To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which hides the reader's type. With `ReaderHolder`, we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
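+
+A minimal sketch of such a type-erasing wrapper, assuming it simply owns a `ReaderBase` pointer (the member and method names here are illustrative, not the actual implementation):
+
+```cpp
+// Sketch: ReaderHolder hides the concrete reader type behind ReaderBase.
+class ReaderHolder {
+ public:
+  void Reset(ReaderBase* reader) { reader_.reset(reader); }
+
+  void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); }
+  bool HasNext() { return reader_->HasNext(); }
+  void ReInit() { reader_->ReInit(); }
+
+ private:
+  std::unique_ptr<ReaderBase> reader_;
+};
+```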
+
+## Related Operators
+
+To create and invoke readers, some new ops are introduced:
+
+### Operators That Create Readers
+
+Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as their output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
+
+However, direct usage of file readers' creation ops is not recommended because a file reader can only read one file via a single thread. Using `OpenFilesOp` is a better choice.
+
+### OpenFilesOp
+
+The `OpenFilesOp` is the creation op of `MultipleReader`. It takes no input but requires a list of file names as one of its attributes. The newly created `MultipleReader` then creates its own prefetching readers according to given file names.
+
+To make sure that created prefetching readers match file formats, we need a name prefix rule to append file format tags to file names, as well as a file reader registry mechanism to map file format tags to their corresponding file readers' constructors.
+
+### HasNextOp
+
+`HasNextOp` is used to check whether the next data batch exists via the reader's `HasNext()` interface.
+
+### ResetOp
+
+`ResetOp` is used to reset a reader via its `ReInit()` interface.
+
+### ReadOp
+
+A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader's `ReadNext()` function and gets a new batch of data (or only one instance of data, if we use a file reader directly). The output data of a reader are in the form of `std::vector<LoDTensor>`, so the `ReadOp` also needs to split the vector and move the LoDTensors to their respective output Variables.
+
+## Program with Readers
+
+A `Program` holds readers as its persistable variables. These variables are created by `CreateReaderOp` or `OpenFilesOp`. These ops shall run only once, so they are placed in the `startup_program`. `HasNextOp`, `ResetOp` and `ReadOp` are required by the training loop, so they belong in the `main_program`.
+
+The ops of a `startup_program` with readers would be like this:
+
+```
+multiple_reader = open_files_op(...)
+batch_reader = create_batch_reader_op(multiple_reader)
+double_buffer_reader = create_double_buffer_op(batch_reader)
+... (other initializers)
+```
+
+The forwarding ops of the corresponding `main_program` would be like this:
+
+```
+not_completed = true
+pass_count = 0
+while_op(not_completed) {
+ has_next = has_next_op(double_buffer_reader)
+ if_else_op(has_next) {
+ batch_data = read_op(double_buffer_reader)
+ ... (subsequent training ops)
+ } else {
+ reset_op(double_buffer_reader)
+ increase_op(pass_count)
+    not_completed = less_than_op(pass_count, required_pass_num)
+ }
+}
+```
+
+A few important considerations for these programs are as follows:
+
+1. `not_completed`, `pass_count` and other variables shown above are all Fluid Variables.
+
+2. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
+
+3. All readers exist in both `startup_program` and `main_program`. And they are persistable.
+
+### Simplify Configuration by MultiPassReader
+
+The Program configuration mentioned above is complicated. Users need to be very familiar with the concepts of Program and Block to avoid mistakes in their code. To make C++ readers friendlier to new users, we introduce `MultiPassReader`.
+
+`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several training passes. It takes the number of passes to run as one of its attributes ('pass_num') and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches EOF, the multi-pass reader checks whether it has completed the given number of passes. If not, the underlying reader is re-initialized and starts a new pass automatically. Before the whole training completes, the return value of MultiPassReader's `HasNext()` is always `true`, as the sketch below illustrates.
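+
+In the following sketch, which builds on the `DecoratedReader` base above, the constructor signature and member names are assumptions for illustration, not the actual implementation:
+
+```cpp
+// Sketch of a multi-pass decorated reader: re-initialize the underlying
+// reader at EOF until `pass_num` passes have been completed.
+class MultiPassReader : public DecoratedReader {
+ public:
+  MultiPassReader(ReaderBase* reader, size_t pass_num)
+      : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
+
+  void ReadNext(std::vector<LoDTensor>* out) override { reader_->ReadNext(out); }
+
+  bool HasNext() override {
+    if (reader_->HasNext()) return true;
+    ++pass_count_;
+    if (pass_count_ < pass_num_) {
+      reader_->ReInit();  // start the next pass automatically
+      return true;
+    }
+    return false;  // all passes completed
+  }
+
+  void ReInit() override {
+    pass_count_ = 0;
+    reader_->ReInit();
+  }
+
+ private:
+  size_t pass_num_;
+  size_t pass_count_;
+};
+```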
+
+With `MultiPassReader`, the startup program would be like this:
+
+```
+multiple_reader = open_files_op(...)
+batch_reader = create_batch_reader_op(multiple_reader)
+multi_pass_reader = create_multi_pass_reader_op(batch_reader)
+double_buffer_reader = create_double_buffer_op(multi_pass_reader)
+... (other initializers)
+```
+
+The forwarding part of the corresponding `main_program` would be like this:
+
+```
+not_completed = true
+while_op(not_completed) {
+ batch_data = read_op(double_buffer_reader)
+ ... (subsequent training ops)
+ not_completed = has_next_op(double_buffer_reader)
+}
+```
diff --git a/doc/fluid/design/concepts/executor.md b/doc/fluid/design/concepts/executor.md
new file mode 100644
index 0000000000000000000000000000000000000000..3fcddf4dd90f826ee1a16713f4371fb010f8eac5
--- /dev/null
+++ b/doc/fluid/design/concepts/executor.md
@@ -0,0 +1,29 @@
+# Executor Design Doc
+
+## Motivation
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
+[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+The executor runs the `ProgramDesc` like an interpreter. The `ProgramDesc` contains the intrinsics (operators in this case) and the variables that will be used; the executor explicitly executes this stored, precompiled code.
+
+## Overview
+
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.
+
+## Executor
+
+The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence, one by one.
+It is very similar to pushing a stack frame when entering a block, after which all the temporary variables are cleaned up when a mini-batch is finished. It does not, however, have a stack-frame pop process.
+
+### The interface
+```c++
+ Executor(places);
+```
+An executor does not own any computing resources; a user can only construct an executor with the specified places.
+
+### Running an Executor
+
+```
+ void Run(ProgramDesc, Scope, block_id, create_local_scope);
+```
+An `Executor` only provides a unified way to execute a `ProgramDesc`. The `ProgramDesc` is the target to be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block, and `create_local_scope` is a boolean that states whether the temporary variables are destroyed after the execution finishes.
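+
+For illustration, a typical call sequence might look as follows; `places`, `program_desc` and `scope` are placeholders for objects created elsewhere, and the exact parameter types are simplified in this sketch.
+
+```c++
+// Illustrative sketch of the calling convention described above.
+Executor executor(places);           // an executor bound to the given places
+executor.Run(program_desc,           // the ProgramDesc to execute
+             scope,                  // the variable container
+             0,                      // block_id: start from the global block
+             true);                  // create_local_scope: drop temporaries afterwards
+```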
diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
new file mode 100644
index 0000000000000000000000000000000000000000..1f86b99e5197c3e0b85fd76fe704520ef21b06d3
--- /dev/null
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@@ -0,0 +1,128 @@
+# Design Doc: Functions, Operators, and Layers
+
+In a DL system, we can compose one or more fine grained operators into a coarse grained one. For example, the FC layer can be composed of a multiplication operator and an add operator.
+
+Historically, some fine grained operations are known as operators, and some coarse level ones are known as layers. But we need a well-defined separation.
+
+In general, operators are those very fine grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions:
+
+```c++
+template <typename T> T add(T x, T y) { return x + y; }
+template <typename T> T mul(T x, T y) { return x * y; }
+```
+
+Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name. A C macro can do this. For example, the following macro invocation
+
+```c++
+#define MAKE_FUNCTION_OPERATOR(mul);
+```
+
+generates
+
+```c++
+template <typename T> class mulOp : public OperatorBase {...};
+REGISTER_OP(mulOp, "mul");
+```
+
+so that in Python we can create operator mul by:
+
+```python
+X1 = Var()
+X2 = Var()
+Y = Var()
+paddle.cpp.create_operator("mul", input=[X1, X2], output=Y)
+```
+
+Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`:
+
+```c++
+template <typename T>
+class FCOp : public OperatorBase {
+ public:
+ void Run(...) {
+ add(mul(Input("X"), Input("W")), Input("b"));
+ }
+};
+REGISTER_OP(FCOp, "fc");
+```
+
+We need to support such composition in Python as well. To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`. This higher level operator API should be compatible with the layer API.
+
+Let's explain this using an example. Suppose that we are going to compose the FC layer using mul and add in Python; we'd like to have Python functions `mul` and `add` defined in module `operator`:
+
+```python
+def operator.mul(X1, X2):
+ O = Var()
+ paddle.cpp.create_operator("mul", input={X1, Y1}, output=O)
+ return O
+
+def operator.add(X1, X2):
+ O = Var()
+ paddle.cpp.create_operator("add", input={X1, X2}, output=O)
+ return O
+```
+
+The above code snippets are automatically generated. Given them, users can define
+
+```python
+def layer.fc(X):
+ W = Var()
+ b = Var()
+ return operator.add(operator.mul(X, W), b)
+```
+
+If we don't have `operator.mul` and `operator.add`, the definition of `layer.fc` would be more complicated:
+
+```python
+def layer.fc(X):
+ W = Var()
+ b = Var()
+ O1 = Var()
+ paddle.cpp.create_operator("mul", input=[X, W], output=O1)
+ O2 = Var()
+ paddle.cpp.create_operator("add", input=[O1, b], output=O2)
+ return O2
+```
+
+We'd like to have Python bindings of operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in the above illustrative example:
+
+
+| C++ functions/functors | mul | add |  |  |
+| --- | --- | --- | --- | --- |
+| C++ operator class | mulOp | addOp | FCOp |  |
+| Python binding | operator.mul | operator.add | operator.fc |  |
+| Python function |  |  |  | layer.fc |
+
+
+This is how we differentiate layers and operators in PaddlePaddle:
+
+- those defined in C++ that have a lightweight Python wrapper in module `operators` are operators; whereas
+- those that have no C++ implementation but a Python implementation composing C++ operators are known as layers.
diff --git a/doc/fluid/design/concepts/images/multiple_reader.png b/doc/fluid/design/concepts/images/multiple_reader.png
new file mode 100644
index 0000000000000000000000000000000000000000..b22126b31db4982c13fc3a0827805e6aaf955046
Binary files /dev/null and b/doc/fluid/design/concepts/images/multiple_reader.png differ
diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.dot b/doc/fluid/design/concepts/images/parallel_executor_overview.dot
new file mode 100644
index 0000000000000000000000000000000000000000..40753cb140540c08d9d4c449b8d377e315280436
--- /dev/null
+++ b/doc/fluid/design/concepts/images/parallel_executor_overview.dot
@@ -0,0 +1,83 @@
+digraph G {
+ subgraph cluster_init {
+ label="Initialization"
+ startup_program [label="startup", shape=box]
+ node_w_g0 [label="W\nGPU0"]
+ startup_program -> node_w_g0 [label="Initialize"]
+ node_w_g1 [label="W\nGPU1"]
+ node_w_g0 -> node_w_g1 [label="broadcast"]
+ }
+
+ subgraph cluster_train {
+ label="forward_backward"
+
+ subgraph cluster_gpu0 {
+ label="GPU0"
+ fc_0 [label="fc\nGPU0", shape=box]
+ hidden_0 [label="hidden\nGPU0"]
+ node_w_g0 -> fc_0
+ fc_0 -> hidden_0
+ loss0 [label="loss\nGPU0"]
+ hidden_0 -> loss0 [label="many ops omitted"]
+ scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box]
+ loss_g0 [label="loss_grad\nGPU0"]
+ scale_loss_0->loss_g0
+
+ fc_g_0 [label="w_grad\nGPU0", shape=box]
+ loss0 -> fc_g_0
+ loss_g0 -> fc_g_0
+ hidden_0 -> fc_g_0
+ }
+
+ subgraph cluster_gpu1 {
+ label="GPU1"
+ fc_1 [label="fc\nGPU1", shape=box]
+ hidden_1 [label="hidden\nGPU1"]
+ node_w_g1 -> fc_1
+ fc_1 -> hidden_1
+ loss1 [label="loss\nGPU1"]
+ hidden_1 -> loss1 [label="many ops omitted"]
+ scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box]
+ loss_g1 [label="loss_grad\nGPU1"]
+ scale_loss_1->loss_g1
+
+ fc_g_1 [label="w_grad\nGPU1", shape=box]
+ loss1 -> fc_g_1
+ loss_g1 -> fc_g_1
+ hidden_1 -> fc_g_1
+ }
+ }
+
+ all_reduce_w [label="Merge Gradients(AllReduce)", shape=box]
+ fc_g_0 -> all_reduce_w
+ fc_g_1 -> all_reduce_w
+
+ fc_g_0_merged [label="w_grad\nMerged\nGPU0"]
+ fc_g_1_merged [label="w_grad\nMerged\nGPU1"]
+ all_reduce_w -> fc_g_0_merged
+ all_reduce_w -> fc_g_1_merged
+
+ subgraph cluster_optimization {
+ label="Optimization"
+ subgraph cluster_opt_gpu0 {
+ label="GPU0"
+ sgd_0 [label="SGD Op\nGPU0", shape=box]
+
+ fc_g_0_merged -> sgd_0
+ node_w_g0 -> sgd_0
+ optimized_w_0 [label="Optimized W\nGPU0"]
+ sgd_0 -> optimized_w_0
+ }
+ subgraph cluster_opt_gpu1 {
+ label="GPU1"
+ sgd_1 [label="SGD Op\nGPU1", shape=box]
+
+ fc_g_1_merged -> sgd_1
+ node_w_g1 -> sgd_1
+ optimized_w_1 [label="Optimized W\nGPU0"]
+ sgd_1 -> optimized_w_1
+ }
+ }
+
+
+}
diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.png b/doc/fluid/design/concepts/images/parallel_executor_overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..d890c0ffee3b38dc7cb74a2b56c2ab4831532211
Binary files /dev/null and b/doc/fluid/design/concepts/images/parallel_executor_overview.png differ
diff --git a/doc/fluid/design/concepts/images/readers.png b/doc/fluid/design/concepts/images/readers.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd59168ce16c9e2a0ef45303c28c997cfd7740be
Binary files /dev/null and b/doc/fluid/design/concepts/images/readers.png differ
diff --git a/doc/fluid/design/concepts/index_cn.rst b/doc/fluid/design/concepts/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dcdc894937ff328e6002623275ca3c65e87b2bb0
--- /dev/null
+++ b/doc/fluid/design/concepts/index_cn.rst
@@ -0,0 +1,19 @@
+核心概念
+-------------
+
+.. toctree::
+ :maxdepth: 1
+
+ README.md
+ cpp_data_feeding.md
+ functions_operators_layers.md
+ program.md
+ variable.md
+ var_desc.md
+ tensor.md
+ tensor_array.md
+ lod_tensor.md
+ block.md
+ scope.md
+ executor.md
+ parallel_executor.md
diff --git a/doc/fluid/design/concepts/index_en.rst b/doc/fluid/design/concepts/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b85a3055746facaa642e8fc899976b58435f1ef2
--- /dev/null
+++ b/doc/fluid/design/concepts/index_en.rst
@@ -0,0 +1,19 @@
+Core Concepts
+--------------------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ README.md
+ cpp_data_feeding.md
+ functions_operators_layers.md
+ program.md
+ variable.md
+ var_desc.md
+ tensor.md
+ tensor_array.md
+ lod_tensor.md
+ block.md
+ scope.md
+ executor.md
+ parallel_executor.md
diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
new file mode 100644
index 0000000000000000000000000000000000000000..748488f6d5f2f1272e87b89047570632418da8dc
--- /dev/null
+++ b/doc/fluid/design/concepts/lod_tensor.md
@@ -0,0 +1,211 @@
+# Design Doc: LoD (Level-of-Detail) Tensor
+
+Like other deep learning systems, PaddlePaddle supports training models from sequence data. Also, like other systems, PaddlePaddle represents a mini-batch of sequences as a Tensor. What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length, so there is no need to pad with zeros.
+
+
+
+
+|  | TensorFlow | PaddlePaddle |
+| --- | --- | --- |
+| RNN | Support | Support |
+| recursive RNN | Support | Support |
+| padding zeros | Must | No need |
+| blob data type | Tensor | LoDTensor |
+
+
+PaddlePaddle achieves this flexibility by passing a new data type, *LoD Tensor*, between operators. A LoD Tensor is a Tensor attached with a segmentation index known as *LoD*. The LoD index not only segments a tensor but also recursively segments sub-sequences. This document presents the design of LoD and LoDTensor.
+
+
+## The Challenge: Variable-length Sequences
+
+Most deep learning systems represent a mini-batch as a Tensor. For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor. Another example is that each mini-batch contains N sentences, where each word is a D-dimensional one-hot vector. Suppose that all sentences have the same length L, we can represent this mini-batch by a NxLxD tensor.
+
+Both examples show that the elements of sequences are usually of the same size. In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors. It doesn't make sense to allow variable-sized images, as that would require transformations like convolution to handle variable-sized Tensors.
+
+The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences. Also, sequences might consist of sub-sequences.
+
+
+## A Solution: The LoD Index
+
+To understand our solution, it is best to look at some examples.
+
+### A Mini-Batch of Sentences
+
+Let's imagine a mini-batch of 3 variable-length sentences composed of 3, 1, and 2 words, respectively. We can represent the mini-batch by a (3+1+2)xD tensor plus some index information:
+
+```
+3 1 2
+||| | ||
+```
+
+where each `|` represents a D-dimensional word vector. The numbers, 3, 1, and 2, form a 1-level LoD.
+
+### Recursive Sequences
+
+Let's check another example of a 2-level LoD Tensor. Consider a mini-batch of three articles with 3, 1, and 2 sentences, where each sentence consists of a variable number of words:
+
+```
+3 1 2
+3 2 4 1 2 3
+||| || |||| | || |||
+```
+
+### A Mini-Batch of Videos
+
+LoD tensors generalize to the case where elements are higher dimensional objects, like images. Suppose that a mini-batch contains videos of the same frame size 640x480. Here is a mini-batch of 3 videos with 3, 1, and 2 frames, respectively.
+
+```
+3 1 2
+口口口 口 口口
+```
+
+The underlying tensor is of size (3+1+2)x640x480, and each `口` represents a 640x480 image.
+
+### A Mini-Batch of Images
+
+In traditional cases like a mini-batch with N fixed-sized images, the LoD Tensor representation is as follows:
+
+```
+1 1 1 1 1
+口口口口 ... 口
+```
+
+In this case, we don't lose any information by ignoring the many 1's in the index and simply considering this LoD Tensor as a usual Tensor:
+
+```
+口口口口 ... 口
+```
+
+### Model Parameters
+
+A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**.
+
+
+## The LoD Tensor
+
+Let us revisit the above example of the 2-level LoD Tensor:
+
+```
+3 1 2
+3 2 4 1 2 3
+||| || |||| | || |||
+```
+
+It is indeed a tree, where leaves are elementary sequences identified by **branches**.
+
+For example, the third sentence in the above example is identified by the branch <0,2>, where 0 indicates the first article, which has length 3, and 2 indicates the third sentence in this article, which has length 4.
+
+### The LoD Index
+
+We can save the LoD index in the above example
+
+```
+3 1 2
+3 2 4 1 2 3
+```
+
+in a not-full 2D matrix:
+
+```c++
+typedef std::vector<std::vector<int>> LoD;
+```
+
+where
+
+- `LoD.size()` is the number of levels, or the maximum length of branches,
+- `LoD[i][j]` is the length of the j-th segment at the i-th level.
+
+## The Offset Representation
+
+To quickly access elementary sequences, we adopt an offset representation -- instead of saving the lengths, we save the beginning and ending elements of sequences.
+
+In the above example, we accumulate the length of elementary sequences:
+
+```
+3 2 4 1 2 3
+```
+
+into offsets
+
+```
+0 3 5 9 10 12 15
+ = = = = = =
+ 3 2+3 4+5 1+9 2+10 3+12
+```
+
+so we know that the first sentence is from word 0 to word 3, and the second sentence from word 3 to word 5.
+
+Similarly, the lengths in the top level LoD
+
+```
+3 1 2
+```
+
+are transformed into offsets of elements/words as follows:
+
+```
+0 3 4 6
+ = = =
+ 3 3+1 4+2
+```
+
+## Slicing of LoD Tensors
+
+
+When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences. Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
+
+For example, the <2>-slice of above example is
+
+```
+10 15
+10 12 15
+ || |||
+```
+
+and the <2,0>-slice of above slice is
+
+```
+10 12
+ ||
+```
+
+## Length Representation vs Offset Representation
+
+The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult.
+Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API.
+Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python:
+```Python
+# length representation of lod called recursive_sequence_lengths
+recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]]
+# Create a LoDTensor that has the above recursive_sequence_lengths info.
+# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood.
+tensor = fluid.LoDTensor(recursive_seq_lens)
+
+# Set/Change the recursive_sequence_lengths info of LoDTensor
+tensor.set_recursive_sequence_lengths([[3, 1, 2]])
+# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted
+# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]]
+new_recursive_seq_lens = tensor.recursive_sequence_lengths()
+```
diff --git a/doc/fluid/design/concepts/parallel_executor.md b/doc/fluid/design/concepts/parallel_executor.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f88e27bed722e9f2f535e368926fe49b4e72e56
--- /dev/null
+++ b/doc/fluid/design/concepts/parallel_executor.md
@@ -0,0 +1,104 @@
+# ParallelExecutor
+
+## Background
+
+Neural network models are defined as a `ProgramDesc` in Fluid. The `ProgramDesc` can be executed by an interpreter (i.e., the `executor` concept in Fluid). The instructions or operators in a `Program` will be executed, and the results will be fetched on the Python side.
+
+The executor is a very naive interpreter: it runs operators one by one. We can use `Parallel.Do` to support data parallelism; however, since `ProgramDesc` lacks device information, it is not possible to optimize the performance of `Parallel.Do`.
+
+We want a `ProgramDesc` that can be run on different nodes, so it is better not to put device information into `ProgramDesc`. Instead, we can write a high-performance interpreter, which holds an alternative intermediate representation of the `ProgramDesc`, to make full use of multiple GPUs.
+
+ParallelExecutor is an interpreter of `ProgramDesc` which [executes](https://en.wikipedia.org/wiki/Out-of-order_execution) the `Program` out of order in data-parallel mode and maximizes the utilization of multiple GPUs.
+
+
+## Overview of MultiGPUs logic
+
+The ParallelExecutor takes the startup program and the main program as inputs. The parameters are initialized on `GPU0` by the startup program and then broadcast to the other GPUs. The main program is duplicated onto each GPU. The gradients are merged during each iteration, and each device optimizes its parameters independently. Since the gradients on each device are merged before parameter optimization, the parameters stay the same on every device and there is no need to broadcast them again.
+
+
+
+There are several optimizations for this logic.
+
+1. We use an alternative representation in ParallelExecutor because device information is critical for performance optimization.
+2. The execution is out-of-order, i.e., an operator will be executed whenever the inputs of the operator are ready.
+ * A GPU is a high-performance device; a single CPU thread cannot keep one GPU fully occupied, so there is a thread pool to execute operators.
+ * Out-of-order execution also helps transpilers to generate `ProgramDesc`. There is no need to worry about the best execution order for performance when implementing a transpiler.
+3. Computation, gradient merging and data fetching use different streams.
+
+The performance of `ResNeXt152` on `TitanX` with `batch_size=12` is shown below.
+
+| Number of GPUs | 1 | 2 | 3 | 4|
+| --- | --- | --- | --- | --- |
+| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 |
+| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 |
+
+
+## Static single assignment Graph
+
+[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form) (`SSA` for short) is a common form for compiler optimization. To implement concurrent execution, we use an `SSA` graph as an intermediate representation of `ProgramDesc`.
+
+The `Program` is a directed acyclic graph, but a variable can be assigned multiple times. We enforce that each variable is assigned only once by adding a version number to variables, and we parse the `Program` into an `SSA` graph. ParallelExecutor also duplicates the `Program` onto multiple devices; we additionally attach a device id to variables and insert `NCCLAllReduce` operators into the graph.
+
+The data structure of `SSA` graph is:
+
+```c++
+struct VarHandleBase {
+  OpHandleBase* generated_op_;
+  vector<OpHandleBase*> pending_ops_;
+
+  string name;
+  Place place;
+  size_t version;
+};
+
+struct OpHandleBase {
+  vector<VarHandleBase*> inputs_;
+  vector<VarHandleBase*> outputs_;
+};
+
+struct SSAGraph {
+  // vars on each device.
+  // * the vars in each map of the vector are on a different device.
+  // * each map maps a variable name to variable handles
+  //   with different versions
+  vector<unordered_map<string, vector<VarHandleBase>>> vars_;
+
+  // All ops
+  vector<OpHandleBase> ops_;
+};
+```
+The variable handles are wrappers of `Variable`s, and the operator handles are wrappers of `OperatorBase`. Some `OpHandle`s are not wrappers of an `OperatorBase`, such as `NCCLAllReduceOpHandle`, because `AllReduceOpHandle` will use new device contexts.
+
+When the `ProgramDesc` is converted into an `SSA` graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem also needs to be taken care of. Dummy variables, which represent dependencies between operators, are manually inserted into the SSA graph to resolve the data hazard problem.
+
+## Execute SSA Graph
+
+The SSA graph can be executed out of order by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is:
+
+1. Maintain a map from each operator to the number of inputs it still needs.
+2. If a variable is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the needed input number of its pending operators.
+3. Whenever an operator's needed input number decreases to zero, run this operator.
+4. After running an operator, mark its output variables as generated and repeat step 2 until all variables are generated.
+
+Running an operator can be asynchronous. There is a thread pool to execute the `SSA` graph; a simplified sequential sketch of the scheduling loop is shown below.
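+
+In this sketch, `graph.ops`, `graph.vars` and `RunOp` are illustrative placeholders for the flattened op/variable handles and the dispatch into the thread pool; they are not names from the actual implementation.
+
+```c++
+// Simplified sketch of the approximate topological scheduling described above.
+std::unordered_map<OpHandleBase*, size_t> pending_inputs;
+std::queue<OpHandleBase*> ready_ops;
+
+for (OpHandleBase* op : graph.ops) {
+  pending_inputs[op] = op->inputs_.size();
+  if (op->inputs_.empty()) ready_ops.push(op);
+}
+// Variables without a generating op are ready from the start.
+for (VarHandleBase* var : graph.vars) {
+  if (var->generated_op_ == nullptr) {
+    for (OpHandleBase* op : var->pending_ops_) {
+      if (--pending_inputs[op] == 0) ready_ops.push(op);
+    }
+  }
+}
+
+while (!ready_ops.empty()) {
+  OpHandleBase* op = ready_ops.front();
+  ready_ops.pop();
+  RunOp(op);  // the real executor submits this to the thread pool
+  for (VarHandleBase* out : op->outputs_) {  // outputs are now generated
+    for (OpHandleBase* pending : out->pending_ops_) {
+      if (--pending_inputs[pending] == 0) ready_ops.push(pending);
+    }
+  }
+}
+```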
+
+## Synchronize GPU Kernels
+
+The GPU is a non-blocking device. Different streams need to be synchronized when switching between them. In the current implementation, synchronization is based on the following algorithm:
+
+1. Each `OpHandle` records the `DeviceContext` that it uses.
+2. In `OpHandle::Run`, if the `DeviceContext` of the current operator is different from the `DeviceContext` of any input variable, the operator waits for the operator that generates that input variable.
+
+The wait is implemented by one of two strategies:
+
+1. Invoke `DeviceContext->Wait()`, which waits until all operators on this device context have completed.
+2. Use `cudaStreamWaitEvent` to send an event to the stream. This is a non-blocking call; the wait is executed on the GPU.
+
+Generally, `cudaStreamWaitEvent` has better performance, but the `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed at runtime.
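+
+For reference, the event-based strategy roughly corresponds to the following CUDA runtime calls; `producer_stream` and `consumer_stream` are placeholders for streams created elsewhere, and error checking is omitted in this sketch.
+
+```c++
+// The producer stream records an event after the generating op finishes;
+// the consumer stream waits on it without blocking the CPU thread.
+cudaEvent_t ready_event;
+cudaEventCreateWithFlags(&ready_event, cudaEventDisableTiming);
+
+cudaEventRecord(ready_event, producer_stream);         // after the generating op
+cudaStreamWaitEvent(consumer_stream, ready_event, 0);  // before the consuming op
+```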
+
+## What's next?
+
+* Merging gradients of dense parameters has been done. However, merging of sparse parameters has not been done yet.
+* The CPU version of ParallelExecutor has not been implemented. The out-of-order logic will make CPU computation faster, too.
+* A better strategy to merge gradients can be introduced. We can shrink the gradients from `float32` to `int8` or `int4` while merging. It will significantly speed up multi-GPU training without much loss of precision.
+* Combine with the multi-node implementation. Thanks to out-of-order execution, send and recv operators can be blocking operators, and the transpiler does not need to worry about the best position for them.
diff --git a/doc/fluid/design/concepts/program.md b/doc/fluid/design/concepts/program.md
new file mode 100644
index 0000000000000000000000000000000000000000..cfcd21ecdb9d2844bf93ed98a56db09651077c40
--- /dev/null
+++ b/doc/fluid/design/concepts/program.md
@@ -0,0 +1,139 @@
+# Design Doc: PaddlePaddle Programs
+
+## Compile and Execution
+
+A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
+
+A simple example PaddlePaddle program can be found in [graph.md](../others/graph.md):
+
+```python
+x = layer.data("images")
+l = layer.data("label")
+y = layer.fc(x)
+cost = layer.mse(y, l)
+optimize(cost)
+train(cost, reader=mnist.train())
+```
+
+The first five lines of the above PaddlePaddle program generate, or compile, the `ProgramDesc` message. The last line runs it.
+
+## Programs and Blocks
+
+The basic structure of a PaddlePaddle program is a set of nested blocks, just like in a C++ or Java program.
+
+- program: some nested blocks
+- [block](./block.md):
+ - some local variable definitions, and
+ - a sequence of operators
+
+The concept of a block comes from ordinary programs. For example, the following C++ program has three blocks:
+
+```c++
+int main() { // block 0
+ int i = 0;
+ if (i < 10) { // block 1
+ for (int j = 0; j < 10; j++) { // block 2
+ }
+ }
+ return 0;
+}
+```
+
+The following PaddlePaddle program has three blocks:
+
+```python
+import paddle as pd  # block 0
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():  # block 1
+ d = pd.layer.add_scalar(x, y)
+ ie.output(d, pd.layer.softmax(d))
+with ie.false_block():  # block 2
+ d = pd.layer.fc(z)
+ ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+## `BlockDesc` and `ProgramDesc`
+
+All protobuf messages are defined in `framework.proto`.
+
+`BlockDesc` is straightforward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
+
+```protobuf
+message BlockDesc {
+ required int32 parent = 1;
+ repeated VarDesc vars = 2;
+ repeated OpDesc ops = 3;
+}
+```
+
+The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks.
+
+All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
+
+```protobuf
+message ProgramDesc {
+ repeated BlockDesc blocks = 1;
+}
+```
+
+
+### Global Block
+
+The global block is the first one in the above array.
+
+## Operators that Use Blocks
+
+In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
+
+The definition of `OpDesc` shows that an operator could have some attributes:
+
+```protobuf
+message OpDesc {
+ AttrDesc attrs = 1;
+ ...
+}
+```
+
+and an attribute could be of type block, which is, in fact, a block ID as described above:
+
+```
+message AttrDesc {
+ required string name = 1;
+
+ enum AttrType {
+ INT = 1,
+ STRING = 2,
+ ...
+ BLOCK = ...
+ }
+ required AttrType type = 2;
+
+ optional int32 block = 10; // when type == BLOCK
+ ...
+}
+```
+
+## InferShape
+
+With this design, the InferShape function should take the following parameters:
+
+```c++
+void InferShape(int current_block,
+ int current_operator,
+ ProgramDesc* program // might change VarDesc values.
+ ) {
+ ...
+}
+```
+
+where
+
+- `current_block` indexes into `ProgramDesc::blocks`,
+- `current_operator` indexes into `BlockDesc::ops`.
diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md
new file mode 100644
index 0000000000000000000000000000000000000000..dffee8e02bacbc99bdfa8c54f1a146de340ad778
--- /dev/null
+++ b/doc/fluid/design/concepts/python_data_feeding.md
@@ -0,0 +1,130 @@
+# Python Data Feeding
+
+In the former implementation of Paddle Fluid, there are two ways to feed data:
+
+- Use `reader_op` on the backend C++ side. This method only supports data feeding from recordio files and random data generators, but it supports many kinds of `decorated_readers`. For example, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details.
+
+- Feed data directly using `DataFeeder.feed()` in Python code. It is more flexible than the first way: many kinds of preprocessing steps can be performed before feeding using Python or any other language, instead of adding many uncommon `operators` on the C++ side. But this method is less efficient: the program cannot read the next mini-batch of data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance.
+
+In this document, we design a Python data feeding process that combines the efficiency of the first way and the flexibility of the second way. A data queue, `LoDTensorBlockingQueue`, is designed to be shared by the Python and C++ sides: a `LoDTensorArray` is pushed into the queue on the Python side, and `reader_op` on the C++ side reads the data out of the queue.
+
+
+## Design of LoDTensorBlockingQueue
+`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` that accepts `std::vector<LoDTensor>` items with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer construction of the `LoDTensorBlockingQueue`.
+
+```C++
+class LoDTensorBlockingQueueHolder;
+
+class LoDTensorBlockingQueue {
+  friend class LoDTensorBlockingQueueHolder;
+
+ private:
+  // `LoDTensorBlockingQueue` can only be constructed by
+  // `LoDTensorBlockingQueueHolder::InitOnce()`
+  LoDTensorBlockingQueue(size_t capacity, const std::vector<DDim>& dims);
+
+ public:
+  size_t Size() const { return queue_.Size(); }  // Get the current size of the queue
+
+  size_t Cap() const { return queue_.Cap(); }  // Get the capacity of the queue
+
+  void Close() { return queue_.Close(); }
+
+  bool IsClosed() const { return queue_.IsClosed(); }
+
+  // Block if Size() == Cap()
+  // Return false only when queue_.IsClosed() == true
+  bool Push(const std::vector<LoDTensor>& lod_tensor_vec);
+
+  // Block if Size() == 0.
+  // *success == false when queue_.IsClosed() == true
+  std::vector<LoDTensor> Pop(bool* success = nullptr);
+
+ private:
+  // Use reader::BlockingQueue as the inner data structure
+  BlockingQueue<std::vector<LoDTensor>> queue_;
+  std::vector<DDim> dims_;
+};
+
+class LoDTensorBlockingQueueHolder {
+ public:
+  // Call the constructor of `LoDTensorBlockingQueue` to create queue_.
+  // `InitOnce` can only be called once; otherwise an exception is raised.
+  void InitOnce(size_t capacity, const std::vector<DDim>& dims) {
+    PADDLE_ENFORCE(queue_ == nullptr);
+    queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
+  }
+
+  const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const { return queue_; }
+
+ private:
+  std::shared_ptr<LoDTensorBlockingQueue> queue_;
+};
+```
+
+There are some major points of concern:
+- `LoDTensorBlockingQueueHolder` should be a `Variable` in the global scope, so that `reader_op` can find it when reading data.
+- A `Variable` of `LoDTensorBlockingQueueHolder` (but not a `VarDesc`) must be created in Python code before `Executor::Run()`, so that `Executor::Run()` can get the feed data when it is called.
+- `Create_reader_op` should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input.
+
+
+## Release of the GIL in pybind
+`pybind11::gil_scoped_release` is used to release the GIL (Global Interpreter Lock) when the `LoDTensorBlockingQueue::Push()` or `Executor::Run()` methods are invoked from the Python side, so that `LoDTensorBlockingQueue::Push()` and `Executor::Run()` can run in parallel.
+
+
+## Design of PyReader
+`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object.
+```C++
+class PyReader : public ReaderBase {
+ public:
+  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue);
+
+  void ReadNext(std::vector<LoDTensor>* out) override {
+    bool success;
+    *out = queue_->Pop(&success);
+    if (!success) out->clear();
+  }
+
+  void ReInit() override { return; }
+
+ private:
+  std::shared_ptr<LoDTensorBlockingQueue> queue_;
+};
+```
+
+
+## Design of CreatePyReaderOp
+`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue` which indicates the name of the `LoDTensorBlockingQueueHolder` variable.
+```C++
+class CreatePyReaderOp : public framework::OperatorBase {
+ public:
+ using framework::OperatorBase::OperatorBase;
+ private:
+ void RunImpl(const framework::Scope& scope,
+ const platform::Place& dev_place) const override {
+ auto* out = scope.FindVar(Output("Out"))
+ ->template GetMutable();
+ if (out->Get() != nullptr) return;
+
+ const std::string& queue_name = Input("blocking_queue");
+ auto* queue_holder_var = scope.FindVar(queue_name);
+ PADDLE_ENFORCE(queue_holder_var != nullptr);
+ auto* queue_holder = queue_holder_var
+ ->template GetMutable();
+ out->Reset(new PyReader(queue_holder->GetQueue()));
+ }
+};
+```
+
+## Design of Python codes
+The design of the Python code is as follows. First, we construct a variable of `LoDTensorBlockingQueueHolder` and initialize it with the given parameters, obtaining the `LoDTensorBlockingQueue` object after initialization. After that, a `CreatePyReaderOp` layer is constructed, accepting the name of the `LoDTensorBlockingQueueHolder` variable. Both the `LoDTensorBlockingQueue` object and the result of the layer are returned.
+```Python
+def py_reader(capacity, shapes):
+ queue_name = unique_name.generate("lod_tensor_blocking_queue")
+ var = global_scope().var(queue_name) # create LoDTensorBlockingQueueHolder Variable
+ feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) # init the queue
+ out = create_var()
+ create_py_reader_op_with_queue_name(
+ inputs={'blocking_queue': queue_name},
+ outputs={'Out':[out]})
+ return out, feed_queue
+```
diff --git a/doc/fluid/design/concepts/scope.md b/doc/fluid/design/concepts/scope.md
new file mode 100644
index 0000000000000000000000000000000000000000..dcf76649357aaef80d6bc1a933ece8c4c1063547
--- /dev/null
+++ b/doc/fluid/design/concepts/scope.md
@@ -0,0 +1,124 @@
+# Design of Scope in Paddle
+
+## Overview
+
+Scope is an important concept in programming languages: it defines a program region in which a set of bindings between names and entities applies. In a specific scope, a valid name is uniquely associated with an entity, such as a variable. In another scope, the same name may refer to another entity or to nothing at all. A scope clearly restricts the visibility and validity of names in a program. Hence, **Scope** is introduced to PaddlePaddle to manage variables in context. But unlike the original abstract concept, Scope here is an object with two important attributes:
+
+- Scope is an association of a name to variable.
+- Variables in a parent scope can be retrieved from local scope.
+
+A detailed explanation of these two attributes goes as following.
+
+
+## Scope is an association of a name to variable.
+
+Scope is an association of a name to a variable. All variables belong to a `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variables in each scope.
+
+
+1. Scope only contains a map of a name to variable.
+
+ All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state (momentum) etc.
+
+1. A Variable can only be created by a Scope, and a variable can only be retrieved from a Scope. Users cannot create or get a variable outside a scope. This is a constraint of our framework that keeps it simple and clear.
+
+1. Scope only contains methods that are used to create and get Variables. Scope does not contain Operators and has no information to run them.
+   `Net` is designed to drive the computation, while Scope only contains a map of variables. There is no computation logic inside a `Scope`; Scope just handles the lifetime management of variables.
+ - `Create` is used to create a Variable by its name and add the mapping relation.
+ - `Get` is used to find a Variable by name.
+
+1. Every variable only belongs to one certain Scope.
+
+ A variable cannot belong to multiple scopes. If you want to use variables from a parent scope, you can access them through the parent scope.
+
+1. A Scope should destruct all Variables inside it when it is itself destructed. Users should never store a `Variable` pointer somewhere else.
+
+ Because a Variable can only be got from a Scope, when destroying a Scope we also need to destroy all the Variables in it. If a user stores a `Variable` pointer in a private data member or a global variable, the pointer becomes invalid when the associated `Scope` is destroyed.
+
+```cpp
+class Scope {
+ public:
+ Variable* Var(const std::string& name);
+ const Variable* FindVar(const std::string& name) const;
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+
+
+## Parent scope and local scope
+
+Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope.
+
+1. We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed.
+2. Variables in a parent scope can be retrieved from the local scopes of that parent scope, i.e., when a user gets a variable from a scope, the scope first searches for it locally. If there is no such variable in the local scope, the scope keeps searching its parent, until the variable is found or there is no parent left.
+
+```cpp
+class Scope {
+ public:
+  Scope(const std::shared_ptr<Scope>& scope) : parent_(scope) {}
+
+ Variable* FindVar(const std::string& name) const {
+ auto it = vars_.find(name);
+ if (it != vars_.end()) {
+ return it->second.get();
+ } else if (parent_ != nullptr) {
+ return parent_->FindVar(name);
+ } else {
+ return nullptr;
+ }
+ }
+
+ private:
+  std::shared_ptr<Scope> parent_{nullptr};
+};
+```
+
+In the `Scope` class, there is a private data member called `parent_`, a smart pointer to the parent scope. When a user gets a variable by its `name`, the `name` is first searched inside the current scope. If the variable cannot be found locally and the parent scope is not `nullptr`, the variable is searched inside the parent scope. The default value of the `parent_` pointer is `nullptr`, which means the scope is a global scope when `parent_` is `nullptr`.
+
+A local scope is very useful when we implement a Recurrent Neural Network. Each timestep of an RNN should be a `Net`, and each timestep's `Net` (`StepNet` for short) should use an independent local scope, just like the variables in a while loop are inside a local scope in programming languages. By using a single `StepNet` and changing the local scope, we can implement an RNN easily.
+
+## Interface Design
+
+```cpp
+class Variable {
+ private:
+ Variable() = default;
+ friend class Scope;
+};
+
+class Scope {
+ private:
+  Scope(const std::shared_ptr<Scope>& parent = nullptr);
+
+ public:
+  static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);
+
+  // return nullptr if not found.
+  Variable* FindVar(const std::string& name) const;
+
+  // return if already contains same name variable.
+  Variable* Var(const std::string& name);
+
+ private:
+  std::shared_ptr<Scope> parent_;
+  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+## Only scope can create a variable
+
+To ensure that only a scope can create a variable, we mark `Variable`'s constructor as a private member function and make Scope a friend class of Variable. Then only `Var` can construct a `Variable`.
+
+## When scope destroyed, all variables inside this scope should be destroyed together
+
+The scope holds unique pointers to all variables. A user can `FindVar` from a scope, but should not keep the returned pointer as a member variable, because when the scope is destroyed, all variables inside it are destroyed together.
+
+## Sharing a parent scope
+
+A local scope contains a `parent_` pointer, forming a linked list of scopes. We use a `shared_ptr` because while a local scope is in use, its parents must not be destroyed.
+
+Also, since the parent scope is held as a `shared_ptr`, we can only `Create()` a scope as a shared pointer. We cannot construct a scope as a plain variable, because it could not be passed to another scope as the `parent` pointer.
+
+## Orthogonal interface
+
+`FindVar` returns `nullptr` when `name` is not found, so it can be used as a `Contains` method. `Var` returns an `Error` when there is a local name conflict. By combining `FindVar` and `Var`, we can easily build higher-level helpers, as the sketch below shows.
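+
+In this sketch, which uses only the interface defined above, the names `global`, `local`, `has_w` and `has_b` are illustrative:
+
+```cpp
+// Sketch: FindVar doubles as a Contains check; variables created in a parent
+// scope are visible from its local scopes.
+std::shared_ptr<Scope> global = Scope::Create();
+std::shared_ptr<Scope> local = Scope::Create(global);
+
+global->Var("W");                               // create in the parent scope
+bool has_w = (local->FindVar("W") != nullptr);  // true: found via parent_
+bool has_b = (local->FindVar("b") != nullptr);  // false: not defined anywhere
+```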
diff --git a/doc/fluid/design/concepts/tensor.md b/doc/fluid/design/concepts/tensor.md
new file mode 100644
index 0000000000000000000000000000000000000000..0a27ac9bb6b03649d42e12100fda9e80a56e7f56
--- /dev/null
+++ b/doc/fluid/design/concepts/tensor.md
@@ -0,0 +1,189 @@
+# Tensor: A Unified Data Type in PaddlePaddle
+
+## Pain Point
+
+This week we discussed several potential weaknesses of PaddlePaddle caused by the rapid iteration and development needed to launch new business products online over the recent four years. For instance, the current Matrix/Vector implementation in PaddlePaddle is long and tedious to read, which seriously discourages contributions from both new and experienced engineers. Worse, it will also become too challenging to maintain over time.
+
+
+## Learn from Majel
+
+Consequently, we decided to refactor PaddlePaddle step by step. First, we refactor and replace Matrix/Vector with Tensor, the modern term in deep learning systems. Fortunately, we can learn from Majel how to define a Tensor.
+
+To simplify heterogeneous resource allocation for any number of dimensions (1-9) and element types (double, float, float16), Majel consists of several primitives such as `Dim`, `Place` and `Array`, all of which are standard C++ class templates.
+
+1. `Place`: memory location [e.g. CPU/GPU].
+2. `Allocation`: heterogeneous resource allocator [e.g. 20MB in GPU].
+3. `Dim`: size of each dimension [e.g. Dim<4>({10, 2, 5, 1})].
+4. `Array`: dynamic array consisting of `Place`, `Dim`, and a pointer to memory.
+
+If you dig deeper into the Majel source code, you will find that Majel heavily uses `boost.variant`. The variant class template is a safe, generic, stack-based discriminated union container, **offering a simple solution for manipulating an object from a heterogeneous set of types in a uniform manner**. Whereas standard containers such as std::vector may be thought of as "multi-value, single type," variant is "multi-type, single value."
+
+As a simple example, consider the following:
+
+```c++
+#include "boost/variant.hpp"
+#include <iostream>
+#include <string>
+
+class my_visitor : public boost::static_visitor<int>
+{
+public:
+ int operator()(int i) const
+ {
+ return i;
+ }
+
+ int operator()(const std::string & str) const
+ {
+ return str.length();
+ }
+};
+
+int main()
+{
+ boost::variant< int, std::string > u("hello world");
+ std::cout << u; // output: hello world
+
+ int result = boost::apply_visitor( my_visitor(), u );
+ std::cout << result; // output: 11 (i.e., length of "hello world")
+}
+```
+
+In Majel, `DDimVar` is derived from `Dim`, `DArrayVar` is from `Array`.
+
+```c++
+template <int i>
+struct Dim {
+  ...
+  int head;
+  Dim<i - 1> tail;
+};
+```
+
+```c++
+template <typename T, int D>
+class Array : public Buffer {
+  ...
+ private:
+  Dim<D> size_;
+  Dim<D> stride_;
+  T* ptr_;
+};
+```
+
+```c++
+typedef boost::variant<CpuPlace, GpuPlace> Place;
+typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
+                       Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar;
+typedef boost::variant<
+    Array<float, 1>,
+    Array<float, 2>,
+    Array<float, 3>,
+    Array<float, 4>,
+
+    Array<double, 1>,
+    Array<double, 2>,
+    Array<double, 3>,
+    Array<double, 4>,
+
+    Array<float16, 1>,
+    Array<float16, 2>,
+    Array<float16, 3>,
+    Array<float16, 4> > DArrayVar;
+```
+
+Because `variant` may be thought of as "multi-type, single value", we can utilize it to implement unified interfaces for PaddlePaddle.
+
+`DDim` plays two kinds of roles in Majel. First, it is used to indicate the size of a tensor. For example, we can construct a new `DArray` in the following way:
+
+ ```c++
+ DArray arr = make_darray(make_ddim({2,3}), 0.0f);
+ ```
+ It means that `arr` will be a two-dimensional tensor, or a matrix. The size of its first dimension is 2 and the second is 3. All the elements of `arr` will be initialized to 0.0.
+
+ The second meaning of `DDim` is as a tensor index. For example, if we want to access the value in the 1st row and 2nd column of `arr` and set it to 1.0, we can do it like this:
+
+ ```c++
+ arr[make_ddim({0, 1})] = 1.0;
+ ```
+
+## Implement Tensor in Paddle
+
+We want to create a Tensor class to replace Vector and Matrix, and to support high-dimensional data. The operations on Tensor are implemented in both CPU and GPU. We also want to make sure that the Tensor interface is friendly to its callers.
+
+Tensor is only responsible for describing computation. It does not take charge of memory allocation policy, handles of CUDA library contexts (e.g. cublasHandle, cudnnHandle), or dispatching CUDA kernels. Paddle already implements hardware initialization and resource management.
+
+Before writing code, please make sure you have already looked through the Majel source code and grasped the design philosophy of `DArray` in Majel.
+
+
+### Memory Management
+`Allocation` manages a block of memory on a device (CPU/GPU). We use `Place` to describe the memory location. The details of memory allocation and deallocation are implemented in `Allocator` and `DeAllocator`. Related low-level APIs such as `hl_malloc_device()` and `hl_malloc_host()` are provided by Paddle.
+
+### Dim and Array
+#### Dim
+
+`Dim` describes the dimension information of an array.
+
+`DDimVar` is an alias of a specialized class of the boost.variant class template.
+
+`DDim` is introduced to represent a dynamically sized dimension.
+
+For example:
+
+```
+Dim<2> d1 = make_dim(3, 3);
+DDim d2 = make_ddim({1, 2, 3});
+```
+
+You must specify a concrete number of dimensions for `Dim`, whereas `DDim` can represent a dynamically sized dimension.
+
+#### Array
+
+`Array` represents a tensor with a specific type and size.
+
+`DArrayVar` is an alias of a specialized class of the boost.variant class template.
+
+`DArray` is introduced to represent a dynamically typed array.
+
+For example:
+
+```
+Array<float, 2> a1(Dim<2>(2, 2));
+DArray a2 = make_darray(make_ddim({3, 4}), 0.0, CpuPlace());
+```
+
+You must specify the type and number of dimensions of an `Array`, whereas `DArray` can represent a dynamically typed array.
+
+
+Please refer to the `Learn from Majel` section for more details.
+
+### ArrayView
+
+`ViewIterator` is a class template which implements basic iterator operations, including increment (++), decrement (--), dereference (*), equality comparison (==) and so on.
+
+`ArrayView` is an encapsulation of `Array`, which introduces extra iterator methods, such as `begin()` and `end()`. The `begin()` method returns an iterator pointing to the first element in the ArrayView, and the `end()` method returns an iterator pointing to the past-the-end element in the ArrayView.
+
+`ArrayView` makes visiting and manipulating an array more efficient, flexible, and safe.
+
+
+A global function `make_view` is provided to transform an array into the corresponding ArrayView.
+
+```c++
+template <typename T, int D>
+ArrayView<T, D> make_view(const Array<T, D>& in) {
+  return in;
+}
+```
+
+A global function `make_iterator` is provided to make iterator of an array.
+
+```c++
+template <typename T, int D>
+ViewIterator<ArrayView<T, D>> make_iterator(const Array<T, D>& in, Dim<D> idx) {
+  return make_iterator(make_view(in), idx);
+}
+```
+
+### Basic Operations
+
+The operations that manipulate DArray are defined as global functions, such as `ones`, `zeros`, `reshape`, `gemm` and so on.
+
+An array will be transformed into an ArrayView and then passed to the operation launched on a specific device (CPU/GPU).
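+
+To make the dispatch idea concrete, here is a minimal, self-contained sketch (not the real Majel/Paddle code; `Array` and `SimpleArrayVar` below are simplified stand-ins for Majel's `Array` and `DArrayVar`) showing how a global operation can use `boost::apply_visitor` to act on whichever concrete `Array` type the variant currently holds.
+
+```c++
+#include "boost/variant.hpp"
+#include <iostream>
+#include <vector>
+
+// Simplified stand-in for Majel's Array: element type only, no Dim/Place.
+template <typename T>
+struct Array {
+  std::vector<T> data;
+};
+
+typedef boost::variant<Array<float>, Array<double>> SimpleArrayVar;
+
+// Visitor that works for every type stored in the variant.
+struct ZerosVisitor : public boost::static_visitor<> {
+  template <typename T>
+  void operator()(Array<T>& arr) const {
+    for (auto& v : arr.data) v = T(0);  // the "kernel" for this element type
+  }
+};
+
+// Global operation: fill any SimpleArrayVar with zeros, whatever its element type.
+void zeros(SimpleArrayVar& arr) { boost::apply_visitor(ZerosVisitor(), arr); }
+
+int main() {
+  SimpleArrayVar a = Array<float>{{1.0f, 2.0f, 3.0f}};
+  zeros(a);
+  std::cout << boost::get<Array<float>>(a).data[0] << std::endl;  // prints 0
+}
+```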
diff --git a/doc/fluid/design/concepts/tensor_array.md b/doc/fluid/design/concepts/tensor_array.md
new file mode 100644
index 0000000000000000000000000000000000000000..37e4f7b90f94fa3eb015e733999cd84c96b2239c
--- /dev/null
+++ b/doc/fluid/design/concepts/tensor_array.md
@@ -0,0 +1,271 @@
+# Design for TensorArray
+This design doc presents the necessity of a new C++ class `TensorArray`.
+In addition to the very simple C++ implementation
+
+```c++
+class TensorArray {
+ public:
+ explicit TensorArray(const LoDTensor&);
+ explicit TensorArray(size_t size);
+
+ private:
+  std::vector<LoDTensor> values_;
+};
+```
+
+We also need to expose it to PaddlePaddle's Python API,
+because users would want to use it with our very flexible operators, such as `WhileLoop`.
+An example of an RNN based on dynamic operators is
+
+```python
+input = pd.data(...)
+num_steps = Var(12)
+
+states = TensorArray(size=num_steps)
+step_inputs = TensorArray(unstack_from=input)
+step_outputs = TensorArray(size=num_steps)
+
+W = Tensor(...)
+U = Tensor(...)
+default_state = some_op()
+
+step = Var(1)
+
+wloop = paddle.create_whileloop(loop_vars=[step])
+with wloop.frame():
+    wloop.break_if(pd.equal(step, num_steps))
+    pre_state = states.read(step - 1, default_state)
+    step_input = step_inputs.read(step)
+    state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input))
+    states.write(step, state)
+    step_outputs.write(step, state)  # output state
+    step.update(step + 1)
+
+output = step_outputs.stack()
+```
+
+## Background
+Steps are one of the core concepts of RNNs. In each time step of an RNN, there are several input segments, states, and output segments; all these components act like arrays. For example, calling `states[step_id]` will get the state of the `step_id`-th time step.
+
+An RNN can be implemented with the following pseudocode
+
+```c++
+Array states;
+Array input_segments;
+Array output_segments;
+Parameter W, U;
+
+step = 1
+seq_len = 12
+while_loop {
+ if (step == seq_len) break;
+ states[step] = sigmoid(W * states[step-1] + U * input_segments[step]);
+ output_segments[step] = states[step] // take state as output
+ step++;
+}
+```
+According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support.
+
+Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`.
+
+
+Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements a tensor with level of details (`LoDTensor` for short).
+Segmenting a `LoDTensor` is much more complicated than splitting a tensor, which makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support.
+
+As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences.
+
+The implementation is similar to `recurrent_op`.
+The key difference is the way **the original input `LoDTensors` and outputs are split to get the `input_segments` and the `output_segments`.**
+
+
+Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly,
+the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same.
+
+## Why `TensorArray`
+The logic behind splitting the inputs into segments, states and outputs is similar and can be shared in a separate module.
+
+The arrays of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudocode.
+
+So there should be an array-like container, which can store the segments of a tensor or LoD tensor.
+
+**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor**.
+This is where the notion of `TensorArray` comes from.
+
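+The sketch below illustrates that idea with a minimal, self-contained C++ container (the `LoDTensor` here is a stub, and `Read`/`Write`/`size` are illustrative names rather than the final interface): an array of tensors with index-based access, onto which the split/merge methods described later can be attached.
+
+```c++
+#include <cstddef>
+#include <stdexcept>
+#include <vector>
+
+struct LoDTensor {};  // stub standing in for the real LoDTensor
+
+class TensorArray {
+ public:
+  explicit TensorArray(size_t size) : values_(size) {}
+
+  // Read the value at `index`; acts like states[step_id] in the pseudocode.
+  const LoDTensor& Read(size_t index) const {
+    if (index >= values_.size()) throw std::out_of_range("TensorArray::Read");
+    return values_[index];
+  }
+
+  // Write `value` at `index`, growing the array if necessary.
+  void Write(size_t index, const LoDTensor& value) {
+    if (index >= values_.size()) values_.resize(index + 1);
+    values_[index] = value;
+  }
+
+  size_t size() const { return values_.size(); }
+
+ private:
+  std::vector<LoDTensor> values_;
+};
+
+int main() {
+  TensorArray states(3);
+  states.Write(2, LoDTensor{});
+  const LoDTensor& s = states.Read(2);
+  (void)s;
+  return 0;
+}
+```
+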
+## Introduce TensorArray to unify all three RNNs
+`TensorArray`, as a new concept, is borrowed from TensorFlow;
+it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`.
+
+This concept can be used to support our new design of dynamic operations, and helps to refactor some existing variable-length-sequence-related layers,
+such as `recurrent_op` and `RecurrentGradientMachine`.
+
+In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401),
+`TensorArray` is used to segment inputs and store states in all time steps.
+By providing some methods similar to a C++ array,
+the definition of some state-based dynamic models such as RNN can be more natural and highly flexible.
+
+## Dynamic-operations on TensorArray
+
+`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented
+
+```python
+# several helper operators for TensorArray
+def tensor_array_stack(ta, tensor):
+ '''
+ get a tensor array `ta`, return a packed `tensor`.
+ '''
+ pass
+
+def tensor_array_unstack(tensor, ta):
+ '''
+ get a `tensor`, unstack it and get a tensor array `ta`.
+ '''
+ pass
+
+def tensor_array_write(ta, index, tensor, data_shared):
+ '''
+ get a `tensor` and a scalar tensor `index`, write `tensor` into index-th
+ value of the tensor array `ta`.
+ `data_shared` is an attribute that specifies whether to copy or reference the tensors.
+ '''
+ pass
+
+def tensor_array_read(ta, index, tensor):
+ '''
+ get a tensor array `ta`, a scalar tensor `index`, read the index-th value of
+ `ta` and return as the `tensor`.
+ '''
+ pass
+
+def tensor_array_size(ta, tensor):
+ '''
+ get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`.
+ '''
+ pass
+```
+
+Using so many low-level operators directly is tedious for users, so some helper methods should be provided in the Python wrapper to make `TensorArray` easier to use,
+for example
+
+```python
+class TensorArray:
+    def __init__(self, name):
+        self.name = name
+        self.desc = TensorArrayDesc()
+
+    def stack(self, name=None):
+        '''
+        Pack the values in a `TensorArray` into a tensor with rank one higher
+        than each tensor in `values`.
+        `stack` can be used to merge all the time steps back into one tensor
+        for an RNN or while-loop.
+
+        @name: str
+            the name of the variable to output.
+        '''
+        tensor = Var(name)
+        tensor_array_stack(self.name, tensor)
+        return tensor
+
+    def unstack(self, input):
+        '''
+        Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
+        `unstack` can be used to split a tensor into time steps for an RNN or
+        while-loop.
+
+        @input: str
+            the name of the input tensor
+        '''
+        tensor_array_unstack(input, self.name)
+
+    def write(self, index, value, data_shared=True):
+        '''
+        Write value into index of the TensorArray.
+        If `data_shared` is set to True, then the index-th value in TensorArray
+        will be shared with the tensor passed in.
+
+        @index: str
+            name of a scalar tensor
+        @value: str
+            name of a tensor
+        @data_shared: bool
+        '''
+        tensor_array_write(self.name, index, value, data_shared)
+
+    def read(self, index, output):
+        '''
+        Read the value at location `index` in the `TensorArray`.
+
+        @index: str
+            name of a scalar tensor
+        @output: str
+            name of an output variable
+        '''
+        tensor_array_read(self.name, index, output)
+
+    def size(self, output):
+        '''
+        Return the number of values.
+
+        @output: str
+            name of a scalar tensor
+        '''
+        tensor_array_size(self.name, output)
+
+## LoDTensor-related Supports
+The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variable-length sequences as input and outputs sequences too.
+
+Since each step of an RNN can only take a tensor-represented batch of data as input,
+some preprocessing has to be applied to the inputs, such as sorting the sentences by length in descending order, then cutting each timestep's words and packing them into new batches.
+
+Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`;
+these two operations are similar to `stack` and `unstack`, except that they operate on variable-length sequences formatted as a LoD tensor rather than a plain tensor.
+
+Some definitions look like
+
+```python
+def unpack(level, sort_by_length):
+    '''
+    Split a LoDTensor at some `level` and generate batches; if `sort_by_length`
+    is set, the sequences will be sorted by length first.
+
+    Returns:
+        - a new `TensorArray`, whose values are LoDTensors and represent batches
+          of data.
+        - an int32 Tensor, which stores the mapping from the new batch's indices
+          to the original LoDTensor.
+    '''
+    pass
+
+def pack(level, indices_map):
+    '''
+    Recover the original LoD-arranged LoDTensor from the values in a `TensorArray`,
+    given `level` and `indices_map`.
+    '''
+    pass
+```
+
+With these two methods, an RNN that supports variable-length sentences can be implemented like
+
+```c++
+// input is the variable-length data
+LoDTensor sentence_input(xxx);
+TensorArray ta;
+Tensor indice_map;
+Tensor boot_state = xxx; // to initialize rnn's first state
+TensorArray::unpack(sentence_input, 1/*level*/, true/*sort_by_length*/, &ta, &indice_map);
+TensorArray step_outputs;
+TensorArray states;
+
+for (int step = 0; step < ta.size(); step++) {
+  auto state = states.read(step);
+  // rnnstep is a function which acts like a step of RNN
+  auto step_input = ta.read(step);
+  auto step_output = rnnstep(step_input, state);
+  step_outputs.write(step, step_output, true/*data_shared*/);
+}
+
+// rnn_output is the final output of an rnn
+LoDTensor rnn_output = step_outputs.pack(1/*level*/, indice_map);
+```
+The code above shows that, by embedding the LoDTensor-related preprocessing operations into `TensorArray`,
+the implementation of an RNN that supports variable-length sentences is far more concise than `RecurrentGradientMachine`, because the latter mixes all the code together, which makes it hard to read and extend.
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
new file mode 100644
index 0000000000000000000000000000000000000000..8db67f6703d142da71cf06bd4f7e2cb13556f9b0
--- /dev/null
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -0,0 +1,100 @@
+# Design Doc: Var_desc
+
+## Background
+PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
+
+PaddlePaddle uses proto message to describe compile time program because :
+
+1. The computation program description must be serializable and saved in a file.
+1. During distributed training, the serialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on a different worker.
+
+The computation `Program` consists of nested `Blocks`. Each `Block` consists of data (i.e. `Variable`) and `Operations`. The concepts used to represent them are shown in the table below.
+
+
+|           | compile time   | runtime       |
+|-----------|----------------|---------------|
+| Data      | VarDesc(proto) | Variable(cpp) |
+| Operation | OpDesc(proto)  | Operator(cpp) |
+
+
+## Definition of VarType
+
+A VarDesc should have a name, a type, and a flag indicating whether or not it is persistable. Apart from the POD types, there are several other kinds of variable types supported in PaddlePaddle: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks like the following:
+
+```proto
+message VarDesc {
+ required string name = 1;
+ required VarType type = 2;
+ optional bool persistable = 3 [ default = false ];
+}
+```
+
+## Definition of TensorDesc
+
+```proto
+message TensorDesc {
+ // Should only be PODType. Is enforced in C++
+ required Type data_type = 1;
+ repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+```
+
+The `Type` here comes from the enum defined inside of `VarType` :
+
+```proto
+enum Type {
+ // Pod Types
+ BOOL = 0;
+ INT16 = 1;
+ INT32 = 2;
+ INT64 = 3;
+ FP16 = 4;
+ FP32 = 5;
+ FP64 = 6;
+
+ // Other types that may need additional descriptions
+ LOD_TENSOR = 7;
+ SELECTED_ROWS = 8;
+ FEED_MINIBATCH = 9;
+ FETCH_LIST = 10;
+ STEP_SCOPES = 11;
+ LOD_RANK_TABLE = 12;
+ LOD_TENSOR_ARRAY = 13;
+ PLACE_LIST = 14;
+ READER = 15;
+ CHANNEL = 16;
+}
+```
+
+A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please refer to [`SelectedRows`](./selected_rows.md).
+
+## Definition of LodTensorDesc
+
+```proto
+message LoDTensorDesc {
+ required TensorDesc tensor = 1;
+ optional int32 lod_level = 2 [ default = 0 ];
+}
+```
+
+A LoDTensorDesc contains a tensor and a lod_level.
+
+## Definition of Variable in Python
+
+For Variable in Python, please reference [`Python API`](./python_api.md).
diff --git a/doc/fluid/design/concepts/variable.md b/doc/fluid/design/concepts/variable.md
new file mode 100644
index 0000000000000000000000000000000000000000..442ef6b718b227d79ca73031efcbb55817558252
--- /dev/null
+++ b/doc/fluid/design/concepts/variable.md
@@ -0,0 +1,52 @@
+# Design Doc: Variable
+
+
+A Variable is also known as a *blob* in MxNet and Caffe2. It is the input and output type of operators; a neural network is a graph of operators.
+
+## Requirements: Lazy Memory Allocation
+
+For the flexibility of a DL system, a variable should be able to contain any typed value -- a tensor in most cases, but could also be some integer IDs or a scope of other variables in the case of RNN.
+
+To use the minimum amount of memory, we would like that a variable allocates memory only when it has to, or, lazy memory allocation. Let's take the following example:
+
+```cpp
+Variable vr, v1, v2;
+
+Tensor* t1 = new Tensor();
+Tensor* t2 = new Tensor();
+
+Randomize(
+    /* malloc */ v1.GetMutable<Tensor>()->mutable_data<float16>(DDim(100, 200)),
+    /* size */ t1->Size());
+
+Randomize(
+    /* malloc */ v2.GetMutable<Tensor>()->mutable_data<float16>(DDim(200, 300)),
+    /* size */ t2->Size());
+
+Mult(
+    /*result*/ vr.GetMutable<Tensor>()->mutable_data<float16>(SizeOfMult(v1, v2)),
+    /*input1*/ v1.Get<Tensor>().data(),
+    /*input2*/ v2.Get<Tensor>().data());
+```
+
+We see that a variable holds nothing until `Variable::GetMutable<Tensor>()` allocates a tensor and puts it in the variable. Similarly, a tensor gets its memory only when `Tensor::mutable_data()` is called.
+
+This syntax performs lazy memory allocation when we call `Randomize` and `Mult`, the functions that mutate the variables, so it saves us some lines of C++ code.
+
+
+## Implementation: Type Hiding
+
+To make memory allocation lazy, we cannot assume that we know the type held by a variable at definition time. In other words, `class Variable` cannot be a template `template <typename T> class Variable`.
+
+Because we don't know the type `T`, we cannot save a `T*` as `Variable`'s data member. Instead, we save an interface object `Placeholder`, which can return the pointer to the saved object via `Placeholder::Ptr()` as `void*`.
+
+However, `Variable` still needs to know `T` so that it can `delete` the pointer correctly, and so that `Variable::Get` can check the expected type against the saved object's type.
+
+We save `T` in `PlaceholderImpl`, the implementation of `Placeholder`. Please be aware that `PlaceholderImpl` is a class template and `T` is passed in as a template parameter.
+
+Because `PlaceholderImpl` knows `T`, it can save and return `typeid(T)` for the type comparison in `Variable::Get` and `Variable::GetMutable`.
+
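+A minimal, self-contained sketch of this type-hiding technique is shown below. It is a simplified illustration rather than Paddle's actual `Variable` implementation; error handling is reduced to `assert`.
+
+```cpp
+#include <cassert>
+#include <memory>
+#include <typeindex>
+#include <typeinfo>
+
+class Variable {
+ public:
+  template <typename T>
+  const T& Get() const {
+    assert(holder_ != nullptr);
+    assert(std::type_index(typeid(T)) == holder_->Type());  // type check via RTTI
+    return *static_cast<const T*>(holder_->Ptr());
+  }
+
+  template <typename T>
+  T* GetMutable() {
+    if (holder_ == nullptr || std::type_index(typeid(T)) != holder_->Type()) {
+      holder_.reset(new PlaceholderImpl<T>(new T()));  // lazy allocation
+    }
+    return static_cast<T*>(holder_->Ptr());
+  }
+
+ private:
+  // Type-erased interface: Variable only sees this.
+  struct Placeholder {
+    virtual ~Placeholder() {}
+    virtual void* Ptr() const = 0;
+    virtual std::type_index Type() const = 0;
+  };
+
+  // The implementation remembers T, so it can delete correctly and report typeid(T).
+  template <typename T>
+  struct PlaceholderImpl : public Placeholder {
+    explicit PlaceholderImpl(T* ptr) : ptr_(ptr) {}
+    void* Ptr() const override { return ptr_.get(); }
+    std::type_index Type() const override { return std::type_index(typeid(T)); }
+    std::unique_ptr<T> ptr_;
+  };
+
+  std::unique_ptr<Placeholder> holder_;  // nullptr until GetMutable<T>() is called
+};
+
+int main() {
+  Variable v;
+  *v.GetMutable<int>() = 42;  // allocates an int lazily
+  return v.Get<int>() - 42;   // returns 0
+}
+```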
+
+## Conclusion
+
+The type-hiding technique utilizes C++ class templates, interfaces and derivation, and C++ RTTI (`typeid`). This combination saves us from defining something like `caffe2::TypeMeta`, which takes hundreds of lines of C++ code.
diff --git a/doc/fluid/design/concurrent/channel.md b/doc/fluid/design/concurrent/channel.md
new file mode 100644
index 0000000000000000000000000000000000000000..df67438bcc741ac521b00ee962fc13c93db21182
--- /dev/null
+++ b/doc/fluid/design/concurrent/channel.md
@@ -0,0 +1,139 @@
+# Channel Design
+
+## Introduction
+
+A Channel is a data structure that allows for synchronous interprocess
+communication via message passing. It is a fundamental component of CSP
+(communicating sequential processes), and allows users to pass data
+between threads without having to worry about synchronization.
+
+## How to use it
+
+Paddle offers python APIs to open and close channels, along with sending
+and receiving data to/from a channel.
+
+### Create a channel
+
+Creates a new channel that takes in variables of a specific dtype.
+
+- **fluid.make_channel(dtype, capacity=0)**
+ - **dtype**: The data type of variables being sent/received through channel
+ - **capacity**: The capacity of the channel. A capacity of 0 represents
+ an unbuffered channel. Capacity > 0 represents a buffered channel
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=10)
+```
+
+### Close a channel
+
+Closes a channel. Any pending senders and receivers will be awoken during
+this time. Receivers can still receive from a closed channel, but senders
+are not allowed to send any additional data to the channel (Paddle will
+raise an exception if users try to send to a closed channel.)
+
+- **fluid.channel_close(channel)**
+
+```
+fluid.channel_close(ch)
+```
+
+### Send data to a channel
+
+Sends a variable to a channel. Currently, variables of dtype `LoDTensor`,
+`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
+`ChannelHolder` are supported.
+
+By default, the data of the Variable is moved from the sender to the receiver,
+however the user can optionally copy the data before performing the send.
+
+- **channel_send(channel, variable, is_copy=False)**
+ - **channel**: The channel to send the variable to
+ - **variable**: The variable to send to the channel
+ - **is_copy**: If set to True, channel_send will perform a variable assign
+ to copy the source variable to a new variable to be sent.
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+var = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=100)
+fluid.channel_send(ch, var, is_copy=True)
+```
+
+### Receive data from a channel
+
+Receives a variable from a channel. The data of the variable is moved to the
+receiving variable.
+
+- **channel_recv(channel, return_variable)**
+ - **channel**: The channel to receive the variable from
+ - **return_variable**: The destination variable used to store the data of the
+ variable received from the channel
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+var = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
+fluid.channel_recv(ch, var)
+```
+
+## How it Works
+
+Channels provide a simple interface for different threads to share data.
+To support the synchronization requirements, a channel utilizes a set of
+internal queues, locks, and condition variables; a minimal sketch of this
+machinery appears after the queue descriptions below.
+
+### QueueMessage
+
+QueueMessage encapsulates the state of the channel send/receive operation to be
+put in the **sendq/recvq**. It contains a condition variable used to lock the
+thread (when there are no available sends/receives). In addition, it contains
+a callback function to notify a thread when the QueueMessage is being
+processed by the channel.
+
+### Queues
+
+- **buff_**: This queue holds the data buffer in a buffered channel. The
+capacity is set to the capacity of the channel. This data buffer is not
+used in an unbuffered channel.
+
+- **sendq**: This queue holds the QueueMessage of any pending senders of a
+channel. When a thread performs a channel_send operation on the channel, the
+channel_send operation will put a new QueueMessage on the sendq and block the
+current thread under two conditions:
+ 1. The channel is buffered and is full
+ 2. The channel is unbuffered and does not have a receiver
+
+- **recvq**: This queue holds the QueueMessage of any pending receivers of a
+channel. When a thread performs a channel_recv operation on the channel, the
+channel_recv operation will put a new QueueMessage on the recvq and block the
+current thread under two conditions:
+ 1. The channel is buffered and there is no data on the buff_
+ 2. The channel is unbuffered and does not have a sender
+
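+The C++ sketch below is a simplified illustration of the machinery described above, not Paddle's actual `Channel` implementation: a bounded buffer guarded by a mutex, where senders block while the buffer is full and receivers block while it is empty (the unbuffered case and the explicit `sendq`/`recvq` bookkeeping are omitted for brevity).
+
+```c++
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
+template <typename T>
+class BufferedChannel {
+ public:
+  explicit BufferedChannel(size_t capacity) : cap_(capacity) {}
+
+  void Send(T item) {
+    std::unique_lock<std::mutex> lock(mu_);
+    send_cv_.wait(lock, [this] { return buff_.size() < cap_; });  // block if full
+    buff_.push_back(std::move(item));
+    recv_cv_.notify_one();  // wake one pending receiver
+  }
+
+  T Recv() {
+    std::unique_lock<std::mutex> lock(mu_);
+    recv_cv_.wait(lock, [this] { return !buff_.empty(); });  // block if empty
+    T item = std::move(buff_.front());
+    buff_.pop_front();
+    send_cv_.notify_one();  // wake one pending sender
+    return item;
+  }
+
+ private:
+  const size_t cap_;
+  std::mutex mu_;
+  std::condition_variable send_cv_, recv_cv_;
+  std::deque<T> buff_;  // plays the role of buff_ described above
+};
+```
+
+For example, one thread can call `Send` while another calls `Recv`; closing semantics and the unbuffered path would be layered on top of this basic structure.
+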
+### State diagram
+
+#### Channel Send
+
+![Channel send state diagram](./images/channel_send.png)
+
+#### Channel Receive
+
+![Channel receive state diagram](./images/channel_recv.png)
+
+## Limitations and Considerations
+
+### Variable Copy
+
+In golang, variables in channels are copied from the sender to the receiver.
+In Paddle, the data from our variables are **moved** from sender to receiver.
+As a result, these variables should not be used after they are sent. We
+provide a flag in channel_send method to allow users to copy the variable to
+be sent before it is sent.
+
+Please note that this is achieved by adding an **assign** operator and creating
+a temporary variable that is sent in place of the original variable. Also note
+that the **assign** operator only supports certain variable datatypes.
diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md
new file mode 100644
index 0000000000000000000000000000000000000000..0428e74f9e00a87f6b0972057f48479b8ae56ad6
--- /dev/null
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@@ -0,0 +1,193 @@
+# Design Doc: Concurrent Programming with Fluid
+
+With PaddlePaddle Fluid, users describe a program rather than a model. The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, rather than the program that trains/uses the model.
+
+Many know that when we program TensorFlow, we can specify the device on which each operator runs. This allows us to create a concurrent/parallel AI application. An interesting question is **how does a `ProgramDesc` represent a concurrent program?**
+
+The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program. So users can write a concurrent program just as they would in any concurrent programming language, e.g., [Go](https://golang.org).
+
+## An Analogy
+
+The following table compares concepts in Fluid and Go
+
+| Go | Fluid |
+|----|-------|
+| user-defined functions | layers |
+| control-flow and built-in functions | intrinsics/operators |
+| goroutines, channels | class threads and channels |
+| runtime | class Executor |
+
+## An Example Concurrent Program
+
+To review all the above concepts with an example, let us take a simple program and write its distributed version.
+
+Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors.
+
+```go
+import "fluid"
+
+func paddlepaddle() {
+ X = fluid.read(...)
+ W = fluid.Tensor(...)
+ Y = fluid.mult(X, W)
+}
+```
+
+Please be aware that Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function, which, in this case, is defined in the above program and creates the following `ProgramDesc` message.
+
+```protobuf
+message ProgramDesc {
+ block[0] = Block {
+ vars = [X, W, Y],
+ ops = [
+ read(output = X)
+ assign(input = ..., output = W)
+ mult(input = {X, W}, output = Y)
+ ],
+ }
+}
+```
+
+Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in the above `ProgramDesc` message.
+
+The default `main` function is defined as follows:
+
+```go
+func main() {
+ paddlepaddle()
+ fluid.run()
+}
+```
+
+## The Concurrent Version
+
+By parallelizing the above program, we could support a very big tensor X by splitting it into small pieces {x_1, x_2, ...} and sending each piece to a worker process/node for parallel multiplication.
+
+In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for worker processes/nodes.
+
+### The Master Program
+
+The master program could look like the following:
+
+```protobuf
+message ProgramDesc {
+ block[0] = Block {
+ vars = [X, L, Y],
+ ops = [
+ read(output = X)
+ kube_get_workers_addrs(output = L)
+ Y = tensor_array(len(L))
+ parallel_for(input = X, output = Y,
+ attrs = {L, block_id(1)}) # referring to block 1
+ ]
+ }
+
+ block[1] = Block {
+ parent = 0,
+ vars = [x, y, index],
+ ops = [
+ slice(input = [X, index], output = x) # index is initialized by parallel_for
+ send(input = x, attrs = L[index])
+ recv(outputs = y, attrs = L[index])
+ assign(input = y, output = Y[index])
+ ]
+ }
+}
+```
+
+The equivalent Fluid program (calling the Go binding) is:
+
+```go
+func main() { //// block 0
+ X = fluid.read(...)
+ L = fluid.k8s.get_worker_addrs()
+ Y = fluid.tensor_array(len(L))
+ fluid.parallel_for(X, L,
+ func(index int) { //// block 1
+ x = X[index]
+ fluid.send(L[index], x)
+ y = fluid.recv(L[index])
+ Y[index] = y
+ })
+}
+```
+
+An explanation of the above program:
+
+- `fluid.k8s` is a package that provides access to Kubernetes API.
+- `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
+
+ 1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
+ 2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread
+ 1. creates an Executor instance, and
+ 2. calls `Executor.Run(block)`, where `block` is block 1 as explained above.
+1. Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0.
+
+### The Worker Program
+
+The worker program looks like
+
+```go
+func main() {
+ W = Tensor(...)
+ x = fluid.listen_and_do(
+ fluid.k8s.self_addr(),
+ func(input Tensor) {
+ output = fluid.mult(input, W)
+ })
+}
+```
+
+where
+
+- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed,
+ 1. listens on the current pod's IP address, as returned by `fluid.k8s.self_addr()`,
+ 2. once a connection is established,
+ 1. creates a scope of two parameters, "input" and "output",
+ 2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h) and saves it into "input",
+ 3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`.
+
+## Summary
+
+From the above example, we see that:
+
+1. Fluid enables the imperative programming paradigm by:
+ 1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and
+ 2. calling the `fluid.run` function that runs the program implicitly.
+1. The program is described as a `ProgramDesc` protobuf message.
+2. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter.
+3. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message.
+4. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution or create threads; instead, it runs on the current thread and executes intrinsics/operators' `Run` methods sequentially as they appear in the `Block.ops` array.
+5. Intrinsics/operators' `Run` method might create threads. For example, the `ListenAndDo` operator creates a thread to handle each incoming request.
+6. Threads are not necessarily OS threads; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by a ThreadPool. Multiple green threads might run on the same OS thread. An example of green threads is Go's [goroutines](https://tour.golang.org/concurrency/1).
diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md
new file mode 100644
index 0000000000000000000000000000000000000000..66d19f44baf861c7847e81ca83f61024ec877faf
--- /dev/null
+++ b/doc/fluid/design/concurrent/csp.md
@@ -0,0 +1,251 @@
+# Design Doc: CSP in PaddlePaddle Fluid
+
+## Motivation
+
+Concurrent programming is important for deep learning. A few example applications are:
+
+1. The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
+2. The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
+
+Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
+
+## Concurrent Programming Models
+
+There are many concurrent programming models, implemented in various forms:
+
+| concurrent programming model | implementation |
+|------------------------------|----------------|
+| mutex | types and functions in standard libraries |
+| semaphore | types and functions in standard libraries |
+| communicating sequential processes (CSP) | Go programming language |
+| actor model | Erlang programming language |
+| message passing | MPI |
+| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
+
+Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
+
+### CSP v.s. Actor Model
+
+A well-known implementation of Actor Model is the Erlang programming language. In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs. We can find the three ingredients, process with ID, send, and recv, in MPI too. Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code. Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv.
+
+## CSP in Fluid
+
+Fluid has two fundamental control-flows: *if-else* and *while*. If we are to implement CSP, we need the following:
+
+1. a new data type: *channel* and operators *send* and *recv*,
+1. *goroutine* or thread, and
+1. a new control-flow: select.
+
+We also need Python wrappers for the above components.
+
+The type *channel* is conceptually a blocking queue. In Go, it is implemented as a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
+
+The `select` operation has been in OS kernels since long before the Go language existed. All Unix kernels implement the system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call, *kqueue*. Go's Linux implementation uses epoll.
+
+It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.
+
+### Type Channel
+
+Fluid supports many data types:
+
+1. Tensor,
+1. Row-sparse Tensor,
+1. LoD Tensor,
+1. Tensor array, etc.
+
+Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value. To add the new type *channel*, we need to add a new enum value.
+
+To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file. [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example of how we expose the C++ class LoDTensor.
+
+## Syntax Design
+
+### Create Channel
+
+In Go, we create a channel by specifying the element type and buffer size:
+
+```go
+ch := make(chan int) // a channel without buffer
+ch1 := make(chan int, 100) // a channel that can buffer 100 ints.
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch = fluid.make_channel(dtype=INT)
+ch1 = fluid.make_channel(dtype=INT, capacity=100)
+```
+
+In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
+
+```python
+ch = fluid.make_channel(dtype=Tensor, etype=float16)
+```
+
+or Tensors of Tensors of float16 etc.
+
+The point here is that we need a consistent way to compose types, like in C++ we can have nested types such as `Tensor<Tensor<float16> >`.
+
+### Send and Recv
+
+Go's CSP implementation depends on data type *channel*. There are two types of channels:
+
+1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero-sized buffer. Sending to a buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty.
+1. The blocked channel, or unbuffered channel, is a blocking queue with no buffer. Both sending and receiving block until the other side is ready.
+
+There are four types of actions with a channel:
+
+1. Create a channel
+
+ ```go
+ ch := make(chan int) // this is an unbuffered channel
+ ch := make(chan int, 100) // this is a buffered channel of 100 ints.
+ ```
+
+1. Send
+
+ ```go
+ ch <- 111
+ ```
+
+1. Recv
+
+ ```go
+ y, ok := <-ch
+ ```
+
+1. Close
+
+ ```go
+ close(ch)
+ ```
+
+ Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
+
+There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
+
+1. A send to a nil channel blocks forever
+
+1. A receive from a nil channel blocks forever
+
+1. A send to a closed channel panics
+
+1. A receive from a closed channel returns the residual values and then zeros.
+
+In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h)
+
+The following program illustrates the Python syntax for accessing Fluid buffers.
+
+```python
+import fluid
+
+buffer_size = 10
+ch = fluid.make_channel(dtype=INT, capacity=buffer_size)
+
+# Now write buffer_size elements to the channel
+with fluid.while(steps=buffer_size):
+ fluid.send(ch, step)
+
+fluid.close_channel(ch)
+
+with fluid.while(steps=buffer_size):
+ fluid.print(fluid.recv(ch))
+```
+
+The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
+
+```python
+import fluid
+
+ch = fluid.make_channel(dtype=INT)
+
+with fluid.go():
+ fluid.send(ch)
+
+y = fluid.recv(ch)
+
+fluid.close_channel(ch)
+```
+
+### Select
+
+In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
+
+```go
+
+ch1 := make(chan int)
+ch2 := make(chan int, 100)
+
+x := 0
+
+for {
+    select {
+    case ch1 <- x:
+        x = x + 1
+    case y := <-ch2:
+        fmt.Println("Received on channel:", y)
+    default:
+        fmt.Println("Default")
+    }
+}
+
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch1 = fluid.make_channel(dtype=INT)
+ch2 = fluid.make_channel(dtype=INT, capacity=100)
+
+sel = fluid.select()
+
+with sel.case(ch1, 'w', X):
+ fluid.layers.increment(X)
+
+with sel.case(ch2, 'r', Y):
+ fluid.print("Received on Channel")
+
+with sel.default():
+ fluid.print("Default")
+
+```
+
+In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
+
+- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to write syntax in Python I/O.
+
+- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to read syntax in Python I/O.
+
+- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
+
+## Example Programs
+
+### 1. RPC between Trainers and Parameter Servers
+
+### 2. Concurrent Minibatch Loading
diff --git a/doc/fluid/design/concurrent/go_op.md b/doc/fluid/design/concurrent/go_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..c18b788e80f432ebb2f14b15229e7823c112001e
--- /dev/null
+++ b/doc/fluid/design/concurrent/go_op.md
@@ -0,0 +1,231 @@
+# go_op Design
+
+## Introduction
+
+The **go_op** allows users of PaddlePaddle to run program blocks on a detached
+thread. It works in conjunction with CSP operators (channel_send,
+channel_receive, channel_open, channel_close, and select) to allow users to
+concurrently process data and communicate easily between different threads.
+
+## How to use it
+
+```
+channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+with fluid.Go():
+ # Send a tensor of value 99 to "channel" on a detached thread
+ tensor = fill_constant(shape=[1], dtype='int', value=99)
+ tensor.stop_gradient = True
+ fluid.channel_send(channel, tensor)
+
+# Receive sent tensor from "channel" on the main thread
+result = fill_constant(shape=[1], dtype='int', value=-1)
+fluid.channel_recv(channel, result)
+```
+
+The go operator can be accessed by using the fluid.Go() control flow. This
+will create a new sub block, where the user can add additional operators
+to be run on the detached thread.
+
+**Note:** Since backpropagation is currently not supported in the go_op, users
+should ensure that the operators in the go block do not require gradient
+calculations.
+
+## How it Works
+
+Similar to other control blocks, go_op will create a sub block and add it
+as a child to the current block. Operators and variables defined in this
+block will be added to the go sub_block.
+
+In addition, the go operator will create a new child scope whose parent is
+the global scope. Please refer to [block captures](#block-captures) for more
+information.
+
+When Paddle executor runs go_op, go_op will take the sub_block and pass it to
+the executor.run method (along with a newly created local scope) on a detached
+thread.
+
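+As a rough illustration of this dispatch, the hypothetical sketch below (the `Executor` and `Scope` types here are simple stand-ins, not Paddle's real classes) shows a sub-block being handed to an executor on a detached `std::thread`; keeping the local scope alive for the lifetime of the thread is exactly the lifetime issue discussed under [block captures](#block-captures).
+
+```c++
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <thread>
+
+struct Scope {};  // stand-in for Paddle's Scope
+
+struct Executor {  // stand-in for Paddle's Executor
+  void Run(int block_id, Scope* scope) {
+    std::cout << "running sub_block " << block_id << std::endl;
+  }
+};
+
+// Launch the go_op's sub_block on a detached thread with its own local scope.
+void RunGoOp(Executor* exec, int sub_block_id) {
+  auto local_scope = std::make_shared<Scope>();  // child scope for the go block
+  // Capture the shared_ptr by value so the scope outlives the caller.
+  std::thread([exec, sub_block_id, local_scope] {
+    exec->Run(sub_block_id, local_scope.get());
+  }).detach();
+}
+
+int main() {
+  Executor exec;
+  RunGoOp(&exec, 1);
+  // Crude wait so the detached thread can run before this sketch exits.
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+}
+```
+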
+An example of the generated program description is shown below. Take note of
+the **go_op** in particular. It is added as an operator in the current
+block (in this example, block0). The **go_op** contains a `sub_block`
+attribute, which points to the id of the block that will be executed in a
+detached thread.
+
+```
+blocks {
+ idx: 0
+ parent_idx: -1
+ vars {
+ name: "return_value"
+ type {
+ type: LOD_TENSOR
+ lod_tensor {
+ tensor {
+ data_type: INT64
+ }
+ }
+ }
+ }
+ vars {
+ name: "status_recv"
+ type {
+ type: LOD_TENSOR
+ lod_tensor {
+ tensor {
+ data_type: BOOL
+ }
+ }
+ }
+ }
+ ...
+ ops {
+ outputs {
+ parameter: "Out"
+ arguments: "channel"
+ }
+ type: "channel_create"
+ attrs {
+ name: "data_type"
+ type: INT
+ i: 7
+ }
+ attrs {
+ name: "capacity"
+ type: INT
+ i: 0
+ }
+ }
+ ops {
+ inputs {
+ parameter: "X"
+ arguments: "channel"
+ }
+ type: "go"
+ attrs {
+ name: "sub_block"
+ type: BLOCK
+ block_idx: 1
+ }
+ }
+ ops {
+ inputs {
+ parameter: "Channel"
+ arguments: "channel"
+ }
+ outputs {
+ parameter: "Out"
+ arguments: "return_value"
+ }
+ outputs {
+ parameter: "Status"
+ arguments: "status_recv"
+ }
+ type: "channel_recv"
+ }
+ ...
+}
+
+blocks {
+ idx: 1
+ parent_idx: 0
+ vars {
+ name: "status"
+ type {
+ type: LOD_TENSOR
+ lod_tensor {
+ tensor {
+ data_type: BOOL
+ }
+ }
+ }
+ }
+ ...
+
+ ops {
+ outputs {
+ parameter: "Out"
+ arguments: "fill_constant_1.tmp_0"
+ }
+ type: "fill_constant"
+ attrs {
+ name: "force_cpu"
+ type: BOOLEAN
+ b: false
+ }
+ attrs {
+ name: "value"
+ type: FLOAT
+ f: 99.0
+ }
+ attrs {
+ name: "shape"
+ type: INTS
+ ints: 1
+ }
+ attrs {
+ name: "dtype"
+ type: INT
+ i: 3
+ }
+ }
+ ops {
+ inputs {
+ parameter: "Channel"
+ arguments: "channel"
+ }
+ inputs {
+ parameter: "X"
+ arguments: "fill_constant_1.tmp_0"
+ }
+ outputs {
+ parameter: "Status"
+ arguments: "status"
+ }
+ type: "channel_send"
+ attrs {
+ name: "copy"
+ type: BOOLEAN
+ b: false
+ }
+ }
+```
+
+## Current Limitations
+
+#### Scopes and block captures:
+
+Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a
+block. When a block is executed, a new local scope is created from the parent
+scope (i.e. the scope derived from the parent block) and associated with the new
+child block. After the block finishes executing, the local scope and
+all associated variables in the scope are deleted.
+
+This works well in a single threaded scenario, however with introduction of
+go_op, a child block may continue to execute even after the parent block has
+exited. If the go_op tries to access variables located in the parent block's
+scope, it may receive a segmentation fault because the parent scope may have
+been deleted.
+
+We need to implement block closures in order to prevent access to parent
+scope variables from causing a segmentation fault. As a temporary workaround,
+please ensure that all variables accessed in the go block are not destructed
+before they are accessed. Currently, the go_op will explicitly enforce
+this requirement and raise an exception if a variable could not be found in
+the scope.
+
+Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502)
+for more details.
+
+#### Green Threads
+
+Golang utilizes `green threads`, which is a mechanism for the runtime library to
+manage multiple threads (instead of having them managed natively by the OS). Green
+threads usually allow for faster thread creation and switching, as there is less
+overhead when spawning these threads. For the first version of CSP, we only support
+OS threads.
+
+
+#### Backward Propagation:
+
+go_op currently does not support backward propagation. Please use go_op with
+non-training operators.
diff --git a/doc/fluid/design/concurrent/images/channel_recv.png b/doc/fluid/design/concurrent/images/channel_recv.png
new file mode 100644
index 0000000000000000000000000000000000000000..c06cd15ae7b8a8c94d5742f6675e389081fcf789
Binary files /dev/null and b/doc/fluid/design/concurrent/images/channel_recv.png differ
diff --git a/doc/fluid/design/concurrent/images/channel_send.png b/doc/fluid/design/concurrent/images/channel_send.png
new file mode 100644
index 0000000000000000000000000000000000000000..006ebb4a5a4bcd32c97847e9fb7729a740255f7c
Binary files /dev/null and b/doc/fluid/design/concurrent/images/channel_send.png differ
diff --git a/doc/fluid/design/concurrent/images/select_op_workflow.png b/doc/fluid/design/concurrent/images/select_op_workflow.png
new file mode 100644
index 0000000000000000000000000000000000000000..719ed76f9d542d6c4f20c30f27656bb53325aa85
Binary files /dev/null and b/doc/fluid/design/concurrent/images/select_op_workflow.png differ
diff --git a/doc/fluid/design/concurrent/index_cn.rst b/doc/fluid/design/concurrent/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e47135e9fc42760898083710e0a6767252a0225b
--- /dev/null
+++ b/doc/fluid/design/concurrent/index_cn.rst
@@ -0,0 +1,8 @@
+并发编程
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ concurrent_programming.md
+ parallel_do.md
diff --git a/doc/fluid/design/concurrent/index_en.rst b/doc/fluid/design/concurrent/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0727e75798b2a869588f80d3cce7a886554e4ffb
--- /dev/null
+++ b/doc/fluid/design/concurrent/index_en.rst
@@ -0,0 +1,8 @@
+Concurrent Programming
+-------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ concurrent_programming.md
+ parallel_do.md
diff --git a/doc/fluid/design/concurrent/parallel_do.md b/doc/fluid/design/concurrent/parallel_do.md
new file mode 100644
index 0000000000000000000000000000000000000000..42bd136f825986d94fafaeaa5f58edb02848a74c
--- /dev/null
+++ b/doc/fluid/design/concurrent/parallel_do.md
@@ -0,0 +1,163 @@
+# Design Doc: Parallel_Do in PaddlePaddle
+
+In PaddlePaddle, we use the parallel_do primitive to represent multithreaded data-parallel processing.
+
+## Design overview
+
+The definition of a parallel_do op looks like the following
+
+```c++
+AddInput(kInputs, "Inputs needed to be split onto different devices").AsDuplicable();
+AddInput(kParameters, "Parameters are duplicated over different devices")
+ .AsDuplicable();
+AddInput(kPlaces, "Devices used for parallel processing");
+AddOutput(kOutputs, "Outputs needed to be merged from different devices").AsDuplicable();
+AddOutput(kParallelScopes,
+ "Scopes for all local variables in forward pass. One scope for each device");
+AddAttr(kParallelBlock,
+ "List of operaters to be executed in parallel");
+```
+
+A vanilla implementation of parallel_do can be shown as the following (`|` means single thread and
+`||||` means multiple threads)
+
+```
+In the forward pass
+ | Split input onto different devices
+ | Copy parameter onto different devices
+ |||| Compute forward pass in parallel
+ | Merge output from different devices
+
+In the backward pass
+ | Split output@grad onto different devices
+ |||| Compute backward pass in parallel
+ | accumulate param@grad from different devices to the first device
+ | Merge input@grad from different devices
+ | Copy param@grad to the place of parallel_do_op
+```
+
+This implementation allows us to write a mixed-device program like the following
+
+```python
+W1 = fluid.tensor(size=[100,20], parameter=true)
+W2 = fluid.tensor(size=[20,15], parameter=true)
+
+data = layers.data()
+
+gpu_places = layers.get_place(use_gpu=True)
+# parallel processing on multiple GPUs
+pd = ParallelDo(gpu_places)
+with pd.do(input=data):
+ prediction = softmax(fc(fc(data, W1), W2))
+ write_output(prediction)
+prediction = pd()
+loss = cross_entropy(prediction, label)
+```
+
+And the ProgramDesc looks like the following
+
+```
+# start_program will be run by executor(CPUPlace), all w1, w2 will be allocated on CPU
+start_program
+{
+ vars: w1, w2
+ ops: init(w1), init(w2)
+}
+
+main_program
+{
+block0 {
+ vars: data, places, w1, w2, w1_grad, w2_grad,
+ ops: data, get_place, parallel_do(block1),
+ parallel_do_grad(block2),
+ sgd(w2, w2_grad),
+ sgd(w1, w1_grad)
+}
+block1 { # the forward pass
+ parent_block: 0
+ vars: data, h1, h2, loss
+ ops: fc, fc, softmax
+}
+block2 { # the backward pass
+ parent_block: 1
+ vars: data_grad, h1_grad, h2_grad, loss_grad, local_w1_grad, local_w2_grad
+ ops: softmax_grad,
+ fc_grad
+ fc_grad
+}
+}
+```
+
+## Performance Improvement
+
+There are several places where we can make this parallel_do faster.
+
+### forward: split input onto different devices
+
+If the input of the parallel_do is independent of any prior operators, we can avoid this step by
+prefetching the input onto the different devices in a separate background thread. The Python code
+looks like this:
+```python
+pd = ParallelDo(gpu_places)
+with pd.do():
+ feature = get_data_from_prefetch_queue(gpu_places)
+ prediction = my_net(feature)
+ write_output(prediction)
+```
+
+### forward: Copy parameters onto different devices
+
+We can avoid this step by making each device have a copy of the parameter. This requires:
+
+1. `fluid.default_start_up_program()` to be run on all devices
+1. In the backward, allreduce param@grad at different devices, this requires
+ 1. `backward.py` add `allreduce` operators at parallel_do_grad
+ 1. `allreduce` operators need to be called in async mode to achieve maximum throughput
+1. apply gradient-related ops (i.e. clipping, normalization, decay, sgd) on different devices in parallel
+
+By doing so, we also avoid the "backward: accumulate param@grad from different devices to the first device" step.
+And the ProgramDesc looks like the following
+
+```
+# w1, w2 will be allocated on all GPUs
+start_program
+{
+block0 {
+ parallel_do(block1)
+}
+block1 {
+ parent_block: 0
+ vars: w1, w2
+ ops: init(w1), init(w2)
+}
+}
+
+main_program
+{
+block0 {
+ vars: data, places, w1, w2
+ ops: data, get_place, parallel_do(block1),
+ parallel_do_grad(block2), # append_backward
+ parallel_do(block3) # append_optimization
+
+}
+block1 {
+ parent_block: 0
+ vars: data, h1, h2, loss
+ ops: fc, fc, softmax
+}
+block2 {
+ parent_block: 1
+ vars: data_grad, h1_grad, h2_grad, loss_grad, w1_grad, w2_grad
+ ops: softmax_grad,
+ fc_grad, allreduce(places, scopes, w1_grad),
+ fc_grad, allreduce(places, scopes, w2_grad)
+}
+block3 {
+ parent_block: 0
+ vars: lr
+ ops: sgd(w2, w2_grad),
+ sgd(w1, w1_grad)
+}
+}
+```
diff --git a/doc/fluid/design/concurrent/select_op.md b/doc/fluid/design/concurrent/select_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..4fcae57cc7932cdaebe549486e7f7cebf0bd038a
--- /dev/null
+++ b/doc/fluid/design/concurrent/select_op.md
@@ -0,0 +1,265 @@
+# select_op Design
+
+## Introduction
+
+In golang, the [**select**](https://golang.org/ref/spec#Select_statements)
+statement lets a goroutine wait on multiple communication operations at the
+same time. The **select** blocks until one of its cases can run, then
+executes the case. If multiple cases are ready to run, then one case is
+chosen at random to be executed.
+
+With the introduction of CSP for Paddle, we mimic this behavior by
+creating a ***select_op***.
+
+## How to use it
+
+The **select_op** is available as a C++ operator. However, most users
+will prefer to use the much simpler Python API.
+
+- **fluid.Select()**: Creates a select operator and adds it to the current
+block within the main program. Also creates a sub block and adds it to the
+main program. This sub block is used to hold all variables and operators
+used by the case statements.
+
+Within the select block, users can add cases by
+calling **select.case** or **select.default** method.
+
+- **fluid.Select.case(channel_action, channel, result_variable)**: Represents
+a fluid channel send/recv case. This method creates a SelectCase block
+guard and adds it to the Select block. The arguments to this method tell
+the select which channel operation to listen to.
+
+- **fluid.Select.default()**: Represents the fluid default case. This default
+case is executed if none of the channel send/recv cases are available to
+execute.
+
+**Example:**
+```python
+ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
+
+while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
+while_op = While(cond=while_cond)
+
+with while_op.block():
+ with fluid.Select() as select:
+        with select.case(fluid.channel_send, ch1, x):
+ # Send x, then perform Fibonacci calculation on x and y
+ x_tmp = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+ assign(input=x, output=x_tmp)
+ assign(input=y, output=x)
+ assign(elementwise_add(x=x_tmp, y=y), output=y)
+        with select.case(fluid.channel_recv, quit_ch, result2):
+ # Exit out of While loop
+ while_false = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False)
+ helper = layer_helper.LayerHelper('assign')
+ helper.append_op(
+ type='assign',
+ inputs={'X': [while_false]},
+ outputs={'Out': [while_cond]})
+```
+
+## How it Works
+
+### Program Description
+
+```
+blocks {
+ idx: 0
+ ...
+ // Create "case_to_execute" variable
+ ops {
+ outputs {
+ parameter: "Out"
+ arguments: "fill_constant_110.tmp_0"
+ }
+ type: "fill_constant"
+ attrs {
+ name: "force_cpu"
+ type: BOOLEAN
+ b: false
+ }
+ attrs {
+ name: "value"
+ type: FLOAT
+ f: -1.0
+ }
+ attrs {
+ name: "shape"
+ type: INTS
+ ints: 1
+ }
+ attrs {
+ name: "dtype"
+ type: INT
+ i: 2
+ }
+ }
+ // Create "select" operator.
+ // inputs:
+ // X: All input variables used by operators within the select block
+ // case_to_execute: Variable filled in by select_op when it determines
+ // which case to execute.
+ //
+ // outputs:
+ // Out: All output variables referenced by operators within select block.
+ //
+ // attrs:
+ // sub_block: The block id containing the select "cases"
+ // cases: Serialized list of all cases in the select op.
+  //             Each case is serialized as: '<index>,<type>,<channel>,<value>'
+ // where type is 0 for default, 1 for send, and 2 for receive.
+ // No channel and values are needed for default cases.
+ ops {
+ inputs {
+ parameter: "X"
+ arguments: "fill_constant_103.tmp_0"
+ arguments: "fill_constant_104.tmp_0"
+ }
+ inputs {
+ parameter: "case_to_execute"
+ arguments: "fill_constant_110.tmp_0"
+ }
+ outputs {
+ parameter: "Out"
+ arguments: "fill_constant_110.tmp_0"
+ }
+ type: "select"
+ attrs {
+ name: "sub_block"
+ type: BLOCK
+ block_idx: 1
+ }
+ attrs {
+ name: "cases"
+ type: STRINGS
+ strings: "0,1,channel_101,fill_constant_109.tmp_0"
+ strings: "1,2,channel_102,fill_constant_108.tmp_0"
+ }
+ }
+ ...
+}
+```
+
+The Python select API will add the **select_op** to the current block. In addition, it will
+iterate through all its case statements and add any input variables required by the case statements
+into **X**. It will also create a temp variable called **case_to_execute**. This variable is
+filled in by the select_op after it has completed processing the case statements.
+
+If there are no available cases to execute (i.e. all cases are blocked on channel operations, and
+there is no default statement), then the select_op will block the current thread. The thread will
+unblock once there is a channel operation affecting one of the case statements, at which point, the
+**select_op** will set the **case_to_execute** variable to the index of the case to execute.
+
+Finally the select_op will call executor.run on the **sub_block**.
+
+```
+blocks {
+ idx: 1
+ parent_idx: 0
+ ...
+  // Fill a tensor with the case index (i.e. 0, 1, 2, 3, etc.)
+ ops {
+ outputs {
+ parameter: "Out"
+ arguments: "fill_constant_111.tmp_0"
+ }
+ type: "fill_constant"
+ attrs {
+ name: "force_cpu"
+ type: BOOLEAN
+ b: false
+ }
+ attrs {
+ name: "value"
+ type: FLOAT
+ f: 0.0
+ }
+ attrs {
+ name: "shape"
+ type: INTS
+ ints: 1
+ }
+ attrs {
+ name: "dtype"
+ type: INT
+ i: 2
+ }
+ }
+ // Create an "equal" operator to compare the case index with the "case_to_execute"
+ // tensor (which was filled in by the select op).
+ ops {
+ inputs {
+ parameter: "X"
+ arguments: "fill_constant_111.tmp_0" // case 0
+ }
+ inputs {
+ parameter: "Y"
+ arguments: "fill_constant_110.tmp_0" // case_to_execute
+ }
+ outputs {
+ parameter: "Out"
+ arguments: "equal_0.tmp_0"
+ }
+ type: "equal"
+ attrs {
+ name: "axis"
+ type: INT
+ i: -1
+ }
+ }
+ // Use the output of the "equal" operator as a condition for the "conditional_block".
+ // If the condition evaluates to true, then execute the "sub_block" (which represents
+ // the select case's body)
+ ops {
+ inputs {
+ parameter: "Params"
+ }
+ inputs {
+ parameter: "X"
+ arguments: "equal_0.tmp_0"
+ }
+ outputs {
+ parameter: "Out"
+ }
+ outputs {
+ parameter: "Scope"
+ arguments: "_generated_var_0"
+ }
+ type: "conditional_block"
+ attrs {
+ name: "is_scalar_condition"
+ type: BOOLEAN
+ b: true
+ }
+ attrs {
+ name: "sub_block"
+ type: BLOCK
+ block_idx: 4
+ }
+ }
+ ...
+  // Repeat the above operators for each case statement inside the select body
+}
+
+```
+
+Cases are represented by a **conditional_block operator**, whose condition is set to the output of
+equal(**case_to_execute**, **case_index**). Since each case index is unique in this sub-block,
+only one case will be executed.
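+
+The following is a minimal Python sketch (not the actual operator kernel) of the dispatch logic described above; `case_bodies` is a hypothetical list holding one callable per case body:
+
+```python
+# Each case in the sub-block compares its own index with the case_to_execute
+# value filled in by select_op, so exactly one body runs.
+def run_select_sub_block(case_to_execute, case_bodies):
+    for case_index, body in enumerate(case_bodies):
+        is_selected = (case_index == case_to_execute)  # the "equal" op
+        if is_selected:                                # the "conditional_block" op
+            body()
+```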
+
+### select_op flow
+
+
+
+
+
+The select algorithm is inspired by golang's select routine. Please refer to
+http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information.
+
+## Backward Pass
+
+TODO
diff --git a/doc/fluid/design/data_type/float16.md b/doc/fluid/design/data_type/float16.md
new file mode 100644
index 0000000000000000000000000000000000000000..844d2aafcf257b85057e1ac200ed3d5cf0be2ff0
--- /dev/null
+++ b/doc/fluid/design/data_type/float16.md
@@ -0,0 +1,183 @@
+# Design Doc: float16
+
+## Why float16
+Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of traditional 32-bit single precision format (float) and has lower precision and smaller range.
+
+When high precision computation is not required (which is usually the case at least in the deep learning inference stage), using float16 data type could potentially
+
+- reduce storage space, memory bandwidth, and power usages;
+- increase the chance of data fitting into a smaller cache of lower latency;
+- provide arithmetic speed up if supported by hardware.
+
+## Survey of current float16 support
+A brief survey of float16 support on different compilers, hardwares, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info.
+
+The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernels. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier.
+
+### Compiler
+- nvcc supports `__half` data type after CUDA 7.5.
+- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4.
+- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9.
+
+### Hardware
+- `__half` is supported on GPU with compute capability >= 5.3.
+- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above.
+- `__fp16` is supported as arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which was announced in May 2017. There seem to be no application processors currently available on the market that adopt this architecture. It is reported that Qualcomm Snapdragon 845 uses the Cortex-A75 design and will be available in mobile devices in early 2018).
+
+### Libraries
+- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
+- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
+
+### CUDA version issue
+There are currently three versions of CUDA that support the `__half` data type, namely CUDA 7.5, 8.0, and 9.0.
+CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
+```
+typedef struct __align__(2) {
+ unsigned short x;
+} __half;
+
+typedef __half half;
+```
+This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
+```
+__global__ void Add() {
+ half a, b, c;
+ c = __hadd(a, b); // correct
+ c = a + b; // compiler error: no operator "+" matches these operands
+}
+```
+CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
+
+Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
+```
+typedef struct __CUDA_ALIGN__(2) {
+ unsigned short x;
+} __half_raw;
+
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+ unsigned short __x;
+public:
+ // constructors and conversion operators from/to
+ // __half_raw and other built-in data types
+}
+
+typedef __half half;
+
+__device__ __forceinline__
+__half operator+(const __half &lh, const __half &rh) {
+ return __hadd(lh, rh);
+}
+
+// Other overloaded operators
+```
+This new design makes `c = a + b` work correctly for CUDA half data type.
+
+## Implementation
+The float16 class holds a 16-bit `uint16_t` data internally.
+```
+struct float16 {
+ uint16_t x;
+};
+```
+
+float16 supports the following features:
+ - constructors / assignment operators that take input from primitive data types including bool, integers of various length, float, and double.
+ - constructors / assignment operators that take input from `__half` on cuda, `float16_t` on ARM, and `Eigen::half` on Eigen.
+ - conversion operators to primitive data types and half precision data types on cuda, ARM and Eigen.
+ - overloaded arithmetic operators for cuda, arm, and non-arm cpu, respectively. These operators will take advantage of the cuda and ARM intrinsics on the corresponding hardware.
+
+To support the above features, two fundamental conversion functions are provided:
+```
+float16 float_to_half_rn(float f); // convert to half precision in round-to-nearest-even mode
+float half_to_float(float16 h);
+```
+which provide one-to-one conversion between float32 and float16. These two functions use different conversion routines based on the current hardware. CUDA/ARM intrinsics will be used when the corresponding hardware is available. If the hardware or compiler level does not support float32 to float16 conversion, software emulation will be performed to do the conversion.
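+
+As a hedged illustration (using numpy rather than the Fluid C++ class), the following shows the same round trip: numpy also casts float32 to float16 in round-to-nearest-even mode and stores the result in 16 bits.
+```Python
+import numpy
+
+f32 = numpy.array([0.1, 1.0, 65504.0], dtype=numpy.float32)
+f16 = f32.astype(numpy.float16)    # counterpart of float_to_half_rn
+bits = f16.view(numpy.uint16)      # the raw 16-bit payload, like float16.x
+back = f16.astype(numpy.float32)   # counterpart of half_to_float
+print(bits, back)
+```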
+
+## float16 inference
+In Fluid, a neural network is represented as a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), whose Python wrapper is a [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program). The basic structure of a program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program desc by executing the sequence of operators in the entrance block of the program one by one.
+
+### Operator level requirement
+Each operator has many kernels for different data types, devices, and library types. The operator will select the appropriate kernel to run based on, among other things, the data type of the input variables. By default, every Fluid operator has a float data type kernel that takes float variables as input and generates float output.
+
+This means that if we provide float input to the first operator in a program, then each operator will use the float kernel to compute a float output and send it as input to the next operator to trigger its float kernel. Overall, the program will run in float mode and give us a final output of float data type.
+
+The same principle applies if we want a program to run in float16 mode. We provide input variable of float16 data type to the first operator, and then one by one, each operator in the program will run the float16 kernel (provided that each operator in this program has float16 kernels registered) until we finally obtain a float16 output variable.
+
+So the preliminary requirement for float16 inference is to add float16 kernel to operators that are needed in a specific kind of program. For example, float16 inference on an image classification neural network like Vgg or Resnet, typically requires the following operators to have float16 kernels: convolution, pooling, multiplication, addition, batch norm, dropout, relu, and softmax. Please refer to [new_op_en](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) for details of how to add new kernels to an operator.
+
+### Variable level requirement
+Operators including convolution and multiplication (used in fully-connected layers) take as input not only the variables generated by the preceding operators but also [parameter](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#parameter) variables, which contain the trained weights to apply to the input data. These weights are obtained in the Fluid training process and are by default of float data type.
+
+When these operators are running in float16 mode, the float16 kernel requires those parameter variables to contain weights of Fluid float16 data type. Thus, we need a convenient way to convert the original float weights to float16 weights.
+
+In Fluid, we use a tensor to hold the actual data of a variable on the C++ end. [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h) is used to bind C++ tensors of a certain data type with numpy arrays of the corresponding numpy data type on the Python end. Each common C++ built-in data type has a corresponding numpy data type of the same name. However, since there is no built-in float16 type in C++, we cannot directly bind the numpy float16 data type with the Fluid float16 class. Since both Fluid float16 and numpy float16 use uint16 as the internal data storage type, we use the C++ built-in type `uint16_t` and the corresponding numpy uint16 data type to bridge the gap via [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h).
+
+The following code demonstrates how to do the tensor conversion.
+```Python
+# var is the variable of float weights
+# tensor is a numpy array of data copied from the tensor data in var
+# fp16_var is the variable that will contain float16 weights converted from var
+tensor = numpy.array(var.get_tensor())
+fp16_tensor = fp16_var.get_tensor()
+
+# After the original tensor data is converted to numpy float16 data type,
+# view(numpy.uint16) is used so that the internal memory of the numpy array
+# will be reinterpreted to be of uint16 data type, which is bound to
+# Fluid float16 class via pybind with the help of uint16_t built-in c++ type
+fp16_tensor.set(tensor.astype(numpy.float16).view(numpy.uint16), GPUPlace)
+```
+
+### Consistent API requirement
+The basic inference in float16 mode requires users to feed input and obtain output both of float16 data type. However, in this way, the inference APIs are not consistent between float16 mode and float mode, and users may find it confusing and difficult to use float16 inference since they need to do extra steps to provide float16 input data and convert float16 output data back to float. To have a consistent API for different inference modes, we need to transpile the program desc in some way so that we can run float16 inference by feeding and fetching variables of float data type.
+
+This problem can be solved by introducing a type-casting operator which takes an input variable of a certain data type, casts it to another specified data type, and puts the cast data into the output variable. Inserting cast operators where needed can make a program internally run in float16 mode.
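+
+Below is a hedged sketch of this idea using `fluid.layers.cast` as the type-casting operator; the variable names are illustrative, and whether the intermediate op runs in float16 depends on the float16 kernel being registered as described above.
+```Python
+import paddle.fluid as fluid
+
+# 'data' stands in for any float32 input variable of the program.
+data = fluid.layers.data(name="data", shape=[32], dtype="float32")
+# Cast to float16 so the next op selects its float16 kernel (if registered).
+data_fp16 = fluid.layers.cast(x=data, dtype="float16")
+hidden_fp16 = fluid.layers.fc(input=data_fp16, size=128)
+# Cast the float16 result back to float32 before returning it to the user.
+hidden = fluid.layers.cast(x=hidden_fp16, dtype="float32")
+```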
+
+### float16 transpiler
+With all the above requirements in mind, we designed a float16 inference transpiler that can transpile a float32 mode inference program desc into a float16 mode one.
+
+Given a float inference program and the corresponding variables of float32 weights in the [scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md),
+this transpiler mainly does the following modifications:
+
+1. Insert cast operators at the beginning of the program so that the input float data will be converted to float16 data type before feeding to subsequent operators to invoke the float16 kernel.
+
+2. Insert cast operators at the end of the program so that the output float16 data will be converted back to float data type before users obtain the result.
+
+3. For each parameter variable of float weights, create in the scope a corresponding variable of float16 weights which are converted from the corresponding float weights and add this new float16 variable to the program.
+
+4. Update the operator information in the program so that each relevant operator uses the newly created float16 variable instead of its float counterpart.
+
+Below is an example of usage:
+```Python
+# Get the float inference program
+[float_inference_program, feed_target_names,
+ fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+# Prepare the float input data
+tensor_img = numpy.random.rand(1, 3, 32, 32).astype(numpy.float32)
+
+# Running inference_program in float mode
+float_results = exe.run(float_inference_program,
+ feed={feed_target_names[0]: tensor_img},
+ fetch_list=fetch_targets)
+
+# Use float16 transpiler to speedup
+float16_inference_program = float_inference_program.clone()
+t = fluid.InferenceTranspiler()
+t.float16_transpile(float16_inference_program, GPUPlace)
+
+# Running
+float16_results = exe.run(float16_inference_program,
+ feed={feed_target_names[0]: tensor_img},
+ fetch_list=fetch_targets)
+```
+
+As we can see from the example above, users can simply use the `float16_transpile` method provided by the inference transpiler class on an existing float inference program to run inference in float16 mode.
+
+### Speedup on GPU
+Currently, Fluid inference in float16 mode is only supported on Nvidia GPU devices. There is no motivation to support float16 inference on non-ARM CPUs because float16 is not natively supported there and float16 calculation will only be slower than its float counterpart.
+
+Nvidia started to support its native float16 data type (which has the same internal memory representation as the Fluid float16 class) on CUDA 7.5. Moreover, float16 speedups on common computationally intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cublas 7.5 and cuDNN 5.0.
+
+Recently, the introduction of [tensor core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in volta architecture GPUs and the support of tensor core calculation in CUDA 9.0 and cuDNN 7.0 make float16 truly superior to float in certain deep learning applications. Please refer to this [benchmark report](https://github.com/kexinzhao/Paddle_benchmark/blob/master/float16_benchmark.md) for more details.
diff --git a/doc/fluid/design/data_type/index_cn.rst b/doc/fluid/design/data_type/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b60167b6b1599df69dfc5073ebf32bdbb0a316ec
--- /dev/null
+++ b/doc/fluid/design/data_type/index_cn.rst
@@ -0,0 +1,7 @@
+数据类型
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ float16.md
diff --git a/doc/fluid/design/data_type/index_en.rst b/doc/fluid/design/data_type/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6a88d17943f49134a2d00363845e919537ff4545
--- /dev/null
+++ b/doc/fluid/design/data_type/index_en.rst
@@ -0,0 +1,7 @@
+Data Type
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ float16.md
diff --git a/doc/fluid/design/dist_train/README.md b/doc/fluid/design/dist_train/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2dd652d8bdcb8f3b6e759347bd55b217be909386
--- /dev/null
+++ b/doc/fluid/design/dist_train/README.md
@@ -0,0 +1,57 @@
+## Distributed training overview doc
+
+Currently Paddle Fluid uses the parameter server architecture to support distributed training.
+
+For synchronous and asynchronous training, the differences are mostly in the logic of the parameter server. We already support synchronous training.
+
+### Synchronous training
+
+The training process of synchronous training is:
+
+
+
+1. Pserver
+    1. Set `barrier_condition_` to 0 and wait for trainers to send gradients.
+1. Trainer
+    1. The trainer reads a minibatch of data and runs forward-backward with its local parameter copy to get the gradients for the parameters.
+    1. The trainer uses the split op to split all the gradients into blocks. The split method is determined at compile time.
+    1. The trainer uses send_op to send all the split gradients to the corresponding parameter servers.
+    1. After the trainer sends all the gradients, it sends a `BATCH_BARRIER_MESSAGE` to all pservers.
+    1. The trainer calls GetVariable on the pservers and waits for `barrier_condition_` on each pserver to become 1.
+1. Pserver
+    1. The pserver counts the number of `BATCH_BARRIER_MESSAGE`s it receives (see the sketch after this list).
+    1. When the count equals the number of trainers, the pserver knows it has received all gradients from all trainers.
+    1. The pserver runs the optimization block to optimize the parameters.
+    1. After optimization, the pserver sets `barrier_condition_` to 1.
+    1. The pserver waits for `FETCH_BARRIER_MESSAGE`.
+1. Trainer
+    1. The trainer uses GetVariable to get all the parameters from the pservers.
+    1. The trainer sends a `FETCH_BARRIER_MESSAGE` to each pserver.
+1. Pserver
+    1. When the number of `FETCH_BARRIER_MESSAGE`s reaches the number of trainers, the pserver knows all parameters have been fetched, and it goes back to step 1 to set `barrier_condition_` to 0.
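+
+Below is a minimal, hypothetical threading-based sketch of the pserver-side barrier logic from the steps above; `run_optimize_block` stands in for executing the optimization block.
+
+```python
+import threading
+
+class PServerBarrier(object):
+    def __init__(self, num_trainers):
+        self.num_trainers = num_trainers
+        self.batch_barrier_count = 0
+        self.barrier_condition = 0
+        self.cond = threading.Condition()
+
+    def on_batch_barrier_message(self, run_optimize_block):
+        # Called once per BATCH_BARRIER_MESSAGE received from a trainer.
+        with self.cond:
+            self.batch_barrier_count += 1
+            if self.batch_barrier_count == self.num_trainers:
+                run_optimize_block()        # all gradients received, optimize
+                self.barrier_condition = 1  # let GetVariable calls proceed
+                self.cond.notify_all()
+
+    def wait_on_get_variable(self):
+        # Trainers' GetVariable calls block here until optimization is done.
+        with self.cond:
+            while self.barrier_condition != 1:
+                self.cond.wait()
+```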
+
+### Asynchronous training
+In the above process, there are two barriers for all trainers to synchronize with each other. In asynchronous training, these two barriers are not needed; a trainer can just send gradients to the pservers and then get the parameters back.
+
+The training process of asynchronous training can be:
+
+
+
+1. Pserver:
+    1. Each parameter has a queue to receive its gradient from trainers.
+    1. Each parameter has a thread to read data from the queue and run the optimize block, using the gradient to optimize the parameter.
+    1. Use an independent thread to handle the RPC call `GetVariable` for trainers to get the parameters back. (Maybe we should use a thread pool here to speed up fetching the parameters.)
+
+1. Trainer:
+    1. The trainer reads a batch of data and runs forward and backward with its local parameter copy to get the gradients for the parameters.
+    1. The trainer splits all gradients into blocks and then sends these gradient blocks to the pservers (each pserver will put them into its queue).
+    1. The trainer gets all parameters back from the pservers.
+
+### Note:
+There are also some conditions that need to be considered. For example:
+
+1. Whether the trainer needs to wait for the pserver to apply its gradients before getting the parameters back.
+1. Whether we need a lock between parameter update and parameter fetch.
+1. Whether one parameter must live on a single server, or whether it can be split and sent to multiple parameter servers.
+
+The above asynchronous training architecture can support different modes; we can test these options in detail in the future.
diff --git a/doc/fluid/design/dist_train/async_update.md b/doc/fluid/design/dist_train/async_update.md
new file mode 100644
index 0000000000000000000000000000000000000000..248d2ec18dafdecac9184527638754b6ba4d85b8
--- /dev/null
+++ b/doc/fluid/design/dist_train/async_update.md
@@ -0,0 +1,61 @@
+# Design Doc: Asynchronous Update With Distributed Training
+
+## Background
+
+For the typical synchronous distributed training, some significant steps are as follows:
+
+1. A trainer process will compute the gradients and **send** them to the parameter server (PS) nodes.
+1. After a PS node receives the gradients from all the trainers, it aggregates the
+gradient variables for the same parameter into one gradient variable, then applies the aggregated
+gradient to the respective parameter, and finally uses an optimization algorithm (SGD, Momentum, ...)
+to update the parameters.
+1. The trainers wait for the PS to finish the optimization stage and then GET the parameters from the PS,
+so all the trainers end up with the same parameters.
+
+In Synchronous Distributed Training, there is a **barrier** on each PS to wait until all trainer processes
+have completed running the current mini-batch. After that, all trainers can continue to run the next
+mini-batch. So, we can find that the overall performance of Synchronous Distributed Training depends
+on the slowest node.
+
+In Asynchronous Distributed Training, we don't need to wait for a global mini-batch; the optimizer on
+the PS runs immediately when a gradient is uploaded to the PS from one trainer. This mode scales
+better and achieves higher throughput. In this design doc, we will introduce how to
+implement Asynchronous Distributed Training based on PaddlePaddle Fluid.
+
+## Design
+
+
+
+As shown in the figure above, we describe a global view of the asynchronous update process and use
+the parameter `w1` as an example to introduce the steps:
+1. Each gradient variable may be distributed on different GPU cards; aggregate
+them once they are all calculated.
+1. Split the gradient variable into multiple blocks according to the number of PS
+instances and then send them.
+1. The PS runs an `Optimize Block` using a specified optimization algorithm to update
+the specified parameter.
+1. The trainer will fetch the latest parameter from the PS before running the forward Op which depends
+on the specified parameter.
+1. Broadcast the received variable into multiple GPU cards and continue to run the next
+mini-batch.
+
+### Trainer
+
+- For multi-device distributed training, we first need to aggregate the gradient
+variables placed on different devices and then schedule a `SendVars` Operator to
+send the gradient variables to the multiple PS instances.
+- Schedule a `FetchVars` operator to fetch the latest parameters from the PS before running
+the forward ops.
+- There could be a large number of gradient variables to be sent, so we need to use another
+thread pool (IO Threadpool) whose number of schedulable threads is larger than that of the
+computing thread pool, to avoid competing with computation for thread resources.
+
+### Parameter Server
+
+
+
+- Multiple trainer instances may want to optimize the same parameter at
+the same time; to avoid races, we need one `BlockingQueue` for each gradient
+variable to process the updates one by one (see the sketch below).
+- We need a `Map` structure to map a gradient variable name to the `OptimizeBlock` which
+can optimize the respective parameter.
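+
+A minimal sketch of this PS-side routing is shown below; `apply_optimize` is a hypothetical stand-in for running the `OptimizeBlock` of a parameter, and the gradient names are illustrative.
+
+```python
+import queue
+import threading
+
+def apply_optimize(param_name, grad):
+    # placeholder for running the OptimizeBlock (e.g. an SGD update) on the PS
+    pass
+
+# one BlockingQueue per gradient variable serializes concurrent updates
+grad_queues = {"w1@GRAD": queue.Queue(), "w2@GRAD": queue.Queue()}
+# the Map from gradient variable name to its OptimizeBlock
+optimize_block = {
+    "w1@GRAD": lambda grad: apply_optimize("w1", grad),
+    "w2@GRAD": lambda grad: apply_optimize("w2", grad),
+}
+
+def optimize_loop(grad_name):
+    while True:
+        grad = grad_queues[grad_name].get()  # blocks until a trainer sends one
+        optimize_block[grad_name](grad)      # run the OptimizeBlock
+
+for name in grad_queues:
+    worker = threading.Thread(target=optimize_loop, args=(name,))
+    worker.daemon = True
+    worker.start()
+```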
diff --git a/doc/fluid/design/dist_train/dist_train_nccl2.md b/doc/fluid/design/dist_train/dist_train_nccl2.md
new file mode 100644
index 0000000000000000000000000000000000000000..b8b8427811cddcddf872db5badfd37c96a76c3e3
--- /dev/null
+++ b/doc/fluid/design/dist_train/dist_train_nccl2.md
@@ -0,0 +1,35 @@
+# Distributed Training with NCCL2
+
+We design a pattern that can enable training with `ParallelExecutor` and
+use [NCCL2](https://developer.nvidia.com/nccl) as its collective
+communication library.
+
+In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast`
+to do multi GPU training. And if we initialize NCCL2 communicators as
+ranks in a distributed environment, we can simply run the `ParallelExecutor`
+as a distributed program! The only thing that may be different from
+the single node version is that we need to broadcast the NCCL unique ID
+to all the nodes and initialize communicators using that ID, so the NCCL2
+ranks can discover each other.
+
+To achieve this feature, we introduce a new operator: `gen_nccl_id` op,
+so we are ***not*** "bound to" running NCCL2 with MPI; we can run it on
+whatever platform you like.
+
+It has two running modes:
+
+1. Generate and broadcast mode, which should be used on trainer 0;
+1. Listen and fetch mode, which should be used on trainers other than 0.
+
+In both modes, this op saves the NCCL ID into the current scope as a
+persistable variable. Then we can insert this op at the end of
+the "startup program" of fluid, so that all workers can get the same ID to
+initialize NCCL communicator objects.
+
+
+
+The above figure shows the general process when training with NCCL2 in
+distributed mode. Each trainer has a number of communicators equal to its
+number of GPUs, but the ranks should match the global rank numbering: here
+we have 8 GPUs in total, so `nranks==8`; the ranks should
+be 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1.
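+
+A minimal sketch of this rank assignment (under the assumption of a fixed, equal GPU count per trainer) is:
+
+```python
+# global rank = trainer_id * gpus_per_trainer + local_gpu_id
+def global_rank(trainer_id, gpus_per_trainer, local_gpu_id):
+    return trainer_id * gpus_per_trainer + local_gpu_id
+
+nranks = 2 * 4  # 2 trainers x 4 GPUs each
+assert [global_rank(0, 4, g) for g in range(4)] == [0, 1, 2, 3]
+assert [global_rank(1, 4, g) for g in range(4)] == [4, 5, 6, 7]
+```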
diff --git a/doc/fluid/design/dist_train/distributed_architecture.md b/doc/fluid/design/dist_train/distributed_architecture.md
new file mode 100644
index 0000000000000000000000000000000000000000..371bbeebf7559eccc77ba0eea4f6f87a1bc5b54a
--- /dev/null
+++ b/doc/fluid/design/dist_train/distributed_architecture.md
@@ -0,0 +1,197 @@
+# Design Doc: Fluid Distributed Training Architecture
+
+## Abstract
+
+PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has a few limitations:
+
+1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc.
+
+2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers.
+
+3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
+
+This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
+
+## Analysis
+
+The assumption is that the user writes the trainer program in either Python or C++.
+
+### Limitation 1
+
+There are two basic functionalities in the trainer program:
+
+1. The training logic such as loading / saving the model and printing out the logs.
+2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
+ optimizer.
+
+When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both the
+training logic and the neural network computation logic are replicated.
+
+The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
+replicate the training logic, the limitation mentioned above can be avoided.
+
+### Limitation 2
+
+Model parallelism means that a single model is partitioned into different components and each node runs one of the components separately. This comes at the extra cost of managing the
+inter-model-shard communication between nodes.
+
+PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
+computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
+
+Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
+
+
+
+PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
+
+
+
+The IR for PaddlePaddle after refactoring is called a `Block`; it specifies the computation dependency graph and the variables used in the computation.
+
+### Limitation 3
+
+The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
+
+This could be fixed by making the parameter server also run an IR, which can be different from the trainer side.
+For a detailed explanation, refer to this document:
+[Design Doc: Parameter Server](./parameter_server.md)
+
+## Distributed Training Architecture
+
+The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
+
+
+
+The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.
+
+### Python API
+
+Python API is the Python library that user's Python code invokes, to read the data, build the neural network topology, and start training, etc.
+
+```Python
+images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+...
+predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+optimizer = fluid.optimizer.Adam(learning_rate=0.01)
+optimizer.minimize(avg_cost)
+
+train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.mnist.train(), buf_size=500),
+ batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+for pass_id in range(10):
+ for data in train_reader():
+ loss, acc = exe.run(trainer_prog,
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost])
+```
+
+The code above is a typical local training program; the "Training Program" is built using helper functions such as
+`fluid.layers.fc`. The training is done by calling `Executor.run`
+iteratively.
+
+For more details, the implementation of IR is [Program](../program.md), and `ProgramDesc` is the protobuf type.
+
+[Executor](../executor.md) simply runs the `ProgramDesc`. For local training you generally use
+`Executor` to run the program locally. For any kind of distributed training, you can use
+`RemoteExecutor` to specify desired distributed training method with some optional arguments.
+
+### Distributed Transpiler
+
+The Distributed Transpiler automatically converts the IR (in protobuf format) to partitioned IRs. Then
+the Remote Executor dispatches the new IRs to Remote Executors across the cluster.
+Below are the steps that are followed :
+
+1. The user only needs to change `Executor` to `RemoteExecutor` to change a local program into a distributed program.
+1. `RemoteExecutor` calls `Distributed Transpiler` to "transpile" the user's program to several IRs representing a
+   distributed training program:
+    1. Parse configurations from `RemoteExecutor`.
+    1. Determine the type of distributed program, which can be DataParallelism, ModelParallelism or Streaming.
+    1. Partition the `ProgramDesc` according to the type and add `send` / `recv` OP pairs on the boundaries. Take the
+       DataParallelism type for example: it removes the optimization operators, adds a `send` OP to the
+       "trainer" role, and then adds the optimization operators to the parameter server role within the `recv` OP.
+1. Dispatch the partitioned graph to the different `RemoteExecutor`s in the cluster.
+1. The `RemoteExecutor` on each node runs the received `ProgramDesc` until the end (see the sketch below).
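+
+As a hypothetical illustration (not the real Distributed Transpiler implementation) of the DataParallelism partitioning rule in step 3, the following sketch moves optimization ops to the pserver program, keeps everything else in the trainer program, and inserts a send/recv pair on the boundary; the op names are placeholders.
+
+```python
+OPTIMIZE_OPS = {"sgd", "adam", "momentum"}
+
+def partition(ops):
+    trainer_ops, pserver_ops = [], []
+    for op in ops:
+        (pserver_ops if op in OPTIMIZE_OPS else trainer_ops).append(op)
+    trainer_ops.append("send")      # trainer sends gradients
+    pserver_ops.insert(0, "recv")   # pserver receives them before optimizing
+    return trainer_ops, pserver_ops
+
+trainer, pserver = partition(["fc", "softmax", "sgd"])
+# trainer == ['fc', 'softmax', 'send'], pserver == ['recv', 'sgd']
+```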
+
+
+### RemoteExecutor
+
+As shown in the graph, `RemoteExecutor.run` sends the IR to the cluster for Execution.
+You can also use parameter `fetch_list` to interactively fetch variable back to local for
+log printing.
+
+The Python `RemoteExecutor` is derived from `Executor` class.
+
+```python
+exe = RemoteExecutor(
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost],
+ job_desc=JobDesc(
+ jobname,
+ num_trainer,
+ num_pserver,
+ cpu_per_trainer,
+ gpu_per_trainer,
+ mem_per_trainer,
+ cpu_per_pserver,
+ mem_per_pserver
+ ))
+for data in train_reader():
+ loss, acc = exe.run(trainer_prog,
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost])
+```
+
+The `JobDesc` object describes the distributed job resource specification to run in the
+cluster environment.
+
+
+
+`RemoteExecutor.run` sends the `ProgramDesc` and
+[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
+to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
+for starting the final Kubernetes Jobs to run the different roles of the `ProgramDesc` from a `ConfigMap`.
+
+
+### Placement Algorithm
+
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
+
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
+
+
+### Local Training Architecture
+
+The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
+
+
+
+
+### Training Data
+
+In PaddlePaddle v0.10.0, training data is typically read
+with a [data reader](./README.md) from Python. This approach is
+no longer efficient when training in a distributed fashion, since the Python
+process no longer runs on the same node as the trainer processes:
+the Python reader would need to read from the distributed filesystem
+(assuming it has access) and send the data to the trainers, doubling the
+network traffic.
+
+When doing distributed training, the user can still use the Python data
+reader: the training data are sent with `Executor.run`. However, this should
+be used for debugging purposes only. Users are encouraged to use
+the data-reading OPs.
+
+
+## References:
+
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
+
+[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf)
diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..e284e1ec5cdd18d0049ce3c1a8349bbe1248cb48
--- /dev/null
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@@ -0,0 +1,89 @@
+# Design Doc: Distributed Lookup Table Operator
+
+A distributed lookup table operator in PaddlePaddle where the table could be larger than
+the memory of a single computer.
+
+## Background
+
+A lookup table operator is well-used in deep learning for learning the
+representation, or the
+[*embedding*](http://www.cs.toronto.edu/~fritz/absps/ieee-lre.pdf), of
+symbols.
+
+### The Forward Algorithm
+
+The forward algorithm of the lookup table is a multiplication of the
+input vector x and the lookup table matrix W:
+
+$$y = x * W$$
+
+When x is a sparse vector of symbols, the above multiplication
+simplifies into looking up rows in W that correspond to symbols in x,
+denoted by W(x). Please be aware that W could be huge and out of the
+memory, so we'd need a distributed storage service, which supports the
+lookup of rows.
+
+The following figure illustrates the multiplication of x with two
+non-zero elements, or say two symbols, and a lookup table W:
+
+
+
+### The Backward Algorithm
+
+The backward algorithm computes W'(x) using W(x). W'(x) has the same
+size as W(x) and is much smaller than W.
+
+To optimize W given W', we can do simple SGD update:
+
+$$W = W - \lambda * W'$$
+
+or some more sophisticated algorithms that rely on both W' and W:
+
+$$W = f(W, W')$$
+
+The following figure illustrates the backward pass of the lookup
+operator: 
+
+## Distributed Lookup Table
+### Problem 1: The lookup table may be very large.
+
+ In scenarios like search engines and recommendation systems, the number of feature Ids may be very large, say 100,000,000,000; then for a lookup table with 8 float values per Id, the total size of the table is:
+
+ ```
+ 100,000,000,000 * 8 * 4(Bytes) = 2980.23 GB
+ ```
+
+### Solution: Distributed storage
+
+1. Paddle uses [SelectedRows](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/selected_rows.md) as the storage format for the lookup table. The lookup table parameter will be split across multiple machines according to the hash of the feature ID, and the data will also be split and sent to the corresponding machine to prefetch the parameter.
+
+1. For common parameters, the trainer fetches the whole parameter for training, but for the big lookup table, the trainer can not store the whole parameter. Because the input data features are very sparse, each step only needs a few rows of the parameter for training, so we use `prefetch_op` to prefetch only the rows needed by the trainer.
+
+### Problem 2: The Ids in the lookup table are not known before training.
+
+ The feature Id is calculated by a hash function. Because the feature data source is so large, we can not get all the Ids before training, so we can not initialize the table before training.
+
+### Solution: Id auto growth
+
+At the beginning of training, paddle only allocates the memory for the lookup table on the parameter server side; the Ids and their values are not initialized. During training, when a parameter server receives an Id, if it is already in the lookup table, it returns the existing parameter; if the Id does not exist, paddle adds it into the lookup table and initializes the value for it.
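+
+A minimal sketch of this auto-growth behavior is shown below; the embedding width and initializer are illustrative assumptions.
+
+```python
+import numpy
+
+EMBEDDING_DIM = 8   # assumed width of one lookup table row
+table = {}          # Id -> row, lives on one parameter server shard
+
+def lookup(feature_id):
+    if feature_id not in table:
+        # first time this Id is seen: create and initialize its row
+        table[feature_id] = numpy.random.normal(scale=0.01, size=EMBEDDING_DIM)
+    return table[feature_id]
+```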
+
+### Problem 3: parameter load and save
+
+For common parameters, paddle uses the trainer to save and load them. But for the distributed lookup table, the trainer cannot do this because of its large size.
+
+### Solution: Parameter server side save and load
+
+Paddle supports parameter-server-side save and load for the distributed lookup table. Each parameter server machine will only save and load its part of the whole table.
+
+## Architecture
+The whole architecture of the distributed lookup table is as below:
+
+### Training steps:
+1. Read a batch of data; the data consists of feature ids.
+1. The input ids are split by `split_ids_op` using the same hash function as the lookup table (see the sketch after this list).
+1. The `prefetch_op` uses the split result to prefetch parameters back from the lookup table.
+1. Run forward-backward to get the gradient of the lookup table.
+1. `split_ids_op` splits the gradient, which is then sent to the parameter servers with `send_op`.
+1. The parameter servers update the table with the received gradients.
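+
+A minimal sketch of the hash-based sharding used by `split_ids_op` (assuming simple modulo hashing over the parameter server count) is:
+
+```python
+def split_ids(ids, num_pservers):
+    # shard k collects the ids whose hash maps to parameter server k
+    shards = [[] for _ in range(num_pservers)]
+    for feature_id in ids:
+        shards[feature_id % num_pservers].append(feature_id)
+    return shards
+
+# ids in shards[k] are prefetched from / sent to parameter server k only
+shards = split_ids([7, 12, 25, 31], num_pservers=2)  # [[12], [7, 25, 31]]
+```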
+
+
diff --git a/doc/fluid/design/dist_train/distributed_traing_review.md b/doc/fluid/design/dist_train/distributed_traing_review.md
new file mode 100644
index 0000000000000000000000000000000000000000..c09b7c99159ace9b3df989f803ede20bc3585d92
--- /dev/null
+++ b/doc/fluid/design/dist_train/distributed_traing_review.md
@@ -0,0 +1,44 @@
+# Parallelism, Asynchronous, Synchronous, Codistillation
+
+
+For valuable models, it’s worth using more hardware resources to reduce the training time and improve the final model quality. This doc discusses various solutions, their empirical results, and some of the latest research.
+
+# Model Parallelism
+In some situations, larger and more complex models can improve the model quality. Sometimes, such models cannot fit in one device. Sometimes, parts of the model can be executed in parallel to improve speed. Model Parallelism addresses these issues by partitioning a single model and placing the shards on several devices for execution.
+
+A common form of model parallelism is to partition the logic of “gradient application” onto parameter servers, while leaving the forward and backward computation on training servers.
+
+More flexible model parallelism is challenging. For example, a multi-level, single-direction LSTM can be partitioned by layers, while such a solution is not helpful for a bi-directional LSTM. Different models can have quite different ways of partitioning and the benefits also depend on the underlying hardware. The framework needs to provide flexible APIs for users to define a customized partition scheme. For example, in TensorFlow, users can use tf.device() to specify the device placement. In MXNet, mx.AttrScope(ctx_group='dev1') does similar things. Recent research proposes to automatically find the optimal partition scheme with Reinforcement Learning, which is essentially a solution-space search algorithm that could cost a lot of extra hardware resources.
+
+# Data Parallelism
+Data Parallelism runs the same model on multiple devices, each taking in a partition of the input batch. It’s more commonly used for a few reasons. It generally applies to common SGD mini-batch training. Compared with model parallelism, which requires users to carefully partition their model and tune for good performance, data parallelism usually involves no more than calling an extra API and its speedup is more predictable.
+
+# Asynchronous Training
+Asynchronous training usually involves a set of trainers and a set of parameter servers. The parameter servers collectively hold a single copy of the shared parameters, while each trainer holds a unique copy of the model and trains it independently. Each trainer pulls parameters from the parameter servers and sends gradients to them independently. Similarly, the parameter servers apply the gradients to the parameters as soon as the gradients are received and send parameters whenever they are requested.
+
+In theory, asynchronous training is neither safe nor stable. Each trainer is very likely using a stale copy of the parameters, and the parameters are also likely to be updated with stale gradients. However, in practice, especially for large-scale nonconvex optimization, it is effective [1]. Compared with the synchronous solution, which will be discussed later, asynchronous distributed training is easier to implement and scales to a few dozen workers without losing much performance due to network communication or other overhead. Besides, asynchronous training can make progress even in case of random trainer failures in the cluster.
+
+Many production models, such as [3], are trained with distributed asynchronous solutions due to their scalability and effectiveness in practice. However, asynchronous training has its limitations. Usually, it’s not as stable as synchronous training. A warm-up phase is sometimes needed. The learning rate is usually smaller compared with synchronous training, and decay is also often needed. Normally, asynchronous training doesn’t scale beyond 100 trainers. In other words, adding more trainers beyond that does not make the model converge faster.
+
+# Synchronous Training
+Unlike asynchronous training, synchronous training requires step barriers. Parameter servers need to wait for gradients from all trainers before the gradients are applied to the parameters, and trainers always pull the latest parameters.
+
+An obvious advantage of synchronous training is that the behavior is more clearly defined. Usually, it's more stable than asynchronous training. Learning rate can be set larger and for some vision tasks, the final accuracy can be slightly higher. (In my practical experience, for some models, it can actually be worse).
+
+Synchronous training usually faces scalability and performance issues if not carefully implemented or deployed. In [2], naive synchronous training can be 20%~40% slower than asynchronous training. A common trick to avoid slowness, discussed in [1] and [2], is to have backups: N+M replicas are scheduled while only the first N are needed for the training step to proceed.
+
+Similar to asynchronous training, the benefit of synchronous training diminishes quickly. Depending on the model, increasing the number of trainers (effectively the batch size) beyond a point won’t deliver faster convergence time or better final model quality.
+
+# Codistillation
+Codistillation is a technique that tries to scale the training further. A few training instances (each of which can itself be distributed) are run during the same period. Each training instance has extra losses that come from the predictions of the other training instances (like teacher and student). The training process converges faster and usually reaches better model quality. [4]
+
+
+# Reference
+
+[1] Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Andrew Senior, Paul Tucker, Ke Yang, Quoc V Le, et al. Large scale distributed deep networks.
+
+[2] Jianmin Chen, Rajat Monga, Samy Bengio, and Rafal Jozefowicz. Revisiting distributed synchronous SGD.
+
+[3] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine translation system: Bridging the gap between human and machine translation.
+
+[4] LARGE SCALE DISTRIBUTED NEURAL NETWORK TRAINING THROUGH ONLINE DISTILLATION
diff --git a/doc/fluid/design/dist_train/index_cn.rst b/doc/fluid/design/dist_train/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed6f3dda271d2de58d92aa7ec804fa9e68dfc48a
--- /dev/null
+++ b/doc/fluid/design/dist_train/index_cn.rst
@@ -0,0 +1,9 @@
+分布式训练
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ distributed_architecture.md
+ distributed_lookup_table_design.md
+ parameter_server.md
diff --git a/doc/fluid/design/dist_train/index_en.rst b/doc/fluid/design/dist_train/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f84688f168021113bd933802709bcd787b474bca
--- /dev/null
+++ b/doc/fluid/design/dist_train/index_en.rst
@@ -0,0 +1,9 @@
+Distributed Training
+---------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ distributed_architecture.md
+ distributed_lookup_table_design.md
+ parameter_server.md
diff --git a/doc/fluid/design/dist_train/mpi_enabled_design.md b/doc/fluid/design/dist_train/mpi_enabled_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..4ad3afc7b7522c60460c6f1f387f9415d3738778
--- /dev/null
+++ b/doc/fluid/design/dist_train/mpi_enabled_design.md
@@ -0,0 +1,46 @@
+# MPI-enabled PaddlePaddle Design doc
+
+# Background
+When we do distributed multi-GPU training, the communication overhead between servers becomes the major bottleneck, for the following reasons:
+1. Data must be copied at least once from GPU to CPU memory so that it is ready to transfer. And on the pserver side, copying data from CPU to GPU introduces more overhead.
+2. GPU->CPU data transfer is 10 times slower than data transfer between GPUs or between PCIe devices.
+3. TCP connections can not make full use of RDMA 100Gb devices.
+
+We will add the OpenMPI API to PaddlePaddle, which can bring two benefits:
+1. Enable RDMA with PaddlePaddle, which brings high-performance, low-latency networking.
+2. Enable GPUDirect with PaddlePaddle, which brings the highest-throughput and lowest-latency GPU reads and writes.
+
+# Change list
+* Compile args: Add compile args to enable MPI support.
+* Execute args: Add execute args to specify when and how to use MPI operations.
+* New ops: Add new ops ```mpi_send_op``` and ```mpi_listenandserve_op``` to support MPI send and receive.
+* Transpiler optimization: The transpiler can add ```mpi_send_op``` and ```mpi_listenandserve_op``` to the running graph.
+* MPI utils package: Provide an MPI utils package as the supporting low-level API.
+
+## Compile args
+Because MPI and CUDA require hardware support, we will add compile args to enable MPI support and control compilation. Add a ```WITH_MPI``` compile arg to control whether MPI is used. If ```WITH_MPI``` is ```ON```, the compile system will look for OpenMPI during configuration. We should prepare the OpenMPI environment before compiling.
+
+## Execute args
+Launch the script using the ```mpirun``` launcher, for example: ```mpirun -np 3 -hosts node1,node2,node3 python train.py```. By doing this, we can number the actors (trainer/pserver/master) with 0 .. (n-1). A node's number is the rank (an integer) of the calling process in its communicator group; the MPI processes identify each other using this Rank ID. We have to create a mapping between PaddlePaddle's nodes and their Rank IDs so that we can communicate with the correct destinations when using MPI operations.
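+
+A hedged sketch of building such a mapping from the environment variables that ```mpirun``` sets for each process is shown below; the role assignment is a hypothetical convention, not the actual transpiler logic.
+
+```python
+import os
+
+rank = int(os.environ["OMPI_COMM_WORLD_RANK"])        # this process's MPI rank
+world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])  # total number of processes
+
+NUM_PSERVERS = 1  # hypothetical: rank 0 is the master, the next ranks are pservers
+if rank == 0:
+    role = "master"
+elif rank <= NUM_PSERVERS:
+    role = "pserver"
+else:
+    role = "trainer"
+print("rank %d of %d runs as %s" % (rank, world_size, role))
+```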
+
+## New ops
+We won't replace all the gRPC requests with MPI requests: the standard gRPC library is used for all administrative operations, and the MPI API will be used to transfer tensors or SelectedRows to pservers. Based on this idea, we create two new operators to handle sends and receives: ```mpi_send_op``` and ```mpi_listenandserve_op```. They are a little similar to [send_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/send_op.cc) and [listen_and_serv_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/listen_and_serv_op.cc); also, we will build a new module to package the MPI send and receive process.
+
+### mpi_send_op
+Very similar to ```send_op```, but we will replace the gRPC code used to send gradients with ```mpi_module```; at the same time, we will wrap it with ```framework::Async```.
+
+### mpi_listenandserve_op
+Very similar to ```listen_and_serv_op```, but we will replace the gRPC code used to receive gradients with ```mpi_module```; at the same time, we will wrap it with ```framework::Async```.
+
+## Transpiler optimization
+**We can read the env variables ```OMPI_COMM_WORLD_SIZE``` and ```OMPI_COMM_WORLD_RANK``` to determine whether MPI is used; if we launch with OpenMPI, these variables must exist in the environment.**
+ If MPI is confirmed to be in use, we will modify ```send_op``` to ```mpi_send_op``` in distribute_transpiler, and also modify ```listenandserve_op``` to ```mpi_listenandserve_op```.
+
+## MPI utils package
+In this package, we will wrap the OpenMPI low-level API to use MPI.
+The APIs included in this package are:
+* MPI send and receive module. We will build a new module to package the MPI send and receive process. MPI send and receive differ from gRPC: the MPI [receive](https://www.open-mpi.org/doc/v1.8/man3/MPI_Irecv.3.php) must know the receive buffer size and element type. For this reason, we have to communicate twice: the first communication sends metadata about the gradient through gRPC, and the second is the real communication through MPI, which sends the gradient data to mpi_listenandserve_op.
+The detailed flow is below:
+
+* MPI global configurations, which store the Rank ID and the mapping in global variables, for example:
+gRPC client : MPI nodes :``` 127.0.0.1:32004 : 3 ```
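+
+To make the two-phase protocol concrete, here is a self-contained Python sketch; the gRPC and MPI calls are stand-in stub functions defined inside the sketch, not the real transports:
+
+```python
+import numpy as np
+
+def grpc_send_meta(dst, meta):
+    # Stand-in for the gRPC call that tells the receiver what to expect.
+    print("gRPC -> %s : %s" % (dst, meta))
+
+def mpi_send_payload(dst, payload):
+    # Stand-in for the MPI send; the receiver can post its receive with a
+    # correctly sized buffer because it already knows shape and dtype.
+    print("MPI  -> %s : %d bytes" % (dst, payload.nbytes))
+
+def send_gradient(dst, name, grad):
+    meta = {"name": name, "dtype": str(grad.dtype), "shape": grad.shape}
+    grpc_send_meta(dst, meta)    # phase 1: metadata over gRPC
+    mpi_send_payload(dst, grad)  # phase 2: gradient data over MPI
+
+send_gradient("127.0.0.1:32004", "fc_0.w_0@GRAD", np.zeros((4, 8), dtype="float32"))
+```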
diff --git a/doc/fluid/design/dist_train/multi_cpu.md b/doc/fluid/design/dist_train/multi_cpu.md
new file mode 100644
index 0000000000000000000000000000000000000000..38222d083084ebfca3099ce96b47868c42d55101
--- /dev/null
+++ b/doc/fluid/design/dist_train/multi_cpu.md
@@ -0,0 +1,43 @@
+# Design Doc: Execute the Program with Multi CPU
+
+## Abstract
+
+This design doc proposes an approach to run a user-defined Op graph
+on multiple CPUs: an automatic transpiler converts the user-defined
+Op graph into a multi-CPU Op graph, and a `ParallelDo` Op runs the converted graph.
+
+## Transpiler
+
+
+
+After conversion:
+
+
+
+## Implementation
+
+- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph
+ which would be executed with multi-threads.
+- `BlockingCounter` will `Init/Decrement` an atomic counter, and Blocking `Wait`
+ for the atomic counter become `0`:
+ ```cpp
+ BlockingCounter bc(thread_count);
+ for (int i = 0; i < thread_count; ++i) {
+    thread_pool->Start([&bc] { bc.DecrementCount(); });
+ }
+ bc.Wait();
+ ```
+- `ParallelDo` Operator
+  - Initialize a thread pool, which is a singleton.
+  - Take a block id as the input, and run the specified Block on an independent scope
+    in each thread.
+  - Initialize a `BlockingCounter` instance and wait until all threads are done.
+- `Split` Operator will split the input Tensor into a TensorArray.
+- `Merge` Operator will merge the gradients calculated in different threads
+  with a `mean/sum/max/min...` method, and then run the Optimizer Op to optimize `W` (see the sketch after this list).
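+
+A toy sketch of the whole flow (plain Python threads and NumPy, not the Fluid operators), assuming equally sized splits and a `mean` merge:
+
+```python
+import threading
+import numpy as np
+
+class BlockingCounter(object):
+    """Counts down from `n`; wait() blocks until the counter reaches 0."""
+    def __init__(self, n):
+        self._n, self._cond = n, threading.Condition()
+    def decrement(self):
+        with self._cond:
+            self._n -= 1
+            if self._n == 0:
+                self._cond.notify_all()
+    def wait(self):
+        with self._cond:
+            while self._n > 0:
+                self._cond.wait()
+
+def parallel_do(x, grad_fn, num_threads=4):
+    chunks = np.array_split(x, num_threads)      # Split
+    grads = [None] * num_threads
+    bc = BlockingCounter(num_threads)
+    def worker(i):
+        grads[i] = grad_fn(chunks[i])            # run the block in its own "scope"
+        bc.decrement()
+    for i in range(num_threads):
+        threading.Thread(target=worker, args=(i,)).start()
+    bc.wait()                                    # wait until all threads are done
+    return np.mean(grads, axis=0)                # Merge with the `mean` method
+
+print(parallel_do(np.arange(8.0), lambda chunk: 2.0 * chunk))
+```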
+
+## TODO
+
+- Improve the optimizer stage with multi-threads, since we could
+  assign the parameters to different threads and execute the
+  optimizer with multiple threads.
diff --git a/doc/fluid/design/dist_train/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md
new file mode 100644
index 0000000000000000000000000000000000000000..563b70bc0e852bec953eb40dda3c46b3d45d7e68
--- /dev/null
+++ b/doc/fluid/design/dist_train/parameter_server.md
@@ -0,0 +1,106 @@
+# Design Doc: Parameter Server
+
+## Abstract
+
+We propose an approach to implement the parameter server. In this
+approach, there is no fundamental difference between the trainer and
+the parameter server: they both run subgraphs, but subgraphs of
+different purposes.
+
+## Background
+
+The previous implementations of the parameter server do not run a
+fluid sub-program. Parameter initialization, optimizer computation, network
+communication and checkpointing are implemented twice on both the
+trainer as well as the parameter server.
+
+It would be great if we can write code once and use it on both the
+trainer and the parameter server, since this reduces code duplication and
+improves extensibility. Given that after the current refactoring we
+represent everything as a computation graph on the trainer, representing
+everything as a computation graph on the parameter server becomes a natural
+extension.
+
+## Design
+
+### Distributed Transpiler
+
+The *Distributed Transpiler* converts the user-defined fluid program
+into sub-programs to be scheduled on different nodes with the following
+steps:
+
+1. OP placement: the OPs will be placed on different nodes according
+ to a heuristic that minimizes the estimated total computation
+ time. Currently we will use a simple heuristic that puts parameter
+   variables on parameter server workers and everything else on trainer
+ workers.
+1. Add communication OPs to enable the communication between nodes.
+
+We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
+
+Below is an example of converting the user defined graph to the
+subgraphs for the trainer and the parameter server:
+
+
+
+After converting:
+
+
+
+1. The parameter variable W and its optimizer program are placed on the parameter server.
+1. Operators are added to the program.
+ - *Send* sends data to the connected *Recv* operator. The
+     scheduler on the receive node will only schedule the *Recv* operator
+     to run when the *Send* operator has run (the *Send* OP will mark
+ the *Recv* OP runnable automatically).
+   - *Enqueue* enqueues the input variable; it can block until space
+     becomes available in the queue.
+   - *Dequeue* outputs a configurable number of tensors from the
+     queue. It will block until the queue has the required number of
+     tensors (see the sketch after this list).
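+
+As an illustration of the blocking semantics only (standard Python queues, not the actual Fluid operators), an *Enqueue*/*Dequeue* pair with a `min_count` might behave like this:
+
+```python
+import queue
+import threading
+
+q = queue.Queue(maxsize=4)          # Enqueue blocks when the queue is full
+
+def dequeue(min_count):
+    """Block until `min_count` tensors are available, then return them."""
+    return [q.get() for _ in range(min_count)]  # each get() blocks as needed
+
+def producer():
+    for grad_id in range(8):
+        q.put("grad_%d" % grad_id)  # Enqueue
+
+threading.Thread(target=producer).start()
+print(dequeue(min_count=4))
+print(dequeue(min_count=4))
+```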
+
+### Sparse Update
+
+For embedding layers, the gradient may have many rows containing only 0 during training.
+If the gradient uses a dense tensor for parameter optimization,
+it wastes memory, slows down the calculation and wastes
+bandwidth during distributed training.
+In Fluid, we introduce [SelectedRows](../modules/selected_rows.md) to represent a list of rows containing
+non-zero gradient data. So when we do parameter optimization both locally and remotely,
+we only need to send those non-zero rows to the optimizer operators:
+
+
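+A rough sketch of the idea (NumPy only, not the actual SelectedRows implementation): the gradient is stored as a small set of rows plus their values, and an element-wise optimizer touches only those rows.
+
+```python
+import numpy as np
+
+# A SelectedRows-style sparse gradient: only these rows are non-zero.
+rows = np.array([2, 5, 9])
+values = np.random.rand(3, 8).astype("float32")  # one 8-dim gradient per selected row
+
+W = np.zeros((10000, 8), dtype="float32")        # the embedding table
+lr = 0.01
+
+# Element-wise optimizers (e.g. SGD) only need the touched rows,
+# so only `rows` and `values` have to be sent over the network.
+W[rows] -= lr * values
+```
+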
+### Benefits
+
+- Model parallelism becomes easier to implement: it is an extension to
+ the trainer - parameter server approach. We can have several "Transpilers"
+ to achieve different goals.
+- User-defined optimizers are easier to add - users can now express them as
+  sub-programs.
+- No more duplicated logic inside the trainer and the parameter
+  server, as mentioned in the background section.
+
+### Challenges
+
+- It is important to balance the parameter shards on multiple
+ parameter servers. If a single parameter is very big (for example: some
+ word-embedding, fully connected, softmax layer), we need to
+ automatically partition the single parameter onto different
+ parameter servers when possible (only element-wise optimizer depends
+ on the parameter variable).
+- In the "Async SGD" figure, the "W" variable on the parameter server
+ could be read and written concurrently. See
+ [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
+ details about concurrent program in Fluid.
+
+### Discussion
+
+- Can the Enqueue OP be implemented under our current tensor design
+ (put the input tensor into the queue tensor)?
+- *Dequeue* OP will have variable numbers of output (depending on the
+ `min_count` attribute), does our current design support it? (similar
+ question for the *Add* OP)
+
+### References
+
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
diff --git a/doc/fluid/design/dist_train/src/async_distributed_training.png b/doc/fluid/design/dist_train/src/async_distributed_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..3b53ab59c0cd7b44b2956f16f1adc47fe85909d3
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_distributed_training.png differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.graffle b/doc/fluid/design/dist_train/src/async_pserver.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d2301611774fcb3866473e3e6470568d1e1312cf
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.png b/doc/fluid/design/dist_train/src/async_pserver.png
new file mode 100644
index 0000000000000000000000000000000000000000..7d900b0c0eb291c67537b9cf93227c671bafdc73
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.png differ
diff --git a/doc/fluid/design/dist_train/src/async_update.graffle b/doc/fluid/design/dist_train/src/async_update.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..3a631888688a0d564a873fcb16d943958c91223e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_update.png b/doc/fluid/design/dist_train/src/async_update.png
new file mode 100644
index 0000000000000000000000000000000000000000..3e8db973f45d6d9ac8dcce1dc7878067e79e6dcc
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.png differ
diff --git a/doc/fluid/design/dist_train/src/compiler.graffle b/doc/fluid/design/dist_train/src/compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..8cc678fea3c820103e7ce81f7a5d625d6c1d92de
Binary files /dev/null and b/doc/fluid/design/dist_train/src/compiler.graffle differ
diff --git a/doc/fluid/design/dist_train/src/compiler.png b/doc/fluid/design/dist_train/src/compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..65d34f841afce9756def07dd8ecb9ca44e658bfe
Binary files /dev/null and b/doc/fluid/design/dist_train/src/compiler.png differ
diff --git a/doc/fluid/design/dist_train/src/dist-graph.graffle b/doc/fluid/design/dist_train/src/dist-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..941399c6ced8d5f65b6c595522b770c88259df4b
Binary files /dev/null and b/doc/fluid/design/dist_train/src/dist-graph.graffle differ
diff --git a/doc/fluid/design/dist_train/src/dist-graph.png b/doc/fluid/design/dist_train/src/dist-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3546b09f1c2ee3e4f60f519d5e47f823f08051a7
Binary files /dev/null and b/doc/fluid/design/dist_train/src/dist-graph.png differ
diff --git a/doc/fluid/design/dist_train/src/distributed_architecture.graffle b/doc/fluid/design/dist_train/src/distributed_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d1b60141342232e06227c2d430ebc60ec349a907
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_architecture.graffle differ
diff --git a/doc/fluid/design/dist_train/src/distributed_architecture.png b/doc/fluid/design/dist_train/src/distributed_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_architecture.png differ
diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..65dfdbbacd219739db6ddfdf243cc16c3c4e8d1e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle differ
diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..5353a16fd329f62ff893d32706b9c3c0bcc46a07
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg differ
diff --git a/doc/fluid/design/dist_train/src/distributed_training.graffle b/doc/fluid/design/dist_train/src/distributed_training.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..1168801bc1fadfce310a74cb3110695bd1629f6b
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_training.graffle differ
diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..96ca6d48f43bd9f49c6861dab006e2037873db87
Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle differ
diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..afa25ab3b4e427bc595a855b12ab966478e01ed0
Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png differ
diff --git a/doc/fluid/design/dist_train/src/local-graph.graffle b/doc/fluid/design/dist_train/src/local-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..19e509bd9af3c1e9a3f5e0f16ddd281457a339c5
Binary files /dev/null and b/doc/fluid/design/dist_train/src/local-graph.graffle differ
diff --git a/doc/fluid/design/dist_train/src/local-graph.png b/doc/fluid/design/dist_train/src/local-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..ada51200f793a9bb18911e7d63cfdb3244b967d7
Binary files /dev/null and b/doc/fluid/design/dist_train/src/local-graph.png differ
diff --git a/doc/fluid/design/dist_train/src/local_architecture.graffle b/doc/fluid/design/dist_train/src/local_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..49fcc663ebe3824aa234e3a67aadf285cb417877
Binary files /dev/null and b/doc/fluid/design/dist_train/src/local_architecture.graffle differ
diff --git a/doc/fluid/design/dist_train/src/local_architecture.png b/doc/fluid/design/dist_train/src/local_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122
Binary files /dev/null and b/doc/fluid/design/dist_train/src/local_architecture.png differ
diff --git a/doc/fluid/design/dist_train/src/lookup_table.png b/doc/fluid/design/dist_train/src/lookup_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..72dfe3547f731d0d090338afb206b0549dff472e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/lookup_table.png differ
diff --git a/doc/fluid/design/dist_train/src/lookup_table_training.png b/doc/fluid/design/dist_train/src/lookup_table_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/lookup_table_training.png differ
diff --git a/doc/fluid/design/dist_train/src/mpi_module.png b/doc/fluid/design/dist_train/src/mpi_module.png
new file mode 100644
index 0000000000000000000000000000000000000000..e6b6a3e5d6f68baeeb67d7f71154bd8d85f32b6f
Binary files /dev/null and b/doc/fluid/design/dist_train/src/mpi_module.png differ
diff --git a/doc/fluid/design/dist_train/src/multi-threads.graffle b/doc/fluid/design/dist_train/src/multi-threads.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6
Binary files /dev/null and b/doc/fluid/design/dist_train/src/multi-threads.graffle differ
diff --git a/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png b/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9
Binary files /dev/null and b/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png differ
diff --git a/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png b/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a
Binary files /dev/null and b/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png differ
diff --git a/doc/fluid/design/dist_train/src/ncc2_design.graffle b/doc/fluid/design/dist_train/src/ncc2_design.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..7d2753bbb03bc28c7a0054bb0aa424deb072ffbf
Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.graffle differ
diff --git a/doc/fluid/design/dist_train/src/ncc2_design.png b/doc/fluid/design/dist_train/src/ncc2_design.png
new file mode 100644
index 0000000000000000000000000000000000000000..da0d5ee81f5dfeb4ca1356601b0bb5870456e3d6
Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.png differ
diff --git a/doc/fluid/design/dist_train/src/paddle-compile.graffle b/doc/fluid/design/dist_train/src/paddle-compile.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..a6348cc3dbcaca923c6e794681b2edb85cb9f8f6
Binary files /dev/null and b/doc/fluid/design/dist_train/src/paddle-compile.graffle differ
diff --git a/doc/fluid/design/dist_train/src/paddle-compile.png b/doc/fluid/design/dist_train/src/paddle-compile.png
new file mode 100644
index 0000000000000000000000000000000000000000..e0f13d551ac41afaec627a57dea79356464bf0bf
Binary files /dev/null and b/doc/fluid/design/dist_train/src/paddle-compile.png differ
diff --git a/doc/fluid/design/dist_train/src/remote_executor.graffle b/doc/fluid/design/dist_train/src/remote_executor.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..41b2067311694b56d211a4f32d1b76884eeffd2d
Binary files /dev/null and b/doc/fluid/design/dist_train/src/remote_executor.graffle differ
diff --git a/doc/fluid/design/dist_train/src/remote_executor.png b/doc/fluid/design/dist_train/src/remote_executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..744e2fb2e0f1bbe058e991ba7b2a09000965ee79
Binary files /dev/null and b/doc/fluid/design/dist_train/src/remote_executor.png differ
diff --git a/doc/fluid/design/dist_train/src/sparse_update.graffle b/doc/fluid/design/dist_train/src/sparse_update.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..08d689a58f83698d8c1158ee3990ed8abf3a7a9a
Binary files /dev/null and b/doc/fluid/design/dist_train/src/sparse_update.graffle differ
diff --git a/doc/fluid/design/dist_train/src/sparse_update.png b/doc/fluid/design/dist_train/src/sparse_update.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c872e6ac479f7d1b818a4a207956c43155d0ad7
Binary files /dev/null and b/doc/fluid/design/dist_train/src/sparse_update.png differ
diff --git a/doc/fluid/design/dist_train/src/sync_distributed_training.png b/doc/fluid/design/dist_train/src/sync_distributed_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..e4f9a221fea4b7238e8a1d84e609c0371f6ef7a2
Binary files /dev/null and b/doc/fluid/design/dist_train/src/sync_distributed_training.png differ
diff --git a/doc/fluid/design/dynamic_rnn/2_level_rnn.dot b/doc/fluid/design/dynamic_rnn/2_level_rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..5d77865061ca7bbbfcf254dd938f09aef5553505
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/2_level_rnn.dot
@@ -0,0 +1,56 @@
+digraph G {
+
+ rnn [label="1st level RNN" shape=box]
+
+ subgraph cluster0 {
+ label = "time step 0"
+
+ sent0 [label="sentence"]
+ sent1 [label="sentence"]
+
+ rnn1 [label="2nd level RNN" shape=box]
+
+ sent0 -> rnn1
+ sent1 -> rnn1
+ }
+
+ subgraph cluster1 {
+ label = "time step 1"
+
+ sent2 [label="sentence"]
+ sent3 [label="sentence"]
+
+ rnn2 [label="2nd level RNN" shape=box]
+
+ sent2 -> rnn2
+ sent3 -> rnn2
+ }
+
+ subgraph cluster2 {
+ label = "time step 2"
+
+ sent4 [label="sentence"]
+ sent5 [label="sentence"]
+
+ rnn3 [label="2nd level RNN" shape=box]
+
+ sent4 -> rnn3
+ sent5 -> rnn3
+ }
+
+
+ para0 [label="paragraph info 0"]
+ para1 [label="paragraph info 1"]
+ para2 [label="paragraph info 2"]
+
+ rnn1 -> para0
+ rnn2 -> para1
+ rnn3 -> para2
+
+ para0 -> rnn
+ para1 -> rnn
+ para2 -> rnn
+
+ chapter [label="chapter info"]
+ rnn -> chapter
+}
diff --git a/doc/fluid/design/dynamic_rnn/2_level_rnn.png b/doc/fluid/design/dynamic_rnn/2_level_rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..0537a75beb175c0c284717421f7aa908da2a5038
Binary files /dev/null and b/doc/fluid/design/dynamic_rnn/2_level_rnn.png differ
diff --git a/doc/fluid/design/dynamic_rnn/index_cn.rst b/doc/fluid/design/dynamic_rnn/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1d224d22cf7103616f44115db01f0ae55f1cb88a
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/index_cn.rst
@@ -0,0 +1,8 @@
+动态RNN
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ rnn.md
+ rnn_design.md
diff --git a/doc/fluid/design/dynamic_rnn/index_en.rst b/doc/fluid/design/dynamic_rnn/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..568f496e4ffe21a5e730488aef905f7e2d98839e
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/index_en.rst
@@ -0,0 +1,8 @@
+Dynamic RNN
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ rnn.md
+ rnn_design.md
diff --git a/doc/fluid/design/dynamic_rnn/rnn.dot b/doc/fluid/design/dynamic_rnn/rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn.dot
@@ -0,0 +1,87 @@
+digraph G {
+ label = "simple RNN implementation"
+
+ ranksep=2;
+
+ //graph [nodesep=1, ranksep=1];
+
+ node[nodesep=1]
+
+ subgraph cluster0 {
+ label = "global scope"
+ rankdir = TB
+ W
+ boot_memory
+ input
+ output
+ }
+
+ subgraph cluster1 {
+ label = "step-scope 0"
+ rankdir = TB
+ memory0[label="memory"]
+ prememory0[label="pre-memory"]
+ step_input0[label="step input"]
+ step_output0[label="step output"]
+ }
+
+ subgraph cluster2 {
+ label = "step-scope 1"
+ rankdir = TB
+ memory1[label="memory"]
+ prememory1[label="pre-memory"]
+ step_input1[label="step input"]
+ step_output1[label="step output"]
+ }
+
+ subgraph cluster3 {
+ label = "step-scope 2"
+ rankdir = TB
+ memory2[label="memory"]
+ prememory2[label="pre-memory"]
+ step_input2[label="step input"]
+ step_output2[label="step output"]
+ }
+
+ stepnet [shape=box]
+ stepnet0 [shape=box, style=dashed]
+ stepnet1 [shape=box, style=dashed]
+ stepnet2 [shape=box, style=dashed]
+
+
+ edge[color=blue]
+ boot_memory -> prememory0 [label="init" color="blue"]
+ memory0 -> prememory1 [label="copy/reference" color="blue"]
+ memory1 -> prememory2 [label="copy/reference" color="blue"]
+
+ edge[color=black]
+ W -> stepnet0[constraint=false, style=dashed]
+ W -> stepnet1[constraint=false, style=dashed]
+ W -> stepnet2[constraint=false, style=dashed]
+
+ memory0 -> stepnet0[style=dashed]
+ prememory0 -> stepnet0 -> step_output0[style=dashed]
+
+ memory1 -> stepnet1[style=dashed]
+ prememory1 -> stepnet1 -> step_output1[style=dashed]
+
+ memory2 -> stepnet2[style=dashed]
+ prememory2 -> stepnet2 -> step_output2[style=dashed]
+
+ input -> step_input0
+ input -> step_input1
+ input -> step_input2
+
+ step_input0 -> stepnet0 [style=dashed]
+ step_input1 -> stepnet1[style=dashed]
+ step_input2 -> stepnet2[style=dashed]
+
+ step_output0 -> output
+ step_output1 -> output
+ step_output2 -> output
+
+ stepnet0 -> stepnet[style=dashed]
+ stepnet1 -> stepnet[style=dashed]
+ stepnet2 -> stepnet[style=dashed]
+
+}
diff --git a/doc/fluid/design/dynamic_rnn/rnn.jpg b/doc/fluid/design/dynamic_rnn/rnn.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9867e404cf959df0dce6ded5222b466c788fb840
Binary files /dev/null and b/doc/fluid/design/dynamic_rnn/rnn.jpg differ
diff --git a/doc/fluid/design/dynamic_rnn/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md
new file mode 100644
index 0000000000000000000000000000000000000000..b39ae0675c45e56852293d97f45e91861cf31667
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@@ -0,0 +1,153 @@
+# RNNOp design
+
+This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future.
+
+## RNN Algorithm Implementation
+
+
+
+
+
+The above diagram shows an RNN unrolled into a full network.
+
+There are several important concepts here:
+
+- *step-net*: the sub-graph that runs at each step.
+- *memory*, $h_t$, the state of the current step.
+- *ex-memory*, $h_{t-1}$, the state of the previous step.
+- *initial memory value*, the memory of the first (initial) step.
+
+### Step-scope
+
+There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
+
+
+
+Figure 2 illustrates the RNN's data flow
+
+
+Please be aware that every step runs the same step-net. Each step does the following:
+
+1. Creates the step-scope.
+2. Initializes the local variables including step-outputs, in the step-scope.
+3. Runs the step-net, which uses the above mentioned variables.
+
+The RNN operator will compose its output from step outputs in each of the step scopes.
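+
+The following toy sketch (plain Python, not the Fluid runtime) mirrors these steps: every step gets a fresh step-scope, runs the same step-net, and the RNN output is composed from the per-step outputs.
+
+```python
+def run_rnn(step_net, step_inputs, init_memory):
+    outputs, memory = [], init_memory
+    for x_t in step_inputs:
+        scope = {"x": x_t, "pre_memory": memory}          # 1. create the step-scope
+        scope["memory"], scope["out"] = step_net(scope)   # 2-3. init locals, run the step-net
+        outputs.append(scope["out"])
+        memory = scope["memory"]
+    return outputs                                        # composed from step outputs
+
+# a toy step-net: new memory and step output from pre-memory and step input
+step_net = lambda s: (0.5 * s["pre_memory"] + s["x"], s["pre_memory"] + s["x"])
+print(run_rnn(step_net, step_inputs=[1.0, 2.0, 3.0], init_memory=0.0))
+```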
+
+### Memory and Ex-memory
+
+Let's give more details about memory and ex-memory using a simple example:
+
+$$
+h_t = U h_{t-1} + W x_t
+$$,
+
+where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively.
+
+In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step,
+or copy the memory value of the previous step to the current ex-memory variable.
+
+### Usage in Python
+
+For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md).
+
+We can define an RNN's step-net using a Block:
+
+```python
+import paddle as pd
+
+X = some_op() # x is some operator's output and is a LoDTensor
+a = some_op()
+
+# declare parameters
+W = pd.Variable(shape=[20, 30])
+U = pd.Variable(shape=[20, 30])
+
+rnn = pd.create_rnn_op(output_num=1)
+with rnn.stepnet():
+ x = rnn.add_input(X)
+ # declare a memory (rnn's step)
+ h = rnn.add_memory(init=a)
+ # h.pre_state(), the previous memory of rnn
+ new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state()))
+ # update current memory
+ h.update(new_state)
+ # indicate that h variables in all step scopes should be merged
+ rnn.add_outputs(h)
+
+out = rnn()
+```
+
+Python API functions in above example:
+
+- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs.
+- `rnn.add_memory`: creates a variable used as the memory.
+- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output.
+
+### Nested RNN and LoDTensor
+
+An RNN whose step-net includes other RNN operators is known as a *nested RNN*.
+
+For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level.
+
+The following figure illustrates feeding text into the lower level, one sentence per step, and feeding the step outputs into the top level. The final top-level output represents the whole text.
+
+
+
+
+
+```python
+import paddle as pd
+
+W = pd.Variable(shape=[20, 30])
+U = pd.Variable(shape=[20, 30])
+
+W0 = pd.Variable(shape=[20, 30])
+U0 = pd.Variable(shape=[20, 30])
+
+# a is output of some op
+a = some_op()
+
+# chapter_data is a set of 128-dim word vectors
+# the first level of LoD is sentence
+# the second level of LoD is a chapter
+chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)
+
+def lower_level_rnn(paragraph):
+    '''
+    paragraph: the input, one paragraph of sentences
+    '''
+ rnn = pd.create_rnn_op(output_num=1)
+ with rnn.stepnet():
+ sentence = rnn.add_input(paragraph, level=0)
+ h = rnn.add_memory(shape=[20, 30])
+ h.update(
+ pd.matmul(W, sentence) + pd.matmul(U, h.pre_state()))
+ # get the last state as sentence's info
+ rnn.add_outputs(h)
+ return rnn
+
+top_level_rnn = pd.create_rnn_op(output_num=1)
+with top_level_rnn.stepnet():
+    paragraph_data = top_level_rnn.add_input(chapter_data, level=1)
+ low_rnn = lower_level_rnn(paragraph_data)
+ paragraph_out = low_rnn()
+
+    h = top_level_rnn.add_memory(init=a)
+    h.update(
+        pd.matmul(W0, paragraph_out) + pd.matmul(U0, h.pre_state()))
+ top_level_rnn.add_outputs(h)
+
+# output the last step
+chapter_out = top_level_rnn(output_all_steps=False)
+```
+
+In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
+
+By default, the `RNNOp` will concatenate the outputs from all the time steps.
+If the `output_all_steps` is set to False, it will only output the final time step.
+
+
+
+
+
diff --git a/doc/fluid/design/dynamic_rnn/rnn.png b/doc/fluid/design/dynamic_rnn/rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..e139e373fe8396782044cfd936fdde624f8c66fe
Binary files /dev/null and b/doc/fluid/design/dynamic_rnn/rnn.png differ
diff --git a/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot b/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot
new file mode 100644
index 0000000000000000000000000000000000000000..1d85ae2617a915ad0ad8288d848b607cc37ad297
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot
@@ -0,0 +1,75 @@
+digraph G {
+ chapter [label="chapter"]
+
+ subgraph cluster0 {
+ label = "paragraph 0"
+
+ top_rnn0[label="top rnn step 0" shape=box]
+
+ p0 [label="paragraph 0"]
+ p1 [label="paragraph 1"]
+ }
+
+ subgraph cluster1{
+ label = "paragraph 1"
+
+ top_rnn1[label="top rnn step 1" shape=box]
+
+ p2 [label="paragraph 0"]
+ p3 [label="paragraph 1"]
+ }
+
+ subgraph cluster_p0 {
+ label = "sentence 0"
+
+ low_rnn0 [label="low rnn step 0" shape=box]
+ s00 [label="sentence 0"]
+ s01 [label="sentence 1"]
+
+ low_rnn0 -> s00
+ low_rnn0 -> s01
+ }
+
+ subgraph cluster_p1 {
+ label = "sentence 1"
+ low_rnn1 [label="low rnn step 1" shape=box]
+ s10 [label="sentence 0"]
+ s11 [label="sentence 1"]
+ low_rnn1 -> s10
+ low_rnn1 -> s11
+ }
+
+ subgraph cluster_p2 {
+ label = "sentence 1"
+ low_rnn2 [label="low rnn step 0" shape=box]
+ s20 [label="sentence 0"]
+ s21 [label="sentence 1"]
+ low_rnn2 -> s20
+ low_rnn2 -> s21
+ }
+
+ subgraph cluster_p3 {
+ label = "sentence 1"
+ low_rnn3 [label="low rnn step 1" shape=box]
+ s30 [label="sentence 0"]
+ s31 [label="sentence 1"]
+ low_rnn3 -> s30
+ low_rnn3 -> s31
+ }
+
+
+ chapter -> top_rnn0
+ chapter -> top_rnn1
+
+ top_rnn0 -> p0
+ top_rnn0 -> p1
+ top_rnn1 -> p2
+ top_rnn1 -> p3
+
+
+ p0 -> low_rnn0
+ p1 -> low_rnn1
+ p2 -> low_rnn2
+ p3 -> low_rnn3
+
+}
diff --git a/doc/fluid/design/dynamic_rnn/rnn_2level_data.png b/doc/fluid/design/dynamic_rnn/rnn_2level_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..4be81b2430717a6a506342a09fc26899568574c6
Binary files /dev/null and b/doc/fluid/design/dynamic_rnn/rnn_2level_data.png differ
diff --git a/doc/fluid/design/dynamic_rnn/rnn_design.md b/doc/fluid/design/dynamic_rnn/rnn_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..cecfcd3307ae4c4fa603220a360e9e124069fa58
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn_design.md
@@ -0,0 +1,242 @@
+# RNN 变长输入设计
+对变长序列的学习,现有主流框架比如 tensorflow, pytorch, caffe2, mxnet 等均使用了padding的方式,
+即将一个mini-batch内不同长度的序列补0到固定长度参与计算。
+
+现有Paddle包括 `RecurrentLayerGroup` 在内的RNN均实现了无padding的变长序列支持,本文也将基于该模块的思路,设计重构后的变长序列支持。
+
+## 背景介绍
+由于tensor必须有明确的shape,因此基于tensor 的主流框架在存储变长序列时,
+必须用zero-padding的方式将变长序列补全为固定shape的tensor。
+
+由于padding是一种框架实现变长序列的妥协, 从用户角度,在使用RNN类模型时自然会比较介意padding的存在,
+因此会有pytorch中对非padding方式变长序列支持长篇的讨论[3]。
+
+由于padding对内存和计算会有额外的消耗,tensorflow和mxnet均使用了bucketing来进行优化[1][2],
+但不管是padding还是bucket,对于用户都是额外的使用负担。
+
+因此,**paddle原生支持变长序列的方式,能直接满足用户对变长序列的最直接的需求,在当前主流平台中可以算是一大优势**。
+
+但对变长序列的支持,需要对目前框架做一些修改,下面讨论如何在最小修改下支持变长序列。
+
+## 多层序列数据格式 `LODTensor`
+目前 Paddle 会将一个mini-batch内的数据存储在一维的内存上,
+额外使用 `Argument.sequenceStartPositions` 来存储每个句子的信息。
+
+Paddle里使用 `Argument.subSequenceStartPositions` 来存储2层的序列信息,更高维度的序列则无法直接支持;
+
+为了支持 `N-level` 序列的存储,本文将序列信息定义成如下数据结构:
+
+```c++
+std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+```
+
+或者更明确的定义
+
+```c++
+typedef std::vector<int> level_t;
+std::vector<level_t> lod_start_pos;
+```
+
+这里的每一个 `level_t` 存储一个粒度(level)的偏移信息,和paddle目前做法一致。
+
+为了更透明地传递序列信息,我们引入了一种新的tensor 称为 `LODTensor`[4],
+其关于tensor相关的接口都直接继承自 `Tensor`,但另外添加了序列相关接口。
+如此,在操作一个 `LODTensor` 时,普通 `Op` 直接当成 `Tensor` 使用,
+而操作序列的 `Op` 会额外操作 `LODTensor` 的变长序列操作的相关接口。
+
+`LODTensor` 具体定义如下:
+
+```c++
+class LODTensor : public Tensor {
+public:
+  size_t Levels() const { return lod_start_pos_->size(); }
+  size_t Elements(int level = 0) const {
+    return (*lod_start_pos_)[level].size();
+ }
+ // slice of level[elem_begin: elem_end]
+ // NOTE low performance in slice seq_start_positions_.
+ // TODO should call Tensor's Slice.
+ LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
+
+ // slice with tensor's data shared with this.
+ LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
+
+ // copy other's lod_start_pos_, to share LOD info.
+  // NOTE the LOD info should not be changed.
+ void ShareConstLODFrom(const LODTensor &other) {
+ lod_start_pos_ = other.lod_start_pos_;
+ }
+ // copy other's lod_start_pos_'s content, free to mutate.
+ void ShareMutableLODFrom(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared<std::vector<std::vector<int>>>(
+        other.lod_start_pos_->begin(), other.lod_start_pos_->end());
+ }
+
+private:
+  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+};
+```
+
+其中, `lod_start_pos_` 使用了 `shared_ptr` 来减少存储和复制的代价,
+可以认为 `LODTensor` 是 `Tensor` 的扩展,几乎完全兼容原始 `Tensor` 的使用。
+
+## 框架支持
+### 框架现有的 `Tensor` 调用替换为 `LODTensor`
+为了实现 `LODTensor` 的传递,框架里很多 `Tensor` 都需要变成 `LODTensor`,
+简单实现,直接 **把之前所有的`Tensor` 全部替换成 `LODTensor`,这里可以直接修改 `pybind.cc` 里面创建`Tensor`的接口**。
+
+此外,用户有可能需要感知序列的存在(比如序列的可视化需要解析模型中输出的序列),因此一些序列操作的API也需要暴露到 python 层。
+
+### `lod_start_pos` 随着Op调用链传递
+框架需要支持下列特性,以实现`lod_start_pos`的传递:
+
+1. 以 `shared_ptr` 的方式实现传递
+ - 不修改 `lod_start_pos` 内容的作为 consumer
+ - 修改 `lod_start_pos` 的作为 producer
+ - 约定 consumer 只需要复制传递过来的 `shared_ptr`
+ - producer 需要创建自己的独立的内存,以存储自己独立的修改,并暴露 `shared_ptr` 给后续 consumer
+ - 由于传递过程是以复制`shared_ptr`的方式实现,因此框架只需要传递一次 `lod_start_pos`
+
+2. 对于不感知 `lod_start_pos` 的Op足够透明
+3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据
+
+具体的设计分为以下3小节
+
+#### `lod_start_pos` 的传递
+
+- 对于不需要修改 `lod_start_pos` 的情况,调用 LODTensor的 `ShareConstLODFrom` 接口实现复制
+- 需要修改的,调用`ShareMutableLODFrom` 接口自己分配内存以存储修改
+
+#### 框架透明
+传递这一步需要加入到网络跑之前的初始化操作中,并且只需要初始化一次,基于当前框架设计的初步方案如下
+
+- 在 Op 的 `attrs` 中添加一项 `do_mutate_lod_info` 的属性,默认为 `false`
+ - 有需要修改 `lod_start_pos` 的Op需要在定义 `OpProto` 时设置为 `true`
+- `OperatorBase` 的 `InferShape` 中会读取 `do_mutate_lod_info` ,并且调用 `LODTensor` 相关的方法实现 `lod_start_pos` 的复制。
+- `OperatorBase` 中添加一个 member `is_lod_inited{false}` 来保证传递只进行一次
+
+一些逻辑如下
+
+```c++
+class OperatorBase {
+public:
+ // ...
+  void InferShape() {
+    if (!is_lod_inited) {
+      bool do_mutate_lod_info = GetAttr<bool>("do_mutate_lod_info");
+      // find an input having LOD to copy
+      auto lod_input = ValidLODInput();
+      for (auto &output : outputs) {
+        if (do_mutate_lod_info) {
+          output.ShareMutableLODFrom(lod_input);
+        } else {
+          output.ShareConstLODFrom(lod_input);
+        }
+      }
+      is_lod_inited = true;
+ }
+
+ // call op's InferShape
+ // ...
+ }
+
+private:
+ // ...
+ bool is_lod_inited{false};
+};
+```
+
+如此,`lod_start_pos` 的信息的传递对非LOD的Op的实现是完全透明的。
+
+#### `lod_start_pos` 的更新
+上一小节介绍到,对于需要修改 `lod_start_pos` 的Op,`OperatorBase` 会分配一块自己的内存以存储修改,
+Op在 `Run` 的实现中,操作更新自己的 `lod_start_pos` ,
+而所有依赖其 outputs 的 op 会通过共享的指针自动获取到其更新。
+
+## 根据长度排序
+按照长度排序后,从前往后的时间步的batch size会自然地递减,可以直接塞入 Net 做batch计算
+
+比如原始的输入:
+
+```
+origin:
+xxxx
+xx
+xxx
+
+-> sorted:
+xxxx
+xxx
+xx
+```
+
+经过 `SegmentInputs` 之后,每个会有4个时间步,每个时间步的输入如下(纵向排列)
+
+```
+0 1 2 3
+x x x x
+x x x
+x x
+```
+
+为了追踪排序前后序列的变化,这里用
+```c++
+struct SortedSeqItem {
+ void *start{nullptr};
+ void *end{nullptr};
+};
+
+std::vector<SortedSeqItem> sorted_seqs;
+```
+来追踪序列排序后的位置,并添加一个新的接口
+
+```c++
+std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
+```
+
+由于输入序列的顺序变化,以下现有的接口需要针对性地修改:
+
+- InitMemories, memory需要根据 `sorted_seqs` 重新排列
+- SegmentInputs
+- ConcatOutputs
+
+此外,由于 `sorted_seqs` 需要被 `RecurrentGradientOp` 复用,因此会变成 `RecurrentOp` 一个新的output输出,
+之后作为 `RecurrentGradientOp` 的一个输入传入。
+
+## InitMemories
+由于序列顺序的变化,`boot_memories` 的batch上的element的顺序也需要对应重新排列。
+
+## SegmentInputs
+`SegmentInputs` 会依赖 `sorted_seqs` 的信息,将原始的序列按照排序后的序列顺序,从横向切割,转为每个step中的inputs。
+
+即下面的转变:
+```
+origin:
+xxxx
+xx
+xxx
+
+ |
+ |
+ \ /
+ !
+0 1 2 3
+x x x x
+x x x
+x x
+```
+## ConcatOutputs
+`ConcatOutputs` 需要
+
+- 将每个时间步的输出重新还原为原始输入的序列顺序(以防止Infer阶段顺序打乱)
+- 将每个序列concat 为规则的mini-batch表示
+
+## 参考文献
+1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
+2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
+3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
+4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/doc/fluid/design/dynamic_rnn/rnn_design_en.md b/doc/fluid/design/dynamic_rnn/rnn_design_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..9493908f4f73b3e7d91f5f6364a2a3660257d508
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn_design_en.md
@@ -0,0 +1,175 @@
+# Variable Length Supported RNN Design
+For the learning of variable length sequences, the existing mainstream frameworks such as tensorflow, pytorch, caffe2, mxnet and so on all use padding.
+
+Different-length sequences in a mini-batch will be padded with zeros and transformed to same length.
+
+The existing RNN implementation of PaddlePaddle is `RecurrentLayerGroup`,
+which supports variable-length sequences without padding.
+This doc designs Fluid's RNN based on this idea.
+
+## Multi-layer sequence data format `LODTensor`
+At present, Paddle stores the data of one mini-batch in a one-dimensional array.
+
+`Argument.sequenceStartPositions` is used to store the sequence information of each sentence.
+
+In Paddle, `Argument.subSequenceStartPositions` is used to store 2 levels of sequence information, while higher-dimensional sequences cannot be supported directly.
+
+In order to support the storage of `N-level` sequences, we define sequence information as the following data structure.
+
+
+```c++
+std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+```
+
+Or more clearly defined here
+
+```c++
+typedef std::vector<int> level_t;
+std::vector<level_t> lod_start_pos;
+```
+Each `level_t` here stores a level of offset information consistent with paddle's current practice.
+
+In order to transmit sequence information more transparently, we have introduced a new tensor called `LODTensor`[1].
+Its tensor-related interfaces all inherit directly from `Tensor`, but it also adds sequence-related interfaces.
+Thus, an ordinary `Op` can use a `LODTensor` directly as a `Tensor`,
+while an `Op` that operates on sequences additionally uses the variable-length sequence interfaces of `LODTensor`.
+
+The definition of `LODTensor` is as follows:
+
+
+```c++
+class LODTensor : public Tensor {
+public:
+  size_t Levels() const { return lod_start_pos_->size(); }
+  size_t Elements(int level = 0) const {
+    return (*lod_start_pos_)[level].size();
+ }
+ // slice of level[elem_begin: elem_end]
+ // NOTE low performance in slice seq_start_positions_.
+ // TODO should call Tensor's Slice.
+ LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
+
+ // slice with tensor's data shared with this.
+ LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
+
+ // copy other's lod_start_pos_, to share LOD info.
+  // NOTE the LOD info should not be changed.
+ void ShareConstLODFrom(const LODTensor &other) {
+ lod_start_pos_ = other.lod_start_pos_;
+ }
+ // copy other's lod_start_pos_'s content, free to mutate.
+ void ShareMutableLODFrom(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared<std::vector<std::vector<int>>>(
+        other.lod_start_pos_->begin(), other.lod_start_pos_->end());
+ }
+
+private:
+  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+};
+```
+Among them, `lod_start_pos_` uses `shared_ptr` to reduce the cost of storage and replication.
+`LODTensor` can be thought of as an extension of `Tensor`, which is almost completely compatible with the original `Tensor`.
+
+## How to support the framework
+### Replace `Tensor` with `LODTensor`
+To implement the passing of `LODTensor`, most `Tensor`s in the framework need to be replaced with `LODTensor`.
+A simple implementation is to directly **replace all previous `Tensor` with `LODTensor`**, which only requires modifying the `Tensor` creation interfaces in `pybind.cc`.
+
+In addition, the user may need to perceive the existence of sequences (for example, sequence visualization needs to parse the output sequences of the model), so some sequence operation APIs also need to be exposed to the python layer.
+
+### Transmit `lod_start_pos` along with the Op call chain
+The framework needs to support the following features to transmit `lod_start_pos` along the Op call chain:
+
+1. Implement the transfer as a `shared_ptr` (a sketch follows below)
+    - An Op that does not modify the contents of `lod_start_pos` acts as a consumer
+    - An Op that modifies `lod_start_pos` acts as a producer
+    - By convention, a consumer only needs to copy the `shared_ptr` passed over
+    - A producer needs to allocate its own independent memory to store its modifications, and exposes a new `shared_ptr` to subsequent consumers
+    - Since the transfer is implemented by copying a `shared_ptr`, the framework only needs to pass `lod_start_pos` once.
+
+2. The mechanism is transparent to Ops that are unaware of `lod_start_pos`
+3. A producer Op that needs to modify `lod_start_pos` can update its own `lod_start_pos` data in `Run`
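+
+A small Python sketch of the sharing convention above (ordinary lists standing in for the shared LOD object; not the C++ implementation):
+
+```python
+import copy
+
+lod_start_pos = [[0, 2, 5]]            # one level of offsets, as in this doc
+
+# A consumer op only keeps a reference (ShareConstLODFrom): no copy is made.
+consumer_lod = lod_start_pos
+
+# A producer op that mutates the LOD first makes its own copy
+# (ShareMutableLODFrom), so its modification is visible to downstream ops
+# only through the new shared object.
+producer_lod = copy.deepcopy(lod_start_pos)
+producer_lod[0] = [0, 1, 3, 5]
+
+print(consumer_lod is lod_start_pos)   # True  -- shared, passed only once
+print(producer_lod is lod_start_pos)   # False -- producer owns its own memory
+```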
+
+## Sorted by length
+After sorting by length, the batch size of successive time steps naturally decreases, so the sorted sequences can be fed directly into the Net for batched calculation.
+
+For example, the original input:
+
+```
+origin:
+xxxx
+xx
+xxx
+
+-> sorted:
+xxxx
+xxx
+xx
+```
+
+After `SegmentInputs`, there will be 4 time steps; the input of each time step is as follows (arranged vertically):
+
+```
+0 1 2 3
+x x x x
+x x x
+x x
+```
+
+In order to track the changes before and after sorting, we use
+
+```c++
+struct SortedSeqItem {
+ void *start{nullptr};
+ void *end{nullptr};
+};
+
+std::vector<SortedSeqItem> sorted_seqs;
+```
+to track the position of each sequence after sorting, and add a new interface:
+
+```c++
+std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
+```
+Because the order of the input sequences changes, the following existing interfaces need to be modified:
+
+- InitMemories: memory needs to be rearranged according to `sorted_seqs`
+- SegmentInputs
+- ConcatOutputs
+
+In addition, because `sorted_seqs` needs to be reused by `RecurrentGradientOp`, it becomes a new output of `RecurrentOp`
+and is then passed in as an input to `RecurrentGradientOp`.
+
+## InitMemories
+Due to the reordering of sequences, the order of the elements in the `boot_memories` batch also needs to be rearranged accordingly.
+
+## SegmentInputs
+
+`SegmentInputs` relies on the information in `sorted_seqs` to cut the original sequences horizontally, in the sorted order, into the inputs of each step.
+
+The transition is as follows:
+```
+origin:
+xxxx
+xx
+xxx
+
+ |
+ |
+ \ /
+ !
+0 1 2 3
+x x x x
+x x x
+x x
+```
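+
+A Python sketch of this cutting step (illustrative only; the real implementation works on `LODTensor` slices):
+
+```python
+def segment_inputs(sequences):
+    """Sort by length (descending) and cut vertically into per-step batches."""
+    sorted_seqs = sorted(sequences, key=len, reverse=True)
+    steps = []
+    for t in range(len(sorted_seqs[0])):
+        # only the sequences that are still "alive" at step t contribute
+        steps.append([seq[t] for seq in sorted_seqs if t < len(seq)])
+    return sorted_seqs, steps
+
+sorted_seqs, steps = segment_inputs(["xxxx", "xx", "xxx"])
+for t, batch in enumerate(steps):
+    print(t, batch)   # the batch size decreases: 3, 3, 2, 1
+```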
+## ConcatOutputs
+`ConcatOutputs` needs to
+
+- restore the output of each time step back to the original input sequence order (so that the order is not scrambled in the Infer phase)
+- concat each sequence into a regular mini-batch representation
+
+## References
+1. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/doc/fluid/design/execution/if_else_op.md b/doc/fluid/design/execution/if_else_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..26d140f06db4ecefa86be015eaa731ffddc6910c
--- /dev/null
+++ b/doc/fluid/design/execution/if_else_op.md
@@ -0,0 +1,51 @@
+# The `IfElse` Operator
+
+PaddlePaddle's `IfElse` operator differs from TensorFlow's:
+
+- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas
+- the PaddlePaddle version takes a vector of boolean values as the condition; instances corresponding to true values go to the true branch, and those corresponding to false values go to the false branch.
+
+## Example
+
+The following PaddlePaddle program shows the usage of the IfElse operator:
+
+```python
+import paddle as pd
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():
+ d = pd.layer.add(x, y)
+ ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
+ d = pd.layer.fc(z)
+ ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+A challenge in implementing the `IfElse` operator is to infer which variables are to be split, that is, to identify the mini-batch variable and those derived from the mini-batch.
+
+An equivalent C++ program is as follows:
+
+```c++
+namespace pd = paddle;
+
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+ int d = x + y;
+  o1 = d;
+  o2 = pd::layer::softmax(d);
+} else {
+ int d = pd::layer::fc(z);
+ o1 = d;
+ o2 = d+1;
+}
+```
diff --git a/doc/fluid/design/execution/index_cn.rst b/doc/fluid/design/execution/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed31b017429d168b2466d8f6b423f48bd5d78d1f
--- /dev/null
+++ b/doc/fluid/design/execution/index_cn.rst
@@ -0,0 +1,8 @@
+执行流程
+-------------
+
+.. toctree::
+ :maxdepth: 1
+
+ switch.md
+ if_else_op.md
diff --git a/doc/fluid/design/execution/index_en.rst b/doc/fluid/design/execution/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fcf846da348ff0bed707c42718e08314998fbac0
--- /dev/null
+++ b/doc/fluid/design/execution/index_en.rst
@@ -0,0 +1,8 @@
+Execution Process
+--------------------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ switch.md
+ if_else_op.md
diff --git a/doc/fluid/design/execution/switch.md b/doc/fluid/design/execution/switch.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c337bd7159b25e594c2f91f9a143b3f4bc3c8e8
--- /dev/null
+++ b/doc/fluid/design/execution/switch.md
@@ -0,0 +1,31 @@
+# Design Doc: Switch
+
+## Background
+
+Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
+
+The following example shows the usage of `fluid.switch`.
+
+```python
+a = fluid.Var(10)
+b = fluid.Var(0)
+
+with fluid.switch() as switch:
+ with switch.case(fluid.less_equal(a, 10)):
+ fluid.print("Case 1")
+ with switch.case(fluid.larger(a, 0)):
+ fluid.print("Case 2")
+ with switch.default():
+ fluid.print("Case 3")
+```
+
+## The Semantics
+
+1. A `switch` control-flow checks cases one-by-one.
+1. The condition of each case is a boolean value, which is a scalar; this differs from the `fluid.if_else` control-flow, whose condition could be a vector of boolean values.
+1. It runs the first matched case, or the default case if there is one.
+1. Once it matches a case, it runs the corresponding branch and only that branch. It is as if there were a C-style `break` at the end of each case.
+
+The above program should print "Case 1" and only "Case 1".
+
+The implementation of the backward pass of the `switch` control-flow is easier than that of `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branch.
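+
+For intuition only, the example above behaves like the following plain Python `if/elif/else` chain over scalar conditions (this is not Fluid code):
+
+```python
+a = 10
+
+if a <= 10:      # switch.case(fluid.less_equal(a, 10))
+    print("Case 1")
+elif a > 0:      # switch.case(fluid.larger(a, 0)) -- never reached once Case 1 matches
+    print("Case 2")
+else:            # switch.default()
+    print("Case 3")
+```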
diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..31b62a5eb3cd9b5b68d51abcd001fd5b8c39a914
--- /dev/null
+++ b/doc/fluid/design/index_cn.rst
@@ -0,0 +1,19 @@
+设计思想
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ motivation/index_cn.rst
+ execution/index_cn.rst
+ concepts/index_cn.rst
+ data_type/index_cn.rst
+ memory/index_cn.rst
+ multi_devices/index_cn.rst
+ dynamic_rnn/index_cn.rst
+ concurrent/index_cn.rst
+ algorithm/index_cn.rst
+ network/index_cn.rst
+ modules/index_cn.rst
+ interface/index_cn.rst
+ dist_train/index_cn.rst
diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2bfee02ad4626633b08ddff747e2886faf9ba99f
--- /dev/null
+++ b/doc/fluid/design/index_en.rst
@@ -0,0 +1,19 @@
+Design
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ motivation/index_en.rst
+ execution/index_en.rst
+ concepts/index_en.rst
+ data_type/index_en.rst
+ memory/index_en.rst
+ multi_devices/index_en.rst
+ dynamic_rnn/index_en.rst
+ concurrent/index_en.rst
+ algorithm/index_en.rst
+ network/index_en.rst
+ modules/index_en.rst
+ interface/index_en.rst
+ dist_train/index_en.rst
diff --git a/doc/fluid/design/interface/index_cn.rst b/doc/fluid/design/interface/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..69a8d9bad4fe88935b9fa87757abf0105ca8eb75
--- /dev/null
+++ b/doc/fluid/design/interface/index_cn.rst
@@ -0,0 +1,4 @@
+多语言接口
+------------
+
+TBD
diff --git a/doc/fluid/design/interface/index_en.rst b/doc/fluid/design/interface/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..22abc71f984aa5da7151d5ebf0c3bdbcc69a3624
--- /dev/null
+++ b/doc/fluid/design/interface/index_en.rst
@@ -0,0 +1,4 @@
+Multi-Language Interface
+--------------------------
+
+TBD
diff --git a/doc/fluid/design/ir/overview.md b/doc/fluid/design/ir/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..83ef97c99efeaf27a27f93f0cd3857c0f1bc812e
--- /dev/null
+++ b/doc/fluid/design/ir/overview.md
@@ -0,0 +1,185 @@
+## Motivation
+
+There is a `gap` between the `Program` defined by
+the user and the `Executable` that can be scheduled
+efficiently on heterogeneous hardware, either locally
+or distributedly.
+
+Usually, the `gap` is bridged by
+
+* a series of transformations with a defined order;
+
+* these transformations usually involve
+`insert, delete, clustering, split, dependency analysis`;
+
+* a simple way to verify and debug each transformation;
+
+* the flexibility to add, remove or customize transformations to fit
+the requirements of various algorithms (models) and hardware scenarios.
+
+Some other events also push us to a better unified pattern.
+
+* The deep learning framework is built around the concepts of graphs.
+To leverage tools such as compilation (e.g. TVM and nGraph) or
+cross-framework conversion (e.g. ONNX), we also need an intermediate
+representation that can be connected to the rest of the ecosystem.
+
+
+We need a unified pattern to naturally support the requirements
+described above. The pattern should fit training, inference
+and other offline serialized model transformations.
+Learning from LLVM and other deep learning frameworks, we draft the
+design below.
+
+
+## Design
+
+### Major Concepts
+
+#### Node
+
+`Node` represents an operation that performs some computation or
+a variable that is an input or output of an operation.
+
+`Node`s are connected to other `Node`s via inputs and outputs.
+
+Other properties (maybe device placement information) can be added
+to `Node` in the future if it's a
+common requirement of many other `Pass`es. Otherwise, it should live
+in a `Node` wrapper class that is private to some `Pass` or be
+a local member of a `Pass`.
+
+#### Graph
+
+`Graph` contains a list of `Node`s, which are connected to
+each other via inputs and outputs.
+
+TODO: Better definitions for the graph.
+
+`Graph` can also contain `Attribute`s. `Attribute`s
+can be `any` thing. For example, it can be a list of "wrapper"
+nodes. The `wrapper` nodes compose `Node`s and provide
+helper methods for execution or transformation. `Attribute`
+can also contain other things that describe some properties of
+the `Graph` or `Graph` nodes. `Attribute` can be passed
+across `Pass`es. However, it should be used with care.
+
+```cpp
+class Graph {
+ public:
+ explicit Graph(const ProgramDesc &program);
+
+ bool Has(const std::string &attr_name) const;
+
+  template <typename AttrType>
+ AttrType &Get(const std::string &attr_name) const;
+
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr);
+  const std::unordered_set<ir::Node *> &Nodes() const;
+
+ // Create a normal variable with non-null VarDesc.
+ ir::Node *CreateVarNode(VarDesc *var_desc);
+
+ // Create a normal runnable operator with OpDesc.
+ ir::Node *CreateOpNode(OpDesc *op_desc);
+
+ // Create a control dependency var that connects 2 operations. The
+ // var doesn't hold any data. Other than that, it's no different from
+ // other var, considering dependency analysis.
+ ir::Node *CreateControlDepVar();
+
+ // A more free style way of creating a graph node. Mostly use for test
+ // or "copy" from another node. Avoid using it if possible.
+ ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type);
+
+ // Clear all node information of the graph and return the ownership of the
+ // nodes.
+  std::vector<std::unique_ptr<ir::Node>> ReleaseNodes();
+};
+```
+
+#### Pass
+
+`Pass` represents a transformation of `Graph`. Its input
+is a `Graph` and its output is also a `Graph`. For example,
+a `Pass` can simply print out the `Graph`. A `Pass`
+can also fuse some `Graph`'s `Node`s.
+
+```cpp
+class Pass {
+ public:
+
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const {
+ // Some correctness check.
+ auto new_graph = ApplyImpl(std::move(graph));
+ // Some correctness check.
+ return new_graph;
+ }
+
+  // Get a reference to the attribute previously set.
+  template <typename AttrType>
+ AttrType &Get(const std::string &attr_name) const;
+
+ // Set a pointer to the attribute. Pass takes ownership of the attribute.
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr);
+
+ // Set a pointer to the attribute. Pass doesn't take ownership. Caller
+ // should delete the attribute.
+  template <typename AttrType>
+ void SetNotOwned(const std::string &attr_name, AttrType *attr);
+
+ protected:
+  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const = 0;
+};
+
+// In my_pass.cc
+class MyPass : public Pass {
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override {
+ // do something.
+ return graph;
+ }
+};
+REGISTER_PASS(my_pass, MyPass)
+.RequirePassAttr("places")
+.RequireGraphAttr("dep_vars");
+
+
+// To use the pass.
+auto my_pass = ir::PassRegistry::Instance().Get("my_pass");
+graph = my_pass->Apply(std::move(graph));
+// Note: to force link my_pass.cc, in the code:
+USE_PASS(my_pass);
+```
+
+#### Optimize
+
+`Optimize` contains a series of `Pass`es with a defined order.
+`Optimize` transforms a `Graph` that only contains raw
+modeling logic to a `Graph` that can be run efficiently while
+maintaining the original modeling logic.
+
+
+### Optimize Process
+
+* Program is first converted to Graph.
+* Graph goes through a series of Passes.
+* Graph is transformed from raw model logic to a
+form that is efficient to execute.
+
+```
+// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
+auto graph = Graph(program);
+graph = PassRegistry::Instance().Get("op_fuse_pass").Apply(std::move(grah));
+// For more complex Pass, Optimize Process can provide Pass attributes.
+auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass");
+mem_opt_pass.SetNotOwned("optimize_level", 1);
+mem_opt_pass->Apply(std::move(graph));
+graph = PassRegistry::Instance().Get("multi_devices_pass").Apply(std::move(grah));
+graph = PassRegistry::Instance().Get("multi_devices_check_pass").Apply(std::move(grah));
+Executor exe;
+exe.Run(graph);
+
+```
diff --git a/doc/fluid/design/memory/README.md b/doc/fluid/design/memory/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7cf61d089b39041b7a15184e0ea9211d14a66f5e
--- /dev/null
+++ b/doc/fluid/design/memory/README.md
@@ -0,0 +1,141 @@
+# Region-based Heterogeneous Memory Management
+## Design
+
+### Usage
+
+To allocate 4KB CPU memory:
+
+```cpp
+p = memory::Alloc(platform::CPUPlace(), 4*1024);
+```
+
+To allocate 4KB memory on the 3rd GPU:
+
+```cpp
+p = memory::Alloc(platform::CUDAPlace(2), 4*1024);
+```
+
+To free memory and check the so-far used amount of memory on a place:
+
+```cpp
+auto pl = platform::CUDAPlace(0);
+p = memory::Alloc(pl, 4*1024);
+cout << memory::Used(pl);
+memory::Free(pl, p);
+```
+
+### API
+
+In `paddle/memory/memory.h` we have:
+
+```cpp
+namespace memory {
+template <typename Place> void* Alloc(Place, size_t);
+template <typename Place> void Free(Place, void*);
+template <typename Place> size_t Used(Place);
+} // namespace memory
+```
+
+These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`:
+
+```cpp
+template<>
+void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
+ return GetCPUBuddyAllocator()->Alloc(size);
+}
+```
+
+and
+
+```cpp
+template<>
+void* Alloc<CUDAPlace>(CUDAPlace p, size_t size) {
+ return GetGPUBuddyAllocator(p.id)->Alloc(size);
+}
+```
+
+Similar specializations exist for `Free` and `Used`.
+
+### Implementation
+
+`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletons.
+
+```cpp
+BuddyAllocator* GetCPUBuddyAllocator() {
+ static BuddyAllocator* a = NULL;
+ if (a == NULL) {
+ a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
+ }
+ return a;
+}
+
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+ static BuddyAllocator* as = NULL;
+ if (as == NULL) {
+ as = new BuddyAllocator*[platform::NumGPUs()];
+ for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
+ as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
+ }
+ }
+ return as[gpu_id];
+}
+```
+
+#### `BuddyAllocator`
+
+`BuddyAllocator` implements the buddy allocation algorithm. Its constructor takes parameters only related to the algorithm:
+
+```cpp
+BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
+ ...
+}
+```
+
+Please be aware that **`BuddyAllocator` always allocates aligned memory**, aligned to 32 bytes, which can hold a `BuddyAllocator::Block` object:
+
+```cpp
+class BuddyAllocator {
+ private:
+ struct Block {
+ size_t size;
+ Block *left, *right;
+ size_t index; // allocator id
+ };
+ ...
+};
+```
+
+Because `BuddyAllocator` has the meta-data of each block, it can trace the used memory -- it records the amount returned by `Alloc` and freed by `Free`. In contrast, `CPUAllocator` and `GPUAllocator` do not know the size of a freed memory block and cannot do the trace.
+
+#### System Allocators
+
+The `GPUAllocator` and `CPUAllocator` are called *system allocators*. They work as the fallback allocators of `BuddyAllocator`.
+
+## Justification
+
+I got inspiration from Majel and Caffe2, though the above design looks different from both.
+
+### Caffe2
+
+In Caffe2, `Tensor::mutable_data()` allocates the memory. In particular, [`Tensor::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).
+
+There are two implementations of `Context`:
+
+1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
+
+1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
+
+### Majel
+
+In Majel, there are basically two allocator types:
+
+1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
+1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.
+
+However, memory allocation is not via these two allocators. Instead, these two allocators are defined in hidden namespaces.
+
+In Majel there are hidden global variables like:
+
+1. `cpu::SystemAllocator g_cpu_allocator`, and
+1. `vector<gpu::SystemAllocator> g_gpu_allocators(NUM_GPUS)`.
+
+Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.
diff --git a/doc/fluid/design/memory/images/control_flow_graph.png b/doc/fluid/design/memory/images/control_flow_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3579998e58d07abc50bd3332128d4733a391cb3b
Binary files /dev/null and b/doc/fluid/design/memory/images/control_flow_graph.png differ
diff --git a/doc/fluid/design/memory/images/dataflow_equations.png b/doc/fluid/design/memory/images/dataflow_equations.png
new file mode 100644
index 0000000000000000000000000000000000000000..c10f7f69f4007952e5b0394edaa04efa1cfbb658
Binary files /dev/null and b/doc/fluid/design/memory/images/dataflow_equations.png differ
diff --git a/doc/fluid/design/memory/images/deep_learning.png b/doc/fluid/design/memory/images/deep_learning.png
new file mode 100644
index 0000000000000000000000000000000000000000..026becc4d94e01e407dacb2a5314a0e5723334ff
Binary files /dev/null and b/doc/fluid/design/memory/images/deep_learning.png differ
diff --git a/doc/fluid/design/memory/index_cn.rst b/doc/fluid/design/memory/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c507c638bd1a6eb428175ed2756a6ecfc6cca198
--- /dev/null
+++ b/doc/fluid/design/memory/index_cn.rst
@@ -0,0 +1,7 @@
+内存管理
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ memory_optimization.md
diff --git a/doc/fluid/design/memory/index_en.rst b/doc/fluid/design/memory/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f7526437a73a09b300f05e138084755f5528b242
--- /dev/null
+++ b/doc/fluid/design/memory/index_en.rst
@@ -0,0 +1,7 @@
+Memory Management
+-------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ memory_optimization.md
diff --git a/doc/fluid/design/memory/memory_optimization.md b/doc/fluid/design/memory/memory_optimization.md
new file mode 100644
index 0000000000000000000000000000000000000000..285464ada728d8f7a086a26beca6cfa4418e98e4
--- /dev/null
+++ b/doc/fluid/design/memory/memory_optimization.md
@@ -0,0 +1,217 @@
+# Memory Optimization
+
+
+## Problem
+
+In a lecture, Andrew Ng attributes the recent success of AI to a combination of the following:
+
+- Availability of Big Data
+- Supercomputing power to process this Big Data over very large neural networks
+- Modern algorithms
+
+The following graph shows the details:
+
+
+
+Larger models usually bring better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary for online and mobile inference.
+
+## Solution
+
+### Basic Strategy
+
+There are some basic strategies to improve memory usage, including in-place operations and memory sharing.
+
+#### In-place Operation
+In a relu activation operator:
+
+$y = \max(x, 0)$
+
+If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x will be the same. In-place operations will save 50% memory occupancy immediately.
+
+#### Memory Sharing
+
+Not all operators support in-place operations. Memory sharing is a more general strategy.
+
+Following is an example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a into a memory pool. Then, variable e can reuse the memory of variable a from the pool.
+
+
+### Live Variable Analysis
+
+Basic strategies alone are not enough. The prerequisite of memory optimization is to know whether a variable is still "live" after an operation.
+
+In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation.
+
+In compilers, the front end of the compiler translates programs into an intermediate language with an unbounded number of temporary variables. This program must run on a machine with a bounded number of registers. Two temporary variables a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporary variables can fit in few registers; if they don't all fit, the excess temporary variables can be kept in memory.
+
+Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporary variables are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis.
+
+We can learn these techniques from compilers. Live variable analysis mainly consists of two stages:
+
+- construct a control flow graph
+- solve the dataflow equations
+
+
+#### Control Flow Graph
+To perform analysis on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statement x can be followed by statement y, there is an edge from x to y.
+
+Following is the flow graph for a simple loop.
+
+
+
+#### Dataflow Analysis
+
+Liveness of variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
+
+A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
+
+- Flow Graph Terminology
+
+A flow graph node has out-edges that lead to successor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of successors.
+In the control flow graph above, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}.
+
+- Uses and Defs
+
+An assignment to a variable or temporary defines that variable. An occurrence of a variable on the right-hand side of an assignment (or in other expressions) uses the variable. We can define the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and similarly for the *use* of a variable or graph node. In the control flow graph above, *def(3)* = {c}, *use(3)* = {b, c}.
+
+- Liveness
+
+A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node.
+
+
+The calculation of liveness can be solved by iteration until a fixed point is reached. The following is the recursive formula:
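+
+In the standard notation used by compiler textbooks (for example Appel's book, cited in the references below), with *use[n]*, *def[n]* and *succ[n]* defined as above, the equations are:
+
+$in[n] = use[n] \cup (out[n] - def[n])$
+
+$out[n] = \bigcup_{s \in succ[n]} in[s]$
+
+Iteration starts from empty sets and reapplies these two equations at every node until no set changes.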
+
+
+
+### Memory optimization transpiler
+
+Finally, we take the basic strategies and the liveness analysis techniques learned from compilers to implement our memory optimization transpiler.
+
+#### Add In-place Attribute
+
+In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator.
+
+
+#### Construct Control Flow Graph
+
+Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py) example.
+
+- Block0:
+
+```
+lookup_table
+mul
+...
+while(sub-block idx 1)
+...
+array_to_lod_tensor
+cross_entropy
+...
+while_grad(sub-block idx 2)
+read_from_array
+array_to_lod_tensor
+...
+```
+
+- Block1
+
+```
+read_from_array
+read_from_array
+...
+write_to_array
+increment
+write_to_array
+less_than
+```
+
+- Block2
+
+```
+read_from_array
+increment
+...
+write_to_array
+write_to_array
+```
+
+We can traverse all the operators and variables in the ProgramDesc to build a control flow graph.
+
+```python
+class ControlFlowGraph(object):
+ def __init__(self, Program):
+ self._successors = defaultdict(set)
+ self._predecessors = defaultdict(set)
+ self._uses = defaultdict(set)
+ self._defs = defaultdict(set)
+ self._live_in = defaultdict(set)
+ self._live_out = defaultdict(set)
+ self._program = Program
+
+ def build(self):
+ pass
+
+ def dataflow_analysis(self):
+ pass
+
+ def memory_optimization(self):
+ pass
+
+ def get_program(self):
+ return self._program
+```
+
+#### Make Dataflow Analysis
+
+We follow the approach used in compilers and solve the dataflow equations to get the liveness of every variable. If the live-in of an operator node is different from its live-out, then we can apply memory sharing.
+
+For example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+The dataflow analysis result is:
+
+```
+live_in(op1) = {b, c, f}
+live_out(op1) = {a, f}
+
+live_in(op2) = {a, f}
+live_out(op2) = {d, f}
+
+live_in(op3) = {d, f}
+live_out(op3) = {}
+```
+
+After op1, the memory of variable b and variable c can be released to the pool; after op2, the memory of variable a can be released; after op3, the memory of variable d and variable f can be released.
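+
+A minimal Python sketch of the iterative solver (all names here are illustrative and independent of the `ControlFlowGraph` skeleton above); it reproduces the live-in/live-out sets listed for this example:
+
+```python
+from collections import defaultdict
+
+def solve_liveness(op_ids, uses, defs, successors):
+    """Iterate the liveness dataflow equations until a fixed point is reached."""
+    live_in, live_out = defaultdict(set), defaultdict(set)
+    changed = True
+    while changed:
+        changed = False
+        for n in reversed(op_ids):  # reverse order speeds up convergence
+            new_out = set().union(*(live_in[s] for s in successors.get(n, [])))
+            new_in = uses.get(n, set()) | (new_out - defs.get(n, set()))
+            if new_in != live_in[n] or new_out != live_out[n]:
+                live_in[n], live_out[n] = new_in, new_out
+                changed = True
+    return live_in, live_out
+
+# The example above: a = op1(b, c); d = op2(a); e = op3(d, f)
+uses = {"op1": {"b", "c"}, "op2": {"a"}, "op3": {"d", "f"}}
+defs = {"op1": {"a"}, "op2": {"d"}, "op3": {"e"}}
+succ = {"op1": ["op2"], "op2": ["op3"]}
+live_in, live_out = solve_liveness(["op1", "op2", "op3"], uses, defs, succ)
+# live_in["op1"] == {"b", "c", "f"} and live_out["op1"] == {"a", "f"}, as shown above.
+```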
+
+#### Memory Sharing Policy
+
+A memory pool is maintained during the memory optimization stage. Each operator node is scanned to determine whether memory optimization can be applied. If an operator satisfies the requirement, the following policy is taken to handle its input/output variables.
+
+```
+if op.support_inplace():
+ i --> pool
+ pool --> o
+else:
+ pool --> o
+ i --> pool
+```
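+
+A self-contained toy rendering of the policy above. Memory blocks are plain integers, `binding` maps variable names to blocks, and `pool` is the free list; every name here is illustrative, not the transpiler's actual API.
+
+```python
+def share_memory(support_inplace, in_var, out_var, binding, pool, alloc):
+    """Give out_var a memory block, following the in-place / sharing policy above."""
+    if support_inplace:
+        pool.append(binding[in_var])                        # i --> pool
+        binding[out_var] = pool.pop()                       # pool --> o: o reuses i's block
+    else:
+        binding[out_var] = pool.pop() if pool else alloc()  # pool --> o first
+        pool.append(binding[in_var])                        # i --> pool afterwards
+
+# op2 in "d = op2(a)" does not support in-place: d gets a pooled (or fresh) block,
+# and a's block is released afterwards for later operators such as op3.
+fresh_blocks = iter(range(1, 100))
+binding, pool = {"a": 0}, []
+share_memory(False, "a", "d", binding, pool, alloc=lambda: next(fresh_blocks))
+# binding == {"a": 0, "d": 1} and pool == [0], so op3's output may reuse block 0.
+```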
+
+
+
+## Reference
+
+- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5)
+- Modern compiler implementation in ML, by Andrew W. Appel
+- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html)
diff --git a/doc/fluid/design/modules/backward.md b/doc/fluid/design/modules/backward.md
new file mode 100644
index 0000000000000000000000000000000000000000..20fda7a98f514a3f1c1c2d0ba7447ec954b21d5a
--- /dev/null
+++ b/doc/fluid/design/modules/backward.md
@@ -0,0 +1,158 @@
+# Backward Building
+
+## Motivation
+
+In neural networks, most models are currently solved by the backpropagation algorithm (known as **BP**). Technically, BP calculates the gradient of the loss function, then propagates it back through the network following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So a mechanism is required by the framework which can complete the model's backward part automatically according to the given forward part.
+
+When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes the gradients of its corresponding `op`'s outputs and calculates the gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op` and strings them together in the reverse order of the forward part. In this way, gradients spread from the end to the beginning of the model, in other words, from the loss to the parameters.
+
+## Challenges
+
+The motivation of backward building is apparent. However, implementing it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested. It means that the `op`s and `variable`s are scattered across different blocks rather than all being gathered in a single graph. Our backward building algorithm shall visit blocks in recursive order and be able to insert `grad_op`s and newly created `variable`s into the right places.
+
+## Usage
+
+Although the whole algorithm is comprised of many functions, only one is exposed as API:
+
+```python
+def append_backward(loss, parameter_list=None, no_grad_set=None):
+ """
+ Append backward part to main_program
+
+ Args:
+ loss(Variable): The variable generated by the cost function.
+ parameter_list(list): Parameters that need to be updated by optimizers.
+ If None, it means all parameters need to be updated.
+
+ no_grad_set(set): Variables that have no gradients in Block 0.
+ If None, the set will be generated inside the function and
+ contains all variables with `stop_gradient=True` from all blocks.
+
+ Return:
+ (list[Variable]): list of (parameters, gradients) pair.
+ """
+```
+
+By invoking this API, the framework appends the backward part of the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient is generated and where backpropagation starts. `parameter_list` marks all parameters that need updating. If it is `None`, all parameters will be updated by optimizers. `no_grad_set` marks variables without gradients. If all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run.
+
+This API will be invoked automatically before optimizer building.
+As a result, in most cases, users do not need to invoke the API by themselves to append the backward part.
+
+## Implementation
+
+The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables.
+
+### Creating `grad_op`s
+
+The creation of `grad_op`s is implemented by:
+
+```python
+def _append_backward_ops_(target,
+ block,
+ target_block,
+ no_grad_dict,
+ grad_to_var):
+ """
+ Create all grad ops, and insert them into given block
+
+ Args:
+ target(Variable): the target variable of forward pass
+ block(Block): the block where forward ops are
+ target_block(Block): the block which is going to hold new generated grad ops
+ no_grad_dict(dict):
+ key(int) block index
+ val(set) a set of variable names. These variables have no gradients
+ grad_to_var(dict)(output argument):
+ key(str): grad variable name
+ val(str): corresponding forward variable name
+ """
+```
+
+Given a `block`, the function traverses all `op`s in this block in reverse order, gets the corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, and then appends it to `target_block`.
+
+However, some specific `op`s (e.g. `while_op`, `if_else_op`) can hold their own sub-blocks. Since these sub-blocks contain `op`s as well, `grad_op` creation should be recursive.
+
+During the reverse traversal, we check each `op` whether it has an attribute named `sub_block`. If so, it means there is a sub-block and we need to deal with it first. After creating a new block whose father is the one in `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to parameter `target_block` and the one in `op`'s attribute to `block`. The *pseudo-code* shows this process:
+
+```
+******* pseudo-code ********
+for op in reversed(block.ops):
+ if op has an attribute named 'sub_block':
+ Get the sub-block(`s_block`) from op's attribute.
+ Create a new block(`grad_s_block`), whose father is `s_block`.
+ Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block`
+
+ Invoke `core.get_grad_op_desc()` to get op's grad_op.
+ Insert the name correspondences between variables and their gradients of the grad_op into grad_to_var
+ Assign grad_s_block to grad_op as its 'sub_block' attribute.
+ Append grad_op to current target_block.
+```
+
+The first invocation of `_append_backward_ops_()` is initiated by `append_backward()`, in which the parameters `block` and `target_block` are both assigned the root block (the block with index 0).
+
+### Corner Cases of `grad_op` Creating
+
+In the previous section, we showed the regular process of `grad_op` creation. However, in some corner cases, the conventional algorithm is not enough to get the correct result, and additional handling is required. These additional processes run after the algorithm mentioned above and make some special adjustments to its output `grad_op`s.
+
+#### Shared Variables
+
+If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op` in the backward pass. To make the gradient result be the sum of all `grad_op`s' outputs instead of only the last one that runs, we assign each output a temporary variable and then add a `sum_op` to add them up.
+
+For debugging convenience, if the final gradient name is `w@GRAD`, its corresponding temporary variables will be named `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`, and so on.
+
+See function `_addup_repetitive_outputs_` in `backward.py` for implementation details.
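+
+A toy sketch of the renaming-and-sum idea, with ops modeled as plain dicts; all names are illustrative, and the real logic lives in `_addup_repetitive_outputs_`:
+
+```python
+def addup_repetitive_outputs(grad_name, grad_ops):
+    """Rename each duplicated gradient output and append a sum_op that merges them."""
+    renamed = []
+    for idx, op in enumerate(grad_ops):
+        tmp = "%s@RENAME@%d" % (grad_name, idx)   # w@GRAD -> w@GRAD@RENAME@0, @1, ...
+        op["outputs"] = [tmp if name == grad_name else name for name in op["outputs"]]
+        renamed.append(tmp)
+    sum_op = {"type": "sum", "inputs": renamed, "outputs": [grad_name]}
+    return grad_ops + [sum_op]
+
+grad_ops = [{"type": "mul_grad", "outputs": ["w@GRAD"]},
+            {"type": "fc_grad", "outputs": ["w@GRAD"]}]
+print(addup_repetitive_outputs("w@GRAD", grad_ops)[-1])
+# {'type': 'sum', 'inputs': ['w@GRAD@RENAME@0', 'w@GRAD@RENAME@1'], 'outputs': ['w@GRAD']}
+```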
+
+#### No Gradient Variables
+
+In our framework, variables can be marked as *no_gradient*, which means that the gradient of the variable is unnecessary and can be considered zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in the backward pass.
+
+Another situation is that all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered zeros. Since `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all the gradient inputs are zeros. Therefore the `grad_op` can also be skipped.
+
+It should be noted that all these zero gradients still need to be created and initialized by something, otherwise the following `grad_op`s that take these gradients as inputs risk using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros.
+
+These features are implemented in the function `_remove_no_grad_branch_`. It checks newly created `grad_op`s one by one, removes those that can be skipped, and inserts `fill_zeros_like_op` where necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute (True or False).
+
+### Creating Backward Variables
+
+Up to now, we have completed all the creation and adjustment of `grad_op`s. However, the backward variables have not been created yet; they are only represented by the `grad_op`s' input and output arguments. The backward variable creation job will be done by:
+
+```python
+def _append_backward_vars_(block,
+ start_op_idx,
+ grad_to_var,
+ grad_info_map):
+ """
+ Create new variables required by backward pass.
+
+ Args:
+ block(Block): the block where new variables will be created
+ start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
+ grad_to_var(dict):
+ key(str): grad variable name
+ val(str): corresponding forward variable name
+ In most cases, this dict is generated by _append_backward_ops_()
+ grad_info_map(dict)(output argument):
+ key(str): forward variable name
+ val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
+ """
+```
+
+Given a `block`, this function traverses all the `grad_op`s in it (the argument `start_op_idx` indicates where the `grad_op` sequence starts) and creates all the uncreated outputs. The *pseudo-code* shows this process:
+
+```
+for op in block.ops[start_op_idx : ]:
+
+ if op has an attribute named 'sub_block':
+ Get the sub-block(`s_block`) from op's attribute.
+ Invoke _append_backward_vars_(), with `block=s_block`
+
+ for var_name in op.all_output_names():
+ if block.has_var_recursive(var_name) or var_name is the name of empty variable:
+ continue
+ create a new variable named 'var_name' in block
+ if grad_to_var.has_key(var_name):
+ set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name, block)
+
+ do op's var type inference
+ do op's shape inference
+```
diff --git a/doc/fluid/design/modules/batch_norm_op.md b/doc/fluid/design/modules/batch_norm_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..e451ffcc73b5de2b911e1c6de54b42a5d1d54c37
--- /dev/null
+++ b/doc/fluid/design/modules/batch_norm_op.md
@@ -0,0 +1,134 @@
+# Batch Normalization
+
+## What is batch normalization
+
+Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and makes the data distribution easier for the next layer's training.
+
+The principle of batch normalization can be summarized into a simple function:
+
+```
+y = (x - E[x]) / STD[x] * scale + bias
+```
+
+`x` is a batch of output data of a certain layer. `E[x]` and `STD[x]` are the mean and standard deviation of `x`, respectively. `scale` and `bias` are two trainable parameters. The training of a batch normalization layer amounts to learning the best values of `scale` and `bias`.
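+
+As a plain NumPy sketch of this formula (a small `epsilon` is added to the denominator for numerical stability, matching the `epsilon` attribute listed later; this is an illustration, not the operator's kernel):
+
+```python
+import numpy as np
+
+def batch_norm_forward(x, scale, bias, epsilon=1e-6):
+    # x: (batch_size, num_channels); statistics are taken over the batch dimension
+    mean = x.mean(axis=0)
+    std = x.std(axis=0)
+    return (x - mean) / (std + epsilon) * scale + bias
+
+x = np.random.randn(8, 4) * 3.0 + 5.0
+y = batch_norm_forward(x, scale=np.ones(4), bias=np.zeros(4))
+# y now has (approximately) zero mean and unit standard deviation per channel.
+```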
+
+In our design, we use a single operator(`batch_norm_op`) to implement the whole batch normalization in C++, and wrap it as a layer in Python.
+
+## Differences with normal operators
+
+`batch_norm_op` is a single operator. However, there are a few differences between `BatchNormOp` and normal operators, which we shall take into consideration in our design.
+
+1. `batch_norm_op` shall behave differently in training and inference. For example, during inference, there is no batch data and it's impossible to compute `E[x]` and `STD[x]`, so we have to use an `estimated_mean` and an `estimated_variance` instead. This requires our framework to be able to inform operators of the current running type (training/inference), so that operators can switch their behaviors.
+
+2. `batch_norm_op` shall have the ability to maintain `estimated_mean` and `estimated_variance` across mini-batches. In each mini-batch, `estimated_mean` is updated by the following equations:
+
+```
+if batch_id == 0
+ estimated_mean = E[x]
+else
+ estimated_mean = estimated_mean * momentum + (1.0 - momentum) * E[x]
+```
+
+The update of `estimated_variance` is similar. `momentum` is an attribute that controls the updating speed of `estimated_mean`.
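+
+The same update, written as a small Python helper that could be applied batch by batch (illustrative only):
+
+```python
+def update_estimated_mean(estimated_mean, batch_mean, momentum, batch_id):
+    """Moving-average update of the estimated mean, as in the pseudo-code above."""
+    if batch_id == 0:
+        return batch_mean
+    return estimated_mean * momentum + (1.0 - momentum) * batch_mean
+```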
+
+## Implementation
+
+Batch normalization is designed as a single operator in C++, and then wrapped as a layer in Python.
+
+### C++
+
+As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attributes and compute kernels.
+
+#### Inputs
+
+- `x`: The input data, which is generated by the previous layer.
+- `estimated_mean`: The estimated mean of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `E[x]`.
+- `estimated_var`: The estimated standard deviation of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `STD[x]`.
+- `scale`: trainable parameter 'scale'
+- `bias`: trainable parameter 'bias'
+
+#### Outputs
+
+- `y`: The output data.
+- `batch_mean`: The mean value of batch data.
+- `batch_var`: The standard deviation value of batch data.
+- `saved_mean`: Updated `estimated_mean` with current batch data. It's supposed to share the memory with input `estimated_mean`.
+- `saved_var`: Updated `estimated_var` with current batch data. It's supposed to share the memory with input `estimated_var`.
+
+#### Attributes
+
+- `is_infer`: *bool*. If true, run `batch_norm_op` in inferencing mode.
+- `use_global_est`: *bool*. If true, use `saved_mean` and `saved_var` instead of `E[x]` and `STD[x]` in training.
+- `epsilon`: *float*. The epsilon value to avoid division by zero.
+- `momentum`: *float*. Factor used in `estimated_mean` and `estimated_var` updating. The usage is shown above.
+
+#### Kernels
+
+The following graph shows the training computational process of `batch_norm_op`:
+
+
+
+cuDNN provides APIs to finish the whole series of computations; we can use them in our GPU kernel.
+
+### Python
+
+`batch_norm_op` is wrapped as a layer in Python:
+
+```python
+def batch_norm_layer(net,
+ input,
+ output,
+ scale,
+ bias,
+ use_global_est = False,
+ epsilon = 1e-6,
+ momentum = 0.99):
+ mean_cache = scope.new_var(name = 'estimated_mean', trainable = False)
+ var_cache = scope.new_var(name = 'estimated_var', trainable = False)
+ batch_mean = scope.new_var(name = 'batch_mean')
+ batch_var = scope.new_var(name = 'batch_var')
+ batch_norm_op = Operator('batch_norm_op',
+ x = input,
+ estimated_mean = mean_cache,
+ estimated_var = var_cache,
+ scale = scale,
+ bias = bias,
+ y = output,
+ batch_mean = batch_mean,
+ batch_var = batch_var,
+ saved_mean = mean_cache,
+ saved_var = var_cache,
+ is_infer = False,
+ use_global_est = use_global_est,
+ epsilon = epsilon,
+ momentum = momentum)
+ net.append_op(batch_norm_op)
+ return output
+```
+
+Because Python API has not been finally decided, the code above can be regarded as pseudo code. There are a few key points we shall note:
+
+1. `estimated_mean` and `estimated_var` are assigned the same variables as `saved_mean` and `saved_var` respectively, so they share the same memory. The output mean and variance values (`saved_mean` and `saved_var`) of a certain batch will be the inputs (`estimated_mean` and `estimated_var`) of the next batch.
+
+2. `is_infer` decides whether `batch_norm_op` will run in training mode or inference mode. However, a network may contain both training and inference parts, and a user may switch `batch_norm_op`'s running mode in a Python `for` loop like this:
+
+```python
+for pass_id in range(PASS_NUM):
+ # ...
+ net.train() # run training model
+ if pass_id % 100 == 0:
+ net.infer(test_image) # run inferencing model
+ # ...
+```
+
+`is_infer` is an attribute. Once an operator is created, its attributes cannot be changed. This suggests that we should maintain two `batch_norm_op`s in the model, one whose `is_infer` is `True` (we call it `infer_batch_norm_op`) and the other whose `is_infer` is `False` (we call it `train_batch_norm_op`). They share all parameters and variables, but are placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one going through `train_batch_norm_op` and the other going through `infer_batch_norm_op`:
+
+
+
+
+
+
+As shown in the graph above, the net forks before `batch_norm_op` and never merges again. All the operators after `batch_norm_op` are duplicated.
+
+When the net runs in training mode, the end of the left branch will be set as the running target, so the dependency tracking process will ignore the right branch automatically. When the net runs in inference mode, the process is reversed.
+
+How to set a target is related to the Python API design, so I will leave it here for further discussion.
diff --git a/doc/fluid/design/modules/evaluator.md b/doc/fluid/design/modules/evaluator.md
new file mode 100644
index 0000000000000000000000000000000000000000..de9605b0e67a035ab1ef1e4cafbe838f83bc5807
--- /dev/null
+++ b/doc/fluid/design/modules/evaluator.md
@@ -0,0 +1,58 @@
+# Evaluator Design
+
+## Problem Statement
+
+During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator-based framework design, data passes through the network pipeline batch by batch. As a result, inside an operator we only calculate the metrics for one mini-batch. Thus, we need to provide a mechanism to calculate the metrics over every N passes/batches the user wants.
+
+## Evaluator Design
+Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
+
+1. Initialize the metric state and add it into the block.
+
+2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a mini-batch of data when run once.
+
+
+3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
+
+## Implementation
+This design is shown in the Python API.
+Each metric operator needs to calculate the metric statistics and return the batch-aware states. The Python side is responsible for accumulating the states for each pass.
+
+
+```python
+class Evaluator(object):
+ """
+ Evaluator Base class.
+ """
+ def __init__(self, name, **kwargs):
+ """
+ Different evaluators may have different metric states. E.g., Accuracy needs two variables, the total and correct sample counts.
+ Auc needs four variables, `true_positives`,
+ `true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create the variables it needs and append them to the main_program.
+
+ The initialization of Evaluator should be responsible for:
+ creating the metric states and appending them to the main_program
+ """
+ pass
+
+ def _update_ops(self, input, label, **kwargs):
+ """
+ Add the mini-batch metric calculation operators to the main_program.
+ Add increment operators to accumulate the metric states.
+ """
+
+
+ def reset(self, executor, reset_program=None):
+ """
+ Reset the metric states at the beginning of each pass or user-specified batch number.
+ Execute the reset_program to reset the states.
+ """
+
+
+ def eval(self, executor, eval_program=None):
+ """
+ Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
+ Execute the eval_program and return the result.
+ """
+ return eval_result
+```
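+
+For illustration, a hypothetical `AccuracyEvaluator` following the three steps above could subclass the `Evaluator` base class sketched here. This toy version keeps its two states as plain Python counters instead of variables appended to the main_program; none of these names are the finalized framework API.
+
+```python
+class AccuracyEvaluator(Evaluator):
+    def __init__(self, name, **kwargs):
+        super(AccuracyEvaluator, self).__init__(name, **kwargs)
+        # Step 1: create the metric states (total and correct sample counts).
+        self.total = 0
+        self.correct = 0
+
+    def _update_ops(self, input, label, **kwargs):
+        # Step 2: per mini-batch, accumulate the statistics (in the real design this
+        # would append counting/increment operators to the main_program instead).
+        self.correct += sum(int(p == l) for p, l in zip(input, label))
+        self.total += len(label)
+
+    def reset(self, executor=None, reset_program=None):
+        # Zero the states at the beginning of a pass.
+        self.total = 0
+        self.correct = 0
+
+    def eval(self, executor=None, eval_program=None):
+        # Step 3: merge the mini-batch statistics into the pass-level accuracy.
+        return float(self.correct) / max(self.total, 1)
+```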
diff --git a/doc/fluid/design/modules/images/batch_norm_fork.dot b/doc/fluid/design/modules/images/batch_norm_fork.dot
new file mode 100644
index 0000000000000000000000000000000000000000..4bc47713cba2cb23f1b34fffe6426ef10ac3a9df
--- /dev/null
+++ b/doc/fluid/design/modules/images/batch_norm_fork.dot
@@ -0,0 +1,25 @@
+digraph ImageBatchNormForkGragh {
+ subgraph cluster_before {
+ Prev [label="...", shape=plaintext];
+ Rnn [label="rnn_op", shape=box];
+ BatchNorm [label="batch_norm_op", shape=box];
+ Fc [label="fc_op", shape=box];
+ After [label="...", shape=plaintext];
+ Prev -> Rnn -> BatchNorm -> Fc -> After;
+ label="original";
+ }
+
+ subgraph cluster_after {
+ Prev2 [label="...", shape=plaintext];
+ Rnn2 [label="rnn_op", shape=box];
+ BatchNorm2_1 [label="train_batch_norm_op", shape=box];
+ BatchNorm2_2 [label="infer_batch_norm_op", shape=box];
+ Fc2_1 [label="fc_op", shape=box];
+ Fc2_2 [label="fc_op", shape=box];
+ After2_1 [label="...", shape=plaintext];
+ After2_2 [label="...", shape=plaintext];
+ Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1;
+ Rnn2 -> BatchNorm2_2 ->Fc2_2 ->After2_2
+ label="forked";
+ }
+}
diff --git a/doc/fluid/design/modules/images/batch_norm_fork.png b/doc/fluid/design/modules/images/batch_norm_fork.png
new file mode 100644
index 0000000000000000000000000000000000000000..aded62bce5bc268b7a3ef4dc96c89fe21d6ea955
Binary files /dev/null and b/doc/fluid/design/modules/images/batch_norm_fork.png differ
diff --git a/doc/fluid/design/modules/images/batch_norm_op_kernel.png b/doc/fluid/design/modules/images/batch_norm_op_kernel.png
new file mode 100644
index 0000000000000000000000000000000000000000..a99ce81ff3bf42880ebbd6a1297de3bf038e09b2
Binary files /dev/null and b/doc/fluid/design/modules/images/batch_norm_op_kernel.png differ
diff --git a/doc/fluid/design/modules/images/feed_forward.png b/doc/fluid/design/modules/images/feed_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..d312371a04c26aa6cd196e0bd1f51becb425180b
Binary files /dev/null and b/doc/fluid/design/modules/images/feed_forward.png differ
diff --git a/doc/fluid/design/modules/images/feed_forward_regularized.png b/doc/fluid/design/modules/images/feed_forward_regularized.png
new file mode 100644
index 0000000000000000000000000000000000000000..677e99bfd9f8e72ed9fe4b27127af2ced202f447
Binary files /dev/null and b/doc/fluid/design/modules/images/feed_forward_regularized.png differ
diff --git a/doc/fluid/design/modules/images/l1_regularization.png b/doc/fluid/design/modules/images/l1_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..e1b9c7a44f94dc027598a98da93ddb8133190972
Binary files /dev/null and b/doc/fluid/design/modules/images/l1_regularization.png differ
diff --git a/doc/fluid/design/modules/images/l2_regularization.png b/doc/fluid/design/modules/images/l2_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..d5c2fcbc2ccae75ad083162e5a2dceb0210be298
Binary files /dev/null and b/doc/fluid/design/modules/images/l2_regularization.png differ
diff --git a/doc/fluid/design/modules/images/loss_equation.png b/doc/fluid/design/modules/images/loss_equation.png
new file mode 100644
index 0000000000000000000000000000000000000000..14212ec8d36c803de96bde8a9a4b5591bd20434e
Binary files /dev/null and b/doc/fluid/design/modules/images/loss_equation.png differ
diff --git a/doc/fluid/design/modules/index_cn.rst b/doc/fluid/design/modules/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b25783f0f5120991c29ba31b7b512bd4c183eecf
--- /dev/null
+++ b/doc/fluid/design/modules/index_cn.rst
@@ -0,0 +1,14 @@
+代码结构和重要模块
+-----------------
+
+.. toctree::
+ :maxdepth: 1
+
+ backward.md
+ python_api.md
+ regularization.md
+ infer_var_type.md
+ optimizer.md
+ prune.md
+ register_grad_op.md
+ net_op_design.md
diff --git a/doc/fluid/design/modules/index_en.rst b/doc/fluid/design/modules/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2108156e080996916f2650448f0a56f998757204
--- /dev/null
+++ b/doc/fluid/design/modules/index_en.rst
@@ -0,0 +1,14 @@
+Code Structure and Important Modules
+-------------------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ backward.md
+ python_api.md
+ regularization.md
+ infer_var_type.md
+ optimizer.md
+ prune.md
+ register_grad_op.md
+ net_op_design.md
diff --git a/doc/fluid/design/modules/infer_var_type.md b/doc/fluid/design/modules/infer_var_type.md
new file mode 100644
index 0000000000000000000000000000000000000000..d9d5397becba2ef1806d9341cd49cd9aabbf4a6a
--- /dev/null
+++ b/doc/fluid/design/modules/infer_var_type.md
@@ -0,0 +1,78 @@
+# Design Doc: InferVarType
+
+## The Problem Posed
+
+A variable in our design can hold different types, such as `LoDTensor` and `SelectedRows`. An operator should be able to infer the variable types of its outputs.
+
+For example, a `lookup table` operator takes two `LoDTensor`; one is a float tensor as the embedding table, the other is an int tensor as word ID. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs and will generate a `LoDTensor` if any of its inputs is `LoDTensor`, otherwise, the `sum` operator will generate `SelectedRows` as its output.
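+
+As a toy illustration of the `sum` rule just described (plain Python, not the framework's C++ interface; the type names follow the paragraph above):
+
+```python
+def infer_sum_output_type(input_types):
+    """Output is LoDTensor if any input is LoDTensor, otherwise SelectedRows."""
+    return "LoDTensor" if "LoDTensor" in input_types else "SelectedRows"
+
+assert infer_sum_output_type(["SelectedRows", "LoDTensor"]) == "LoDTensor"
+assert infer_sum_output_type(["SelectedRows", "SelectedRows"]) == "SelectedRows"
+```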
+
+The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameters) or be inferred by the operator at compile time.
+
+## Proposed Solution
+
+The `InferVarType` is a compile-time function which is registered to each operator. The interface of that function is:
+
+
+```c++
+using InferVarTypeFN = std::function<
+ void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>;
+```
+
+It takes an operator description as its input, infers the output variable types, and stores them in the block description.
+
+The `InferVarTypeFN` will be registered in `OpInfo`, to replace `infer_var_type_` field. The `OpInfo` should be
+
+```cpp
+struct OpInfo {
+ InferVarTypeFN infer_var_type_;
+ ...
+};
+```
+
+The default `InferVarType` will set output type as `LoDTensor`. It can be done by `GetInferVarType()`.
+
+```cpp
+void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) {
+ // set the output type of variable as `LoDTensor`.
+ // ...
+}
+
+struct OpInfo {
+ InferVarTypeFN infer_var_type_;
+ InferVarTypeFN GetInferVarType() const {
+ if (infer_var_type_) {
+ return infer_var_type_;
+ } else {
+ return DefaultInferVarType;
+ }
+ }
+};
+```
+
+## Register InferVarType
+
+We provide a thin base class for registering an `InferVarTypeFN`. Using a base class eases the implementation of the registry, since we can detect whether a registry entry is an `InferVarTypeFN` or not.
+
+```cpp
+class VarTypeInferer {
+public:
+ virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0;
+};
+```
+
+Operator developers can write a specialized `VarTypeInferer` as follows.
+
+```cpp
+class SpecialVarTypeInferer : public VarTypeInferer {
+public:
+ virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const {
+ // .. own logic
+ }
+};
+```
+
+Then users can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`.
+
+```
+REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...);
+```
diff --git a/doc/fluid/design/modules/net_op_design.md b/doc/fluid/design/modules/net_op_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..e64ac2fb1c6898bfeb883250347da3d9a4757b97
--- /dev/null
+++ b/doc/fluid/design/modules/net_op_design.md
@@ -0,0 +1,250 @@
+# Network Design
+
+`Network` is the container and controller of a set of operators.
+A user can build a real network from a `NetDesc`, which is a protobuf message,
+and use `Network.Run()` to run all the operators in the network.
+
+A network object knows all Operators belonging to this network. Variables,
+which are inputs and outputs of these operators,
+are created and managed by a hierarchy of Scope objects.
+
+## API
+
+### Net
+To make the `Network` extendable, a base class is defined like this
+
+```c++
+// operator's index stored in a network.
+typedef int OpIndex;
+
+// The minimum interface a network should implement.
+class Net {
+ public:
+ // run all the operators and return success(true) or not, with all the
+ // variables are located in `scope`. `context` describes the detail execution
+ // environment for ops. `begin` and `end` specify the scope of `ops_` to run,
+ // If no positive indexes are provided, all operators in `ops_` will run.
+ virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1,
+ OpIndex end = -1) const = 0;
+
+ // Add an Operator according to `def`.
+ virtual OpIndex AddOp(const proto::OpDef &def) = 0;
+
+ // Add optimizer operators according to `attrs`.
+ virtual Error AddOptimizerOps(const OptAttrs &attrs) = 0;
+
+ // Add backward operators.
+ virtual Error AddBackwardOps() = 0;
+
+ // Infer the shapes of variables required by operators in the network. The
+ // `scope` will be mutated according to the inferred shapes.
+ virtual Error InferShape(Scope *scope) = 0;
+
+ static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
+};
+```
+
+All network implementations should build networks from a protobuf message which
+describes the structure of a real network; `Run` method should be implemented by
+all implementations to offer a universal method to forward or backward compute a network.
+
+`Net::Create` is a method of factory pattern and can be implemented like
+
+```c++
+std::unique_ptr<Net> Net::Create(const NetDesc& def) {
+ switch (def.model_type()) {
+ case NN:
+ return new Network(def);
+ case Recursive:
+ return new RecursiveNet(def);
+ case Recurrent:
+ return new RecurrentNet(def);
+ }
+ return nullptr;
+}
+```
+
+Network is designed as the container of operators. To make it more extendable,
+we decouple it from the related variable resources.
+
+`Run(Scope* scope)` takes the scope as an argument so that it can run in different scopes.
+
+Finally, `Net` can be used as follows
+
+```c++
+Scope default_scope;
+OpContext default_context;
+auto net = Net::CreateNet(def);
+
+if (net) {
+ net->Run(&default_scope, &default_context);
+}
+```
+
+### `PlainNet` as a simple implementation of `BaseNet`
+
+A very basic implementation is as follows. All it does is simply run every operator in sequence.
+
+```c++
+class PlainNet : public Net {
+ public:
+ // Create a network described by `def`. NetDesc is the definition of a network.
+ PlainNet(const NetDesc &def);
+
+ // Infer all the operators' input and output variables' shapes; will be called
+ // before every mini-batch training.
+ virtual Error InferShape(Scope *scope) override;
+
+ // Run all the operators with the `scope`, if no scope is provided, default
+ // scope will be used instead. If no OpContext is provided, default context will be used.
+ virtual Error Run(Scope *scope = nullptr, OpContext *context=nullptr, OpIndex begin = -1,
+ OpIndex end = -1) const override;
+
+ virtual OpIndex AddOp(const proto::OpDef &def) override;
+
+ virtual Error AddOptimizerOps(const OptAttrs &attrs) override;
+
+ virtual Error AddBackwardOps() override;
+
+ protected:
+ // Create operators according to `def`, will be called by the constructor.
+ Error BuildNet(const NetDesc &def);
+
+ // Add an operator which is identified as `type` and has attributes described
+ // in `attrs`, the `inputs` are the keys of readonly input variables,
+ // `outputs` are keys of mutable output variables. An `OpIndex` will be
+ // returned to indicate the offset of the new operator in `ops_`.
+ OpIndex AddOp(const std::string &type, const std::vector<std::string> &inputs,
+ const std::vector<std::string> &outputs,
+ const OprAttr &attrs = OprAttr());
+
+ private:
+ // the operators owned by `Network`.
+ std::vector<std::unique_ptr<Operator>> ops_;
+};
+```
+
+`PlainNet` will create operators and store them in the private member `ops_`;
+the operators are created by `CreateNet`, and each operator is added by `AddOp`.
+
+
+## PlainNet Usage
+`PlainNet` can be used to define and run a network as follows
+
+```c++
+// create an empty scope located on CPU device.
+Scope scope(CPUPlace());
+
+// create and init variables described in `net_desc`.
+scope.CreateVariables(net_desc);
+scope.InitVariables(net_desc);
+
+// create a network according to `net_desc`
+auto net = Net::CreateNet(net_desc);
+// Add more operators if needed.
+net->AddOp(add...);
+net->AddOp(fc...);
+
+net->AddBackwardOps();
+net->AddOptimizerOps();
+
+// run the network providing the `scope`.
+net->Run(&scope);
+```
+
+## `NetBuilder` as a C++ syntax wrapper
+This is a detailed description of the user-related C++ network API, and may not be needed in the prototype development stage.
+
+The `NetBuilder` will give users a much simpler syntax as follows to create a network, and demonstrates how to use the `BaseNet`'s raw interfaces.
+
+```c++
+Variable* fc_out = builder.AddOp("fc", input=image, size=100, activation="Sigmoid");
+Variable* prediction = builder.AddOp("fc", input=fc_out, size=10, activation="Sigmoid");
+Variable* loss = builder.AddOp("cross_entropy", input=prediction, label=label);
+Variable* avg_loss = builder.AddOp("mean", loss);
+
+builder.BackwardFrom(avg_loss)
+builder.AddOptimization(1e-4, "adam");
+builder.Run();
+```
+
+`NetBuilder` will call `Net`'s virtual functions to change the real network structure. Here is a sample definition
+
+```c++
+class NetBuilder final {
+ public:
+ NetBuilder(Net* net) : net_(net) {}
+
+ Variable* AddOp(const string& type, const vector<Variable*>& inputs,
+ size_t size, Activation act) {
+ // much code here.
+ // ...
+ net_->AddOp(def);
+ need_rebuild_net_ = true;
+ net_->InferShape();
+ // ...
+ }
+
+ Error BackwardFrom(const Variable& cost);
+
+ Error Run(Scope* scope, OpContext* context, bool need_backward = true) {
+ // backward.
+ if (need_backward) {
+ if (need_rebuild_net_) {
+ AddBackwardOps();
+ AddOptimizerOps();
+ }
+ net_->Run(scope, context);
+ return;
+ }
+ // just forward.
+ net_->Run(scope, context, 0, last_forward_op_);
+ }
+
+ protected:
+ Error AddBackwardOps();
+ Error AddOptimizerOps();
+
+ private:
+ Net* net_;
+ OpIndex last_forward_op_{-1};
+ bool need_rebuild_net_{true};
+};
+```
+
+### Compatibility with RNN
+
+Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design,
+for example we can implement a simple recurrent neural network as follows
+
+```c++
+// copy some `vars` from `source` to `target`
+void Copy(const Scope &source, Scope &target,
+ const std::vector<std::string> &vars);
+
+Scope default_scope;
+// some initial mutations on `default_scope` here.
+
+auto rnn_step_net = PlainNet(rnn_step_net_def);
+
+// Create rnn's states, the last scope is used to store rnn outputs.
+Scope *rnn_states = new Scope[num_states + 1];
+
+for (int i = 0; i < num_states + 1; i++) {
+ // Initialize all rnn state scopes, copy parameters and so on.
+ rnn_states[i].CreateVars(rnn_step_net_def);
+ Copy(default_scope, rnn_states[i], rnn_related_vars);
+ // Prepare rnn's inlinks, just copy inlink variables to each state.
+ Copy(default_scope, rnn_states[i], inlink_vars);
+}
+
+// Run the rnn.
+for (int i = 0; i < num_states; i++) {
+ rnn_step_net.Run(rnn_states[i]);
+ // Copy current state's state variables to next state, the related variables
+ // are named like "previous_state_xxx".
+ Copy(rnn_states[i], rnn_states[i + 1], pre_state_vars);
+}
+
+// Copy rnn's final outputs to `default_scope`.
+Copy(rnn_states[num_states], default_scope, outlink_vars);
+```
diff --git a/doc/fluid/design/modules/optimizer.md b/doc/fluid/design/modules/optimizer.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c25fde9cafb322f789662077d3fc6cc1d64ce38
--- /dev/null
+++ b/doc/fluid/design/modules/optimizer.md
@@ -0,0 +1,91 @@
+# Optimizer Design
+
+## The Problem
+
+A PaddlePaddle program, or a block, is a sequence of operators operating on variables. A training program needs to do three kinds of work:
+
+1. the forward pass, which computes intermediate results and the cost(s),
+1. the backward pass, which derives gradients from intermediate results and costs, and
+1. the optimization pass, which updates model parameters to optimize the cost(s).
+
+These works rely on three kinds of operators:
+
+1. forward operators,
+1. gradient operators, and
+1. optimization operators.
+
+It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically.
+
+In this design, we propose a high-level API that automatically derives the optimization pass and operators from the forward pass.
+
+
+## High-level Python API to describe the training process
+
+1. Users write code to describe the network:
+
+ ```python
+ images = layer.data("images")
+ labels = layer.data("labels")
+ w1 = pd.var("w1")
+ b1 = pd.var("b1")
+ hidden = layer.fc(images, w=w1, b=b1)
+ cost = layer.mse(hidden, labels)
+ ```
+
+ The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+
+
+2. Users create a certain kind of Optimizer with some argument.
+
+ ```python
+ optimizer = AdagradOptimizer(learning_rate=0.001)
+ ```
+
+3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list.
+
+ ```python
+ opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])
+ ```
+ The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is a list of optimization operators that will be run by the session.
+
+4. Users use the Session/Executor to run this opt_op_list as the target to do training.
+
+ ```python
+ sess.run(target= opt_op_list, ...)
+ ```
+
+### Optimizer Python interface:
+
+```python
+class Optimizer(object):
+ """Optimizer Base class.
+
+ """
+
+ def __init__(self):
+ pass
+
+ def create_optimization_pass(self, parameters_and_grads):
+ """Add optimization operators to update gradients to variables.
+
+ Args:
+ parameters_and_grads: a list of (variable, gradient) pair to update.
+
+ Returns:
+ optimization_op_list: a list of optimization operators that will update parameters using gradients.
+ """
+ return None
+
+ def minimize(self, loss, parameter_list):
+ """Add operations to minimize `loss` by updating `parameter_list`.
+
+ This method combines interface `append_backward()` and
+ `create_optimization_pass()` into one.
+ """
+ params_grads = self.create_backward_pass(loss, parameter_list)
+ update_ops = self.create_optimization_pass(params_grads)
+ return update_ops
+
+```
+
+Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer.
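+
+For illustration, a hypothetical SGD-style subclass could look like the sketch below; the operator creation is represented by plain description dicts rather than real framework calls.
+
+```python
+class PlainSGDOptimizer(Optimizer):
+    def __init__(self, learning_rate=0.01):
+        super(PlainSGDOptimizer, self).__init__()
+        self._learning_rate = learning_rate
+
+    def create_optimization_pass(self, parameters_and_grads):
+        optimize_ops = []
+        for param, grad in parameters_and_grads:
+            # one update operator per (parameter, gradient) pair
+            optimize_ops.append({
+                "type": "sgd",
+                "inputs": {"Param": param, "Grad": grad},
+                "attrs": {"learning_rate": self._learning_rate},
+            })
+        return optimize_ops
+
+opt = PlainSGDOptimizer(learning_rate=0.1)
+print(opt.create_optimization_pass([("w1", "w1@GRAD"), ("b1", "b1@GRAD")]))
+```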
diff --git a/doc/fluid/design/modules/prune.md b/doc/fluid/design/modules/prune.md
new file mode 100644
index 0000000000000000000000000000000000000000..4a5cf10c79a554779137f0cce5494fdd96ef6b7a
--- /dev/null
+++ b/doc/fluid/design/modules/prune.md
@@ -0,0 +1,63 @@
+# Prune
+
+## Motivation
+
+We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement
+the function `void Prune(const ProgramDesc* input, ProgramDesc* output)`, which takes a `ProgramDesc`
+and generates a pruned `ProgramDesc`.
+
+## Challenge
+
+Pruning needs to support both variables and operators as evaluation targets. Consider the following
+different situations.
+
+```python
+# Case 1: run the forward pass.
+cost_np = session.run(target=cost)
+# Case 2: run the backward pass.
+opts_np, _ = session.run(target=[cost, opt])
+# Case 3: run checkpointing
+_ = session.run(target=checkpoint)
+```
+
+## Solution
+
+To support evaluation of operators, we add an `is_target` field to `OpDesc`.
+
+```c++
+message OpDesc {
+ required string type = 3;
+ repeated Var inputs = 1;
+ repeated Var outputs = 2;
+ repeated Attr attrs = 4;
+ optional bool is_target = 5 [ default = false ];
+};
+```
+
+To support evaluation of variables, we add [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599).
+For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with the `variable` being
+the `fetch_op`'s input. Then we also mark the `fetch_op` as a target.
+
+### Algorithm
+
+If an operator needs to be run, it must fall into one of the following cases:
+
+1. It is the target.
+2. Some other op depends on it, meaning its output is some other op's input.
+
+The first case can be checked by `op_desc.is_target()`. The second case can be implemented as
+
+```c++
+bool HasDependentVar(const OpDesc& op_desc, const std::set<std::string>& dependent_vars) {
+ for (auto& var : op_desc.outputs()) {
+ for (auto& argu : var.arguments()) {
+ if (dependent_vars.count(argu) != 0) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+```
+
+Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc).
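+
+An illustrative Python rendering of this reverse scan (the linked implementation is C++; ops are modeled as plain dicts here):
+
+```python
+def prune_block(ops):
+    """Keep an op if it is a target or if an already-kept op consumes one of its outputs."""
+    dependent_vars = set()
+    kept = []
+    for op in reversed(ops):                        # scan from back to front
+        if op.get("is_target") or dependent_vars & set(op["outputs"]):
+            dependent_vars.update(op["inputs"])     # its inputs are now needed as well
+            kept.append(op)
+    kept.reverse()
+    return kept
+
+ops = [{"type": "mul", "inputs": ["x", "w"], "outputs": ["h"]},
+       {"type": "softmax", "inputs": ["h"], "outputs": ["y"], "is_target": True},
+       {"type": "dropout", "inputs": ["h"], "outputs": ["unused"]}]
+print([op["type"] for op in prune_block(ops)])      # ['mul', 'softmax']
+```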
diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..83af4e55485c079265d3f2b1e15070825b532c02
--- /dev/null
+++ b/doc/fluid/design/modules/python_api.md
@@ -0,0 +1,325 @@
+# Design Doc: Python API
+
+Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
+
+
+| Python classes | Protobuf messages |
+| -------------- | ----------------- |
+| Program        | ProgramDesc       |
+| Block          | BlockDesc         |
+| Operator       | OpDesc            |
+| Variable       | VarDesc           |
+
+Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
+
+## Core Concepts
+
+### Program
+
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), which is composed of an array of `BlockDesc`s. The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` only stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
+
+Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
+
+```python
+class Program(objects):
+ def __init__(self):
+ self.desc = core.NewProgram() # a C++ ProgramDesc pointer.
+ self.blocks = vector()
+ self.blocks.append(Block(self, -1)) # the global block
+ self.current_block = 0 # initialized to the global block
+
+ def global_block():
+ return self.blocks[0]
+
+ def current_block():
+ return self.get_block(self.current_block)
+
+ def rollback():
+ self.current_block = self.current_block().parent_idx
+
+ def create_block():
+ new_block_idx = len(self.blocks)
+ self.blocks.append(Block(self, self.current_block))
+ self.current_block = new_block_idx
+ return current_block()
+```
+
+`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
+
+`Program` creates the first block as the global block in its constructor. All parameters and their initializer operators are in the global block.
+
+### Block
+
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md) includes
+
+1. a map from variable names to an instance of the Python `Variable` class, and
+1. a list of `Operator` instances.
+
+```python
+class Block(object):
+    def __init__(self, program, parent_idx):
+        self.desc = core.NewBlock(program.desc)
+        self.program = program
+        self.vars = {}
+        self.ops = []
+        self.parent_idx = parent_idx
+
+    def create_var(self, ...):
+        return Variable(self, ...)
+
+    def _create_global_var(self, ...):
+        return self.program.global_block().create_var(...)
+
+    def create_parameter(self, name, ...):
+        # Parameter is a subclass of variable. See Parameter section for details.
+        self.vars[name] = Parameter(self._create_global_var(...), ...)
+        return self.vars[name]
+
+    def append_operator(self, ...):
+        self.ops.append(Operator(self, ...))
+
+    def _prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
+        self.ops.insert(0, Operator(self, ...))
+```
+
+`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks -- for example, by an FC layer in the step block of an RNN operator.
+
+`_prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+
+### Operator
+
+The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes.
+
+```python
+class Operator(object):
+    def __init__(self,
+                 block,    # Block
+                 type,     # string
+                 inputs,   # dict
+                 outputs,  # dict
+                 attrs     # dict
+                 ):
+        self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs)
+        core.infer_shape(self.desc, inputs, outputs)
+
+    def type(self):
+        return self.desc.type()
+```
+
+`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
+
+### Variable
+
+Operators take Variables as their inputs and outputs.
+
+```python
+class Variable(object):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape=None,      # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 ):
+        if name is None:
+            name = unique_name_generator()
+        self.name = name
+        self.block = block
+        self.desc = core.NewVarDesc(block.desc, name, shape, lod_level)
+        self.writer = None
+```
+
+Please be aware of `self.writer`, which tracks the operator that creates the variable. It is possible that more than one operator writes a variable, but in Python space each write to a variable is represented by a `Variable` instance. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
+
+### Parameter
+
+A parameter is a global variable with an initializer (or load) operator.
+
+```python
+class Parameter(Variable):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape=None,      # tuple
+                 dtype="float32", # string
+                 lod_level=None,  # int
+                 trainable=True,  # bool
+                 initialize_op_attrs=None,
+                 optimize_op_attrs=None):
+        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
+        self.trainable = trainable
+        self.optimize_op_attrs = optimize_op_attrs
+        block.prepend(Operator(block,                       # Block
+                               initialize_op_attrs['type'], # string
+                               None,                        # no inputs
+                               self,                        # output is the parameter
+                               initialize_op_attrs))
+```
+
+When users create a parameter, they can call
+
+```python
+program.create_parameter(
+    ...,
+    init_attr={
+        "type": "uniform_random",
+        "min": -1.0,
+        "max": 1.0,
+    })
+```
+
+In the above example, `init_attr.type` names an initialize operator. It can also name the load operator:
+
+```python
+init_attr={
+    "type": "load",
+    "filename": "something.numpy",
+}
+```
+
+`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
+
+## Layer Function
+
+A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers.
+
+Layer functions take `Variable`s and configuration parameters as their input and return the output variable(s).
+
+For example, `FullyConnected` takes one or more variables as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable.
+
+
+### Necessity for reusing code between layer functions
+
+There is a lot of code that can be reused, such as:
+
+* Give default configuration values, e.g., the default initialization strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`, and the default initialization strategy for bias is to fill with zero.
+* Append the activation operator.
+* Create a temporary variable.
+* Create parameter.
+* Generate a unique name.
+* Add a bias.
+* ...
+
+A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions.
+
+
+
+### Comparison between global functions and helper class
+
+The `FullyConnected` layer will be as follows when we provide global functions:
+
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+    if name is None:
+        name = unique_name("fc")
+    input = multiple_input(input)
+    param_attr = default_param_attr(param_attr)
+    param_attr = multiple_param_attr(param_attr, len(input))
+
+    # mul
+    mul_results = []
+    for ipt, attr in zip(input, param_attr):
+        shape = ipt.shape[1:] + [size]
+        w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr)
+        tmp = create_tmp_var(name)
+        g_program.current_block().append_op("mul", {ipt, w}, {tmp})
+        mul_results.append(tmp)
+
+    # add sum
+    ...
+    # add bias
+    ...
+    # add activation
+    ...
+    return out
+```
+
+We can provide many helper functions for layer developers. However, global helper functions have several disadvantages:
+
+1. We need a namespace for these methods, so that layer developers can quickly figure out which methods they can use.
+2. Global functions force layer developers to pass their parameters around explicitly every time.
+
+So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` layer will then look as follows.
+
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+    helper = LayerHelper(**locals()) # pass all parameters to LayerHelper
+
+    mul_results = []
+    for ipt, param in helper.iter_multiple_input_and_param():
+        w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype=ipt.dtype)
+        tmp = helper.create_tmp_variable()
+        helper.append_op('mul', {ipt, w}, {tmp})
+        mul_results.append(tmp)
+
+    pre_bias = helper.add_sum(mul_results)
+    pre_activation = helper.add_bias(pre_bias)
+    return helper.add_activation(pre_activation)
+```
+
+Not only do we use fewer lines of code to write `fc_layer`, but the code is also clearer and easier to understand. At the same time, layer developers can discover which functions they can invoke by typing `helper.` in a Python editor.
+
+
+### Implementation of layer helper
+
+We keep all parameters of a layer function in the layer helper as a private dictionary data member. Every method of the layer helper looks up this dictionary when invoked. In that way, we can implement one layer helper for all layer functions, even though some layers do not contain certain operators. For example, `activation` is used by the FullyConnected and convolution layers, but not by a cross-entropy layer. The example code of `add_activation` is:
+
+```python
+class LayerHelper(object):
+    def __init__(self, **kwargs): # kwargs is short for `keyword arguments`
+        self.kwargs = kwargs
+
+    def add_activation(self, input_var):
+        act = self.kwargs.get("act", None) # default value is None
+        if act is None: # do nothing if no act
+            return input_var
+
+        tmp = self.create_tmp_variable()
+        self.append_op(type=act, input=input_var, output=tmp)
+        return tmp
+```
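+
+For illustration only, other helpers can follow the same kwargs-lookup pattern. Below is a hedged sketch of an `add_bias` method; the attribute name `bias_attr` and the op type `"elementwise_add"` are assumptions, not the confirmed implementation:
+
+```python
+class LayerHelper(object):
+    # ... __init__ and add_activation as above ...
+
+    def add_bias(self, input_var):
+        bias_attr = self.kwargs.get("bias_attr", None) # default: no bias
+        if bias_attr is None:
+            return input_var
+
+        b = self.create_parameter(shape=[input_var.shape[-1]],
+                                  dtype=input_var.dtype)
+        tmp = self.create_tmp_variable()
+        self.append_op(type="elementwise_add", input=[input_var, b], output=tmp)
+        return tmp
+```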
+
+### Return value of layer functions
+
+The layer will return a Variable, which is also the output of an operator. However, the outputs of a layer function have more attributes than those of an operator. There are parameter variables, and their gradient variables also need to be returned. Returning them is useful. For example,
+
+1. Users can debug the network by printing parameter gradients.
+2. Users can append attributes to a parameter; for example, `param.stop_gradient=True` makes a parameter stop generating its gradient. We can fix the parameter value during training by using this attribute.
+
+However, it is still good for a layer to return a Variable, since all layers and operators use Variables as their parameters. We can simply attach a `param` field and a `grad` field to the returned variable, since Python is dynamically typed.
+
+The sample usage is
+
+```python
+data = fluid.layers.data(...)
+hidden = fluid.layers.fc(data, ...)
+...
+
+executor.run(fetch_list=[hidden.param, hidden.param.grad], ...)
+```
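+
+For illustration, here is a hedged sketch (not the actual `fluid.layers.fc`) of how a layer function could attach these extra fields to the variable it returns, relying on Python's dynamic typing; `grad_var_of` is a hypothetical helper:
+
+```python
+def fc(input, size, act=None, name=None):
+    helper = LayerHelper(**locals())
+    w = helper.create_parameter(shape=input.shape[1:] + [size], dtype=input.dtype)
+    pre_act = helper.create_tmp_variable()
+    helper.append_op(type="mul", input=[input, w], output=pre_act)
+    out = helper.add_activation(pre_act)
+
+    out.param = w                          # expose the parameter variable
+    out.param.grad = helper.grad_var_of(w) # hypothetical: w's gradient variable
+    return out
+```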
+
+
+## Optimizer
+
+[Optimizer Design Doc](./optimizer.md)
diff --git a/doc/fluid/design/modules/register_grad_op.md b/doc/fluid/design/modules/register_grad_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..8d973eb53178c3e889c845144553a453e11f067c
--- /dev/null
+++ b/doc/fluid/design/modules/register_grad_op.md
@@ -0,0 +1,92 @@
+# Design Doc: Gradient Operators Registration
+
+
+## The Problem Posed
+
+Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance.
+
+However, we noticed two problems with the current design:
+
+1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and insert corresponding `OpDesc` messages into the `ProgramDesc` message.
+
+1. For some operators, the gradient computation can be written in terms of existing operators. For example, the gradient of *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator. Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation.
+
+## The Current Implementation
+
+Instances of the C++ class `OpInfo` are stored in an associative map whose key is the operator type. The `grad_op_type_` indicates the associated gradient operator type. An operator can create the gradient operator by invoking `OpInfo::creator_` of the gradient operator. The pseudo code is as follows
+
+```cpp
+struct OpInfo {
+  std::function<OperatorBase*(...)> creator_;
+  std::string grad_op_type_;
+  ...
+};
+
+std::map<std::string, OpInfo> OpInfoMap;
+
+OperatorBase* CreateGradientOperator(const OperatorBase& op) {
+  return OpInfoMap.at(op.Type()).creator_(...);
+}
+```
+
+## Proposed Solution
+
+The mapping relationship between an operator and its gradient operators is a function. The interface of this function is:
+
+```cpp
+// (OpDesc) --> vector<OpDesc>
+std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>;
+```
+
+The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for the protobuf message `OpDesc` for rapid manipulation of `OpDesc`.
+
+The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like
+
+```cpp
+struct OpInfo {
+  std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> grad_op_maker_;
+  ...
+};
+```
+
+The `grad_op_maker_ ` is a `nullptr` if the operator does not have any associated gradient operators.
+
+We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
+
+```cpp
+class GradOpDescMakerBase {
+ public:
+  GradOpDescMakerBase(const OpDescBind&);
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
+};
+```
+
+We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
+
+```cpp
+using GradOpMaker = ...;
+std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> func;
+func = [](const OpDescBind& fwd_op) {
+  GradOpMaker maker(fwd_op);
+  return maker();
+};
+```
+
+We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator.
+
+We should change the registration macros at the same time. In the current solution, there is no difference between forward operators and backward operators, so `REGISTER_OP` just registers one operator. If `REGISTER_OPERATOR` contains an `OpProtoAndCheckerMaker` and a `GradOpDescMaker`, we just list them in the same macro. This can be done by a macro that uses `__VA_ARGS__`.
+
+The user interface should be
+
+```cpp
+vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
+REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, MinusOpGradMaker);
+// Developers can still manually implement gradient operator.
+REGISTER_OPERATOR(minus_grad, MinusGradOp);
+```
+
+The interface of the current `REGISTER_OP` macro does not need to be changed. Internally, `REGISTER_OP` will invoke `REGISTER_OPERATOR` twice and generate a `GradOpDescMaker` inside.
+
+```cpp
+REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
+```
diff --git a/doc/fluid/design/modules/regularization.md b/doc/fluid/design/modules/regularization.md
new file mode 100644
index 0000000000000000000000000000000000000000..519a9143033386678351ff78a465e5ba6e220c52
--- /dev/null
+++ b/doc/fluid/design/modules/regularization.md
@@ -0,0 +1,66 @@
+# Regularization in PaddlePaddle
+
+## Introduction to Regularization
+A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. A frequently faced problem is **overfitting**, where the model does not make reliable predictions on new, unseen data. **Regularization** is the process of introducing additional information in order to prevent overfitting. This is usually done by adding extra penalties to the loss function that restrict the parameter space that an optimization algorithm can explore.
+
+### Parameter Norm Penalties
+Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
+
+$$\hat{J}(\theta; X, y) = J(\theta; X, y) + \alpha \Omega(\theta)$$
+
+The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.
+
+The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
+
+##### L2 Regularization:
+
+$$\Omega(\theta) = \frac{1}{2} \lVert w \rVert_2^2$$
+
+##### L1 Regularization:
+
+$$\Omega(\theta) = \lVert w \rVert_1 = \sum_i \lvert w_i \rvert$$
+
+A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
+
+## Regularization Survey
+
+A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey).
+
+## Proposal for Regularization in PaddlePaddle
+
+### Low-Level implementation
+
+In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations:
+- L2_regularization_op
+- L1_regularization_op
+
+These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
+
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
+
+### Computation Graph
+
+Below is an example of a really simple feed forward neural network.
+
+
+
+The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
+
+
+
+### Python API implementation for Regularization
+
+Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
+
+#### Creation of Regularization ops
+There are two possibilities for creating the regularization ops:
+1. We create these ops immediately while building the computation graph.
+2. We add these ops in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
+
+The proposal is to add these ops in a lazy manner just before the backward pass.
+
+#### Storage of Regularization attributes
+
+Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters.
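+
+A hedged Python sketch of this idea (the attribute and helper names below are illustrative assumptions, not the final API): the layer stores the regularization attribute on the `Parameter`, and a helper expands it into concrete ops just before the backward pass.
+
+```python
+def create_parameter(block, shape, dtype, regularizer=None):
+    p = block.create_parameter(shape=shape, dtype=dtype)
+    p.regularizer = regularizer  # e.g. ("L2", 1e-4); only stored for now
+    return p
+
+def append_regularization_ops(block, parameters):
+    """Called once, just before the backward pass is appended."""
+    for p in parameters:
+        if p.regularizer is None:
+            continue
+        kind, coeff = p.regularizer
+        op_type = "L2_regularization_op" if kind == "L2" else "L1_regularization_op"
+        block.append_op(type=op_type, input=p, attrs={"coeff": coeff})
+```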
+
+#### High-level API
+
+In the PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for now. A good reference for such APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at TensorFlow's [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
diff --git a/doc/fluid/design/modules/selected_rows.md b/doc/fluid/design/modules/selected_rows.md
new file mode 100644
index 0000000000000000000000000000000000000000..1a98839a957612b91b2276b58818623ecc62d1d5
--- /dev/null
+++ b/doc/fluid/design/modules/selected_rows.md
@@ -0,0 +1,74 @@
+# Design Doc: Selected Rows
+
+`SelectedRows` is a sparse tensor data type, which is designed to support `embedding` operators. The gradient of an embedding table is a sparse tensor in which only a few rows have non-zero values. It is straightforward to represent such a sparse tensor by the following data structure:
+
+```cpp
+class SelectedRows {
+ private:
+  std::vector<int> rows_;
+  Tensor value_;
+  int height_;
+};
+```
+
+The field `height_` is the first dimension of `SelectedRows`. The `rows_` are the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The shape of `SelectedRows` is `[height_] + value_.shape[1:]`.
+
+Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`. The `SelectedRows` representation would be:
+
+```
+x = SelectedRows {
+  rows = [73, 84],
+  value = [[1, 2], [3, 4]]
+}
+```
+
+
+## SelectedRows in Protobuf
+
+`SelectedRows` is a type of `Variable`. The `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimensions of a `SelectedRows` will be described at compile time, because `rows_` and `value_` depend on the training data.
+So we use `TensorDesc` to unify `data_type` and `dims`. A `LoDTensorDesc` contains a `TensorDesc` and a `lod_level`. The description of `SelectedRows` is just a tensor description.
+
+```proto
+message TensorDesc {
+  required DataType data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+
+message LodTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2;
+}
+
+message VarDesc {
+  required string name = 1;
+  enum VarType {
+    LOD_TENSOR = 0;
+    SELECTED_ROWS = 1;
+  }
+  required VarType type = 2;
+  optional LodTensorDesc lod_desc = 3;
+  optional TensorDesc selected_rows_desc = 4;
+  optional bool persistable = 5 [ default = false ];
+}
+```
+
+## InferShape for Selected Rows
+
+Just like the `LoD` information, the `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` tensor or a dense tensor.
+
+For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should look like the following:
+
+```cpp
+void TableLookupGrad::InferShape(context) {
+  ...
+  context.SetDataType("Embedding.Grad", kSelectedRows);
+}
+```
+
+
+## Sparse Operators
+
+There are several operators that need to be written to support `SelectedRows`. These are:
+
+1. Operators which generate a `SelectedRows` gradient, e.g., the gradient of `TableLookupOp`.
+2. Optimize operators which support a `SelectedRows` gradient, e.g., `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator; `OpWithKernel::Run` should select a suitable kernel for both dense tensors and `SelectedRows`, as sketched below.
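+
+For intuition, here is a hedged NumPy sketch (not Paddle code) of what an SGD update over a `SelectedRows` gradient does: only the rows listed in `rows_` are touched.
+
+```python
+import numpy as np
+
+def sgd_update_with_selected_rows(param, grad_rows, grad_value, lr):
+    """param: dense [height, width] array; grad_rows: non-zero row indices;
+    grad_value: [len(grad_rows), width] values for those rows."""
+    for i, r in enumerate(grad_rows):
+        param[r] -= lr * grad_value[i]
+    return param
+
+w = np.zeros((100, 2))
+w = sgd_update_with_selected_rows(w, [73, 84], np.array([[1., 2.], [3., 4.]]), lr=0.1)
+# Only rows 73 and 84 change; every other row stays zero.
+```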
diff --git a/doc/fluid/design/motivation/api.md b/doc/fluid/design/motivation/api.md
new file mode 100644
index 0000000000000000000000000000000000000000..bc222564e3ec28e306ca0572b6a23104f6e9cbc5
--- /dev/null
+++ b/doc/fluid/design/motivation/api.md
@@ -0,0 +1,261 @@
+# PaddlePaddle Design Doc
+
+## Ingredients
+
+Our design principle is to start from the essence: how can we
+allow users to express and solve their problems as neural networks?
+Some essential concepts that our API has to provide include:
+
+1. A *topology* is an expression of *layers*.
+
+1. A layer could be any kind of computation, including *cost*.
+
+1. Some layers have parameters, some don't. Most costs don't have
+ parameters.
+
+1. In some topologies, layers share parameters. For
+ example,
+ [the network for training a ranking model](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850).
+
+1. At programming time, users specify topologies and possible sharing
+ of parameters. PaddlePaddle can figure out and create parameters
+ required (and possibly shared) by one or more topologies.
+
+
+## Starting from Examples
+
+As a summary
+of
+[our discussion](https://github.com/PaddlePaddle/Paddle/issues/1315),
+let us present two examples here:
+
+
+### Example 1. Sharing Parameters between Layers
+
+We use
+the
+[3-branch ranking](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850) model
+in this example. For your convenience, I copy-paste the model's
+topology as follows:
+
+```
+A -> f -\
+Q -> f --> cost
+B -> f -/
+```
+
+The following program trains the topology including the cost, and then
+uses the sub-network of the trained topology for inference:
+
+```python
+def f(input):
+    e = paddle.layer.embedding(input, parameter_name="embedding")
+    o = paddle.layer.softmax(e, parameter_name="semantic")
+    return o
+
+# Create 3 topologies (subnets), they share parameters because all
+# corresponding layers have the same parameter names.
+fA = f(paddle.layer.data(input_name="A"))
+fB = f(paddle.layer.data(input_name="B"))
+fQ = f(paddle.layer.data(input_name="Q"))
+
+topology = paddle.layer.less_than(
+    paddle.layer.cross_entropy(fA, fQ),
+    paddle.layer.cross_entropy(fB, fQ))
+
+# Derive parameters required in topology and create them in model.
+parameters = paddle.parameters.create(topology)
+
+# Estimate parameters used in topology from data.
+paddle.train(topology, parameters, reader=read_ranking_model_data)
+
+# Inference using fA (or fB or fQ, as they share their parameters).
+[testA, testB, testQ] = read_ranking_model_data()
+print "The sematic-vector of testA: ", paddle.infer(fA, parameters, testA)
+```
+
+
+### Example 2. Sharing Parameters between "Models"
+
+We use GAN in this example. In the following example program, `d0` and `d1`
+correspond to the two networks in the following figure:
+
+
+
+```python
+def G(input):
+    # over-simplified example as G has only one layer:
+    return paddle.layer.fc(input, parameter_name="G")
+
+def D(input):
+    # again, over-simplified:
+    return paddle.layer.fc(input, parameter_name="D")
+
+# Construct the first topology, which contains both D and G.
+# By learning this topology, we update parameters of G.
+d0 = paddle.layer.should_be_false(D(G(paddle.layer.data())))
+
+# Construct a second topology d1, which contains only D. By
+# training this topology, we update parameters of D. Note
+# that d1 shares parameters with d0.
+d1 = paddle.layer.should_be_true(D(paddle.layer.data()))
+
+# Create parameters from a list of multiple topologies (models) for
+# the chance to share parameters between these topologies.
+parameters = paddle.parameters.create([d0, d1])
+
+# Iterative training of GAN.
+for ...:
+    train(d0, parameters, reader=read_from_rng, immutable_parameters={"D"})
+    train(d1, parameters, reader=read_from_realistic_images)
+
+# Use d1 for inference:
+print "D thinks a batch of images are realistic ", infer(d1, parameters, read_mnist_images)
+```
+
+
+### Summarization
+
+
+The above two programs reveal some important design concerns:
+
+1. Users describe a topology as an expression of layers. Every layer
+ has a *parameter name*. If the users don't specify it explicitly, it's automatically generated as a unique name. By
+ specifying the parameter name, users can specify the sharing of
+ parameters between layers and even between topologies.
+
+1. `paddle.parameters.create` figures out parameters required by one
+ or more topologies from parameter names of layers. It creates these
+ parameters and returns a `ParameterSet` object, which is in essence
+ a map from *parameter names* to *parameters*.
+
+1. At training and inference time, `paddle.train` and `paddle.infer`
+   require both a topology and the parameter set that holds the parameters of that topology. There are several reasons for this:
+
+ 1. This prevents users from forgetting to call
+ `paddle.parameters.create`.
+ 1. `paddle.train` needs to know which parameter set to update.
+ 1. Users could load another (pre-trained) parameter set and use it
+      with a topology in `paddle.infer`.
+
+1. By specifying the `immutable_parameters` parameter of
+ `paddle.train`, we can forbid the update of these parameters.
+
+
+## Reader
+
+Not all programming frameworks allow users to define I/O functions.
+An example is Google MapReduce, which can only read from text,
+SSTable, and RecordIO files. Hadoop MapReduce allows users to define
+readers and writers by deriving from base classes `Reader` and
+`Writer`. The former is less flexible but also less error-prone. We
+decided to provide users the flexibility to define their own readers.
+
+
+There are some open questions here:
+
+1. **Should a reader return a Python dictionary?**
+
+1. **How to map multiple outputs from a reader to multiple data layers?**
+
+1. **How to easily compose some existing readers to read more data and
+ feed a topology with more data layers?**
+
+
+## Training
+
+The recommended way to train a model is to call `paddle.train`,
+which simply calls `paddle.trainer.Default`, a global variable of
+type `paddle.trainer.SGD`. Equivalently, we can do
+
+```python
+opt = paddle.trainer.SGD(..., paddle.updater.Adam(...))
+opt.train(topology, parameters, reader=read, ...)
+```
+
+### Updater
+
+Please be aware that a trainer can accept an updater as its data
+member, where an updater is a class derived from
+`paddle.trainer.Updater`. This is to make it easier to customize
+trainers, as discussed
+[here](https://github.com/PaddlePaddle/Paddle/issues/1319).
+
+### Event Handler
+
+`paddle.train` and `paddle.trainer.XXX.train` take an optional
+parameter `event_handler`, which should be either `None` or a function
+that handles some events:
+
+1. BeginTraining
+1. EndTraining
+1. BeginIteration
+1. EndIteration
+1. BeginPass
+1. EndPass
+
+where EndPass is sent if and only if the reader yields
+`end_pass=True`.
+
+An example as follows:
+
+```python
+def event_handler(event):
+    if isinstance(event, paddle.event.EndIteration):
+        print paddle.test(...)
+
+paddle.train(topology, parameters, reader, event_handler)
+```
+
+If we are writing a PaddlePaddle program in and for IPython/Jupyter,
+we can use matplotlib in the event handler to plot a curve of
+cost/error versus iterations, as shown
+[here](https://blog.dominodatalab.com/interactive-dashboards-in-jupyter/).
+
+### Distributed Training
+
+If users want to do distributed training on a cluster, they should
+call `paddle.dist_train` and provide access tokens to the cluster as
+a parameter.
+
+For example, if the user has a TLS certificate that allows them to
+access a Kubernetes cluster, they should be able to call
+
+```python
+paddle.dist_train(model,
+                  trainer=paddle.trainer.SGD(...,
+                                             paddle.updater.Adam(...)),
+                  reader=read,
+                  k8s_user="yi",
+                  k8s_token="kube_cluster_tls.pem",
+                  k8s_job="hello",
+                  num_parameter_servers=15)
+```
+
+The pseudo code of `paddle.dist_train` is as follows:
+
+```python
+def dist_train(topology, parameters, trainer, reader, ...):
+    if os.getenv("KUBERNETES_SERVICE_HOST") is None:
+        image_name = k8s_user + '/' + k8s_job
+        docker_build(image_name)
+        docker_push()
+        kube_ctrl_start_job(image_name, k8s_user, k8s_token)
+    else:
+        rank = kube_list_containers_in_job_and_return_current_containers_rank()
+        if rank == 0:
+            master()
+        elif rank < 15:
+            parameter_server()
+        else:
+            trainer.train(model, reader=read)
+```
+
+Please be aware that if a process is running on the Kubernetes
+cluster, it will have some environment variables pre-defined.
+
+If `dist_train` doesn't see these environment variables, it knows
+that it's running on users' personal computer, and it should work as a
+*launcher*. Otherwise, it knows that it's running on the cluster and
+needs to figure out its role as either the master, a trainer, or a
+parameter server.
diff --git a/doc/fluid/design/motivation/fluid-compiler.graffle b/doc/fluid/design/motivation/fluid-compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..c933df2cb855462c52b2d25f7f9a99b95652961d
Binary files /dev/null and b/doc/fluid/design/motivation/fluid-compiler.graffle differ
diff --git a/doc/fluid/design/motivation/fluid-compiler.png b/doc/fluid/design/motivation/fluid-compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b0ffed2039c91a3a00bbb719da08c91c3acf7bb
Binary files /dev/null and b/doc/fluid/design/motivation/fluid-compiler.png differ
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b7696cc1bbf57ace72c4d31ffc2bfe6c1071939
--- /dev/null
+++ b/doc/fluid/design/motivation/fluid.md
@@ -0,0 +1,140 @@
+# Design Doc: PaddlePaddle Fluid
+
+## Why Fluid
+
+When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe. However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework?
+
+Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution, which describes the "process" of training or inference using the concept of a model. In fact in PyTorch, TensorFlow Eager Execution and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid is currently more extreme in the above mentioned idea than PyTorch and Eager Execution, and we are trying to push Fluid towards the directions of a compiler and a new programming language for deep learning.
+
+## The Evolution of Deep Learning Systems
+
+Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
+
+| Existed since | model as sequence of layers | model as graph of operators | No model |
+|---------------|-----------------------------|-----------------------------|----------|
+| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
+| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
+| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+
+From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
+
+## Deep Learning Programming Paradigms
+
+With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
+
+```python
+x = layer.data("image")
+l = layer.data("label")
+f = layer.fc(x, W)
+s = layer.softmax(f)
+c = layer.mse(l, s)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    forward({input=x, data=m}, minimize=c)
+    backward(...)
+
+print W # print the trained model parameters.
+```
+
+The above program includes two parts:
+
+1. The first part describes the model, and
+2. The second part describes the training process (or inference process) for the model.
+
+This paradigm has a well-known problem that limits the productivity of programmers. If the programmer made a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is located blocks away from the actual error prompt.
+
+This problem of being hard to debug and re-iterate fast on a program is the primary reason that programmers, in general, prefer PyTorch over the older systems. Using PyTorch, we would write the above program as following:
+
+```python
+W = tensor(...)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    x = m["image"]
+    l = m["label"]
+    f = layer.fc(x, W)
+    s = layer.softmax(f)
+    c = layer.mse(l, s)
+    backward()
+
+print W # print the trained model parameters.
+```
+
+We can see that the main difference is moving the model configuration part (the first step) into the training loop. This change allows mistakes in the model configuration to be reported where they actually appear in the programming block. This change also represents the model, or its forward pass, better by keeping the configuration process in the training loop.
+
+## Describe Arbitrary Models for the Future
+
+Describing the process instead of the model also brings Fluid the flexibility to define non-standard models that haven't been invented yet.
+
+As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator. A PyTorch example would look like the following:
+
+```python
+for i in xrange(1000):
+    m = read_minibatch()
+    x = m["sentence"]
+    for t in xrange(len(x)):
+        h[t] = the_step(x[t])
+```
+
+With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following:
+
+```python
+train_loop = layers.While(cond)
+with train_loop.block():
+    m = read_minibatch()
+    x = m["sentence"]
+    rnn = layers.While(...)
+    with rnn.block():
+        h[t] = the_step(input[t])
+```
+
+An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/bde090a97564b9c61a6aaa38b72ccc4889d102d9/python/paddle/fluid/tests/unittests/test_while_op.py#L50-L58).
+
+From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
+
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.
+
+## Turing Completeness
+
+In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine. For a programming language, if it provides if-then-else and loops, it is Turing complete. From the above examples, Fluid seems to be Turing complete; however, it is worth noticing that there is a slight difference between the `if-then-else` of Fluid and that of a programming language: the former runs both of its branches and splits the input mini-batch into two -- one part for the True condition and another for the False condition, as illustrated below. It has not been researched in depth whether this is equivalent to the `if-then-else` of programming languages that makes them Turing complete. Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case, but this needs to be looked into in depth.
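+
+A conceptual NumPy sketch (not the Fluid API) of this behavior: the mini-batch is split by the condition, both branches run on their own slice, and the results are merged back in the original order.
+
+```python
+import numpy as np
+
+def if_then_else(x, cond, true_fn, false_fn):
+    out = np.empty_like(x)
+    out[cond] = true_fn(x[cond])    # rows where the condition holds
+    out[~cond] = false_fn(x[~cond]) # rows where it does not
+    return out
+
+x = np.array([-2.0, 3.0, -0.5, 4.0])
+y = if_then_else(x, x > 0, lambda t: t * 10, lambda t: -t)
+print(y)  # [ 2.  30.   0.5  40.]
+```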
+
+## The Execution of a Fluid Program
+
+There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
+
+Fluid is moving towards the direction of a compiler, which is explained in [fluid_compiler.md](fluid_compiler.md).
+
+## Backward Compatibility of Fluid
+
+Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference. For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph). Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators. The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators.
+
+For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
diff --git a/doc/fluid/design/motivation/fluid_compiler.md b/doc/fluid/design/motivation/fluid_compiler.md
new file mode 100644
index 0000000000000000000000000000000000000000..6dd3840a0734e8593890dcf8044746197350c6f5
--- /dev/null
+++ b/doc/fluid/design/motivation/fluid_compiler.md
@@ -0,0 +1,110 @@
+# PaddlePaddle Fluid: Towards a Compiled Programming Language
+
+As described in [fluid.md](fluid.md), when a Fluid application program
+runs, it generates a `ProgramDesc` protobuf message as an intermediate
+representation of itself. The C++ class `Executor` can run this
+protobuf message as an interpreter. This article describes the Fluid
+compiler.
+
+
+
+## ProgramDesc
+
+Before we go deeper into the idea of compiled language, let us take a
+look at a simple example Fluid application.
+
+```python
+import "fluid"
+
+func paddlepaddle() {
+  X = fluid.read(...)
+  W = fluid.Tensor(...)
+  Y = fluid.mult(X, W)
+}
+```
+
+This program consists of a [block](../concepts/block.md) of three operators --
+`read`, `assign`, and `mult`. Its `ProgramDesc` message looks like
+the following
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, W, Y],
+    ops = [
+      read(output = X)
+      assign(input = ..., output = W)
+      mult(input = {X, W}, output = Y)
+    ],
+  }
+}
+```
+
+## Transpilers
+
+We can write a transpiler program that takes a `ProgramDesc`, e.g.,
+the above one, and outputs another `ProgramDesc`. Let us take some
+examples:
+
+1. *Memory optimization transpiler*: We can write a transpiler that
+ inserts some `FreeMemoryOp`s in the above example `ProgramDesc` so
+ to free memory early, before the end of an iteration, so to keep a
+ small memory footprint.
+
+1. *Distributed training transpiler*: We can write a transpiler that
+   converts a `ProgramDesc` into its distributed version of two
+ `ProgramDesc`s -- one for running by the trainer processes and the
+ other for the parameter server.
+
+In the rest of this article, we talk about a special kind of
+transpiler, *Native code generator*, which takes a `ProgramDesc` and
+generates a `.cu` (or `.cc`) file, which could be built by C++
+compilers (gcc, nvcc, icc) into binaries.
+
+## Native Code Generator
+
+For the above example, the native code generator transpiler, say, the
+CUDA code generator, should generate a `main` function:
+
+```c++
+void main() {
+  auto X = fluid_cuda_read(...);
+  auto W = fluid_cuda_create_tensor(...);
+  auto Y = fluid_cuda_mult(X, W);
+}
+```
+
+and the definitions of functions `fluid_cuda_read`,
+`fluid_cuda_create_tensor`, and `fluid_cuda_mult`. Please be aware
+that each function could just define a C++ instance of an operator and
+run it. For example
+
+```c++
+paddle::Tensor fluid_cuda_read(...) {
+  paddle::Tensor t;
+  paddle::operator::Read r(&t, ...);
+  r.Run();
+  return t;
+}
+```
+
+For computational operators that have multiple *kernels*, each for a
+specific hardware platform, for example, the `mult` operator, the
+generated code should call its CUDA kernel:
+
+```c++
+paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a,
+                               const paddle::Tensor& b) {
+  paddle::Tensor t;
+  paddle::operator::Mult m(a, b, ...);
+  m.Run(cuda_context);
+  return t;
+}
+```
+
+where `cuda_context` could be a global variable of type
+`paddle::CUDADeviceContext`.
+
+## Multi-Block Code Generation
+
+Most Fluid application programs may have more than one block. To
+execute them, we need to trace [scopes](../concepts/scope.md).
diff --git a/doc/fluid/design/motivation/index_cn.rst b/doc/fluid/design/motivation/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7706e73eca644ed6db772fd77da947395313237f
--- /dev/null
+++ b/doc/fluid/design/motivation/index_cn.rst
@@ -0,0 +1,10 @@
+设计动机和目标
+-------------
+
+.. toctree::
+ :maxdepth: 1
+
+ api.md
+ refactorization.md
+ fluid.md
+ fluid_compiler.md
diff --git a/doc/fluid/design/motivation/index_en.rst b/doc/fluid/design/motivation/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..10b64b257c604ced6b957d6d6018e8a363f00fac
--- /dev/null
+++ b/doc/fluid/design/motivation/index_en.rst
@@ -0,0 +1,10 @@
+Design Motivations and Goals
+--------------------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ api.md
+ refactorization.md
+ fluid.md
+ fluid_compiler.md
diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad9d0f6d3f3ad9884f108826e8410871fffd51bf
--- /dev/null
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -0,0 +1,275 @@
+# Design Doc: Refactorization Overview
+
+The goals of refactoring include:
+
+1. Making it easy for external contributors to write new elementary computation operations.
+1. Making the codebase clean and readable.
+1. Designing a new computation representation -- a computation graph of operators and variables.
+1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs.
+
+## Computation Graphs
+
+1. PaddlePaddle represents the computation, training and inference of deep learning models by computation graphs.
+
+ 1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/others/graph.md) for a concrete example.
+
+1. Users write Python programs to describe the graphs and run them (locally or remotely).
+
+1. A graph is composed of *variables* and *operators*.
+
+1. The description of graphs must be serializable/deserializable, so that:
+
+ 1. It can be sent to the cloud for distributed execution, and
+ 1. It can be sent to clients for mobile or enterprise deployment.
+
+1. The Python program does two things
+
+ 1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to
+ 1. the C++ library `libpaddle.so` for local execution,
+ 1. the master process of a distributed training job for training, or
+ 1. the server process of a Kubernetes serving job for distributed serving.
+ 1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L70), according to the protobuf message.
+
+## Description and Realization of Computation Graph
+
+At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph.
+
+At runtime, the C++ program realizes the graph and runs it.
+
+|  | Representation (protobuf messages) | Realization (C++ class objects) |
+|-----------|------------------------------------|---------------------------------|
+| Data | VarDesc | Variable |
+| Operation | OpDesc | Operator |
+| Block | BlockDesc | Block |
+
+The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables, similar to a C++/Java program block or a pair of braces (`{` and `}`).
+
+## Compilation and Execution
+
+1. Run a Python program to describe the graph. In particular, the Python application program does the following:
+
+ 1. Create `VarDesc` to represent local/intermediate variables,
+ 1. Create operators and set attributes,
+ 1. Validate attribute values,
+ 1. Infer the type and the shape of variables,
+ 1. Plan memory-reuse for variables,
+ 1. Generate the backward graph
+ 1. Add optimization operators to the computation graph.
+ 1. Optionally, split the graph for distributed training.
+
+1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following:
+
+ 1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md) for each run of a block,
+ 1. realize local variables defined in the BlockDesc message in the new scope,
+ 1. a scope is similar to the stack frame in programming languages,
+
+ 1. Create an instance of class `Block`, in which,
+ 1. realize operators in the BlockDesc message,
+
+ 1. Run the Block by calling
+ 1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
+ 1. `Block::Eval(vector<Operator>* targets)` for optimization.
+
+
+## Intermediate Representation (IR)
+
+```text
+Compile Time -> IR -> Runtime
+```
+
+### Benefits of IR
+
+- Optimization
+ ```text
+ Compile Time -> IR -> Optimized IR -> Runtime
+ ```
+- Automatically send partitioned IR to different nodes.
+ - Automatic Data Parallelism
+ ```text
+ Compile Time
+ |-> Single GPU IR
+ |-> [trainer-IR-0, trainer-IR-1, pserver-IR]
+ |-> Node-0 (runs trainer-IR-0)
+ |-> Node-1 (runs trainer-IR-1)
+ |-> Node-2 (runs pserver-IR)
+ ```
+ - Automatic Model Parallelism (planned for future)
+
+---
+
+## Operator/OpWithKernel/OpKernel
+
+
+
+---
+
+## Operator
+
+
+* `Operator` is the fundamental building block of the user interface.
+ * Operator stores input/output variable names and attributes.
+ * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables.
+ * Use `Run` to compute the `output` variables from the `input` variables.
+
+---
+
+## OpWithKernel/Kernel
+
+
+
+* `OpWithKernel` inherits `Operator`.
+* `OpWithKernel` contains a Kernel map.
+  * `OpWithKernel::Run` gets the device's kernel and invokes `OpKernel::Compute`.
+  * `OpKernelKey` is the map key. It currently contains only the device place, but may include the data type later.
+
+---
+
+## Why separate Kernel and Operator
+
+* Separate GPU and CPU code.
+ * Make Paddle capable of running without GPU.
+* Make one operator (which is a user interface) and create many implementations.
+  * For example, the same multiplication op can have different kernel implementations, such as an FP16 kernel, an FP32 kernel, an MKL kernel, and an Eigen kernel.
+---
+
+## Libraries for Kernel development
+
+* `Eigen::Tensor` contains basic math and element-wise functions.
+ * Note that `Eigen::Tensor` has broadcast implementation.
+ * Limit the number of `tensor.device(dev) = ` in your code.
+* `thrust::transform` and `std::transform`.
+ * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
+ * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
+* Hand-writing `GPUKernel` and `CPU` code
+ * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
+---
+## Operator Registration
+
+### Why is registration necessary?
+We need a method to build mappings between Op type names and Op classes.
+
+### How is registration implemented?
+By maintaining a map whose key is the type name and whose value is the corresponding Op constructor. A conceptual sketch is shown below.
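+
+A hedged, conceptual Python illustration of the registry idea (the real registry is the C++ `OpInfoMap`; the names below are illustrative only):
+
+```python
+OP_REGISTRY = {}
+
+def register_op(op_type):
+    def decorator(op_class):
+        OP_REGISTRY[op_type] = op_class  # map the type name to its constructor
+        return op_class
+    return decorator
+
+@register_op("scale")
+class ScaleOp(object):
+    def __init__(self, inputs, outputs, attrs):
+        self.inputs, self.outputs, self.attrs = inputs, outputs, attrs
+
+# Look up the constructor by type name and build an op instance.
+op = OP_REGISTRY["scale"](inputs=["X"], outputs=["Out"], attrs={"scale": 2.0})
+```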
+
+---
+## The Registry Map
+
+### `OpInfoMap`
+
+`op_type(string)` -> `OpInfo`
+
+`OpInfo`:
+
+- **`creator`**: The Op constructor.
+- **`grad_op_type`**: The type of the gradient Op.
+- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes.
+- **`checker`**: Used to check attributes.
+
+---
+## Related Concepts
+
+### Op_Maker
+Its constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L37))
+
+### Register Macros
+```cpp
+REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
+REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
+```
+
+---
+## Registration Process
+1. Write an Op class and its gradient Op class, if required.
+2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
+3. Invoke the macro `REGISTER_OP`. This macro will
+ 1. Call maker class to complete `proto` and `checker`
+ 2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
+
+---
+## Backward Module (1/2)
+### Create Backward Operator
+- Mapping from forward Op to backward Op
+
+
+---
+## Backward Module (2/2)
+### Build Backward Network
+- **Input**: a graph of forward operators
+- **Output**: a graph of backward operators
+- **Corner cases in construction**
+ - Shared Variables => insert an `Add` operator to combine gradients
+ - No Gradient => insert a `fill_zero_grad` operator
+ - Recursive NetOp => call `Backward` recursively
+ - RNN Op => recursively call `Backward` on stepnet
+
+
+---
+## Scope, Variable, Tensor
+
+* `Tensor` is an n-dimension array with type.
+ * Only dims and data pointers are stored in `Tensor`.
+ * All operations on `Tensor` are written in `Operator` or global functions.
+ * Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md)
+* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`.
+ * `step_scopes` in RNN is a variable and not a tensor.
+* `Scope` is where variables are stored.
+  * Essentially a `map<string, Variable>` from variable names to variables.
+ * `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
+
+---
+## Block (in design)
+### the difference between original RNNOp and Block
+- As an operator is more intuitive than `RNNOp`,
+- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
+- Fits the compile-time/ runtime separation design paradigm.
+  - During compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serializes them to a `BlockDesc`.
+  - When the graph executes, a `Block` with the `BlockDesc` is passed; it then creates `Op` and `Var` instances and invokes `Run`.
+
+---
+## Milestone
+- Take Paddle/books as the main line; the requirements of the models motivate the framework refactoring.
+- Model migration
+ - Framework development gives **priority support** to model migration, for example,
+ - the MNIST demo needs a Python interface,
+ - the RNN models require the framework to support `LoDTensor`.
+ - Determine some timelines,
+ - Frequently used Ops need to be migrated first,
+ - Different models can be migrated in parallel.
+- Improve the framework at the same time
+- Accept imperfection, concentrate on solving the specific problem at the right price.
+
+---
+## Control the migration quality
+- Compare the performance of migrated models with old ones.
+- Follow the Google C++ style guide.
+- Build the automatic workflow of generating Python/C++ documentations.
+ - The documentation of layers and ops should be written inside the code.
+ - Take the documentation quality into account when submitting pull requests.
+ - Preview the documentations, read and improve them from a user's perspective.
diff --git a/doc/fluid/design/multi_devices/index_cn.rst b/doc/fluid/design/multi_devices/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1f8439e8623e1c1ae9a12c24d08079f0ec3d761f
--- /dev/null
+++ b/doc/fluid/design/multi_devices/index_cn.rst
@@ -0,0 +1,9 @@
+多设备支持
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ operator_kernel_type.md
+ kernel_selection.md
+ kernel_hint_design.md
diff --git a/doc/fluid/design/multi_devices/index_en.rst b/doc/fluid/design/multi_devices/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..819e9c5d77b2abf8da0e2ce6f494ea5174c1d0a2
--- /dev/null
+++ b/doc/fluid/design/multi_devices/index_en.rst
@@ -0,0 +1,9 @@
+Multi-Device Support
+----------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ operator_kernel_type.md
+ kernel_selection.md
+ kernel_hint_design.md
diff --git a/doc/fluid/design/multi_devices/kernel_hint_design.md b/doc/fluid/design/multi_devices/kernel_hint_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..6edc14ca73b1abf824981b59511a9aca4e0f3b47
--- /dev/null
+++ b/doc/fluid/design/multi_devices/kernel_hint_design.md
@@ -0,0 +1,59 @@
+# Kernel Hint Design
+
+## Problem
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md), one Operator may have multiple kernels. Users may have a personal preference for a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel. We need to provide a way for users to do this.
+
+In the current design, we use KernelType to describe one kernel.
+
+```cpp
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+`place_`, `data_type_`, and `layout_` can be obtained from the operator's input tensors; `GetActualKernelType(inputs)` uses the inputs to infer the proper kernel key that fits the incoming data, but users cannot configure it directly.
+
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md) also provides a virtual method `GetExpectedKernelType` that users can override to choose the `KernelType` they want to use.
+
+So we should send the user-defined information in the proto to `GetExpectedKernelType` for choosing a kernel.
+
+The problem is, how should we define and send the information for `GetExpectedKernelType` to use?
+
+## Solution
+
+### Potential choice
+1. Do nothing: let users add whatever information they want to the operator's attributes and read it inside `GetExpectedKernelType`. This works, but users may define many different hints for the same purpose, such as `force_cpu`, `use_cpu`, and `cpu_kernel` to choose a CPU kernel, or `use_cudnn`, `force_cudnn`, and `cudnn_kernel` to choose a CUDNN kernel.
+
+2. Pre-define all the needed options and use a single attribute key such as `kernel_hint`. This is not flexible if the user wants to define additional kinds of hints.
+
+### Final choice
+To provide enough flexibility while avoiding confusing definitions, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, and `use_mkldnn`, for users to choose from.
+
+In C++
+
+```cpp
+const std::string kForceCPU = "force_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
+KernelType GetExpectedKernelType() {
+ if (Attr(kForceCPU)) {
+ return KernelType(CPUPlace, ...)
+ } else {
+ ...
+ }
+}
+```
+
+In Python code
+
+```python
+FORCE_CPU = core.kForceCPU()
+
+def xx_layer(..., force_cpu=False):
+ layer_helper = LayerHelper(...)
+ layer_helper.append_op(
+ type="xx",
+ attr={FORCE_CPU: force_cpu})
+```
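+
+To make the flow above concrete, the following is a self-contained Python sketch. The names (`FakeOp`, `xx_layer`, `expected_kernel_place`) are hypothetical and not the real Fluid API; the point is only to show how a single shared constant keeps the hint spelling consistent between the layer code and kernel selection.
+
+```python
+# Hypothetical sketch: a shared constant ties the Python-side attribute
+# to the kernel-selection logic; all names here are illustrative only.
+K_FORCE_CPU = "force_cpu"   # mirrors the C++ constant kForceCPU
+
+
+class FakeOp(object):
+    def __init__(self, op_type, attrs):
+        self.type = op_type
+        self.attrs = attrs
+
+    def expected_kernel_place(self):
+        # mirrors GetExpectedKernelType: honor the user's hint if present
+        return "CPUPlace" if self.attrs.get(K_FORCE_CPU, False) else "CUDAPlace"
+
+
+def xx_layer(force_cpu=False):
+    # the layer helper stores the hint as an operator attribute
+    return FakeOp("xx", {K_FORCE_CPU: force_cpu})
+
+
+op = xx_layer(force_cpu=True)
+assert op.expected_kernel_place() == "CPUPlace"
+```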
diff --git a/doc/fluid/design/multi_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md
new file mode 100644
index 0000000000000000000000000000000000000000..4d2aab87b8cf30d03075e96cc4c67070efaf963a
--- /dev/null
+++ b/doc/fluid/design/multi_devices/kernel_selection.md
@@ -0,0 +1,101 @@
+# Kernel Selection
+
+## Background
+Every operator has many kernels because there are multiple data types, places, data layouts, and library types that Fluid supports. We use `OpKernelType` to describe the kernel types that operators can hold.
+
+`OpKernelType` is defined as follows:
+
+```cpp
+struct OpKernelType {
+ Place place_;
+ DataType data_type_;
+ DataLayout data_layout_;
+ LibraryType library_type_;
+};
+```
+
+- The `place_` is a descriptor of the device, e.g., CPUPlace, CUDAPlace.
+
+- The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, there will be one major `data_type`. For example, `cross_entropy` takes `int64` as its label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float` or `double`.
+
+- The `data_layout_` is useful for some computational libraries. One example is that MKLDNN uses many kinds of layouts, such as `nChw8c`. Each kind of layout invokes a different kernel.
+
+- The `library_type_` describes the computational library, e.g., `MKLDNN`, `CUDNN`.
+
+## Problem
+
+Ideally, we would register a kernel for every operator and every kernel type. However, this is impracticable in the following situations.
+
+1. Some operators, like CRF, are complicated and inefficient to implement on GPU. The CRF operator will only have a CPU kernel.
+2. Some operators take too much memory. It is better to force them onto the CPU. However, the rest of the operators in the neural network will still be performed on GPU, i.e., a model-parallel problem.
+3. Some layouts and places are particular. One example is that MKLDNN uses `nChw8c`, and no other library uses `nChw8c`.
+
+To give a detailed explanation of one such situation, suppose we have two operators, OP1 and OP2. OP1 has one output, `op1_2_op2`, and `op1_2_op2` is the input of OP2.
+
+If OP1 and OP2 run on the same place (for example, CPUPlace), then `op1_2_op2` can be used directly by OP2.
+
+```
+OP1(CPUPlace)
+ |
+ op1_2_op2
+ |
+OP2(CPUPlace)
+```
+
+If OP1 and OP2 run on different places, then OP2 cannot use `op1_2_op2` directly.
+
+Problems in these situations are similar. We can formalize the problem as follows.
+
+We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The input of this operator comes with a kernel type $kt_{?}$, where $kt_{?} \notin KT$. How do we cast the input of this operator from $kt_{?}$ to one of the kernel types in $KT$?
+
+## Solution: data transform
+
+It is clear that transforming the inputs of an operator to adapt to another kernel type is not specific to any particular operator. So we should register these transformation methods as global methods (a conceptual sketch of such a registry appears at the end of this document).
+
+We can infer a kernel type for each input of an operator. We call this kernel type the `actual kernel type for var`, which means it is the kernel type that can process this input variable.
+
+We can also get a kernel type from 1) the configuration in the operator description (users may want to force the use of `MKL` for the `conv` operator), and 2) the place of the current executor (the executor is running on GPU). This kernel type is what we expect the operator to be performed on. We call this kernel type the `expected kernel type`.
+
+We transform the input data from `actual` to `expected` if the actual kernel type is not the same as the expected kernel type.
+
+The algorithm is described as follows:
+
+```cpp
+void OperatorWithKernel::Run(
+ const Scope& scope,
+ const platform::Place& place) const {
+ ExecutionContext ctx(...);
+ auto expected_kernel_key = this->GetExpectedKernelType(ctx);
+
+ Scope& new_scope = scope.NewScope();
+
+ for (auto& var_name : this->Inputs()) {
+ auto* tensor_in = GetTensor(var_name);
+ auto kernel_type_for_var = this->GetKernelTypeForVar(...);
+ if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
+ auto* trans_var = new_scope.Var(var_name);
+ auto* out = TransformData(expected_kernel_key,
+ kernel_type_for_var,
+ *tensor_in);
+ SetTensorToVariable(...);
+ }
+ }
+
+ auto kernel = kernels.find(expected_kernel_key);
+ kernel->Compute(ExecutionContext(...));
+}
+```
+
+Then the actual process for the multi-device case above will be:
+
+```
+OP1(CPUPlace)
+ |
+op1_2_op2(on CPU)
+ |
+[transform](from CPU to GPU)
+ |
+op1_2_op2(on GPU)
+ |
+OP2(CUDAPlace)
+```
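+
+As a conceptual illustration of the global registration mentioned above, here is a minimal Python sketch of a transform registry keyed by the actual and expected kernel types. It is not the actual C++ implementation: the real Fluid code uses `OpKernelType` keys, `Tensor`s, and device copies, and every name below is illustrative.
+
+```python
+# Hypothetical sketch of a global transform registry; transformations are
+# keyed by (actual, expected) pairs and are independent of any operator.
+TRANSFORMS = {}
+
+
+def register_transform(from_place, to_place):
+    def decorator(fn):
+        TRANSFORMS[(from_place, to_place)] = fn
+        return fn
+    return decorator
+
+
+@register_transform("CPUPlace", "CUDAPlace")
+def copy_cpu_to_gpu(tensor):
+    # placeholder: a real implementation would issue a host-to-device copy
+    return dict(tensor, place="CUDAPlace")
+
+
+def transform_data(expected_place, actual_place, tensor):
+    # no-op if the kernel types already match
+    if expected_place == actual_place:
+        return tensor
+    return TRANSFORMS[(actual_place, expected_place)](tensor)
+
+
+# move op1_2_op2 from the CPU kernel's output to the GPU kernel's input
+op1_2_op2 = {"data": [1.0, 2.0, 3.0], "place": "CPUPlace"}
+gpu_input = transform_data("CUDAPlace", "CPUPlace", op1_2_op2)
+```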
diff --git a/doc/fluid/design/multi_devices/operator_kernel_type.md b/doc/fluid/design/multi_devices/operator_kernel_type.md
new file mode 100644
index 0000000000000000000000000000000000000000..5e391bd62b4f4e123a9a6f35b7adf5726f205635
--- /dev/null
+++ b/doc/fluid/design/multi_devices/operator_kernel_type.md
@@ -0,0 +1,91 @@
+# Design Doc: The Keys of Operator Kernel Type
+## Problem
+An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
+
+```cpp
+struct OpKernelType {
+ platform::Place place_;
+ proto::DataType data_type_;
+};
+```
+For more details, please refer to the [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L348-L374) on GitHub.
+
+It contains two keys, `Place` and `DataType`, and these two keys are hashed into a unique key that represents a certain type of kernel. However, these two keys do not provide enough information, so we need a more complete representation of `OpKernelType`.
+
+We often implement an operator kernel with some computing library on a certain device (place). Please note that computing libraries and devices do not have a one-to-one correspondence: a device can support many computing libraries, and a computing library can also support different devices.
+
+For example, the Eigen library supports Nvidia GPU/AMD GPU/CPU, and the MKLDNN library supports Intel CPU/Intel FPGA. Both `Place` and `Library` should therefore be keys of `OpKernelType`.
+
+Different `DataType`s, such as fp64/fp32/int8, obviously require different kernels. But different data layouts of a Tensor also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration.
+
+## Solution
+
+There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
+
+```cpp
+struct OpKernelType {
+ platform::Place place_;
+ platform::Library library_;
+ proto::DataType data_type_;
+ framework::Layout layout_;
+};
+```
+
+The details are as follows:
+
+### Place
+
+`Place` is defined as:
+
+```cpp
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+```
+
+`Place` represents the device memory where data is located.
+
+
+### Library
+
+One operator kernel is usually implemented based on one library. `Library` is defined as an enum:
+
+```cpp
+enum Library { Plain, MKLDNN, CUDNN };
+```
+
+We use the `Plain` enumerator to represent the default library. Since most operators in Fluid are implemented based on the `Eigen` library, we treat the `Eigen` library as the `Plain` enumerator.
+A library usually has a corresponding `DeviceContext` which contains the handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle, and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle.
+
+If we want to support a new library, a new enumerator needs to be added to `Library`, and a corresponding new `LibraryDeviceContext` needs to be created.
+
+
+### DataType
+
+
+`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
+
+### Layout
+
+Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also need some other descriptions of this block of memory, such as shape (ddim), stride, and layout.
+
+Different layouts lead to different implementations of the operator kernel. There are mainly 4 principles we have to follow to support layouts in our Fluid framework.
+
+- We take layout as a data member of Tensor. Layout is actually an enum. If Fluid is built with MKLDNN, then the memory formats in MKLDNN will also be added to this enum.
+
+- Users have to set the layout for input data, and some operators, like fill_constant/random, also have to set the layout of the data they generate. Of course, we can have a default layout, like NCHW.
+
+- The inference of Layout is at run-time, not at compile-time.
+
+- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://intel.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators.
+
+`Layout` is also defined as an enum:
+
+```cpp
+enum Layout {
+ kNCHW,
+ kNHWC,
+#ifdef PADDLE_WITH_MKLDNN
+ knChw8c
+ ...
+#endif
+};
+```
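+
+To illustrate how the four keys jointly identify a kernel, here is a small conceptual Python sketch of a per-operator kernel map keyed by the hashable 4-tuple. It is not the C++ implementation, and all the names (the registry, the decorator, the sample kernel) are illustrative only.
+
+```python
+# Conceptual sketch only: the real registry is implemented in C++.
+from collections import namedtuple
+
+OpKernelType = namedtuple("OpKernelType",
+                          ["place", "library", "data_type", "layout"])
+
+# per-operator registry: OpKernelType -> kernel function
+conv2d_kernels = {}
+
+
+def register_conv2d_kernel(place, library, data_type, layout):
+    def decorator(fn):
+        conv2d_kernels[OpKernelType(place, library, data_type, layout)] = fn
+        return fn
+    return decorator
+
+
+@register_conv2d_kernel("CPUPlace", "MKLDNN", "fp32", "nChw8c")
+def conv2d_mkldnn_fp32(inputs):
+    ...  # an MKLDNN-specific implementation would go here
+
+
+# kernel selection hashes and compares the full 4-tuple
+key = OpKernelType("CPUPlace", "MKLDNN", "fp32", "nChw8c")
+kernel = conv2d_kernels[key]
+```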
diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md
new file mode 100644
index 0000000000000000000000000000000000000000..f32a5b7e8a4d820319a666dab4c3129360e2c924
--- /dev/null
+++ b/doc/fluid/design/network/deep_speech_2.md
@@ -0,0 +1,235 @@
+# DeepSpeech2 on PaddlePaddle: Design Doc
+
+We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine, on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
+
+- Release a basic distributed implementation of DS2 on PaddlePaddle.
+- Contribute a chapter of Deep Speech to PaddlePaddle Book.
+
+Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan.
+
+## Table of Contents
+
+- [Tasks](#tasks)
+- [Task Dependency](#task-dependency)
+- [Design Details](#design-details)
+ - [Overview](#overview)
+ - [Row Convolution](#row-convolution)
+ - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm)
+- [Future Work](#future-work)
+- [References](#references)
+
+## Tasks
+
+We roughly break down the project into 14 tasks:
+
+1. Develop an **audio data provider**:
+ - Json filelist generator.
+ - Audio file format transformer.
+ - Spectrogram feature extraction, power normalization etc.
+ - Batch data reader with SortaGrad.
+ - Data augmentation (optional).
+ - Prepare (one or more) public English data sets & baseline.
+2. Create a **simplified DS2 model configuration**:
+ - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*).
+ - With only bidirectional-GRU (otherwise need *Task 4*).
+ - With only greedy decoder (otherwise need *Task 5, 6*).
+3. Develop to support **variable-shaped** dense-vector (image) batches of input data.
+ - Update `DenseScanner` in `dataprovider_converter.py`, etc.
+4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details):
+ - Lookahead convolution windows.
+ - Within-row convolution, without kernels shared across rows.
+5. Build KenLM **language model** (5-gram) for beam search decoder:
+ - Use KenLM toolkit.
+ - Prepare the corpus & train the model.
+ - Create inference interfaces (for Task 6).
+6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT:
+ - Beam search with CTC.
+ - Beam search with external custom scorer (e.g. LM).
+ - Try to design a more general beam search interface.
+7. Develop a **Word Error Rate evaluator**:
+ - update `ctc_error_evaluator`(CER) to support WER.
+8. Prepare internal dataset for Mandarin (optional):
+ - Dataset, baseline, evaluation details.
+ - Particular data preprocessing for Mandarin.
+ - Might need cooperating with the Speech Department.
+9. Create **standard DS2 model configuration**:
+ - With variable-length audio sequences (need *Task 3*).
+ - With unidirectional-GRU + row-convolution (need *Task 4*).
+ - With CTC-LM beam search decoder (need *Task 5, 6*).
+10. Make it run perfectly on **clusters**.
+11. Experiments and **benchmarking** (for accuracy, not efficiency):
+ - With public English dataset.
+ - With internal (Baidu) Mandarin dataset (optional).
+12. Time **profiling** and optimization.
+13. Prepare **docs**.
+14. Prepare PaddlePaddle **Book** chapter with a simplified version.
+
+## Task Dependency
+
+Tasks parallelizable within phases:
+
+| Roadmap | Description | Parallelizable Tasks |
+| ------- | ----------- | -------------------- |
+| Phase I | Simplified model & components | Task 1 ~ Task 8 |
+| Phase II | Standard model & benchmarking & profiling | Task 9 ~ Task 12 |
+| Phase III | Documentation | Task 13 ~ Task 14 |
+
+Issues for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcome!
+
+## Design Details
+
+### Overview
+
+Traditional **ASR** (Automatic Speech Recognition) pipelines require great human effort devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, acoustic model, pronunciation model, language model, etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with a single deep network architecture. By scaling up both the data and model sizes, DS2 achieves a very significant performance boost.
+
+Please read the Deep Speech 2 \[[1](#references),[2](#references)\] papers for more background knowledge.
+
+The classical DS2 network contains 15 layers (from bottom to top):
+
+- **Two** data layers (audio spectrogram, transcription text)
+- **Three** 2D convolution layers
+- **Seven** uni-directional simple-RNN layers
+- **One** lookahead row convolution layer
+- **One** fully-connected layer
+- **One** CTC-loss layer
+
+
+![Deep Speech 2 network architecture](images/ds2_network.png)
+
+Figure 1. Architecture of the Deep Speech 2 network.
+
+
+We don't have to insist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar networks with different depths might also work well. As in \[[1](#references)\], the authors use a different depth (e.g. 2-2-3-1-1-1) for the final experiments.
+
+Key ingredients about the layers:
+
+- **Data Layers**:
+ - Frame sequences data of audio **spectrogram** (with FFT).
+ - Token sequences data of **transcription** text (labels).
+ - These two types of sequences do not have the same lengths; thus, a CTC-loss layer is required.
+- **2D Convolution Layers**:
+ - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. the temporal dimension).
+ - With striding for only the first convolution layer.
+ - No pooling for any convolution layer.
+- **Uni-directional RNNs**
+ - Uni-directional + row convolution: for low-latency inference.
+ - Bi-directional + without row convolution: if we don't care about the inference latency.
+- **Row convolution**:
+ - For looking only a few steps ahead into the features, instead of looking into a whole sequence as in bi-directional RNNs.
+ - Not necessary with bi-directional RNNs.
+ - "**Row**" means convolutions are done within each frequency dimension (row), with no convolution kernels shared across rows.
+- **Batch Normalization Layers**:
+ - Added to all above layers (except for data and loss layer).
+ - Sequence-wise normalization for RNNs: BatchNorm is only performed on the input-state projection and not the state-state projection, for efficiency considerations.
+
+
+| Required Components | PaddlePaddle Support | Need to Develop |
+| ------------------- | -------------------- | --------------- |
+| Data Layer I (Spectrogram) | Not supported yet. | TBD (Task 3) |
+| Data Layer II (Transcription) | `paddle.data_type.integer_value_sequence` | - |
+| 2D Convolution Layer | `paddle.layer.image_conv_layer` | - |
+| DataType Converter (vec2seq) | `paddle.layer.block_expand` | - |
+| Bi-/Uni-directional RNNs | `paddle.layer.recurrent_group` | - |
+| Row Convolution Layer | Not supported yet. | TBD (Task 4) |
+| CTC-loss Layer | `paddle.layer.warp_ctc` | - |
+| Batch Normalization Layer | `paddle.layer.batch_norm` | - |
+| CTC-Beam search | Not supported yet. | TBD (Task 6) |
+
+
+### Row Convolution
+
+TODO by Assignees
+
+### Beam Search with CTC and LM
+
+
+![CTC beam search decoder](images/beam_search.png)
+
+Figure 2. Algorithm for the CTC Beam Search Decoder.
+
+
+- The **Beam Search Decoder** for the DS2 CTC-trained network follows a similar approach to that in \[[3](#references)\], as shown in Figure 2, with two important modifications for the ambiguous parts:
+ - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, because one prefix may come from different paths;
+ - 2) the condition ```if l^+ not in A_prev then``` after the probability computation is removed, for it is hard to understand and seems unnecessary.
+- An **external scorer** is passed into the decoder to evaluate a candidate prefix during decoding, whenever a whitespace is appended in English decoding or any character is appended in Mandarin decoding (a simplified sketch follows this list).
+- Such an external scorer consists of a language model, a word count, or any other custom scorers.
+- The **language model** is built in Task 5, and its parameters should be carefully tuned to achieve the minimum WER/CER (c.f. Task 7).
+- This decoder needs to run with **high efficiency** for the convenience of parameter tuning and real-world speech recognition.
+
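+The following is a simplified Python sketch of the decoding loop with an external scorer hook. The helper names and the scorer weights are hypothetical, and the sketch deliberately omits the CTC blank/repeat bookkeeping and prefix-merging details of \[[3](#references)\]; it only illustrates where the external scorer is invoked.
+
+```python
+def external_score(prefix, lm, alpha=0.3, beta=0.1):
+    # hypothetical scorer: alpha * LM log-probability + beta * word count;
+    # lm is assumed to expose a log_prob(list_of_words) method
+    words = prefix.split()
+    return alpha * lm.log_prob(words) + beta * len(words)
+
+
+def beam_search(step_log_probs, lm, beam_size=5):
+    # step_log_probs: one {character: log_prob} dict per time step
+    beams = [("", 0.0)]  # (prefix, accumulated log score)
+    for log_probs in step_log_probs:
+        candidates = []
+        for prefix, score in beams:
+            for ch, logp in log_probs.items():
+                new_prefix, new_score = prefix + ch, score + logp
+                # invoke the external scorer whenever a whitespace is appended
+                if ch == " ":
+                    new_score += external_score(new_prefix, lm)
+                candidates.append((new_prefix, new_score))
+        # keep only the top `beam_size` prefixes
+        beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:beam_size]
+    return beams[0]
+```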
+
+## Future Work
+
+- Efficiency Improvement
+- Accuracy Improvement
+- Low-latency Inference Library
+- Large-scale benchmarking
+
+## References
+
+1. Dario Amodei, et al., [Deep Speech 2: End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
+2. Dario Amodei, et al., [Deep Speech 2: End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595.
+3. Awni Y. Hannun, et al., [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873.
diff --git a/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455
Binary files /dev/null and b/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg differ
diff --git a/doc/fluid/design/network/images/beam_search.png b/doc/fluid/design/network/images/beam_search.png
new file mode 100644
index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae
Binary files /dev/null and b/doc/fluid/design/network/images/beam_search.png differ
diff --git a/doc/fluid/design/network/images/ds2_network.png b/doc/fluid/design/network/images/ds2_network.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11
Binary files /dev/null and b/doc/fluid/design/network/images/ds2_network.png differ
diff --git a/doc/fluid/design/network/index_cn.rst b/doc/fluid/design/network/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3557d55fe4dbae1f712e0760ca15111ec6f6792d
--- /dev/null
+++ b/doc/fluid/design/network/index_cn.rst
@@ -0,0 +1,7 @@
+复杂网络设计
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ sequence_decoder.md
diff --git a/doc/fluid/design/network/index_en.rst b/doc/fluid/design/network/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..73a7137236bdf0548d35721609351d6deca3013b
--- /dev/null
+++ b/doc/fluid/design/network/index_en.rst
@@ -0,0 +1,7 @@
+Complex Network Design
+------------------------
+
+.. toctree::
+ :maxdepth: 1
+
+ sequence_decoder.md
diff --git a/doc/fluid/design/network/sequence_decoder.md b/doc/fluid/design/network/sequence_decoder.md
new file mode 100644
index 0000000000000000000000000000000000000000..b95773c50ca0dcbd1b93529332e035d4de90faa8
--- /dev/null
+++ b/doc/fluid/design/network/sequence_decoder.md
@@ -0,0 +1,229 @@
+# Design: Sequence Decoder Generating LoDTensors
+In tasks such as machine translation and visual captioning,
+a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time.
+
+This document describes how to implement the sequence decoder as an operator.
+
+## Beam Search based Decoder
+The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
+
+In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search. Due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard for users to customize.
+
+There are a lot of heuristic tricks in sequence generation tasks, so the flexibility of the sequence decoder is very important to users.
+
+During the refactoring of PaddlePaddle, some new concepts were proposed, such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor_array.md), that can better support sequence usage, and they can also help make the implementation of the beam-search-based sequence decoder **more transparent and modular**.
+
+For example, the RNN states, candidate IDs and probabilities of beam search can all be represented as `LoDTensor`s;
+the selected candidates' IDs in each time step can be stored in a `TensorArray` and `Pack`ed into the translated sentences.
+
+## Changing LoD's absolute offset to relative offsets
+The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level.
+
+The integers in each level represent the begin and end (exclusive) offsets of a sequence **in the underlying tensor**;
+let's call this format the **absolute-offset LoD** for clarity.
+
+The absolute-offset LoD can retrieve any sequence very quickly, but it fails to represent empty sequences. For example, a two-level LoD is as follows:
+```python
+[[0, 3, 9]
+ [0, 2, 3, 3, 3, 9]]
+```
+The first level tells us that there are two sequences:
+- the first's offset is `[0, 3)`
+- the second's offset is `[3, 9)`
+
+while on the second level, there are several empty sequences that both begin and end at `3`.
+It is impossible to tell which first-level sequence these empty second-level sequences belong to.
+
+There are many scenarios that rely on representing empty sequences; for example, in machine translation or visual captioning, an instance may have no translation, or a prefix may have an empty candidate set.
+
+So let's introduce another format of LoD:
+it stores **the offsets of the lower-level sequences** and is called the **relative-offset** LoD.
+
+For example, to represent the same sequences as the above data:
+
+```python
+[[0, 3, 6]
+ [0, 2, 3, 3, 3, 9]]
+```
+
+the first level represents that there are two sequences;
+their offsets in the second-level LoD are `[0, 3)` and `[3, 5)`.
+
+The second level is the same as in the absolute-offset example because its lower level is the underlying tensor.
+It is easy to see that the second sequence in the first-level LoD has two empty sequences.
+
+The following examples are based on relative-offset LoD.
+
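+To make the relative-offset format concrete, here is a small Python sketch (illustrative only) that counts the empty second-level sequences owned by each first-level sequence. The sample LoD below is an assumed encoding of the data above in which, for illustration, both empty sequences are assigned to the second top-level sequence.
+
+```python
+def empty_subseq_counts(lod):
+    # lod is a two-level relative-offset LoD: lod[0] indexes into the
+    # second-level sub-sequences, lod[1] indexes into the underlying tensor
+    top, low = lod
+    counts = []
+    for i in range(len(top) - 1):
+        begin, end = top[i], top[i + 1]
+        empty = sum(1 for j in range(begin, end) if low[j] == low[j + 1])
+        counts.append(empty)
+    return counts
+
+
+# illustrative relative-offset LoD: both empty sub-sequences belong to
+# the second top-level sequence
+lod = [[0, 2, 5],
+       [0, 2, 3, 3, 3, 9]]
+print(empty_subseq_counts(lod))  # [0, 2]
+```
+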
+## Usage in a simple machine translation model
+Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it.
+
+The model has an encoder that learns the semantic vector from a sequence, and a decoder that uses the semantic vector to generate new sentences.
+
+**Encoder**
+```python
+import paddle as pd
+
+dict_size = 8000
+source_dict_size = dict_size
+target_dict_size = dict_size
+word_vector_dim = 128
+encoder_dim = 128
+decoder_dim = 128
+beam_size = 5
+max_length = 120
+
+# encoder
+src_word_id = pd.data(
+ name='source_language_word',
+ type=pd.data.integer_value_sequence(source_dict_size))
+src_embedding = pd.embedding(size=source_dict_size, dim=word_vector_dim)
+
+src_word_vec = pd.lookup(src_embedding, src_word_id)
+
+encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
+
+encoder_ctx = pd.last_seq(encoder_out_seq)
+# encoder_ctx_proj is the learned semantic vector
+encoder_ctx_proj = pd.fc(
+ encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
+```
+
+**Decoder**
+
+```python
+def generate():
+ decoder = pd.while_loop()
+ with decoder.step():
+ decoder_mem = decoder.memory(init=encoder_ctx) # mark the memory
+ generated_ids = decoder.memory() # TODO init to batch_size s
+ generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s
+
+ target_word = pd.lookup(trg_embedding, generated_ids)
+ # expand encoder_ctx's batch to fit target_word's lod
+ # for example
+ # decoder_mem.lod is
+ # [[0 1 3],
+ # [0 1 3 6]]
+ # its tensor content is [a1 a2 a3 a4 a5]
+ # which means there are 2 sentences to translate
+ # - the first sentence has 1 translation prefixes, the offsets are [0, 1)
+ # - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
+ # the target_word.lod is
+ # [[0, 1, 6]
+ # [0, 2, 4, 7, 9, 12]]
+ # which means 2 sentences to translate, each has 1 and 5 prefixes
+ # the first prefix has 2 candidates
+ # the following has 2, 3, 2, 3 candidates
+ # the encoder_ctx_expanded's content will be
+ # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
+ encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
+ decoder_input = pd.fc(
+ act=pd.activation.Linear(),
+ input=[target_word, encoder_ctx_expanded],
+ size=3 * decoder_dim)
+ gru_out, cur_mem = pd.gru_step(
+ decoder_input, mem=decoder_mem, size=decoder_dim)
+ scores = pd.fc(
+ gru_out,
+ size=trg_dic_size,
+ bias=None,
+ act=pd.activation.Softmax())
+ # K is a configuration parameter
+ topk_scores, topk_ids = pd.top_k(scores, K)
+ topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
+
+ selected_ids, selected_generation_scores = decoder.beam_search(
+ topk_ids, topk_generated_scores)
+
+ # update the states
+ decoder_mem.update(cur_mem) # tells how to update state
+ generated_ids.update(selected_ids)
+ generated_scores.update(selected_generation_scores)
+
+ decoder.output(selected_ids)
+ decoder.output(selected_generation_scores)
+
+translation_ids, translation_scores = decoder()
+```
+The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates,
+returns the result of the beam search algorithm.
+
+In this way, users can customize anything on the input or output of beam search, for example:
+
+1. Make the corresponding elements in `topk_generated_scores` zero or some small values, and beam_search will discard those candidates.
+2. Remove some specific candidate in `selected_ids`.
+3. Get the final `translation_ids` and remove some translation sequences from it.
+
+The implementation of sequence decoder can reuse the C++ class: [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
+so the python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
+
+Both of them are two-level `LoDTensors`:
+
+- The first level represents `batch_size` of (source) sentences.
+- The second level represents the candidate ID sets for translation prefix.
+
+For example, there are 3 source sentences to translate, and they have 2, 3, and 1 candidates respectively.
+
+Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
+
+For example, the previous state:
+
+* LoD is `[0, 1, 3][0, 2, 5, 6]`
+* content of tensor is `a1 a2 b1 b2 b3 c1`
+
+the current state is stored in `encoder_ctx_expanded`:
+
+* LoD is `[0, 2, 7][0, 3, 5, 8, 9, 11, 11]`
+* the content is
+ - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times, once for each candidate)
+ - a2 a2
+ - b1 b1 b1
+ - b2
+ - b3 b3
+ - None (c1 has 0 candidates, so c1 is dropped)
+
+The benefit of the relative-offset LoD is that empty candidate sets can be represented naturally.
+
+The states in each time step can be stored in a `TensorArray` and `Pack`ed into a final `LoDTensor`. The corresponding syntax is:
+
+```python
+decoder.output(selected_ids)
+decoder.output(selected_generation_scores)
+```
+
+The `selected_ids` are the candidate ids for the prefixes, and will be `Pack`ed by `TensorArray` into a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents the generated sequences.
+
+Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate.
+
+Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
+
+## LoD and shape changes during decoding
+
+![LoD and shape changes during decoding](images/LOD-and-shape-changes-during-decoding.jpg)
+
+According to the image above, the only phase that changes the LoD is beam search.
+
+## Beam search design
+The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs:
+
+1. `topk_ids`, the top K candidate ids for each prefix.
+2. `topk_scores`, the corresponding scores for `topk_ids`
+3. `generated_scores`, the score of the prefixes.
+
+All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
+
+It will return three variables:
+
+1. `selected_ids`, the final candidates that the beam search function selected for the next step.
+2. `selected_scores`, the scores for the candidates.
+3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
+
+## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
+The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step,
+so it is natural to store them in arrays.
+
+Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`.
+
+The `Pack` and `UnPack` methods in `TensorArray` are used to pack the tensors in the array into one `LoDTensor` or to split one `LoDTensor` into an array of tensors.
+They need some extensions to support packing or unpacking an array of `LoDTensor`s.
diff --git a/doc/fluid/design/onnx/images/project_structure.png b/doc/fluid/design/onnx/images/project_structure.png
new file mode 100644
index 0000000000000000000000000000000000000000..ab1c2ff23cfff586516876684348bb15bd2084fc
Binary files /dev/null and b/doc/fluid/design/onnx/images/project_structure.png differ
diff --git a/doc/fluid/design/onnx/onnx_convertor.md b/doc/fluid/design/onnx/onnx_convertor.md
new file mode 100644
index 0000000000000000000000000000000000000000..bc1665d7c33eb54cb63e5306a439c1ca67016d1e
--- /dev/null
+++ b/doc/fluid/design/onnx/onnx_convertor.md
@@ -0,0 +1,131 @@
+# Background
+
+[ONNX (Open Neural Network Exchange)](https://github.com/onnx/onnx) bridges different deep learning frameworks by providing an open source graph format for models. The models trained in other frameworks can be converted into the ONNX format to execute inference by utilizing the built-in operators in ONNX - this is called a **frontend**. With the inverse conversion (called a **backend**), different frameworks can share any models supported by ONNX in principle. Now most mainstream frameworks have joined the ONNX community, e.g. Caffe2, PyTorch, and MXNet etc. And there is a momentum driving more and more vendors to begin supporting ONNX or even choose ONNX as the only machine learning runtime in their devices.
+
+Therefore, it is necessary to enable conversion between PaddlePaddle and ONNX. This design doc is aimed at implementing a convertor, mainly for converting between **Fluid** models and ONNX (it is very likely that we may support older v2 models in the future). A complete convertor should be bidirectional - with a frontend AND a backend, but considering the importance, we will start with the frontend, i.e. Fluid models to ONNX models.
+
+
+# How it works
+
+ONNX has a [working list of operators](https://github.com/onnx/onnx/blob/master/docs/Operators.md) which is versioned.
+
+When prioritizing implementation of a frontend over a backend, choice of coverage of Fluid -> ONNX operators comes down to choices of models to be supported (see section `Supported models`). Eventually, this will allow us to reach a really-wide coverage of all operators.
+
+Here are a few major considerations when it comes to converting models:
+
+- **Op-level conversion**: How to map the inputs, attributes, and outputs of each Paddle operator to those of the ONNX operator. In several cases, these require transformations. For each direction (frontend vs. backend), a different conversion mapping is needed (a small sketch of such a mapping appears at the end of this section).
+- **Parameters (weights) initialization**: Setting initial parameters on different nodes.
+- **Tensor data type mapping** (Note: Some ONNX data types are not supported in Fluid)
+- **Network representation adaption**: Fluid `ProgramDesc` include nested blocks. Since ONNX is free of nesting, the `ProgramDesc` ops need to be traversed to only include ops from the global scope in the root block. The variables used as inputs and outputs should also be in this scope.
+- **Model validation**: There are two kinds of validations that are necessary:
+ 1. We need to ensure that the inference outputs of the ops run inside a model are the same as those when running the ONNX-converted ops through an alternative ONNX backend.
+ 2. Checking to see if the generated nodes on the graph are validated by the internal ONNX checkers.
+- **Versioning**: ONNX versions its op listing across releases. In fact, it has versioning on 3 different levels: ops, graphs, and ONNX models. This requires that we are conscious about versioning the convertor and updating tests and op convertor logic for each release. It also implies that we release pre-trained ONNX models upon each version release.
+
+One thing that makes this conversion more feasible in Fluid's case is the use of a static IR - the `ProgramDesc` - as opposed to a dynamic graph, as created in the cases of frameworks like PyTorch.
+
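+As a small illustration of the op-level mapping mentioned above, here is a hedged Python sketch that builds ONNX nodes with `onnx.helper.make_node` from a registry keyed by Fluid op type. The actual convertor code may differ, and the `fluid_op.input()`/`output()` accessors are assumed to return lists of variable names.
+
+```python
+from onnx import helper
+
+
+def convert_relu(fluid_op):
+    # fluid_op is assumed to expose OpDesc-style input()/output() accessors
+    return helper.make_node(
+        "Relu",
+        inputs=fluid_op.input("X"),
+        outputs=fluid_op.output("Out"))
+
+
+def convert_elementwise_add(fluid_op):
+    return helper.make_node(
+        "Add",
+        inputs=fluid_op.input("X") + fluid_op.input("Y"),
+        outputs=fluid_op.output("Out"))
+
+
+# registry consulted while traversing the ops in the global block
+FLUID_TO_ONNX = {
+    "relu": convert_relu,
+    "elementwise_add": convert_elementwise_add,
+}
+```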
+
+# Project structure
+
+![Project structure](images/project_structure.png)
+
+The project contains four important parts:
+
+* **fluid**: The directory that contains wrappers for fluid related APIs. Fluid has provided some low-level APIs to parse or generate the inference model. However, directly using these low-level APIs makes the code tediously long. This module wraps low-level APIs to provide simplified interfaces.
+
+* **onnx**: This is a Python package provided by ONNX containing helpers for creating nodes, graphs, and eventually binary protobuf models with initializer parameters.
+
+* **onnx_fluid**: Contains two-way mapping (Fluid -> ONNX ops and ONNX -> Fluid ops). Called from `convert.py`, the program uses this mapping along with modifier functions to construct ONNX nodes with the help of ONNX's `make_node` helper. It also contains mapping between datatypes and tensor deprecation / amplification logic.
+
+* **convert.py**: The interface exposed to users. This will traverse the global program blocks/variables and construct the write-able model.
+
+
+# Usage
+The converter should be designed to be very easy to use. Bidirectional conversion between a Fluid inference model and an ONNX binary model will be supported. Model validation will also be provided to verify the correctness of the converted model.
+
+* Convert Fluid inference model to ONNX binary model
+
+ ```
+ python convert.py --fluid_model <path to the Fluid inference model> --onnx_model <path to the output ONNX model> validate True
+ ```
+
+* Validate the converted model
+
+ ```
+ python validate.py --fluid_model <path to the Fluid inference model> --onnx_model <path to the ONNX model>
+ ```
+
+The conversion and model validation will be completed consecutively, finally outputting a readable model structure description. For the reverse conversion, users only need to exchange the input and output.
+
+
+# Challenges and mitigation
+
+## Cycles
+
+Cycles are unsupported in ONNX. In Paddle, the `while` op is the most prominent example of a cycle.
+
+*Resolution*: We won't support models with `while`s which can't be substituted until ONNX adds support for such ops.
+
+## Sequences
+
+Sequence processing operators like `sequence_expand`, `sequence_reshape`, `sequence_concat`, and `sequence_pool` are not supported by ONNX either, because ONNX does not support non-padded datatypes like LoDTensors.
+
+*Resolution*: Since the runtimes using our ONNX exported graphs won't be using LoDTensors in the first place, such sequence operators should be mapped to ONNX ops that will do the necessary transposing ops with the knowledge of the padding and shape of the Tensors.
+
+## Ops that can't easily be mapped
+
+There are ops that just aren't possible to map today:
+
+**Control flow operators**
+
+Paddle supports control flow ops like `If/Else` and `Switch` (if we ignore the CSP operations like `select` for now). ONNX has `If` support in the experimental phase.
+
+*Resolution*: Map Paddle's `If/Else` to ONNX's `If`, but ignore other control flow operators until ONNX brings support for them.
+
+
+**Non-existent in Fluid**
+
+There are several ONNX operators that are not available in Fluid today, e.g. `InstanceNormalization`, `RandomUniform`, `Unsqueeze`, etc.
+
+*Resolution*: For the initial phase, we can choose to not support ops that our models don't care for and are subsequently not available in Fluid. However, for ops that we think might be necessary for Fluid users also, we must implement them on our side and support the ONNX conversion to them. This list is TBD.
+
+
+**Concurrency**
+
+ONNX does not have any considerations for concurrency right now.
+
+*Resolution*: There are two ways to approach this:
+
+a. We choose to not support concurrent models.
+b. We only support `go_op`s (basically threads) shallowly. This could mean that we enqueue `go_op` ops prior to gradient calculations OR even prior to the entire graph, and that's it - since `go_op`s do not have support for backprop anyways. One of the core target use cases of `go_op`: batch reading - can be handled through this approach.
+
+
+**Overloaded in Fluid**
+
+There are ops in ONNX whose job can't be accomplished by a single corresponding Paddle operator (e.g. ), but rather by a collection of operators.
+
+*Resolution*: Chain multiple Paddle operators.
+
+
+## Lack of LoDTensors
+
+As stated above, ONNX only supports simple Tensor values.
+
+*Resolution*: Deprecate to plain old numpy-able tensors.
+
+
+## Reconstruction from deprecated ONNX ops
+
+For higher-level Fluid ops, such as a few offered by the `nn` layer that do not have direct corresponding mappings but can be converted to ONNX by chaining a series of ops without cycles, it would be useful to map them back to the higher-level Fluid ops once converted back from the deprecated ONNX graphs.
+
+*Resolution*: Record the subgraphs produced by this deprecation when going from Paddle -> ONNX. When converting back from ONNX, if we encounter identical subgraphs by doing a forward search, we can replace them with the matching higher-level Fluid op.
+
+
+# Supported models
+
+As mentioned above, potential risks may come from the conversion of sequence-related models, including the LoDTensor, ```if/else``` and ```while``` operators. So a good choice is to focus on some important feedforward models first, then implement some simple recurrent models.
+
+- Feedforward models: common models selected in PaddleBook, e.g. VGG, ResNet and some other models proposed by application teams.
+- Recurrent models: language model, stacked LSTMs etc.
diff --git a/doc/fluid/design/others/auto_gradient_check.md b/doc/fluid/design/others/auto_gradient_check.md
new file mode 100644
index 0000000000000000000000000000000000000000..773b7b6a767541f28c27f247c1ad8c9a8a2d0ccf
--- /dev/null
+++ b/doc/fluid/design/others/auto_gradient_check.md
@@ -0,0 +1,150 @@
+## Auto Gradient Check Design
+
+## Background:
+- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges:
+ 1. The backpropagation formula should be correct with respect to the forward computation.
+ 2. The implementation of the above should be correct in C++.
+ 3. It is difficult to prepare unbiased test data.
+
+- Auto gradient checking computes a numerical gradient using the forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
+ 1. The numerical gradient checker only needs the forward operator.
+ 2. The user only needs to prepare the input data for the forward Operator and does not need to worry about the backward Operator.
+
+## Mathematical Theory
+The following documents from Stanford have a detailed explanation of how to compute the numerical gradient and why it is useful.
+
+- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
+- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
+
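+Concretely, the numerical gradient used below is the standard central-difference approximation, which matches the `(y_pos - y_neg) / delta / 2` computation in the core algorithm:
+
+$$\frac{\partial f}{\partial x_i} \approx \frac{f(x_i + \delta) - f(x_i - \delta)}{2\delta}$$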
+
+## Numerical Gradient Implementation
+### Python Interface
+```python
+def get_numerical_gradient(op,
+ input_values,
+ output_name,
+ input_to_check,
+ delta=0.005,
+ local_scope=None):
+ """
+ Get Numerical Gradient for the input of an operator.
+
+ :param op: C++ operator instance, could be an network.
+ :param input_values: The input variables. Should be a dictionary whose key is the
+ variable name and whose value is a numpy array.
+ :param output_name: The final output variable name.
+ :param input_to_check: The input variable with respect to which the gradient has to be computed.
+ :param delta: The perturbation value for numerical gradient method. The
+ smaller the delta, the more accurate the result. But if the delta is too
+ small, it will suffer from the numerical stability problem.
+ :param local_scope: The local scope used for get_numeric_gradient.
+ :return: The gradient array in numpy format.
+ """
+```
+
+### Explanation:
+
+- Why do we need an `output_name`
+ - An Operator may have multiple Outputs, and one can compute an independent gradient from each Output. So the caller should specify the name of the output variable.
+
+- Why do we need `input_to_check`
+ - One operator can have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numerical Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times each with a different input.
+
+
+### Core Algorithm Implementation
+
+
+```python
+ # we only compute the gradient of one element at a time.
+ # we use a for loop to compute the gradient of each element.
+ for i in xrange(tensor_size):
+ # get one input element using the index i.
+ original = tensor_to_check.get_float_element(i)
+
+ # add delta to it, run the forward op and then
+ # get the new value of the result tensor.
+ x_pos = original + delta
+ tensor_to_check.set_float_element(i, x_pos)
+ y_pos = get_output()
+
+ # Subtract delta from this element, run the op again
+ # and get the new value of the result tensor.
+ x_neg = original - delta
+ tensor_to_check.set_float_element(i, x_neg)
+ y_neg = get_output()
+
+ # restore old value
+ tensor_to_check.set_float_element(i, original)
+
+ # compute the gradient of this element and store
+ # it into a numpy array.
+ gradient_flat[i] = (y_pos - y_neg) / delta / 2
+
+ # reshape the gradient result to the shape of the source tensor.
+ return gradient_flat.reshape(tensor_to_check.get_dims())
+```
+
+## Auto Gradient Check Framework
+
+Each Operator Kernel has three kinds of Gradient:
+
+1. Numerical gradient
+2. CPU kernel gradient
+3. GPU kernel gradient (if supported by the device)
+
+The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps:
+
+1. Calculate the numerical gradient
+2. Calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient.
+3. Calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient. (if supported)
+
+#### Python Interface
+
+```python
+ def check_grad(self,
+ forward_op,
+ input_vars,
+ inputs_to_check,
+ output_name,
+ no_grad_set=None,
+ only_cpu=False,
+ max_relative_error=0.005):
+ """
+ :param forward_op: used to create backward_op
+ :param input_vars: numpy value of input variable. The following
+ computation will use these variables.
+ :param inputs_to_check: the input variable with respect to which the
+ gradient will be computed.
+ :param output_name: The final output variable name.
+ :param max_relative_error: The relative tolerance parameter.
+ :param no_grad_set: used to create backward ops
+ :param only_cpu: only compute and check gradient on cpu kernel.
+ :return:
+ """
+```
+
+### How to check if two numpy arrays are close enough?
+If `abs_numerical_grad` is nearly zero, then we use the absolute error for the numerical gradient instead of the relative error.
+
+```python
+numerical_grad = ...
+operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
+
+abs_numerical_grad = numpy.abs(numerical_grad)
+# if abs_numerical_grad is nearly zero, then use abs error for
+# numeric_grad, instead of relative error.
+abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
+
+diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad
+max_diff = numpy.max(diff_mat)
+```
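+
+Continuing the snippet above, a possible final step (a sketch, not necessarily the exact framework code) is to compare `max_diff` against the `max_relative_error` tolerance passed to `check_grad`:
+
+```python
+# fail the check if the largest (relative or fallback absolute) difference
+# exceeds the configured tolerance
+max_relative_error = 0.005
+assert max_diff <= max_relative_error, (
+    "gradient check failed: %f > %f" % (max_diff, max_relative_error))
+```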
+
+
+#### Notes:
+The input data for the auto gradient checker should be reasonable, to avoid numerical stability problems.
+
+
+#### References:
+
+- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
+- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
diff --git a/doc/fluid/design/others/dcgan.png b/doc/fluid/design/others/dcgan.png
new file mode 100644
index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28
Binary files /dev/null and b/doc/fluid/design/others/dcgan.png differ
diff --git a/doc/fluid/design/others/gan_api.md b/doc/fluid/design/others/gan_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..7167470088766985fa5ad31657410309330fd725
--- /dev/null
+++ b/doc/fluid/design/others/gan_api.md
@@ -0,0 +1,253 @@
+# Design for GAN
+
+GAN (Generative Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and is widely used in many areas.
+
+It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth.
+
+In our GAN design, we wrap it as a user-friendly, easily customized Python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
+
+
+
+Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
+
+
+The operators, layers and functions required/optional to build a GAN demo are summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
+
+
+![Samples generated by DC-GAN](./dcgan.png)
+
+Figure 2. Photo borrowed from the original DC-GAN paper.
+
+
+## The Conditional-GAN might be a class.
+In this design, we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains the following data structure:
+
+- DCGAN(object): which contains everything required to build a GAN model. It provides the following member functions as its API:
+
+- __init__(...): Initialize hyper-parameters (like the conv dimension and so forth), and declare the model parameters of the discriminator and the generator as well.
+
+- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen.
+Returns a generated image.
+
+- discriminator(image):
+Given an image, decide if it is from a real source or a fake one.
+Returns a 0/1 binary label.
+
+- build_model(self):
+build the whole GAN model and define the training losses for both the generator and the discriminator.
+
+## Discussion on Engine Functions required to build GAN
+- Trace the tensor and variable dependencies in the engine executor. (Very critical, otherwise GAN can't be trained correctly.)
+- Different optimizers responsible for optimizing different loss.
+
+To be more detailed, we introduce our design of DCGAN as follows:
+
+### Class member Function: Initializer
+- Set up hyper-parameters, including the conditional dimension, noise dimension, batch size and so forth.
+- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G.
+```python
+class DCGAN(object):
+ def __init__(self, y_dim=None, z_dim=100):
+
+ # hyper parameters
+ self.y_dim = y_dim # conditional gan or not
+ self.batch_size = 100
+ self.z_dim = z_dim # input noise dimension
+
+ # define parameters of discriminators
+ self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer())
+ self.D_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data
+ self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+ self.D_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data
+ self.D_W2 = pd.Variable(np.random.rand(128, 1))
+ self.D_b2 = pd.Variable(np.zeros(128))
+ self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2]
+
+ # define parameters of generators
+ self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+ self.G_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data
+ self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+ self.G_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data
+ self.G_W2 = pd.Variable(np.random.rand(128, 1))
+ self.G_b2 = pd.Variable(np.zeros(128))
+ self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2]
+```
+
+### Class member Function: Generator
+- Given a noisy input z, returns a fake image.
+- Concatenation, batch-norm, FC operations required;
+- Deconv layer required, which is missing now...
+```python
+class DCGAN(object):
+ def generator(self, z, y = None):
+ # input z: the random noise
+ # input y: input data label (optional)
+ # output G_im: generated fake images
+
+ if self.y_dim: # if conditional GAN, concatenate the label y
+ z = pd.layer.concat(1, [z, y])
+
+ G_h0 = pd.layer.fc(z, self.G_W0, self.G_b0)
+ G_h0_bn = pd.layer.batch_norm(G_h0)
+ G_h0_relu = pd.layer.relu(G_h0_bn)
+
+ G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1)
+ G_h1_bn = pd.layer.batch_norm(G_h1)
+ G_h1_relu = pd.layer.relu(G_h1_bn)
+
+ G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2)
+ G_im = pd.layer.tanh(G_h2)
+ return G_im
+```
+
+### Class member function: Discriminator
+- Given an image, returns the binary logit indicating whether it is real or fake.
+- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required;
+```python
+class DCGAN(object):
+ def discriminator(self, image):
+ # input image: either generated images or real ones
+ # output D_h2: binary logit of the label
+
+ D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0)
+ D_h0_bn = pd.layer.batchnorm(D_h0)
+ D_h0_relu = pd.layer.lrelu(D_h0_bn)
+
+ D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1)
+ D_h1_bn = pd.layer.batchnorm(D_h1)
+ D_h1_relu = pd.layer.lrelu(D_h1_bn)
+
+ D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2)
+ return D_h2
+```
+
+### Class member function: Build the model
+- Define data readers as placeholders to hold the data;
+- Build generator and discriminators;
+- Define two training losses for discriminator and generator, respectively.
+If we have an execution dependency engine to back-trace all tensors, the module building our GAN model will be like this:
+```python
+class DCGAN(object):
+ def build_model(self):
+ if self.y_dim:
+ self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+ self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+ self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+ self.z = pd.data(pd.float32, [None, self.z_size])
+
+ # step 1: generate images by generator, classify real/fake images with discriminator
+ if self.y_dim: # if conditional GAN, includes label
+ self.G = self.generator(self.z, self.y)
+ self.D_t = self.discriminator(self.images)
+ # generated fake images
+ self.sampled = self.sampler(self.z, self.y)
+ self.D_f = self.discriminator(self.G)
+ else: # original version of GAN
+ self.G = self.generator(self.z)
+ self.D_t = self.discriminator(self.images)
+ # generate fake images
+ self.sampled = self.sampler(self.z)
+ self.D_f = self.discriminator(self.G)
+
+ # step 2: define the two losses
+ self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)))
+ self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)))
+ self.d_loss = self.d_loss_real + self.d_loss_fake
+
+ self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_size)))
+```
+
+If we do not have dependency engine but blocks, the module building our GAN model will be like this:
+```python
+class DCGAN(object):
+ def build_model(self, default_block):
+ # input data in the default block
+ if self.y_dim:
+ self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+ self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+ # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+ self.z = pd.data(pd.float32, [None, self.z_size])
+
+ # step 1: generate images by generator, classify real/fake images with discriminator
+ with pd.default_block().g_block():
+ if self.y_dim: # if conditional GAN, includes label
+ self.G = self.generator(self.z, self.y)
+ self.D_g = self.discriminator(self.G, self.y)
+ else: # original version of GAN
+ self.G = self.generator(self.z)
+ self.D_g = self.discriminator(self.G)
+ self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_size)))
+
+ with pd.default_block().d_block():
+ if self.y_dim: # if conditional GAN, includes label
+ self.D_t = self.discriminator(self.images, self.y)
+ self.D_f = self.discriminator(self.G, self.y)
+ else: # original version of GAN
+ self.D_t = self.discriminator(self.images)
+ self.D_f = self.discriminator(self.G)
+
+ # step 2: define the two losses
+ self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)))
+ self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)))
+ self.d_loss = self.d_loss_real + self.d_loss_fake
+```
+Some confusion and problems with this design:
+- D\_g and D\_f are actually the same thing, but have to be written twice; i.e., if we want to run two sub-graphs conceptually, the same code has to be written twice whenever it is shared by both graphs.
+- It requires the ability to create a block at any time, rather than only inside if-else or RNN constructs.
+
+## Main function for the demo:
+Generally, a GAN user just needs to do the following things:
+- Define an object as DCGAN class;
+- Build the DCGAN model;
+- Specify two optimizers for two different losses with respect to different parameters.
+```python
+# pd for short, should be more concise.
+import paddle.v2 as pd
+import numpy as np
+import logging
+
+if __name__ == "__main__":
+ # dcgan class in the default graph/block
+ # if we used a dependency engine as TensorFlow does,
+ # the code would be slightly different, like:
+ # dcgan = DCGAN()
+ # dcgan.build_model()
+ with pd.block() as def_block:
+ dcgan = DCGAN()
+ dcgan.build_model(def_block)
+
+ # load mnist data
+ data_X, data_y = load_mnist()
+
+ # Two subgraphs required!!!
+ with pd.block().d_block():
+ d_optim = pd.train.Adam(lr = .001, beta= .1)
+ d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D)
+ with pd.block().g_block():
+ g_optim = pd.train.Adam(lr = .001, beta= .1)
+ g_step = g_optim.minimize(dcgan.g_loss, dcgan.theta_G)
+
+ # executor
+ sess = pd.executor()
+
+ # training
+ for epoch in xrange(10000):
+ for batch_id in range(N / batch_size):
+ idx = ...
+ # sample a batch
+ batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size]
+ # sample z
+ batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
+
+ if batch_id % 2 == 0:
+ sess.run(d_step,
+ feed_dict = {dcgan.images: batch_im,
+ dcgan.y: batch_label,
+ dcgan.z: batch_z})
+ else:
+ sess.run(g_step,
+ feed_dict = {dcgan.z: batch_z})
+```
+
+# More thinking about dependency engine vs. block design:
+- What if we just want to run an intermediate result? Do we need to run the whole block/graph?
+- Should we call eval() to get the fake images in the first stage? And then train the discriminator in the second stage?
diff --git a/doc/fluid/design/others/graph.md b/doc/fluid/design/others/graph.md
new file mode 100644
index 0000000000000000000000000000000000000000..7519a65df835a39fe14f6ef45530afff170191ff
--- /dev/null
+++ b/doc/fluid/design/others/graph.md
@@ -0,0 +1,70 @@
+# Design Doc: Computations as a Graph
+
+A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before.
+
+This document explains the construction of a graph in three steps:
+
+- construct the forward part
+- construct the backward part
+- construct the optimization part
+
+## The Construction of a Graph
+
+Let us take the problem of image classification as a simple example. The application program that trains the model looks like:
+
+```python
+x = layer.data("images")
+l = layer.data("label")
+y = layer.fc(x)
+cost = layer.mse(y, l)
+optimize(cost)
+train(cost, reader=mnist.train())
+```
+
+### Forward Part
+
+The first four lines of the above program build the forward part of the graph.
+
+
+
+In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x. `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators.
+
+Initialization operators are a kind of "run-once" operator -- the `Run` method increments a class data member counter so that the operator runs at most once. By doing so, a parameter won't be initialized repeatedly, say, in every minibatch.
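+
+A minimal Python sketch of this run-once behaviour (the class and member names here are illustrative only; the real operators are implemented in C++):
+
+```python
+import numpy as np
+
+class InitOp(object):
+    """Illustrative "run-once" initialization operator."""
+    def __init__(self, output_name, shape):
+        self.output_name = output_name
+        self.shape = shape
+        self.run_count = 0  # the class data member counter mentioned above
+
+    def Run(self, scope):
+        # Run at most once, so the parameter is not re-initialized in every minibatch.
+        if self.run_count > 0:
+            return
+        scope[self.output_name] = np.random.uniform(-0.1, 0.1, self.shape)
+        self.run_count += 1
+```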
+
+In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`. These protobuf messages are saved in a `BlockDesc` protobuf message.
+
+### Backward Part
+
+The fifth line `optimize(cost)` calls two functions, `ConstructBackwardGraph` and `ConstructOptimizationGraph`.
+
+`ConstructBackwardGraph` traverses the forward graph in the `BlockDesc` protobuf message and builds the backward part.
+
+
+
+According to the chain rule of gradient computation, `ConstructBackwardGraph` would
+
+1. create a gradient operator G for each operator F,
+1. make all inputs, outputs, and outputs' gradient of F as inputs of G,
+1. create gradients for all inputs of F, except for those that don't have gradients, like x and l, and
+1. make all these gradients as outputs of G, as sketched below.
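+
+A rough Python sketch of these four rules, representing each operator description as a plain dict (the `@GRAD` suffix and the dict layout are assumptions for illustration):
+
+```python
+def construct_backward_graph(forward_ops, no_grad_vars):
+    """Apply the four rules above to a list of forward op descriptions."""
+    backward_ops = []
+    for op in reversed(forward_ops):
+        grad_op = {
+            # rule 1: one gradient operator G for each forward operator F
+            "type": op["type"] + "_grad",
+            # rule 2: all inputs, outputs, and outputs' gradients of F are inputs of G
+            "inputs": op["inputs"] + op["outputs"]
+                      + [v + "@GRAD" for v in op["outputs"]],
+            # rules 3 and 4: gradients of F's inputs (skipping x, l, etc.) are outputs of G
+            "outputs": [v + "@GRAD" for v in op["inputs"] if v not in no_grad_vars],
+        }
+        backward_ops.append(grad_op)
+    return backward_ops
+```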
+
+### Optimization Part
+
+For each parameter, like W and b created by `layer.fc` and marked as double circles in the above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient. This results in the complete graph:
+
+
+
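+A rough sketch of what `ConstructOptimizationGraph` might emit, using the same plain-dict op descriptions as the backward sketch above (the `sgd` operator and attribute names are assumptions):
+
+```python
+def construct_optimization_graph(parameters, learning_rate=0.01):
+    """Create one optimization operator per parameter, consuming the parameter and its gradient."""
+    return [{"type": "sgd",
+             "inputs": [p, p + "@GRAD"],
+             "outputs": [p],
+             "attrs": {"learning_rate": learning_rate}}
+            for p in parameters]
+```
+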
+## Block and Graph
+
+The words block and graph are interchangeable in the design of PaddlePaddle. A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphor for the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block.
+
+A Block keeps operators in an array `BlockDesc::ops`
+
+```protobuf
+message BlockDesc {
+ repeated OpDesc ops = 1;
+ repeated VarDesc vars = 2;
+}
+```
+
+in the order that they appear in user programs, like the Python program at the beginning of this article. We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators.
diff --git a/doc/fluid/design/others/graph_survey.md b/doc/fluid/design/others/graph_survey.md
new file mode 100644
index 0000000000000000000000000000000000000000..97f395133b48a1d0ed5136f0ebc8720b8ca87ded
--- /dev/null
+++ b/doc/fluid/design/others/graph_survey.md
@@ -0,0 +1,232 @@
+## Survey on Graph
+
+Neural network frameworks often provide a symbolic API for users to write network topology conveniently. This doc mainly focuses on the symbolic APIs of the most popular neural network frameworks, and tries to find out how to parse a symbolic configuration into a portable file, such as protobuf or JSON.
+
+### Mxnet
+
+The core concept of the symbolic API is `Symbol`. Mxnet implements the `Symbol` class in C++ and exports it to Python using the C API. Please refer to the comments in Mxnet:
+
+
+`Symbol` is a helper class used to represent the operator node in a Graph.
+`Symbol` acts as an interface for building graphs from different components like Variable, Functor and Group. `Symbol` is also exported to the python front-end (while Graph is not) to enable quick tests and deployment. Conceptually, a symbol is the final operation of a graph and thus includes all the information required (the graph) to evaluate its output value.
+
+
+A simple network topology written with Symbol is as follows:
+
+```python
+def get_symbol(num_classes=10, **kwargs):
+ data = mx.symbol.Variable('data')
+ data = mx.symbol.Flatten(data=data)
+ fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+ act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+ fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
+ act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+ fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes)
+ mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax')
+ return mlp
+```
+
+
+
+Variable here is actually a Symbol. Every basic Symbol corresponds to one Node, and every Node has its own AnyAttr. There is an op field in the AnyAttr class; when a Symbol represents a Variable (often input data), the op field is null.
+
+Symbol contains a data member, `std::vector<NodeEntry> outputs`, and a NodeEntry contains a pointer to a Node. We can follow the Node pointers to recover the whole Graph.
+
+A Symbol can also be saved to a JSON file.
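+
+For example, with the `mlp` symbol built by `get_symbol` above, the graph can be serialized and loaded back with MXNet's standard helpers:
+
+```python
+import mxnet as mx
+
+mlp = get_symbol(num_classes=10)
+print(mlp.tojson())                  # inspect the JSON representation
+mlp.save('mlp-symbol.json')          # write the graph to a JSON file
+loaded = mx.symbol.load('mlp-symbol.json')
+```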
+
+Here is a detailed example:
+
+```
+>>> import mxnet as mx
+>>> data = mx.symbol.Variable('data')
+>>> print data.debug_str()
+Variable:data
+
+>>> data = mx.symbol.Flatten(data=data)
+>>> print data.debug_str()
+Symbol Outputs:
+ output[0]=flatten0(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+ arg[0]=data(0) version=0
+
+>>> fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+>>> print fc1.debug_str()
+Symbol Outputs:
+ output[0]=fc1(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+ arg[0]=data(0) version=0
+Variable:fc1_weight
+Variable:fc1_bias
+--------------------
+Op:FullyConnected, Name=fc1
+Inputs:
+ arg[0]=flatten0(0)
+ arg[1]=fc1_weight(0) version=0
+ arg[2]=fc1_bias(0) version=0
+Attrs:
+ num_hidden=128
+
+```
+
+
+### TensorFlow
+
+
+The core concept of the symbolic API is `Tensor`. TensorFlow defines `Tensor` in Python. Please refer to the comments in TensorFlow:
+
+A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. It does not hold the values of that operation's output, but instead provides a means of computing those values in a TensorFlow [Session](https://www.tensorflow.org/api_docs/python/tf/Session).
+
+A simple example is as follows:
+
+```python
+ # Build a dataflow graph.
+ c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
+ d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
+ e = tf.matmul(c, d)
+
+ # Construct a `Session` to execute the graph.
+ sess = tf.Session()
+
+ # Execute the graph and store the value that `e` represents in `result`.
+ result = sess.run(e)
+```
+
+
+The main method of `Tensor` is as follows:
+
+
+```python
+@property
+def op(self):
+ """The `Operation` that produces this tensor as an output."""
+ return self._op
+
+@property
+def dtype(self):
+ """The `DType` of elements in this tensor."""
+ return self._dtype
+
+@property
+def graph(self):
+ """The `Graph` that contains this tensor."""
+ return self._op.graph
+
+@property
+def name(self):
+ """The string name of this tensor."""
+ if not self._op.name:
+ raise ValueError("Operation was not named: %s" % self._op)
+ return "%s:%d" % (self._op.name, self._value_index)
+
+@property
+def device(self):
+ """The name of the device on which this tensor will be produced, or None."""
+ return self._op.device
+```
+
+
+A Tensor can be used as a run target for a session. A Tensor contains all the information of the Graph and tracks its data dependencies.
+
+
+Here is a detailed example:
+
+
+```
+>>> import tensorflow as tf
+>>> c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
+>>> print c.graph
+
+>>> d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
+>>> print d.graph
+
+>>> e = tf.matmul(c, d)
+>>> print e.graph
+
+```
+
+### Dynet
+
+
+The core concept of the symbolic API is `Expression`, and Dynet defines the `Expression` class in C++.
+
+
+A simple example is as follows:
+
+```cpp
+ComputationGraph cg;
+Expression W = parameter(cg, pW);
+
+Expression in = input(cg, xs[i]);
+Expression label = input(cg, ys[i]);
+Expression pred = W * in;
+Expression loss = square(pred - label);
+```
+
+The input data and parameters are also represented by Expressions. Every basic Expression corresponds to a Node, and input data is also a Node.
+
+Expression has a data member ComputationGraph, and the ComputationGraph is modified during the user's configuration process. An Expression can be a running target, because an Expression contains all of its dependencies.
+
+
+Here is a detailed example:
+
+Write the topology in C++:
+
+```
+ComputationGraph cg;
+Expression W = parameter(cg, pW);
+cg.print_graphviz();
+
+Expression pred = W * xs[i];
+cg.print_graphviz();
+
+Expression loss = square(pred - ys[i]);
+cg.print_graphviz();
+```
+
+Compile and print:
+
+```
+# first print
+digraph G {
+ rankdir=LR;
+ nodesep=.05;
+ N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+}
+# second print
+digraph G {
+ rankdir=LR;
+ nodesep=.05;
+ N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+ N1 [label="v1 = v0 * -0.98"];
+ N0 -> N1;
+}
+# third print
+digraph G {
+ rankdir=LR;
+ nodesep=.05;
+ N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+ N1 [label="v1 = v0 * -0.98"];
+ N0 -> N1;
+ N2 [label="v2 = -1.88387 - v1"];
+ N1 -> N2;
+ N3 [label="v3 = -v2"];
+ N2 -> N3;
+ N4 [label="v4 = square(v3)"];
+ N3 -> N4;
+}
+```
+
+### Conclusion
+
+
+Actually, Symbol/Tensor/Expression in Mxnet/TensorFlow/Dynet are concepts at the same level. We use the unified name Expression here; this concept has the following features (sketched below):
+
+- Users write the topology with a symbolic API, and all return values are Expressions, including input data and parameters.
+- An Expression corresponds to a global Graph, and Expressions can also be composed.
+- An Expression tracks all of its dependencies and can be used as a run target.
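+
+A rough, hypothetical sketch of such an Expression-level concept (the names are illustrative only and not an actual Paddle API):
+
+```python
+class Expression(object):
+    """A node in a global graph plus a handle to that graph."""
+    def __init__(self, op, inputs, graph):
+        self.op = op          # the operation producing this value (None for data/parameters)
+        self.inputs = inputs  # upstream Expressions -- the tracked dependencies
+        self.graph = graph    # every Expression shares the global graph
+
+    def __mul__(self, other):
+        # Composition: combining Expressions yields a new Expression in the same graph.
+        return Expression("mul", [self, other], self.graph)
+
+    def run(self, executor):
+        # An Expression can be a run target; the executor back-traces self.inputs.
+        return executor.evaluate(self)
+```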
diff --git a/doc/fluid/design/others/images/graph_construction_example.bash b/doc/fluid/design/others/images/graph_construction_example.bash
new file mode 100755
index 0000000000000000000000000000000000000000..35e6997abd17588e17a82d448918fc1b3bd7220e
--- /dev/null
+++ b/doc/fluid/design/others/images/graph_construction_example.bash
@@ -0,0 +1,11 @@
+cat ./graph_construction_example.dot | \
+ sed 's/color=red/color=red, style=invis/g' | \
+ sed 's/color=green/color=green, style=invis/g' | \
+ dot -Tpng > graph_construction_example_forward_only.png
+
+cat ./graph_construction_example.dot | \
+ sed 's/color=green/color=green, style=invis/g' | \
+ dot -Tpng > graph_construction_example_forward_backward.png
+
+cat ./graph_construction_example.dot | \
+ dot -Tpng > graph_construction_example_all.png
diff --git a/doc/fluid/design/others/images/graph_construction_example.dot b/doc/fluid/design/others/images/graph_construction_example.dot
new file mode 100644
index 0000000000000000000000000000000000000000..e115f9844bae6ad24f638c8ed4749cea8aff06a9
--- /dev/null
+++ b/doc/fluid/design/others/images/graph_construction_example.dot
@@ -0,0 +1,68 @@
+digraph ImageClassificationGraph {
+ ///////// The forward part /////////
+ FeedX [label="Feed", color=blue, shape=box];
+ FeedY [label="Feed", color=blue, shape=box];
+ InitW [label="Init", color=blue, shape=diamond];
+ Initb [label="Init", color=blue, shape=diamond];
+ FC [label="FC", color=blue, shape=box];
+ MSE [label="MSE", color=blue, shape=box];
+
+ x [label="x", color=blue, shape=oval];
+ l [label="l", color=blue, shape=oval];
+ y [label="y", color=blue, shape=oval];
+ W [label="W", color=blue, shape=doublecircle];
+ b [label="b", color=blue, shape=doublecircle];
+ cost [label="cost", color=blue, shape=oval];
+
+ FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
+ FeedY -> l [color=blue];
+ InitW -> W [color=blue];
+ Initb -> b [color=blue];
+ W -> FC [color=blue];
+ b -> FC [color=blue];
+ l -> MSE [color=blue];
+
+ ////////// The backward part /////////
+ MSE_Grad [label="MSE_grad", color=red, shape=box];
+ FC_Grad [label="FC_grad", color=red, shape=box];
+
+ d_cost [label="d cost", color=red, shape=oval];
+ d_y [label="d y", color=red, shape=oval];
+ d_b [label="d b", color=red, shape=oval];
+ d_W [label="d W", color=red, shape=oval];
+
+ cost -> MSE_Grad [color=red];
+ d_cost -> MSE_Grad [color=red];
+ l -> MSE_Grad [color=red];
+ y -> MSE_Grad -> d_y [color=red];
+
+ x -> FC_Grad [color=red];
+ y -> FC_Grad [color=red];
+ d_y -> FC_Grad [color=red];
+ W -> FC_Grad -> d_W [color=red];
+ b -> FC_Grad -> d_b [color=red];
+
+ ////////// The optimizaiton part //////////
+
+ OPT_W [label="SGD", color=green, shape=box];
+ OPT_b [label="SGD", color=green, shape=box];
+
+ W -> OPT_W [color=green];
+ b -> OPT_b [color=green];
+ d_W -> OPT_W -> W [color=green];
+ d_b -> OPT_b -> b [color=green];
+
+ ////////// Groupings //////////
+
+ subgraph clusterMSE {
+ style=invis;
+ MSE;
+ MSE_Grad;
+ }
+
+ subgraph clusterFC {
+ style=invis;
+ FC;
+ FC_Grad;
+ }
+}
diff --git a/doc/fluid/design/others/images/graph_construction_example_all.png b/doc/fluid/design/others/images/graph_construction_example_all.png
new file mode 100644
index 0000000000000000000000000000000000000000..261611a5721f9aa97874f7e6d897fe48cf667db2
Binary files /dev/null and b/doc/fluid/design/others/images/graph_construction_example_all.png differ
diff --git a/doc/fluid/design/others/images/graph_construction_example_forward_backward.png b/doc/fluid/design/others/images/graph_construction_example_forward_backward.png
new file mode 100644
index 0000000000000000000000000000000000000000..4c69687f4a6a181138f3df72ce5e8aa48487b5be
Binary files /dev/null and b/doc/fluid/design/others/images/graph_construction_example_forward_backward.png differ
diff --git a/doc/fluid/design/others/images/graph_construction_example_forward_only.png b/doc/fluid/design/others/images/graph_construction_example_forward_only.png
new file mode 100644
index 0000000000000000000000000000000000000000..e668c16e0cac73acb4e5dc2b1827557ae77126b4
Binary files /dev/null and b/doc/fluid/design/others/images/graph_construction_example_forward_only.png differ
diff --git a/doc/fluid/design/others/parameters_in_cpp.md b/doc/fluid/design/others/parameters_in_cpp.md
new file mode 100644
index 0000000000000000000000000000000000000000..a7ac3f17c44ca94a669a8f1e283b291bceb42317
--- /dev/null
+++ b/doc/fluid/design/others/parameters_in_cpp.md
@@ -0,0 +1,41 @@
+# Design Doc: The C++ Class `Parameters`
+
+`Parameters` is a concept we designed in the PaddlePaddle V2 API. `Parameters` is a container of parameters, which makes PaddlePaddle capable of sharing parameters between topologies. We described the usage of `Parameter` in [api.md](./api.md).
+
+We used Python to implement Parameters when designing the V2 API. There are several defects in the current implementation:
+* We just use `memcpy` to share Parameters between topologies, which is very inefficient.
+* We do not support sharing Parameters while training; we just trigger `memcpy` when training starts.
+
+It is necessary to implement Parameters on the C++ side. However, this could require a code refactoring of PaddlePaddle, because PaddlePaddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameters as a data member. In the current PaddlePaddle implementation, there are three concepts associated with `Parameters`:
+
+1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`.
+It is evident that we should use `paddle::Parameter` when developing `Parameters`.
+However, the `Parameter` class contains many functions and does not have a clear interface.
+It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`.
+When developing `Parameters`, we only use the `create/store Parameter` functionality.
+We should extract the functionalities of Parameter into separate classes to clean up the PaddlePaddle C++ implementation.
+
+2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`.
+We should pass `Parameters` to `paddle::GradientMachine` during `forward/backward` to avoid `memcpy` between topologies.
+Also, we should handle multi-GPU/CPU training, because `forward` and `backward` would run on multiple GPUs and CPUs.
+`Parameters` should dispatch the parameter values to each device and gather the parameter gradients from each device.
+
+3. `paddle::ParameterUpdater`. The ParameterUpdater is used to update parameters in Paddle.
+So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD).
+
+
+The step-by-step approach for implementing Parameters in the PaddlePaddle C++ core is listed below. Each step should be a PR and can be merged into PaddlePaddle one by one.
+
+1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters.
+
+2. Implement a `Parameters` class. It just stores the `paddle::Parameter` objects inside. Make `GradientMachine` use `Parameters` as a class member.
+
+3. Make `Parameters` support multi-CPU and multi-GPU training to prepare for sharing `Parameter` between topologies.
+Because we need to share `Parameters` between topologies, it is `Parameters`'s responsibility to exchange Parameters between GPUs.
+`GradientMachine` should not handle how to exchange Parameters, because `GradientMachine` is only used to train one topology and we need to support training many topologies in Paddle, i.e., there could be many GradientMachines using one `Parameters`.
+ * We should use a global function to exchange Parameters between GPUs, not a member function of `Parameters`. The `MultiGradientMachine` invokes this function, which takes `Parameters` as its input.
+ * The MultiGradientMachine contains many functionalities. Extracting the Parameters-exchanging logic would make MultiGradientMachine clearer and simpler.
+
+4. Make `Parameters` an argument of the `forward/backward` functions, not a data member of `GradientMachine`. For example, `forward` could be `forward(const Parameters& params, ...)` and `backward` could be `backward(Parameters* params, ...)`. After this step, Paddle could share `Parameters` between topologies.
+
+5. `ParameterUpdater` is invoked by `GradientMachine` and `Trainer`, but it updates `Parameters`. At the end of this code refactoring, we could change `ParameterUpdater` to use `Parameters` directly, to make `ParameterUpdater`'s implementation clearer.
diff --git a/doc/fluid/design/others/simple_op_design.md b/doc/fluid/design/others/simple_op_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..c7aeed7f9b4637e1c29d530f37b42d12500af82f
--- /dev/null
+++ b/doc/fluid/design/others/simple_op_design.md
@@ -0,0 +1,202 @@
+## Interaction between C++ and Python
+
+Users employ the Python API to describe their own networks; however, the network construction actually happens in C++, so Protobuf is introduced to send messages between Python and C++.
+
+The Interaction between Python and C++ can be simplified as two steps:
+
+1. C++ tells Python how many Ops there are, and what parameters users need to offer to initialize a new Op. Python then builds an API for each Op at compile time.
+
+2. Users invoke APIs built by Python and provide necessary parameters. These parameters will be sent to C++ for finishing the Op construction task.
+
+### Message from C++ to Python
+
+We define a Protobuf message class `OpProto` to hold the message needed in the first step. What should an `OpProto` contain? This question is equivalent to "What message do we need to offer to build a Python API which is legal, user oriented, and able to describe a whole Op?"
+
+The following messages are necessary:
+
+1. Op's name, and its simple comment.
+2. Input and output variable number; each variable's name, type, and comment.
+3. Op's attributes; each attribute includes name, type, comment, **default value** and **value range**.
+
+So `OpProto` can be defined as follows:
+
+```proto
+enum AttrType {
+ INT = 1;
+ FLOAT = 2;
+ STRING = 3;
+ INTS = 4;
+ FLOATS = 5;
+ STRINGS = 6;
+};
+
+message AttrValue {
+ AttrType type = 1;
+ optional int32 iv = 2;
+ optional float fv = 3;
+ optional string sv = 4;
+ repeated int32 ivs = 5;
+ repeated float fvs = 6;
+ repeated string svs = 7;
+};
+
+message AttrProto {
+ required string name = 1;
+ required string comment = 2;
+ required AttrType type = 3;
+};
+
+message VarProto {
+ required string name = 1;
+ required string comment = 2;
+ required bool is_tensor = 3;
+};
+
+message OpProto {
+ repeated VarProto inputs = 1;
+ repeated VarProto outputs = 2;
+ repeated AttrProto attrs = 3;
+ required string type = 4;
+ required string comment = 5;
+};
+```
+
+To generate Python code automatically:
+
+```python
+def create_python_ops_creation_functions():
+ op_protos = paddle.framework.OpRegistry.get_all_op_proto()
+ for type_name in op_protos:
+ op_proto = op_protos[type_name]
+ def __impl__(**kwargs): # User must use key word args in Paddle API
+ inputs = [kwargs.get(ipt.name, "") for ipt in op_proto.inputs]
+ outputs = [kwargs.get(opt.name, "") for opt in op_proto.outputs]
+ attrs = [cast_to_op_attr(attr, kwargs.get(attr.name, None)) for attr in op_proto.attrs]
+ opdesc = (inputs, outputs, type_name, attrs)
+ return paddle.framework.OpRegistry.CreateOp(opdesc)
+ __impl__.__doc__ = create_doc_string(op_proto)
+ globals()[type_name] = __impl__
+
+create_python_ops_creation_functions()
+```
+
+### Message from Python to C++
+
+To hold the message needed in the second step above, we define the Protobuf message class `OpDesc`. It is used to hold the user-specified parameters when describing an Op.
+
+```proto
+message OpDesc {
+ required string type = 1;
+ repeated string inputs = 2;
+ repeated string outputs = 3;
+ map<string, AttrValue> attrs = 4;
+};
+```
+
+## OpProto Register
+
+Every Op has its own `OpProto`. For convenience of use, we need to register them and record all their messages. For each `Op` class, we define a corresponding `OpMaker` class, in whose constructor we implement the `OpProto`'s building process. `OpMaker`'s constructor will be invoked by another function, `OpRegistry::RegisterOp()`.
+
+```cpp
+class OpProtoMaker {
+public:
+ OpProtoMaker(OpProto* proto): proto_(proto) {}
+protected:
+ OpProto* proto_;
+ void AddInput(const std::string& name, const std::string& desc) {...}
+ void AddAttr(const std::string& name, const std::string& desc, TypeId type) {...}
+ void AddComment(const std::string& comment) { ... }
+};
+
+class OpRegistry {
+public:
+ using OpCreator = std::function<OperatorBase* (const OpDesc& desc)>;
+
+ template <typename OpType, typename OpMaker>
+ static void RegisterOp(const std::string& name) {
+ gCreators_[name] = [](const OpDesc& desc) {
+ return new OpType(desc);
+ };
+ OpProto& opProto = gProtos_[name];
+ OpMaker()(&opProto);
+ }
+
+ static map<std::string, OpCreator> gCreators_;
+ static map<std::string, OpProto> gProtos_;
+};
+
+template <typename OpType, typename OpMaker>
+class OpRegister {
+ public:
+ OpRegister(std::string type) {
+ OpRegistry::RegisterOp<OpType, OpMaker>(type);
+ }
+};
+
+#define REGISTER_OP(op_class, op_maker_class, type_name) \
+ class op_class##Register { \
+ private: \
+ const static OpRegister<op_class, op_maker_class> reg; \
+ }; \
+ const OpRegister<op_class, op_maker_class> op_class##Register::reg(#type_name);
+
+class CosineOp {
+// ...
+};
+
+struct CosineOpProtoMaker : public OpProtoMaker {
+ CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) {
+ AddInput("input", "input of cosine op");
+ AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0);
+ AddType("cos");
+ AddComment("This is cos op");
+ }
+};
+
+REGISTER_OP(CosineOp, CosineOpProtoMaker, cos);
+```
+
+In `REGISTER_OP(CosineOp, CosineOpProtoMaker, cos)`, we register not only `CosineOp` but also `CosineOpProto`. As fields of `CosineOpProto`, the default value and value range of `scale` are also registered here.
+
+## Python API
+
+Python APIs are divided into two types, high-level API and low-level API.
+
+### High-Level API
+
+High-level API is called by users directly, so it should keep its style consistent with existing V2 APIs.
+
+Here is a sample of how to define an fc layer:
+
+```python
+hd = fc_layer(input=data, size=56, with_bias=True, activation="sigmoid")
+```
+
+`hd` is the output of `fc_layer` and it's a `variable`. It can be further sent into other layers as input.
+
+The definition of `fc_layer()`:
+
+```python
+def fc_layer(input, size, with_bias, activation):
+ attr_map = {"size": size}
+ check_attrs(attr_map)
+ w = make_variable('w')
+ if with_bias:
+ b = make_variable('b')
+ else:
+ b = None
+ fc_output = make_variable('fc_output')
+ fc_op(input, w, b, fc_output, attr_map)
+ act_output = make_variable('sigmoid_output')
+ if activation == "sigmoid":
+ sigmoid_op(fc_output, act_output)
+ else:
+ # ... handle other activations
+ return act_output
+```
+
+### Low Level API
+
+In the above sample, `fc_op` and `sigmoid_op` are low-level APIs. They build an `OpDesc` and invoke the corresponding C++ code.
+
+*TODO*
diff --git a/doc/fluid/design/others/test.dot b/doc/fluid/design/others/test.dot
new file mode 100644
index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a
--- /dev/null
+++ b/doc/fluid/design/others/test.dot
@@ -0,0 +1,35 @@
+
+digraph Test {
+ z -> generator -> G_img;
+ G_img -> discriminator -> D_f -> d_loss_f;
+ label0 -> d_loss_f -> d_loss;
+
+ img -> discriminator -> D_t -> d_loss_t;
+ label1 -> d_loss_t -> d_loss;
+
+ d_loss -> d_loss_t[color=red, style=dashed];
+ d_loss -> d_loss_f[color=red, style=dashed];
+ d_loss_t -> D_t[color=red, style=dashed];
+ d_loss_f -> D_f[color=red, style=dashed];
+ D_t -> discriminator[color=red, style=dashed];
+ D_f -> discriminator[color=red, style=dashed];
+
+ D_f -> g_loss;
+ label2 -> g_loss;
+
+ g_loss -> D_f[color=green, style=dashed];
+ D_f -> discriminator[color=green, style=dashed];
+ discriminator -> G_img[color=green, style=dashed];
+ G_img -> generator[color=green, style=dashed];
+
+ discriminator [color=red, shape=box];
+ generator [color=green, shape=box];
+ z [shape=diamond];
+ img [shape=diamond];
+ label0 [shape=diamond];
+ label1 [shape=diamond];
+ label2 [shape=diamond];
+
+ d_loss [color=red];
+ g_loss [color=green];
+}
diff --git a/doc/fluid/design/others/test.dot.png b/doc/fluid/design/others/test.dot.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55
Binary files /dev/null and b/doc/fluid/design/others/test.dot.png differ
diff --git a/doc/fluid/design/quantization/fixed_point_quantization.md b/doc/fluid/design/quantization/fixed_point_quantization.md
new file mode 100644
index 0000000000000000000000000000000000000000..085352fc5614d693e63a2f7241e868a9649456af
--- /dev/null
+++ b/doc/fluid/design/quantization/fixed_point_quantization.md
@@ -0,0 +1,110 @@
+Fixed-point quantization uses lower bits, for example, 2-bit, 3-bit or 8-bit fixed point, to represent weights and activations, which are usually in single-precision floating point with 32 bits. The fixed-point representation has advantages in reducing memory bandwidth, lowering power consumption and computational resources as well as the model storage requirements. It is especially important for inference in embedded-device deployment.
+
+According to some experiments, the approach of directly quantizing a model trained in floating point works effectively on large models, like the VGG model with many parameters, but the accuracy drops a lot for small models. In order to improve the tradeoff between accuracy and latency, many quantized training approaches have been proposed.
+
+This document designs a quantized training framework for Fluid. The first part introduces how to quantize, the second part describes the quantized training framework, and the last part illustrates how to calculate the quantization scale.
+
+
+### How to quantize
+
+There are many ways to quantize the float value to fixed-point value. For example:
+
+$$ r = min(max(x, a), b)$$
+$$ s = \frac{b - a}{n - 1} $$
+$$ q = \left \lfloor \frac{r - a}{s} \right \rceil $$
+
+where $x$ is the float value to be quantized, $[a, b]$ is the quantization range, $a$ is the minimum value and $b$ is the maximum value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer. If the quantization level is $k$, $n$ is $2^k$; for example, if $k$ is 8, $n$ is 256. $q$ is the quantized integer.
+
+
+The quantization we applied is parameterized by the number of quantization levels and maximum absolute value:
+
+$$ M = max(abs(x)) $$
+$$ q = \left \lfloor \frac{x}{M} * (n - 1) \right \rceil $$
+
+where $x$ is the float value to be quantized and $M$ is the maximum absolute value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer. For 8-bit quantization, $n=2^{8}=256$. $q$ is the quantized integer.
+
+
+Whether we use *min-max* quantization or *max-abs* quantization, both can be represented as:
+
+$q = scale * r + b$
+
+We call *min-max* and *max-abs* the quantization arguments; they are also called the quantization scale or quantization range.
+
+
+How to calculate the quantization scale (or maximum absolute value) for inference will be described in the last part.
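+
+A small NumPy sketch of the *max-abs* scheme above (8 bits, so $n = 256$); this is only an illustration of the formulas, not the Fluid implementation:
+
+```python
+import numpy as np
+
+def quantize_max_abs(x, bits=8):
+    n = 2 ** bits
+    m = np.max(np.abs(x))                           # M = max(abs(x)), the quantization scale
+    q = np.round(x / m * (n - 1)).astype(np.int32)  # q = round(x / M * (n - 1))
+    return q, m
+
+def dequantize_max_abs(q, m, bits=8):
+    n = 2 ** bits
+    return q.astype(np.float32) / (n - 1) * m
+
+x = np.random.uniform(-1., 1., [4]).astype(np.float32)
+q, m = quantize_max_abs(x)
+print(x, dequantize_max_abs(q, m))                  # the difference is the quantization error
+```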
+
+
+### Training Framework
+
+#### Forward pass
+
+The forward pass uses simulated quantization, see Figure 1.
+
+The training framework is shown in the following figure.
+
+
+
+Figure 1. Forward in training with simulated quantization.
+
+
+- First, both the input and the weight will be quantized to 8-bit integers.
+- Second, do the multiplication (or convolution) operation with integers.
+- Third, dequantize the multiplication (or convolution) results to 32-bit floating point.
+- Finally, do the bias addition in 32-bit floating point. Here, the bias is not quantized.
+
+For general matrix multiplication (GEMM), quantize $X$ and $W$:
+
+$$ X_q = \left \lfloor \frac{X}{X_m} * (n - 1) \right \rceil $$
+$$ W_q = \left \lfloor \frac{W}{W_m} * (n - 1) \right \rceil $$
+
+Do GEMM:
+
+$$ Y = X_q * W_q $$
+
+
+Dequantize $Y$:
+
+$$
+\begin{align}
+Y_{dq} &=\frac{Y}{(n - 1) * (n - 1)} * X_m * W_m \\\
+ &=\frac{X_q * W_q}{(n - 1) * (n - 1)} * X_m * W_m \\\
+ &=(\frac{X_q}{n - 1} * X_m) * (\frac{W_q}{n - 1} * W_m)
+\end{align}
+$$
+
+From these formulas, the dequantization can also be moved before the GEMM: first dequantize $X_q$ and $W_q$, then do the GEMM. The forward workflow in training is equivalent to the following framework.
+
+
+
+Figure 2. Equivalent forward in training with simulated quantization.
+
+
+We use this equivalent workflow in training. In our design, there is a quantization transpiler that inserts the quantization operator and the de-quantization operator into the Fluid `ProgramDesc`. Since the outputs of the quantization and de-quantization operators are still in floating point, they are called fake quantization and de-quantization operators, and the training framework is called simulated quantization.
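+
+Conceptually, a fake quantization operator quantizes and immediately de-quantizes, so its output stays in floating point but only takes quantized values; a minimal sketch (not the actual Fluid operator):
+
+```python
+import numpy as np
+
+def fake_quantize(x, bits=8):
+    # Quantize then immediately de-quantize: the result is still float32,
+    # but restricted to the values representable with `bits` bits.
+    n = 2 ** bits
+    m = np.max(np.abs(x))
+    q = np.round(x / m * (n - 1))
+    return (q / (n - 1) * m).astype(np.float32)
+```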
+
+#### Backward pass
+
+See Figure 3. The gradients are calculated from the dequantized weights and activations. All inputs and outputs are 32-bit floating point. In the weight updating process, the gradients are added to the original weights, not to the quantized or dequantized weights.
+
+
+
+Figure 3. Backward and weight updating in training with simulated quantization.
+
+
+So the quantization transpiler will change some inputs of the corresponding backward operators.
+
+### How to calculate quantization scale
+
+There are two strategies to calculate the quantization scale; we call them the dynamic and static strategies. The dynamic strategy calculates the quantization scale value in each iteration. The static strategy keeps the same quantization scale for different inputs.
+
+For weights, we apply the dynamic strategy during training, that is to say, the quantization scale will be recalculated in each iteration until the training is finished.
+
+For activations, the quantization scales are estimated during training, then used in inference. There are several different ways to estimate them:
+
+
+1. Calculate the mean of the maximum absolute values over a window.
+2. Calculate the max of the maximum absolute values over a window.
+3. Calculate the running mean of the maximum absolute value over a window, as follows (a code sketch appears below):
+
+ $$ V_t = (1 - k) * V + k * V_{t-1} $$
+
+ where $V$ is the maximum absolute value of the current batch, $V_t$ is the running mean value, and $k$ is a factor, such as 0.9.
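+
+A sketch of the third (running-mean) strategy, with the window handling omitted:
+
+```python
+import numpy as np
+
+def update_running_scale(running_scale, batch, k=0.9):
+    # V_t = (1 - k) * V + k * V_{t-1}, where V is the max abs value of the current batch
+    v = float(np.max(np.abs(batch)))
+    return (1 - k) * v + k * running_scale
+```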
diff --git a/doc/fluid/design/quantization/quantization_backward_and_optimization.png b/doc/fluid/design/quantization/quantization_backward_and_optimization.png
new file mode 100644
index 0000000000000000000000000000000000000000..84f8235ab87cb631992b691f8e05b9c0b6c93da2
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_backward_and_optimization.png differ
diff --git a/doc/fluid/design/quantization/quantization_equivalent_forward.png b/doc/fluid/design/quantization/quantization_equivalent_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..df49c864537c047c785da12d24893e54ce0a5341
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_equivalent_forward.png differ
diff --git a/doc/fluid/design/quantization/quantization_forward.png b/doc/fluid/design/quantization/quantization_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..0913f61621bb6533bcb10bd1d18120ccaaa96cff
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_forward.png differ
diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..7d39b8de1e6dc502ffea5f7882bd6a42b1ed6549
--- /dev/null
+++ b/doc/fluid/dev/api_doc_std_cn.md
@@ -0,0 +1,221 @@
+# API注释撰写标准
+
+- [API注释撰写标准](#api)
+ - [API注释模块](#api)
+ - [格式及示例](#)
+ - [完整示例](#)
+
+
+## API注释模块
+
+API文档须包含以下几个模块(排列顺序为文档撰写顺序):
+
+- Python API Definition
+
+ API的代码定义。
+
+- Function Description
+
+ API的功能描述。描述该API的含义、作用或对输入所做的操作,及参考文献和对应链接(如果有),必要时给出公式,并解释公式中关键变量的含义。
+
+- Args Description
+
+ API参数介绍。按代码定义中的参数顺序逐个介绍,介绍内容包含数据类型、默认值(如果有)、含义等。
+
+- Returns
+
+ API返回值介绍。介绍返回值含义,必要时给出对应的形状。若返回值为包含多个参数的tuple,则按顺序逐个介绍各参数。
+
+- Raises(如果有)
+
+ 可能抛出的异常或错误及可能的产生原因,当可能抛出多种异常或错误时应分条列出。
+
+- Note(如果有)
+
+ 注意事项。当有多条注意事项时,应分条列出。
+
+- Examples
+
+ API的使用示例。
+
+
+## 格式及示例
+
+API文档须使用reStructuredText格式撰写,该格式详情请参考[链接](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html)。API文档各模块的内容格式及示例如下(以下以fc为例进行说明):
+
+- Python API Definition
+
+ - 格式:
+
+ [Python API Definition]
+
+ - 示例
+
+ ```
+ fc(input,
+ size,
+ num_flatten_dims=1,
+ param_attr=None,
+ bias_attr=None,
+ act=None,
+ name=None,
+ main_program=None,
+ startup_program=None)
+ ```
+
+- Function Description
+
+ - 格式
+
+ 本模块应包含以下内容(排列顺序为文档撰写顺序):
+
+ [Function Description]
+
+ [Formula]
+
+ [Symbols' Descriptions if necessary]
+
+ [References if necessary]
+
+ - 示例
+
+ [Function Description]
+
+ ```
+ **Fully Connected Layer**
+
+ The fully connected layer can take multiple tensors as its inputs. It
+ creates a variable called weights for each input tensor, which represents
+ a fully connected weight matrix from each input unit to each output unit.
+ The fully connected layer multiplies each input tensor with its coresponding
+ weight to produce an output Tensor. If multiple input tensors are given,
+ the results of multiple multiplications will be sumed up. If bias_attr is
+ not None, a bias variable will be created and added to the output. Finally,
+ if activation is not None, it will be applied to the output as well.
+ ```
+
+ [Formula]
+
+ ```
+ This process can be formulated as follows:
+
+ .. math::
+
+ Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+ ```
+
+ [Symbols' Descriptions if necessary]
+
+ ```
+ In the above equation:
+
+ * :math:`N`: Number of the input.
+ * :math:`X_i`: The input tensor.
+ * :math:`W`: The weights created by this layer.
+ * :math:`b`: The bias parameter created by this layer (if needed).
+ * :math:`Act`: The activation function.
+ * :math:`Out`: The output tensor.
+ ```
+
+ [References if necessary]
+
+ 因fc没有必要列出的参考文献,故该内容省略。其他情况下需明确给出对应的参考文献和对应连接,以 layer_norm 为例:
+
+ ```
+ Refer to `Layer Normalization `_ for more details.
+ ```
+
+
+- Args Description
+
+ - 格式
+
+ \[Arg's Name\][(Data Type, Default Value)][Description]
+
+ - 示例
+
+ fc的部分参数注释如下:
+
+ ```
+ Args:
+ input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+ the input tensor(s) is at least 2.
+ param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+ parameters/weights of this layer.
+ name (str, default None): The name of this layer.
+ ```
+
+- Returns
+
+ - 格式
+
+ [Name][Shape]
+
+ - 示例
+
+ ```
+ Returns:
+ A tensor variable storing the transformation result.
+ ```
+
+ 当返回值为包含多个参数的tuple时,应按顺序逐个介绍各参数,以dynamic_lstm为例:
+
+ ```
+ Returns:
+ A tuple containing:
+ The hidden state of LSTM whose shape is (T X D).
+ The cell state of LSTM whose shape is (T X D).
+ ```
+
+- Raises
+
+ - 格式
+
+ [Exception Type][Condition]
+
+ - 示例
+
+ ```
+ Raises:
+ ValueError: If the rank of the input is less than 2.
+ ```
+
+- Note
+
+ - 格式
+
+ [Note]
+
+ - 示例
+
+ fc没有注意事项,故该模块省略不写。如有注意事项应明确给出,当有多条注意事项,须分条列出,以scaled\_dot\_product\_attention为例:
+
+ ```
+ Note:
+ 1. When num_heads > 1, three linear projections are learned respectively
+ to map input queries, keys and values into queries', keys' and values'.
+ queries', keys' and values' have the same shapes with queries, keys
+ and values.
+ 2. When num_heads == 1, scaled_dot_product_attention has no learnable
+ parameters.
+ ```
+
+- Examples
+
+ - 格式
+
+ \[Python Code Snipper]
+
+ - 示例
+
+ ```
+ Examples:
+ .. code-block:: python
+
+ data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+ fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+ ```
+
+## 完整示例
+
+fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。
diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..f175b219750d1c765a6a111c2ec3aa732fa46175
--- /dev/null
+++ b/doc/fluid/dev/api_doc_std_en.md
@@ -0,0 +1,227 @@
+# API Doc Standard
+
+- [API Doc Standard](#api-doc-standard)
+ - [API Doc Structure](#api-doc-structure)
+ - [Format and Examples](#format-and-examples)
+ - [Complete Example](#complete-example)
+
+
+## API Doc Structure
+
+API Doc should contain the following parts (please write them in order):
+
+- Python API Definition
+
+ The definition of API
+
+- Function Description
+
+ Description of the API's function.
+ The description includes: the meaning, purpose and operation on the input of the API, references and corresponding links (if any), the formula (if necessary), and explanations of the key variables in the formula.
+
+- Args Description
+
+ Description of API parameters.
+ Introduce parameters one by one according to the order in API definition.
+ The introduction includes: data type, default value (if any), meaning, etc.
+
+- Returns
+
+ Introduction of the API's returned value.
+ Introduce the meaning of the returned value, and provide the corresponding format if necessary.
+ If returned value is a tuple containing multiple parameters, then introduce parameters one by one in order.
+
+- Raises(if any)
+
+ Exceptions or errors that may occur, and their possible causes. If there is more than one possible exception or error, they should be listed in order.
+
+- Note(if any)
+
+ Matters needing attention. If there is more than one such matter, they should be listed in order.
+
+- Examples
+
+ Examples of how to use API.
+
+
+## Format and Examples
+
+API documentation must follow the reStructuredText format; please refer to [here](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html).
+The format and examples of each part of the API documentation are as follows (taking fc as an example):
+
+- Python API Definition
+
+ - Format
+
+ [Python API Definition]
+
+ - Example
+
+ ```
+ fc(input,
+ size,
+ num_flatten_dims=1,
+ param_attr=None,
+ bias_attr=None,
+ act=None,
+ name=None,
+ main_program=None,
+ startup_program=None)
+ ```
+
+- Function Description
+
+ - Format
+
+ This part contains (please write them in order):
+
+ [Function Description]
+
+ [Formula]
+
+ [Symbols' Descriptions if necessary]
+
+ [References if necessary]
+
+ - Example
+
+ [Function Description]
+
+ ```
+ **Fully Connected Layer**
+
+ The fully connected layer can take multiple tensors as its inputs. It
+ creates a variable called weights for each input tensor, which represents
+ a fully connected weight matrix from each input unit to each output unit.
+ The fully connected layer multiplies each input tensor with its corresponding
+ weight to produce an output Tensor. If multiple input tensors are given,
+ the results of multiple multiplications will be summed up. If bias_attr is
+ not None, a bias variable will be created and added to the output. Finally,
+ if activation is not None, it will be applied to the output as well.
+ ```
+
+ [Formula]
+
+ ```
+ This process can be formulated as follows:
+
+ .. math::
+
+ Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+ ```
+
+ [Symbols' Descriptions if necessary]
+
+ ```
+ In the above equation:
+
+ * :math:`N`: Number of the input.
+ * :math:`X_i`: The input tensor.
+ * :math:`W`: The weights created by this layer.
+ * :math:`b`: The bias parameter created by this layer (if needed).
+ * :math:`Act`: The activation function.
+ * :math:`Out`: The output tensor.
+ ```
+
+ [References if necessary]
+
+ Since there is no reference needed for fc, we omit it here. Under other circumstances, please provide an explicit reference and link; take layer_norm for example:
+
+ ```
+ Refer to `Layer Normalization `_ for more details.
+ ```
+
+
+- Args Description
+
+ - Format
+
+ \[Arg's Name\][(Data Type, Default Value)][Description]
+
+ - Example
+
+ Part of the fc parameters are documented as follows:
+
+ ```
+ Args:
+ input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+ the input tensor(s) is at least 2.
+ param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+ parameters/weights of this layer.
+ name (str, default None): The name of this layer.
+ ```
+
+- Returns
+
+ - Format
+
+ [Name][Shape]
+
+ - Example
+
+ ```
+ Returns:
+ A tensor variable storing the transformation result.
+ ```
+
+ When the returned value is a tuple containing multiple parameters, please introduce every parameter in order; take dynamic_lstm for example:
+
+ ```
+ Returns:
+ A tuple containing:
+ The hidden state of LSTM whose shape is (T X D).
+ The cell state of LSTM whose shape is (T X D).
+ ```
+
+- Raises
+
+ - Format
+
+ [Exception Type][Condition]
+
+ - Example
+
+ ```
+ Raises:
+ ValueError: If the rank of the input is less than 2.
+ ```
+
+- Note
+
+ - Format
+
+ [Note]
+
+ - Example
+
+ There is no Note for fc, so we omit this part. If there is any note, please write it clearly. If there is more than one note, please list them in order. Take scaled\_dot\_product\_attention for example:
+
+ ```
+ Note:
+ 1. When num_heads > 1, three linear projections are learned respectively
+ to map input queries, keys and values into queries', keys' and values'.
+ queries', keys' and values' have the same shapes with queries, keys
+ and values.
+ 2. When num_heads == 1, scaled_dot_product_attention has no learnable
+ parameters.
+ ```
+
+- Examples
+
+ - Format
+
+ \[Python Code Snippet]
+
+ - Example
+
+ ```
+ Examples:
+ .. code-block:: python
+
+ data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+ fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+ ```
+
+## Complete Example
+
+For the complete example of fc, please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py).
diff --git a/doc/fluid/dev/ci_build_whl.png b/doc/fluid/dev/ci_build_whl.png
new file mode 100644
index 0000000000000000000000000000000000000000..232762b82a9ae3e979a1f38a7beb715c87438f40
Binary files /dev/null and b/doc/fluid/dev/ci_build_whl.png differ
diff --git a/doc/fluid/dev/contribute_to_paddle_cn.md b/doc/fluid/dev/contribute_to_paddle_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..3244eedf918b93f9351258f1218dfb2d507c1a9c
--- /dev/null
+++ b/doc/fluid/dev/contribute_to_paddle_cn.md
@@ -0,0 +1,243 @@
+# 如何贡献代码
+
+我们真诚地感谢您的贡献,欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
+
+## 代码要求
+- 代码注释请遵守 [Doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
+- 确保编译器选项 `WITH_STYLE_CHECK` 已打开,并且编译能通过代码样式检查。
+- 所有代码必须具有单元测试。
+- 通过所有单元测试。
+- 请遵守[提交代码的一些约定](#提交代码的一些约定)。
+
+以下教程将指导您提交代码。
+## [Fork](https://help.github.com/articles/fork-a-repo/)
+
+跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页,然后单击 `Fork` 按钮,生成自己目录下的仓库,比如 。
+
+## 克隆(Clone)
+
+将远程仓库 clone 到本地:
+
+```bash
+➜ git clone https://github.com/USERNAME/Paddle
+➜ cd Paddle
+```
+
+
+## 创建本地分支
+
+Paddle 目前使用[Git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发,测试,发行和维护,具体请参考 [Paddle 分支规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范)。
+
+所有的 feature 和 bug fix 的开发工作都应该在一个新的分支上完成,一般从 `develop` 分支上创建新分支。
+
+使用 `git checkout -b` 创建并切换到新分支。
+
+```bash
+➜ git checkout -b my-cool-stuff
+```
+
+值得注意的是,在 checkout 之前,需要保持当前分支目录 clean,否则会把 untracked 的文件也带到新分支上,这可以通过 `git status` 查看。
+
+## 使用 `pre-commit` 钩子
+
+Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码(C++,Python),在提交(commit)前自动检查一些基本事宜(如每个文件只有一个 EOL,Git 中不要添加大文件等)。
+
+`pre-commit`测试是 Travis-CI 中单元测试的一部分,不满足钩子的 PR 不能被提交到 Paddle,首先安装并在当前目录运行它:
+
+```bash
+➜ pip install pre-commit
+➜ pre-commit install
+```
+
+Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式,请确保 `clang-format` 版本在 3.8 以上。
+
+注:通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同的,Paddle 开发人员使用的是`pip install pre-commit`。
+
+## 开始开发
+
+在本例中,我删除了 README.md 中的一行,并创建了一个新文件。
+
+通过 `git status` 查看当前状态,这会提示当前目录的一些变化,同时也可以通过 `git diff` 查看文件具体被修改的内容。
+
+```bash
+➜ git status
+On branch test
+Changes not staged for commit:
+ (use "git add ..." to update what will be committed)
+ (use "git checkout -- ..." to discard changes in working directory)
+
+ modified: README.md
+
+Untracked files:
+ (use "git add ..." to include in what will be committed)
+
+ test
+
+no changes added to commit (use "git add" and/or "git commit -a")
+```
+
+## 构建和测试
+
+编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:latest-dev`来代替。
+
+如要build这个开发镜像,在源码目录树的根目录中运行:
+
+```bash
+➜ docker build -t paddle:latest-dev .
+```
+
+随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以:
+
+```bash
+➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev
+```
+
+这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`,并且输出一个 `./build/paddle.deb`文件之外,还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*(`paddle:prod`):
+
+```bash
+➜ docker build -t paddle:prod -f build/Dockerfile .
+```
+
+如果要运行所有的单元测试,可以用如下命令:
+
+```bash
+➜ docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
+```
+
+关于构建和测试的更多信息,请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。
+
+## 提交(commit)
+
+接下来我们取消对 README.md 文件的改变,然后提交新添加的 test 文件。
+
+```bash
+➜ git checkout -- README.md
+➜ git status
+On branch test
+Untracked files:
+ (use "git add ..." to include in what will be committed)
+
+ test
+
+nothing added to commit but untracked files present (use "git add" to track)
+➜ git add test
+```
+
+Git 每次提交代码,都需要写提交说明,这可以让其他人知道这次提交做了哪些改变,这可以通过`git commit` 完成。
+
+```bash
+➜ git commit
+CRLF end-lines remover...............................(no files to check)Skipped
+yapf.................................................(no files to check)Skipped
+Check for added large files..............................................Passed
+Check for merge conflicts................................................Passed
+Check for broken symlinks................................................Passed
+Detect Private Key...................................(no files to check)Skipped
+Fix End of Files.....................................(no files to check)Skipped
+clang-formater.......................................(no files to check)Skipped
+[my-cool-stuff c703c041] add test file
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 233
+```
+
+## 保持本地仓库最新
+
+在准备发起 Pull Request 之前,需要同步原仓库()最新的代码。
+
+首先通过 `git remote` 查看当前远程仓库的名字。
+
+```bash
+➜ git remote
+origin
+➜ git remote -v
+origin https://github.com/USERNAME/Paddle (fetch)
+origin https://github.com/USERNAME/Paddle (push)
+```
+
+这里 origin 是我们 clone 的远程仓库的名字,也就是自己用户名下的 Paddle,接下来我们创建一个原始 Paddle 仓库的远程主机,命名为 upstream。
+
+```bash
+➜ git remote add upstream https://github.com/PaddlePaddle/Paddle
+➜ git remote
+origin
+upstream
+```
+
+获取 upstream 的最新代码并更新当前分支。
+
+```bash
+➜ git fetch upstream
+➜ git pull upstream develop
+```
+
+## Push 到远程仓库
+
+将本地的修改推送到 GitHub 上,也就是 https://github.com/USERNAME/Paddle。
+
+```bash
+# 推送到远程仓库 origin 的 my-cool-stuff 分支上
+➜ git push origin my-cool-stuff
+```
+
+## 建立 Issue 并完成 Pull Request
+
+建立一个 Issue 描述问题,并记录它的编号。
+
+切换到所建分支,然后点击 `New pull request`。
+
+
+
+选择目标分支:
+
+
+
+在 PR 的描述说明中,填写 `resolve #Issue编号` 可以在这个 PR 被 merge 后,自动关闭对应的 Issue,具体请见 。
+
+接下来等待 review,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。
+
+## 删除远程分支
+
+在 PR 被 merge 进主仓库后,我们可以在 PR 的页面删除远程仓库的分支。
+
+
+
+也可以使用 `git push origin :分支名` 删除远程分支,如:
+
+```bash
+➜ git push origin :my-cool-stuff
+```
+
+## 删除本地分支
+
+最后,删除本地分支。
+
+```bash
+# 切换到 develop 分支
+➜ git checkout develop
+
+# 删除 my-cool-stuff 分支
+➜ git branch -D my-cool-stuff
+```
+
+至此,我们就完成了一次代码贡献的过程。
+
+## 提交代码的一些约定
+
+为了使评审人在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定:
+
+1. 请保证Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,评审人一般不做评审。
+2. 提交PUll Request前:
+ - 请注意commit的数量:
+ - 原因:如果仅仅修改一个文件但提交了十几个commit,每个commit只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改,且不排除commit之间的修改存在相互覆盖的情况。
+ - 建议:每次提交时,保持尽量少的commit,可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit,可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。
+ - 请注意每个commit的名称:应能反映当前commit的内容,不能太随意。
+3. 如果解决了某个Issue的问题,请在该PUll Request的**第一个**评论框中加上:`fix #issue_number`,这样当该PUll Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
+
+此外,在回复评审人意见时,请您遵守以下约定:
+
+1. 评审人的每个意见都必须回复(这是开源社区的基本礼貌,别人帮了忙,应该说谢谢):
+ - 对评审意见同意且按其修改完的,给个简单的`Done`即可;
+ - 对评审意见不同意的,请给出您自己的反驳理由。
+2. 如果评审意见比较多:
+ - 请给出总体的修改情况。
+ - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复,而非直接回复的方式。原因是每个回复都会发送一封邮件,会造成邮件灾难。
diff --git a/doc/fluid/dev/contribute_to_paddle_en.md b/doc/fluid/dev/contribute_to_paddle_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..b878f37a5b8e807e5aa346e0074a741f2f8b6cc5
--- /dev/null
+++ b/doc/fluid/dev/contribute_to_paddle_en.md
@@ -0,0 +1,162 @@
+# Contribute Code
+
+You are welcome to contribute to project PaddlePaddle. To contribute to PaddlePaddle, you have to agree with the
+[PaddlePaddle Contributor License Agreement](https://gist.github.com/wangkuiyi/0c22c7b1bd3bb7eb27d76f85c3a3e329).
+
+We sincerely appreciate your contribution. This document explains our workflow and work style.
+
+## Workflow
+
+PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/). The following steps guide usual contributions.
+
+1. Fork
+
+ Our development community has been growing fast; it doesn't make sense for everyone to write into the official repo. So, please file Pull Requests from your fork. To make a fork, just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/).
+
+1. Clone
+
+ To make a copy of your fork to your local computers, please run
+
+ ```bash
+ git clone https://github.com/your-github-account/paddle
+ cd paddle
+ ```
+
+1. Create the local feature branch
+
+ For daily works like adding a new feature or fixing a bug, please open your feature branch before coding:
+
+ ```bash
+ git checkout -b my-cool-stuff
+ ```
+
+1. Commit
+
+ Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands:
+
+ ```bash
+ pip install pre-commit
+ pre-commit install
+ ```
+
+ Our pre-commit configuration requires clang-format 3.8 for auto-formatting C/C++ code and yapf for Python.
+
+ Once installed, `pre-commit` checks the style of code and documentation in every commit. We will see something like the following when you run `git commit`:
+
+ ```
+ ➜ git commit
+ CRLF end-lines remover...............................(no files to check)Skipped
+ yapf.................................................(no files to check)Skipped
+ Check for added large files..............................................Passed
+ Check for merge conflicts................................................Passed
+ Check for broken symlinks................................................Passed
+ Detect Private Key...................................(no files to check)Skipped
+ Fix End of Files.....................................(no files to check)Skipped
+ clang-formater.......................................(no files to check)Skipped
+ [my-cool-stuff c703c041] add test file
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 233
+ ```
+
+ NOTE: The `yapf` installed by `pip install pre-commit` differs slightly from the one installed by `conda install -c conda-forge pre-commit`. Paddle developers use `pip install pre-commit`.
+
+1. Build and test
+
+ Users can build PaddlePaddle natively on Linux and Mac OS X, but to unify the build environment and make debugging easier, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).
+
+1. Keep pulling
+
+ An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others' work early, and smaller conflicts are easier to resolve.
+
+ ```bash
+ git remote add upstream https://github.com/PaddlePaddle/Paddle
+ git pull upstream develop
+ ```
+
+1. Push and file a pull request
+
+ You can "push" your local work into your forked repo:
+
+ ```bash
+ git push origin my-cool-stuff
+ ```
+
+ The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one.
+
+ To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/).
+
+ If your change fixes an issue, please write ["Fixes #issue_number"](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request. GitHub will close the issue automatically when the owners merge your pull request.
+
+ Please remember to specify some reviewers for your pull request. If you don't know who the right reviewers are, please follow GitHub's recommendations.
+
+
+1. Delete local and remote branches
+
+ To keep your local workspace and your fork clean, you might want to remove merged branches:
+
+ ```bash
+ git push origin :my-cool-stuff
+ git checkout develop
+ git pull upstream develop
+ git branch -d my-cool-stuff
+ ```
+
+### Code Review
+
+- Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email. Please do this after your pull request passes the CI.
+
+- Please reply to every comment from your reviewers. If you follow a suggestion, a simple "Done" is enough; otherwise, please give your reason.
+
+- If you don't want your reviewers to be overwhelmed by email notifications, you can reply to their comments [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/).
+
+- Reduce unnecessary commits. Some developers commit very often. It is recommended to fold a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`; see the sketch below.
+
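+A minimal sketch of this amend-based flow (assuming your feature branch is `my-cool-stuff`; the file path is only an example):
+
+```bash
+git add path/to/changed_file
+git commit --amend --no-edit      # fold the fix into the previous commit
+git push -f origin my-cool-stuff  # only needed if the branch was already pushed
+```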
+
+## Coding Standard
+
+### Code Style
+
+Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html).
+
+Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/).
+
+Our build process helps to check the code style. In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default, so the style check runs as part of the build.
+
+Please install pre-commit, which automatically reformats the changes to C/C++ and Python code whenever we run `git commit`. To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43).
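+
+For example, to run all the configured checks locally over the whole tree (assuming `pre-commit` is already installed as described above):
+
+```bash
+# run every configured hook against all files, not only the staged ones
+pre-commit run -a
+```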
+
+### Unit Tests
+
+Please remember to add related unit tests.
+
+- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md).
+
+- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/).
+
+
+### Writing Logs
+
+We use [glog](https://github.com/google/glog) for logging in our C/C++ code.
+
+For general information, please use `LOG`. For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose). The reasoning is explained [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ).
+
+`VLOG` requires a *verbose level* parameter. For example:
+
+```c++
+VLOG(3) << "Operator FC is taking " << num_inputs << " inputs.";
+```
+
+When we run a PaddlePaddle application or test, we can specify a verbose threshold. For example:
+
+```bash
+GLOG_vmodule=buddy_allocator=2 \
+GLOG_v=10 \
+python \
+../python/paddle/v2/framework/tests/test_recurrent_op.py
+```
+
+This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see the above example VLOG message, which is at level 3. This suggests that we output overall messages at lower verbose levels, so that they display with higher probability. When coding C++, please follow the verbose level convention below:
+
+- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework)
+- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)
+- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform)
+- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math)
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..37e608160db0ad5a92297987937bbbfa8f842ea8
--- /dev/null
+++ b/doc/fluid/dev/index_cn.rst
@@ -0,0 +1,16 @@
+开发标准
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ contribute_to_paddle_cn.md
+ write_docs_cn.md
+ api_doc_std_cn.md
+ new_op_cn.md
+ new_op_kernel.md
+ use_eigen_cn.md
+ name_convention.md
+ support_new_device.md
+ releasing_process_cn.md
+ op_markdown_format.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d7f83035010f13c30514673ecbee301f194dc175
--- /dev/null
+++ b/doc/fluid/dev/index_en.rst
@@ -0,0 +1,16 @@
+Development
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ contribute_to_paddle_en.md
+ write_docs_en.md
+ api_doc_std_en.md
+ new_op_en.md
+ new_op_kernel.md
+ use_eigen_en.md
+ name_convention.md
+ support_new_device.md
+ releasing_process_en.md
+ op_markdown_format.md
diff --git a/doc/fluid/dev/name_convention.md b/doc/fluid/dev/name_convention.md
new file mode 100644
index 0000000000000000000000000000000000000000..6b4244d0f506c8cd6c08739141eabad27c581ca7
--- /dev/null
+++ b/doc/fluid/dev/name_convention.md
@@ -0,0 +1,65 @@
+# Operator's Parameter Name Convention
+
+To make the operator documentation clearer, we recommend that operator names obey the following conventions.
+
+## OpProtoMaker names
+
+When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L170) (TODO: OpProtoMaker Doc) needs to be defined. All the Inputs/Outputs and Attributes are written into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L61), which is then used by the client language to create the operator.
+
+- Input/Output.
+  - Input/Output names follow **CamelCase**, e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Inputs and outputs behave much like variables, so we prefer meaningful English words.
+  - If an operator's Inputs/Outputs are plain mathematical tensors that do not match any meaningful word, input names should start from `X`, e.g. `X`, `Y`, and output names should start from `Out`, e.g. `Out`. This rule keeps operators with few inputs/outputs consistent with each other.
+
+- Attribute.
+  - Attribute names follow **snake_case**, e.g. `x`, `y`, `axis`, `rowwise_matrix`. Again, prefer meaningful English words.
+
+- Comments.
+  - Input/Output/Attr comments follow the format **(type, default value) usage**, describing which type the item can be and how it is used in the operator, e.g. the attribute `"gamma"` in Accumulator: `(float, default 1.0) Accumulation multiplier`.
+  - Operator comments use the format `R"DOC(your comment here)DOC"`. Explain the input/output of the operator first. If the operator performs a mathematical calculation, write the equation in the comment, e.g. `Out = X + Y`.
+
+- Order.
+  - Follow the order Input/Output, then Attribute, then Comments. See the example in the Best Practice section below.
+
+## Best Practice
+
+Here we give some examples to show how these rules will be used.
+
+- The operator has one input and one output, e.g. `relu`: inputs `X`, outputs `Out`.
+
+- The operator has two inputs and one output, e.g. `rowwise_add`: inputs `X`, `Y`, outputs `Out`.
+
+- The operator has an attribute, e.g. `cosine`: input `X`, attribute `axis`, outputs `Out`.
+
+ We give a full example of the Accumulate operator below.
+
+```c++
+class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+ AccumulateOpMaker(OpProto *proto,
+ OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor.
+ If the output size is not the same as input size,
+ the output tensor is first reshaped and initialized to zero, and only then, accumulation is done.");
+ AddOutput("Out", "(Tensor) Accumulated output tensor");
+ AddAttr("gamma", "(float, default 1.0) Accumulation multiplier").SetDefault(1.0f);
+ AddComment(R"DOC(
+Accumulate Operator.
+
+This operator accumulates the input tensor to the output tensor. If the
+output tensor already has the right size, we add to it; otherwise, we first
+initialize the output tensor to all zeros, and then do accumulation. Any
+further calls to the operator, given that no one else fiddles with the output
+in the interim, will do simple accumulations.
+
+Accumulation is done as follows:
+
+Out = 1*X + gamma*Out
+
+where X is the input tensor, Out is the output tensor and gamma is the multiplier
+argument.
+
+)DOC");
+ }
+};
+```
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..ff7408111fa20a7a6a3a2fe9f9ba20835918f399
--- /dev/null
+++ b/doc/fluid/dev/new_op_cn.md
@@ -0,0 +1,435 @@
+# 如何写新的Operator
+
+ - [概念简介](#概念简介)
+ - [实现C++类](#实现c类)
+ - [定义ProtoMaker类](#定义protomaker类)
+ - [定义Operator类](#定义operator类)
+ - [定义OpKernel类](#定义opkernel类)
+ - [注册Operator](#注册operator)
+ - [编译](#编译)
+ - [绑定Python](#绑定python)
+ - [实现单元测试](#实现单元测试)
+ - [前向Operator单测](#前向operator单测)
+ - [反向Operator单测](#反向operator单测)
+ - [编译和执行](#编译和执行)
+ - [注意事项](#注意事项)
+
+
+## 概念简介
+
+简单介绍需要用到的基类,详细介绍请参考设计文档。
+
+- `framework::OperatorBase`: Operator(简写,Op)基类。
+- `framework::OpKernel`: Op计算函数的基类,称作Kernel。
+- `framework::OperatorWithKernel`:继承自OperatorBase,这类Op带有计算函数,称作有Kernel的Op。
+- `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成。
+
+依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下:
+
+
+内容 | 定义位置
+:--------------- | :----------------------------------------------
+OpProtoMake定义 | `.cc` 文件,Backward Op不需要定义OpProtoMake
+Op定义 | `.cc` 文件
+Kernel实现 | CPU、CUDA共享Kernel实现在 `.h` 文件中,否则,CPU 实现在 `.cc` 文件中,CUDA 实现在 `.cu` 文件中
+注册Op | Op注册实现在 `.cc` 文件;Kernel注册CPU实现在 `.cc` 文件中,CUDA实现在 `.cu` 文件中
+
+
+实现新的op都添加至目录[paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下,文件命名以 `*_op.h`(如有)、`*_op.cc`、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
+
+
+下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
+
+
+## 实现C++类
+
+
+### 定义ProtoMaker类
+
+矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。
+
+首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释:
+
+```cpp
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("X", "(Tensor), 2D tensor of size (M x K)");
+ AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
+ AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
+ AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+ }
+};
+```
+
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)继承自`framework::OpProtoAndCheckerMaker`,构造函数含有2个参数:
+
+ - `framework::OpProto`:存储Op的输入输出和参数属性,将用于Python API接口的生成。
+ - `framework::OpAttrChecker`:用于检查参数属性的合法性。
+
+构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。
+
+上面的代码在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md)。
+
+
+再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55)为例:
+
+```cpp
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("X", "(Tensor) Input tensor of scale operator.");
+ AddOutput("Out", "(Tensor) Output tensor of scale operator.");
+ AddComment(R"DOC(
+Scale operator
+$$Out = scale*X$$
+)DOC");
+ AddAttr("scale",
+ "(float, default 1.0)"
+ "The scaling factor of the scale operator.")
+ .SetDefault(1.0);
+ }
+};
+```
+
+这个例子里通过`AddAttr<AttrType>("scale", "...").SetDefault(1.0);`增加`scale`系数作为参数属性,并且设置默认值为1.0。
+
+### 定义GradProtoMaker类
+每个Op都必须有一个对应的GradProtoMaker。若未定制对应前向Op的GradProtoMaker,fluid提供了DefaultGradProtoMaker,默认注册会使用全部输入输出,包括Input, Output, Output@Grad等,使用不需要的变量会造成显存浪费。
+下面示例定义了ScaleOp的GradProtoMaker。
+
+```cpp
+class ScaleGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+ using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ std::unique_ptr<framework::OpDesc> Apply() const override {
+ auto *grad_op = new framework::OpDesc();
+ grad_op->SetType("scale");
+ grad_op->SetInput("X", OutputGrad("Out"));
+ grad_op->SetOutput("Out", InputGrad("X"));
+ grad_op->SetAttr("scale", GetAttr("scale"));
+ return std::unique_ptr<framework::OpDesc>(grad_op);
+ }
+};
+```
+
+### 定义Operator类
+
+下面实现了MulOp的定义:
+
+```cpp
+class MulOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ void InferShape(const framework::InferShapeContext &ctx) const override {
+ auto dim0 = ctx.Input<Tensor>("X")->dims();
+ auto dim1 = ctx.Input<Tensor>("Y")->dims();
+ PADDLE_ENFORCE_EQ(dim0.size(), 2,
+ "input X(%s) should be a tensor with 2 dims, a matrix",
+ ctx.op_.Input("X"));
+ PADDLE_ENFORCE_EQ(dim1.size(), 2,
+ "input Y(%s) should be a tensor with 2 dims, a matrix",
+ ctx.op_.Input("Y"));
+ PADDLE_ENFORCE_EQ(
+ dim0[1], dim1[0],
+ "First matrix's width must be equal with second matrix's height.");
+ ctx.Output("Out")->Resize({dim0[0], dim1[1]});
+ }
+};
+```
+
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员:
+
+```cpp
+using framework::OperatorWithKernel::OperatorWithKernel;
+```
+
+这句表示使用基类`OperatorWithKernel`的构造函数,也可写成:
+
+```cpp
+MulOp(const std::string &type, const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : OperatorWithKernel(type, inputs, outputs, attrs) {}
+```
+
+还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是:
+
+ - 1). 做检查, 尽早报错:检查输入数据维度、类型等是否合法。
+ - 2). 设置输出Tensor的形状。
+
+通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`文件中。
+
+### 定义OpKernel类
+
+`MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数:
+
+- `typename DeviceContext`: 表示设备类型,不同设备(CPU、CUDA)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
+
+- `typename T` : 表示数据类型,如`float`, `double`等。
+
+需要为`MulKernel`类重写`Compute`接口。
+- `Compute`接受一个输入参数:`const framework::ExecutionContext& context`。
+- 与`InferShapeContext`相比,`ExecutionContext`增加了设备类型,同样可获取到输入输出和属性参数。
+- `Compute`函数里实现`OpKernel`的具体计算逻辑。
+
+下面是 `MulKernel` `Compute`的实现:
+
+ ```cpp
+ template <typename DeviceContext, typename T>
+ class MulKernel : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext& context) const override {
+ auto* X = context.Input<Tensor>("X");
+ auto* Y = context.Input