Merge branch 'develop' of https://github.com/PaddlePaddle/FluidDoc into develop

9aa6e363 · seiriosPlus · 4f522a6a · e263e1c0 · 9aa6e363 · 9aa6e363
1000 changed file
--- a/.gitignore
+++ b/.gitignore
+*.DS_Store
 .vscode/
 /doc/fluid/menu.zh.json
 /doc/fluid/menu.en.json

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
+-   repo: https://github.com/pre-commit/mirrors-yapf.git
+    sha: v0.16.0
+    hooks:
+    -   id: yapf
+        files: \.py$
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    sha: a11d9314b22d8f8c7556443875b731ef05965464
+    hooks:
+    -   id: check-merge-conflict
+    -   id: check-symlinks
+    -   id: detect-private-key
+        files: (?!.*paddle)^.*$
+    -   id: end-of-file-fixer
+        files: \.md$
+    -   id: trailing-whitespace
+        files: \.md$
+-   repo: https://github.com/Lucas-C/pre-commit-hooks
+    sha: v1.0.1
+    hooks:
+    -   id: forbid-crlf
+        files: \.md$
+    -   id: remove-crlf
+        files: \.md$
+    -   id: forbid-tabs
+        files: \.md$
+    -   id: remove-tabs
+        files: \.md$
+-   repo: https://github.com/reyoung/pre-commit-hooks-jinja-compile.git
+    sha: 4a369cc72a4a2b8d3813ab8cc17abb5f5b21ef6c
+    hooks:
+    -   id: convert-jinja2-into-html
+        # The argument means replace filename from pattern `.*/([^/]*)\.tmpl` to `\1`
+        args: ['--filename_pattern=.*/([^/]*)\.tmpl', '--filename_repl=\1']
+-   repo: local
+    hooks:
+    -   id: convert-markdown-into-html
+        name: convert-markdown-into-html
+        description: Convert README.md into index.html and README.cn.md into index.cn.html
+        entry: python .pre-commit-hooks/convert_markdown_into_html.py
+        language: system
+        files: .+README(\.cn)?\.md$
+
--- a/.pre-commit-hooks/convert_markdown_into_html.py
+++ b/.pre-commit-hooks/convert_markdown_into_html.py
+import argparse
+import re
+import sys
+
+# HTML prologue written in front of each markdown file's content. It pulls in
+# MathJax, marked.js, highlight.js and the stylesheets, then opens the hidden
+# <div id="markdown"> into which the raw markdown text is written.
+HEAD = """
+<html>
+<head>
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSsymbols.js", "TeX/AMSmath.js"],
+    jax: ["input/TeX", "output/HTML-CSS"],
+    tex2jax: {
+      inlineMath: [ ['$','$'] ],
+      displayMath: [ ['$$','$$'] ],
+      processEscapes: true
+    },
+    "HTML-CSS": { availableFonts: ["TeX"] }
+  });
+  </script>
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js" async></script>
+  <script type="text/javascript" src="../.tools/theme/marked.js">
+  </script>
+  <link href="http://cdn.bootcss.com/highlight.js/9.9.0/styles/darcula.min.css" rel="stylesheet">
+  <script src="http://cdn.bootcss.com/highlight.js/9.9.0/highlight.min.js"></script>
+  <link href="http://cdn.bootcss.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" rel="stylesheet">
+  <link href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" rel="stylesheet">
+  <link href="../.tools/theme/github-markdown.css" rel='stylesheet'>
+</head>
+<style type="text/css" >
+.markdown-body {
+    box-sizing: border-box;
+    min-width: 200px;
+    max-width: 980px;
+    margin: 0 auto;
+    padding: 45px;
+}
+</style>
+
+
+<body>
+
+<div id="context" class="container-fluid markdown-body">
+</div>
+
+<!-- This block will be replaced by each markdown file content. Please do not change lines below.-->
+<div id="markdown" style='display:none'>
+"""
+
+# HTML epilogue written after each markdown file's content: closes the hidden
+# markdown <div>, renders its text into the "context" <div> with marked.js and
+# closes the document.
+TAIL = """
+</div>
+<!-- You can change the lines below now. -->
+
+<script type="text/javascript">
+marked.setOptions({
+  renderer: new marked.Renderer(),
+  gfm: true,
+  breaks: false,
+  smartypants: true,
+  highlight: function(code, lang) {
+    code = code.replace(/&amp;/g, "&")
+    code = code.replace(/&gt;/g, ">")
+    code = code.replace(/&lt;/g, "<")
+    code = code.replace(/&nbsp;/g, " ")
+    return hljs.highlightAuto(code, [lang]).value;
+  }
+});
+document.getElementById("context").innerHTML = marked(
+        document.getElementById("markdown").innerHTML)
+</script>
+</body>
+</html>
+"""
+
+
+def convert_markdown_into_html(argv=None):
+    """Wrap each markdown file in the HTML template, next to the source file.
+
+    For every path given in ``argv`` (argparse falls back to the command line
+    when ``argv`` is None) the output file name is the input file name with a
+    trailing ``.md`` replaced by ``.html`` and ``README`` replaced by
+    ``index``. The output content is ``HEAD``, the markdown text, then
+    ``TAIL``.
+
+    Returns:
+        int: always 0; I/O errors propagate as exceptions.
+    """
+    import os  # local import: nothing else in this module needs it
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filenames', nargs='*', help='Filenames to fix')
+    args = parser.parse_args(argv)
+
+    for filename in args.filenames:
+        directory, basename = os.path.split(filename)
+        # Rename only the file itself: a directory whose name happens to
+        # contain "README" must not be rewritten, or the output would land in
+        # a directory that does not exist.
+        html_name = re.sub(r"README", "index",
+                           re.sub(r"\.md$", ".html", basename))
+        # Read the source first so that an unreadable markdown file does not
+        # leave a truncated HTML file behind.
+        with open(filename) as source:
+            markdown = source.read()
+        with open(os.path.join(directory, html_name), "w") as output:
+            output.write(HEAD)
+            output.write(markdown)
+            output.write(TAIL)
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(convert_markdown_into_html())
--- a/.pre-commit-hooks/convert_markdown_into_ipynb.sh
+++ b/.pre-commit-hooks/convert_markdown_into_ipynb.sh
+#!/bin/sh
+# Convert every markdown file given on the command line into a notebook with
+# the same base name and an .ipynb extension. Stops at the first failure.
+# Arguments and paths are quoted so file names containing spaces survive.
+for file in "$@" ; do
+    if ! markdown-to-ipynb < "$file" > "${file%.*}.ipynb"; then
+        echo >&2 "markdown-to-ipynb $file error"
+        exit 1
+    fi
+done
+
--- a/ci_scripts/api_white_list.txt
+++ b/ci_scripts/api_white_list.txt
+paddle/fluid/DistributeTranspiler_cn.rst
+paddle/fluid/DistributeTranspilerConfig_cn.rst
+paddle/fluid/transpiler/HashName_cn.rst
+paddle/fluid/memory_optimize_cn.rst
+paddle/fluid/release_memory_cn.rst
+paddle/optimizer/Dpsgd_cn.rst
+paddle/reader/ComposeNotAligned_cn.rst
+paddle/fluid/layers/scatter_cn.rst
+paddle/tensor/manipulation/scatter_cn.rst
+paddle/distributed/fleet/Fleet_cn.rst
--- a/ci_scripts/check_api_cn.sh
+++ b/ci_scripts/check_api_cn.sh
+#!/bin/bash
+
+# For every file changed relative to upstream/$BRANCH: if it contains a
+# "code-block", is a Chinese API document (doc/paddle/api/paddle/..._cn.rst)
+# and is not listed in api_white_list.txt, run its sample code.
+# Reads BRANCH and DIR_PATH from the environment.
+git_files=$(git diff --numstat "upstream/${BRANCH}" | awk '{print $NF}')
+
+# Deliberately unquoted: the list is split on whitespace into one path per word.
+for file in ${git_files}; do
+  if grep "code-block" "../${file}"; then
+    if echo "${file}" | grep "doc/paddle/api/paddle/.*_cn.rst"; then
+      api_file=$(echo "${file}" | sed 's#doc/paddle/api/##g')
+      # -F: the path is a literal string, so "." must not act as a wildcard.
+      if ! grep -Fw "${api_file}" "${DIR_PATH}/api_white_list.txt"; then
+        if ! python chinese_samplecode_processor.py "../${file}"; then
+          echo "chinese sample code failed"
+          exit 5
+        fi
+      fi
+    fi
+  fi
+done
+
--- a/ci_scripts/check_code.sh
+++ b/ci_scripts/check_code.sh
+#!/usr/bin/env bash
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#=================================================
+#                   Utils
+#=================================================
+
+set -ex
+
+# Compare against the develop branch unless the caller picked another one.
+if [ -z "${BRANCH}" ]; then
+    BRANCH="develop"
+fi
+
+BENCHMARK_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/.." && pwd )"
+echo "${BENCHMARK_ROOT}"
+
+function prepare_env(){
+    # Install the pinned versions of pre-commit, pylint and pytest
+    pip install pre-commit==1.21 pylint==1.9.5 pytest==4.6.9
+}
+
+function abort(){
+    echo "Your change doesn't follow benchmark's code style." 1>&2
+    echo "Please use pre-commit to check what is wrong." 1>&2
+    exit 1
+}
+
+
+# Run the pre-commit hooks on every file changed relative to upstream/$BRANCH.
+function check_style(){
+    # While the check runs, every exit of the script goes through abort;
+    # the trap is cleared again once the check has passed.
+    trap 'abort' 0
+    pre-commit install
+    commit_files=on
+    for file_name in $(git diff --numstat "upstream/${BRANCH}" | awk '{print $NF}'); do
+        if ! pre-commit run --files "../${file_name}"; then
+            # Show what the hooks changed, then keep checking the other files.
+            git diff
+            commit_files=off
+        fi
+    done
+    if [ "${commit_files}" == 'off' ]; then
+        echo "code format error"
+        exit 1
+    fi
+    trap 0
+}
+
+prepare_env
+check_style
--- a/ci_scripts/check_pr_approval.py
+++ b/ci_scripts/check_pr_approval.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import sys
+import json
+
+
+def check_approval(count, required_reviewers):
+    """Print "TRUE" if enough required reviewers approved, else "FALSE".
+
+    Reads a JSON document from stdin that is expected to be a list of review
+    objects, collects the ``user.id`` of every review whose ``state`` is
+    "APPROVED", and prints "TRUE" when at least ``count`` of those ids are in
+    ``required_reviewers``.
+
+    Args:
+        count (int): minimum number of required reviewers that must approve.
+        required_reviewers: iterable of reviewer ids (ints or digit strings).
+    """
+    try:
+        json_resp = json.load(sys.stdin)
+    except ValueError:
+        # Empty or malformed input (for example a failed download).
+        json_resp = None
+
+    # Anything that is not a list of reviews (such as an error object) counts
+    # as "no approvals". Crashing here would print nothing, and a caller that
+    # only looks for "FALSE" would then let the change through.
+    if not isinstance(json_resp, list):
+        json_resp = []
+
+    approved_user_ids = set()
+    for review in json_resp:
+        if not isinstance(review, dict):
+            continue
+        user = review.get("user")
+        if user and review.get("state") == "APPROVED":
+            approved_user_ids.add(user["id"])
+
+    # convert to int
+    required_reviewers_int = {int(rr) for rr in required_reviewers}
+
+    if len(approved_user_ids & required_reviewers_int) >= count:
+        print("TRUE")
+    else:
+        print("FALSE")
+
+
+if __name__ == "__main__":
+    # First argument: required approval count; the rest: reviewer ids.
+    cli_args = sys.argv[1:]
+    if cli_args and cli_args[0].isdigit():
+        check_approval(int(cli_args[0]), cli_args[1:])
+    else:
+        print(
+            "Usage: python check_pr_approval.py [count] [required reviewer id] ..."
+        )
--- a/ci_scripts/checkapproval.sh
+++ b/ci_scripts/checkapproval.sh
+#!/bin/bash
+
+# Require a TPM approval on pull requests that change files under the listed
+# API document paths.
+# Reads BRANCH, GITHUB_API_TOKEN and GIT_PR_ID from the environment.
+API_FILES=("doc/paddle/api/paddle")
+
+for API_FILE in "${API_FILES[@]}"; do
+  # Reset per entry so a result from a previous entry cannot leak into this one.
+  APPROVALS=""
+  API_CHANGE=$(git diff --name-only "upstream/${BRANCH}" | grep "${API_FILE}")
+  if [ "${API_CHANGE}" ]; then
+    # The URL is quoted so the shell never treats "?" as a glob character.
+    approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" "https://api.github.com/repos/PaddlePaddle/FluidDoc/pulls/${GIT_PR_ID}/reviews?per_page=10000")
+    if [ "${API_FILE}" == "doc/paddle/api/paddle" ]; then
+      APPROVALS=$(printf '%s\n' "${approval_line}" | python ./check_pr_approval.py 1 2870059 27208573 29231 28379894 23093488 11935832)
+      # Fail closed: anything other than an explicit TRUE (a crashed checker,
+      # an empty or error response) must block the change, not let it pass.
+      if [ "${APPROVALS}" != "TRUE" ]; then
+        echo "You must have one TPM (saxon-zh or swtkiwi or jzhang533 or Heeenrrry or dingjiaweiww or TCChenlong) approval for the api change! ${API_FILE} for the management reason of API interface and API document."
+        exit 1
+      fi
+    fi
+  fi
+done
+
--- a/ci_scripts/chinese_samplecode_processor.py
+++ b/ci_scripts/chinese_samplecode_processor.py
+import math
+import os
+import pickle
+import shutil
+import subprocess
+import multiprocessing
+import sys
+
+
+def remove_desc_code(srcls, filename):
+    """Remove hard-coded line positions from ``srcls`` for known documents.
+
+    ``filename`` is compared by exact equality against a fixed set of names.
+    For a match, the listed positions are popped one after another, so each
+    position refers to the list as it is after the previous pops. For
+    'layers_cn/sum_cn.rst' everything from position 62 onwards is dropped.
+    Any other filename leaves the list untouched.
+
+    Args:
+        srcls (list): lines of the document; modified in place.
+        filename (str): name of the document the lines came from.
+
+    Returns:
+        list: the same ``srcls`` object.
+    """
+    positions_by_file = {
+        'fluid_cn/one_hot_cn.rst': (13, 28, 44),
+        'layers_cn/one_hot_cn.rst': (15, 30, 46),
+        'profiler_cn/profiler_cn.rst': (41, ),
+        'layers_cn/natural_exp_decay_cn.rst': (13, ),
+        'layers_cn/transpose_cn.rst': (20, ),
+        'layers_cn/array_length_cn.rst': (36, ),
+        'layers_cn/inverse_time_decay_cn.rst': (13, ),
+        'layers_cn/stack_cn.rst': (12, 33),
+        'layers_cn/sums_cn.rst': (11, ),
+        'layers_cn/softmax_cn.rst': (30, 57),
+        'layers_cn/array_write_cn.rst': (37, ),
+        'layers_cn/lod_append_cn.rst': (11, ),
+        'layers_cn/reorder_lod_tensor_by_rank_cn.rst': (25, ),
+        'layers_cn/round_cn.rst': (10, ),
+        'layers_cn/squeeze_cn.rst': (11, 19, 27),
+        'layers_cn/unsqueeze_cn.rst': (11, ),
+        'layers_cn/array_read_cn.rst': (51, ),
+        'layers_cn/scatter_cn.rst': (9, ),
+        'layers_cn/topk_cn.rst': (11, ),
+        'optimizer_cn/ModelAverage_cn.rst': (15, ),
+    }
+
+    if filename == 'layers_cn/sum_cn.rst':
+        # Keep only the first 62 lines.
+        del srcls[62:]
+    else:
+        for position in positions_by_file.get(filename, ()):
+            srcls.pop(position)
+    return srcls
+
+
+def check_indent(code_line):
+    """Return the leading whitespace of ``code_line`` with tabs expanded.
+
+    Only the run of spaces and tabs at the start of the line is considered;
+    each tab in it counts as four spaces.
+    """
+    stripped = code_line.lstrip(' \t')
+    leading = code_line[:len(code_line) - len(stripped)]
+    return leading.replace('\t', '    ')
+
+
+def find_all(src_str, substr):
+    """Return the start index of every occurrence of ``substr`` in ``src_str``.
+
+    The search resumes one character after each hit, so overlapping
+    occurrences are all reported, in ascending order.
+    """
+    positions = []
+    cursor = 0
+    while True:
+        hit = src_str.find(substr, cursor)
+        if hit == -1:
+            return positions
+        positions.append(hit)
+        cursor = hit + 1
+
+
+def extract_sample_code(srcfile, status_all):
+    """Run every python sample of one document and record the results.
+
+    Each ".. code-block:: python" directive starts a sample. The sample's first
+    line is the first non-blank line after the directive; the sample continues
+    while lines start with the same indent (blank lines are kept) and stops at
+    the next code-block directive. Each sample is passed to ``run_sample_code``.
+
+    Args:
+        srcfile: open text file; its ``name``, ``read``, ``seek`` and
+            ``readlines`` are used.
+        status_all (dict): updated in place with
+            ``{srcfile.name: [status, ...]}``. A status of -1 means no sample
+            code was found; other values come from ``run_sample_code``.
+
+    Returns:
+        dict: the same ``status_all`` object.
+    """
+    filename = srcfile.name
+    srcc = srcfile.read()
+    srcfile.seek(0, 0)
+    srcls = srcfile.readlines()
+    srcls = remove_desc_code(
+        srcls, filename)  # remove description info for samplecode
+    status = []
+    if not find_all(srcc, " code-block:: python"):
+        status.append(-1)
+    else:
+        total = len(srcls)
+        for start, line in enumerate(srcls):
+            if line.find(".. code-block:: python") == -1:
+                continue
+
+            # Skip the blank lines between the directive and the code.
+            first = start + 1
+            while first < total and srcls[first].strip() == '':
+                first += 1
+            if first >= total:
+                # The directive is the last non-blank line of the file: report
+                # "no sample code" instead of failing with an IndexError.
+                status.append(-1)
+                continue
+
+            first_line = srcls[first]
+            # remove indent error
+            # NOTE(review): "from"/"import" are matched anywhere in the line,
+            # not only at its start - confirm this is intended.
+            if first_line.find("from") != -1:
+                startindent = first_line[:first_line.find("from")]
+            elif first_line.find("import") != -1:
+                startindent = first_line[:first_line.find("import")]
+            else:
+                startindent = check_indent(first_line)
+            content = first_line[len(startindent):]
+            for j in range(first + 1, total):
+                # A blank line stays part of the sample although it lacks the
+                # indent; any other line without the indent ends the sample.
+                if not srcls[j].startswith(startindent) and srcls[j] != '\n':
+                    break
+                if srcls[j].find(" code-block:: python") != -1:
+                    break
+                content += srcls[j].replace(startindent, "", 1)
+            status.append(run_sample_code(content, filename))
+
+    status_all[filename] = status
+    return status_all
+
+
+def run_sample_code(content, filename):
+    """Execute one sample code snippet in a subprocess.
+
+    The snippet is written, behind a UTF-8 coding line, to ``temp/<name>.py``,
+    where ``<name>`` is the last path component of ``filename`` without
+    "_cn" and ".rst". The ``temp`` directory must already exist. The script
+    file is removed again even when the subprocess cannot be started.
+
+    Args:
+        content (str): source code of the sample.
+        filename (str): document the sample came from; used for the script
+            name and in the error report.
+
+    Returns:
+        int: 0 when the subprocess exits with code 0, otherwise 1.
+    """
+    # three status ,-1:no sample code; 1: running error; 0:normal
+    fname = filename.split("/")[-1].replace("_cn", "").replace(".rst",
+                                                               "") + ".py"
+    script_path = "temp/" + fname
+    with open(script_path, 'w') as tempf:
+        tempf.write("# -*- coding: utf-8 -*-\n" + content)
+
+    try:
+        # Use the interpreter that runs this checker, so the sample sees the
+        # same installed packages; fall back to "python" if it is unknown.
+        cmd = [sys.executable or "python", script_path]
+        subprc = subprocess.Popen(
+            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        _, error = subprc.communicate()
+    finally:
+        os.remove(script_path)
+
+    if subprc.returncode != 0:
+        print("\nSample code error found in ", filename, ":\n")
+        # errors='replace': output that is not valid UTF-8 must not crash the
+        # checker while it is reporting a failure.
+        print(error.decode(encoding='utf-8', errors='replace'))
+        return 1
+    return 0
+
+
+def test(file):
+    """Check the sample code of one document.
+
+    Args:
+        file (str): path of the document to open and check.
+
+    Returns:
+        list: a one-element list holding the status dict filled in by
+        ``extract_sample_code``.
+    """
+    status_all = {}
+    # "with" closes the document even when extraction raises.
+    with open(file, 'r') as src:
+        extract_sample_code(src, status_all)
+    return [status_all]
+
+
+# Remove these directories if they exist (presumably left over from earlier
+# runs - TODO confirm), then recreate the empty "temp" scratch directory.
+for stale_dir in ("temp", "infer_model", "image", "my_paddle_model",
+                  "my_paddle_vars"):
+    if os.path.isdir(stale_dir):
+        shutil.rmtree(stale_dir)
+
+if not os.path.isdir("temp"):
+    os.mkdir("temp")
+
+output = []
+
+if len(sys.argv) < 2:
+    print("Error: inadequate number of arguments")
+    print("Please provide one file")
+    sys.exit(1)
+else:
+    if not os.path.exists(sys.argv[1]):
+        print("File not found")
+        sys.exit(1)
+    res = test(sys.argv[1])
+    output.append(res)
+
+# Group the checked samples by status: -1 no sample code, 0 ok, 1 run error.
+status_groups = {-1: [], 0: [], 1: []}
+# polishes show format
+ci_pass = True
+for one_file in output:
+    for dicts in one_file:
+        for key in dicts:
+            status = dicts[key]
+            if any(ele != 0 for ele in status):
+                ci_pass = False
+            if len(status) == 1:
+                status_groups[status[0]].append(key)
+            else:
+                # Several samples in one document: number them from 1.
+                for u in range(0, len(status)):
+                    status_groups[status[u]].append(key + '_' + str(u + 1))
+
+error_api = status_groups[-1] + status_groups[1]
+total_error_number = len(error_api)
+
+print("****************************************************")
+print("----------------End of the Check--------------------")
+print("****************************************************")
+if total_error_number > 0:
+    print("Error sample code number is:{}".format(total_error_number))
+    type_one_number = len(status_groups[-1])
+    type_two_number = len(status_groups[1])
+    if type_one_number > 0:
+        print("Error type one sample number is:{}".format(type_one_number))
+        print("Error raised from type one:no sample code.",
+              str(status_groups[-1]))
+    if type_two_number > 0:
+        print("Error type two sample number is:{}".format(type_two_number))
+        print("Error raised from type two:running error sample code.",
+              str(status_groups[1]))
+if not ci_pass:
+    print("Mistakes found in sample codes.")
+    # sys.exit, not the exit() helper injected by the site module, which is
+    # not guaranteed to exist (for example under "python -S").
+    sys.exit(1)
+else:
+    print("Sample code check is successful!")
--- a/ci_scripts/ci_start.sh
+++ b/ci_scripts/ci_start.sh
+#!/bin/bash
+
+# CI entry point: runs the code style check, the Chinese API sample code check
+# and the approval check in that order, stopping at the first failure.
+# The exit status of the approval check is the exit status of this script.
+export DIR_PATH=${PWD}
+
+if ! /bin/bash  ${DIR_PATH}/check_code.sh; then
+    echo "code format error"
+    exit 1
+fi
+
+if ! /bin/bash -x ${DIR_PATH}/check_api_cn.sh; then
+  exit 1
+fi
+
+/bin/bash  ${DIR_PATH}/checkapproval.sh
\ No newline at end of file
--- a/doc/fluid/advanced_guide/addon_development/contribute_code/local_dev_guide.md
+++ b/doc/fluid/advanced_guide/addon_development/contribute_code/local_dev_guide.md
@@ -9,7 +9,24 @@
 - 通过所有单元测试。
 - 请遵守[提交代码的一些约定](#提交代码的一些约定)。

-以下教程将指导您提交代码。
+
+## 使用官方开发镜像（推荐）
+
+```
+# 第一次启动（CPU开发）
+docker run -it --cpu-shares=20000 --name=username --net=host --privileged --rm -v $(pwd):/Paddle hub.baidubce.com/paddlepaddle/paddle:latest-dev /bin/bash
+# 第一次启动（GPU开发）
+nvidia-docker run -it --cpu-shares=20000 --name=username --net=host --privileged --rm -v $(pwd):/Paddle hub.baidubce.com/paddlepaddle/paddle:latest-dev /bin/bash
+# 后面几次启动
+docker exec -it username bash
+```
+
+不同开发者启动docker的命令不一样，以上只是推荐命令。如果使用自己习惯的命令，一定要加参数--privileged（GPU的CUPTI库调用需要）
+
+**推荐使用官方开发镜像 hub.baidubce.com/paddlepaddle/paddle:latest-dev 提交代码。**
+
+**以下教程将指导您提交代码。**
+
 ## [Fork](https://help.github.com/articles/fork-a-repo/)

 跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页，然后单击 `Fork` 按钮，生成自己目录下的仓库，比如 <https://github.com/USERNAME/Paddle>。
@@ -42,7 +59,7 @@ Paddle 目前使用[Git流分支模型](http://nvie.com/posts/a-successful-git-b

 Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码（C++，Python），在提交（commit）前自动检查一些基本事宜（如每个文件只有一个 EOL，Git 中不要添加大文件等）。

-`pre-commit`测试是 Travis-CI 中单元测试的一部分，不满足钩子的 PR 不能被提交到 Paddle，首先安装并在当前目录运行它：
+`pre-commit`测试是 CI 中单元测试的一部分，不满足钩子的 PR 不能被提交到 Paddle，首先安装并在当前目录运行它：

 ```bash
 ➜  pip install pre-commit
@@ -51,7 +68,7 @@ Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 G

 Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式，请确保 `clang-format` 版本在 3.8 以上。

-注：通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同的，Paddle 开发人员使用的是`pip install pre-commit`。
+注：通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同的，Paddle 开发人员使用的是`pip install pre-commit`，使用Paddle docker镜像会自带`pre-commit`不需要单独安装。

 ## 开始开发

@@ -76,9 +93,43 @@ Untracked files:
 no changes added to commit (use "git add" and/or "git commit -a")
 ```

-## 编译和单元测试
+## 编译
+
+创建并进入/Paddle/build路径下：
+
+    mkdir -p /Paddle/build && cd /Paddle/build
+
+执行cmake：
+
+
+    * 对于需要编译**CPU版本PaddlePaddle**的用户：
+
+    For Python2: cmake .. -DWITH_GPU=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
+    For Python3: cmake .. -DPY_VERSION=3.5 -DWITH_GPU=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
+
+    * 对于需要编译**GPU版本PaddlePaddle**的用户：
+
+    For Python2: cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
+    For Python3: cmake .. -DPY_VERSION=3.5 -DWITH_GPU=ON -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
+
+执行编译：
+
+    make -j$(nproc)
+
+    如：make -j16，使用16核编译
+
+安装编译好的whl包：首先进入/Paddle/build/python/dist目录下找到生成的.whl包，然后在当前机器或目标机器上安装编译好的.whl包：
+
+    For Python2: pip install -U（whl包的名字）
+    For Python3: pip3.5 install -U（whl包的名字）

 关于编译 PaddlePaddle 的源码，请参见[从源码编译](../../../install/compile/fromsource.html) 选择对应的操作系统。
+
+## 单元测试
+
+    单测运行（重复运行多次，避免随机失败）如重复运行100次的命令如下:
+    ctest --repeat-until-fail 100 -R test_xx
+
 关于单元测试，可参考[Op单元测试](../new_op/new_op.html#id7) 的运行方法。

 ## 提交（commit）
@@ -115,15 +166,6 @@ clang-formater.......................................(no files to check)Skipped
 create mode 100644 233
 ```

-<b> <font color="red">需要注意的是：您需要在commit中添加说明（commit message）以触发CI单测，写法如下：</font> </b>
-
-```bash
-# 触发develop分支的CI单测
-➜  git commit -m "test=develop"
-
-# 触发release/1.1分支的CI单侧
-➜  git commit -m "test=release/1.1"
-```

 ## 保持本地仓库最新


--- a/doc/fluid/advanced_guide/addon_development/contribute_code/local_dev_guide_en.md
+++ b/doc/fluid/advanced_guide/addon_development/contribute_code/local_dev_guide_en.md
@@ -9,7 +9,22 @@ You will learn how to develop programs in local environment under the guidelines
 - Pass through all unit tests.
 - Please follow [regulations of submitting codes](#regulations of submitting codes).

-The following guidiance tells you how to submit code.
+## Use official development images(recommended)
+
+```
+# First start（CPU development）
+docker run -it --cpu-shares=20000 --name=username --net=host --privileged --rm -v $(pwd):/Paddle hub.baidubce.com/paddlepaddle/paddle:latest-dev /bin/bash
+# First start（GPU development）
+nvidia-docker run -it --cpu-shares=20000 --name=username --net=host --privileged --rm -v $(pwd):/Paddle hub.baidubce.com/paddlepaddle/paddle:latest-dev /bin/bash
+# Next start
+docker exec -it username bash
+```
+Different developers start docker with different commands; the above are only recommended commands. If you use your own command, you must add the parameter --privileged (required by GPU CUPTI library calls).
+
+**It is recommended to use the official development image hub.baidubce.com/paddlepaddle/paddle:latest-dev to submit the code.**
+
+**The following guidance tells you how to submit code.**
+
 ## [Fork](https://help.github.com/articles/fork-a-repo/)

 Transfer to the home page of Github [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) ,and then click button `Fork`  to generate the git under your own file directory,such as <https://github.com/USERNAME/Paddle>。
@@ -44,7 +59,7 @@ It is worth noting that before the checkout, you need to keep the current branch

 Paddle developers use the [pre-commit](http://pre-commit.com/) tool to manage Git pre-commit hooks. It helps us format the source code (C++, Python) and automatically check some basic things before committing (such as having only one EOL per file, not adding large files in Git, etc.).

-The `pre-commit` test is part of the unit test in Travis-CI. A PR that does not satisfy the hook cannot be submitted to Paddle. Install `pre-commit` first and then run it in current directory：
+The `pre-commit` test is part of the unit test in CI. A PR that does not satisfy the hook cannot be submitted to Paddle. Install `pre-commit` first and then run it in current directory：


 ```bash
@@ -54,7 +69,7 @@ The `pre-commit` test is part of the unit test in Travis-CI. A PR that does not

 Paddle modify the format of C/C++ source code with `clang-format` .Make sure the version of `clang-format` is above 3.8.

-Note：There are differences between the installation of `yapf` with `pip install pre-commit` and that with `conda install -c conda-forge pre-commit` . Paddle developers use `pip install pre-commit` 。
+Note：There are differences between the installation of `yapf` with `pip install pre-commit` and that with `conda install -c conda-forge pre-commit` . Paddle developers use `pip install pre-commit`. The Paddle docker image already ships with `pre-commit`, so no separate installation is needed there.

 ## Start development

@@ -76,7 +91,45 @@ Untracked files:
 no changes added to commit (use "git add" and/or "git commit -a")
 ```

-## Build and test
+## Build
+
+Create and enter the /Paddle/build path
+
+    mkdir -p /Paddle/build && cd /Paddle/build
+
+Execute cmake:
+
+
+    * For users who need to compile the **CPU version PaddlePaddle**:
+
+    For Python2: cmake .. -DWITH_GPU=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
+    For Python3: cmake .. -DPY_VERSION=3.5 -DWITH_GPU=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
+
+
+    * For users who need to compile the **GPU version PaddlePaddle**:
+
+    For Python2: cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
+    For Python3: cmake .. -DPY_VERSION=3.5 -DWITH_GPU=ON -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release
+
+
+Execute compilation:
+
+    make -j$(nproc)
+
+    Such as: make -j16, which compiles using 16 cores
+
+After compiling successfully, go to the `/Paddle/build/python/dist` directory and find the generated `.whl` package. Install the compiled `.whl` package on the current machine or target machine:
+
+    For Python2: pip install -U（whl package name）
+    For Python3: pip3.5 install -U（whl package name）
+
+Please refer to [Compile From Source Code](../../../install/compile/fromsource_en.html) about more information of building PaddlePaddle source codes.
+
+## Test
+
+    Run Test (Run 100 times)
+    ctest --repeat-until-fail 100 -R test_xx
+

 Please refer to [Compile From Source Code](../../../install/compile/fromsource_en.html) about more information of building PaddlePaddle source codes.
 Please refer to [Op Unit Tests](../new_op/new_op_en.html#unit-tests) about more information of running unit tests.
@@ -113,14 +166,6 @@ clang-formater.......................................(no files to check)Skipped
 create mode 100644 233
 ```

-<b> <font color="red">Attention needs to be paid：you need to add commit message to trigger CI test.The command is as follows:</font> </b>
-
-```bash
-# Touch CI single test of develop branch
-➜  git commit -m "test=develop"
-# Touch CI single test of release/1.1 branch
-➜  git commit -m "test=release/1.1"
-```

 ## Keep the latest local repository


--- a/doc/fluid/advanced_guide/addon_development/contribute_code/submit_pr_guide.md
+++ b/doc/fluid/advanced_guide/addon_development/contribute_code/submit_pr_guide.md
@@ -26,7 +26,7 @@

 <div align="center">

-<img src="https://github.com/PaddlePaddle/FluidDoc/blob/release/1.1/doc/fluid/advanced_usage/development/contribute_to_paddle/img/cla_unsigned.png?raw=true"  height="330" width="400">
+<img src="https://github.com/PaddlePaddle/FluidDoc/blob/release/1.1/doc/fluid/advanced_usage/development/contribute_to_paddle/img/cla_unsigned.png?raw=true"  height="40" width="500">

 </div>


--- a/doc/fluid/advanced_guide/addon_development/contribute_code/submit_pr_guide_en.md
+++ b/doc/fluid/advanced_guide/addon_development/contribute_code/submit_pr_guide_en.md
@@ -26,7 +26,7 @@ For the first time to submit Pull Request,you need to sign CLA(Contributor Licen

 <div align="center">

-<img src="https://github.com/PaddlePaddle/FluidDoc/blob/release/1.1/doc/fluid/advanced_usage/development/contribute_to_paddle/img/cla_unsigned.png?raw=true"  height="330" width="400">
+<img src="https://github.com/PaddlePaddle/FluidDoc/blob/release/1.1/doc/fluid/advanced_usage/development/contribute_to_paddle/img/cla_unsigned.png?raw=true"  height="40" width="500">

 </div>


--- a/doc/fluid/advanced_guide/addon_development/design_idea/fluid_design_idea.md
+++ b/doc/fluid/advanced_guide/addon_development/design_idea/fluid_design_idea.md
@@ -56,12 +56,20 @@ blocks中包含：
 block的概念与通用程序一致，例如在下列这段C++代码中包含三个block：

 ``` cpp
-int main(){ //block 0
-	int i = 0;
-	if (i<10){ //block 1
-		for (int j=0;j<10;j++){ //block 2
-		}
+#include <iostream>
+
+int main() {
+	int x = 5; // block 0
+	int y = 4; // block 0
+	int out;   // block 0
+	
+	if (x < y) { // block 0
+	    out = 1; // block 1
+	} else {
+	    out = 0; // block 2
 	}
+	
+	std::cout << out << std::endl;
 	return 0;
 }
 ```
@@ -69,27 +77,20 @@ int main(){ //block 0
 类似的，在下列 Paddle 的 Program 包含3段block：

 ```python
-import paddle.fluid as fluid  # block 0
-
-limit = fluid.layers.fill_constant_batch_size_like(
-    input=label, dtype='int64', shape=[1], value=5.0)
-cond = fluid.layers.less_than(x=label, y=limit)
-
-ie = fluid.layers.IfElse(cond)
-with ie.true_block(): # block 1
-    true_image = ie.input(image)
-    hidden = fluid.layers.fc(input=true_image, size=100, act='tanh')
-    prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
-    ie.output(prob)
-
-with ie.false_block(): # block 2
-    false_image = ie.input(image)
-    hidden = fluid.layers.fc(
-        input=false_image, size=200, act='tanh')
-    prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
-    ie.output(prob)
-
-prob = ie()
+import paddle.fluid as fluid
+
+x = fluid.data(name='x', shape=[1], dtype='int64') # block 0
+y = fluid.data(name='y', shape=[1], dtype='int64') # block 0
+
+def true_block():
+    return fluid.layers.fill_constant(dtype='int64', value=1, shape=[1]) # block 1
+    
+def false_block():
+    return fluid.layers.fill_constant(dtype='int64', value=0, shape=[1]) # block 2
+
+condition = fluid.layers.less_than(x, y) # block 0
+
+out = fluid.layers.cond(condition, true_block, false_block) # block 0
 ```
 ### BlockDesc and ProgramDesc


--- a/doc/fluid/advanced_guide/addon_development/design_idea/fluid_design_idea_en.md
+++ b/doc/fluid/advanced_guide/addon_development/design_idea/fluid_design_idea_en.md
@@ -59,12 +59,20 @@ The blocks contain:
 The concept of block is the same with that in generic programs. For example, there are three blocks in the following C++ code:

 ``` cpp
-int main(){ //block 0
-    int i = 0;
-    if (i<10){ //block 1
-        for (int j=0;j<10;j++){ //block 2
-        }
+#include <iostream>
+
+int main() {
+	int x = 5; // block 0
+	int y = 4; // block 0
+	int out;   // block 0
+	
+	if (x < y) { // block 0
+	    out = 1; // block 1
+	} else {
+	    out = 0; // block 2
 	}
+	
+	std::cout << out << std::endl;
 	return 0;
 }
 ```
@@ -72,27 +80,20 @@ int main(){ //block 0
 Similarly, the following Program contains 3 blocks:

 ```python
-import paddle.fluid as fluid # block 0
-
-limit = fluid.layers.fill_constant_batch_size_like(
-    Input=label, dtype='int64', shape=[1], value=5.0)
-cond = fluid.layers.less_than(x=label, y=limit)
-
-ie = fluid.layers.IfElse(cond)
-with ie.true_block(): # block 1
-    true_image = ie.input(image)
-    hidden = fluid.layers.fc(input=true_image, size=100, act='tanh')
-    prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
-    ie.output(prob)
-
-with ie.false_block(): # block 2
-    false_image = ie.input(image)
-    hidden = fluid.layers.fc(
-        input=false_image, size=200, act='tanh')
-    prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
-    ie.output(prob)
-
-prob = ie()
+import paddle.fluid as fluid
+
+x = fluid.data(name='x', shape=[1], dtype='int64') # block 0
+y = fluid.data(name='y', shape=[1], dtype='int64') # block 0
+
+def true_block():
+    return fluid.layers.fill_constant(dtype='int64', value=1, shape=[1]) # block 1
+    
+def false_block():
+    return fluid.layers.fill_constant(dtype='int64', value=0, shape=[1]) # block 2
+
+condition = fluid.layers.less_than(x, y) # block 0
+
+out = fluid.layers.cond(condition, true_block, false_block) # block 0
 ```
 ### BlockDesc and ProgramDesc

@@ -229,8 +230,8 @@ import numpy
 train_data=numpy.array([[1.0],[2.0],[3.0],[4.0]]).astype('float32')
 y_true = numpy.array([[2.0],[4.0],[6.0],[8.0]]).astype('float32')
 # Define the network
-x = fluid.layers.data(name="x",shape=[1],dtype='float32')
-y = fluid.layers.data(name="y",shape=[1],dtype='float32')
+x = fluid.data(name="x",shape=[None, 1],dtype='float32')
+y = fluid.data(name="y",shape=[None, 1],dtype='float32')
 y_predict = fluid.layers.fc(input=x,size=1,act=None)
 #definition loss function
 cost = fluid.layers.square_error_cost(input=y_predict,label=y)
@@ -299,7 +300,7 @@ As you can see from the output, the entire definition process is transformed int

 BlockDesc contains defined vars and a series of ops. Take input x as an example. In python code, x is 1D data of data type "float 32":
 ```python
-x = fluid.layers.data(name="x",shape=[1],dtype='float32')
+x = fluid.data(name="x",shape=[None, 1],dtype='float32')
 ```
 In BlockDesc, the variable x is described as:
 ```

--- a/doc/fluid/advanced_guide/addon_development/new_op/custom_op.md
+++ b/doc/fluid/advanced_guide/addon_development/new_op/custom_op.md
@@ -77,14 +77,12 @@ class Relu2GradMaker : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

-  std::unique_ptr<T> Apply() const override {
-    auto* op = new T();
+  void Apply(GradOpPtr<T> op) const override {
    op->SetType("relu2_grad");
    op->SetInput("Y", this->Output("Y"));
    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
    op->SetAttrMap(this->Attrs());
    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-    return std::unique_ptr<T>(op);
  }
 };

@@ -142,7 +140,7 @@ REGISTER_OP_CPU_KERNEL(relu2_grad,



-ReLU OP的GPU实现， ``relu_op.cc`` 文件:
+ReLU OP的GPU实现， ``relu_op.cu`` 文件:

 ```
 // relu_op.cu
@@ -272,8 +270,8 @@ g++ relu_op.cc relu_op.cu.o -o relu2_op.so -shared -fPIC -std=c++11 -O3 -DPADDLE

 注意点:

-1. NVCC编译GPU OP的cu文件时，需要加 `-DPADDLE_WITH_CUDA -DEIGEN_USE_GPU -DPADDLE_USE_DSO` 。
-2. 如果安装的PaddlePaddle不包含MKLDNN，则需要去掉编译选项`-DPADDLE_WITH_MKLDNN`。默认的安装包已包含MKLDNN。
+1. 通过NVCC编译CUDA源文件时，需要加编译选项 `-DPADDLE_WITH_CUDA -DEIGEN_USE_GPU -DPADDLE_USE_DSO`，在框架源码中会使用这些宏定义进行条件编译。用户自定义的C++ OP实现编译时，选项的开启状态需要和核心框架编译行为一致。如`EIGEN_USE_GPU`是使用Eigen数学库的GPU实现时需要增加的编译选项。
+2. 如果飞桨安装包中不包含MKLDNN库，则需要去掉编译选项`-DPADDLE_WITH_MKLDNN`。核心框架源码中(比如tensor.h)有使用此宏定义进行条件编译，该选项是否打开同样需要和核心框架编译行为保持一致。默认的飞桨安装包中含有MKLDNN库。
 3. 可多个OP编译到同一个动态库中。
 4. 通过pip方式安装的PaddlePaddle由GCC 4.8编译得到，由于GCC 4.8和GCC 5以上**C++11 ABI不兼容**，您编写的自定义OP，需要通过GCC 4.8编译。若是GCC 5及以上的环境上使用自定义OP，推荐使用[Docker安装PaddlePaddle](https://www.paddlepaddle.org.cn/install/doc/docker)，使得编译Paddle和编译自定义OP的GCC版本相同。

@@ -333,6 +331,11 @@ np.allclose(out, np.maximum(x,0.))

 ## FAQ

-1. Q:如果出现类似错误: cannot open shared object file: No such file or directory.
+1. Q: 如果出现类似错误: `relu2_op.so: cannot open shared object file: No such file or directory` 以及 `libpaddle_framework.so: cannot open shared object file: No such file or directory`。

-   A:  需要设置动态库的路径到环境变量LD_LIBRARY_PATH中。
+   A: 需要将`relu2_op.so`所在路径以及`libpaddle_framework.so`路径(即`paddle.sysconfig.get_lib()`得到路径)设置到环境变量LD_LIBRARY_PATH中:
+
+     ``` 
+      # 假如relu2_op.so路径是：`paddle/test`，对于Linux环境设置:
+      export LD_LIBRARY_PATH=paddle/test:$( python -c 'import paddle; print(paddle.sysconfig.get_lib())'):$LD_LIBRARY_PATH
+     ```
--- a/doc/fluid/advanced_guide/addon_development/new_op/new_op.md
+++ b/doc/fluid/advanced_guide/addon_development/new_op/new_op.md
--- a/doc/fluid/advanced_guide/addon_development/new_op/op_notes.md
+++ b/doc/fluid/advanced_guide/addon_development/new_op/op_notes.md
@@ -157,13 +157,31 @@ ShareDataWith的功能是使两个Tensor共享底层buffer，在调用这个操
 目前稀疏梯度在做更新的时候会先对梯度做merge，即对相同参数的梯度做累加，然后做参数以及附加参数（如velocity）的更新。

 ### 8.显存优化
+
+#### 8.1 为可原位计算的Op注册Inplace
+有些Op的计算逻辑中，输出可以复用输入的显存空间，也可称为原位计算。例如[`reshape_op`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/reshape_op.cc)中，输出`Out`可以复用输入`X`的显存空间，因为该Op的计算逻辑不会改变`X`的实际数据，只是修改它的shape，输出和输入复用同一块显存空间不影响结果。对于这类OP，可以注册`Inplace`，从而让框架在运行时自动地进行显存优化。
+
+fluid提供了`DECLARE_INPLACE_OP_INFERER`宏用于注册`Inplace`，该宏第一个参数是一个类名，如`ReshapeOpInplaceInToOut`；第二个参数是一对复用的输入输出，以`{"X", "Out"}`的形式给出。在`REGISTER_OPERATOR`时，
+可以将类名传入，从而为该Op注册`Inplace`。
+
+```
+DECLARE_INPLACE_OP_INFERER(ReshapeOpInplaceInToOut, {"X", "Out"});
+
+REGISTER_OPERATOR(
+    reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
+    paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
+    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
+    ops::ReshapeOpInplaceInToOut);
+```
+
+#### 8.2 减少OP中的无关变量
 通常反向Op会依赖于前向Op的某些输入(Input)、输出(Output)，以供反向Op计算使用。但有些情况下，反向Op不需要前向Op的所有输入和输出；有些情况下，反向Op只需要前向Op的部分输入和输出；有些情况下，反向Op只需要使用前向Op中输入和输出变量的Shape和LoD信息。若Op开发者在注册反向Op时，将不必要的前向Op输入和输出作为反向Op的输入，会导致这部分显存无法被框架现有的显存优化策略优化，从而导致模型显存占用过高。

 所以在写注册反向Op时需要注意以下几点：

- Fluid提供的`DefaultGradOpDescMaker`，默认会将前向op的所有输入(`Input`）、输出(`Output`)以及输出变量所对应的梯度(`Output@Grad`)作为反向Op的输入，将前向Op输入所对应的梯度(`Input@Grad`)作为反向Op的输出。所以在使用`DefaultGradOpDescMaker`时需要考虑是否有些变量在计算中不被用到。
- 如果`DefaultGradOpDescMaker`不能够满足需求，需要用户自己手动构建`GradOpDescMaker`，具体实现请参考[相关文档](new_op.html#permalink-4--gradprotomaker-);
- 如果有些反向Op需要依赖前向Op的输入或输出变量的的Shape或LoD，但不依赖于变量中Tensor的Buffer，且不能根据其他变量推断出该Shape和LoD，需要对该变量（以下称该变量为`X`）在反向Op中进行注册`NoNeedBufferVarsInference`。**一旦注册了`NoNeedBufferVarsIference`，反向op中就不能读写该变量对应的Tensor中的buffer，只能调用Tensor的dims()和lod()方法，同时，反向Op中的`GetExpectedKernelType()`必须要重写，并且`GetExpectedKernelType()`中不能访问`X`变量中Tensor的type()方法**。比如在`SliceOpGrad`中只会用到`Input`中变量的Shape信息，所以需要为对`Input`在`SliceOpGrad`上进行注册：
+- Fluid提供的`DefaultGradOpMaker`，默认会将前向op的所有输入(`Input`）、输出(`Output`)以及输出变量所对应的梯度(`Output@Grad`)作为反向Op的输入，将前向Op输入所对应的梯度(`Input@Grad`)作为反向Op的输出。所以在使用`DefaultGradOpMaker`时需要考虑是否有些变量在计算中不被用到。
+- 如果`DefaultGradOpMaker`不能够满足需求，需要用户自己手动构建`GradOpMaker`，具体实现请参考[相关文档](new_op.html#gradopmaker);
+- 如果有些反向Op需要依赖前向Op的输入或输出变量的Shape或LoD，但不依赖于变量中Tensor的Buffer，且不能根据其他变量推断出该Shape和LoD，则可以通过`DECLARE_NO_NEED_BUFFER_VARS_INFERER`接口对该变量（以下称该变量为`X`）在反向Op中进行注册`NoNeedBufferVars`。**一旦注册了`NoNeedBufferVars`，反向op中就不能读写该变量对应的Tensor中的buffer，只能调用Tensor的dims()和lod()方法，同时，反向Op中的`GetExpectedKernelType()`必须要重写，并且`GetExpectedKernelType()`中不能访问`X`变量中Tensor的type()方法**。比如在`SliceOpGrad`中只会用到`Input`中变量的Shape信息，所以需要对`Input`在`SliceOpGrad`上进行注册：
 ```
 namespace paddle {
 namespace operators {
@@ -185,30 +203,44 @@ class SliceOpGrad : public framework::OperatorWithKernel {
 };


-class SliceOpGradMaker : public framework::SingleGradOpDescMaker {
+template <typename T>
+class SliceOpGradMaker : public framework::SingleGradOpMaker<T> {
 public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

 protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto* bind = new framework::OpDesc();
-    bind->SetInput("Input", Input("Input"));
-    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
-    bind->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
-    bind->SetAttrMap(Attrs());
+  void Apply(GradOpPtr<T> bind) const override {
+    bind->SetInput("Input", this->Input("Input"));
+    if (this->HasInput("StartsTensor")) {
+      bind->SetInput("StartsTensor", this->Input("StartsTensor"));
+    }
+    if (this->HasInput("EndsTensor")) {
+      bind->SetInput("EndsTensor", this->Input("EndsTensor"));
+    }
+    if (this->HasInput("StartsTensorList")) {
+      bind->SetInput("StartsTensorList", this->Input("StartsTensorList"));
+    }
+    if (this->HasInput("EndsTensorList")) {
+      bind->SetInput("EndsTensorList", this->Input("EndsTensorList"));
+    }
+    bind->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    bind->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input"));
+    bind->SetAttrMap(this->Attrs());
    bind->SetType("slice_grad");
-    return std::unique_ptr<framework::OpDesc>(bind);
  }
 };

-DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SliceOpGradNoNeedBufferVarsInference,
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(SliceOpGradNoNeedBufferVarsInference,
                                    "Input");
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker,
-                  ops::SliceOpGradMaker);
+                  ops::SliceOpGradMaker<paddle::framework::OpDesc>,
+                  ops::SliceOpGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad,
+                  ops::SliceDoubleOpGradMaker<paddle::framework::OpDesc>,
+                  ops::SliceDoubleOpGradMaker<paddle::imperative::OpBase>,
                  ops::SliceOpGradNoNeedBufferVarsInference);
 ```


--- a/doc/fluid/advanced_guide/data_preparing/feeding_data.rst
+++ b/doc/fluid/advanced_guide/data_preparing/feeding_data.rst
@@ -4,7 +4,7 @@
 同步数据读取
 ##############

-PaddlePaddle Fluid支持使用 :code:`fluid.layers.data()` 配置数据层；
+PaddlePaddle Fluid支持使用 :code:`fluid.data()` 配置数据层；
 再使用 Numpy Array 或者直接使用Python创建C++的
 :code:`fluid.LoDTensor` , 通过 :code:`Executor.run(feed=...)` 传给
 :code:`fluid.Executor` 或 :code:`fluid.ParallelExecutor` 。
@@ -12,29 +12,25 @@ PaddlePaddle Fluid支持使用 :code:`fluid.layers.data()` 配置数据层；
 数据层配置
 ##########

-通过 :code:`fluid.layers.data()` 可以配置神经网络中需要的数据层。具体方法为:
+通过 :code:`fluid.data()` 可以配置神经网络中需要的数据层。具体方法为:

 .. code-block:: python

   import paddle.fluid as fluid

-   image = fluid.layers.data(name="image", shape=[3, 224, 224])
-   label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+   image = fluid.data(name="image", shape=[None, 3, 224, 224])
+   label = fluid.data(name="label", shape=[None, 1], dtype="int64")

   # use image/label as layer input
   prediction = fluid.layers.fc(input=image, size=1000, act="softmax")
   loss = fluid.layers.cross_entropy(input=prediction, label=label)
   ...

-上段代码中，:code:`image` 和 :code:`label` 是通过 :code:`fluid.layers.data`
-创建的两个输入数据层。其中 :code:`image` 是 :code:`[3, 224, 224]` 维度的浮点数据;
-:code:`label` 是 :code:`[1]` 维度的整数数据。这里需要注意的是:
+上段代码中，:code:`image` 和 :code:`label` 是通过 :code:`fluid.data`
+创建的两个输入数据层。其中 :code:`image` 是 :code:`[None, 3, 224, 224]` 维度的浮点数据;
+:code:`label` 是 :code:`[None, 1]` 维度的整数数据。这里需要注意的是:

-1. Fluid中默认使用 :code:`-1` 表示 batch size 维度，默认情况下会在 :code:`shape`
-   的第一个维度添加 :code:`-1` 。 所以 上段代码中， 我们可以接受将一个
-   :code:`[32, 3, 224, 224]` 的numpy array传给 :code:`image` 。 如果想自定义batch size
-   维度的位置的话，请设置 :code:`fluid.layers.data(append_batch_size=False)` 。
-   请参考进阶使用中的 :ref:`user_guide_customize_batch_size_rank` 。
+1. Executor在执行的时候，会检查定义的数据层数据和feed的数据的 :code:`shape` 和 :code:`dtype` 是否一致，如果不一致，程序会报错退出。对于一些任务，在不同的轮数，数据的某些维度会变化，可以将维度的值设置为None，例如第0维会变化，可以将 :code:`shape` 设置为 :code:`[None, 3, 224, 224]` 。


 2. Fluid中用来做类别标签的数据类型是 :code:`int64`，并且标签从0开始。可用数据类型请参考 :ref:`user_guide_paddle_support_data_types`。
@@ -69,17 +65,17 @@ PaddlePaddle Fluid支持使用 :code:`fluid.layers.data()` 配置数据层；
 序列数据是PaddlePaddle Fluid支持的特殊数据类型，可以使用 :code:`LoDTensor` 作为
 输入数据类型。它需要用户: 1. 传入一个mini-batch需要被训练的所有数据;
 2.每个序列的长度信息。
-用户可以使用 :code:`fluid.create_lod_tensor` 来创建 :code:`LoDTensor`。
+用户可以使用 :code:`fluid.create_lod_tensor` 来创建 :code:`LoDTensor` 。

-传入序列信息的时候，需要设置序列嵌套深度，:code:`lod_level`。
-例如训练数据是词汇组成的句子，:code:`lod_level=1`；训练数据是 词汇先组成了句子，
-句子再组成了段落，那么 :code:`lod_level=2`。
+传入序列信息的时候，需要设置序列嵌套深度，:code:`lod_level` 。
+例如训练数据是词汇组成的句子，:code:`lod_level=1` ；训练数据是 词汇先组成了句子，
+句子再组成了段落，那么 :code:`lod_level=2` 。

 例如:

 .. code-block:: python

-   sentence = fluid.layers.data(name="sentence", dtype="int64", shape=[1], lod_level=1)
+   sentence = fluid.data(name="sentence", dtype="int64", shape=[None, 1], lod_level=1)

   ...

@@ -91,8 +87,8 @@ PaddlePaddle Fluid支持使用 :code:`fluid.layers.data()` 配置数据层；
     )
   })

-训练数据 :code:`sentence` 包含三个样本，他们的长度分别是 :code:`4, 1, 2`。
-他们分别是 :code:`data[0:4]`， :code:`data[4:5]` 和 :code:`data[5:7]`。
+训练数据 :code:`sentence` 包含三个样本，他们的长度分别是 :code:`4, 1, 2` 。
+他们分别是 :code:`data[0:4]`， :code:`data[4:5]` 和 :code:`data[5:7]` 。

 如何分别设置ParallelExecutor中每个设备的训练数据
 ------------------------------------------------
@@ -123,36 +119,6 @@ PaddlePaddle Fluid支持使用 :code:`fluid.layers.data()` 配置数据层；
 上述代码中，GPU0会训练 32 个样本，而 GPU1训练 16 个样本。


-.. _user_guide_customize_batch_size_rank:
-
-自定义BatchSize维度
-------------------
-
-PaddlePaddle Fluid默认batch size是数据的第一维度，以 :code:`-1` 表示。但是在高级
-使用中，batch_size 可以固定，也可以是其他维度或者多个维度来表示。这都需要设置
-:code:`fluid.layers.data(append_batch_size=False)` 来完成。
-
-1. 固定batch size维度
-
-  .. code-block:: python
-
-     image = fluid.layers.data(name="image", shape=[32, 784], append_batch_size=False)
-
-  这里，:code:`image` 永远是一个 :code:`[32, 784]` 大小的矩阵。
-
-2. 使用其他维度表示batch size
-
-  .. code-block:: python
-
-     sentence = fluid.layers.data(name="sentence",
-                                  shape=[80, -1, 1],
-                                  append_batch_size=False,
-                                  dtype="int64")
-
-  这里 :code:`sentence` 的中间维度是batch size。这种数据排布会用在定长的循环神经
-  网络中。
-
-
 .. _user_guide_paddle_support_data_types:

 Fluid目前支持的数据类型

--- a/doc/fluid/advanced_guide/data_preparing/feeding_data_en.rst
+++ b/doc/fluid/advanced_guide/data_preparing/feeding_data_en.rst
@@ -4,7 +4,7 @@
 Take Numpy Array as Training Data
 #################################

-PaddlePaddle Fluid supports configuring data layer with :code:`fluid.layers.data()` .
+PaddlePaddle Fluid supports configuring data layer with :code:`fluid.data()` .
 Then you can use Numpy Array or directly use Python to create C++
 :code:`fluid.LoDTensor` , and then feed it to :code:`fluid.Executor` or :code:`fluid.ParallelExecutor` 
 through :code:`Executor.run(feed=...)` .
@@ -12,23 +12,23 @@ through :code:`Executor.run(feed=...)` .
 Configure Data Layer
 ############################

-With :code:`fluid.layers.data()` , you can configure data layer in neural network. Details are as follows:
+With :code:`fluid.data()` , you can configure data layer in neural network. Details are as follows:

 .. code-block:: python

   import paddle.fluid as fluid

-   image = fluid.layers.data(name="image", shape=[3, 224, 224])
-   label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+   image = fluid.data(name="image", shape=[None, 3, 224, 224])
+   label = fluid.data(name="label", shape=[None, 1], dtype="int64")

   # use image/label as layer input
   prediction = fluid.layers.fc(input=image, size=1000, act="softmax")
   loss = fluid.layers.cross_entropy(input=prediction, label=label)
   ...

-In the code above, :code:`image` and :code:`label` are two input data layers created by :code:`fluid.layers.data` . :code:`image` is float data of shape :code:`[3, 224, 224]` ; :code:`label` is the int data of shape :code:`[1]` . Note that:
+In the code above, :code:`image` and :code:`label` are two input data layers created by :code:`fluid.data` . :code:`image` is float data of shape :code:`[None, 3, 224, 224]` ; :code:`label` is the int data of shape :code:`[None, 1]` . Note that:

-1. :code:`-1` is represented for the dimension of batch size by default in Fluid. And :code:`-1` is added to the first dimension of :code:`shape` by default. Therefore in the code above, it would be alright to transfer numpy array of :code:`[32, 3, 224, 224]` to :code:`image` . If you want to customize the position of the batch size dimension, please set :code:`fluid.layers.data(append_batch_size=False)` .Please refer to the tutorial in the advanced user guide: :ref:`user_guide_customize_batch_size_rank_en` .
+1. When the program is executing, the executor checks whether the :code:`shape` and :code:`dtype` of the fed data are consistent with those defined in the data layer. If they are not consistent, the program will exit with an error. In some tasks, a dimension may change across training steps. In this case, the value of that dimension can be set to None. For example, the :code:`shape` can be set to :code:`[None, 3, 224, 224]` when the 0th dimension varies.

 2. Data type of category labels in Fluid is :code:`int64` and the label starts from 0. For the supported data types, please refer to :ref:`user_guide_paddle_support_data_types_en` .

@@ -76,7 +76,7 @@ For example:

 .. code-block:: python

-   sentence = fluid.layers.data(name="sentence", dtype="int64", shape=[1], lod_level=1)
+   sentence = fluid.data(name="sentence", dtype="int64", shape=[None, 1], lod_level=1)

   ...

@@ -122,32 +122,6 @@ For example:

 In the code above, GPU0 will train 32 samples and GPU1 will train 16 samples.

-.. _user_guide_customize_batch_size_rank_en:
-
-Customize the BatchSize dimension
------------------------------------
-
-Batch size is the first dimension of data by default in PaddlePaddle Fluid, indicated by :code:`-1` .But in advanced usage, batch_size could be fixed or respresented by other dimension or multiple dimensions, which could be implemented by setting :code:`fluid.layers.data(append_batch_size=False)` .
-
-1. fixed BatchSize dimension
-
-  .. code-block:: python
-
-     image = fluid.layers.data(name="image", shape=[32, 784], append_batch_size=False)
-
-  Here :code:`image` is always a matrix with size of :code:`[32, 784]` .
-
-2. batch size expressed by other dimension
-
-  .. code-block:: python
-
-     sentence = fluid.layers.data(name="sentence",
-                                  shape=[80, -1, 1],
-                                  append_batch_size=False,
-                                  dtype="int64")
-
-  Here the middle dimension of :code:`sentence` is batch size. This type of data layout is applied in fixed-length recurrent neural networks.
-
 .. _user_guide_paddle_support_data_types_en:

 Data types supported by Fluid

--- a/doc/fluid/advanced_guide/data_preparing/reader.md
+++ b/doc/fluid/advanced_guide/data_preparing/reader.md
@@ -193,14 +193,3 @@ def image_reader_creator(image_path, label_path, n):
 reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024)
 paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
 ```
-
-### How is `paddle.train` implemented
-
-An example implementation of paddle.train is:
-
-```python
-def train(batch_reader, mapping, batch_size, total_pass):
-    for pass_idx in range(total_pass):
-        for mini_batch in batch_reader(): # this loop will never end in online learning.
-            do_forward_backward(mini_batch, mapping)
-```
--- a/doc/fluid/advanced_guide/distributed_training/cluster_quick_start.rst
+++ b/doc/fluid/advanced_guide/distributed_training/cluster_quick_start.rst
@@ -14,7 +14,7 @@


 * 
-  [x] 成功安装Paddle Fluid，如果尚未安装，请参考 `快速开始 <https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/quick_start_cn.html>`_
+  [x] 成功安装Paddle Fluid，如果尚未安装，请参考 `快速开始 <https://www.paddlepaddle.org.cn/documentation/docs/zh/1.7/beginners_guide/quick_start_cn.html>`_

 * 
  [x] 学会最基本的单机训练方法，请参考 `单机训练 <https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/single_node.html>`_ 中描述的单卡训练，进行学习
@@ -113,7 +113,7 @@
       main_function(args.is_local)


-* 说明：示例中使用的IO方法是dataset，想了解具体的文档和用法请参考 `Dataset API <hhttps://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/api_cn/dataset_cn.html>`_ 。示例中使用的 ``train_from_dataset`` 接口，想了解具体的文档和使用方法请参考 `Executor API <https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/api_cn/executor_cn.html>`_ 。示例中的 ``from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet`` 表示引入参数服务器架构进行分布式训练，如果想更进一步了解Fleet API的更多选项和示例，请参考 `Fleet API <https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/fleet_api_howto_cn.html>`_
+* 说明：示例中使用的IO方法是dataset，想了解具体的文档和用法请参考 `Dataset API <https://www.paddlepaddle.org.cn/documentation/docs/zh/1.7/api_cn/dataset_cn.html>`_ 。示例中使用的 ``train_from_dataset`` 接口，想了解具体的文档和使用方法请参考 `Executor API <https://www.paddlepaddle.org.cn/documentation/docs/zh/1.7/api_cn/executor_cn.html>`_ 。示例中的 ``from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet`` 表示引入参数服务器架构进行分布式训练，如果想更进一步了解Fleet API的更多选项和示例，请参考 `Fleet API <https://www.paddlepaddle.org.cn/documentation/docs/zh/1.6/user_guides/howto/training/fleet_api_howto_cn.html>`_


 单机训练启动命令

--- a/doc/fluid/advanced_guide/distributed_training/cluster_quick_start_en.rst
+++ b/doc/fluid/advanced_guide/distributed_training/cluster_quick_start_en.rst
-..  _cluster_quick_start_en:
+Quick start for distributed training
+====================================

-Quick Start with Distributed Training
-==========================
+Distributed training with Fleet API
+-----------------------------------

-Preparation
--------------------
-In this article, we'll show you how to quickly start a PaddlePaddle distributed training task in a cluster. Before you start, do some preparatory work as follows:
-
-1. Prepare a connected training cluster. Here we use 4 training nodes with format ``*.paddlepaddle.com`` to represent the host name of the node. You can modify it according to the actual situation.
-
-2. Make sure you have read :ref:`install_steps` before you start and can run PaddlePaddle on all nodes of the cluster.
-
-Example code
-------------
+Since Paddle Fluid `Release
+1.5.1 <https://github.com/PaddlePaddle/Paddle/releases/tag/v1.5.1>`__,
+it is officially recommended to use the Fleet API for distributed
+training. For the introduction of the Fleet API, please refer to `Fleet
+Design Doc <https://github.com/PaddlePaddle/Fleet>`__.

-Let's use a very simple linear regression model as an example to explain how to start a distributed training task with 2 pserver server nodes and 2 trainer nodes. You can save this code as ``dist_train.py`` .
+Preparation
+~~~~~~~~~~~
+
+-  [x] Install Paddle Fluid. If not already installed, please refer to
+   `Beginner’s
+   Guide <https://www.paddlepaddle.org.cn/documentation/docs/en/1.7/beginners_guide/index_en.html>`__.
+-  [x] Master the most basic single node training method. Please refer
+   to the single card training described in `Single-node
+   training <https://www.paddlepaddle.org.cn/documentation/docs/en/1.5/user_guides/howto/training/single_node_en.html>`__.
+
+Click-through rate prediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here, we will use a simple example, a click-through rate prediction task,
+to illustrate how to configure the Fleet API for distributed training, and
+give an example that uses a single-node environment to simulate the
+distributed environment. The source code of the example comes from `CTR
+with
+Fleet <https://github.com/PaddlePaddle/Fleet/tree/develop/examples/ctr>`__.
+
+In order to facilitate learning, the example given here is a mixed code
+of single node and multi node. You can start single node or multi node
+tasks through different startup commands. For the part of obtaining data
+and the logic of data preprocessing, please refer to the source code and
+description of `CTR with
+Fleet <https://github.com/PaddlePaddle/Fleet/tree/develop/examples/ctr>`__.

 .. code:: python

-
+    from __future__ import print_function
+    from args import parse_args
    import os
-    import paddle
    import paddle.fluid as fluid
+    import sys
+    from network_conf import ctr_dnn_model_dataset
+    import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+
+    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+    from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
+
+    dense_feature_dim = 13
+    sparse_feature_dim = 10000001
+    batch_size = 100
+    thread_num = 10
+    embedding_size = 10
+    args = parse_args()
+
+    def main_function(is_local):
+        # common code for local training and distributed training
+        dense_input = fluid.layers.data(
+            name="dense_input", shape=[dense_feature_dim], dtype='float32')
+
+        sparse_input_ids = [
+            fluid.layers.data(name="C" + str(i), shape=[1], lod_level=1,
+                              dtype="int64") for i in range(1, 27)]
+
+        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+        dataset = fluid.DatasetFactory().create_dataset()
+        dataset.set_use_var([dense_input] + sparse_input_ids + [label])
+        pipe_command = "python criteo_reader.py %d" % sparse_feature_dim
+        dataset.set_pipe_command(pipe_command)
+        dataset.set_batch_size(batch_size)
+        dataset.set_thread(thread_num)
+
+        whole_filelist = ["raw_data/part-%d" % x 
+                           for x in range(len(os.listdir("raw_data")))]
+
+        dataset.set_filelist(whole_filelist)
+        loss, auc_var, batch_auc_var = ctr_dnn_model_dataset(
+            dense_input, sparse_input_ids, label, embedding_size,
+            sparse_feature_dim)

-    # train reader
-    BATCH_SIZE = 20
-    EPOCH_NUM = 30
-    BATCH_SIZE = 8
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.uci_housing.train(), buf_size=500),
-        batch_size=BATCH_SIZE)
-
-    def train():
-        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-        y_predict = fluid.layers.fc(input=x, size=1, act=None)
-
-        loss = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_loss = fluid.layers.mean(loss)
-        opt = fluid.optimizer.SGD(learning_rate=0.001)
-        opt.minimize(avg_loss)
-
-        place = fluid.CPUPlace()
-        feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-        exe = fluid.Executor(place)
-
-        # fetch distributed training environment setting
-        training_role = os.getenv("PADDLE_TRAINING_ROLE", None)
-        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)
-        trainers = int(os.getenv("PADDLE_TRAINERS"))
-        current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
-
-        t = fluid.DistributeTranspiler()
-        t.transpile(
-            trainer_id = trainer_id,
-            pservers = pserver_endpoints,
-            trainers = trainers)
-
-        if training_role == "PSERVER":
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
-            exe.run(startup_prog)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            trainer_prog = t.get_trainer_program()
+        exe = fluid.Executor(fluid.CPUPlace())
+        def train_loop(epoch=20):
+            for i in range(epoch):
+                exe.train_from_dataset(program=fluid.default_main_program(),
+                                       dataset=dataset,
+                                       fetch_list=[auc_var],
+                                       fetch_info=["auc"],
+                                       debug=False)
+        # local training
+        def local_train():
+            optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
+            optimizer.minimize(loss)
            exe.run(fluid.default_startup_program())
-
-            for epoch in range(EPOCH_NUM):
-                for batch_id, batch_data in enumerate(train_reader()):
-                    avg_loss_value, = exe.run(trainer_prog,
-                                          feed=feeder.feed(batch_data),
-                                          fetch_list=[avg_loss])
-                    if (batch_id + 1) % 10 == 0:
-                        print("Epoch: {0}, Batch: {1}, loss: {2}".format(
-                            epoch, batch_id, avg_loss_value[0]))
-            # destory the resource of current trainer node in pserver server node
-            exe.close()
+            train_loop()
+
+      # distributed training
+        def dist_train():
+            role = role_maker.PaddleCloudRoleMaker()
+            fleet.init(role)
+            strategy = DistributeTranspilerConfig()
+            strategy.sync_mode = False
+            optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
+            optimizer = fleet.distributed_optimizer(optimizer, strategy)
+            optimizer.minimize(loss)
+
+            if fleet.is_server():
+                fleet.init_server()
+                fleet.run_server()
+            elif fleet.is_worker():
+                fleet.init_worker()
+                exe.run(fluid.default_startup_program())
+                train_loop()
+        if is_local:
+            local_train()
        else:
-            raise AssertionError("PADDLE_TRAINING_ROLE should be one of [TRAINER, PSERVER]")
-
-    train()
-
-
-Environment Variables
------------------------------------
-
-When starting a distributed training task, different environment variables are used to represent different node roles, details as follows:
-
-.. list-table::
-  :header-rows: 1
-
-  * - Environment Variable
-    - Data Type 
-    - Example 
-    - Description
-  * - :code:`PADDLE_TRAINING_ROLE`
-    - str 
-    - :code:`PSERVER,TRANERR`
-    - role of current training node
-  * - :code:`PADDLE_PSERVER_IPS`
-    - str 
-    - :code:`ps0.paddlepaddle.com, ps1.paddlepaddle.com`
-    - The IP addresses or hostnames of all pserver nodes in the distributed training task, separated by ","
-  * - :code:`PADDLE_PSERVER_PORT`
-    - int 
-    - 6174 
-    - port that the pserver process listens to
-  * - :code:`PADDLE_TRAINERS`
-    - int
-    - 2 
-    - Number of trainer nodes in a distributed training task
-  * - :code:`PADDLE_CURRENT_IP`
-    - str 
-    - :code:`ps0.paddlepaddle.com`
-    - IP address or hostname of the current pserver node
-  * - :code:`PADDLE_TRAINER_ID`
-    - str 
-    - 0 
-    - ID of the current trainer node (unique), in the range of [0, PADDLE_TRAINERS)
-
-**Note:** Environment variables are just a way to get runtime information. In practical tasks, you can use command line parameters to obtain runtime information.
-
-API related to Distributed Training
---------------------------------
-
-DistributeTranspiler
-~~~~~~~~~~~~~~~~~~~~~~
-
-The machines in distributed training tasks based on the pserver-trainer architecture are divided into two roles: Parameter Server (pserver) and trainer. In Fluid, users only need to configure the network configuration required for single node training. The ``DistributeTranspiler`` module automatically modifies the single-node network settings into settings on which pserver and trainer needs to run based on the role of current training node:
+            dist_train()

-.. code:: python
+    if __name__ == '__main__':
+        main_function(args.is_local)

-  t = fluid.DistributeTranspiler()
-  t.transpile(
-    trainer_id = trainer_id,
-    pservers = pserver_endpoints,
-    trainers = trainers)
-  if PADDLE_TRAINING_ROLE == "TRAINER":
-    # fetch the trainer program and execute it
-    trainer_prog = t.get_trainer_program()
-    ...
+-  Note: The IO method used in this example is dataset, please refer to
+   `Dataset
+   API <https://www.paddlepaddle.org.cn/documentation/docs/en/1.7/api/dataset.html>`__
+   for specific documents and usage. For the ``train_from_dataset``
+   interface, please refer to `Executor
+   API <https://www.paddlepaddle.org.cn/documentation/docs/en/1.7/api/executor.html>`__.
+   ``from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet``
+   in this example imports the parameter server architecture for
+   distributed training. Refer to `Fleet
+   API <https://www.paddlepaddle.org.cn/documentation/docs/zh/1.6/user_guides/howto/training/fleet_api_howto_cn.html>`__
+   to learn more about the options and examples of the Fleet API.

-  elif PADDLE_TRAINER_ROLE == "PSERVER":
-    # fetch the pserver program and execute it
-    pserver_prog = t.get_pserver_program(current_endpoint)
-    ...
+Start command of single node training
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

+.. code:: bash

-Exe.close()
-~~~~~~~~~~~~~~
+    python train.py --is_local 1

+Start command of single machine simulation distributed training
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-The status information of all trainer nodes is saved in the pserver node. When trainer finishes training, ``exe.close()`` should be called to notify all PServer nodes to release the resources of the current Trainer nodes:
+Here we use launch\_ps, a built-in launcher of paddle, with which users
+can specify the number of workers and servers used to start the
+parameter server tasks.

-.. code:: python
+.. code:: bash

-  exe = fluid.Executor(fluid.CPUPlace())
-  # training process ...
-  exe.close() # notify PServer to destory the resource
-
-Note: every trainer needs to call exe.close() when the trainer finishes.
-
-Start a Distributed Training Task
----------------------------------
-
-.. list-table::
-   :header-rows: 1
-
-
-   * - Start Node 
-     - Start Command 
-     - Description
-   * - ps0.paddlepaddle.com 
-     - :code:`PADDLE_TRAINING_ROLE=PSERVER PADDLE_CURRENT_IP=ps0.paddlepaddle.com PADDLE_PSERVER_IPS=ps0.paddlepaddle.com, ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_PSERVER_PORT=6174 python fluid_dist.py`
-     - Start pserver node
-   * - ps1.paddlepaddle.com
-     - :code:`PADDLE_TRAINING_ROLE=PSERVER PADDLE_CURRENT_IP=ps1.paddlepaddle.com PADDLE_PSERVER_IPS=ps0.paddlepaddle.com, ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_PSERVER_PORT=6174 python fluid_dist.py`
-     - Start pserver node
-   * - trainer0.paddlepaddle.com       
-     - :code:`PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_IPS=ps0.paddlepaddle.com, ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_TRAINER_ID=0 PADDLE_PSERVER_PORT=6174 python fluid_dist.py`
-     - Start the number 0 Trainer Node 
-   * - trainer1.paddlepaddle.com       
-     - :code:`PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_IPS=ps0.paddlepaddle.com, ps1.paddlepaddle.com PADDLE_TRAINERS=2 PADDLE_TRAINER_ID=1 PADDLE_PSERVER_PORT=6174 python fluid_dist.py`
-     - Start the number 1 trainer node
+    python -m paddle.distributed.launch_ps --worker_num 2 --server_num 2 train.py
+
+The task running log can be viewed in the logs directory of the working
+directory. Once you can simulate distributed training on a single
+machine, you can move on to real multi-node distributed training. We
+recommend that users refer directly to
+`百度云运行分布式任务的示例 <https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/deploy_ctr_on_baidu_cloud_cn.html>`__.
--- a/doc/fluid/advanced_guide/distributed_training/fleet_api_howto_cn.rst
+++ b/doc/fluid/advanced_guide/distributed_training/fleet_api_howto_cn.rst
@@ -51,8 +51,8 @@ API最常见的两种使用场景，用一个模型做示例，目的是让用
     from nets import mlp
     from utils import gen_data

-     input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
-     input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
+     input_x = fluid.data(name="x", shape=[None, 32], dtype='float32')
+     input_y = fluid.data(name="y", shape=[None, 1], dtype='int64')

     cost = mlp(input_x, input_y)
     optimizer = fluid.optimizer.SGD(learning_rate=0.01)
@@ -79,8 +79,8 @@ API最常见的两种使用场景，用一个模型做示例，目的是让用
     from paddle.fluid.incubate.fleet.base import role_maker
     from utils import gen_data

-     input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
-     input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
+     input_x = fluid.data(name="x", shape=[None, 32], dtype='float32')
+     input_y = fluid.data(name="y", shape=[None, 1], dtype='int64')

     cost = mlp(input_x, input_y)
     optimizer = fluid.optimizer.SGD(learning_rate=0.01)
@@ -119,8 +119,8 @@ API最常见的两种使用场景，用一个模型做示例，目的是让用
     from paddle.fluid.incubate.fleet.base import role_maker
     from utils import gen_data

-     input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
-     input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
+     input_x = fluid.data(name="x", shape=[None, 32], dtype='float32')
+     input_y = fluid.data(name="y", shape=[None, 1], dtype='int64')

     cost = mlp(input_x, input_y)
     optimizer = fluid.optimizer.SGD(learning_rate=0.01)

--- a/doc/fluid/advanced_guide/distributed_training/index_cn.rst
+++ b/doc/fluid/advanced_guide/distributed_training/index_cn.rst
@@ -6,5 +6,4 @@
    :maxdepth: 1

    cluster_quick_start.rst
-    cluster_howto.rst
    fleet_api_howto_cn.rst
--- a/doc/fluid/advanced_guide/dygraph_to_static/debugging_cn.md
+++ b/doc/fluid/advanced_guide/dygraph_to_static/debugging_cn.md
+# 调试方法
+
+本节内容将介绍动态图转静态图（下文简称：动转静）推荐的几种调试方法。
+
+> **注解:**
+>
+> 请确保转换前的动态图代码能够成功运行，建议使用 [paddle.jit.ProgramTranslator().enable(False)](../../api_cn/dygraph_cn/ProgramTranslator_cn.html#enable)关闭动转静功能，直接运行动态图，如下：
+
+```python
+import paddle
+import numpy as np
+paddle.disable_static()
+# 关闭动转静功能
+paddle.jit.ProgramTranslator().enable(False)
+
+@paddle.jit.to_static
+def func(x):
+    x = paddle.to_tensor(x)
+    if x > 3:
+        x = x - 1
+    return x
+
+func(np.ones([3, 2]))
+```
+
+## 断点调试
+使用动转静功能时，您可以使用断点调试代码。
+例如，在代码中，调用 `pdb.set_trace()`：
+```Python
+import pdb
+
+@paddle.jit.to_static
+def func(x):
+    x = paddle.to_tensor(x)
+    pdb.set_trace()
+    if x > 3:
+        x = x - 1
+    return x
+```
+执行以下代码，将会在转化后的静态图代码中使用调试器：
+```Python
+func(np.ones([3, 2]))
+```
+
+运行结果：
+```bash
+> /tmp/tmpR809hf.py(6)func()
+-> def true_fn_0(x):
+(Pdb) n
+> /tmp/tmpR809hf.py(6)func()
+-> def false_fn_0(x):
+...
+```
+
+如果您想在原始的动态图代码中使用调试器，请先调用 [`paddle.jit.ProgramTranslator().enable(False)`](../../api_cn/dygraph_cn/ProgramTranslator_cn.html#enable)，如下：
+```python
+paddle.jit.ProgramTranslator().enable(False)
+func(np.ones([3, 2]))
+```
+运行结果：
+```bash
+> <ipython-input-22-0bd4eab35cd5>(10)func()
+-> if x > 3:
+...
+
+```
+
+## 打印转换后的代码
+您可以打印转换后的静态图代码，有2种方法：
+
+1. 使用被装饰后的函数的 `code` 属性
+   如下代码中，装饰器 `paddle.jit.to_static` 会将函数 `func` 转化为一个类对象 `StaticLayer`，可以使用 StaticLayer 的 `code` 属性来获得转化后的代码。
+    ```Python
+    @paddle.jit.to_static
+    def func(x):
+        x = paddle.to_tensor(x)
+        if x > 3:
+            x = x - 1
+        return x
+
+    print(func.code)
+    ```
+    运行结果：
+
+    ```bash
+
+    def func(x):
+        x = fluid.layers.assign(x)
+
+        def true_fn_0(x):
+            x = x - 1
+            return x
+
+        def false_fn_0(x):
+            return x
+        x = fluid.dygraph.dygraph_to_static.convert_operators.convert_ifelse(x >
+            3, true_fn_0, false_fn_0, (x,), (x,), (x,))
+        return x
+    ```
+
+2. 使用 `set_code_level(level)` 或环境变量 `TRANSLATOR_CODE_LEVEL=level`
+
+    通过调用 `set_code_level` 或设置环境变量 `TRANSLATOR_CODE_LEVEL`，可以在日志中查看转换后的代码：
+
+    ```python
+    @paddle.jit.to_static
+    def func(x):
+        x = paddle.to_tensor(x)
+        if x > 3:
+            x = x - 1
+        return x
+
+    paddle.jit.set_code_level() # 也可设置 os.environ["TRANSLATOR_CODE_LEVEL"] = '100'，效果相同
+    func(np.ones([1]))
+    ```
+   运行结果：
+
+    ```bash
+    2020-XX-XX 00:00:00,980-INFO: After the level 100 ast transformer: 'All Transformers', the transformed code:
+    def func(x):
+        x = fluid.layers.assign(x)
+
+        def true_fn_0(x):
+            x = x - 1
+            return x
+
+        def false_fn_0(x):
+            return x
+        x = fluid.dygraph.dygraph_to_static.convert_operators.convert_ifelse(x >
+            3, true_fn_0, false_fn_0, (x,), (x,), (x,))
+        return x
+    ```
+    `set_code_level` 函数可以设置查看不同的AST Transformer转化后的代码，详情请见 [set_code_level](../../../paddle/api/paddle/fluid/dygraph/jit/set_code_level_cn.html)。
+
+## 使用 `print`
+`print` 函数可以用来查看变量，该函数在动转静中会被转化。当仅打印 Paddle Tensor 时，实际运行时会被转换为 Paddle 算子 [Print](../../api_cn/layers_cn/Print_cn.html)，否则仍然运行 `print`。
+```python
+@paddle.jit.to_static
+def func(x):
+    x = paddle.to_tensor(x)
+
+    # 打印x，x是Paddle Tensor，实际运行时会运行Paddle Print(x)
+    print(x)
+
+    # 打印注释，非Paddle Tensor，实际运行时仍运行print
+    print("Here call print function.")
+
+    if len(x) > 3:
+        x = x - 1
+    else:
+        x = paddle.ones(shape=[1])
+    return x
+
+func(np.ones([1]))
+```
+
+运行结果：
+```bash
+Variable: assign_0.tmp_0
+  - lod: {}
+  - place: CPUPlace
+  - shape: [1]
+  - layout: NCHW
+  - dtype: double
+  - data: [1]
+Here call print function.  
+```
+
+## 日志打印
+ProgramTranslator在日志中记录了额外的调试信息，以帮助您了解动转静过程中函数是否被成功转换。
+您可以调用 [`paddle.jit.set_verbosity(level)`](../../../paddle/api/paddle/fluid/dygraph/jit/set_verbosity_cn.html) 或设置环境变量 `TRANSLATOR_VERBOSITY=level` 来设置日志详细等级，并查看不同等级的日志信息。目前，`level` 可以取值0-3：
+- 0: 无日志
+- 1: 包括了动转静转化流程的信息，如转换前的源码、转换的可调用对象
+- 2: 包括以上信息，还包括更详细函数转化日志
+- 3: 包括以上信息，以及更详细的动转静日志
+
+> **注意:**
+>
+> 日志中包括了源代码等信息，请在共享日志前确保它不包含敏感信息。
+
+可以在代码运行前调用 `paddle.jit.set_verbosity` 控制日志详细程度：
+```python
+paddle.jit.set_verbosity(3)
+```
+或者设置环境变量 `TRANSLATOR_VERBOSITY`：
+```python
+import os
+os.environ["TRANSLATOR_VERBOSITY"] = '3'
+```
+
+运行结果：
+```bash
+2020-XX-XX 00:00:00,123-Level 1:    Source code:
+@paddle.jit.to_static
+def func(x):
+    x = paddle.to_tensor(x)
+    if len(x) > 3:
+        x = x - 1
+    else:
+        x = paddle.ones(shape=[1])
+    return x
+
+2020-XX-XX 00:00:00,152-Level 1: Convert callable object: convert <built-in function len>.
+```
--- a/doc/fluid/advanced_guide/dygraph_to_static/debugging_en.md
+++ b/doc/fluid/advanced_guide/dygraph_to_static/debugging_en.md
+# Debugging Methods
+
+This section introduces several recommended debugging methods for Dynamic Graph to Static Graph (hereafter called Dynamic-to-Static).
+
+> **NOTE:**
+>
+> Please ensure that the dynamic graph code before transformation can run successfully. It is recommended to call [paddle.jit.ProgramTranslator().enable(False)](../../api/dygraph/ProgramTranslator_en.html#enable) to disable Dynamic-to-Static, and run dynamic graph code as follows:
+
+
+```python
+import paddle
+import numpy as np
+paddle.disable_static()
+
+# Disable Dynamic-to-Static
+paddle.jit.ProgramTranslator().enable(False)
+
+@paddle.jit.to_static
+def func(x):
+    x = paddle.to_tensor(x)
+    if x > 3:
+        x = x - 1
+    return x
+
+func(np.ones([3, 2]))
+```
+
+## Breakpoint Debugging
+When using Dynamic-to-Static, you can use breakpoints to debug.
+
+For example, call `pdb.set_trace()` in your code:
+```Python
+import pdb
+
+@paddle.jit.to_static
+def func(x):
+    x = paddle.to_tensor(x)
+    pdb.set_trace()
+    if x > 3:
+        x = x - 1
+    return x
+```
+Executing the following code will land the debugger in the transformed static graph code:
+```Python
+func(np.ones([3, 2]))
+```
+
+```bash
+> /tmp/tmpR809hf.py(6)func()
+-> def true_fn_0(x):
+(Pdb) n
+> /tmp/tmpR809hf.py(6)func()
+-> def false_fn_0(x):
+...
+```
+
+Calling [`paddle.jit.ProgramTranslator().enable(False)`](../../api/dygraph/ProgramTranslator_en.html#enable) before executing the code will land the debugger in the original dynamic graph code:
+```python
+paddle.jit.ProgramTranslator().enable(False)
+func(np.ones([3, 2]))
+```
+
+```bash
+> <ipython-input-22-0bd4eab35cd5>(10)func()
+-> if x > 3:
+...
+
+```
+
+## Print Transformed Code
+
+There are two ways to print the transformed static graph code:
+
+1. Use the attribute `code` of the decorated function
+
+   In the following code, the decorator `paddle.jit.to_static` transforms `func` into a class object `StaticLayer`. You can use the `code` attribute of `StaticLayer` to get the transformed code.
+    ```Python
+    @paddle.jit.to_static
+    def func(x):
+        x = paddle.to_tensor(x)
+        if x > 3:
+            x = x - 1
+        return x
+
+    print(func.code)
+    ```
+    ```bash
+
+    def func(x):
+        x = fluid.layers.assign(x)
+
+        def true_fn_0(x):
+            x = x - 1
+            return x
+
+        def false_fn_0(x):
+            return x
+        x = fluid.dygraph.dygraph_to_static.convert_operators.convert_ifelse(x >
+            3, true_fn_0, false_fn_0, (x,), (x,), (x,))
+        return x
+    ```
+2. Call `set_code_level(level)` or set environment variable `TRANSLATOR_CODE_LEVEL=level`
+
+    You can view the transformed code in the log by calling `set_code_level` or set environment variable `TRANSLATOR_CODE_LEVEL`.
+
+    ```python
+    @paddle.jit.to_static
+    def func(x):
+        x = paddle.to_tensor(x)
+        if x > 3:
+            x = x - 1
+        return x
+
+    paddle.jit.set_code_level() # the same effect to set os.environ["TRANSLATOR_CODE_LEVEL"] = '100'
+    func(np.ones([1]))
+    ```
+
+    ```bash
+    2020-XX-XX 00:00:00,980-INFO: After the level 100 ast transformer: 'All Transformers', the transformed code:
+    def func(x):
+        x = fluid.layers.assign(x)
+
+        def true_fn_0(x):
+            x = x - 1
+            return x
+
+        def false_fn_0(x):
+            return x
+        x = fluid.dygraph.dygraph_to_static.convert_operators.convert_ifelse(x >
+            3, true_fn_0, false_fn_0, (x,), (x,), (x,))
+        return x
+    ```
+    `set_code_level` can set different levels to view the code transformed by different AST transformers. For details, please refer to [set_code_level](../../../paddle/api/paddle/fluid/dygraph/jit/set_code_level_en.html).
+
+## `print`
+You can call `print` to view variables. `print` will be transformed when using Dynamic-to-Static. When only Paddle Tensor is printed, `print` will be transformed and call Paddle operator [Print](../../api/layers/Print.html) in runtime. Otherwise, call python `print`.
+
+```python
+@paddle.jit.to_static
+def func(x):
+    x = paddle.to_tensor(x)
+    # x is a Paddle Tensor, so it will run Paddle Print(x) actually.
+    print(x)
+
+    # The string is not a Paddle Tensor, so it will run print as-is.
+    print("Here call print function.")
+
+    if len(x) > 3:
+        x = x - 1
+    else:
+        x = paddle.ones(shape=[1])
+    return x
+
+func(np.ones([1]))
+```
+
+```bash
+Variable: assign_0.tmp_0
+  - lod: {}
+  - place: CPUPlace
+  - shape: [1]
+  - layout: NCHW
+  - dtype: double
+  - data: [1]
+Here call print function.  
+```
+
+## Log Printing
+ProgramTranslator can log additional debugging information to help you know whether the function was successfully transformed or not.
+
+You can call [`paddle.jit.set_verbosity(level)`](../../../paddle/api/paddle/fluid/dygraph/jit/set_verbosity_en.html) or set environment variable `TRANSLATOR_VERBOSITY=level` to enable logging and view logs of different levels. The argument `level` varies from 0 to 3:
+- 0: no logging
+- 1: includes information about the Dynamic-to-Static transformation process, such as the un-transformed source code, the callable object to transform and so on
+- 2: includes above and more detailed function transformation logs
+- 3: includes above and extremely verbose logging
+
+> **WARNING:**
+>
+> The logs include information such as source code. Please make sure the logs don't contain any sensitive information before sharing them.
+
+You can call `paddle.jit.set_verbosity` to control the verbosity level of logs:
+```python
+paddle.jit.set_verbosity(3)
+```
+or use the environment variable `TRANSLATOR_VERBOSITY`:
+```python
+import os
+os.environ["TRANSLATOR_VERBOSITY"] = '3'
+```
+
+```bash
+2020-XX-XX 00:00:00,123-Level 1:    Source code:
+@paddle.jit.to_static
+def func(x):
+    x = paddle.to_tensor(x)
+    if len(x) > 3:
+        x = x - 1
+    else:
+        x = paddle.ones(shape=[1])
+    return x
+
+2020-XX-XX 00:00:00,152-Level 1: Convert callable object: convert <built-in function len>.
+```
--- a/doc/fluid/advanced_guide/dygraph_to_static/error_handling_cn.md
+++ b/doc/fluid/advanced_guide/dygraph_to_static/error_handling_cn.md
+# 报错信息处理
+
+本节内容将介绍使用动态图转静态图（下文简称：动转静）功能发生异常时，[ProgramTranslator](./program_translator_cn.html)对报错信息做的处理，以帮助您更好地理解动转静报错信息。使用动转静功能运行动态图代码时，内部可以分为2个步骤：动态图代码转换成静态图代码，运行静态图代码。接下来将分别介绍这2个步骤中的异常报错情况。
+
+## 动转静过程中的异常
+在动态图代码转换成静态图代码的过程中，如果ProgramTranslator无法转换一个函数时，将会显示警告信息，并尝试直接运行该函数。
+如下代码中，函数 `inner_func` 在调用前被转换成静态图代码，当 `x = inner_func(data)` 调用该函数时，不能重复转换，会给出警告信息：
+
+```python
+import paddle
+import numpy as np
+
+paddle.disable_static()
+
+@paddle.jit.to_static
+def func():
+    def inner_func(x):
+        x_tensor = paddle.to_tensor(x)
+        return x_tensor
+    data = np.ones([3]).astype("int32")
+    x = inner_func(data)
+    return x
+func()
+```
+
+ProgramTranslator打印的警告信息如下：
+
+```bash
+WARNING: <function inner_func at 0x7fa9bcaacf50> doesn't have to be transformed to static function because it has been transformed before, it will be run as-is.
+```
+
+## 运行转换后的代码报错
+
+如果在动转静后的静态图代码中发生异常，ProgramTranslator 会捕获该异常，增强异常报错信息，将静态图代码报错行映射到转换前的动态图代码，并重新抛出该异常。
+重新抛出的异常具有以下特点：
+
+- 隐藏了部分对用户无用的动转静过程调用栈；
+- 转换前的代码会给出提示："In user code:"；
+- 报错信息中包含了转换前的原始动态图代码；
+
+例如，运行以下代码，在静态图构建时，即编译期会抛出异常：
+
+```python
+import paddle
+import numpy as np
+
+paddle.disable_static()
+
+@paddle.jit.to_static
+def func(x):
+    x = paddle.to_tensor(x)
+    x = paddle.reshape(x, shape=[-1, -1])
+    return x
+
+func(np.ones([3, 2]))
+```
+
+运行结果：
+```bash
+Traceback (most recent call last):
+  <ipython-input-13-f9c3ea702e3a> in <module>()
+     func(np.ones([3, 2]))
+  File "paddle/fluid/dygraph/dygraph_to_static/program_translator.py", line 332, in __call__
+    raise new_exception
+AssertionError: In user code:
+
+    File "<ipython-input-13-f9c3ea702e3a>", line 7, in func
+        x = fluid.layers.reshape(x, shape=[-1, -1])
+    File "paddle/fluid/layers/nn.py", line 6193, in reshape
+        attrs["shape"] = get_attr_shape(shape)
+    File "paddle/fluid/layers/nn.py", line 6169, in get_attr_shape
+        "be -1. But received shape[%d] is also -1." % dim_idx)
+    AssertionError: Only one dimension value of 'shape' in reshape can be -1. But received shape[1] is also -1.
+```
+
+上述报错信息可以分为3点：
+
+1. 报错栈中，涉及代码转换过程的信息栈默认会被隐藏，不进行展示，以减少干扰信息。
+
+2. ProgramTranslator处理后的报错信息中，会包含提示"In user code:"，表示之后的报错栈中，包含动转静前的动态图代码，即用户写的代码：
+	```bash
+	AssertionError: In user code:
+
+        File "<ipython-input-13-f9c3ea702e3a>", line 7, in func
+	       x = fluid.layers.reshape(x, shape=[-1, -1])
+	    File "paddle/fluid/layers/nn.py", line 6193, in reshape
+	        attrs["shape"] = get_attr_shape(shape)
+	    File "paddle/fluid/layers/nn.py", line 6169, in get_attr_shape
+	        "be -1. But received shape[%d] is also -1." % dim_idx)
+	```
+	其中，`File "<ipython-input-13-f9c3ea702e3a>", line 7, in func` 是转换前的代码位置信息，`x = fluid.layers.reshape(x, shape=[-1, -1])` 是转换前的代码。
+
+3. 新的异常中，包含原始报错中的报错信息，如下：
+	```bash
+	AssertionError: Only one dimension value of 'shape' in reshape can be -1. But received shape[1] is also -1.
+	```
+
+运行以下代码，在静态图运行时，即运行期会抛出异常：
+
+```Python
+@paddle.jit.to_static
+def func(x):
+    x = paddle.to_tensor(x)
+    two = paddle.fill_constant(shape=[1], value=2, dtype="int32")
+    x = paddle.reshape(x, shape=[1, two])
+    return x
+
+func(np.ones([3]).astype("int32"))
+```
+
+运行结果：
+
+```bash
+Traceback (most recent call last):
+  File "<ipython-input-57-c63d6a351262>", line 10, in <module>()
+     func(np.ones([3]).astype("int32"))
+  File "paddle/fluid/dygraph/dygraph_to_static/program_translator.py", line 332, in __call__
+    raise new_exception
+
+EnforceNotMet: In user code:
+
+    File "<ipython-input-57-c63d6a351262>", line 7, in func
+      x = paddle.reshape(x, shape=[1, two])
+    File "paddle/tensor/manipulation.py", line 1347, in reshape
+      return paddle.fluid.layers.reshape(x=x, shape=shape, name=name)
+    File "paddle/fluid/layers/nn.py", line 6209, in reshape
+      "XShape": x_shape})
+    File "paddle/fluid/layer_helper.py", line 43, in append_op
+      return self.main_program.current_block().append_op(*args, **kwargs)
+    File "paddle/fluid/framework.py", line 2880, in append_op
+      attrs=kwargs.get("attrs", None))
+    File "paddle/fluid/framework.py", line 1977, in __init__
+      for frame in traceback.extract_stack():
+
+--------------------------------------
+C++ Traceback (most recent call last):
+--------------------------------------
+0   paddle::imperative::Tracer::TraceOp(std::string const&, paddle::imperative::NameVarBaseMap const&, paddle::imperative::NameVarBaseMap const&, paddle::framework::AttributeMap, paddle::platform::Place const&, bool)
+1   paddle::imperative::OpBase::Run(paddle::framework::OperatorBase const&, paddle::imperative::NameVarBaseMap const&, paddle::imperative::NameVarBaseMap const&, paddle::framework::AttributeMap const&, paddle::platform::Place const&)
+2   paddle::imperative::PreparedOp::Run(paddle::imperative::NameVarBaseMap const&, paddle::imperative::NameVarBaseMap const&, paddle::framework::AttributeMap const&)
+3   std::_Function_handler<void (paddle::framework::ExecutionContext const&), paddle::framework::OpKernelRegistrarFunctor<paddle::platform::CPUPlace, false, 0ul, paddle::operators::RunProgramOpKernel<paddle::platform::CPUDeviceContext, float> >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1}>::_M_invoke(std::_Any_data const&, paddle::framework::ExecutionContext const&)
+4   paddle::operators::RunProgramOpKernel<paddle::platform::CPUDeviceContext, float>::Compute(paddle::framework::ExecutionContext const&) const
+5   paddle::framework::Executor::RunPartialPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, long, long, bool, bool, bool)
+6   paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, paddle::platform::Place const&)
+7   paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&) const
+8   paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&, paddle::framework::RuntimeContext*) const
+9   paddle::operators::ReshapeKernel::operator()(paddle::framework::ExecutionContext const&) const
+10  paddle::operators::ReshapeOp::ValidateShape(std::vector<int, std::allocator<int> >, paddle::framework::DDim const&)
+11  paddle::platform::EnforceNotMet::EnforceNotMet(std::string const&, char const*, int)
+12  paddle::platform::GetCurrentTraceBackString()
+
+----------------------
+Error Message Summary:
+----------------------
+InvalidArgumentError: The 'shape' in ReshapeOp is invalid. The input tensor X'size must be equal to the capacity of 'shape'. But received X's shape = [3], X's size = 3, 'shape' is [1, 2], the capacity of 'shape' is 2.
+  [Hint: Expected capacity == in_size, but received capacity:2 != in_size:3.] (at /paddle/paddle/fluid/operators/reshape_op.cc:206)
+  [operator < reshape2 > error]  [operator < run_program > error]
+```
+
+上述异常中，除了隐藏部分报错栈、报错定位到转换前的动态图代码外，报错信息中包含了C++报错栈 `C++ Traceback` 和 `Error Message Summary`，这是 Paddle 的 C++ 端异常信息，经处理后在 Python 的异常信息中显示。
--- a/doc/fluid/advanced_guide/dygraph_to_static/error_handling_en.md
+++ b/doc/fluid/advanced_guide/dygraph_to_static/error_handling_en.md
+# Error Handling
+
+This section will introduce the error information when an exception occurs, so as to help you better understand the Dynamic-to-Static error information.
+When running the transformed static graph code, the internal procedure can be divided into two steps: the dynamic graph code is transformed into the static graph code, and the static graph code is run. We will introduce the error reporting in these two steps.
+
+## Exceptions in Dynamic-to-Static Transformation
+
+If ProgramTranslator cannot transform a function, it will display a warning message and try to run the function as-is.
+
+In the following code, the function `inner_func` is transformed before calling. When calling `inner_func` in `x = inner_func(data)`, it is not allowed to transform repeatedly, and a warning message will be given:
+
+```python
+import paddle
+import numpy as np
+
+paddle.disable_static()
+
+@paddle.jit.to_static
+def func():
+    def inner_func(x):
+        x_tensor = paddle.to_tensor(x)
+        return x_tensor
+    data = np.ones([3]).astype("int32")
+    x = inner_func(data)
+    return x
+func()
+```
+
+The warning message is as follows:
+```bash
+WARNING: <function inner_func at 0x7fa9bcaacf50> doesn't have to be transformed to static function because it has been transformed before, it will be run as-is.
+```
+## Exceptions in Running Transformed Code
+
+When an exception occurs in the transformed code by ProgramTranslator, the exception is caught and the error message is augmented. It maps the error line of the static graph code to the un-transformed dynamic graph code, and then re-raises the exception.
+
+The re-raised exception has the following features:
+
+- Some useless call stacks of Dynamic-to-Static are hidden;
+- A prompt will be given before the un-transformed code: "In user code:";
+- The error message includes references to the original dynamic graph code before transformation;
+
+For example, if executing the following code, an exception is raised when the static graph is built, that is, at compile time:
+
+```python
+import paddle
+import numpy as np
+
+paddle.disable_static()
+
+@paddle.jit.to_static
+def func(x):
+    x = paddle.to_tensor(x)
+    x = paddle.reshape(x, shape=[-1, -1])
+    return x
+
+func(np.ones([3, 2]))
+```
+
+```bash
+Traceback (most recent call last):
+  <ipython-input-13-f9c3ea702e3a> in <module>()
+     func(np.ones([3, 2]))
+  File "paddle/fluid/dygraph/dygraph_to_static/program_translator.py", line 332, in __call__
+    raise new_exception
+AssertionError: In user code:
+
+    File "<ipython-input-13-f9c3ea702e3a>", line 7, in func
+        x = fluid.layers.reshape(x, shape=[-1, -1])
+    File "paddle/fluid/layers/nn.py", line 6193, in reshape
+        attrs["shape"] = get_attr_shape(shape)
+    File "paddle/fluid/layers/nn.py", line 6169, in get_attr_shape
+        "be -1. But received shape[%d] is also -1." % dim_idx)
+    AssertionError: Only one dimension value of 'shape' in reshape can be -1. But received shape[1] is also -1.
+```
+
+The above error information can be divided into three points:
+
+1. In the error stack, the call stacks related to the code transformation process are hidden by default and not displayed, so as to avoid confusion.
+
+2. In the error message processed by ProgramTranslator, a prompt "In user code:" will be included, which means that the following error stacks contain the original dynamic graph code, that is, the code written by the user:
+
+    ```bash
+    AssertionError: In user code:
+
+        File "<ipython-input-13-f9c3ea702e3a>", line 7, in func
+           x = fluid.layers.reshape(x, shape=[-1, -1])
+        File "paddle/fluid/layers/nn.py", line 6193, in reshape
+            attrs["shape"] = get_attr_shape(shape)
+        File "paddle/fluid/layers/nn.py", line 6169, in get_attr_shape
+            "be -1. But received shape[%d] is also -1." % dim_idx)
+    ```
+    `File "<ipython-input-13-f9c3ea702e3a>", line 7, in func` is the location information of un-transformed code, `x = fluid.layers.reshape(x, shape=[-1, -1])` is the un-transformed code.
+
+3. The new exception contains the message that the exception originally reported, as follows:  
+    ```bash
+    AssertionError: Only one dimension value of 'shape' in reshape can be -1. But received shape[1] is also -1.
+    ```  
+
+If you execute the following code, an exception is raised when the static graph is executed, that is, at runtime:
+
+```Python
+@paddle.jit.to_static
+def func(x):
+    x = paddle.to_tensor(x)
+    two = paddle.fill_constant(shape=[1], value=2, dtype="int32")
+    x = paddle.reshape(x, shape=[1, two])
+    return x
+
+func(np.ones([3]).astype("int32"))
+```
+
+```bash
+Traceback (most recent call last):
+  File "<ipython-input-57-c63d6a351262>", line 10, in <module>()
+     func(np.ones([3]).astype("int32"))
+  File "paddle/fluid/dygraph/dygraph_to_static/program_translator.py", line 332, in __call__
+    raise new_exception
+
+EnforceNotMet: In user code:
+
+    File "<ipython-input-57-c63d6a351262>", line 7, in func
+      x = paddle.reshape(x, shape=[1, two])
+    File "paddle/tensor/manipulation.py", line 1347, in reshape
+      return paddle.fluid.layers.reshape(x=x, shape=shape, name=name)
+    File "paddle/fluid/layers/nn.py", line 6209, in reshape
+      "XShape": x_shape})
+    File "paddle/fluid/layer_helper.py", line 43, in append_op
+      return self.main_program.current_block().append_op(*args, **kwargs)
+    File "paddle/fluid/framework.py", line 2880, in append_op
+      attrs=kwargs.get("attrs", None))
+    File "paddle/fluid/framework.py", line 1977, in __init__
+      for frame in traceback.extract_stack():
+
+--------------------------------------
+C++ Traceback (most recent call last):
+--------------------------------------
+0   paddle::imperative::Tracer::TraceOp(std::string const&, paddle::imperative::NameVarBaseMap const&, paddle::imperative::NameVarBaseMap const&, paddle::framework::AttributeMap, paddle::platform::Place const&, bool)
+1   paddle::imperative::OpBase::Run(paddle::framework::OperatorBase const&, paddle::imperative::NameVarBaseMap const&, paddle::imperative::NameVarBaseMap const&, paddle::framework::AttributeMap const&, paddle::platform::Place const&)
+2   paddle::imperative::PreparedOp::Run(paddle::imperative::NameVarBaseMap const&, paddle::imperative::NameVarBaseMap const&, paddle::framework::AttributeMap const&)
+3   std::_Function_handler<void (paddle::framework::ExecutionContext const&), paddle::framework::OpKernelRegistrarFunctor<paddle::platform::CPUPlace, false, 0ul, paddle::operators::RunProgramOpKernel<paddle::platform::CPUDeviceContext, float> >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1}>::_M_invoke(std::_Any_data const&, paddle::framework::ExecutionContext const&)
+4   paddle::operators::RunProgramOpKernel<paddle::platform::CPUDeviceContext, float>::Compute(paddle::framework::ExecutionContext const&) const
+5   paddle::framework::Executor::RunPartialPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, long, long, bool, bool, bool)
+6   paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, paddle::platform::Place const&)
+7   paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&) const
+8   paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, paddle::platform::Place const&, paddle::framework::RuntimeContext*) const
+9   paddle::operators::ReshapeKernel::operator()(paddle::framework::ExecutionContext const&) const
+10  paddle::operators::ReshapeOp::ValidateShape(std::vector<int, std::allocator<int> >, paddle::framework::DDim const&)
+11  paddle::platform::EnforceNotMet::EnforceNotMet(std::string const&, char const*, int)
+12  paddle::platform::GetCurrentTraceBackString()
+
+----------------------
+Error Message Summary:
+----------------------
+InvalidArgumentError: The 'shape' in ReshapeOp is invalid. The input tensor X'size must be equal to the capacity of 'shape'. But received X's shape = [3], X's size = 3, 'shape' is [1, 2], the capacity of 'shape' is 2.
+  [Hint: Expected capacity == in_size, but received capacity:2 != in_size:3.] (at /paddle/paddle/fluid/operators/reshape_op.cc:206)
+  [operator < reshape2 > error]  [operator < run_program > error]
+```
+
+In the above exception, in addition to hiding part of the error stack and locating the error to the un-transformed dynamic graph code, the error information includes the c++ error stack `C++ Traceback` and `Error Message Summary`, which are the exception from C++ and are displayed in Python exception after processing.
--- a/doc/fluid/advanced_guide/dygraph_to_static/grammar_list_cn.rst
+++ b/doc/fluid/advanced_guide/dygraph_to_static/grammar_list_cn.rst
+ProgramTranslator支持的语法
+==========================
+
+ProgramTranslator本质是把Python运行语法转写为PaddlePaddle静态图代码，但是Python语法的表达能力和PaddlePaddle静态图表达能力存在不同，这使得一些代码无法被转换。
+
+本章节我们将详细讲述在动转静过程中支持转化哪些语法，不支持哪些语法，并且讲述如何改写代码能够解决语法不支持的场景。
+
+动转静支持的语法分为以下几个大类：
+
+控制流相关关键词
+------------------
+
+控制流指if-elif-else，while等能够控制程序语句执行顺序的关键字。PaddlePaddle静态图通过cond，while_loop API来实现条件判断和循环，如果动态图Python控制流的判断条件或循环条件依赖 PaddlePaddle Tensor，动转静后会被转化为等价的PaddlePaddle控制流接口，否则仍然使用Python控制流逻辑运行。在动转静过程中这些关键字的转化情况为：
+
+1. if-elif-else 条件
+
+当 ``if <条件>`` 中的条件是Tensor时，ProgramTranslator会把该if-elif-else语句转化为等价的cond API语句。否则会按普通Python if-elif-else的逻辑运行。需注意cond支持的Tensor只能是numel为1的bool Tensor，所以请使用这种Tensor进行条件判断，其他Tensor会报错。
+
+2. while 循环
+
+当while循环中的条件是Tensor时，ProgramTranslator会把该while语句转化为等价的while_loop API语句，否则会按普通Python while运行。需注意while循环条件中的Tensor只能是numel为1的bool Tensor，所以请使用这种Tensor进行条件判断，其他Tensor会报错。
+
+
+3. for 循环
+
+3.1 ``for _ in range(__)`` 循环
+
+ProgramTranslator先将其转化为等价的Python while循环，然后按while循环的逻辑进行动静转换。
+
+3.2 ``for _ in x`` 循环
+
+当x是Python容器或迭代器，则会用普通Python逻辑运行。当x是Tensor时，会转化为循环中每次对应拿出x[0], x[1], ... 。
+
+3.3 ``for idx, val in enumerate(x)`` 循环
+
+当x是Python容器或迭代器，则会用普通Python逻辑运行。当x是Tensor时，idx会转化为依次0，1，...的1-D Tensor。val会转化为循环中每次对应拿出x[0], x[1], ... 。
+
+4. break，continue
+
+ProgramTranslator 可以支持在循环中添加break，continue语句，其底层实现原理是对于要break，continue的部分在相应时候使用cond在一定条件下跳过执行。
+
+5. return
+
+ProgramTranslator 支持在循环，条件判断中return结果而不需要一定在函数末尾return。也能够支持return不同长度tuple和不同类型的Tensor。其底层实现原理是对return后的部分相应使用cond在一定条件下跳过执行。
+
+
+一些需要转化的运算类型
+------------------------
+
+1. +，-，*，/，**, >, <, >= , <=, == 等Python内置运算
+
+由于静态图有重载这些基本运算符，所以这些被ProgramTranslator转化后都适用相应重载的运算符，动转静支持此类运算。
+
+2. and，or，not 逻辑运算
+
+Python内置and，or，not逻辑运算关键词，ProgramTranslator在语句的运算时会判断逻辑运算关键词运行的对象是否是Tensor，如果都是Tensor，我们将其转化为静态图对应的逻辑运算接口并运行。
+
+3. 类型转化
+
+动态图中可以直接用Python的类型转化语法来转化Tensor类型。例如x是Tensor时，float(x)可以将x的类型转化为float。ProgramTranslator在运行时判断x是否是Tensor，如果是，则在动转静时使用静态图cast接口转化相应的Tensor类型。
+
+Python 函数相关
+---------------------
+
+1. print
+
+如果x是Tensor，在动态图模式中print(x)可以打印x的值。在动转静过程中我们把此转化为静态图的Print接口实现，使得在静态图中也能打印。如果print的参数不是Tensor，那么我们没有把相应print语句进行转写。
+
+2. len
+
+如果x是Tensor，在动态图模式中len(x)可以获得x第0维度的长度。在动转静中我们把此转化为静态图shape接口，并返回shape的第0维。另外如果x是个TensorArray，那么len(x)将会使用静态图接口control_flow.array_length返回TensorArray的长度。对于其他情况，动转静时会按照普通Python len函数运行。
+
+3. lambda 表达式
+
+动转静允许写带有Python lambda表达式的语句，并且我们会适当改写使得返回对应结果。
+
+4. 函数内再调用函数
+
+对于函数内调用其他函数的情况，ProgramTranslator也会对内部的函数递归地进行动转静，这样做的好处是可以在最外层函数只需加一次装饰器即可，而不需要每个函数都加装饰器。但需要注意，动转静还不支持函数递归调用自己，详细原因请查看下文动转静无法正确运行的情况。
+
+报错异常相关
+--------------
+
+1. assert
+
+如果x是Tensor，在动态图中可以通过assert x来强制x为True或者非0值，在动转静中我们把此转化为静态图Assert接口支持此功能。
+
+
+Python基本容器
+---------------
+
+1. list：对于一个list如果里面元素都是Tensor，那么动转静会转化其为TensorArray，静态图TensorArray可以支持append，pop，修改操作。因此ProgramTranslator在元素皆为Tensor的list中支持上面三种操作。换言之，其他list操作，比如sort无法支持。对于list中并非所有元素是Tensor的情况，ProgramTranslator会将其作为普通Python list运行。
+
+2. dict：ProgramTranslator会将相应的dict中的Tensor添加进静态图Program，因此使用dict是动转静支持的语法。
+
+动转静无法正确运行的情况
+--------------------------
+
+1. Reshape后的变量调用其shape作为PaddlePaddle API参数。
+
+具体表现比如 ``x = reshape(x, shape=shape_tensor)`` ，再使用 ``x.shape[0]`` 的值进行其他操作。这种情况会由于动态图和静态图的本质不同而使得动态图能够运行，但静态图运行失败。其原因是动态图情况下，API是直接返回运行结果，因此 ``x.shape`` 在经过reshape运算后是确定的。但是在转化为静态图后，因为静态图API只是组网，``shape_tensor`` 的值在组网时是不知道的，所以 ``reshape`` 接口组网完，静态图并不知道 ``x.shape`` 的值。PaddlePaddle静态图用-1表示未知的shape值，此时 ``x`` 的shape每个维度会被设为-1，而不是期望的值。
+
+遇到这类情况我们建议尽量固定shape值，减少reshape操作。
+
+2. 多重list嵌套读写Tensor
+
+具体表现如 ``l = [[tensor1, tensor2], [tensor3, tensor4]]`` ，因为现在动转静将元素全是Tensor的list转化为TensorArray，而PaddlePaddle的TensorArray还不支持多维数组，因此这种情况下，动转静无法正确运行。
+
+遇到这类情况我们建议尽量用一维list，或者自己使用PaddlePaddle的create_array，array_read，array_write接口编写为TensorArray。
+
+3. Tensor值在被装饰函数中转成numpy array进行运算
+
+具体表现为在被装饰函数中没有返回Tensor时就使用 ``numpy.array(tensor)`` 将Tensor转化为numpy array并使用numpy接口进行运算。这种情况在动态图下因为Tensor有值是可以正常运行的，但是在静态图时由于Tensor只是组网变量，在没有运行时没有数值，因此无法进行numpy运算。
+
+遇到这种情况我们建议在动转静的函数中尽量使用PaddlePaddle接口替代numpy接口进行运算。
+
+4. 一个函数递归调用自己
+
+ProgramTranslator还无法支持一个函数递归调用自己，原因是递归常常会用 ``if-else`` 构造停止递归的条件。然而这样的停止条件在静态图下只是一个 ``cond`` 组网，组网并不能在编译阶段决定自己组多少次，会导致函数运行时一直组网递归直至栈溢出，因此ProgramTranslator还无法支持一个函数递归调用自己。
+
+遇到这种情况我们建议将代码改为非递归写法。
+
--- a/doc/fluid/advanced_guide/dygraph_to_static/grammar_list_en.rst
+++ b/doc/fluid/advanced_guide/dygraph_to_static/grammar_list_en.rst
+Supported Grammars
+==================
+
+The key part of ProgramTranslator is transforming Python grammar into PaddlePaddle static graph code, but there are differences between Python and PaddlePaddle static graph which cause some limitations in the code transformation.
+
+In this section we will talk about the supported and unsupported grammars, and give some suggestions for cases where a grammar is unsupported.
+
+There are several kinds of supported grammars:
+
+Control flow keywords
+---------------------
+
+Control flow means those keywords that control the execution order of program statements, for example ``if-elif-else, while`` . Conditional operations and loops are implemented as ``cond, while_loop`` APIs in PaddlePaddle static graph. If the condition of a Python dygraph control flow depends on a PaddlePaddle Tensor, ProgramTranslator will convert the control flow into equivalent PaddlePaddle control flow APIs, otherwise it will still be executed as Python control flow. The transformations of those control flow keywords are listed below:
+
+1. ``if-elif-else`` statements
+
+If the condition of ``if <condition>`` is Tensor, ProgramTranslator will turn this ``if-elif-else`` statement to equivalent PaddlePaddle static graph ``cond`` statements, otherwise the ``if-elif-else`` statement is executed as normal Python conditional statement. Note that ``cond`` API only accepts input conditional Tensor with numel equals to 1, so please use this kind of Tensor to write dygraph conditional statement, other Tensors will cause error.
+
+2. ``while`` loop
+
+If the condition of ``while`` is Tensor, ProgramTranslator will turn this ``while`` statement to equivalent PaddlePaddle static graph ``while_loop`` statements, otherwise the ``while`` statement is executed as normal Python ``while`` loop statement. Note that ``while_loop`` API only accepts input conditional Tensor with numel equals to 1, so please use this kind of Tensor to write dygraph loop condition statement, other Tensors will cause error.
+
+3. ``for`` loop
+
+3.1 ``for _ in range(__)`` loop
+
+First, ProgramTranslator transforms it into an equivalent Python ``while`` loop, then converts it from dygraph to static graph using the same logic as for the ``while`` loop.
+
+3.2 ``for _ in x`` loop
+
+If ``x`` is a Python container, iterator, or generator, it will be executed as the original Python statement. Otherwise, if ``x`` is a Tensor, ProgramTranslator will transform the loop into a PaddlePaddle static graph loop and fetch ``x[0], x[1], ...`` as the loop iteration variable in each loop iteration.
+
+3.3 ``for idx, val in enumerate(x)`` loop
+
+If ``x`` is a Python container, iterator, or generator, it will be executed as the original Python statement. Otherwise, if ``x`` is a Tensor, ProgramTranslator will transform the loop into a PaddlePaddle static graph loop. The ``idx`` will be transformed to a 1-D tensor with value ``0, 1, ...`` and the ``val`` will be transformed to ``x[0], x[1], ...`` in each loop iteration.
+
+4. ``break, continue``
+
+ProgramTranslator supports ``break, continue`` statements in loops. ProgramTranslator will add some PaddlePaddle static graph ``cond`` statements to skip execution of the corresponding part when the ``break, continue`` condition is met.
+
+5. ``return``
+
+ProgramTranslator supports ``return`` in a conditional block or loop body; it does not have to be at the end of a function. It also supports returning tuples of various lengths and Tensors of different dtypes. The implementation adds some PaddlePaddle static graph ``cond`` statements to skip parts of the code when ``return`` is triggered.
+
+
+Some Python basic operators
+---------------------------
+
+1. ``+, -, *, /, **, >, <, >= , <=, ==`` etc. 
+
+Because PaddlePaddle static graph overrides those Python basic arithmetic operators and comparison operators, ProgramTranslator can support those operators.
+
+2. ``and, or, not`` logical operators
+
+Python has ``and, or, not`` keywords as basic logical operators. ProgramTranslator will check whether the operands of the logical operators are Tensors; if they are all Tensors, ProgramTranslator replaces the ``and, or, not`` statements with the corresponding PaddlePaddle static graph logical operators and runs them.
+
+3. Type casting
+
+In dygraph mode, users can use Python type casting grammar. For instance, if ``x`` is a Tensor, ``float(x)`` casts the data type of ``x`` to float. ProgramTranslator will check whether ``x`` is a Tensor during run time, if it is, the casting sentence will be modified to PaddlePaddle static graph ``cast`` API so that its dtype can be changed in the dygraph to static transformation.
+
+Python functions
+------------------------------
+
+1. ``print``
+
+In dygraph mode, ``print(x)`` will print Tensor value if ``x`` is a Tensor. ProgramTranslator converts the built-in ``print`` to PaddlePaddle static graph ``Print`` API during dygraph to static graph transformation if the arguments are Tensors, otherwise ProgramTranslator won't convert the ``print``. 
+
+2. ``len``
+
+If ``x`` is a Tensor, ``len(x)`` can get the length at 0-dimension of ``x`` in dygraph mode. ProgramTranslator turns it to PaddlePaddle static graph ``shape`` API and returns the 0-dimension of the ``shape``. If ``x`` is a TensorArray, then ``len(x)`` will be transformed to static graph API ``control_flow.array_length`` to return the length of the TensorArray. In other cases, the ``len`` function will be executed as Python built-in ``len``.
+
+3. lambda expression
+
+ProgramTranslator supports Python lambda expression and it modifies code to return the expected result.
+
+
+4. Calling function
+
+If the transformed function calls another function, ProgramTranslator also transforms the called function recursively. The benefit is that users only need to add one decorator on the outermost function, with no need to add the decorator to each function. Note that ProgramTranslator doesn't support a function calling itself recursively; the details are in the unsupported grammars section below.
+
+
+Errors and Exceptions
+---------------------
+
+1. ``assert``
+
+If ``x`` is a Tensor, ``assert x`` statement can assert ``x`` to be ``True`` or non-zero value in dygraph mode. ProgramTranslator converts the statement into PaddlePaddle static graph ``Assert`` API to support this grammar.
+
+
+Python containers
+-----------------
+
+1. ``list``: if all elements in a list are Tensors, then ProgramTranslator converts it to TensorArray. PaddlePaddle static graph TensorArray supports append, pop, and modify, other list operations such as sort cannot be supported. When not all elements in a list are Tensors, ProgramTranslator will treat it as normal Python list.
+
+2. ``dict``: ProgramTranslator will add the Tensors in a dict into PaddlePaddle static graph ``Program``, so ``dict`` is supported by ProgramTranslator.
+
+Unsupported grammars
+--------------------
+
+1. Use the shape of output tensor of ``reshape``
+
+For example, ``x = reshape(x, shape=shape_tensor)`` , then use ``x.shape[0]`` to do other operation. Due to the difference between dygraph and static graph, it is okay in dygraph but it will fail in static graph. The reason is that APIs return computation result in dygraph mode, so ``x.shape`` has deterministic value after calling ``reshape`` . However, static graph doesn't have the value ``shape_tensor`` during building network, so PaddlePaddle doesn't know the value of ``x.shape`` after calling ``reshape``. PaddlePaddle static graph will set -1 to represent unknown shape value for each dimension of ``x.shape`` in this case, not the expected value.
+
+We suggest to set fixed shape value as much as possible, reduce the reshape operation.
+
+2. List of list of Tensor
+
+For example: ``l = [[tensor1, tensor2], [tensor3, tensor4]]``, because ProgramTranslator transformed a list whose elements are all Tensors into PaddlePaddle static graph TensorArray, but TensorArray doesn't support multi-dimensions, ProgramTranslator cannot run this case.
+
+We suggest to use 1-D list at most time, or use PaddlePaddle API ``create_array, array_read, array_write`` to control TensorArray.
+
+3. Convert Tensor to numpy array and do operation
+
+For example, the user doesn't return a Tensor in the decorated function but calls ``numpy.array(tensor)`` to convert the Tensor to a numpy array and then uses numpy APIs to compute on it. In dygraph mode this is okay because the Tensor has a value, but in static graph mode a Tensor is only a variable for building the network and doesn't contain a value until the static graph is run, so we cannot do numpy calculations on it.
+
+We suggest to use PaddlePaddle APIs to replace numpy API in this case.
+
+4. A function calls itself recursively
+
+ProgramTranslator doesn't support a function calling itself recursively. The reason is that a recursive function usually uses ``if-else`` as the condition to stop the recursion, and this stop condition will be transformed to a ``cond`` in static graph mode. Since ``cond`` just builds the network, it cannot determine how many times to recurse during the network building stage, so the function will recursively call itself and build the network until the stack overflows. For this reason, ProgramTranslator cannot support a function calling itself recursively for now.
+
+We suggest to write non-recursive function in this case.
--- a/doc/fluid/advanced_guide/dygraph_to_static/index_cn.rst
+++ b/doc/fluid/advanced_guide/dygraph_to_static/index_cn.rst
+###############
+动态图转静态图
+###############
+
+- `动态图转静态图 <program_translator_cn.html>`_ ：介绍了动态图转静态图的基本使用方法和架构原理
+
+- `支持语法列表 <grammar_list_cn.html>`_ ：介绍了动态图转静态图支持的语法以及罗列不支持的语法写法
+
+- `报错信息处理 <error_handling_cn.html>`_ ：介绍了动态图转静态图的报错信息处理方法
+
+- `调试方法 <debugging_cn.html>`_ ：介绍了动态图转静态图支持的调试方法
+
+
+..  toctree::
+    :hidden:
+
+    grammar_list_cn.rst
+    program_translator_cn.rst
+    error_handling_cn.md
+    debugging_cn.md
--- a/doc/fluid/advanced_guide/dygraph_to_static/index_en.rst
+++ b/doc/fluid/advanced_guide/dygraph_to_static/index_en.rst
+#######################
+Dygraph to Static Graph
+#######################
+
+- `Dygraph to Static Graph <program_translator_en.html>`_ ：Introduce the basic usage for transforming dygraph code into static code and the architecture of ProgramTranslator.
+
+- `Supported Grammars <grammar_list_en.html>`_ ：Introduce the grammars supported by ProgramTranslator and list unsupported grammars.
+
+- `Error Handling <error_handling_en.html>`_ ：Introduce the error handling by ProgramTranslator.
+
+- `Debugging Methods <debugging_en.html>`_ ：Introduce the debugging methods when using ProgramTranslator.
+
+..  toctree::
+    :hidden:
+
+    grammar_list_en.rst
+    program_translator_en.rst
+    error_handling_en.md
+    debugging_en.md
+
--- a/doc/fluid/advanced_guide/dygraph_to_static/program_translator_cn.rst
+++ b/doc/fluid/advanced_guide/dygraph_to_static/program_translator_cn.rst
+动态图转静态图
+================
+
+动态图有诸多优点，包括易用的接口，python风格的编程体验，友好的debug交互机制等。在动态图模式下，代码是按照我们编写的顺序依次执行。这种机制更符合Python程序员的习惯，可以很方便地将大脑中的想法快速地转化为实际代码，也更容易调试。但在性能方面，Python执行开销较大，与C++有一定差距。因此在工业界的许多部署场景中（如大型推荐系统、移动端）都倾向于直接使用C++来提速。
+
+相比动态图，静态图在部署方面更具有性能的优势。静态图程序在编译执行时，先搭建模型的神经网络结构，然后再对神经网络执行计算操作。预先搭建好的神经网络可以脱离Python依赖，在C++端被重新解析执行，而且拥有整体网络结构也能进行一些网络结构的优化。
+
+动态图代码更易编写和debug，但在部署性能上，静态图更具优势。因此我们新增了动态图转静态图的功能，支持用户依然使用动态图编写组网代码。PaddlePaddle会对用户代码进行分析，自动转换为静态图网络结构，兼顾了动态图易用性和静态图部署性能两方面优势。
+
+基本使用方法
+--------------
+
+PaddlePaddle提供了两种动态图转静态图的方式，基于动态图trace的TracedLayer与基于源代码级别转换的ProgramTranslator。
+
+1. 基于trace的TracedLayer：
+
+trace是指在模型运行时记录下其运行过哪些算子。TracedLayer就是基于这种技术，在一次执行动态图的过程中，记录所有运行的算子，并构建和保存静态图模型。一个使用例子如下：
+
+我们先定义一个简单的Fully Connected网络：
+
+.. code-block:: python
+
+    import numpy as np
+    import paddle
+
+    class SimpleFcLayer(paddle.nn.Layer):
+        def __init__(self, feature_size, batch_size, fc_size):
+            super(SimpleFcLayer, self).__init__()
+            self._linear = paddle.nn.Linear(feature_size, fc_size)
+            self._offset = paddle.to_tensor(
+                np.random.random((batch_size, fc_size)).astype('float32'))
+
+        def forward(self, x):
+            fc = self._linear(x)
+            return fc + self._offset
+
+
+接下来是TracedLayer如何存储模型：
+
+.. code-block:: python
+
+    import paddle
+    from paddle.jit import TracedLayer
+
+    paddle.disable_static()
+
+    fc_layer = SimpleFcLayer(3, 4, 2)
+    in_np = np.random.random([3, 4]).astype('float32')
+    # 将numpy的ndarray类型的数据转换为Tensor类型
+    input_var = paddle.to_tensor(in_np)
+    # 通过 TracedLayer.trace 接口将命令式模型转换为声明式模型
+    out_dygraph, static_layer = TracedLayer.trace(fc_layer, inputs=[input_var])
+    save_dirname = './saved_infer_model'
+    # 将转换后的模型保存
+    static_layer.save_inference_model(save_dirname, feed=[0], fetch=[0])
+
+
+载入的模型可以使用静态图方式运行
+
+.. code-block:: python
+
+    place = paddle.CPUPlace()
+    exe = paddle.Executor(place)
+    program, feed_vars, fetch_vars = paddle.io.load_inference_model(save_dirname, exe)
+    fetch, = exe.run(program, feed={feed_vars[0]: in_np}, fetch_list=fetch_vars)
+
+
+但是也正如我们阐述的原理，trace只是记录了一次执行涉及算子，若在用户的模型代码中，包含了依赖数据条件（包括输入的值或者shape）的控制流分支，即根据数据条件触发运行不同的算子，则TracedLayer无法正常工作。比如下面
+
+
+.. code-block:: python
+
+    import paddle
+
+    def func(input_var):
+        # if判断与输入input_var的shape有关
+        if input_var.shape[0] > 1:
+            return paddle.cast(input_var, "float64")
+        else:
+            return paddle.cast(input_var, "int64")
+
+    paddle.disable_static()
+    in_np = np.array([-2]).astype('int')
+    input_var = paddle.to_tensor(in_np)
+    out = func(input_var)
+
+
+上例如果使用TracedLayer.trace(func, inputs=[input_var])，由于trace只能记录if-else其中跑的一次算子，模型就无法按用户想要的根据input_var的形状进行if-else控制流保存。类似的控制流还有while/for循环的情况。
+
+2. 基于源代码转写的ProgramTranslator
+
+对于依赖数据的控制流，我们使用基于源代码转写的ProgramTranslator来进行动态图转静态图。其基本原理是通过分析Python代码来将动态图代码转写为静态图代码，并在底层自动帮用户使用执行器运行。其基本使用方法十分简便，只需要在要转化的函数（该函数也可以是用户自定义动态图Layer的forward函数）前添加一个装饰器 ``@paddle.jit.to_static`` ，上面的例子转化如下，并且可以依旧使用该函数运行得到结果：
+
+.. code-block:: python
+
+    import paddle
+
+    @paddle.jit.to_static
+    def func(input_var):
+        # if判断与输入input_var的shape有关
+        if input_var.shape[0] > 1:
+            out = paddle.cast(input_var, "float64")
+        else:
+            out = paddle.cast(input_var, "int64")
+
+    paddle.disable_static()
+    in_np = np.array([-2]).astype('int')
+    input_var = paddle.to_tensor(in_np)
+    func(input_var)
+
+
+若要存储转化后的静态图模型，可以调用 ``paddle.jit.save`` ，我们再以SimpleFcLayer为例，需要在SimpleFcLayer的forward函数添加装饰器：
+
+.. code-block:: python
+
+    import numpy as np
+    import paddle
+
+    class SimpleFcLayer(paddle.nn.Layer):
+        def __init__(self, feature_size, batch_size, fc_size):
+            super(SimpleFcLayer, self).__init__()
+            self._linear = paddle.nn.Linear(feature_size, fc_size)
+            self._offset = paddle.to_tensor(
+                np.random.random((batch_size, fc_size)).astype('float32'))
+
+        @paddle.jit.to_static
+        def forward(self, x):
+            fc = self._linear(x)
+            return fc + self._offset
+
+
+存储该模型可以使用paddle.jit.save接口：
+
+.. code-block:: python
+
+    import paddle
+
+    paddle.disable_static()
+
+    fc_layer = SimpleFcLayer(3, 4, 2)
+    in_np = np.random.random([3, 4]).astype('float32')
+    input_var = paddle.to_tensor(in_np)
+    out = fc_layer(input_var)
+
+    paddle.jit.save(fc_layer, "./fc_layer_dy2stat", input_spec=[input_var])
+
+内部架构原理
+--------------
+
+TracedLayer的原理就是trace，相对简单，因此我们在这里不展开描述。本节将主要阐述ProgramTranslator基于源代码将动态图代码转化为静态图代码。
+
+
+转化过程发生在用户开始调用被装饰的函数，转换过程在装饰器中实现。我们将内部涉及的过程分为以下几步：
+
+1. 函数与缓存
+
+动态图转静态图的主体是函数（Function）。对于函数内包含的PaddlePaddle接口，如果是仅计算相关算子代码语句，那么因为PaddlePaddle动态图和静态图接口一致，我们不需要额外转换这些代码为静态图代码。但是对于动态图，此类代码接口是直接运行计算和返回结果，而对于静态图此类代码接口其实是组网。那么如果被转化的函数被调用多次，动态图转静态图后会多次组网添加对应算子，这显然会导致问题。为了解决这个问题以及为了加速动转静转化过程，我们维护了被装饰器装饰的函数（Function）与其输入形状（shape），数据类型（dtype）映射到被转化后组网的Program的缓存（Cache）。当要被转化的函数命中缓存，我们直接用对应存储的Program运行静态图得到结果，否则我们才进行语句转化，并且转化成功后的Program存储进缓存。
+
+2. 动态图源码转AST（抽象语法树）
+
+动态图转静态图的最核心部分类似一个编译器，解析动态图代码语句为AST，再对应AST进行改写，最后反转回成静态图代码。从函数转化为代码字符串可以使用Python的inspect.getsource。从字符串Python提供了自带的 `ast <https://docs.python.org/3/library/ast.html>`_ 库来解析字符串为AST，但是由于Python2，Python3的语法略有不同，为了避免我们需要额外处理这些Python2，Python3的不同情况，我们使用了统一Python2，Python3的开源AST处理 `gast库 <https://github.com/serge-sans-paille/gast>`_ 。这些接口使得函数转化为AST没有本质上的困难。
+
+3. AST改写和静态图源码转换
+
+这部分为动转静最核心的部分，我们对支持的各种语法进行ast转写。其中最重要的Python控制流，if-else，while，for循环被分别分析转化为PaddlePaddle静态图接口cond，while_loop等接口实现。我们对想转化的每一种主要语法创建一个Transformer（这里的Transformer是Python ast转写的概念，而不是自然语言处理NLP领域的Transformer），每个Transformer扫一遍AST并进行对应的改写。最后被转化完成的AST我们使用gast提供的接口转回成源码。
+
+4. 静态图源码作为动态图一部分运行的技术
+
+为了动静转化更加易用和被转化的代码能在动态图中复用，我们在拥有源码后运行生成Program，并将这个Program作为一个大op，包装成动态图的一个op，这样既能把用户的代码转为静态图提速或者保存部署，另一方面如果用户想在Python层使用生成的静态图代码作为动态图的一部分继续训练或者别的动态图运算也是可以直接使用。
+
+5. 易用性与Debug功能在动转静过程的实现
+
+正如AST转写类似编译器，而一般编译器都会提供debug断点，报错，输出一些中间代码等功能。我们在进行动转静时，万一用户的动态图代码出错，或者用户想断点调试，或者用户想看看被转化后的静态图代码是否符合其预期，我们也希望能够像编译器一样提供这些易用性功能，使得动转静兼顾性能和部署同时还具有易用性。我们这里将列出这些功能的实现方式
+
+A. 报错对应到动态图代码行。由于被转化后的静态图代码和原动态图代码不同，Python运行出错时会报静态图的错误，因此我们在每一次AST转写时添加AST节点对应的原动态图代码行等信息，在Python报错栈中将静态图的报错转化成对应的动态图源码报错
+
+B. 设置断点功能。我们保留了被转化后代码中的pdb.set_trace()，用户可以使用这种方式进行断点调试。
+
+C. 查看最后转化的静态图代码。我们输出为一个StaticLayer class，这个StaticLayer可以直接被调用，但是也存储转化后的代码，可以调用StaticLayer.code来获得转化后的代码。
+
+D. 输出中间转化状态代码，甚至不同语法Transformer转化的代码，比如经过for循环转化后代码是什么样的。我们开放接口设定了log level来让用户可以打印中间状态转化的代码。
+
+
--- a/doc/fluid/advanced_guide/dygraph_to_static/program_translator_en.rst
+++ b/doc/fluid/advanced_guide/dygraph_to_static/program_translator_en.rst
+Dygraph to Static Graph
+=======================
+
+The imperative-style coding of PaddlePaddle has the advantages of flexibility, Pythonic coding, and an easy-to-debug interface. In dygraph mode, code immediately executes kernels and gets numerical results, which allows users to enjoy traditional Pythonic code order. Therefore it is efficient to transform ideas into real code and simple to debug. However, Python code is usually slower than C++, thus lots of industrial systems (such as large recommendation systems and mobile devices) prefer to deploy with a C++ implementation.
+
+Static graph is better at speed and portability. Static graph builds the network structure during compile time and then does computation. The built network intermediate representation can be executed in C++ and gets rid of the Python dependency.
+
+Since dygraph has usability and debugging benefits while static graph yields performance and deployment advantages, we added functionality to convert dygraph to static graph. Users use imperative mode to write dygraph code and PaddlePaddle will analyze the Python syntax and turn it into the network structure of static graph mode. Our approach retains both the usability of dygraph and the portability of static graph.
+
+Basic Usage
+--------------
+
+PaddlePaddle has two ways to transform dygraph to static graph. TracedLayer extracts computation graph through tracing and ProgramTranslator gets computation graph through source code transformation.
+
+
+1. TracedLayer：
+
+Tracing means recording the operators when running a model. TracedLayer is based on this technique. It runs the dygraph program once and records all operators, then constructs a static graph model and saves it. Now take a glance at a usage example:
+
+Define a simple fully connected network:
+
+.. code-block:: python
+
+    import numpy as np
+    import paddle
+
+    class SimpleFcLayer(paddle.nn.Layer):
+        def __init__(self, feature_size, batch_size, fc_size):
+            super(SimpleFcLayer, self).__init__()
+            self._linear = paddle.nn.Linear(feature_size, fc_size)
+            self._offset = paddle.to_tensor(
+                np.random.random((batch_size, fc_size)).astype('float32'))
+
+        def forward(self, x):
+            fc = self._linear(x)
+            return fc + self._offset
+
+Save model by TracedLayer:
+
+.. code-block:: python
+
+    import paddle
+    from paddle.jit import TracedLayer
+
+    paddle.disable_static()
+
+    fc_layer = SimpleFcLayer(3, 4, 2)
+    in_np = np.random.random([3, 4]).astype('float32')
+    # Turn numpy ndarray into Tensor
+    input_var = paddle.to_tensor(in_np)
+    # Transforming imperative mode into declarative mode by TracedLayer.trace
+    out_dygraph, static_layer = TracedLayer.trace(fc_layer, inputs=[input_var])
+    save_dirname = './saved_infer_model'
+    # Save the transformed model
+    static_layer.save_inference_model(save_dirname, feed=[0], fetch=[0])
+
+Load model and run it in static graph mode:
+
+.. code-block:: python
+
+    place = paddle.CPUPlace()
+    exe = paddle.Executor(place)
+    program, feed_vars, fetch_vars = paddle.io.load_inference_model(save_dirname, exe)
+    fetch, = exe.run(program, feed={feed_vars[0]: in_np}, fetch_list=fetch_vars)
+
+However, as tracing only records operators once, if user's code contains Tensor-dependent (including Tensor value or Tensor shape) control flow, that is the Tensor can cause different operators being executed, then TracedLayer cannot handle this case. For instance:
+
+.. code-block:: python
+
+    import paddle
+
+    def func(input_var):
+        # if condition depends on the shape of input_var
+        if input_var.shape[0] > 1:
+            return paddle.cast(input_var, "float64")
+        else:
+            return paddle.cast(input_var, "int64")
+
+    paddle.disable_static()
+    in_np = np.array([-2]).astype('int')
+    input_var = paddle.to_tensor(in_np)
+    out = func(input_var)
+
+If we apply TracedLayer.trace(func, inputs=[input_var]) on the above example, tracing can record the operators in only one branch of the if-else, so the model cannot be saved as the user originally intended. The same situation applies to while/for loops.
+
+2. ProgramTranslator
+
+For the Tensor-dependent control flow, we use source-code-translate based ProgramTranslator to convert dygraph into static graph. The basic idea is analyzing Python source code and turning into static graph code, then run the static graph code using Executor. The basic usage of ProgramTranslator is simple, put a decorator ``@paddle.jit.to_static`` before the definition of the function to transform (the function can also be a method of a class, e.g., the ``forward`` function of user-defined imperative Layer). Above Tensor-dependent example can be transformed correctly by ProgramTranslator as below:
+
+.. code-block:: python
+
+    import paddle
+
+    @paddle.jit.to_static
+    def func(input_var):
+        # if condition depends on the shape of input_var
+        if input_var.shape[0] > 1:
+            out = paddle.cast(input_var, "float64")
+        else:
+            out = paddle.cast(input_var, "int64")
+
+    paddle.disable_static()
+    in_np = np.array([-2]).astype('int')
+    input_var = paddle.to_tensor(in_np)
+    func(input_var)
+
+To save the transformed model, we can call ``paddle.jit.save`` . Let's take ``SimpleFcLayer`` as an example again, we put decorator at the ``forward`` method of ``SimpleFcLayer`` :
+
+.. code-block:: python
+
+    import numpy as np
+    import paddle
+
+    class SimpleFcLayer(paddle.nn.Layer):
+        def __init__(self, feature_size, batch_size, fc_size):
+            super(SimpleFcLayer, self).__init__()
+            self._linear = paddle.nn.Linear(feature_size, fc_size)
+            self._offset = paddle.to_tensor(
+                np.random.random((batch_size, fc_size)).astype('float32'))
+
+        @paddle.jit.to_static
+        def forward(self, x):
+            fc = self._linear(x)
+            return fc + self._offset
+
+
+Calling ``paddle.jit.save`` to save above model:
+
+.. code-block:: python
+
+    import paddle
+
+    paddle.disable_static()
+
+    fc_layer = SimpleFcLayer(3, 4, 2)
+    in_np = np.random.random([3, 4]).astype('float32')
+    input_var = paddle.to_tensor(in_np)
+    out = fc_layer(input_var)
+
+    paddle.jit.save(fc_layer, "./fc_layer_dy2stat")
+
+
+Architecture
+--------------
+
+The basic idea of TracedLayer is tracing, which is relatively simple, so we won't expand on it here. This section will talk about the source code transformation of ProgramTranslator.
+
+The transformation is implemented in the decorator so transformation happens when user calls the decorated function, the procedure includes these steps:
+
+1. Function and cache.
+
+The entity for transforming dygraph to static graph is the decorated function. For the PaddlePaddle APIs in the function, since they are the same code under dygraph mode and static graph mode, we don't have to transform that code. However, those APIs perform computation in dygraph mode while they build the network in static graph mode. If the transformed function is called multiple times, those APIs will build the network multiple times in static graph, which can cause problems. To solve this as well as speed up the transformation, we maintain a cache that maps from the function, input shapes, and input data types to the Program built by the transformed function. If the function hits the cache, we run the stored Program in static graph mode to get the result; otherwise we do the code transformation on the function and store the transformed Program into the cache.
+
+2. From dygraph source code to AST (Abstract Syntax Tree)
+
+The core of transforming dygraph to static graph is similar to a compiler: we parse the dygraph code into an AST, modify the AST, then turn it back into static graph code. We use Python ``inspect.getsource`` to get the source code string of the function. Python provides the ``ast`` library to parse code strings into an AST, but Python2 and Python3 have slight grammar differences. To avoid the work of handling the different grammars, we use an open source AST library `gast <https://github.com/serge-sans-paille/gast>`_ that provides a compatible AST across various Python versions. With these libraries there is no essential difficulty in turning a function into an AST.
+
+3. Transform AST and turn it to static graph code
+
+This part is the key part of ProgramTranslator: we modify the AST for the supported grammars. The important Python control flows, such as ``if-elif-else, while, for`` loops, are converted to PaddlePaddle static graph APIs ``cond, while_loop`` and so on. We created a Transformer (an AST-to-AST Transformer in Python, not the Transformer in Natural Language Processing) to transform each grammar. Every Transformer scans the AST and modifies it. Lastly, we turn the AST back into a source code string with the ``gast`` library.
+
+4. Running static graph code as part of dygraph
+
+In order to increase usability and re-use the transformed static graph code in dygraph, we wrap the generated Program as a dygraph op; the op can run the forward and backward computation of the transformed Program. Then we can not only speed up dygraph code or save it for deployment, but also enable users to run part of their dygraph code in static graph mode so that they can continue training or other dygraph computation in their dygraph code.
+
+5. Error handling and Debug
+
+A compiler usually supports debugging functionality such as breakpoints, throwing exceptions, and printing intermediate code. ProgramTranslator is similar to a compiler: users may want to set breakpoints for debugging, or check whether the transformed static graph code is as expected. So we also implemented error handling and debugging functionality. Here we list those functions and their implementation.
+
+A. Report errors/exceptions on the dygraph code line. Because the transformed static graph code is different from the original dygraph code, when Python executes the static graph code, exceptions will be reported at the static graph code. To locate the corresponding dygraph code, we attach some information such as line numbers to AST nodes when we transform the AST; then we can rewrite the static graph exception into the corresponding dygraph code exception.
+
+B. We support ``pdb.set_trace()`` when running ProgramTranslator, user can add this line to set breakpoints.
+
+C. Check the transformed static graph code. Our transformed output is a Python class named ``StaticLayer``, this class can be called, but it also stores the transformed code string. Users could call ``StaticLayer.code`` to get the converted code.
+
+D. Print intermediate transformed code, such as the code produced after transforming a ``for`` loop. We provide APIs to set the log level so that users can check the intermediate code.
+
+
--- a/doc/fluid/advanced_guide/evaluation_debugging/debug/visualdl.md
+++ b/doc/fluid/advanced_guide/evaluation_debugging/debug/visualdl.md
 # VisualDL 工具简介
+
+
 <p align="center">
-  <img src="https://raw.githubusercontent.com/PaddlePaddle/VisualDL/develop/docs/images/vs-logo.png" width="60%" />
+  <img src="http://visualdl.bj.bcebos.com/images/vdl-logo.png" width="70%"/>
 </p>

-## 介绍
-VisualDL是一个面向深度学习任务设计的可视化工具，包含了scalar、参数分布、模型结构、图像可视化等功能，项目正处于高速迭代中，新的组件会不断加入。

-目前大多数DNN平台均使用Python作为配置语言，VisualDL原生支持python的使用，
-通过在模型的Python配置中添加几行，便可以为训练过程提供丰富的可视化支持。

-除了Python SDK之外，VisualDL底层采用C++编写，其暴露的C++ SDK也可以集成到其他平台中，
-实现原生的性能和定制效果。
+VisualDL是飞桨可视化分析工具，以丰富的图表呈现训练参数变化趋势、模型结构、数据样本、直方图、PR曲线及高维数据分布。可帮助用户更清晰直观地理解深度学习模型训练过程及模型结构，进而实现高效的模型优化。

-## 组件
-VisualDL 目前支持以下组件：
+具体功能使用方式请参见**VisualDL使用指南**。项目正处于高速迭代中，敬请期待新组件的加入。

- scalar
- histogram
- image
- audio
- graph
- high dimensional
+VisualDL支持浏览器种类：Chrome（81和83）、Safari 13、FireFox（77和78）、Edge（Chromium版）。

-### Scalar
-可以用于展示训练测试的误差趋势
+VisualDL原生支持python的使用， 通过在模型的Python配置中添加几行代码，便可为训练过程提供丰富的可视化支持。

-<p align="center">
-<img src="https://raw.githubusercontent.com/daming-lu/large_files/master/loss_scalar.gif" width="60%"/>
-</p>

-### Histogram

-用于可视化任何tensor中元素分布的变化趋势
+## 目录

-<p align="center">
-<img src="https://raw.githubusercontent.com/daming-lu/large_files/master/histogram.gif" width="60%"/>
-</p>
+* [核心亮点](#核心亮点)
+* [安装方式](#安装方式)
+* [使用方式](#使用方式)
+* [可视化功能概览](#可视化功能概览)
+* [开源贡献](#开源贡献)
+* [更多细节](#更多细节)
+* [技术交流](#技术交流)

-### Image
-可以用于可视化任何tensor，或模型生成的图片

-<p align="center">
-<img src="https://raw.githubusercontent.com/daming-lu/large_files/master/loss_image.gif" width="60%"/>
-</p>

-### Audio
-可用于播放输入或生成的音频样本
+## 核心亮点

-### Graph
+### 简单易用

-VisualDL的graph支持paddle program的展示，同时兼容 ONNX(Open Neural Network Exchange)[https://github.com/onnx/onnx]，通过与 python SDK的结合，VisualDL可以兼容包括 PaddlePaddle, pytorch, mxnet在内的大部分主流DNN平台。
+API设计简洁易懂，使用简单。模型结构一键实现可视化。

-<p align="center">
-  <img src="https://raw.githubusercontent.com/PaddlePaddle/VisualDL/develop/docs/images/graph_demo.gif" width="60%" />
-</p>
+### 功能丰富

-要进行paddle模型的展示，需要进行以下两步操作：
+功能覆盖标量、数据样本、图结构、直方图、PR曲线及数据降维可视化。

-1. 在paddle代码中，调用`fluid.io.save_inference_model()`接口保存模型
-2. 在命令行界面，使用`visualdl --model_pb [paddle_model_dir]` 加载paddle模型
+### 高兼容性

+全面支持Paddle、ONNX、Caffe等市面主流模型结构可视化，广泛支持各类用户进行可视化分析。

-### High Dimensional
-用高维度数据映射在2D/3D来可视化嵌入
+### 全面支持
+
+与飞桨服务平台及工具组件全面打通，为您在飞桨生态系统中提供最佳使用体验。

-<p align="center">
-<img src="https://raw.githubusercontent.com/PaddlePaddle/VisualDL/develop/docs/getting_started/high_dimensional_3d.png" width="60%"/>
-</p>

-## 快速尝试
-请使用下面的命令，来快速测试 VisualDL。

+## 安装方式
+
+### 使用pip安装
+
+```shell
+pip install --upgrade --pre visualdl
 ```
-# 安装，建議是在虚拟环境或anaconda下。
-pip install --upgrade visualdl

-# 运行一个例子，vdl_create_scratch_log 将创建测试日志
-vdl_create_scratch_log
-visualdl --logdir=scratch_log --port=8080
+### 使用代码安装

-# 访问 http://127.0.0.1:8080
 ```
+git clone https://github.com/PaddlePaddle/VisualDL.git
+cd VisualDL

-如果出现`TypeError: __init__() got an unexpected keyword argument 'file'`, 是因为protobuf不是3.5以上，运行`pip install --upgrade protobuf`就能解决。
+python setup.py bdist_wheel
+pip install --upgrade dist/visualdl-*.whl
+```

-如果以上步骤还有出现其他问题，很可能是因为python或pip不同版本或不同位置所致，以下安装方法能解决。
+需要注意，Python官方自2020年1月1日起不再维护Python2，为了保障代码可用性，VisualDL现仅支持Python3。

-## 使用 virtualenv 安装
+## 使用方式

-[Virtualenv](https://virtualenv.pypa.io/en/stable/) 能创建独立Python环境，也能确保Python和pip的相对位置正确。
+VisualDL将训练过程中的数据、参数等信息储存至日志文件中后，启动面板即可查看可视化结果。

-在macOS上，安装pip和virtualenv如下：
-```
-sudo easy_install pip
-pip install --upgrade virtualenv
-```
+### 1. 记录日志

-在Linux上，安装pip和virtualenv如下:
-```
-sudo apt-get install python3-pip python3-dev python-virtualenv
+VisualDL的后端提供了Python SDK，可通过LogWriter定制一个日志记录器，接口如下：
+
+```python
+class LogWriter(logdir=None,
+                comment='',
+                max_queue=10,
+                flush_secs=120,
+                filename_suffix='',
+                write_to_disk=True,
+                **kwargs)
 ```

-然后创建一个虚拟环境：
+#### 接口参数
+
+| 参数            | 格式    | 含义                                                         |
+| --------------- | ------- | ------------------------------------------------------------ |
+| logdir          | string  | 日志文件所在的路径，VisualDL将在此路径下建立日志文件并进行记录，如果不填则默认为`runs/${CURRENT_TIME}` |
+| comment         | string  | 为日志文件夹名添加后缀，如果指定了logdir则此项无效           |
+| max_queue       | int     | 日志记录消息队列的最大容量，达到此容量则立即写入到日志文件   |
+| flush_secs      | int     | 日志记录消息队列的最大缓存时间，达到此时间则立即写入到日志文件 |
+| filename_suffix | string  | 为默认的日志文件名添加后缀                                   |
+| write_to_disk   | boolean | 是否写入到磁盘                                               |
+
+#### 示例
+
+设置日志文件并记录标量数据：
+
+```python
+from visualdl import LogWriter
+
+# 在`./log/scalar_test/train`路径下建立日志文件
+with LogWriter(logdir="./log/scalar_test/train") as writer:
+    # 使用scalar组件记录一个标量数据
+    writer.add_scalar(tag="acc", step=1, value=0.5678)
+    writer.add_scalar(tag="acc", step=2, value=0.6878)
+    writer.add_scalar(tag="acc", step=3, value=0.9878)
 ```
-virtualenv ~/vdl  # for Python2.7
-virtualenv -p python3 ~/vdl for Python 3.x
+
+### 2. 启动面板
+
+在上述示例中，日志已记录三组标量数据，现可启动VisualDL面板查看日志的可视化结果，共有两种启动方式：
+
+#### 在命令行启动
+
+使用命令行启动VisualDL面板，命令格式如下：
+
+```shell
+visualdl --logdir <dir_1, dir_2, ... , dir_n> --host <host> --port <port> --cache-timeout <cache_timeout> --language <language> --public-path <public_path> --api-only
 ```

-```~/vdl``` 是你的Virtualenv目录, 你也可以选择任一目录。
+参数详情：
+
+| 参数            | 意义                                                         |
+| --------------- | ------------------------------------------------------------ |
+| --logdir        | 设定日志所在目录，可以指定多个目录，VisualDL将遍历并且迭代寻找指定目录的子目录，将所有实验结果进行可视化 |
+| --model         | 设定模型文件路径(非文件夹路径)，VisualDL将在此路径指定的模型文件进行可视化，目前可支持PaddlePaddle、ONNX、Keras、Core ML、Caffe等多种模型结构，详情可查看[graph支持模型种类](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/components/README.md#Graph--网络结构组件) |
+| --host          | 设定IP，默认为`127.0.0.1`                                    |
+| --port          | 设定端口，默认为`8040`                                       |
+| --cache-timeout | 后端缓存时间，在缓存时间内前端多次请求同一url，返回的数据从缓存中获取，默认为20秒 |
+| --language      | VisualDL面板语言，可指定为'EN'或'ZH'，默认为浏览器使用语言   |
+| --public-path   | VisualDL面板URL路径，默认是'/app'，即访问地址为'http://&lt;host&gt;:&lt;port&gt;/app' |
+| --api-only      | 是否只提供API，如果设置此参数，则VisualDL不提供页面展示，只提供API服务，此时API地址为'http://&lt;host&gt;:&lt;port&gt;/&lt;public_path&gt;/api'；若没有设置public_path参数，则默认为'http://&lt;host&gt;:&lt;port&gt;/api' |
+
+针对上一步生成的日志，启动命令为：

-激活虚拟环境如下：
 ```
-source ~/vdl/bin/activate
+visualdl --logdir ./log
 ```

-现在再安装 VisualDL 和运行范例：
+#### 在Python脚本中启动

+支持在Python脚本中启动VisualDL面板，接口如下：
+
+```python
+visualdl.server.app.run(logdir,
+                        host="127.0.0.1",
+                        port=8040,
+                        cache_timeout=20,
+                        language=None,
+                        public_path=None,
+                        api_only=False,
+                        open_browser=False)
 ```
-pip install --upgrade visualdl

-# 运行一个例子，vdl_create_scratch_log 将创建测试日志
-vdl_create_scratch_log
-visualdl --logdir=scratch_log --port=8080
+请注意：除`logdir`外，其他参数均为不定参数，传递时请指明参数名。
+
+接口参数具体如下：
+
+| 参数          | 格式                                             | 含义                                                         |
+| ------------- | ------------------------------------------------ | ------------------------------------------------------------ |
+| logdir        | string或list[string_1, string_2, ... , string_n] | 日志文件所在的路径，VisualDL将在此路径下递归搜索日志文件并进行可视化，可指定单个或多个路径 |
+| model         | string                                           | 模型文件路径(非文件夹路径)，VisualDL将在此路径指定的模型文件进行可视化 |
+| host          | string                                           | 指定启动服务的ip，默认为`127.0.0.1`                          |
+| port          | int                                              | 启动服务端口，默认为`8040`                                   |
+| cache_timeout | int                                              | 后端缓存时间，在缓存时间内前端多次请求同一url，返回的数据从缓存中获取，默认为20秒 |
+| language      | string                                           | VisualDL面板语言，可指定为'en'或'zh'，默认为浏览器使用语言   |
+| public_path   | string                                           | VisualDL面板URL路径，默认是'/app'，即访问地址为'http://&lt;host&gt;:&lt;port&gt;/app' |
+| api_only      | boolean                                          | 是否只提供API，如果设置此参数，则VisualDL不提供页面展示，只提供API服务，此时API地址为'http://&lt;host&gt;:&lt;port&gt;/&lt;public_path&gt;/api'；若没有设置public_path参数，则默认为'http://&lt;host&gt;:&lt;port&gt;/api' |
+| open_browser  | boolean                                          | 是否打开浏览器，设置为True则在启动后自动打开浏览器并访问VisualDL面板，若设置api_only，则忽略此参数 |
+
+针对上一步生成的日志，我们的启动脚本为：

-# 访问 http://127.0.0.1:8080
+```python
+from visualdl.server import app
+
+app.run(logdir="./log")
 ```

-如果在虚拟环境下仍然遇到安装问题，请尝试以下方法。
+在使用任意一种方式启动VisualDL面板后，打开浏览器访问VisualDL面板，即可查看日志的可视化结果，如图：

+<p align="center">
+  <img src="https://user-images.githubusercontent.com/48054808/82786044-67ae9880-9e96-11ea-8a2b-3a0951a6ec19.png" width="60%"/>
+</p>

-## 使用 Anaconda 安装

-Anaconda是一个用于科学计算的Python发行版，提供了包管理与环境管理的功能，可以很方便地解决多版本python并存、切换以及各种第三方包安装问题。

-请根据[Anaconda下载网站](https://www.anaconda.com/download) 的指示去下载和安装Anaconda.
-下载Python 3.6版本的command-Line installer.
+## 可视化功能概览

-创建conda环境名字为```vdl```或任何名字:
-```
-conda create -n vdl pip python=2.7 # or python=3.3, etc.
-```
+### Scalar

-激活conda环境如下:
-```
-source activate vdl
-```
+以图表形式实时展示训练过程参数，如loss、accuracy。让用户通过观察单组或多组训练参数变化，了解训练过程，加速模型调优。具有两大特点：

-现在再安装 VisualDL 和运行范例：
+#### 动态展示

-```
-pip install --upgrade visualdl
+在启动VisualDL后，LogReader将不断增量的读取日志中数据并供前端调用展示，因此能够在训练中同步观测指标变化，如下图：

-# 运行一个例子，vdl_create_scratch_log 将创建测试日志
-vdl_create_scratch_log
-visualdl --logdir=scratch_log --port=8080
+<p align="center">
+  <img src="http://visualdl.bj.bcebos.com/images/dynamic_display.gif" width="60%"/>
+</p>

-# 访问 http://127.0.0.1:8080
-```

-如果仍然遇到安装问题，请尝试以下用源代码安装方法。

-### 使用代码安装
-```
-#建議是在虚拟环境或anaconda下。
-git clone https://github.com/PaddlePaddle/VisualDL.git
-cd VisualDL
+#### 多实验对比

-python setup.py bdist_wheel
-pip install --upgrade dist/visualdl-*.whl
-```
+只需在启动VisualDL时将每个实验日志所在路径同时传入即可，每个实验中相同tag的指标将绘制在一张图中同步呈现，如下图：

-如果打包和安装遇到其他问题，不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/develop/how_to_dev_frontend_cn.md)
+<p align="center">
+  <img src="http://visualdl.bj.bcebos.com/images/multi_experiments.gif" width="100%"/>
+</p>


-## SDK
-VisualDL 同时提供了python SDK 和 C++ SDK 来实现不同方式的使用。

-### Python SDK
-VisualDL 现在支持 Python 2和 Python 3。
+### Image

-以最简单的Scalar组件为例，尝试创建一个scalar组件并插入多个时间步的数据：
+实时展示训练过程中的图像数据，用于观察不同训练阶段的图像变化，进而深入了解训练过程及效果。

-```python
-import random
-from visualdl import LogWriter
+<p align="center">
+<img src="http://visualdl.bj.bcebos.com/images/image-eye.gif" width="60%"/>
+</p>

-logdir = "./tmp"
-logger = LogWriter(logdir, sync_cycle=10000)

-# mark the components with 'train' label.
-with logger.mode("train"):
-    # create a scalar component called 'scalars/scalar0'
-    scalar0 = logger.scalar("scalars/scalar0")

-# add some records during DL model running.
-for step in range(100):
-    scalar0.add_record(step, random.random())
-```
+### Audio

-### C++ SDK
-上面 Python SDK 中代码完全一致的C++ SDK用法如下
-```c++
-#include <cstdlib>
-#include <string>
-#include "visualdl/sdk.h"
+实时查看训练过程中的音频数据，监控语音识别与合成等任务的训练过程。

-namespace vs = visualdl;
-namespace cp = visualdl::components;
+<p align="center">
+<img src="https://user-images.githubusercontent.com/48054808/89017647-38605000-d34d-11ea-9d75-7d10b9854c36.gif" width="100%"/>
+</p>

-int main() {
-  const std::string dir = "./tmp";
-  vs::LogWriter logger(dir, 10000);

-  logger.SetMode("train");
-  auto tablet = logger.AddTablet("scalars/scalar0");

-  cp::Scalar<float> scalar0(tablet);
+### Graph

-  for (int step = 0; step < 1000; step++) {
-    float v = (float)std::rand() / RAND_MAX;
-    scalar0.AddRecord(step, v);
-  }
+一键可视化模型的网络结构。可查看模型属性、节点信息、节点输入输出等，并支持节点搜索，辅助用户快速分析模型结构与了解数据流向。

-  return 0;
-}
-```
-## 启动Board
-当训练过程中已经产生了日志数据，就可以启动board进行实时预览可视化信息
+<p align="center">
+<img src="https://user-images.githubusercontent.com/48054808/84483052-5acdd980-accb-11ea-8519-1608da7ee698.png" width="100%"/>
+</p>

-```
-visualdl --logdir <some log dir>
-```

-board 还支持一下参数来实现远程的访问：

- `--host` 设定IP
- `--port` 设定端口
- `-m / --model_pb` 指定 ONNX 格式的模型文件
+### Histogram
+
+以直方图形式展示Tensor（weight、bias、gradient等）数据在训练过程中的变化趋势。深入了解模型各层效果，帮助开发者精准调整模型结构。
+
+- Offset模式
+
+<p align="center">
+<img src="https://user-images.githubusercontent.com/48054808/86551031-86647c80-bf76-11ea-8ec2-8c86826c8137.png" width="100%"/>
+</p>

-### 贡献

-VisualDL 是由 [PaddlePaddle](http://www.paddlepaddle.org/) 和
-[ECharts](http://echarts.baidu.com/) 合作推出的开源项目。我们欢迎所有人使用，提意见以及贡献代码。
+
+- Overlay模式
+
+<p align="center">
+<img src="https://user-images.githubusercontent.com/48054808/86551033-882e4000-bf76-11ea-8e6a-af954c662ced.png" width="100%"/>
+</p>
+
+
+
+### PR Curve
+
+精度-召回率曲线，帮助开发者权衡模型精度和召回率之间的平衡，设定最佳阈值。
+
+<p align="center">
+<img src="https://user-images.githubusercontent.com/48054808/86738774-ee46c000-c067-11ea-90d2-a98aac445cca.png" width="100%"/>
+</p>
+
+
+### High Dimensional
+
+将高维数据进行降维展示，目前支持T-SNE、PCA两种降维方式，用于深入分析高维数据间的关系，方便用户根据数据特征进行算法优化。
+
+<p align="center">
+<img src="http://visualdl.bj.bcebos.com/images/high_dimensional_test.png" width="100%"/>
+</p>
+
+## 开源贡献
+
+VisualDL 是由 [PaddlePaddle](https://www.paddlepaddle.org/) 和 [ECharts](https://echarts.apache.org/) 合作推出的开源项目。
+Graph 相关功能由 [Netron](https://github.com/lutzroeder/netron) 提供技术支持。
+欢迎所有人使用，提意见以及贡献代码。
+

 ## 更多细节

-想了解更多关于VisualDL的使用介绍，请查看[文档](https://github.com/PaddlePaddle/VisualDL/tree/develop/demo)
+想了解更多关于VisualDL可视化功能的使用详情介绍，请查看**VisualDL使用指南**。
+
+## 技术交流
+
+欢迎您加入VisualDL官方QQ群：1045783368 与飞桨团队以及其他用户共同针对VisualDL进行讨论与交流。
--- a/doc/fluid/advanced_guide/evaluation_debugging/debug/visualdl_usage.md
+++ b/doc/fluid/advanced_guide/evaluation_debugging/debug/visualdl_usage.md
--- a/doc/fluid/advanced_guide/evaluation_debugging/evaluation/metrics.rst
+++ b/doc/fluid/advanced_guide/evaluation_debugging/evaluation/metrics.rst
@@ -17,6 +17,7 @@ paddle.fluid.metrics模块提供了一系列常用的模型评价指标; 用户
 不同类型的任务，会选用不同的评价指标。
 
 回归问题通常会用RMSE(均方根误差)、MAE(平均绝对误差)、R-Square(R平方)等
+
 AUC(Area Under Cure)指标则常被用在分类任务(classification)上

 目标检测任务(Object Detection)则经常会用到mAP(Mean Average Precision) 

--- a/doc/fluid/advanced_guide/flags/flags_cn.rst
+++ b/doc/fluid/advanced_guide/flags/flags_cn.rst
+
+环境变量FLAGS
+==================
+
+调用说明
+----------
+
+PaddlePaddle中的环境变量FLAGS支持两种设置方式。
+
+- 通过export来设置环境变量，如 :code:`export FLAGS_eager_delete_tensor_gb=1.0` 。
+
+- 通过API：:code:`get_flags` 和 :code:`set_flags` 来打印和设置环境变量FLAGS。API使用详情请参考 :ref:`cn_api_fluid_get_flags` 与 :ref:`cn_api_fluid_set_flags` 。
+
+
+环境变量FLAGS功能分类
+----------------------
+
+..  toctree::
+    :maxdepth: 1
+
+    cudnn_cn.rst
+    data_cn.rst
+    debug_cn.rst
+    device_cn.rst
+    distributed_cn.rst
+    executor_cn.rst
+    memory_cn.rst
+    others_cn.rst
--- a/doc/fluid/advanced_guide/flags/flags_en.rst
+++ b/doc/fluid/advanced_guide/flags/flags_en.rst
+==================
+FLAGS
+==================
+
+Usage
+------
+These FLAGS in PaddlePaddle can be set in two ways.
+
+- Set the FLAGS through export. For example: :code:`export FLAGS_eager_delete_tensor_gb=1.0` .
+
+- Use :code:`get_flags` and :code:`set_flags` to print and set the FLAGS. For more information on using these APIs, please refer to :ref:`api_fluid_get_flags` and :ref:`api_fluid_set_flags` .
+
+
+FLAGS Quick Search
+------------------
+
+..  toctree::
+    :maxdepth: 1
+
+
+    cudnn_en.rst
+    data_en.rst
+    debug_en.rst
+    device_en.rst
+    distributed_en.rst
+    executor_en.rst
+    memory_en.rst
+    others_en.rst
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/doc/fluid/advanced_guide/flags/memory_cn.rst
+++ b/doc/fluid/advanced_guide/flags/memory_cn.rst
@@ -11,13 +11,14 @@ FLAGS_allocator_strategy

 取值范围
 ---------------
-String型，['naive_best_fit', 'auto_growth']中的一个。缺省值为'naive_best_fit'。
+String型，['naive_best_fit', 'auto_growth']中的一个。如果编译Paddle时CMake使用了-DON_INFER=ON，则缺省值为'naive_best_fit'；
+其他情况下缺省值为'auto_growth'。PaddlePaddle pip安装包的默认策略也是'auto_growth'。

 示例
 --------
-FLAGS_allocator_strategy=naive_best_fit - 使用预分配best fit分配器。
+FLAGS_allocator_strategy=naive_best_fit - 使用预分配best fit分配器，PaddlePaddle会先占用大部分的可用内存/显存，在Paddle具体数据使用时分配，这种方式预占空间较大，但内存/显存碎片较少(比如能够支持模型的最大batch size会变大)。

-FLAGS_allocator_strategy=auto_growth - 使用auto growth分配器。
+FLAGS_allocator_strategy=auto_growth - 使用auto growth分配器。PaddlePaddle会随着真实数据需要再占用内存/显存，但内存/显存可能会产生碎片（比如能够支持模型的最大batch size会变小）。


 FLAGS_eager_delete_scope

--- a/doc/fluid/advanced_guide/flags/memory_en.rst
+++ b/doc/fluid/advanced_guide/flags/memory_en.rst
@@ -11,13 +11,13 @@ Use to choose allocator strategy of PaddlePaddle.

 Values accepted
 ---------------
-String, enum in ['naive_best_fit', 'auto_growth']. The default value is 'naive_best_fit'.
+String, enum in ['naive_best_fit', 'auto_growth']. The default value is 'naive_best_fit' if PaddlePaddle is compiled with the -DON_INFER=ON CMake flag; otherwise it is 'auto_growth'. The default PaddlePaddle pip package uses 'auto_growth'.

 Example
 --------
-FLAGS_allocator_strategy=naive_best_fit would use the pre-allocated best fit allocator.
+FLAGS_allocator_strategy=naive_best_fit would use the pre-allocated best fit allocator. The 'naive_best_fit' strategy would occupy almost all GPU memory by default but leads to less memory fragmentation (e.g., the maximum batch size a model can support may be larger).

-FLAGS_allocator_strategy=auto_growth would use the auto growth allocator.
+FLAGS_allocator_strategy=auto_growth would use the auto growth allocator. The 'auto_growth' strategy would allocate GPU memory on demand but may lead to more memory fragmentation (e.g., the maximum batch size a model can support may be smaller).




--- a/doc/fluid/advanced_guide/flags_cn.rst
+++ b/doc/fluid/advanced_guide/flags_cn.rst
-
-环境变量FLAGS
-==================
-
-
-..  toctree::
-    :maxdepth: 1
-
-
-    flags/cudnn_cn.rst
-    flags/data_cn.rst
-    flags/debug_cn.rst
-    flags/device_cn.rst
-    flags/distributed_cn.rst
-    flags/executor_cn.rst
-    flags/memory_cn.rst
-    flags/others_cn.rst
--- a/doc/fluid/advanced_guide/flags_en.rst
+++ b/doc/fluid/advanced_guide/flags_en.rst
-==================
-FLAGS
-==================
-
-
-..  toctree::
-    :maxdepth: 1
-
-
-    flags/cudnn_en.rst
-    flags/data_en.rst
-    flags/debug_en.rst
-    flags/device_en.rst
-    flags/distributed_en.rst
-    flags/executor_en.rst
-    flags/memory_en.rst
-    flags/others_en.rst
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
--- a/doc/fluid/advanced_guide/index_cn.rst
+++ b/doc/fluid/advanced_guide/index_cn.rst
@@ -2,31 +2,14 @@
 进阶指南
 ########

-如果您已比较熟练使用飞桨来完成常规任务，期望了解更多飞桨在工业部署方面的能力，或者尝试自己的二次开发，请阅读：
+如果您已经学会使用飞桨来完成常规任务，期望了解更多飞桨在工业部署方面的能力，请阅读：

-    - `数据准备 <../advanced_guide/data_preparing/index_cn.html>`_：介绍高效的同步异步数据读取方法
-
-    - `分布式训练 <../advanced_guide/distributed_training/index_cn.html>`_ ：介绍如何使用分布式训练

    - `预测与部署 <../advanced_guide/inference_deployment/index_cn.html>`_ ：介绍如何应用训练好的模型进行预测

-    - `性能调优 <../advanced_guide/performance_improving/index_cn.html>`_ ：介绍飞桨使用过程中的调优方法
-
-    - `模型评估/调试 <../advanced_guide/evaluation_debugging/index_cn.html>`_ ：介绍模型评估与调试的典型方法
-
-    - `二次开发 <../advanced_guide/addon_development/index_cn.html>`_ ：介绍如何新增Operator和如何向飞桨开源社区贡献代码
-
-    - `环境变量FLAGS <../advanced_guide/flags/index_cn.html>`_ 
-
-
 ..  toctree::
    :hidden:

-    data_preparing/index_cn.rst
-    distributed_training/index_cn.rst
+    dygraph_to_static/index_cn.rst
    inference_deployment/index_cn.rst
-    performance_improving/index_cn.rst
-    evaluation_debugging/index_cn.rst
-    addon_development/index_cn.rst
-    flags_cn.rst
-
+    flags/flags_cn.rst
--- a/doc/fluid/advanced_guide/index_en.rst
+++ b/doc/fluid/advanced_guide/index_en.rst
@@ -8,30 +8,14 @@ Advanced User Guides

 So far you have already been familiar with PaddlePaddle. And the next expectation, read more on:

-    - `Prepare Data <data_preparing/index_en.html>`_：How to prepare the data efficiently.
-
-    - `Distributed Training <distributed_training/index_en.html>`_ ：How to apply the distributed training in your projects.

    - `Deploy Inference Model  <inference_deployment/index_en.html>`_ ：How to deploy the trained network to perform practical inference

-    - `Practice Improving  <performance_improving/index_en.html>`_ ：How to do profiling for Fluid programs
-
-    - `Model Evaluation and Debugging <evaluation_debugging/index_en.html>`_ ：How to evaluate your program.
-
-    - `Addon Development <addon_development/index_en.html>`_ ：How to contribute codes and documentation to our communities
-
-    - `FLAGS <flags_en.html>`_ 
-

 ..  toctree::
    :hidden:

-    data_preparing/index_en.rst
-    distributed_training/index_en.rst
+    dygraph_to_static/index_en.rst
    inference_deployment/index_en.rst
-    performance_improving/index_en.rst
-    evaluation_debugging/index_en.rst
-    addon_development/index_en.rst
-    flags_en.rst
-
+    flags/flags_en.rst

--- a/doc/fluid/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.rst
+++ b/doc/fluid/advanced_guide/inference_deployment/inference/build_and_install_lib_cn.rst
 .. _install_or_build_cpp_inference_lib:

-安装与编译C++预测库
+安装与编译 Linux 预测库
 ===========================

 直接下载安装
 -------------

 ..  csv-table:: 
-    :header: "版本说明", "预测库(1.7.0版本)", "预测库(develop版本)"
+    :header: "版本说明", "预测库(1.8.4版本)", "预测库(develop版本)"
    :widths: 3, 2, 2

-    "ubuntu14.04_cpu_avx_mkl", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-cpu-avx-mkl/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-cpu-avx-mkl/fluid_inference.tgz>`_"
-    "ubuntu14.04_cpu_avx_openblas", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-cpu-avx-openblas/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-cpu-avx-openblas/fluid_inference.tgz>`_"
-    "ubuntu14.04_cpu_noavx_openblas", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-cpu-noavx-openblas/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-cpu-noavx-openblas/fluid_inference.tgz>`_"
-    "ubuntu14.04_cuda9.0_cudnn7_avx_mkl", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-gpu-cuda9-cudnn7-avx-mkl/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-gpu-cuda9-cudnn7-avx-mkl/fluid_inference.tgz>`_"
-    "ubuntu14.04_cuda10.0_cudnn7_avx_mkl", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-gpu-cuda10-cudnn7-avx-mkl/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-gpu-cuda10-cudnn7-avx-mkl/fluid_inference.tgz>`_"
-    "ubuntu14.04_cuda9.0_cudnn7_avx_mkl_trt5", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-gpu-cuda9-cudnn7-avx-mkl-trt5/fluid_inference.tgz>`_", 
-    "ubuntu14.04_cuda10.0_cudnn7_avx_mkl_trt5", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-gpu-cuda10-cudnn7-avx-mkl-trt5/fluid_inference.tgz>`_", 
-    "nv-jetson-cuda10-cudnn7.5-trt5", "`fluid_inference.tar.gz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-nv-jetson-cuda10-cudnn7.5-trt5/fluid_inference.tar.gz>`_", 
+    "ubuntu14.04_cpu_avx_mkl", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.8.4-cpu-avx-mkl/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-cpu-avx-mkl/fluid_inference.tgz>`_"
+    "ubuntu14.04_cpu_avx_openblas", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.8.4-cpu-avx-openblas/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-cpu-avx-openblas/fluid_inference.tgz>`_"
+    "ubuntu14.04_cpu_noavx_openblas", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.8.4-cpu-noavx-openblas/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-cpu-noavx-openblas/fluid_inference.tgz>`_"
+    "ubuntu14.04_cuda9.0_cudnn7_avx_mkl", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.8.4-gpu-cuda9-cudnn7-avx-mkl/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-gpu-cuda9-cudnn7-avx-mkl/fluid_inference.tgz>`_"
+    "ubuntu14.04_cuda10.0_cudnn7_avx_mkl", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.8.4-gpu-cuda10-cudnn7-avx-mkl/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-gpu-cuda10-cudnn7-avx-mkl/fluid_inference.tgz>`_"
+    "ubuntu14.04_cuda10.1_cudnn7.6_avx_mkl_trt6", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.8.4-gpu-cuda10.1-cudnn7.6-avx-mkl-trt6%2Ffluid_inference.tgz>`_", 
+    "nv-jetson-cuda10-cudnn7.5-trt5", "`fluid_inference.tar.gz <https://paddle-inference-lib.bj.bcebos.com/1.7.1-nv-jetson-cuda10-cudnn7.5-trt5/fluid_inference.tar.gz>`_", 


 从源码编译
@@ -40,23 +39,28 @@ WITH_NV_JETSON                OFF            在NV Jetson硬件上编译时需

 建议按照推荐值设置，以避免链接不必要的库。其它可选编译选项按需进行设定。

-首先从github拉取最新代码并安装nccl
+首先从github拉取最新代码

 .. code-block:: bash

-  git clone https://github.com/paddlepaddle/paddle
+  git clone https://github.com/paddlepaddle/Paddle
+  cd Paddle
  # 建议使用git checkout切换到Paddle稳定的版本，如：
-  git checkout v1.6.2
+  git checkout v1.8.4
+
+**note**: 如果您是多卡机器，建议安装NCCL；如果您是单卡机器则可以在编译时显式指定WITH_NCCL=OFF来跳过这一步。注意如果WITH_NCCL=ON，且没有安装NCCL，则编译会报错。
+
+.. code-block:: bash

  git clone https://github.com/NVIDIA/nccl.git
+  cd nccl
  make -j4
  make install

-**note**： 单卡机器上不会用到nccl但仍存在依赖， 后续会考虑将此依赖去除。

 **Server端预测库源码编译**

-下面的代码片段配制编译选项并进行编译（需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径）：
+下面的代码片段配置编译选项并进行编译（需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径，WITH_NCCL根据实际情况进行修改）：

  .. code-block:: bash

@@ -70,6 +74,7 @@ WITH_NV_JETSON                OFF            在NV Jetson硬件上编译时需
           -DWITH_MKL=OFF \
           -DWITH_GPU=OFF  \
           -DON_INFER=ON \
+           -DWITH_NCCL=OFF \
           ..
      make
      make inference_lib_dist
@@ -118,7 +123,7 @@ NVIDIA Jetson是NVIDIA推出的嵌入式AI平台，Paddle Inference支持在 NVI
        make inference_lib_dist -j4

    3. 样例测试
-      请参照官网样例：https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_usage/deploy/inference/paddle_tensorrt_infer.html#id2
+      请参照官网样例：https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/performance_improving/inference_improving/paddle_tensorrt_infer.html#id2
    
    **FAQ**

@@ -165,28 +170,21 @@ NVIDIA Jetson是NVIDIA推出的嵌入式AI平台，Paddle Inference支持在 NVI
     │       ├── libpaddle_fluid.a
     │       └── libpaddle_fluid.so
     ├── third_party
-     │   ├── boost
-     │   │   └── boost
-     │   ├── eigen3
-     │   │   ├── Eigen
-     │   │   └── unsupported
     │   └── install
     │       ├── gflags
     │       ├── glog
     │       ├── mkldnn
     │       ├── mklml
-     │       ├── protobuf
-     │       ├── xxhash
-     │       └── zlib
+     │       └── protobuf
     └── version.txt

 version.txt 中记录了该预测库的版本信息，包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号，如：

  .. code-block:: text

-     GIT COMMIT ID: cc9028b90ef50a825a722c55e5fda4b7cd26b0d6
+     GIT COMMIT ID: 0231f58e592ad9f673ac1832d8c495c8ed65d24f
     WITH_MKL: ON
     WITH_MKLDNN: ON
     WITH_GPU: ON
-     CUDA version: 8.0
+     CUDA version: 10.1
     CUDNN version: v7
--- a/doc/fluid/advanced_guide/inference_deployment/inference/build_and_install_lib_en.rst
+++ b/doc/fluid/advanced_guide/inference_deployment/inference/build_and_install_lib_en.rst
 .. _install_or_build_cpp_inference_lib_en:

-Install and Compile C++ Inference Library
+Install and Compile C++ Inference Library on Linux
 =============================================

 Direct Download and Installation
 ---------------------------------

 ..  csv-table:: c++ inference library list
-    :header: "version description", "inference library(1.7.0 version)", "inference library(develop version)"
+    :header: "version description", "inference library(1.8.4 version)", "inference library(develop version)"
    :widths: 3, 2, 2

-    "ubuntu14.04_cpu_avx_mkl", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-cpu-avx-mkl/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-cpu-avx-mkl/fluid_inference.tgz>`_"
-    "ubuntu14.04_cpu_avx_openblas", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-cpu-avx-openblas/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-cpu-avx-openblas/fluid_inference.tgz>`_"
-    "ubuntu14.04_cpu_noavx_openblas", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-cpu-noavx-openblas/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-cpu-noavx-openblas/fluid_inference.tgz>`_"
-    "ubuntu14.04_cuda9.0_cudnn7_avx_mkl", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-gpu-cuda9-cudnn7-avx-mkl/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-gpu-cuda9-cudnn7-avx-mkl/fluid_inference.tgz>`_"
-    "ubuntu14.04_cuda10.0_cudnn7_avx_mkl", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-gpu-cuda10-cudnn7-avx-mkl/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-gpu-cuda10-cudnn7-avx-mkl/fluid_inference.tgz>`_"
-    "ubuntu14.04_cuda9.0_cudnn7_avx_mkl_trt5", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-gpu-cuda9-cudnn7-avx-mkl-trt5/fluid_inference.tgz>`_", 
-    "ubuntu14.04_cuda10.0_cudnn7_avx_mkl_trt5", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-gpu-cuda10-cudnn7-avx-mkl-trt5/fluid_inference.tgz>`_", 
-    "nv-jetson-cuda10-cudnn7.5-trt5", "`fluid_inference.tar.gz <https://paddle-inference-lib.bj.bcebos.com/1.7.0-nv-jetson-cuda10-cudnn7.5-trt5/fluid_inference.tar.gz>`_", 
+    "ubuntu14.04_cpu_avx_mkl", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.8.4-cpu-avx-mkl/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-cpu-avx-mkl/fluid_inference.tgz>`_"
+    "ubuntu14.04_cpu_avx_openblas", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.8.4-cpu-avx-openblas/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-cpu-avx-openblas/fluid_inference.tgz>`_"
+    "ubuntu14.04_cpu_noavx_openblas", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.8.4-cpu-noavx-openblas/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-cpu-noavx-openblas/fluid_inference.tgz>`_"
+    "ubuntu14.04_cuda9.0_cudnn7_avx_mkl", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.8.4-gpu-cuda9-cudnn7-avx-mkl/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-gpu-cuda9-cudnn7-avx-mkl/fluid_inference.tgz>`_"
+    "ubuntu14.04_cuda10.0_cudnn7_avx_mkl", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.8.4-gpu-cuda10-cudnn7-avx-mkl/fluid_inference.tgz>`_", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/latest-gpu-cuda10-cudnn7-avx-mkl/fluid_inference.tgz>`_"
+    "ubuntu14.04_cuda10.1_cudnn7.6_avx_mkl_trt6", "`fluid_inference.tgz <https://paddle-inference-lib.bj.bcebos.com/1.8.4-gpu-cuda10.1-cudnn7.6-avx-mkl-trt6%2Ffluid_inference.tgz>`_", 
+    "nv-jetson-cuda10-cudnn7.5-trt5", "`fluid_inference.tar.gz <https://paddle-inference-lib.bj.bcebos.com/1.7.1-nv-jetson-cuda10-cudnn7.5-trt5/fluid_inference.tar.gz>`_", 

 Build from Source Code
 -----------------------
@@ -41,23 +40,29 @@ WITH_NV_JETSON                OFF              build inference libs on NV Jetson
 It is recommended to configure options according to the recommended values to avoid linking unnecessary libraries. Other options can be set if it is necessary.


-Firstly we pull the latest code from github and install nccl.
+Firstly we pull the latest code from github.

 .. code-block:: bash

-  git clone https://github.com/paddlepaddle/paddle
-  # Use git checkout to switch to stable versions such as v1.6.2
-  git checkout v1.6.2
+  git clone https://github.com/paddlepaddle/Paddle
+  cd Paddle
+  # Use git checkout to switch to stable versions such as v1.8.4
+  git checkout v1.8.4
+
+
+**Note**: If your machine has multiple GPUs, it is recommended to install NCCL; otherwise, you can skip this step by specifying WITH_NCCL=OFF during compilation. Note that if WITH_NCCL=ON and NCCL is not installed, the build will report an error.
+
+.. code-block:: bash

  git clone https://github.com/NVIDIA/nccl.git
+  cd nccl
  make -j4
  make install

-**note**: nccl is not used but still needed in building. This dependence will be removed later.

 **build inference libs on server**

-Following codes set the configurations and execute building(PADDLE_ROOT should be set to the actual installing path of inference libs).
+The following commands set the configuration and run the build (PADDLE_ROOT should be set to the actual installation path of the inference libs, and WITH_NCCL should be set according to the actual environment).

  .. code-block:: bash

@@ -72,6 +77,7 @@ Following codes set the configurations and execute building(PADDLE_ROOT should b
           -DWITH_MKL=OFF \
           -DWITH_GPU=OFF  \
           -DON_INFER=ON \
+           -DWITH_NCCL=OFF \
           ..
      make
      make inference_lib_dist
@@ -121,7 +127,7 @@ NVIDIA Jetson is an AI computing platform in embedded systems introduced by NVID
        make inference_lib_dist -j4
      
    3. Test with samples
-      Please refer to samples on https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_usage/deploy/inference/paddle_tensorrt_infer.html#id2
+      Please refer to samples on https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/performance_improving/inference_improving/paddle_tensorrt_infer.html#id2

    **FAQ**


--- a/doc/fluid/advanced_guide/inference_deployment/inference/c_infer_cn.md
+++ b/doc/fluid/advanced_guide/inference_deployment/inference/c_infer_cn.md
@@ -27,7 +27,7 @@ Fluid提供了高度优化的[C++预测库](./native_infer.html)，为了方便
 * `void PD_DisableGpu(PD_AnalysisConfig* config)`: 禁用GPU。
 * `int PD_GpuDeviceId(const PD_AnalysisConfig* config)`: 返回使用的GPU设备的ID。
 * `void PD_SwitchIrOptim(PD_AnalysisConfig* config, bool x)`: 设置预测是否开启IR优化。
-* `void PD_EnableTensorRtEngine(PD_AnalysisConfig* config, int workspace_size, int max_batch_size, int min_subgraph_size, Precision precision, bool use_static, bool use_calib_mode)`: 开启TensorRT。关于参数的解释，详见``使用Paddle-TensorRT库预测``。
+* `void PD_EnableTensorRtEngine(PD_AnalysisConfig* config, int workspace_size, int max_batch_size, int min_subgraph_size, Precision precision, bool use_static, bool use_calib_mode)`: 开启TensorRT。关于参数的解释，详见[使用Paddle-TensorRT库预测](../../performance_improving/inference_improving/paddle_tensorrt_infer.html)。
 * `void PD_EnableMKLDNN(PD_AnalysisConfig* config)`: 开启MKLDNN。

 #### 代码示例

--- a/doc/fluid/advanced_guide/inference_deployment/inference/windows_cpp_inference.md
+++ b/doc/fluid/advanced_guide/inference_deployment/inference/windows_cpp_inference.md

-安装与编译Windows预测库
+安装与编译 Windows 预测库
 ===========================

 下载安装包与对应的测试环境
 -------------

-| 版本说明      |     预测库(1.7.0版本)     |       编译器        |    构建工具      |  cuDNN  |  CUDA  |
+| 版本说明      |     预测库(1.8.4版本)     |       编译器        |    构建工具      |  cuDNN  |  CUDA  |
 |:---------|:-------------------|:-------------------|:----------------|:--------|:-------|
-|    cpu_avx_mkl | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.7.0/win-infer/mkl/cpu/fluid_inference_install_dir.zip) | MSVC 2015 update 3|  CMake v3.16.0  |
-|    cpu_avx_openblas | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.7.0/win-infer/open/cpu/fluid_inference_install_dir.zip) | MSVC 2015 update 3|  CMake v3.16.0  |
-|    cuda9.0_cudnn7_avx_mkl | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.7.0/win-infer/mkl/post97/fluid_inference_install_dir.zip) |  MSVC 2015 update 3 |  CMake v3.16.0  |  7.4.1  |   9.0    |
-|    cuda10.0_cudnn7_avx_mkl | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.7.0/win-infer/mkl/post107/fluid_inference_install_dir.zip) | MSVC 2015 update 3 |  CMake v3.16.0  |  7.5.0  |   10.0    |
-|    cuda9.0_cudnn7_avx_openblas | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.7.0/win-infer/open/post97/fluid_inference_install_dir.zip) | MSVC 2015 update 3 |  CMake v3.16.0  |  7.4.1  |   9.0    |
-|    cuda10.0_cudnn7_avx_openblas | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.7.0/win-infer/open/post97/fluid_inference_install_dir.zip) | MSVC 2015 update 3 |  CMake v3.16.0  |  7.5.0  |   10.0    |
+|    cpu_avx_mkl | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.8.4/win-infer/mkl/cpu/fluid_inference_install_dir.zip) | MSVC 2015 update 3|  CMake v3.16.0  |
+|    cpu_avx_openblas | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.8.4/win-infer/open/cpu/fluid_inference_install_dir.zip) | MSVC 2015 update 3|  CMake v3.16.0  |
+|    cuda9.0_cudnn7_avx_mkl | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.8.4/win-infer/mkl/post97/fluid_inference_install_dir.zip) |  MSVC 2015 update 3 |  CMake v3.16.0  |  7.3.1  |   9.0    |
+|    cuda9.0_cudnn7_avx_openblas | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.8.4/win-infer/open/post97/fluid_inference_install_dir.zip) | MSVC 2015 update 3 |  CMake v3.16.0  |  7.3.1  |   9.0    |
+|    cuda10.0_cudnn7_avx_mkl | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.8.4/win-infer/mkl/post107/fluid_inference_install_dir.zip) | MSVC 2015 update 3 |  CMake v3.16.0  |  7.4.1  |   10.0    |

 ### 硬件环境


--- a/doc/fluid/advanced_guide/inference_deployment/inference/windows_cpp_inference_en.md
+++ b/doc/fluid/advanced_guide/inference_deployment/inference/windows_cpp_inference_en.md
@@ -5,14 +5,13 @@ Install and Compile C++ Inference Library on Windows
 Direct Download and Install
 -------------

-| Version      |     Inference Libraries(v1.7.0)   | Compiler | Build tools | cuDNN | CUDA |
+| Version      |     Inference Libraries(v1.8.4)   | Compiler | Build tools | cuDNN | CUDA |
 |:---------|:-------------------|:-------------------|:----------------|:--------|:-------|
-|    cpu_avx_mkl | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.7.0/win-infer/mkl/cpu/fluid_inference_install_dir.zip) | MSVC 2015 update 3|  CMake v3.16.0  |
-|    cpu_avx_openblas | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.7.0/win-infer/open/cpu/fluid_inference_install_dir.zip) | MSVC 2015 update 3|  CMake v3.16.0  |
-|    cuda9.0_cudnn7_avx_mkl | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.7.0/win-infer/mkl/post97/fluid_inference_install_dir.zip) |  MSVC 2015 update 3 |  CMake v3.16.0  |  7.4.1  |   9.0    |
-|    cuda10.0_cudnn7_avx_mkl | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.7.0/win-infer/mkl/post107/fluid_inference_install_dir.zip) | MSVC 2015 update 3 |  CMake v3.16.0  |  7.5.0  |   10.0    |
-|    cuda9.0_cudnn7_avx_openblas | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.7.0/win-infer/open/post97/fluid_inference_install_dir.zip) | MSVC 2015 update 3 |  CMake v3.16.0  |  7.4.1  |   9.0    |
-|    cuda10.0_cudnn7_avx_openblas | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.7.0/win-infer/open/post97/fluid_inference_install_dir.zip) | MSVC 2015 update 3 |  CMake v3.16.0  |  7.5.0  |   10.0    |
+|    cpu_avx_mkl | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.8.4/win-infer/mkl/cpu/fluid_inference_install_dir.zip) | MSVC 2015 update 3|  CMake v3.16.0  |
+|    cpu_avx_openblas | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.8.4/win-infer/open/cpu/fluid_inference_install_dir.zip) | MSVC 2015 update 3|  CMake v3.16.0  |
+|    cuda9.0_cudnn7_avx_mkl | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.8.4/win-infer/mkl/post97/fluid_inference_install_dir.zip) |  MSVC 2015 update 3 |  CMake v3.16.0  |  7.3.1  |   9.0    |
+|    cuda9.0_cudnn7_avx_openblas | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.8.4/win-infer/open/post97/fluid_inference_install_dir.zip) | MSVC 2015 update 3 |  CMake v3.16.0  |  7.3.1  |   9.0    |
+|    cuda10.0_cudnn7_avx_mkl | [fluid_inference.zip](https://paddle-wheel.bj.bcebos.com/1.8.4/win-infer/mkl/post107/fluid_inference_install_dir.zip) | MSVC 2015 update 3 |  CMake v3.16.0  |  7.4.1  |   10.0    |

 ### Hardware Environment


--- a/doc/fluid/advanced_guide/inference_deployment/mobile/mobile_index.md
+++ b/doc/fluid/advanced_guide/inference_deployment/mobile/mobile_index.md
 # Paddle-Lite

-Paddle Lite为Paddle-Mobile的升级版，定位支持包括手机移动端在内更多场景的轻量化高效预测，支持更广泛的硬件和平台，是一个高性能、轻量级的深度学习预测引擎。在保持和PaddlePaddle无缝对接外，也兼容支持其他训练框架产出的模型。
+Paddle-Lite为Paddle-Mobile的升级版，定位支持包括手机移动端在内更多场景的轻量化高效预测，支持更广泛的硬件和平台，是一个高性能、轻量级的深度学习预测引擎。在保持和PaddlePaddle无缝对接外，也兼容支持其他训练框架产出的模型。

-完整使用文档位于 [PaddleLite 文档](https://paddlepaddle.github.io/Paddle-Lite/) 。
+完整使用文档位于 [Paddle-Lite 文档](https://paddle-lite.readthedocs.io/zh/latest/) 。

 ## 特性

@@ -13,39 +13,39 @@ Paddle Lite为Paddle-Mobile的升级版，定位支持包括手机移动端在

 ### 高性能
 极致的 ARM CPU 性能优化，针对不同微架构特点实现kernel的定制，最大发挥计算性能，在主流模型上展现出领先的速度优势。
-支持INT8量化计算，结合 [PaddleSlim 模型压缩工具](https://github.com/PaddlePaddle/models/tree/v1.5/PaddleSlim) 中 INT8量化训练功能，可以提供高精度高性能的预测能力。
+支持量化模型，结合[PaddleSlim 模型压缩工具](https://github.com/PaddlePaddle/models/tree/v1.5/PaddleSlim) 中量化功能，可以提供高精度高性能的预测能力。
 在Huawei NPU， FPGA上也具有有很好的性能表现。

-最新 Benchmark 位于 [benchmark](https://paddlepaddle.github.io/Paddle-Lite/develop/benchmark/)。
+最新性能数据位于 [Benchmark 文档](https://paddle-lite.readthedocs.io/zh/latest/benchmark/benchmark.html)。

 ### 通用性
-硬件方面，Paddle Lite 的架构设计为多硬件兼容支持做了良好设计。除了支持ARM CPU、Mali GPU、Adreno GPU，还特别支持了华为 NPU，以及 FPGA 等边缘设备广泛使用的硬件。即将支持支持包括寒武纪、比特大陆等AI芯片，未来会增加对更多硬件的支持。
+硬件方面，Paddle-Lite 的架构设计为多硬件兼容支持做了良好设计。除了支持ARM CPU、Mali GPU、Adreno GPU，还特别支持了华为 NPU，以及 FPGA 等边缘设备广泛使用的硬件。即将支持包括寒武纪、比特大陆等AI芯片，未来会增加对更多硬件的支持。

-模型支持方面，Paddle Lite和PaddlePaddle训练框架的Op对齐，提供更广泛的模型支持能力。目前已严格验证18个模型85个OP的精度和性能，对视觉类模型做到了较为充分的支持，覆盖分类、检测和定位，包含了特色的OCR模型的支持。未来会持续增加更多模型的支持验证。
+模型支持方面，Paddle-Lite和PaddlePaddle训练框架的Op对齐，提供更广泛的模型支持能力。目前已严格验证18个模型85个OP的精度和性能，对视觉类模型做到了较为充分的支持，覆盖分类、检测和定位，包含了特色的OCR模型的支持。未来会持续增加更多模型的支持验证。

-框架兼容方面：除了PaddlePaddle外，对其他训练框架也提供兼容支持。当前，支持Caffe 和 TensorFlow 训练出来的模型，通过X2Paddle (https://github.com/PaddlePaddle/X2Paddle) 转换工具实现。接下来将会对ONNX等格式模型提供兼容支持。
+框架兼容方面：除了PaddlePaddle外，对其他训练框架也提供兼容支持。当前，支持Caffe 和 TensorFlow 训练出来的模型，通过 [X2Paddle](https://github.com/PaddlePaddle/X2Paddle) 转换工具实现。接下来将会对ONNX等格式模型提供兼容支持。

 ## 架构

-PaddleLite 的架构设计着重考虑了对多硬件和平台的支持，并且强化了多个硬件在一个模型中混合执行的能力，多个层面的性能优化处理，以及对端侧应用的轻量化设计。
+Paddle-Lite 的架构设计着重考虑了对多硬件和平台的支持，并且强化了多个硬件在一个模型中混合执行的能力，多个层面的性能优化处理，以及对端侧应用的轻量化设计。

 ![](https://github.com/Superjomn/_tmp_images/raw/master/images/paddle-lite-architecture.png)

 其中，Analysis Phase 包括了 MIR(Machine IR) 相关模块，能够对原有的模型的计算图针对具体的硬件列表进行算子融合、计算裁剪 在内的多种优化。Execution Phase 只涉及到Kernel 的执行，且可以单独部署，以支持极致的轻量级部署。


-## Paddle-Mobile升级为Paddle Lite的说明
+## Paddle-Mobile升级为Paddle-Lite的说明
 原Paddle-Mobile作为一个致力于嵌入式平台的PaddlePaddle预测引擎，已支持多种硬件平台，包括ARM CPU、 Mali GPU、Adreno GPU，以及支持苹果设备的GPU Metal实现、ZU5、ZU9等FPGA开发板、树莓派等arm-linux开发板。在百度内已经过广泛业务场景应用验证。对应设计文档可参考: [mobile/README](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/README.md)

-Paddle-Mobile 整体升级重构并更名为Paddle Lite后，原paddle-mobile 的底层能力大部分已集成到[新架构 ](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite)下。作为过渡，暂时保留原Paddle-mobile代码。 主体代码位于 `mobile/` 目录中，后续一段时间会继续维护，并完成全部迁移。新功能会统一到[新架构 ](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite)下开发。
+Paddle-Mobile 整体升级重构并更名为Paddle-Lite后，原paddle-mobile 的底层能力大部分已集成到[新架构 ](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite)下。作为过渡，暂时保留原Paddle-mobile代码。 主体代码位于 `mobile/` 目录中，后续一段时间会继续维护，并完成全部迁移。新功能会统一到[新架构 ](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite)下开发。

 metal, web的模块相对独立，会继续在 `./metal` 和 `./web` 目录下开发和维护。对苹果设备的GPU Metal实现的需求及web前端预测需求，可以直接进入这两个目录。

 ## 致谢
-Paddle Lite 借鉴了以下开源项目：
+Paddle-Lite 借鉴了以下开源项目：

 - [ARM compute library](https://github.com/ARM-software/ComputeLibrary)
- [Anakin](https://github.com/PaddlePaddle/Anakin) ，Anakin对应底层的一些优化实现已被集成到Paddle Lite。Anakin作为PaddlePaddle组织下的一个高性能预测项目，极具前瞻性，对Paddle Lite有重要贡献。Anakin已和本项目实现整合。之后，Anakin不再升级。
+- [Anakin](https://github.com/PaddlePaddle/Anakin) ，Anakin对应底层的一些优化实现已被集成到Paddle-Lite。Anakin作为PaddlePaddle组织下的一个高性能预测项目，极具前瞻性，对Paddle-Lite有重要贡献。Anakin已和本项目实现整合。之后，Anakin不再升级。

 ##  交流与反馈
 * 欢迎您通过Github Issues来提交问题、报告与建议

--- a/doc/fluid/advanced_guide/performance_improving/amp/amp.md
+++ b/doc/fluid/advanced_guide/performance_improving/amp/amp.md
+# 混合精度训练最佳实践
+
+Automatic Mixed Precision (AMP) 是一种自动混合使用半精度（FP16）和单精度（FP32）来加速模型训练的技术。AMP技术可方便用户快速将使用 FP32 训练的模型修改为使用混合精度训练，并通过黑白名单和动态`loss scaling`来保证训练时的数值稳定性进而避免梯度Infinite或者NaN(Not a Number)。借力于新一代NVIDIA GPU中Tensor Cores的计算性能，PaddlePaddle AMP技术在ResNet50、Transformer等模型上训练速度相对于FP32训练加速比可达1.5～2.9。
+
+### 半精度浮点类型FP16
+
+如图 1 所示，半精度（Float Precision16，FP16）是一种相对较新的浮点类型，在计算机中使用2字节（16位）存储。在IEEE 754-2008标准中，它亦被称作binary16。与计算中常用的单精度（FP32）和双精度（FP64）类型相比，FP16更适于在精度要求不高的场景中使用。
+
+<figure align="center">
+    <img src="https://paddleweb-static.bj.bcebos.com/images/fp16.png" width="600" alt='missing'/>
+    <figcaption><center>图 1. 半精度和单精度数据示意图</center></figcaption>
+</figure>
+
+### 英伟达GPU的FP16算力
+
+在使用相同的超参数下，混合精度训练使用半精度浮点（FP16）和单精度（FP32）浮点即可达到与使用纯单精度训练相同的准确率，并可加速模型的训练速度。这主要得益于英伟达推出的Volta及Turing架构GPU在使用FP16计算时具有如下特点：
+
+* FP16可降低一半的内存带宽和存储需求，这使得在相同的硬件条件下研究人员可使用更大更复杂的模型以及更大的batch size大小。
+* FP16可以充分利用英伟达Volta及Turing架构GPU提供的Tensor Cores技术。在相同的GPU硬件上，Tensor Cores的FP16计算吞吐量是FP32的8倍。
+
+### PaddlePaddle AMP功能——牛刀小试
+
+如前文所述，使用FP16数据类型可能会造成计算精度上的损失，但对深度学习领域而言，并不是所有计算都要求很高的精度，一些局部的精度损失对最终训练效果影响很微弱，却能为吞吐和训练速度带来大幅提升。因此，混合精度计算的需求应运而生。具体而言，训练过程中将一些对精度损失不敏感且能利用Tensor Cores进行加速的运算使用半精度处理，而对精度损失敏感部分依然保持FP32计算精度，用以最大限度提升访存和计算效率。
+
+为了避免对每个具体模型人工地去设计和尝试精度混合的方法，PaddlePaddle框架提供自动混合精度训练（AMP）功能，解放"炼丹师"的双手。在PaddlePaddle中使用AMP训练是一件十分容易的事情，用户只需要增加一行代码即可将原有的FP32训练转变为AMP训练。下面以`MNIST`为例介绍PaddlePaddle AMP功能的使用示例。
+
+**MNIST网络定义**
+
+```python
+import paddle.fluid as fluid
+
+def MNIST(data, class_dim):
+    conv1 = fluid.layers.conv2d(data, 16, 5, 1, act=None, data_format='NHWC')
+    bn1 = fluid.layers.batch_norm(conv1, act='relu', data_layout='NHWC')
+    pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2, data_format='NHWC')
+    conv2 = fluid.layers.conv2d(pool1, 64, 5, 1, act=None, data_format='NHWC')
+    bn2 = fluid.layers.batch_norm(conv2, act='relu', data_layout='NHWC')
+    pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2, data_format='NHWC')
+    fc1 = fluid.layers.fc(pool2, size=64, act='relu')
+    fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax')
+    return fc2
+```
+
+针对CV(Computer Vision)类模型组网，为获得更高的训练性能需要注意如下三点：
+
+* `conv2d`、`batch_norm`以及`pool2d`等需要将数据布局设置为`NHWC`，这样有助于使用TensorCore技术加速计算过程<sup><a href="#fn1" id="ref1">1</a></sup>。
+* Tensor Cores要求在使用FP16加速卷积运算时conv2d的输入/输出通道数为8的倍数<sup><a href="#fn2" id="ref2">2</a></sup>，因此设计网络时推荐将conv2d层的输入/输出通道数设置为8的倍数。
+* Tensor Cores要求在使用FP16加速矩阵乘运算时矩阵行数和列数均为8的倍数<sup><a href="#fn3" id="ref3">3</a></sup>，因此设计网络时推荐将fc层的size参数设置为8的倍数。
+
+
+**FP32 训练**
+
+为了训练 MNIST 网络，还需要定义损失函数来更新权重参数，此处使用的优化器是SGDOptimizer。为了简化说明，这里省略了迭代训练的相关代码，仅体现损失函数及优化器定义相关的内容。
+
+```python
+import paddle
+import numpy as np
+
+data = fluid.layers.data(
+    name='image', shape=[None, 28, 28, 1], dtype='float32')
+label = fluid.layers.data(name='label', shape=[None, 1], dtype='int64')
+
+out = MNIST(data, class_dim=10)
+loss = fluid.layers.cross_entropy(input=out, label=label)
+avg_loss = fluid.layers.mean(loss)
+
+sgd = fluid.optimizer.SGDOptimizer(learning_rate=1e-3)
+sgd.minimize(avg_loss)
+```
+
+**AMP训练**
+
+与FP32训练相比，用户仅需使用PaddlePaddle提供的`fluid.contrib.mixed_precision.decorate` 函数将原来的优化器SGDOptimizer进行封装，然后使用封装后的优化器（mp_sgd）更新参数梯度即可完成向AMP训练的转换，代码如下所示：
+
+```python
+sgd = SGDOptimizer(learning_rate=1e-3)
+# 此处只需要使用fluid.contrib.mixed_precision.decorate将sgd封装成AMP训练所需的
+# 优化器mp_sgd，并使用mp_sgd.minimize(avg_loss)代替原来的sgd.minimize(avg_loss)语句即可。
+mp_sgd = fluid.contrib.mixed_precision.decorator.decorate(sgd)
+mp_sgd.minimize(avg_loss)
+```
+
+运行上述混合精度训练python脚本时为得到更好的执行性能可配置如下环境参数，并保证cudnn版本在7.4.1及以上。
+
+```shell
+export FLAGS_conv_workspace_size_limit=1024 # MB，根据所使用的GPU显存容量及模型特点设置数值，值越大越有可能选择到更快的卷积算法
+export FLAGS_cudnn_exhaustive_search=1 # 使用穷举搜索方法来选择快速卷积算法
+export FLAGS_cudnn_batchnorm_spatial_persistent=1 # 用于触发batch_norm和relu的融合
+```
+
+上述即为最简单的PaddlePaddle AMP功能使用方法。ResNet50模型的AMP训练示例可[点击此处](https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/image_classification/README.md#%E6%B7%B7%E5%90%88%E7%B2%BE%E5%BA%A6%E8%AE%AD%E7%BB%83)查看，其他模型使用PaddlePaddle AMP的方法也与此类似。若AMP训练过程中出现连续的loss nan等不收敛现象，可尝试使用[check nan inf工具](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/flags/check_nan_inf_cn.html#span-id-speed-span)进行调试。
+
+
+### PaddlePaddle AMP功能——进阶使用
+
+上一小节所述均为默认AMP训练行为，用户当然也可以改变一些默认的参数设置来满足特定的模型训练场景需求。接下来的章节将介绍PaddlePaddle AMP功能使用中用户可配置的参数行为，即进阶使用技巧。
+
+#### 自定义黑白名单
+
+PaddlePaddle AMP功能实现中根据FP16数据类型计算稳定性和加速效果在框架内部定义了算子（Op）的黑白名单。具体来说，将对FP16计算友好且能利用Tensor Cores的Op归类于白名单，将使用FP16计算会导致数值不稳定的Op归类于黑名单，将对FP16计算没有多少影响的Op归类于灰名单。然而，框架开发人员不可能考虑到所有的网络模型情况，尤其是那些特殊场景中使用到的模型。用户可以在使用`fluid.contrib.mixed_precision.decorate` 函数时通过指定自定义的黑白名单列表来改变默认的FP16计算行为。
+
+```python
+sgd = SGDOptimizer(learning_rate=1e-3)
+# list1是白名单op列表，list2是黑名单op列表，list3是黑名单var_name列表（凡是以这些黑名单var_name为输入或输出的op均会被视为黑名单op）
+amp_list = AutoMixedPrecisionLists(custom_white_list=list1, custom_black_list=list2, custom_black_varnames=list3)
+mp_sgd = fluid.contrib.mixed_precision.decorator.decorate(sgd, amp_list)
+mp_sgd.minimize(avg_loss)
+```
+
+#### 自动loss scaling
+
+为了避免梯度Infinite或者NAN，PaddlePaddle AMP功能支持根据训练过程中梯度的数值自动调整loss scale值。用户在使用`fluid.contrib.mixed_precision.decorate` 函数时也可以改变与loss scaling相关的参数设置，示例如下：
+
+```python
+sgd = SGDOptimizer(learning_rate=1e-3)
+mp_sgd = fluid.contrib.mixed_precision.decorator.decorate(sgd,
+            amp_lists=None,
+             init_loss_scaling=2**8,
+             incr_every_n_steps=500,
+             decr_every_n_nan_or_inf=4,
+            incr_ratio=2.0,
+            decr_ratio=0.5,
+             use_dynamic_loss_scaling=True)
+mp_sgd.minimize(avg_loss)
+```
+
+`init_loss_scaling`、`incr_every_n_steps` 以及`decr_every_n_nan_or_inf`等参数控制着自动loss scaling的行为。它们仅当 `use_dynamic_loss_scaling`设置为True时有效。下面详述这些参数的意义：
+
+* init_loss_scaling(float)：初始loss scaling值。
+* incr_every_n_steps(int)：每经过incr_every_n_steps个连续的正常梯度值才会增大loss scaling值。
+* decr_every_n_nan_or_inf(int)：每经过decr_every_n_nan_or_inf个连续的无效梯度值(nan或者inf)才会减小loss scaling值。
+* incr_ratio(float)：每次增大loss scaling值的扩增倍数，其为大于1的浮点数。
+* decr_ratio(float)：每次减小loss scaling值的比例系数，其为小于1的浮点数。
+
+### 多卡GPU训练的优化
+
+PaddlePaddle AMP功能对多卡GPU训练进行了深度优化。如图 2 所示，优化之前的参数梯度更新特点：梯度计算时虽然使用的是FP16数据类型，但是不同GPU卡之间的梯度传输数据类型仍为FP32。
+
+<figure align="center">
+    <img src="https://paddleweb-static.bj.bcebos.com/images/transfer_fp32_grad.png" width="500" alt='missing'/>
+    <figcaption><center>图 2. 不同GPU卡之间传输梯度使用FP32数据类型（优化前）</center></figcaption>
+</figure>
+
+为了降低GPU多卡之间的梯度传输带宽，我们将梯度传输提前至`Cast`操作之前，而每个GPU卡在得到对应的FP16梯度后再执行`Cast`操作将其转变为FP32类型，具体操作详见图 3。这一优化在训练大模型时对减少带宽占用尤其有效，如多卡训练BERT-Large模型。
+
+<figure align="center">
+    <img src="https://paddleweb-static.bj.bcebos.com/images/transfer_fp16_grad.png" width="500" alt='missing'/>
+    <figcaption><center>图 3. 不同GPU卡之间传输梯度使用FP16数据类型（优化后）</center></figcaption>
+</figure>
+
+### 训练性能对比（AMP VS FP32）
+
+PaddlePaddle AMP技术在ResNet50、Transformer等模型上训练速度相对于FP32训练上均有可观的加速比，下面是ResNet50和ERNIE Large模型的AMP训练相对于FP32训练的加速效果。
+
+<table align="center">
+<caption align="bottom"><center>图 4. Paddle AMP训练加速效果（横坐标为卡数，如8*8代表8机8卡）</center></caption>
+   <tr>
+       <td> <img src="https://paddleweb-static.bj.bcebos.com/images/resnet50.png" alt='missing'/> </td>
+       <td> <img src="https://paddleweb-static.bj.bcebos.com/images/ernie.png" alt='missing'/> </td>
+   </tr>
+</table>
+
+从图4所示的图表可以看出，ResNet50的AMP训练相对于FP32训练加速比可达$2.8 \times$以上，而ERNIE Large的AMP训练相对于FP32训练加速比亦可达 $1.7 \times \sim 2.1 \times$ 。
+
+### 参考文献
+
+* <p> <a href="https://arxiv.org/abs/1710.03740"> Mixed Precision Training </a> </p>
+* <p> <a href="https://on-demand-gtc.gputechconf.com/gtcnew/sessionview.php?sessionName=cn9312-%e4%bd%bf%e7%94%a8%e8%87%aa%e5%8a%a8%e6%b7%b7%e5%90%88%e7%b2%be%e5%ba%a6%e5%8a%a0%e9%80%9f+paddlepaddle+%e8%ae%ad%e7%bb%83"> 使用自动混合精度加速 PaddlePaddle 训练 </a> </p>
+* <p id="fn1"> <a href="https://docs.nvidia.com/deeplearning/performance/dl-performance-convolutional/index.html#tensor-layout"> Tensor Layouts In Memory: NCHW vs NHWC </a> <sup> <a href="#ref1">↩</a> </sup> </p>
+* <p id="fn2"> <a href="https://docs.nvidia.com/deeplearning/performance/dl-performance-convolutional/index.html#channels"> Channels In And Out Requirements </a> <sup> <a href="#ref2">↩</a> </sup> </p>
+* <p id="fn3"> <a href="https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc"> Matrix-Matrix Multiplication Requirements </a> <sup> <a href="#ref3">↩</a> </sup> </p>
--- a/doc/fluid/advanced_guide/performance_improving/analysis_tools/host_memory_profiling_cn.md
+++ b/doc/fluid/advanced_guide/performance_improving/analysis_tools/host_memory_profiling_cn.md
-# 堆内存分析和优化
-
-计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放，随着程序的运行占用的内存越来越大，一方面会影响程序的稳定性，可能让运行速度越来越慢，或者造成oom，甚至会影响运行程序的机器的稳定性，造成宕机。
-
-
-目前有很多内存泄漏分析工具，比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro), [gperftools](https://gperftools.github.io/gperftools/)。
-
-因为Fluid是用Python驱动C++ core来运行，valgrind直接分析非常困难，需要自己编译debug版本的、带valgrind支持的专用Python版本，而且输出的信息中大部分是Python自己的符号和调用信息，分析起来很困难，另外使用valgrind会让程序运行速度变得非常慢，所以不建议使用。
-
-本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。
-
-gperftool主要支持以下四个功能：
-
- thread-caching malloc
- heap-checking using tcmalloc
- heap-profiling using tcmalloc
- CPU profiler
-
-Paddle也提供了基于gperftool的[CPU性能分析教程](./cpu_profiling_cn.html)。
-
-对于堆内存的分析，主要用到thread-caching malloc和heap-profiling using tcmalloc。
-
-## 环境
-
-本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev，基于Ubuntu 16.04.4 LTS环境。
-
-## 使用流程
-
- 安装google-perftools
-
-```
-apt-get install libunwind-dev
-apt-get install google-perftools
-```
-
- 安装pprof
-
-```
-go get -u github.com/google/pprof
-```
-
- 设置运行环境
-
-```
-export PPROF_PATH=/root/gopath/bin/pprof
-export PPROF_BINARY_PATH=/root/gopath/bin/pprof
-export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
-```
-
- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。
-
-```
-# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀
-# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少存储dump一次dump，默认1GB
-env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py
-```
-
-随着程序的运行，会在perf_log这个文件夹下生成很多文件，如下：
-
-```
-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0001.heap
-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0002.heap
-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0003.heap
-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0004.heap
-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0005.heap
-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0006.heap
-```
-
- 使用pprof对heap文件进行分析。分析有两种模式：
-    - 完整模式。会对当前heap做一个分析，显示目前分配内存一些调用路径。
-
-    ```
-    pprof --pdf python test.log.0012.heap
-    ```
-    上述命令会生成一个profile00x.pdf的文件，可以直接打开，例如：[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出，在CPU版本fluid的运行过程中，分配存储最多的模块式CPUAllocator. 而别的模块相对而言分配内存较少，所以被忽略了，这对于分配内存泄漏是很不方便的，因为泄漏是一个缓慢的过程，在这种图中是无法看到的。
-
-    ![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png)
-
-    - Diff模式。可以对两个时刻的heap做diff，把一些内存分配没有发生变化的模块去掉，而把增量部分显示出来。
-    ```
-    pprof --pdf --base test.log.0010.heap python test.log.1045.heap
-    ```
-    生成的结果为：[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf)
-
-    从图中可以看出：ProgramDesc这个结构，在两个版本之间增长了200MB+，所以这里有很大的内存泄漏的可能性，最终结果也确实证明是这里造成了泄漏。
-
-    ![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png)
-    ![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png)
--- a/doc/fluid/advanced_guide/performance_improving/analysis_tools/index_cn.rst
+++ b/doc/fluid/advanced_guide/performance_improving/analysis_tools/index_cn.rst
+.. _api_guide_analysis_tools:
+
 ###############
 性能优化分析及工具
 ###############

--- a/doc/fluid/advanced_guide/performance_improving/analysis_tools/timeline_cn.md
+++ b/doc/fluid/advanced_guide/performance_improving/analysis_tools/timeline_cn.md
@@ -52,7 +52,6 @@ python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=time

 1. 打开chrome浏览器，访问<chrome://tracing/>，用`load`按钮来加载生成的`timeline`文件。

-    ![chrome tracing](../tracing.jpeg)

 1. 结果如下图所示，可以放大来查看timeline的细节信息。


--- a/doc/fluid/advanced_guide/performance_improving/analysis_tools/timeline_en.md
+++ b/doc/fluid/advanced_guide/performance_improving/analysis_tools/timeline_en.md
@@ -52,7 +52,6 @@ python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=time

 3. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.

-    ![chrome tracing](./tracing.jpeg)




--- a/doc/fluid/advanced_guide/performance_improving/device_switching/device_switching.md
+++ b/doc/fluid/advanced_guide/performance_improving/device_switching/device_switching.md
+# 运行时设备切换
+
+Paddle提供了[fluid.CUDAPlace](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/CUDAPlace_cn.html)以及[fluid.CPUPlace](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/fluid_cn/CPUPlace_cn.html)用于指定运行时的设备。这两个接口用于指定全局的设备，从1.8版本开始，Paddle提供了[device_guard](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/fluid_cn/device_guard_cn.html)接口，用于指定部分OP的运行设备，此教程会介绍device_guard的使用场景，以及如何使用该接口对模型进行优化。
+
+如果使用了`fluid.CUDAPlace`设置了全局的执行设备，框架将尽可能地将OP设置在GPU上执行，因此有可能会遇到显存不够的情况。`device_guard`可以用于设置OP的执行设备，如果将部分层设置在CPU上运行，就能够充分利用CPU大内存的优势，避免显存超出。
+
+有时尽管指定了全局的执行设备为GPU，但框架在自动分配OP执行设备时，可能会将部分OP设置在CPU上执行。另外，个别OP会将输出存储在CPU上。在以上的场景中，常常会发生不同设备间的数据传输，可能会影响模型的性能。使用`device_guard`可以避免模型运行中不必要的数据传输。在下面的内容中，将会详细介绍如何通过[profile](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/profiler_cn.html)工具分析数据传输开销，以及如何使用`device_guard`避免不必要的数据传输，从而提升模型性能。
+
+## 如何避免显存超出
+
+下面示例代码中的`embedding`层，其参数`size`包含两个元素，第一个元素为`vocab_size` (词表大小), 第二个为`emb_size`（`embedding`层维度）。实际场景中，词表可能会非常大。示例代码中，词表大小被设置为10000000。如果在GPU模式下运行，该层创建的权重矩阵的大小为(10000000, 150)，仅这一层就需要5.59G的显存，如果词表大小继续增加，极有可能会导致显存超出。
+
+```python
+import paddle.fluid as fluid
+
+data = fluid.layers.fill_constant(shape=[1], value=128, dtype='int64')
+label = fluid.layers.fill_constant(shape=[1, 150], value=0.5, dtype='float32')
+emb = fluid.embedding(input=data, size=(10000000, 150), dtype='float32')
+out = fluid.layers.l2_normalize(x=emb, axis=-1)
+
+cost = fluid.layers.square_error_cost(input=out, label=label)
+avg_cost = fluid.layers.mean(cost)
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+place = fluid.CUDAPlace(0)
+exe = fluid.Executor(place)
+exe.run(fluid.default_startup_program())
+result = exe.run(fluid.default_main_program(), fetch_list=[avg_cost])
+```
+
+`embedding`是根据`input`中的`id`信息从`embedding`矩阵中查询对应`embedding`信息，在CPU上进行计算，其速度也是可接受的。因此，可以参考如下代码，使用`device_guard`将`embedding`层设置在CPU上，以利用CPU内存资源。那么，除了`embedding`层，其他各层都会在GPU上运行。
+
+```python
+import paddle.fluid as fluid
+
+data = fluid.layers.fill_constant(shape=[1], value=128, dtype='int64')
+label = fluid.layers.fill_constant(shape=[1, 150], value=0.5, dtype='float32')
+with fluid.device_guard("cpu"):
+    emb = fluid.embedding(input=data, size=(10000000, 150), dtype='float32')
+out = fluid.layers.l2_normalize(x=emb, axis=-1)
+
+cost = fluid.layers.square_error_cost(input=out, label=label)
+avg_cost = fluid.layers.mean(cost)
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+place = fluid.CUDAPlace(0)
+exe = fluid.Executor(place)
+exe.run(fluid.default_startup_program())
+result = exe.run(fluid.default_main_program(), fetch_list=[avg_cost])
+```
+
+在显存足够的情况下，可不必进行这样的设置。
+
+## 如何减少数据传输
+### 使用profile工具确认是否发生了数据传输
+首先对模型的性能数据进行分析，找到发生数据传输的原因。如下列代码所示，可以利用[profile](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/profiler_cn.html)工具进行分析。
+
+```python
+import paddle.fluid as fluid
+import paddle.fluid.compiler as compiler
+import paddle.fluid.profiler as profiler
+
+data1 = fluid.layers.fill_constant(shape=[1, 3, 8, 8], value=0.5, dtype='float32')
+data2 = fluid.layers.fill_constant(shape=[1, 3, 5, 5], value=0.5, dtype='float32')
+shape = fluid.layers.shape(data2)
+shape = fluid.layers.slice(shape, axes=[0], starts=[0], ends=[4])
+out = fluid.layers.crop_tensor(data1, shape=shape)
+place = fluid.CUDAPlace(0)
+exe = fluid.Executor(place)
+exe.run(fluid.default_startup_program())
+compiled_prog = compiler.CompiledProgram(fluid.default_main_program())
+with profiler.profiler('All', 'total') as prof:
+    for i in range(10):
+        result = exe.run(program=compiled_prog, fetch_list=[out])
+```
+
+在程序运行结束后，将会自动地打印出profile report。在下面的profile report中，可以看到`GpuMemCpy Summary`中给出了2项数据传输的调用耗时。在OP执行过程中，如果输入Tensor所在的设备与OP执行的设备不同，就会发生`GpuMemcpySync`，通常我们可以直接优化的就是这一项。进一步分析，可以看到`slice`和`crop_tensor`执行中都发生了`GpuMemcpySync`。尽管我们在程序中设置了GPU模式运行，但是框架中有些OP，例如shape，会将输出结果放在CPU上。
+
+```text
+------------------------->     Profiling Report     <-------------------------
+
+Note! This Report merge all thread info into one.
+Place: All
+Time unit: ms
+Sorted by total time in descending order in the same thread
+
+Total time: 26.6328
+  Computation time       Total: 13.3133     Ratio: 49.9884%
+  Framework overhead     Total: 13.3195     Ratio: 50.0116%
+
+-------------------------     GpuMemCpy Summary     -------------------------
+
+GpuMemcpy                Calls: 30          Total: 1.47508     Ratio: 5.5386%
+  GpuMemcpyAsync         Calls: 10          Total: 0.443514    Ratio: 1.66529%
+  GpuMemcpySync          Calls: 20          Total: 1.03157     Ratio: 3.87331%
+
+-------------------------       Event Summary       -------------------------
+
+Event                                                       Calls       Total       CPU Time (Ratio)        GPU Time (Ratio)        Min.        Max.        Ave.        Ratio.
+FastThreadedSSAGraphExecutorPrepare                         10          9.16493     9.152509 (0.998645)     0.012417 (0.001355)     0.025192    8.85968     0.916493    0.344122
+shape                                                       10          8.33057     8.330568 (1.000000)     0.000000 (0.000000)     0.030711    7.99849     0.833057    0.312793
+fill_constant                                               20          4.06097     4.024522 (0.991025)     0.036449 (0.008975)     0.075087    0.888959    0.203049    0.15248
+slice                                                       10          1.78033     1.750439 (0.983212)     0.029888 (0.016788)     0.148503    0.290851    0.178033    0.0668471
+  GpuMemcpySync:CPU->GPU                                    10          0.45524     0.446312 (0.980388)     0.008928 (0.019612)     0.039089    0.060694    0.045524    0.0170932
+crop_tensor                                                 10          1.67658     1.620542 (0.966578)     0.056034 (0.033422)     0.143906    0.258776    0.167658    0.0629515
+  GpuMemcpySync:GPU->CPU                                    10          0.57633     0.552906 (0.959357)     0.023424 (0.040643)     0.050657    0.076322    0.057633    0.0216398
+Fetch                                                       10          0.919361    0.895201 (0.973721)     0.024160 (0.026279)     0.082935    0.138122    0.0919361   0.0345199
+  GpuMemcpyAsync:GPU->CPU                                   10          0.443514    0.419354 (0.945526)     0.024160 (0.054474)     0.040639    0.059673    0.0443514   0.0166529
+ScopeBufferedMonitor::post_local_exec_scopes_process        10          0.341999    0.341999 (1.000000)     0.000000 (0.000000)     0.028436    0.057134    0.0341999   0.0128413
+eager_deletion                                              30          0.287236    0.287236 (1.000000)     0.000000 (0.000000)     0.005452    0.022696    0.00957453  0.010785
+ScopeBufferedMonitor::pre_local_exec_scopes_process         10          0.047864    0.047864 (1.000000)     0.000000 (0.000000)     0.003668    0.011592    0.0047864   0.00179718
+InitLocalVars                                               1           0.022981    0.022981 (1.000000)     0.000000 (0.000000)     0.022981    0.022981    0.022981    0.000862883
+```
+### 通过log查看发生数据传输的具体位置
+
+以上的示例程序比较简单，我们只用看profile report就能知道具体是哪些算子发生了数据传输。但是当模型比较复杂时，可能需要去查看更加详细的调试信息，可以打印出运行时的log去确定发生数据传输的具体位置。依然以上述程序为例，执行`GLOG_vmodule=operator=3 python test_case.py`，会得到如下log信息，会发现发生了2次数据传输：
+
+- `shape`输出的结果在CPU上，在`slice`运行时，`shape`的输出被拷贝到GPU上。
+- `slice`执行完的结果在GPU上，当`crop_tensor`执行时，它会被拷贝到CPU上。
+
+```text
+I0406 14:56:23.286592 17516 operator.cc:180] CUDAPlace(0) Op(shape), inputs:{Input[fill_constant_1.tmp_0:float[1, 3, 5, 5]({})]}, outputs:{Out[shape_0.tmp_0:int[4]({})]}.
+I0406 14:56:23.286628 17516 eager_deletion_op_handle.cc:107] Erase variable fill_constant_1.tmp_0 on CUDAPlace(0)
+I0406 14:56:23.286725 17516 operator.cc:1210] Transform Variable shape_0.tmp_0 from data_type[int]:data_layout[NCHW]:place[CPUPlace]:library_type[PLAIN] to data_type[int]:data_layout[ANY_LAYOUT]:place[CUDAPlace(0)]:library_type[PLAIN]
+I0406 14:56:23.286763 17516 scope.cc:169] Create variable shape_0.tmp_0
+I0406 14:56:23.286784 17516 data_device_transform.cc:21] DeviceTransform in, src_place CPUPlace dst_place: CUDAPlace(0)
+I0406 14:56:23.286867 17516 tensor_util.cu:129] TensorCopySync 4 from CPUPlace to CUDAPlace(0)
+I0406 14:56:23.287099 17516 operator.cc:180] CUDAPlace(0) Op(slice), inputs:{EndsTensor[], EndsTensorList[], Input[shape_0.tmp_0:int[4]({})], StartsTensor[], StartsTensorList[]}, outputs:{Out[slice_0.tmp_0:int[4]({})]}.
+I0406 14:56:23.287140 17516 eager_deletion_op_handle.cc:107] Erase variable shape_0.tmp_0 on CUDAPlace(0)
+I0406 14:56:23.287220 17516 tensor_util.cu:129] TensorCopySync 4 from CUDAPlace(0) to CPUPlace
+I0406 14:56:23.287473 17516 operator.cc:180] CUDAPlace(0) Op(crop_tensor), inputs:{Offsets[], OffsetsTensor[], Shape[slice_0.tmp_0:int[4]({})], ShapeTensor[], X[fill_constant_0.tmp_0:float[1, 3, 8, 8]({})]}, outputs:{Out[crop_tensor_0.tmp_0:float[1, 3, 5, 5]({})]}.
+```
+
+### 使用device_guard避免不必要的数据传输
+
+在上面的例子中，`shape`输出的是一个1-D的Tensor，因此对于`slice`而言计算量很小。这种情况下如果将`slice`设置在CPU上运行，就可以避免2次数据传输。修改后的程序如下：
+
+```python
+import paddle.fluid as fluid
+import paddle.fluid.compiler as compiler
+import paddle.fluid.profiler as profiler
+
+data1 = fluid.layers.fill_constant(shape=[1, 3, 8, 8], value=0.5, dtype='float32')
+data2 = fluid.layers.fill_constant(shape=[1, 3, 5, 5], value=0.5, dtype='float32')
+shape = fluid.layers.shape(data2)
+with fluid.device_guard("cpu"):
+    shape = fluid.layers.slice(shape, axes=[0], starts=[0], ends=[4])
+out = fluid.layers.crop_tensor(data1, shape=shape)
+place = fluid.CUDAPlace(0)
+exe = fluid.Executor(place)
+exe.run(fluid.default_startup_program())
+compiled_prog = compiler.CompiledProgram(fluid.default_main_program())
+with profiler.profiler('All', 'total') as prof:
+    for i in range(10):
+        result = exe.run(program=compiled_prog, fetch_list=[out])
+```
+再次观察profile report中`GpuMemCpy Summary`的内容，可以看到`GpuMemcpySync`已经被消除。在实际的模型中，若`GpuMemcpySync`调用耗时占比较大，并且可以通过设置`device_guard`避免，那么就能够带来一定的性能提升。
+
+```text
+------------------------->     Profiling Report     <-------------------------
+
+Note! This Report merge all thread info into one.
+Place: All
+Time unit: ms
+Sorted by total time in descending order in the same thread
+
+Total time: 14.5345
+  Computation time       Total: 4.47587     Ratio: 30.7948%
+  Framework overhead     Total: 10.0586     Ratio: 69.2052%
+
+-------------------------     GpuMemCpy Summary     -------------------------
+
+GpuMemcpy                Calls: 10          Total: 0.457033    Ratio: 3.14447%
+  GpuMemcpyAsync         Calls: 10          Total: 0.457033    Ratio: 3.14447%
+
+-------------------------       Event Summary       -------------------------
+
+Event                                                       Calls       Total       CPU Time (Ratio)        GPU Time (Ratio)        Min.        Max.        Ave.        Ratio.
+FastThreadedSSAGraphExecutorPrepare                         10          7.70113     7.689066 (0.998433)     0.012064 (0.001567)     0.032657    7.39363     0.770113    0.529852
+fill_constant                                               20          2.62299     2.587022 (0.986287)     0.035968 (0.013713)     0.071097    0.342082    0.13115     0.180466
+shape                                                       10          1.93504     1.935040 (1.000000)     0.000000 (0.000000)     0.026774    1.6016      0.193504    0.133134
+Fetch                                                       10          0.880496    0.858512 (0.975032)     0.021984 (0.024968)     0.07392     0.140896    0.0880496   0.0605797
+  GpuMemcpyAsync:GPU->CPU                                   10          0.457033    0.435049 (0.951898)     0.021984 (0.048102)     0.037836    0.071424    0.0457033   0.0314447
+crop_tensor                                                 10          0.705426    0.671506 (0.951916)     0.033920 (0.048084)     0.05841     0.123901    0.0705426   0.0485346
+slice                                                       10          0.324241    0.324241 (1.000000)     0.000000 (0.000000)     0.024299    0.07213     0.0324241   0.0223084
+eager_deletion                                              30          0.250524    0.250524 (1.000000)     0.000000 (0.000000)     0.004171    0.016235    0.0083508   0.0172365
+ScopeBufferedMonitor::post_local_exec_scopes_process        10          0.047794    0.047794 (1.000000)     0.000000 (0.000000)     0.003344    0.014131    0.0047794   0.00328831
+InitLocalVars                                               1           0.034629    0.034629 (1.000000)     0.000000 (0.000000)     0.034629    0.034629    0.034629    0.00238254
+ScopeBufferedMonitor::pre_local_exec_scopes_process         10          0.032231    0.032231 (1.000000)     0.000000 (0.000000)     0.002952    0.004076    0.0032231   0.00221755
+```
+
+### 总结
+
+- 使用profile工具对模型进行分析，看是否存在GpuMemcpySync的调用耗时。若存在，则进一步分析发生数据传输的原因。
+- 可以通过profile report找到发生GpuMemcpySync的OP。如果需要，可以通过打印log，找到GpuMemcpySync发生的具体位置。
+- 尝试使用`device_guard`设置部分OP的运行设备，来减少GpuMemcpySync的调用。
+- 最后可以通过比较修改前后模型的profile report，或者其他用来衡量性能的指标，确认修改后是否带来了性能提升。
--- a/doc/fluid/advanced_guide/performance_improving/index_cn.rst
+++ b/doc/fluid/advanced_guide/performance_improving/index_cn.rst
@@ -7,6 +7,8 @@

    singlenode_training_improving/training_best_practice.rst
    singlenode_training_improving/memory_optimize.rst
+    device_switching/device_switching.md
+    amp/amp.md
    multinode_training_improving/cpu_train_best_practice.rst
    multinode_training_improving/dist_training_gpu.rst
    multinode_training_improving/gpu_training_with_recompute.rst

--- a/doc/fluid/advanced_guide/performance_improving/index_en.rst
+++ b/doc/fluid/advanced_guide/performance_improving/index_en.rst
@@ -5,7 +5,7 @@ Practice Improving
 ..  toctree::
    :maxdepth: 1

-
+    singlenode_training_improving/memory_optimize_en.rst
    multinode_training_improving/cpu_train_best_practice_en.rst
    multinode_training_improving/gpu_training_with_recompute_en.rst
    inference_improving/paddle_tensorrt_infer_en.md

--- a/doc/fluid/advanced_guide/performance_improving/multinode_training_improving/dist_training_gpu.rst
+++ b/doc/fluid/advanced_guide/performance_improving/multinode_training_improving/dist_training_gpu.rst
--- a/doc/fluid/advanced_guide/performance_improving/singlenode_training_improving/memory_optimize.rst
+++ b/doc/fluid/advanced_guide/performance_improving/singlenode_training_improving/memory_optimize.rst
--- a/doc/fluid/advanced_guide/performance_improving/singlenode_training_improving/memory_optimize_en.rst
+++ b/doc/fluid/advanced_guide/performance_improving/singlenode_training_improving/memory_optimize_en.rst
--- a/doc/fluid/advanced_guide/performance_improving/singlenode_training_improving/training_best_practice.rst
+++ b/doc/fluid/advanced_guide/performance_improving/singlenode_training_improving/training_best_practice.rst
--- a/doc/fluid/api/dataset.rst
+++ b/doc/fluid/api/dataset.rst
 =============
-fluid.dataset
+paddle.dataset
 =============

 ..  toctree::

--- a/doc/fluid/api/declarative.rst
+++ b/doc/fluid/api/declarative.rst
+=======================
+paddle.declarative
+=======================
+
+..  toctree::
+	:maxdepth: 1
+
+	declarative/batch_norm.rst
+	declarative/bilinear_tensor_product.rst
+	declarative/conv2d.rst
+	declarative/conv2d_transpose.rst
+	declarative/conv3d.rst
+	declarative/conv3d_transpose.rst
+	declarative/create_parameter.rst
+	declarative/crf_decoding.rst
+	declarative/data_norm.rst
+	declarative/deformable_conv.rst
+	declarative/embedding.rst
+	declarative/fc.rst
+	declarative/group_norm.rst
+	declarative/hsigmoid.rst
+	declarative/instance_norm.rst
+	declarative/layer_norm.rst
+	declarative/multi_box_head.rst
+	declarative/nce.rst
+	declarative/prelu.rst
+	declarative/row_conv.rst
+	declarative/spectral_norm.rst
--- a/doc/fluid/api/declarative/batch_norm.rst
+++ b/doc/fluid/api/declarative/batch_norm.rst
+.. _api_declarative_batch_norm:
+
+batch_norm
+-------------------------------
+:doc_source: paddle.fluid.layers.batch_norm
+
+
--- a/doc/fluid/api/declarative/bilinear_tensor_product.rst
+++ b/doc/fluid/api/declarative/bilinear_tensor_product.rst
+.. _api_declarative_bilinear_tensor_product:
+
+bilinear_tensor_product
+-------------------------------
+:doc_source: paddle.fluid.layers.bilinear_tensor_product
+
+
--- a/doc/fluid/api/declarative/conv2d.rst
+++ b/doc/fluid/api/declarative/conv2d.rst
+.. _api_declarative_conv2d:
+
+conv2d
+-------------------------------
+:doc_source: paddle.fluid.layers.conv2d
+
+
--- a/doc/fluid/api/declarative/conv2d_transpose.rst
+++ b/doc/fluid/api/declarative/conv2d_transpose.rst
+.. _api_declarative_conv2d_transpose:
+
+conv2d_transpose
+-------------------------------
+:doc_source: paddle.fluid.layers.conv2d_transpose
+
+
--- a/doc/fluid/api/declarative/conv3d.rst
+++ b/doc/fluid/api/declarative/conv3d.rst
+.. _api_declarative_conv3d:
+
+conv3d
+-------------------------------
+:doc_source: paddle.fluid.layers.conv3d
+
+
--- a/doc/fluid/api/declarative/conv3d_transpose.rst
+++ b/doc/fluid/api/declarative/conv3d_transpose.rst
+.. _api_declarative_conv3d_transpose:
+
+conv3d_transpose
+-------------------------------
+:doc_source: paddle.fluid.layers.conv3d_transpose
+
+
--- a/doc/fluid/api/declarative/create_parameter.rst
+++ b/doc/fluid/api/declarative/create_parameter.rst
+.. _api_declarative_create_parameter:
+
+create_parameter
+-------------------------------
+:doc_source: paddle.fluid.layers.create_parameter
+
+
--- a/doc/fluid/api/declarative/crf_decoding.rst
+++ b/doc/fluid/api/declarative/crf_decoding.rst
--- a/doc/fluid/api/declarative/data_norm.rst
+++ b/doc/fluid/api/declarative/data_norm.rst
--- a/doc/fluid/api/declarative/deformable_conv.rst
+++ b/doc/fluid/api/declarative/deformable_conv.rst
--- a/doc/fluid/api/declarative/embedding.rst
+++ b/doc/fluid/api/declarative/embedding.rst
--- a/doc/fluid/api/declarative/fc.rst
+++ b/doc/fluid/api/declarative/fc.rst
--- a/doc/fluid/api/declarative/group_norm.rst
+++ b/doc/fluid/api/declarative/group_norm.rst
--- a/doc/fluid/api/declarative/hsigmoid.rst
+++ b/doc/fluid/api/declarative/hsigmoid.rst
--- a/doc/fluid/api/declarative/instance_norm.rst
+++ b/doc/fluid/api/declarative/instance_norm.rst
--- a/doc/fluid/api/declarative/layer_norm.rst
+++ b/doc/fluid/api/declarative/layer_norm.rst
--- a/doc/fluid/api/declarative/multi_box_head.rst
+++ b/doc/fluid/api/declarative/multi_box_head.rst
--- a/doc/fluid/api/declarative/nce.rst
+++ b/doc/fluid/api/declarative/nce.rst
--- a/doc/fluid/api/declarative/prelu.rst
+++ b/doc/fluid/api/declarative/prelu.rst
--- a/doc/fluid/api/declarative/row_conv.rst
+++ b/doc/fluid/api/declarative/row_conv.rst
--- a/doc/fluid/api/declarative/spectral_norm.rst
+++ b/doc/fluid/api/declarative/spectral_norm.rst
--- a/doc/fluid/api/distributed.rst
+++ b/doc/fluid/api/distributed.rst
--- a/doc/fluid/api/distributed/ParallelEnv.rst
+++ b/doc/fluid/api/distributed/ParallelEnv.rst
--- a/doc/fluid/api/distributed/get_rank.rst
+++ b/doc/fluid/api/distributed/get_rank.rst
--- a/doc/fluid/api/distributed/get_world_size.rst
+++ b/doc/fluid/api/distributed/get_world_size.rst
--- a/doc/fluid/api/distributed/init_parallel_env.rst
+++ b/doc/fluid/api/distributed/init_parallel_env.rst
--- a/doc/fluid/api/distributed/prepare_context.rst
+++ b/doc/fluid/api/distributed/prepare_context.rst
--- a/doc/fluid/api/distributed/spawn.rst
+++ b/doc/fluid/api/distributed/spawn.rst
--- a/doc/fluid/api/dygraph.rst
+++ b/doc/fluid/api/dygraph.rst
--- a/doc/fluid/api/dygraph/BackwardStrategy.rst
+++ b/doc/fluid/api/dygraph/BackwardStrategy.rst
--- a/doc/fluid/api/dygraph/DataParallel.rst
+++ b/doc/fluid/api/dygraph/DataParallel.rst
--- a/doc/fluid/api/dygraph/Dropout.rst
+++ b/doc/fluid/api/dygraph/Dropout.rst
--- a/doc/fluid/api/dygraph/GRUCell.rst
+++ b/doc/fluid/api/dygraph/GRUCell.rst
--- a/doc/fluid/api/dygraph/InstanceNorm.rst
+++ b/doc/fluid/api/dygraph/InstanceNorm.rst
--- a/doc/fluid/api/dygraph/LSTMCell.rst
+++ b/doc/fluid/api/dygraph/LSTMCell.rst
--- a/doc/fluid/api/dygraph/ParallelEnv.rst
+++ b/doc/fluid/api/dygraph/ParallelEnv.rst
--- a/doc/fluid/api/dygraph/ReduceLROnPlateau.rst
+++ b/doc/fluid/api/dygraph/ReduceLROnPlateau.rst
--- a/doc/fluid/api/dygraph/TranslatedLayer.rst
+++ b/doc/fluid/api/dygraph/TranslatedLayer.rst
--- a/doc/fluid/api/dygraph/disable_dygraph.rst
+++ b/doc/fluid/api/dygraph/disable_dygraph.rst
--- a/doc/fluid/api/dygraph/dygraph_to_static_code.rst
+++ b/doc/fluid/api/dygraph/dygraph_to_static_code.rst
--- a/doc/fluid/api/dygraph/dygraph_to_static_func.rst
+++ b/doc/fluid/api/dygraph/dygraph_to_static_func.rst
--- a/doc/fluid/api/dygraph/dygraph_to_static_program.rst
+++ b/doc/fluid/api/dygraph/dygraph_to_static_program.rst
--- a/doc/fluid/api/dygraph/enable_dygraph.rst
+++ b/doc/fluid/api/dygraph/enable_dygraph.rst
--- a/doc/fluid/api/dygraph/enabled.rst
+++ b/doc/fluid/api/dygraph/enabled.rst
--- a/doc/fluid/api/dygraph/grad.rst
+++ b/doc/fluid/api/dygraph/grad.rst
--- a/doc/fluid/api/dygraph/jit.rst
+++ b/doc/fluid/api/dygraph/jit.rst
--- a/doc/fluid/api/dygraph/jit/SaveLoadConfig.rst
+++ b/doc/fluid/api/dygraph/jit/SaveLoadConfig.rst
--- a/doc/fluid/api/dygraph/jit/load.rst
+++ b/doc/fluid/api/dygraph/jit/load.rst
--- a/doc/fluid/api/dygraph/jit/save.rst
+++ b/doc/fluid/api/dygraph/jit/save.rst
--- a/doc/fluid/api/fluid.rst
+++ b/doc/fluid/api/fluid.rst
--- a/doc/fluid/api/fluid/device_guard.rst
+++ b/doc/fluid/api/fluid/device_guard.rst
--- a/doc/fluid/api/fluid/disable_dygraph.rst
+++ b/doc/fluid/api/fluid/disable_dygraph.rst
--- a/doc/fluid/api/fluid/enable_dygraph.rst
+++ b/doc/fluid/api/fluid/enable_dygraph.rst
--- a/doc/fluid/api/fluid/get_flags.rst
+++ b/doc/fluid/api/fluid/get_flags.rst
--- a/doc/fluid/api/fluid/set_flags.rst
+++ b/doc/fluid/api/fluid/set_flags.rst
--- a/doc/fluid/api/framework.rst
+++ b/doc/fluid/api/framework.rst
--- a/doc/fluid/api/framework/BuildStrategy.rst
+++ b/doc/fluid/api/framework/BuildStrategy.rst
--- a/doc/fluid/api/framework/CPUPlace.rst
+++ b/doc/fluid/api/framework/CPUPlace.rst
--- a/doc/fluid/api/framework/CUDAPinnedPlace.rst
+++ b/doc/fluid/api/framework/CUDAPinnedPlace.rst
--- a/doc/fluid/api/framework/CUDAPlace.rst
+++ b/doc/fluid/api/framework/CUDAPlace.rst
--- a/doc/fluid/api/framework/CompiledProgram.rst
+++ b/doc/fluid/api/framework/CompiledProgram.rst
--- a/doc/fluid/api/framework/ExecutionStrateg y.rst
+++ b/doc/fluid/api/framework/ExecutionStrateg y.rst
--- a/doc/fluid/api/framework/Executor.rst
+++ b/doc/fluid/api/framework/Executor.rst
--- a/doc/fluid/api/framework/ParallelExecutor.rst
+++ b/doc/fluid/api/framework/ParallelExecutor.rst
--- a/doc/fluid/api/framework/ParamAttr.rst
+++ b/doc/fluid/api/framework/ParamAttr.rst
--- a/doc/fluid/api/framework/Print.rst
+++ b/doc/fluid/api/framework/Print.rst
--- a/doc/fluid/api/framework/Program.rst
+++ b/doc/fluid/api/framework/Program.rst
--- a/doc/fluid/api/framework/Variable.rst
+++ b/doc/fluid/api/framework/Variable.rst
--- a/doc/fluid/api/framework/WeightNormParamAttr.rst
+++ b/doc/fluid/api/framework/WeightNormParamAttr.rst
--- a/doc/fluid/api/framework/append_backward.rst
+++ b/doc/fluid/api/framework/append_backward.rst
--- a/doc/fluid/api/framework/create_global_var.rst
+++ b/doc/fluid/api/framework/create_global_var.rst
--- a/doc/fluid/api/framework/create_parameter.rst
+++ b/doc/fluid/api/framework/create_parameter.rst
--- a/doc/fluid/api/framework/default_main_program.rst
+++ b/doc/fluid/api/framework/default_main_program.rst
--- a/doc/fluid/api/framework/default_startup_program.rst
+++ b/doc/fluid/api/framework/default_startup_program.rst
--- a/doc/fluid/api/framework/global_scope.rst
+++ b/doc/fluid/api/framework/global_scope.rst
--- a/doc/fluid/api/framework/gradients.rst
+++ b/doc/fluid/api/framework/gradients.rst
--- a/doc/fluid/api/framework/name_scope.rst
+++ b/doc/fluid/api/framework/name_scope.rst
--- a/doc/fluid/api/framework/program_guard.rst
+++ b/doc/fluid/api/framework/program_guard.rst
--- a/doc/fluid/api/framework/py_func.rst
+++ b/doc/fluid/api/framework/py_func.rst
--- a/doc/fluid/api/framework/scope_guard.rst
+++ b/doc/fluid/api/framework/scope_guard.rst
--- a/doc/fluid/api/gen_doc.py
+++ b/doc/fluid/api/gen_doc.py
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
--- a/doc/fluid/api/gen_index.py
+++ b/doc/fluid/api/gen_index.py
--- a/doc/fluid/api/imperative.rst
+++ b/doc/fluid/api/imperative.rst
--- a/doc/fluid/api/imperative/CosineDecay.rst
+++ b/doc/fluid/api/imperative/CosineDecay.rst
--- a/doc/fluid/api/imperative/DataParallel.rst
+++ b/doc/fluid/api/imperative/DataParallel.rst
--- a/doc/fluid/api/imperative/ExponentialDecay.rst
+++ b/doc/fluid/api/imperative/ExponentialDecay.rst
--- a/doc/fluid/api/imperative/InverseTimeDecay.rst
+++ b/doc/fluid/api/imperative/InverseTimeDecay.rst
--- a/doc/fluid/api/imperative/NaturalExpDecay.rst
+++ b/doc/fluid/api/imperative/NaturalExpDecay.rst
--- a/doc/fluid/api/imperative/NoamDecay.rst
+++ b/doc/fluid/api/imperative/NoamDecay.rst
--- a/doc/fluid/api/imperative/ParallelEnv.rst
+++ b/doc/fluid/api/imperative/ParallelEnv.rst
--- a/doc/fluid/api/imperative/PiecewiseDecay.rst
+++ b/doc/fluid/api/imperative/PiecewiseDecay.rst
--- a/doc/fluid/api/imperative/PolynomialDecay.rst
+++ b/doc/fluid/api/imperative/PolynomialDecay.rst
--- a/doc/fluid/api/imperative/ProgramTranslator.rst
+++ b/doc/fluid/api/imperative/ProgramTranslator.rst
--- a/doc/fluid/api/imperative/TracedLayer.rst
+++ b/doc/fluid/api/imperative/TracedLayer.rst
--- a/doc/fluid/api/imperative/TranslatedLayer.rst
+++ b/doc/fluid/api/imperative/TranslatedLayer.rst
--- a/doc/fluid/api/imperative/declarative.rst
+++ b/doc/fluid/api/imperative/declarative.rst
--- a/doc/fluid/api/imperative/enabled.rst
+++ b/doc/fluid/api/imperative/enabled.rst
--- a/doc/fluid/api/imperative/grad.rst
+++ b/doc/fluid/api/imperative/grad.rst
--- a/doc/fluid/api/imperative/guard.rst
+++ b/doc/fluid/api/imperative/guard.rst
--- a/doc/fluid/api/imperative/jit.rst
+++ b/doc/fluid/api/imperative/jit.rst
--- a/doc/fluid/api/imperative/jit/SaveLoadConfig.rst
+++ b/doc/fluid/api/imperative/jit/SaveLoadConfig.rst
--- a/doc/fluid/api/imperative/jit/load.rst
+++ b/doc/fluid/api/imperative/jit/load.rst
--- a/doc/fluid/api/imperative/jit/save.rst
+++ b/doc/fluid/api/imperative/jit/save.rst
--- a/doc/fluid/api/imperative/load.rst
+++ b/doc/fluid/api/imperative/load.rst
--- a/doc/fluid/api/imperative/no_grad.rst
+++ b/doc/fluid/api/imperative/no_grad.rst
--- a/doc/fluid/api/imperative/prepare_context.rst
+++ b/doc/fluid/api/imperative/prepare_context.rst
--- a/doc/fluid/api/imperative/save.rst
+++ b/doc/fluid/api/imperative/save.rst
--- a/doc/fluid/api/imperative/to_variable.rst
+++ b/doc/fluid/api/imperative/to_variable.rst
--- a/doc/fluid/api/index_en.rst
+++ b/doc/fluid/api/index_en.rst
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
--- a/doc/fluid/api/io/BatchSampler.rst
+++ b/doc/fluid/api/io/BatchSampler.rst
--- a/doc/fluid/api/io/ComposeNotAligned.rst
+++ b/doc/fluid/api/io/ComposeNotAligned.rst
--- a/doc/fluid/api/io/Dataset.rst
+++ b/doc/fluid/api/io/Dataset.rst
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
--- a/doc/fluid/api/layers/BasicDecoder.rst
+++ b/doc/fluid/api/layers/BasicDecoder.rst
--- a/doc/fluid/api/layers/DecodeHelper.rst
+++ b/doc/fluid/api/layers/DecodeHelper.rst
--- a/doc/fluid/api/layers/GreedyEmbeddingHelper.rst
+++ b/doc/fluid/api/layers/GreedyEmbeddingHelper.rst
--- a/doc/fluid/api/layers/SampleEmbeddingHelper.rst
+++ b/doc/fluid/api/layers/SampleEmbeddingHelper.rst
--- a/doc/fluid/api/layers/TrainingHelper.rst
+++ b/doc/fluid/api/layers/TrainingHelper.rst
--- a/doc/fluid/api/layers/inplace_abn.rst
+++ b/doc/fluid/api/layers/inplace_abn.rst
--- a/doc/fluid/api/layers/matrix_nms.rst
+++ b/doc/fluid/api/layers/matrix_nms.rst
--- a/doc/fluid/api/metric.rst
+++ b/doc/fluid/api/metric.rst
--- a/doc/fluid/api/metric/ChunkEvaluator.rst
+++ b/doc/fluid/api/metric/ChunkEvaluator.rst
--- a/doc/fluid/api/metric/CompositeMetric.rst
+++ b/doc/fluid/api/metric/CompositeMetric.rst
--- a/doc/fluid/api/metric/DetectionMAP.rst
+++ b/doc/fluid/api/metric/DetectionMAP.rst
--- a/doc/fluid/api/metric/EditDistance.rst
+++ b/doc/fluid/api/metric/EditDistance.rst
--- a/doc/fluid/api/metric/Precision.rst
+++ b/doc/fluid/api/metric/Precision.rst
--- a/doc/fluid/api/metric/Recall.rst
+++ b/doc/fluid/api/metric/Recall.rst
--- a/doc/fluid/api/metric/accuracy.rst
+++ b/doc/fluid/api/metric/accuracy.rst
--- a/doc/fluid/api/metric/auc.rst
+++ b/doc/fluid/api/metric/auc.rst
--- a/doc/fluid/api/metric/chunk_eval.rst
+++ b/doc/fluid/api/metric/chunk_eval.rst
--- a/doc/fluid/api/metric/cos_sim.rst
+++ b/doc/fluid/api/metric/cos_sim.rst
--- a/doc/fluid/api/metric/mean_iou.rst
+++ b/doc/fluid/api/metric/mean_iou.rst
--- a/doc/fluid/api/nn.rst
+++ b/doc/fluid/api/nn.rst
--- a/doc/fluid/api/nn/AdaptiveAvgPool2d.rst
+++ b/doc/fluid/api/nn/AdaptiveAvgPool2d.rst
--- a/doc/fluid/api/nn/AdaptiveAvgPool3d.rst
+++ b/doc/fluid/api/nn/AdaptiveAvgPool3d.rst
--- a/doc/fluid/api/nn/BatchNorm.rst
+++ b/doc/fluid/api/nn/BatchNorm.rst
--- a/doc/fluid/api/nn/Bilinear.rst
+++ b/doc/fluid/api/nn/Bilinear.rst
--- a/doc/fluid/api/nn/BilinearTensorProduct.rst
+++ b/doc/fluid/api/nn/BilinearTensorProduct.rst
--- a/doc/fluid/api/nn/ConstantPad1d.rst
+++ b/doc/fluid/api/nn/ConstantPad1d.rst
--- a/doc/fluid/api/nn/ConstantPad2d.rst
+++ b/doc/fluid/api/nn/ConstantPad2d.rst
--- a/doc/fluid/api/nn/ConstantPad3d.rst
+++ b/doc/fluid/api/nn/ConstantPad3d.rst
--- a/doc/fluid/api/nn/Conv2d.rst
+++ b/doc/fluid/api/nn/Conv2d.rst
--- a/doc/fluid/api/nn/Conv3d.rst
+++ b/doc/fluid/api/nn/Conv3d.rst
--- a/doc/fluid/api/nn/ConvTranspose2d.rst
+++ b/doc/fluid/api/nn/ConvTranspose2d.rst
--- a/doc/fluid/api/nn/ConvTranspose3d.rst
+++ b/doc/fluid/api/nn/ConvTranspose3d.rst
--- a/doc/fluid/api/nn/CosineSimilarity.rst
+++ b/doc/fluid/api/nn/CosineSimilarity.rst
--- a/doc/fluid/api/nn/Embedding.rst
+++ b/doc/fluid/api/nn/Embedding.rst
--- a/doc/fluid/api/nn/GradientClipByGlobalNorm.rst
+++ b/doc/fluid/api/nn/GradientClipByGlobalNorm.rst
--- a/doc/fluid/api/nn/GradientClipByNorm.rst
+++ b/doc/fluid/api/nn/GradientClipByNorm.rst
--- a/doc/fluid/api/nn/GradientClipByValue.rst
+++ b/doc/fluid/api/nn/GradientClipByValue.rst
--- a/doc/fluid/api/nn/GroupNorm.rst
+++ b/doc/fluid/api/nn/GroupNorm.rst
--- a/doc/fluid/api/nn/Layer.rst
+++ b/doc/fluid/api/nn/Layer.rst
--- a/doc/fluid/api/nn/LayerList.rst
+++ b/doc/fluid/api/nn/LayerList.rst
--- a/doc/fluid/api/nn/LayerNorm.rst
+++ b/doc/fluid/api/nn/LayerNorm.rst
--- a/doc/fluid/api/nn/Linear.rst
+++ b/doc/fluid/api/nn/Linear.rst
--- a/doc/fluid/api/nn/ParameterList.rst
+++ b/doc/fluid/api/nn/ParameterList.rst
--- a/doc/fluid/api/nn/Pool2D.rst
+++ b/doc/fluid/api/nn/Pool2D.rst
--- a/doc/fluid/api/nn/ReLU.rst
+++ b/doc/fluid/api/nn/ReLU.rst
--- a/doc/fluid/api/nn/ReflectionPad1d.rst
+++ b/doc/fluid/api/nn/ReflectionPad1d.rst
--- a/doc/fluid/api/nn/ReflectionPad2d.rst
+++ b/doc/fluid/api/nn/ReflectionPad2d.rst
--- a/doc/fluid/api/nn/ReplicationPad1d.rst
+++ b/doc/fluid/api/nn/ReplicationPad1d.rst
--- a/doc/fluid/api/nn/ReplicationPad2d.rst
+++ b/doc/fluid/api/nn/ReplicationPad2d.rst
--- a/doc/fluid/api/nn/ReplicationPad3d.rst
+++ b/doc/fluid/api/nn/ReplicationPad3d.rst
--- a/doc/fluid/api/nn/Sequential.rst
+++ b/doc/fluid/api/nn/Sequential.rst
--- a/doc/fluid/api/nn/SpectralNorm.rst
+++ b/doc/fluid/api/nn/SpectralNorm.rst
--- a/doc/fluid/api/nn/ZeroPad2d.rst
+++ b/doc/fluid/api/nn/ZeroPad2d.rst
--- a/doc/fluid/api/nn/activation.rst
+++ b/doc/fluid/api/nn/activation.rst
--- a/doc/fluid/api/nn/activation/ELU.rst
+++ b/doc/fluid/api/nn/activation/ELU.rst
--- a/doc/fluid/api/nn/activation/GELU.rst
+++ b/doc/fluid/api/nn/activation/GELU.rst
--- a/doc/fluid/api/nn/activation/Hardshrink.rst
+++ b/doc/fluid/api/nn/activation/Hardshrink.rst
--- a/doc/fluid/api/nn/activation/Hardtanh.rst
+++ b/doc/fluid/api/nn/activation/Hardtanh.rst
--- a/doc/fluid/api/nn/activation/LogSigmoid.rst
+++ b/doc/fluid/api/nn/activation/LogSigmoid.rst
--- a/doc/fluid/api/nn/activation/PReLU.rst
+++ b/doc/fluid/api/nn/activation/PReLU.rst
--- a/doc/fluid/api/nn/activation/ReLU.rst
+++ b/doc/fluid/api/nn/activation/ReLU.rst
--- a/doc/fluid/api/nn/activation/ReLU6.rst
+++ b/doc/fluid/api/nn/activation/ReLU6.rst
--- a/doc/fluid/api/nn/activation/SELU.rst
+++ b/doc/fluid/api/nn/activation/SELU.rst
--- a/doc/fluid/api/nn/activation/Softmax.rst
+++ b/doc/fluid/api/nn/activation/Softmax.rst
--- a/doc/fluid/api/nn/activation/Softplus.rst
+++ b/doc/fluid/api/nn/activation/Softplus.rst
--- a/doc/fluid/api/nn/activation/Softshrink.rst
+++ b/doc/fluid/api/nn/activation/Softshrink.rst
--- a/doc/fluid/api/nn/activation/Softsign.rst
+++ b/doc/fluid/api/nn/activation/Softsign.rst
--- a/doc/fluid/api/nn/activation/Tanh.rst
+++ b/doc/fluid/api/nn/activation/Tanh.rst
--- a/doc/fluid/api/nn/activation/Tanhshrink.rst
+++ b/doc/fluid/api/nn/activation/Tanhshrink.rst
--- a/doc/fluid/api/nn/adaptive_pool2d.rst
+++ b/doc/fluid/api/nn/adaptive_pool2d.rst
--- a/doc/fluid/api/nn/adaptive_pool3d.rst
+++ b/doc/fluid/api/nn/adaptive_pool3d.rst
--- a/doc/fluid/api/nn/add_position_encoding.rst
+++ b/doc/fluid/api/nn/add_position_encoding.rst
--- a/doc/fluid/api/nn/affine_channel.rst
+++ b/doc/fluid/api/nn/affine_channel.rst
--- a/doc/fluid/api/nn/affine_grid.rst
+++ b/doc/fluid/api/nn/affine_grid.rst
--- a/doc/fluid/api/nn/anchor_generator.rst
+++ b/doc/fluid/api/nn/anchor_generator.rst
--- a/doc/fluid/api/nn/assign.rst
+++ b/doc/fluid/api/nn/assign.rst
--- a/doc/fluid/api/nn/beam_search.rst
+++ b/doc/fluid/api/nn/beam_search.rst
--- a/doc/fluid/api/nn/beam_search_decode.rst
+++ b/doc/fluid/api/nn/beam_search_decode.rst
--- a/doc/fluid/api/nn/bipartite_match.rst
+++ b/doc/fluid/api/nn/bipartite_match.rst
--- a/doc/fluid/api/nn/box_clip.rst
+++ b/doc/fluid/api/nn/box_clip.rst
--- a/doc/fluid/api/nn/box_coder.rst
+++ b/doc/fluid/api/nn/box_coder.rst
--- a/doc/fluid/api/nn/box_decoder_and_assign.rst
+++ b/doc/fluid/api/nn/box_decoder_and_assign.rst
--- a/doc/fluid/api/nn/bpr_loss.rst
+++ b/doc/fluid/api/nn/bpr_loss.rst
--- a/doc/fluid/api/nn/brelu.rst
+++ b/doc/fluid/api/nn/brelu.rst
--- a/doc/fluid/api/nn/case.rst
+++ b/doc/fluid/api/nn/case.rst
--- a/doc/fluid/api/nn/center_loss.rst
+++ b/doc/fluid/api/nn/center_loss.rst
--- a/doc/fluid/api/nn/clip.rst
+++ b/doc/fluid/api/nn/clip.rst
--- a/doc/fluid/api/nn/clip_by_norm.rst
+++ b/doc/fluid/api/nn/clip_by_norm.rst
--- a/doc/fluid/api/nn/collect_fpn_proposals.rst
+++ b/doc/fluid/api/nn/collect_fpn_proposals.rst
--- a/doc/fluid/api/nn/cond.rst
+++ b/doc/fluid/api/nn/cond.rst
--- a/doc/fluid/api/nn/continuous_value_model.rst
+++ b/doc/fluid/api/nn/continuous_value_model.rst
--- a/doc/fluid/api/nn/cosine_decay.rst
+++ b/doc/fluid/api/nn/cosine_decay.rst
--- a/doc/fluid/api/nn/cosine_similarity.rst
+++ b/doc/fluid/api/nn/cosine_similarity.rst
--- a/doc/fluid/api/nn/cross_entropy.rst
+++ b/doc/fluid/api/nn/cross_entropy.rst
--- a/doc/fluid/api/nn/data.rst
+++ b/doc/fluid/api/nn/data.rst
--- a/doc/fluid/api/nn/deformable_roi_pooling.rst
+++ b/doc/fluid/api/nn/deformable_roi_pooling.rst
--- a/doc/fluid/api/nn/density_prior_box.rst
+++ b/doc/fluid/api/nn/density_prior_box.rst
--- a/doc/fluid/api/nn/detection_output.rst
+++ b/doc/fluid/api/nn/detection_output.rst
--- a/doc/fluid/api/nn/dice_loss.rst
+++ b/doc/fluid/api/nn/dice_loss.rst
--- a/doc/fluid/api/nn/distribute_fpn_proposals.rst
+++ b/doc/fluid/api/nn/distribute_fpn_proposals.rst
--- a/doc/fluid/api/nn/dropout.rst
+++ b/doc/fluid/api/nn/dropout.rst
--- a/doc/fluid/api/nn/edit_distance.rst
+++ b/doc/fluid/api/nn/edit_distance.rst
--- a/doc/fluid/api/nn/elu.rst
+++ b/doc/fluid/api/nn/elu.rst
--- a/doc/fluid/api/nn/erf.rst
+++ b/doc/fluid/api/nn/erf.rst
--- a/doc/fluid/api/nn/exponential_decay.rst
+++ b/doc/fluid/api/nn/exponential_decay.rst
--- a/doc/fluid/api/nn/filter_by_instag.rst
+++ b/doc/fluid/api/nn/filter_by_instag.rst
--- a/doc/fluid/api/nn/fsp_matrix.rst
+++ b/doc/fluid/api/nn/fsp_matrix.rst
--- a/doc/fluid/api/nn/functional.rst
+++ b/doc/fluid/api/nn/functional.rst
--- a/doc/fluid/api/nn/functional/activation/sigmoid.rst
+++ b/doc/fluid/api/nn/functional/activation/sigmoid.rst
--- a/doc/fluid/api/nn/functional/adaptive_avg_pool2d.rst
+++ b/doc/fluid/api/nn/functional/adaptive_avg_pool2d.rst
--- a/doc/fluid/api/nn/functional/adaptive_avg_pool3d.rst
+++ b/doc/fluid/api/nn/functional/adaptive_avg_pool3d.rst
--- a/doc/fluid/api/nn/functional/bilinear.rst
+++ b/doc/fluid/api/nn/functional/bilinear.rst
--- a/doc/fluid/api/nn/functional/binary_cross_entropy.rst
+++ b/doc/fluid/api/nn/functional/binary_cross_entropy.rst
--- a/doc/fluid/api/nn/functional/binary_cross_entropy_with_logits.rst
+++ b/doc/fluid/api/nn/functional/binary_cross_entropy_with_logits.rst
--- a/doc/fluid/api/nn/functional/conv2d.rst
+++ b/doc/fluid/api/nn/functional/conv2d.rst
--- a/doc/fluid/api/nn/functional/conv3d.rst
+++ b/doc/fluid/api/nn/functional/conv3d.rst
--- a/doc/fluid/api/nn/functional/conv_transpose2d.rst
+++ b/doc/fluid/api/nn/functional/conv_transpose2d.rst
--- a/doc/fluid/api/nn/functional/conv_transpose3d.rst
+++ b/doc/fluid/api/nn/functional/conv_transpose3d.rst
--- a/doc/fluid/api/nn/functional/ctc_loss.rst
+++ b/doc/fluid/api/nn/functional/ctc_loss.rst
--- a/doc/fluid/api/nn/functional/kl_div.rst
+++ b/doc/fluid/api/nn/functional/kl_div.rst
--- a/doc/fluid/api/nn/functional/l1_loss.rst
+++ b/doc/fluid/api/nn/functional/l1_loss.rst
--- a/doc/fluid/api/nn/functional/loss/margin_ranking_loss.rst
+++ b/doc/fluid/api/nn/functional/loss/margin_ranking_loss.rst
--- a/doc/fluid/api/nn/functional/mse_loss.rst
+++ b/doc/fluid/api/nn/functional/mse_loss.rst
--- a/doc/fluid/api/nn/functional/nll_loss.rst
+++ b/doc/fluid/api/nn/functional/nll_loss.rst
--- a/doc/fluid/api/nn/functional/one_hot.rst
+++ b/doc/fluid/api/nn/functional/one_hot.rst
--- a/doc/fluid/api/nn/gather_tree.rst
+++ b/doc/fluid/api/nn/gather_tree.rst
--- a/doc/fluid/api/nn/gelu.rst
+++ b/doc/fluid/api/nn/gelu.rst
--- a/doc/fluid/api/nn/generate_mask_labels.rst
+++ b/doc/fluid/api/nn/generate_mask_labels.rst
--- a/doc/fluid/api/nn/generate_proposal_labels.rst
+++ b/doc/fluid/api/nn/generate_proposal_labels.rst
--- a/doc/fluid/api/nn/generate_proposals.rst
+++ b/doc/fluid/api/nn/generate_proposals.rst
--- a/doc/fluid/api/nn/grid_sampler.rst
+++ b/doc/fluid/api/nn/grid_sampler.rst
--- a/doc/fluid/api/nn/hard_sigmoid.rst
+++ b/doc/fluid/api/nn/hard_sigmoid.rst
--- a/doc/fluid/api/nn/hard_swish.rst
+++ b/doc/fluid/api/nn/hard_swish.rst
--- a/doc/fluid/api/nn/hardshrink.rst
+++ b/doc/fluid/api/nn/hardshrink.rst
--- a/doc/fluid/api/nn/hardtanh.rst
+++ b/doc/fluid/api/nn/hardtanh.rst
--- a/doc/fluid/api/nn/hash.rst
+++ b/doc/fluid/api/nn/hash.rst
--- a/doc/fluid/api/nn/hsigmoid.rst
+++ b/doc/fluid/api/nn/hsigmoid.rst
--- a/doc/fluid/api/nn/huber_loss.rst
+++ b/doc/fluid/api/nn/huber_loss.rst
--- a/doc/fluid/api/nn/image_resize.rst
+++ b/doc/fluid/api/nn/image_resize.rst
--- a/doc/fluid/api/nn/image_resize_short.rst
+++ b/doc/fluid/api/nn/image_resize_short.rst
--- a/doc/fluid/api/nn/initializer.rst
+++ b/doc/fluid/api/nn/initializer.rst
--- a/doc/fluid/api/nn/initializer/Bilinear.rst
+++ b/doc/fluid/api/nn/initializer/Bilinear.rst
--- a/doc/fluid/api/nn/initializer/Constant.rst
+++ b/doc/fluid/api/nn/initializer/Constant.rst
--- a/doc/fluid/api/nn/initializer/MSRA.rst
+++ b/doc/fluid/api/nn/initializer/MSRA.rst
--- a/doc/fluid/api/nn/initializer/Normal.rst
+++ b/doc/fluid/api/nn/initializer/Normal.rst
--- a/doc/fluid/api/nn/initializer/TruncatedNormal.rst
+++ b/doc/fluid/api/nn/initializer/TruncatedNormal.rst
--- a/doc/fluid/api/nn/initializer/Uniform.rst
+++ b/doc/fluid/api/nn/initializer/Uniform.rst
--- a/doc/fluid/api/nn/initializer/Xavier.rst
+++ b/doc/fluid/api/nn/initializer/Xavier.rst
--- a/doc/fluid/api/nn/inverse_time_decay.rst
+++ b/doc/fluid/api/nn/inverse_time_decay.rst
--- a/doc/fluid/api/nn/iou_similarity.rst
+++ b/doc/fluid/api/nn/iou_similarity.rst
--- a/doc/fluid/api/nn/kldiv_loss.rst
+++ b/doc/fluid/api/nn/kldiv_loss.rst
--- a/doc/fluid/api/nn/l2_normalize.rst
+++ b/doc/fluid/api/nn/l2_normalize.rst
--- a/doc/fluid/api/nn/label_smooth.rst
+++ b/doc/fluid/api/nn/label_smooth.rst
--- a/doc/fluid/api/nn/layer/activation/Sigmoid.rst
+++ b/doc/fluid/api/nn/layer/activation/Sigmoid.rst
--- a/doc/fluid/api/nn/layer/loss/MarginRankingLoss.rst
+++ b/doc/fluid/api/nn/layer/loss/MarginRankingLoss.rst
--- a/doc/fluid/api/nn/leaky_relu.rst
+++ b/doc/fluid/api/nn/leaky_relu.rst
--- a/doc/fluid/api/nn/linear_lr_warmup.rst
+++ b/doc/fluid/api/nn/linear_lr_warmup.rst
--- a/doc/fluid/api/nn/log_loss.rst
+++ b/doc/fluid/api/nn/log_loss.rst
--- a/doc/fluid/api/nn/log_sigmoid.rst
+++ b/doc/fluid/api/nn/log_sigmoid.rst
--- a/doc/fluid/api/nn/log_softmax.rst
+++ b/doc/fluid/api/nn/log_softmax.rst
--- a/doc/fluid/api/nn/loss.rst
+++ b/doc/fluid/api/nn/loss.rst
--- a/doc/fluid/api/nn/loss/BCELoss.rst
+++ b/doc/fluid/api/nn/loss/BCELoss.rst
--- a/doc/fluid/api/nn/loss/BCEWithLogitsLoss.rst
+++ b/doc/fluid/api/nn/loss/BCEWithLogitsLoss.rst
--- a/doc/fluid/api/nn/loss/CTCLoss.rst
+++ b/doc/fluid/api/nn/loss/CTCLoss.rst
--- a/doc/fluid/api/nn/loss/KLDivLoss.rst
+++ b/doc/fluid/api/nn/loss/KLDivLoss.rst
--- a/doc/fluid/api/nn/loss/L1Loss.rst
+++ b/doc/fluid/api/nn/loss/L1Loss.rst
--- a/doc/fluid/api/nn/loss/NLLLoss.rst
+++ b/doc/fluid/api/nn/loss/NLLLoss.rst
--- a/doc/fluid/api/nn/loss/SmoothL1Loss.rst
+++ b/doc/fluid/api/nn/loss/SmoothL1Loss.rst
--- a/doc/fluid/api/nn/lrn.rst
+++ b/doc/fluid/api/nn/lrn.rst
--- a/doc/fluid/api/nn/matrix_nms.rst
+++ b/doc/fluid/api/nn/matrix_nms.rst
--- a/doc/fluid/api/nn/maxout.rst
+++ b/doc/fluid/api/nn/maxout.rst
--- a/doc/fluid/api/nn/mse_loss.rst
+++ b/doc/fluid/api/nn/mse_loss.rst
--- a/doc/fluid/api/nn/multiclass_nms.rst
+++ b/doc/fluid/api/nn/multiclass_nms.rst
--- a/doc/fluid/api/nn/natural_exp_decay.rst
+++ b/doc/fluid/api/nn/natural_exp_decay.rst
--- a/doc/fluid/api/nn/noam_decay.rst
+++ b/doc/fluid/api/nn/noam_decay.rst
--- a/doc/fluid/api/nn/npair_loss.rst
+++ b/doc/fluid/api/nn/npair_loss.rst
--- a/doc/fluid/api/nn/one_hot.rst
+++ b/doc/fluid/api/nn/one_hot.rst
--- a/doc/fluid/api/nn/pad.rst
+++ b/doc/fluid/api/nn/pad.rst
--- a/doc/fluid/api/nn/pad2d.rst
+++ b/doc/fluid/api/nn/pad2d.rst
--- a/doc/fluid/api/nn/pad_constant_like.rst
+++ b/doc/fluid/api/nn/pad_constant_like.rst
--- a/doc/fluid/api/nn/piecewise_decay.rst
+++ b/doc/fluid/api/nn/piecewise_decay.rst
--- a/doc/fluid/api/nn/pixel_shuffle.rst
+++ b/doc/fluid/api/nn/pixel_shuffle.rst
--- a/doc/fluid/api/nn/polygon_box_transform.rst
+++ b/doc/fluid/api/nn/polygon_box_transform.rst
--- a/doc/fluid/api/nn/polynomial_decay.rst
+++ b/doc/fluid/api/nn/polynomial_decay.rst
--- a/doc/fluid/api/nn/pool3d.rst
+++ b/doc/fluid/api/nn/pool3d.rst
--- a/doc/fluid/api/nn/prelu.rst
+++ b/doc/fluid/api/nn/prelu.rst
--- a/doc/fluid/api/nn/prior_box.rst
+++ b/doc/fluid/api/nn/prior_box.rst
--- a/doc/fluid/api/nn/prroi_pool.rst
+++ b/doc/fluid/api/nn/prroi_pool.rst
--- a/doc/fluid/api/nn/psroi_pool.rst
+++ b/doc/fluid/api/nn/psroi_pool.rst
--- a/doc/fluid/api/nn/random_crop.rst
+++ b/doc/fluid/api/nn/random_crop.rst
--- a/doc/fluid/api/nn/rank_loss.rst
+++ b/doc/fluid/api/nn/rank_loss.rst
--- a/doc/fluid/api/nn/relu.rst
+++ b/doc/fluid/api/nn/relu.rst
--- a/doc/fluid/api/nn/relu6.rst
+++ b/doc/fluid/api/nn/relu6.rst
--- a/doc/fluid/api/nn/resize_bilinear.rst
+++ b/doc/fluid/api/nn/resize_bilinear.rst
--- a/doc/fluid/api/nn/resize_nearest.rst
+++ b/doc/fluid/api/nn/resize_nearest.rst
--- a/doc/fluid/api/nn/resize_trilinear.rst
+++ b/doc/fluid/api/nn/resize_trilinear.rst
--- a/doc/fluid/api/nn/retinanet_detection_output.rst
+++ b/doc/fluid/api/nn/retinanet_detection_output.rst
--- a/doc/fluid/api/nn/retinanet_target_assign.rst
+++ b/doc/fluid/api/nn/retinanet_target_assign.rst
--- a/doc/fluid/api/nn/roi_align.rst
+++ b/doc/fluid/api/nn/roi_align.rst
--- a/doc/fluid/api/nn/roi_perspective_transform.rst
+++ b/doc/fluid/api/nn/roi_perspective_transform.rst
--- a/doc/fluid/api/nn/roi_pool.rst
+++ b/doc/fluid/api/nn/roi_pool.rst
--- a/doc/fluid/api/nn/row_conv.rst
+++ b/doc/fluid/api/nn/row_conv.rst
--- a/doc/fluid/api/nn/rpn_target_assign.rst
+++ b/doc/fluid/api/nn/rpn_target_assign.rst
--- a/doc/fluid/api/nn/sampled_softmax_with_cross_entropy.rst
+++ b/doc/fluid/api/nn/sampled_softmax_with_cross_entropy.rst
--- a/doc/fluid/api/nn/selu.rst
+++ b/doc/fluid/api/nn/selu.rst
--- a/doc/fluid/api/nn/shuffle_channel.rst
+++ b/doc/fluid/api/nn/shuffle_channel.rst
--- a/doc/fluid/api/nn/sigmoid_cross_entropy_with_logits.rst
+++ b/doc/fluid/api/nn/sigmoid_cross_entropy_with_logits.rst
--- a/doc/fluid/api/nn/sigmoid_focal_loss.rst
+++ b/doc/fluid/api/nn/sigmoid_focal_loss.rst
--- a/doc/fluid/api/nn/similarity_focus.rst
+++ b/doc/fluid/api/nn/similarity_focus.rst
--- a/doc/fluid/api/nn/smooth_l1.rst
+++ b/doc/fluid/api/nn/smooth_l1.rst
--- a/doc/fluid/api/nn/soft_relu.rst
+++ b/doc/fluid/api/nn/soft_relu.rst
--- a/doc/fluid/api/nn/softmax.rst
+++ b/doc/fluid/api/nn/softmax.rst
--- a/doc/fluid/api/nn/softmax_with_cross_entropy.rst
+++ b/doc/fluid/api/nn/softmax_with_cross_entropy.rst
--- a/doc/fluid/api/nn/softplus.rst
+++ b/doc/fluid/api/nn/softplus.rst
--- a/doc/fluid/api/nn/softshrink.rst
+++ b/doc/fluid/api/nn/softshrink.rst
--- a/doc/fluid/api/nn/softsign.rst
+++ b/doc/fluid/api/nn/softsign.rst
--- a/doc/fluid/api/nn/space_to_depth.rst
+++ b/doc/fluid/api/nn/space_to_depth.rst
--- a/doc/fluid/api/nn/square_error_cost.rst
+++ b/doc/fluid/api/nn/square_error_cost.rst
--- a/doc/fluid/api/nn/ssd_loss.rst
+++ b/doc/fluid/api/nn/ssd_loss.rst
--- a/doc/fluid/api/nn/swish.rst
+++ b/doc/fluid/api/nn/swish.rst
--- a/doc/fluid/api/nn/switch_case.rst
+++ b/doc/fluid/api/nn/switch_case.rst
--- a/doc/fluid/api/nn/tanhshrink.rst
+++ b/doc/fluid/api/nn/tanhshrink.rst
--- a/doc/fluid/api/nn/target_assign.rst
+++ b/doc/fluid/api/nn/target_assign.rst
--- a/doc/fluid/api/nn/teacher_student_sigmoid_loss.rst
+++ b/doc/fluid/api/nn/teacher_student_sigmoid_loss.rst
--- a/doc/fluid/api/nn/temporal_shift.rst
+++ b/doc/fluid/api/nn/temporal_shift.rst
--- a/doc/fluid/api/nn/thresholded_relu.rst
+++ b/doc/fluid/api/nn/thresholded_relu.rst
--- a/doc/fluid/api/nn/unfold.rst
+++ b/doc/fluid/api/nn/unfold.rst
--- a/doc/fluid/api/nn/warpctc.rst
+++ b/doc/fluid/api/nn/warpctc.rst
--- a/doc/fluid/api/nn/while_loop.rst
+++ b/doc/fluid/api/nn/while_loop.rst
--- a/doc/fluid/api/nn/yolo_box.rst
+++ b/doc/fluid/api/nn/yolo_box.rst
--- a/doc/fluid/api/nn/yolov3_loss.rst
+++ b/doc/fluid/api/nn/yolov3_loss.rst
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
--- a/doc/fluid/api/optimizer/Adadelta.rst
+++ b/doc/fluid/api/optimizer/Adadelta.rst
--- a/doc/fluid/api/optimizer/AdadeltaOptimizer.rst
+++ b/doc/fluid/api/optimizer/AdadeltaOptimizer.rst
--- a/doc/fluid/api/optimizer/Adagrad.rst
+++ b/doc/fluid/api/optimizer/Adagrad.rst
--- a/doc/fluid/api/optimizer/AdagradOptimizer.rst
+++ b/doc/fluid/api/optimizer/AdagradOptimizer.rst
--- a/doc/fluid/api/optimizer/Adam.rst
+++ b/doc/fluid/api/optimizer/Adam.rst
--- a/doc/fluid/api/optimizer/AdamOptimizer.rst
+++ b/doc/fluid/api/optimizer/AdamOptimizer.rst
--- a/doc/fluid/api/optimizer/AdamW.rst
+++ b/doc/fluid/api/optimizer/AdamW.rst
--- a/doc/fluid/api/optimizer/Adamax.rst
+++ b/doc/fluid/api/optimizer/Adamax.rst
--- a/doc/fluid/api/optimizer/AdamaxOptimizer.rst
+++ b/doc/fluid/api/optimizer/AdamaxOptimizer.rst
--- a/doc/fluid/api/optimizer/CosineAnnealingLR.rst
+++ b/doc/fluid/api/optimizer/CosineAnnealingLR.rst
--- a/doc/fluid/api/optimizer/DGCMomentumOptimizer.rst
+++ b/doc/fluid/api/optimizer/DGCMomentumOptimizer.rst
--- a/doc/fluid/api/optimizer/DecayedAdagrad.rst
+++ b/doc/fluid/api/optimizer/DecayedAdagrad.rst
--- a/doc/fluid/api/optimizer/DecayedAdagradOptimizer.rst
+++ b/doc/fluid/api/optimizer/DecayedAdagradOptimizer.rst
--- a/doc/fluid/api/optimizer/Dpsgd.rst
+++ b/doc/fluid/api/optimizer/Dpsgd.rst
--- a/doc/fluid/api/optimizer/DpsgdOptimizer.rst
+++ b/doc/fluid/api/optimizer/DpsgdOptimizer.rst
--- a/doc/fluid/api/optimizer/ExponentialLR.rst
+++ b/doc/fluid/api/optimizer/ExponentialLR.rst
--- a/doc/fluid/api/optimizer/ExponentialMovingAverage.rst
+++ b/doc/fluid/api/optimizer/ExponentialMovingAverage.rst
--- a/doc/fluid/api/optimizer/Ftrl.rst
+++ b/doc/fluid/api/optimizer/Ftrl.rst
--- a/doc/fluid/api/optimizer/FtrlOptimizer.rst
+++ b/doc/fluid/api/optimizer/FtrlOptimizer.rst
--- a/doc/fluid/api/optimizer/InverseTimeLR.rst
+++ b/doc/fluid/api/optimizer/InverseTimeLR.rst
--- a/doc/fluid/api/optimizer/LambOptimizer.rst
+++ b/doc/fluid/api/optimizer/LambOptimizer.rst
--- a/doc/fluid/api/optimizer/LambdaLR.rst
+++ b/doc/fluid/api/optimizer/LambdaLR.rst
--- a/doc/fluid/api/optimizer/LarsMomentum.rst
+++ b/doc/fluid/api/optimizer/LarsMomentum.rst
--- a/doc/fluid/api/optimizer/LarsMomentumOptimizer.rst
+++ b/doc/fluid/api/optimizer/LarsMomentumOptimizer.rst
--- a/doc/fluid/api/optimizer/LinearLrWarmup.rst
+++ b/doc/fluid/api/optimizer/LinearLrWarmup.rst
--- a/doc/fluid/api/optimizer/LookaheadOptimizer.rst
+++ b/doc/fluid/api/optimizer/LookaheadOptimizer.rst
--- a/doc/fluid/api/optimizer/ModelAverage.rst
+++ b/doc/fluid/api/optimizer/ModelAverage.rst
--- a/doc/fluid/api/optimizer/Momentum.rst
+++ b/doc/fluid/api/optimizer/Momentum.rst
--- a/doc/fluid/api/optimizer/MomentumOptimizer.rst
+++ b/doc/fluid/api/optimizer/MomentumOptimizer.rst
--- a/doc/fluid/api/optimizer/MultiStepLR.rst
+++ b/doc/fluid/api/optimizer/MultiStepLR.rst
--- a/doc/fluid/api/optimizer/NaturalExpLR.rst
+++ b/doc/fluid/api/optimizer/NaturalExpLR.rst
--- a/doc/fluid/api/optimizer/NoamLR.rst
+++ b/doc/fluid/api/optimizer/NoamLR.rst
--- a/doc/fluid/api/optimizer/Optimizer.rst
+++ b/doc/fluid/api/optimizer/Optimizer.rst
--- a/doc/fluid/api/optimizer/PiecewiseLR.rst
+++ b/doc/fluid/api/optimizer/PiecewiseLR.rst
--- a/doc/fluid/api/optimizer/PipelineOptimizer.rst
+++ b/doc/fluid/api/optimizer/PipelineOptimizer.rst
--- a/doc/fluid/api/optimizer/PolynomialLR.rst
+++ b/doc/fluid/api/optimizer/PolynomialLR.rst
--- a/doc/fluid/api/optimizer/RMSProp.rst
+++ b/doc/fluid/api/optimizer/RMSProp.rst
--- a/doc/fluid/api/optimizer/RMSPropOptimizer.rst
+++ b/doc/fluid/api/optimizer/RMSPropOptimizer.rst
--- a/doc/fluid/api/optimizer/RecomputeOptimizer.rst
+++ b/doc/fluid/api/optimizer/RecomputeOptimizer.rst
--- a/doc/fluid/api/optimizer/ReduceLROnPlateau.rst
+++ b/doc/fluid/api/optimizer/ReduceLROnPlateau.rst
--- a/doc/fluid/api/optimizer/SGD.rst
+++ b/doc/fluid/api/optimizer/SGD.rst
--- a/doc/fluid/api/optimizer/SGDOptimizer.rst
+++ b/doc/fluid/api/optimizer/SGDOptimizer.rst
--- a/doc/fluid/api/optimizer/StepLR.rst
+++ b/doc/fluid/api/optimizer/StepLR.rst
--- a/doc/fluid/api/paddle.rst
+++ b/doc/fluid/api/paddle.rst
--- a/doc/fluid/api/paddle/BuildStrategy.rst
+++ b/doc/fluid/api/paddle/BuildStrategy.rst
--- a/doc/fluid/api/paddle/CPUPlace.rst
+++ b/doc/fluid/api/paddle/CPUPlace.rst
--- a/doc/fluid/api/paddle/CUDAPinnedPlace.rst
+++ b/doc/fluid/api/paddle/CUDAPinnedPlace.rst
--- a/doc/fluid/api/paddle/CUDAPlace.rst
+++ b/doc/fluid/api/paddle/CUDAPlace.rst
--- a/doc/fluid/api/paddle/CompiledProgram.rst
+++ b/doc/fluid/api/paddle/CompiledProgram.rst
--- a/doc/fluid/api/paddle/DataParallel.rst
+++ b/doc/fluid/api/paddle/DataParallel.rst
--- a/doc/fluid/api/paddle/ExecutionStrategy.rst
+++ b/doc/fluid/api/paddle/ExecutionStrategy.rst
--- a/doc/fluid/api/paddle/Executor.rst
+++ b/doc/fluid/api/paddle/Executor.rst
--- a/doc/fluid/api/paddle/ParallelExecutor.rst
+++ b/doc/fluid/api/paddle/ParallelExecutor.rst
--- a/doc/fluid/api/paddle/ParamAttr.rst
+++ b/doc/fluid/api/paddle/ParamAttr.rst
--- a/doc/fluid/api/paddle/Print.rst
+++ b/doc/fluid/api/paddle/Print.rst
--- a/doc/fluid/api/paddle/Program.rst
+++ b/doc/fluid/api/paddle/Program.rst
--- a/doc/fluid/api/paddle/Variable.rst
+++ b/doc/fluid/api/paddle/Variable.rst
--- a/doc/fluid/api/paddle/WeightNormParamAttr.rst
+++ b/doc/fluid/api/paddle/WeightNormParamAttr.rst
--- a/doc/fluid/api/paddle/abs.rst
+++ b/doc/fluid/api/paddle/abs.rst
--- a/doc/fluid/api/paddle/acos.rst
+++ b/doc/fluid/api/paddle/acos.rst
--- a/doc/fluid/api/paddle/add.rst
+++ b/doc/fluid/api/paddle/add.rst
--- a/doc/fluid/api/paddle/addcmul.rst
+++ b/doc/fluid/api/paddle/addcmul.rst
--- a/doc/fluid/api/paddle/addmm.rst
+++ b/doc/fluid/api/paddle/addmm.rst
--- a/doc/fluid/api/paddle/allclose.rst
+++ b/doc/fluid/api/paddle/allclose.rst
--- a/doc/fluid/api/paddle/append_backward.rst
+++ b/doc/fluid/api/paddle/append_backward.rst
--- a/doc/fluid/api/paddle/arange.rst
+++ b/doc/fluid/api/paddle/arange.rst
--- a/doc/fluid/api/paddle/argmax.rst
+++ b/doc/fluid/api/paddle/argmax.rst
--- a/doc/fluid/api/paddle/argmin.rst
+++ b/doc/fluid/api/paddle/argmin.rst
--- a/doc/fluid/api/paddle/argsort.rst
+++ b/doc/fluid/api/paddle/argsort.rst
--- a/doc/fluid/api/paddle/asin.rst
+++ b/doc/fluid/api/paddle/asin.rst
--- a/doc/fluid/api/paddle/atan.rst
+++ b/doc/fluid/api/paddle/atan.rst
--- a/doc/fluid/api/paddle/bmm.rst
+++ b/doc/fluid/api/paddle/bmm.rst
--- a/doc/fluid/api/paddle/cast.rst
+++ b/doc/fluid/api/paddle/cast.rst
--- a/doc/fluid/api/paddle/ceil.rst
+++ b/doc/fluid/api/paddle/ceil.rst
--- a/doc/fluid/api/paddle/cholesky.rst
+++ b/doc/fluid/api/paddle/cholesky.rst
--- a/doc/fluid/api/paddle/clamp.rst
+++ b/doc/fluid/api/paddle/clamp.rst
--- a/doc/fluid/api/paddle/concat.rst
+++ b/doc/fluid/api/paddle/concat.rst
--- a/doc/fluid/api/paddle/cos.rst
+++ b/doc/fluid/api/paddle/cos.rst
--- a/doc/fluid/api/paddle/create_global_var.rst
+++ b/doc/fluid/api/paddle/create_global_var.rst
--- a/doc/fluid/api/paddle/create_parameter.rst
+++ b/doc/fluid/api/paddle/create_parameter.rst
--- a/doc/fluid/api/paddle/create_tensor.rst
+++ b/doc/fluid/api/paddle/create_tensor.rst
--- a/doc/fluid/api/paddle/crop_tensor.rst
+++ b/doc/fluid/api/paddle/crop_tensor.rst
--- a/doc/fluid/api/paddle/cross.rst
+++ b/doc/fluid/api/paddle/cross.rst
--- a/doc/fluid/api/paddle/cumsum.rst
+++ b/doc/fluid/api/paddle/cumsum.rst
--- a/doc/fluid/api/paddle/default_main_program.rst
+++ b/doc/fluid/api/paddle/default_main_program.rst
--- a/doc/fluid/api/paddle/default_startup_program.rst
+++ b/doc/fluid/api/paddle/default_startup_program.rst
--- a/doc/fluid/api/paddle/diag.rst
+++ b/doc/fluid/api/paddle/diag.rst
--- a/doc/fluid/api/paddle/disable_imperative.rst
+++ b/doc/fluid/api/paddle/disable_imperative.rst
--- a/doc/fluid/api/paddle/dist.rst
+++ b/doc/fluid/api/paddle/dist.rst
--- a/doc/fluid/api/paddle/distribution.rst
+++ b/doc/fluid/api/paddle/distribution.rst
--- a/doc/fluid/api/paddle/distribution/Distribution.rst
+++ b/doc/fluid/api/paddle/distribution/Distribution.rst
--- a/doc/fluid/api/paddle/distribution/Normal.rst
+++ b/doc/fluid/api/paddle/distribution/Normal.rst
--- a/doc/fluid/api/paddle/distribution/Uniform.rst
+++ b/doc/fluid/api/paddle/distribution/Uniform.rst
--- a/doc/fluid/api/paddle/div.rst
+++ b/doc/fluid/api/paddle/div.rst
--- a/doc/fluid/api/paddle/dot.rst
+++ b/doc/fluid/api/paddle/dot.rst
--- a/doc/fluid/api/paddle/elementwise_add.rst
+++ b/doc/fluid/api/paddle/elementwise_add.rst
--- a/doc/fluid/api/paddle/elementwise_div.rst
+++ b/doc/fluid/api/paddle/elementwise_div.rst
--- a/doc/fluid/api/paddle/elementwise_floordiv.rst
+++ b/doc/fluid/api/paddle/elementwise_floordiv.rst
--- a/doc/fluid/api/paddle/elementwise_mod.rst
+++ b/doc/fluid/api/paddle/elementwise_mod.rst
--- a/doc/fluid/api/paddle/elementwise_mul.rst
+++ b/doc/fluid/api/paddle/elementwise_mul.rst
--- a/doc/fluid/api/paddle/elementwise_pow.rst
+++ b/doc/fluid/api/paddle/elementwise_pow.rst
--- a/doc/fluid/api/paddle/elementwise_sub.rst
+++ b/doc/fluid/api/paddle/elementwise_sub.rst
--- a/doc/fluid/api/paddle/elementwise_sum.rst
+++ b/doc/fluid/api/paddle/elementwise_sum.rst
--- a/doc/fluid/api/paddle/enable_imperative.rst
+++ b/doc/fluid/api/paddle/enable_imperative.rst
--- a/doc/fluid/api/paddle/equal.rst
+++ b/doc/fluid/api/paddle/equal.rst
--- a/doc/fluid/api/paddle/equal_all.rst
+++ b/doc/fluid/api/paddle/equal_all.rst
--- a/doc/fluid/api/paddle/erf.rst
+++ b/doc/fluid/api/paddle/erf.rst
--- a/doc/fluid/api/paddle/exp.rst
+++ b/doc/fluid/api/paddle/exp.rst
--- a/doc/fluid/api/paddle/expand.rst
+++ b/doc/fluid/api/paddle/expand.rst
--- a/doc/fluid/api/paddle/expand_as.rst
+++ b/doc/fluid/api/paddle/expand_as.rst
--- a/doc/fluid/api/paddle/eye.rst
+++ b/doc/fluid/api/paddle/eye.rst
--- a/doc/fluid/api/paddle/fill_constant.rst
+++ b/doc/fluid/api/paddle/fill_constant.rst
--- a/doc/fluid/api/paddle/flatten.rst
+++ b/doc/fluid/api/paddle/flatten.rst
--- a/doc/fluid/api/paddle/flip.rst
+++ b/doc/fluid/api/paddle/flip.rst
--- a/doc/fluid/api/paddle/floor.rst
+++ b/doc/fluid/api/paddle/floor.rst
--- a/doc/fluid/api/paddle/full.rst
+++ b/doc/fluid/api/paddle/full.rst
--- a/doc/fluid/api/paddle/full_like.rst
+++ b/doc/fluid/api/paddle/full_like.rst
--- a/doc/fluid/api/paddle/gather.rst
+++ b/doc/fluid/api/paddle/gather.rst
--- a/doc/fluid/api/paddle/gather_nd.rst
+++ b/doc/fluid/api/paddle/gather_nd.rst
--- a/doc/fluid/api/paddle/global_scope.rst
+++ b/doc/fluid/api/paddle/global_scope.rst
--- a/doc/fluid/api/paddle/gradients.rst
+++ b/doc/fluid/api/paddle/gradients.rst
--- a/doc/fluid/api/paddle/greater_equal.rst
+++ b/doc/fluid/api/paddle/greater_equal.rst
--- a/doc/fluid/api/paddle/greater_than.rst
+++ b/doc/fluid/api/paddle/greater_than.rst
--- a/doc/fluid/api/paddle/has_inf.rst
+++ b/doc/fluid/api/paddle/has_inf.rst
--- a/doc/fluid/api/paddle/has_nan.rst
+++ b/doc/fluid/api/paddle/has_nan.rst
--- a/doc/fluid/api/paddle/in_imperative_mode.rst
+++ b/doc/fluid/api/paddle/in_imperative_mode.rst
--- a/doc/fluid/api/paddle/increment.rst
+++ b/doc/fluid/api/paddle/increment.rst
--- a/doc/fluid/api/paddle/index_sample.rst
+++ b/doc/fluid/api/paddle/index_sample.rst
--- a/doc/fluid/api/paddle/index_select.rst
+++ b/doc/fluid/api/paddle/index_select.rst
--- a/doc/fluid/api/paddle/inverse.rst
+++ b/doc/fluid/api/paddle/inverse.rst
--- a/doc/fluid/api/paddle/is_empty.rst
+++ b/doc/fluid/api/paddle/is_empty.rst
--- a/doc/fluid/api/paddle/isfinite.rst
+++ b/doc/fluid/api/paddle/isfinite.rst
--- a/doc/fluid/api/paddle/kron.rst
+++ b/doc/fluid/api/paddle/kron.rst
--- a/doc/fluid/api/paddle/less_equal.rst
+++ b/doc/fluid/api/paddle/less_equal.rst
--- a/doc/fluid/api/paddle/less_than.rst
+++ b/doc/fluid/api/paddle/less_than.rst
--- a/doc/fluid/api/paddle/linspace.rst
+++ b/doc/fluid/api/paddle/linspace.rst
--- a/doc/fluid/api/paddle/load.rst
+++ b/doc/fluid/api/paddle/load.rst
--- a/doc/fluid/api/paddle/log.rst
+++ b/doc/fluid/api/paddle/log.rst
--- a/doc/fluid/api/paddle/log1p.rst
+++ b/doc/fluid/api/paddle/log1p.rst
--- a/doc/fluid/api/paddle/logical_and.rst
+++ b/doc/fluid/api/paddle/logical_and.rst
--- a/doc/fluid/api/paddle/logical_not.rst
+++ b/doc/fluid/api/paddle/logical_not.rst
--- a/doc/fluid/api/paddle/logical_or.rst
+++ b/doc/fluid/api/paddle/logical_or.rst
--- a/doc/fluid/api/paddle/logical_xor.rst
+++ b/doc/fluid/api/paddle/logical_xor.rst
--- a/doc/fluid/api/paddle/logsumexp.rst
+++ b/doc/fluid/api/paddle/logsumexp.rst
--- a/doc/fluid/api/paddle/manual_seed.rst
+++ b/doc/fluid/api/paddle/manual_seed.rst
--- a/doc/fluid/api/paddle/matmul.rst
+++ b/doc/fluid/api/paddle/matmul.rst
--- a/doc/fluid/api/paddle/max.rst
+++ b/doc/fluid/api/paddle/max.rst
--- a/doc/fluid/api/paddle/maximum.rst
+++ b/doc/fluid/api/paddle/maximum.rst
--- a/doc/fluid/api/paddle/mean.rst
+++ b/doc/fluid/api/paddle/mean.rst
--- a/doc/fluid/api/paddle/meshgrid.rst
+++ b/doc/fluid/api/paddle/meshgrid.rst
--- a/doc/fluid/api/paddle/min.rst
+++ b/doc/fluid/api/paddle/min.rst
--- a/doc/fluid/api/paddle/minimum.rst
+++ b/doc/fluid/api/paddle/minimum.rst
--- a/doc/fluid/api/paddle/mm.rst
+++ b/doc/fluid/api/paddle/mm.rst
--- a/doc/fluid/api/paddle/mul.rst
+++ b/doc/fluid/api/paddle/mul.rst
--- a/doc/fluid/api/paddle/multiplex.rst
+++ b/doc/fluid/api/paddle/multiplex.rst
--- a/doc/fluid/api/paddle/name_scope.rst
+++ b/doc/fluid/api/paddle/name_scope.rst
--- a/doc/fluid/api/paddle/nonzero.rst
+++ b/doc/fluid/api/paddle/nonzero.rst
--- a/doc/fluid/api/paddle/norm.rst
+++ b/doc/fluid/api/paddle/norm.rst
--- a/doc/fluid/api/paddle/not_equal.rst
+++ b/doc/fluid/api/paddle/not_equal.rst
--- a/doc/fluid/api/paddle/ones.rst
+++ b/doc/fluid/api/paddle/ones.rst
--- a/doc/fluid/api/paddle/ones_like.rst
+++ b/doc/fluid/api/paddle/ones_like.rst
--- a/doc/fluid/api/paddle/pow.rst
+++ b/doc/fluid/api/paddle/pow.rst
--- a/doc/fluid/api/paddle/program_guard.rst
+++ b/doc/fluid/api/paddle/program_guard.rst
--- a/doc/fluid/api/paddle/py_func.rst
+++ b/doc/fluid/api/paddle/py_func.rst
--- a/doc/fluid/api/paddle/rand.rst
+++ b/doc/fluid/api/paddle/rand.rst
--- a/doc/fluid/api/paddle/randint.rst
+++ b/doc/fluid/api/paddle/randint.rst
--- a/doc/fluid/api/paddle/randn.rst
+++ b/doc/fluid/api/paddle/randn.rst
--- a/doc/fluid/api/paddle/randperm.rst
+++ b/doc/fluid/api/paddle/randperm.rst
--- a/doc/fluid/api/paddle/rank.rst
+++ b/doc/fluid/api/paddle/rank.rst
--- a/doc/fluid/api/paddle/reciprocal.rst
+++ b/doc/fluid/api/paddle/reciprocal.rst
--- a/doc/fluid/api/paddle/reduce_all.rst
+++ b/doc/fluid/api/paddle/reduce_all.rst
--- a/doc/fluid/api/paddle/reduce_any.rst
+++ b/doc/fluid/api/paddle/reduce_any.rst
--- a/doc/fluid/api/paddle/reduce_max.rst
+++ b/doc/fluid/api/paddle/reduce_max.rst
--- a/doc/fluid/api/paddle/reduce_mean.rst
+++ b/doc/fluid/api/paddle/reduce_mean.rst
--- a/doc/fluid/api/paddle/reduce_min.rst
+++ b/doc/fluid/api/paddle/reduce_min.rst
--- a/doc/fluid/api/paddle/reduce_prod.rst
+++ b/doc/fluid/api/paddle/reduce_prod.rst
--- a/doc/fluid/api/paddle/reduce_sum.rst
+++ b/doc/fluid/api/paddle/reduce_sum.rst
--- a/doc/fluid/api/paddle/reshape.rst
+++ b/doc/fluid/api/paddle/reshape.rst
--- a/doc/fluid/api/paddle/reverse.rst
+++ b/doc/fluid/api/paddle/reverse.rst
--- a/doc/fluid/api/paddle/roll.rst
+++ b/doc/fluid/api/paddle/roll.rst
--- a/doc/fluid/api/paddle/round.rst
+++ b/doc/fluid/api/paddle/round.rst
--- a/doc/fluid/api/paddle/rsqrt.rst
+++ b/doc/fluid/api/paddle/rsqrt.rst
--- a/doc/fluid/api/paddle/save.rst
+++ b/doc/fluid/api/paddle/save.rst
--- a/doc/fluid/api/paddle/scale.rst
+++ b/doc/fluid/api/paddle/scale.rst
--- a/doc/fluid/api/paddle/scatter.rst
+++ b/doc/fluid/api/paddle/scatter.rst
--- a/doc/fluid/api/paddle/scatter_nd.rst
+++ b/doc/fluid/api/paddle/scatter_nd.rst
--- a/doc/fluid/api/paddle/scatter_nd_add.rst
+++ b/doc/fluid/api/paddle/scatter_nd_add.rst
--- a/doc/fluid/api/paddle/scope_guard.rst
+++ b/doc/fluid/api/paddle/scope_guard.rst
--- a/doc/fluid/api/paddle/shape.rst
+++ b/doc/fluid/api/paddle/shape.rst
--- a/doc/fluid/api/paddle/shard_index.rst
+++ b/doc/fluid/api/paddle/shard_index.rst
--- a/doc/fluid/api/paddle/shuffle.rst
+++ b/doc/fluid/api/paddle/shuffle.rst
--- a/doc/fluid/api/paddle/sign.rst
+++ b/doc/fluid/api/paddle/sign.rst
--- a/doc/fluid/api/paddle/sin.rst
+++ b/doc/fluid/api/paddle/sin.rst
--- a/doc/fluid/api/paddle/slice.rst
+++ b/doc/fluid/api/paddle/slice.rst
--- a/doc/fluid/api/paddle/sort.rst
+++ b/doc/fluid/api/paddle/sort.rst
--- a/doc/fluid/api/paddle/split.rst
+++ b/doc/fluid/api/paddle/split.rst
--- a/doc/fluid/api/paddle/sqrt.rst
+++ b/doc/fluid/api/paddle/sqrt.rst
--- a/doc/fluid/api/paddle/square.rst
+++ b/doc/fluid/api/paddle/square.rst
--- a/doc/fluid/api/paddle/squeeze.rst
+++ b/doc/fluid/api/paddle/squeeze.rst
--- a/doc/fluid/api/paddle/stack.rst
+++ b/doc/fluid/api/paddle/stack.rst
--- a/doc/fluid/api/paddle/stanh.rst
+++ b/doc/fluid/api/paddle/stanh.rst
--- a/doc/fluid/api/paddle/std.rst
+++ b/doc/fluid/api/paddle/std.rst
--- a/doc/fluid/api/paddle/strided_slice.rst
+++ b/doc/fluid/api/paddle/strided_slice.rst
--- a/doc/fluid/api/paddle/sum.rst
+++ b/doc/fluid/api/paddle/sum.rst
--- a/doc/fluid/api/paddle/sums.rst
+++ b/doc/fluid/api/paddle/sums.rst
--- a/doc/fluid/api/paddle/t.rst
+++ b/doc/fluid/api/paddle/t.rst
--- a/doc/fluid/api/paddle/tanh.rst
+++ b/doc/fluid/api/paddle/tanh.rst
--- a/doc/fluid/api/paddle/topk.rst
+++ b/doc/fluid/api/paddle/topk.rst
--- a/doc/fluid/api/paddle/trace.rst
+++ b/doc/fluid/api/paddle/trace.rst
--- a/doc/fluid/api/paddle/transpose.rst
+++ b/doc/fluid/api/paddle/transpose.rst
--- a/doc/fluid/api/paddle/tril.rst
+++ b/doc/fluid/api/paddle/tril.rst
--- a/doc/fluid/api/paddle/triu.rst
+++ b/doc/fluid/api/paddle/triu.rst
--- a/doc/fluid/api/paddle/unbind.rst
+++ b/doc/fluid/api/paddle/unbind.rst
--- a/doc/fluid/api/paddle/unique.rst
+++ b/doc/fluid/api/paddle/unique.rst
--- a/doc/fluid/api/paddle/unique_with_counts.rst
+++ b/doc/fluid/api/paddle/unique_with_counts.rst
--- a/doc/fluid/api/paddle/unsqueeze.rst
+++ b/doc/fluid/api/paddle/unsqueeze.rst
--- a/doc/fluid/api/paddle/unstack.rst
+++ b/doc/fluid/api/paddle/unstack.rst
--- a/doc/fluid/api/paddle/var.rst
+++ b/doc/fluid/api/paddle/var.rst
--- a/doc/fluid/api/paddle/where.rst
+++ b/doc/fluid/api/paddle/where.rst
--- a/doc/fluid/api/paddle/zeros.rst
+++ b/doc/fluid/api/paddle/zeros.rst
--- a/doc/fluid/api/paddle/zeros_like.rst
+++ b/doc/fluid/api/paddle/zeros_like.rst
--- a/doc/fluid/api/recordio_writer/convert_reader_to_recordio_file.rst
+++ b/doc/fluid/api/recordio_writer/convert_reader_to_recordio_file.rst
--- a/doc/fluid/api/recordio_writer/convert_reader_to_recordio_files.rst
+++ b/doc/fluid/api/recordio_writer/convert_reader_to_recordio_files.rst
--- a/doc/fluid/api/review_tmp.rst
+++ b/doc/fluid/api/review_tmp.rst
--- a/doc/fluid/api/review_tmp/MarginRankingLoss.rst
+++ b/doc/fluid/api/review_tmp/MarginRankingLoss.rst
--- a/doc/fluid/api/review_tmp/margin_ranking_loss.rst
+++ b/doc/fluid/api/review_tmp/margin_ranking_loss.rst
--- a/doc/fluid/api/static.rst
+++ b/doc/fluid/api/static.rst
--- a/doc/fluid/api/static/InputSpec.rst
+++ b/doc/fluid/api/static/InputSpec.rst
--- a/doc/fluid/api/static/data.rst
+++ b/doc/fluid/api/static/data.rst
--- a/doc/fluid/api/tensor.rst
+++ b/doc/fluid/api/tensor.rst
--- a/doc/fluid/api/tensor/abs.rst
+++ b/doc/fluid/api/tensor/abs.rst
--- a/doc/fluid/api/tensor/acos.rst
+++ b/doc/fluid/api/tensor/acos.rst
--- a/doc/fluid/api/tensor/add.rst
+++ b/doc/fluid/api/tensor/add.rst
--- a/doc/fluid/api/tensor/arange.rst
+++ b/doc/fluid/api/tensor/arange.rst
--- a/doc/fluid/api/tensor/argmax.rst
+++ b/doc/fluid/api/tensor/argmax.rst
--- a/doc/fluid/api/tensor/argmin.rst
+++ b/doc/fluid/api/tensor/argmin.rst
--- a/doc/fluid/api/tensor/argsort.rst
+++ b/doc/fluid/api/tensor/argsort.rst
--- a/doc/fluid/api/tensor/asin.rst
+++ b/doc/fluid/api/tensor/asin.rst
--- a/doc/fluid/api/tensor/atan.rst
+++ b/doc/fluid/api/tensor/atan.rst
--- a/doc/fluid/api/tensor/cast.rst
+++ b/doc/fluid/api/tensor/cast.rst
--- a/doc/fluid/api/tensor/ceil.rst
+++ b/doc/fluid/api/tensor/ceil.rst
--- a/doc/fluid/api/tensor/chunk.rst
+++ b/doc/fluid/api/tensor/chunk.rst
--- a/doc/fluid/api/tensor/concat.rst
+++ b/doc/fluid/api/tensor/concat.rst
--- a/doc/fluid/api/tensor/cos.rst
+++ b/doc/fluid/api/tensor/cos.rst
--- a/doc/fluid/api/tensor/create_tensor.rst
+++ b/doc/fluid/api/tensor/create_tensor.rst
--- a/doc/fluid/api/tensor/crop_tensor.rst
+++ b/doc/fluid/api/tensor/crop_tensor.rst
--- a/doc/fluid/api/tensor/cross.rst
+++ b/doc/fluid/api/tensor/cross.rst
--- a/doc/fluid/api/tensor/cumsum.rst
+++ b/doc/fluid/api/tensor/cumsum.rst
--- a/doc/fluid/api/tensor/diag.rst
+++ b/doc/fluid/api/tensor/diag.rst
--- a/doc/fluid/api/tensor/div.rst
+++ b/doc/fluid/api/tensor/div.rst
--- a/doc/fluid/api/tensor/elementwise_add.rst
+++ b/doc/fluid/api/tensor/elementwise_add.rst
--- a/doc/fluid/api/tensor/elementwise_div.rst
+++ b/doc/fluid/api/tensor/elementwise_div.rst
--- a/doc/fluid/api/tensor/elementwise_floordiv.rst
+++ b/doc/fluid/api/tensor/elementwise_floordiv.rst
--- a/doc/fluid/api/tensor/elementwise_mod.rst
+++ b/doc/fluid/api/tensor/elementwise_mod.rst
--- a/doc/fluid/api/tensor/elementwise_mul.rst
+++ b/doc/fluid/api/tensor/elementwise_mul.rst
--- a/doc/fluid/api/tensor/elementwise_pow.rst
+++ b/doc/fluid/api/tensor/elementwise_pow.rst
--- a/doc/fluid/api/tensor/elementwise_sub.rst
+++ b/doc/fluid/api/tensor/elementwise_sub.rst
--- a/doc/fluid/api/tensor/equal_all.rst
+++ b/doc/fluid/api/tensor/equal_all.rst
--- a/doc/fluid/api/tensor/erf.rst
+++ b/doc/fluid/api/tensor/erf.rst
--- a/doc/fluid/api/tensor/exp.rst
+++ b/doc/fluid/api/tensor/exp.rst
--- a/doc/fluid/api/tensor/expand.rst
+++ b/doc/fluid/api/tensor/expand.rst
--- a/doc/fluid/api/tensor/expand_as.rst
+++ b/doc/fluid/api/tensor/expand_as.rst
--- a/doc/fluid/api/tensor/eye.rst
+++ b/doc/fluid/api/tensor/eye.rst
--- a/doc/fluid/api/tensor/fill_constant.rst
+++ b/doc/fluid/api/tensor/fill_constant.rst
--- a/doc/fluid/api/tensor/flatten.rst
+++ b/doc/fluid/api/tensor/flatten.rst
--- a/doc/fluid/api/tensor/floor.rst
+++ b/doc/fluid/api/tensor/floor.rst
--- a/doc/fluid/api/tensor/full.rst
+++ b/doc/fluid/api/tensor/full.rst
--- a/doc/fluid/api/tensor/full_like.rst
+++ b/doc/fluid/api/tensor/full_like.rst
--- a/doc/fluid/api/tensor/gather.rst
+++ b/doc/fluid/api/tensor/gather.rst
--- a/doc/fluid/api/tensor/gather_nd.rst
+++ b/doc/fluid/api/tensor/gather_nd.rst
--- a/doc/fluid/api/tensor/greater_equal.rst
+++ b/doc/fluid/api/tensor/greater_equal.rst
--- a/doc/fluid/api/tensor/greater_than.rst
+++ b/doc/fluid/api/tensor/greater_than.rst
--- a/doc/fluid/api/tensor/has_inf.rst
+++ b/doc/fluid/api/tensor/has_inf.rst
--- a/doc/fluid/api/tensor/has_nan.rst
+++ b/doc/fluid/api/tensor/has_nan.rst
--- a/doc/fluid/api/tensor/increment.rst
+++ b/doc/fluid/api/tensor/increment.rst
--- a/doc/fluid/api/tensor/index_select.rst
+++ b/doc/fluid/api/tensor/index_select.rst
--- a/doc/fluid/api/tensor/is_empty.rst
+++ b/doc/fluid/api/tensor/is_empty.rst
--- a/doc/fluid/api/tensor/isfinite.rst
+++ b/doc/fluid/api/tensor/isfinite.rst
--- a/doc/fluid/api/tensor/isinf.rst
+++ b/doc/fluid/api/tensor/isinf.rst
--- a/doc/fluid/api/tensor/isnan.rst
+++ b/doc/fluid/api/tensor/isnan.rst
--- a/doc/fluid/api/tensor/less_equal.rst
+++ b/doc/fluid/api/tensor/less_equal.rst
--- a/doc/fluid/api/tensor/less_than.rst
+++ b/doc/fluid/api/tensor/less_than.rst
--- a/doc/fluid/api/tensor/linalg.rst
+++ b/doc/fluid/api/tensor/linalg.rst
--- a/doc/fluid/api/tensor/linalg/dist.rst
+++ b/doc/fluid/api/tensor/linalg/dist.rst
--- a/doc/fluid/api/tensor/linspace.rst
+++ b/doc/fluid/api/tensor/linspace.rst
--- a/doc/fluid/api/tensor/load.rst
+++ b/doc/fluid/api/tensor/load.rst
--- a/doc/fluid/api/tensor/log.rst
+++ b/doc/fluid/api/tensor/log.rst
--- a/doc/fluid/api/tensor/logic.rst
+++ b/doc/fluid/api/tensor/logic.rst
--- a/doc/fluid/api/tensor/logic/allclose.rst
+++ b/doc/fluid/api/tensor/logic/allclose.rst
--- a/doc/fluid/api/tensor/logical_and.rst
+++ b/doc/fluid/api/tensor/logical_and.rst
--- a/doc/fluid/api/tensor/logical_not.rst
+++ b/doc/fluid/api/tensor/logical_not.rst
--- a/doc/fluid/api/tensor/logical_or.rst
+++ b/doc/fluid/api/tensor/logical_or.rst
--- a/doc/fluid/api/tensor/logical_xor.rst
+++ b/doc/fluid/api/tensor/logical_xor.rst
--- a/doc/fluid/api/tensor/masked_select.rst
+++ b/doc/fluid/api/tensor/masked_select.rst
--- a/doc/fluid/api/tensor/math.rst
+++ b/doc/fluid/api/tensor/math.rst
--- a/doc/fluid/api/tensor/math/add.rst
+++ b/doc/fluid/api/tensor/math/add.rst
--- a/doc/fluid/api/tensor/math/atan.rst
+++ b/doc/fluid/api/tensor/math/atan.rst
--- a/doc/fluid/api/tensor/math/div.rst
+++ b/doc/fluid/api/tensor/math/div.rst
--- a/doc/fluid/api/tensor/math/divide.rst
+++ b/doc/fluid/api/tensor/math/divide.rst
--- a/doc/fluid/api/tensor/math/elementwise_sum.rst
+++ b/doc/fluid/api/tensor/math/elementwise_sum.rst
--- a/doc/fluid/api/tensor/math/floor_divide.rst
+++ b/doc/fluid/api/tensor/math/floor_divide.rst
--- a/doc/fluid/api/tensor/math/floor_mod.rst
+++ b/doc/fluid/api/tensor/math/floor_mod.rst
--- a/doc/fluid/api/tensor/math/logsumexp.rst
+++ b/doc/fluid/api/tensor/math/logsumexp.rst
--- a/doc/fluid/api/tensor/math/mm.rst
+++ b/doc/fluid/api/tensor/math/mm.rst
--- a/doc/fluid/api/tensor/math/mod.rst
+++ b/doc/fluid/api/tensor/math/mod.rst
--- a/doc/fluid/api/tensor/math/mul.rst
+++ b/doc/fluid/api/tensor/math/mul.rst
--- a/doc/fluid/api/tensor/math/multiply.rst
+++ b/doc/fluid/api/tensor/math/multiply.rst
--- a/doc/fluid/api/tensor/math/pow.rst
+++ b/doc/fluid/api/tensor/math/pow.rst
--- a/doc/fluid/api/tensor/math/prod.rst
+++ b/doc/fluid/api/tensor/math/prod.rst
--- a/doc/fluid/api/tensor/math/remainder.rst
+++ b/doc/fluid/api/tensor/math/remainder.rst
--- a/doc/fluid/api/tensor/math/sign.rst
+++ b/doc/fluid/api/tensor/math/sign.rst
--- a/doc/fluid/api/tensor/math/sin.rst
+++ b/doc/fluid/api/tensor/math/sin.rst
--- a/doc/fluid/api/tensor/math/sqrt.rst
+++ b/doc/fluid/api/tensor/math/sqrt.rst
--- a/doc/fluid/api/tensor/math/sum.rst
+++ b/doc/fluid/api/tensor/math/sum.rst
--- a/doc/fluid/api/tensor/math/tanh.rst
+++ b/doc/fluid/api/tensor/math/tanh.rst
--- a/doc/fluid/api/tensor/max.rst
+++ b/doc/fluid/api/tensor/max.rst
--- a/doc/fluid/api/tensor/maximum.rst
+++ b/doc/fluid/api/tensor/maximum.rst
--- a/doc/fluid/api/tensor/mean.rst
+++ b/doc/fluid/api/tensor/mean.rst
--- a/doc/fluid/api/tensor/min.rst
+++ b/doc/fluid/api/tensor/min.rst
--- a/doc/fluid/api/tensor/minimum.rst
+++ b/doc/fluid/api/tensor/minimum.rst
--- a/doc/fluid/api/tensor/mm.rst
+++ b/doc/fluid/api/tensor/mm.rst
--- a/doc/fluid/api/tensor/mul.rst
+++ b/doc/fluid/api/tensor/mul.rst
--- a/doc/fluid/api/tensor/multiplex.rst
+++ b/doc/fluid/api/tensor/multiplex.rst
--- a/doc/fluid/api/tensor/norm.rst
+++ b/doc/fluid/api/tensor/norm.rst
--- a/doc/fluid/api/tensor/not_equal.rst
+++ b/doc/fluid/api/tensor/not_equal.rst
--- a/doc/fluid/api/tensor/numel.rst
+++ b/doc/fluid/api/tensor/numel.rst
--- a/doc/fluid/api/tensor/ones.rst
+++ b/doc/fluid/api/tensor/ones.rst
--- a/doc/fluid/api/tensor/ones_like.rst
+++ b/doc/fluid/api/tensor/ones_like.rst
--- a/doc/fluid/api/tensor/pow.rst
+++ b/doc/fluid/api/tensor/pow.rst
--- a/doc/fluid/api/tensor/random.rst
+++ b/doc/fluid/api/tensor/random.rst
--- a/doc/fluid/api/tensor/random/normal.rst
+++ b/doc/fluid/api/tensor/random/normal.rst
--- a/doc/fluid/api/tensor/random/rand.rst
+++ b/doc/fluid/api/tensor/random/rand.rst
--- a/doc/fluid/api/tensor/random/randint.rst
+++ b/doc/fluid/api/tensor/random/randint.rst
--- a/doc/fluid/api/tensor/random/randn.rst
+++ b/doc/fluid/api/tensor/random/randn.rst
--- a/doc/fluid/api/tensor/random/randperm.rst
+++ b/doc/fluid/api/tensor/random/randperm.rst
--- a/doc/fluid/api/tensor/random/standard_normal.rst
+++ b/doc/fluid/api/tensor/random/standard_normal.rst
--- a/doc/fluid/api/tensor/random/uniform.rst
+++ b/doc/fluid/api/tensor/random/uniform.rst
--- a/doc/fluid/api/tensor/rank.rst
+++ b/doc/fluid/api/tensor/rank.rst
--- a/doc/fluid/api/tensor/reciprocal.rst
+++ b/doc/fluid/api/tensor/reciprocal.rst
--- a/doc/fluid/api/tensor/reduce_all.rst
+++ b/doc/fluid/api/tensor/reduce_all.rst
--- a/doc/fluid/api/tensor/reduce_any.rst
+++ b/doc/fluid/api/tensor/reduce_any.rst
--- a/doc/fluid/api/tensor/reduce_max.rst
+++ b/doc/fluid/api/tensor/reduce_max.rst
--- a/doc/fluid/api/tensor/reduce_mean.rst
+++ b/doc/fluid/api/tensor/reduce_mean.rst
--- a/doc/fluid/api/tensor/reduce_min.rst
+++ b/doc/fluid/api/tensor/reduce_min.rst
--- a/doc/fluid/api/tensor/reduce_prod.rst
+++ b/doc/fluid/api/tensor/reduce_prod.rst
--- a/doc/fluid/api/tensor/reduce_sum.rst
+++ b/doc/fluid/api/tensor/reduce_sum.rst
--- a/doc/fluid/api/tensor/reshape.rst
+++ b/doc/fluid/api/tensor/reshape.rst
--- a/doc/fluid/api/tensor/reverse.rst
+++ b/doc/fluid/api/tensor/reverse.rst
--- a/doc/fluid/api/tensor/round.rst
+++ b/doc/fluid/api/tensor/round.rst
--- a/doc/fluid/api/tensor/rsqrt.rst
+++ b/doc/fluid/api/tensor/rsqrt.rst
--- a/doc/fluid/api/tensor/save.rst
+++ b/doc/fluid/api/tensor/save.rst
--- a/doc/fluid/api/tensor/scale.rst
+++ b/doc/fluid/api/tensor/scale.rst
--- a/doc/fluid/api/tensor/scatter.rst
+++ b/doc/fluid/api/tensor/scatter.rst
--- a/doc/fluid/api/tensor/scatter_nd.rst
+++ b/doc/fluid/api/tensor/scatter_nd.rst
--- a/doc/fluid/api/tensor/scatter_nd_add.rst
+++ b/doc/fluid/api/tensor/scatter_nd_add.rst
--- a/doc/fluid/api/tensor/shape.rst
+++ b/doc/fluid/api/tensor/shape.rst
--- a/doc/fluid/api/tensor/shard_index.rst
+++ b/doc/fluid/api/tensor/shard_index.rst
--- a/doc/fluid/api/tensor/shuffle.rst
+++ b/doc/fluid/api/tensor/shuffle.rst
--- a/doc/fluid/api/tensor/sign.rst
+++ b/doc/fluid/api/tensor/sign.rst
--- a/doc/fluid/api/tensor/sin.rst
+++ b/doc/fluid/api/tensor/sin.rst
--- a/doc/fluid/api/tensor/slice.rst
+++ b/doc/fluid/api/tensor/slice.rst
--- a/doc/fluid/api/tensor/sort.rst
+++ b/doc/fluid/api/tensor/sort.rst
--- a/doc/fluid/api/tensor/split.rst
+++ b/doc/fluid/api/tensor/split.rst
--- a/doc/fluid/api/tensor/sqrt.rst
+++ b/doc/fluid/api/tensor/sqrt.rst
--- a/doc/fluid/api/tensor/square.rst
+++ b/doc/fluid/api/tensor/square.rst
--- a/doc/fluid/api/tensor/squeeze.rst
+++ b/doc/fluid/api/tensor/squeeze.rst
--- a/doc/fluid/api/tensor/stack.rst
+++ b/doc/fluid/api/tensor/stack.rst
--- a/doc/fluid/api/tensor/stanh.rst
+++ b/doc/fluid/api/tensor/stanh.rst
--- a/doc/fluid/api/tensor/std.rst
+++ b/doc/fluid/api/tensor/std.rst
--- a/doc/fluid/api/tensor/strided_slice.rst
+++ b/doc/fluid/api/tensor/strided_slice.rst
--- a/doc/fluid/api/tensor/sum.rst
+++ b/doc/fluid/api/tensor/sum.rst
--- a/doc/fluid/api/tensor/sums.rst
+++ b/doc/fluid/api/tensor/sums.rst
--- a/doc/fluid/api/tensor/tanh.rst
+++ b/doc/fluid/api/tensor/tanh.rst
--- a/doc/fluid/api/tensor/topk.rst
+++ b/doc/fluid/api/tensor/topk.rst
--- a/doc/fluid/api/tensor/transpose.rst
+++ b/doc/fluid/api/tensor/transpose.rst
--- a/doc/fluid/api/tensor/unique.rst
+++ b/doc/fluid/api/tensor/unique.rst
--- a/doc/fluid/api/tensor/unique_with_counts.rst
+++ b/doc/fluid/api/tensor/unique_with_counts.rst
--- a/doc/fluid/api/tensor/unsqueeze.rst
+++ b/doc/fluid/api/tensor/unsqueeze.rst
--- a/doc/fluid/api/tensor/unstack.rst
+++ b/doc/fluid/api/tensor/unstack.rst
--- a/doc/fluid/api/tensor/var.rst
+++ b/doc/fluid/api/tensor/var.rst
--- a/doc/fluid/api/tensor/where.rst
+++ b/doc/fluid/api/tensor/where.rst
--- a/doc/fluid/api/tensor/zeros.rst
+++ b/doc/fluid/api/tensor/zeros.rst
--- a/doc/fluid/api/tensor/zeros_like.rst
+++ b/doc/fluid/api/tensor/zeros_like.rst
--- a/doc/fluid/api/transpiler.rst
+++ b/doc/fluid/api/transpiler.rst
--- a/doc/fluid/api/transpiler/RoundRobin.rst
+++ b/doc/fluid/api/transpiler/RoundRobin.rst
--- a/doc/fluid/api_cn/api_tree_cn.rst
+++ b/doc/fluid/api_cn/api_tree_cn.rst
--- a/doc/fluid/api_cn/backward_cn/append_backward_cn.rst
+++ b/doc/fluid/api_cn/backward_cn/append_backward_cn.rst
--- a/doc/fluid/api_cn/backward_cn/gradients_cn.rst
+++ b/doc/fluid/api_cn/backward_cn/gradients_cn.rst
--- a/doc/fluid/api_cn/clip_cn/ErrorClipByValue_cn.rst
+++ b/doc/fluid/api_cn/clip_cn/ErrorClipByValue_cn.rst
--- a/doc/fluid/api_cn/clip_cn/GradientClipByGlobalNorm_cn.rst
+++ b/doc/fluid/api_cn/clip_cn/GradientClipByGlobalNorm_cn.rst
--- a/doc/fluid/api_cn/clip_cn/GradientClipByNorm_cn.rst
+++ b/doc/fluid/api_cn/clip_cn/GradientClipByNorm_cn.rst
--- a/doc/fluid/api_cn/clip_cn/GradientClipByValue_cn.rst
+++ b/doc/fluid/api_cn/clip_cn/GradientClipByValue_cn.rst
--- a/doc/fluid/api_cn/clip_cn/set_gradient_clip_cn.rst
+++ b/doc/fluid/api_cn/clip_cn/set_gradient_clip_cn.rst
--- a/doc/fluid/api_cn/dataset_cn.rst
+++ b/doc/fluid/api_cn/dataset_cn.rst
--- a/doc/fluid/api_cn/dataset_cn/DatasetFactory_cn.rst
+++ b/doc/fluid/api_cn/dataset_cn/DatasetFactory_cn.rst
--- a/doc/fluid/api_cn/dataset_cn/InMemoryDataset_cn.rst
+++ b/doc/fluid/api_cn/dataset_cn/InMemoryDataset_cn.rst
--- a/doc/fluid/api_cn/dataset_cn/QueueDataset_cn.rst
+++ b/doc/fluid/api_cn/dataset_cn/QueueDataset_cn.rst
--- a/doc/fluid/api_cn/declarative_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/batch_norm_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/batch_norm_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/bilinear_tensor_product_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/bilinear_tensor_product_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/conv2d_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/conv2d_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/conv2d_transpose_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/conv2d_transpose_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/conv3d_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/conv3d_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/conv3d_transpose_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/conv3d_transpose_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/create_parameter_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/create_parameter_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/crf_decoding_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/crf_decoding_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/data_norm_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/data_norm_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/deformable_conv_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/deformable_conv_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/embedding_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/embedding_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/fc_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/fc_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/group_norm_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/group_norm_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/hsigmoid_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/hsigmoid_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/instance_norm_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/instance_norm_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/layer_norm_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/layer_norm_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/multi_box_head_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/multi_box_head_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/nce_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/nce_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/prelu_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/prelu_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/row_conv_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/row_conv_cn.rst
--- a/doc/fluid/api_cn/declarative_cn/spectral_norm_cn.rst
+++ b/doc/fluid/api_cn/declarative_cn/spectral_norm_cn.rst
--- a/doc/fluid/api_cn/distributed_cn.rst
+++ b/doc/fluid/api_cn/distributed_cn.rst
--- a/doc/fluid/api_cn/distributed_cn/ParallelEnv_cn.rst
+++ b/doc/fluid/api_cn/distributed_cn/ParallelEnv_cn.rst
--- a/doc/fluid/api_cn/distributed_cn/all_gather_cn.rst
+++ b/doc/fluid/api_cn/distributed_cn/all_gather_cn.rst
--- a/doc/fluid/api_cn/distributed_cn/all_reduce_cn.rst
+++ b/doc/fluid/api_cn/distributed_cn/all_reduce_cn.rst
--- a/doc/fluid/api_cn/distributed_cn/barrier_cn.rst
+++ b/doc/fluid/api_cn/distributed_cn/barrier_cn.rst
--- a/doc/fluid/api_cn/distributed_cn/broadcast_cn.rst
+++ b/doc/fluid/api_cn/distributed_cn/broadcast_cn.rst
--- a/doc/fluid/api_cn/distributed_cn/get_rank_cn.rst
+++ b/doc/fluid/api_cn/distributed_cn/get_rank_cn.rst
--- a/doc/fluid/api_cn/distributed_cn/get_world_size_cn.rst
+++ b/doc/fluid/api_cn/distributed_cn/get_world_size_cn.rst
--- a/doc/fluid/api_cn/distributed_cn/init_parallel_env_cn.rst
+++ b/doc/fluid/api_cn/distributed_cn/init_parallel_env_cn.rst
--- a/doc/fluid/api_cn/distributed_cn/prepare_context_cn.rst
+++ b/doc/fluid/api_cn/distributed_cn/prepare_context_cn.rst
--- a/doc/fluid/api_cn/distributed_cn/reduce_cn.rst
+++ b/doc/fluid/api_cn/distributed_cn/reduce_cn.rst
--- a/doc/fluid/api_cn/distributed_cn/scatter_cn.rst
+++ b/doc/fluid/api_cn/distributed_cn/scatter_cn.rst
--- a/doc/fluid/api_cn/distributed_cn/spawn_cn.rst
+++ b/doc/fluid/api_cn/distributed_cn/spawn_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/BackwardStrategy_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/BackwardStrategy_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/BatchNorm_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/BatchNorm_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/BilinearTensorProduct_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/BilinearTensorProduct_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/Conv2DTranspose_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/Conv2DTranspose_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/Conv2D_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/Conv2D_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/Conv3DTranspose_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/Conv3DTranspose_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/Conv3D_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/Conv3D_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/CosineDecay_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/CosineDecay_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/DataParallel_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/DataParallel_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/Dropout_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/Dropout_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/Embedding_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/Embedding_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/ExponentialDecay_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/ExponentialDecay_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/GRUUnit_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/GRUUnit_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/GroupNorm_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/GroupNorm_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/InstanceNorm_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/InstanceNorm_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/InverseTimeDecay_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/InverseTimeDecay_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/LambdaDecay_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/LambdaDecay_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/LayerList_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/LayerList_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/LayerNorm_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/LayerNorm_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/Layer_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/Layer_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/Linear_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/Linear_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/MultiStepDecay_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/MultiStepDecay_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/NCE_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/NCE_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/NaturalExpDecay_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/NaturalExpDecay_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/NoamDecay_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/NoamDecay_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/PRelu_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/PRelu_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/ParallelEnv_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/ParallelEnv_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/ParameterList_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/ParameterList_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/PiecewiseDecay_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/PiecewiseDecay_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/PolynomialDecay_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/PolynomialDecay_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/Pool2D_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/Pool2D_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/ProgramTranslator_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/ProgramTranslator_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/ReduceLROnPlateau_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/ReduceLROnPlateau_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/Sequential_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/Sequential_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/SpectralNorm_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/SpectralNorm_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/StepDecay_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/StepDecay_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/TracedLayer_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/TracedLayer_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/TranslatedLayer_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/TranslatedLayer_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/TreeConv_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/TreeConv_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/declarative_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/declarative_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/enabled_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/enabled_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/grad_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/grad_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/guard_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/guard_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/jit_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/jit_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/jit_cn/SaveLoadConfig_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/jit_cn/SaveLoadConfig_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/jit_cn/load_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/jit_cn/load_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/jit_cn/save_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/jit_cn/save_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/jit_cn/set_code_level_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/jit_cn/set_code_level_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/jit_cn/set_verbosity_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/jit_cn/set_verbosity_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/load_dygraph_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/load_dygraph_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/no_grad_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/no_grad_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/prepare_context_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/prepare_context_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/save_dygraph_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/save_dygraph_cn.rst
--- a/doc/fluid/api_cn/dygraph_cn/to_variable_cn.rst
+++ b/doc/fluid/api_cn/dygraph_cn/to_variable_cn.rst
--- a/doc/fluid/api_cn/executor_cn/Executor_cn.rst
+++ b/doc/fluid/api_cn/executor_cn/Executor_cn.rst
--- a/doc/fluid/api_cn/executor_cn/global_scope_cn.rst
+++ b/doc/fluid/api_cn/executor_cn/global_scope_cn.rst
--- a/doc/fluid/api_cn/executor_cn/scope_guard_cn.rst
+++ b/doc/fluid/api_cn/executor_cn/scope_guard_cn.rst
--- a/doc/fluid/api_cn/fluid_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/BuildStrategy_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/BuildStrategy_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/CPUPlace_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/CPUPlace_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/CUDAPinnedPlace_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/CUDAPinnedPlace_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/CUDAPlace_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/CUDAPlace_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/CompiledProgram_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/CompiledProgram_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/DataFeedDesc_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/DataFeedDesc_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/DataFeeder_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/DataFeeder_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/DistributeTranspilerConfig_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/DistributeTranspilerConfig_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/DistributeTranspiler_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/DistributeTranspiler_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/ExecutionStrategy_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/ExecutionStrategy_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/Executor_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/Executor_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/LoDTensorArray_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/LoDTensorArray_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/LoDTensor_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/LoDTensor_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/ParallelExecutor_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/ParallelExecutor_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/ParamAttr_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/ParamAttr_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/Program_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/Program_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/Tensor_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/Tensor_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/Variable_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/Variable_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/WeightNormParamAttr_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/WeightNormParamAttr_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/cpu_places_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/cpu_places_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/create_lod_tensor_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/create_lod_tensor_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/create_random_int_lodtensor_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/create_random_int_lodtensor_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/cuda_pinned_places_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/cuda_pinned_places_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/cuda_places_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/cuda_places_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/data_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/data_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/default_main_program_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/default_main_program_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/default_startup_program_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/default_startup_program_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/device_guard_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/device_guard_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/disable_dygraph_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/disable_dygraph_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/embedding_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/embedding_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/enable_dygraph_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/enable_dygraph_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/get_flags_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/get_flags_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/global_scope_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/global_scope_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/gradients_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/gradients_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/in_dygraph_mode_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/in_dygraph_mode_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/is_compiled_with_cuda_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/is_compiled_with_cuda_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/load_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/load_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/load_op_library_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/load_op_library_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/memory_optimize_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/memory_optimize_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/name_scope_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/name_scope_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/one_hot_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/one_hot_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/program_guard_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/program_guard_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/release_memory_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/release_memory_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/require_version_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/require_version_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/save_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/save_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/scope_guard_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/scope_guard_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/set_flags_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/set_flags_cn.rst
--- a/doc/fluid/api_cn/fluid_cn/set_global_initializer_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn/set_global_initializer_cn.rst
--- a/doc/fluid/api_cn/framework_cn.rst
+++ b/doc/fluid/api_cn/framework_cn.rst
--- a/doc/fluid/api_cn/framework_cn/BuildStrategy_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/BuildStrategy_cn.rst
--- a/doc/fluid/api_cn/framework_cn/CPUPlace_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/CPUPlace_cn.rst
--- a/doc/fluid/api_cn/framework_cn/CUDAPinnedPlace_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/CUDAPinnedPlace_cn.rst
--- a/doc/fluid/api_cn/framework_cn/CUDAPlace_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/CUDAPlace_cn.rst
--- a/doc/fluid/api_cn/framework_cn/CompiledProgram_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/CompiledProgram_cn.rst
--- a/doc/fluid/api_cn/framework_cn/ExecutionStrateg y_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/ExecutionStrateg y_cn.rst
--- a/doc/fluid/api_cn/framework_cn/Executor_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/Executor_cn.rst
--- a/doc/fluid/api_cn/framework_cn/ParallelExecutor_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/ParallelExecutor_cn.rst
--- a/doc/fluid/api_cn/framework_cn/ParamAttr_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/ParamAttr_cn.rst
--- a/doc/fluid/api_cn/framework_cn/Print_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/Print_cn.rst
--- a/doc/fluid/api_cn/framework_cn/Program_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/Program_cn.rst
--- a/doc/fluid/api_cn/framework_cn/Variable_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/Variable_cn.rst
--- a/doc/fluid/api_cn/framework_cn/WeightNormParamAttr_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/WeightNormParamAttr_cn.rst
--- a/doc/fluid/api_cn/framework_cn/append_backward_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/append_backward_cn.rst
--- a/doc/fluid/api_cn/framework_cn/create_global_var_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/create_global_var_cn.rst
--- a/doc/fluid/api_cn/framework_cn/create_parameter_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/create_parameter_cn.rst
--- a/doc/fluid/api_cn/framework_cn/default_main_program_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/default_main_program_cn.rst
--- a/doc/fluid/api_cn/framework_cn/default_startup_program_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/default_startup_program_cn.rst
--- a/doc/fluid/api_cn/framework_cn/get_cuda_rng_state_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/get_cuda_rng_state_cn.rst
--- a/doc/fluid/api_cn/framework_cn/get_default_dtype_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/get_default_dtype_cn.rst
--- a/doc/fluid/api_cn/framework_cn/global_scope_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/global_scope_cn.rst
--- a/doc/fluid/api_cn/framework_cn/gradients_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/gradients_cn.rst
--- a/doc/fluid/api_cn/framework_cn/manual_seed_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/manual_seed_cn.rst
--- a/doc/fluid/api_cn/framework_cn/name_scope_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/name_scope_cn.rst
--- a/doc/fluid/api_cn/framework_cn/program_guard_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/program_guard_cn.rst
--- a/doc/fluid/api_cn/framework_cn/py_func_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/py_func_cn.rst
--- a/doc/fluid/api_cn/framework_cn/scope_guard_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/scope_guard_cn.rst
--- a/doc/fluid/api_cn/framework_cn/set_cuda_rng_state_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/set_cuda_rng_state_cn.rst
--- a/doc/fluid/api_cn/framework_cn/set_default_dtype_cn.rst
+++ b/doc/fluid/api_cn/framework_cn/set_default_dtype_cn.rst
--- a/doc/fluid/api_cn/gen_index.py
+++ b/doc/fluid/api_cn/gen_index.py
--- a/doc/fluid/api_cn/imperative_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/CosineDecay_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/CosineDecay_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/DataParallel_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/DataParallel_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/ExponentialDecay_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/ExponentialDecay_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/InverseTimeDecay_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/InverseTimeDecay_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/NaturalExpDecay_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/NaturalExpDecay_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/NoamDecay_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/NoamDecay_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/ParallelEnv_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/ParallelEnv_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/PiecewiseDecay_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/PiecewiseDecay_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/PolynomialDecay_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/PolynomialDecay_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/ProgramTranslator_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/ProgramTranslator_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/TracedLayer_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/TracedLayer_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/TranslatedLayer_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/TranslatedLayer_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/declarative_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/declarative_cn.rst
--- a/doc/fluid/api_cn/imperative_cn/enabled_cn.rst
+++ b/doc/fluid/api_cn/imperative_cn/enabled_cn.rst