diff --git a/.travis.yml b/.travis.yml
index 361136ac2c8d899a0d7a4d7945083fcc489551b5..8c2d9f143b3102c142ec2d5d193b82936d04fba8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,7 +12,6 @@ services:
os:
- linux
env:
- - JOB=doc
- JOB=check_style
- JOB=build_android
addons:
diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt
deleted file mode 100644
index be92af3902769a65c77953c9f3cb1f3aa3738d79..0000000000000000000000000000000000000000
--- a/doc/fluid/CMakeLists.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-if(NOT DEFINED SPHINX_THEME)
- set(SPHINX_THEME default)
-endif()
-
-if(NOT DEFINED SPHINX_THEME_DIR)
- set(SPHINX_THEME_DIR)
-endif()
-
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
-
-# HTML output director
-set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
-
-set(IMPORT_PADDLE_STRING "")
-set(IMPORT_PADDLEV2_STRING "")
-
-configure_file(
- "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
- "${BINARY_BUILD_DIR_EN}/conf.py"
- @ONLY)
-
-sphinx_add_target(paddle_fluid_docs
- html
- ${BINARY_BUILD_DIR_EN}
- ${SPHINX_CACHE_DIR_EN}
- ${CMAKE_CURRENT_SOURCE_DIR}
- ${SPHINX_HTML_DIR_EN})
-
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
-
-# HTML output directory
-set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
-
-configure_file(
- "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
- "${BINARY_BUILD_DIR_CN}/conf.py"
- @ONLY)
-
-sphinx_add_target(paddle_fluid_docs_cn
- html
- ${BINARY_BUILD_DIR_CN}
- ${SPHINX_CACHE_DIR_CN}
- ${CMAKE_CURRENT_SOURCE_DIR}
- ${SPHINX_HTML_DIR_CN})
-
-add_subdirectory(api)
diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt
deleted file mode 100644
index 435d6e10fb02e9b2a8147f37da33e8848cc9b98a..0000000000000000000000000000000000000000
--- a/doc/fluid/api/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
-
-# HTML output director
-set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
-
-set(IMPORT_PADDLE_STRING "import paddle")
-set(IMPORT_PADDLEV2_STRING "import paddle.v2")
-
-configure_file(
- "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
- "${BINARY_BUILD_DIR_EN}/conf.py"
- @ONLY)
-
-sphinx_add_target(paddle_fluid_apis
- html
- ${BINARY_BUILD_DIR_EN}
- ${SPHINX_CACHE_DIR_EN}
- ${CMAKE_CURRENT_SOURCE_DIR}
- ${SPHINX_HTML_DIR_EN})
-
-add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
diff --git a/doc/fluid/api/average.rst b/doc/fluid/api/average.rst
deleted file mode 100644
index 496f5b29875443f0c44f50fcb3ca837f4e7bcd12..0000000000000000000000000000000000000000
--- a/doc/fluid/api/average.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-=============
-fluid.average
-=============
-
-.. _api_fluid_average_WeightedAverage:
-
-WeightedAverage
----------------
-
-.. autoclass:: paddle.fluid.average.WeightedAverage
- :members:
- :noindex:
-
diff --git a/doc/fluid/api/backward.rst b/doc/fluid/api/backward.rst
deleted file mode 100644
index 0076394543c2f87e90fa1ea989d7b5cbf468a6f7..0000000000000000000000000000000000000000
--- a/doc/fluid/api/backward.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-==============
-fluid.backward
-==============
-
-.. _api_fluid_backward_append_backward:
-
-append_backward
----------------
-
-.. autofunction:: paddle.fluid.backward.append_backward
- :noindex:
-
diff --git a/doc/fluid/api/clip.rst b/doc/fluid/api/clip.rst
deleted file mode 100644
index aeefbb95a46e5d5ed46375e388a720fad2711779..0000000000000000000000000000000000000000
--- a/doc/fluid/api/clip.rst
+++ /dev/null
@@ -1,43 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-==========
-fluid.clip
-==========
-
-.. _api_fluid_clip_ErrorClipByValue:
-
-ErrorClipByValue
-----------------
-
-.. autoclass:: paddle.fluid.clip.ErrorClipByValue
- :members:
- :noindex:
-
-.. _api_fluid_clip_GradientClipByValue:
-
-GradientClipByValue
--------------------
-
-.. autoclass:: paddle.fluid.clip.GradientClipByValue
- :members:
- :noindex:
-
-.. _api_fluid_clip_GradientClipByNorm:
-
-GradientClipByNorm
-------------------
-
-.. autoclass:: paddle.fluid.clip.GradientClipByNorm
- :members:
- :noindex:
-
-.. _api_fluid_clip_GradientClipByGlobalNorm:
-
-GradientClipByGlobalNorm
-------------------------
-
-.. autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm
- :members:
- :noindex:
-
diff --git a/doc/fluid/api/data/data_reader.rst b/doc/fluid/api/data/data_reader.rst
deleted file mode 100644
index 1a35d0bbc8f9d751f49c7e1fc26feb1bcb3ae7f0..0000000000000000000000000000000000000000
--- a/doc/fluid/api/data/data_reader.rst
+++ /dev/null
@@ -1,72 +0,0 @@
-=====================
-Data Reader Interface
-=====================
-
-
-DataTypes
-=========
-
-.. autofunction:: paddle.v2.data_type.dense_array
- :noindex:
-
-.. autofunction:: paddle.v2.data_type.integer_value
- :noindex:
-
-.. autofunction:: paddle.v2.data_type.integer_value_sequence
- :noindex:
-
-.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence
- :noindex:
-
-.. autofunction:: paddle.v2.data_type.sparse_binary_vector
- :noindex:
-
-.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
- :noindex:
-
-.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
- :noindex:
-
-.. autofunction:: paddle.v2.data_type.sparse_float_vector
- :noindex:
-
-.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
- :noindex:
-
-.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
- :noindex:
-
-.. autofunction:: paddle.v2.data_type.sparse_non_value_slot
- :noindex:
-
-.. autofunction:: paddle.v2.data_type.sparse_value_slot
- :noindex:
-
-.. autoclass:: paddle.v2.data_type.InputType
- :members:
- :noindex:
-
-DataFeeder
-==========
-
-.. automodule:: paddle.v2.data_feeder
- :members:
- :noindex:
-
-Reader
-======
-
-.. automodule:: paddle.reader
- :members:
- :noindex:
-
-.. automodule:: paddle.reader.creator
- :members:
- :noindex:
-
-minibatch
-=========
-
-.. automodule:: paddle.v2.minibatch
- :members:
- :noindex:
diff --git a/doc/fluid/api/data/dataset.rst b/doc/fluid/api/data/dataset.rst
deleted file mode 100644
index e7c8be4452bf55e0967d750c2e624e8e316e9330..0000000000000000000000000000000000000000
--- a/doc/fluid/api/data/dataset.rst
+++ /dev/null
@@ -1,82 +0,0 @@
-Dataset
-=======
-
-.. automodule:: paddle.dataset
- :members:
- :noindex:
-
-mnist
-+++++
-
-.. automodule:: paddle.dataset.mnist
- :members:
- :noindex:
-
-cifar
-+++++
-
-.. automodule:: paddle.dataset.cifar
- :members:
- :noindex:
-
-conll05
-+++++++
-
-.. automodule:: paddle.dataset.conll05
- :members: get_dict,get_embedding,test
- :noindex:
-
-imdb
-++++
-
-.. automodule:: paddle.dataset.imdb
- :members:
- :noindex:
-
-imikolov
-++++++++
-
-.. automodule:: paddle.dataset.imikolov
- :members:
- :noindex:
-
-movielens
-+++++++++
-
-.. automodule:: paddle.dataset.movielens
- :members:
- :noindex:
-
-.. autoclass:: paddle.dataset.movielens.MovieInfo
- :noindex:
-
-.. autoclass:: paddle.dataset.movielens.UserInfo
- :noindex:
-
-sentiment
-+++++++++
-
-.. automodule:: paddle.dataset.sentiment
- :members:
- :noindex:
-
-uci_housing
-+++++++++++
-
-.. automodule:: paddle.dataset.uci_housing
- :members:
- :noindex:
-
-wmt14
-+++++
-
-.. automodule:: paddle.dataset.wmt14
- :members:
- :noindex:
-
-wmt16
-+++++
-
-.. automodule:: paddle.dataset.wmt16
- :members:
- :noindex:
diff --git a/doc/fluid/api/data/image.rst b/doc/fluid/api/data/image.rst
deleted file mode 100644
index 97651ffa6be56cf3ecaca2caca38a353fa5c1f49..0000000000000000000000000000000000000000
--- a/doc/fluid/api/data/image.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-Image Interface
-===============
-
-.. automodule:: paddle.v2.image
- :members:
diff --git a/doc/fluid/api/data_feeder.rst b/doc/fluid/api/data_feeder.rst
deleted file mode 100644
index 11d2890f5b3446e37c3ef31e5a17ebebe169dbc8..0000000000000000000000000000000000000000
--- a/doc/fluid/api/data_feeder.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-=================
-fluid.data_feeder
-=================
-
-.. _api_fluid_data_feeder_DataFeeder:
-
-DataFeeder
-----------
-
-.. autoclass:: paddle.fluid.data_feeder.DataFeeder
- :members:
- :noindex:
-
diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst
deleted file mode 100644
index f23ecc1f80030f20359ce9675130a167722606c9..0000000000000000000000000000000000000000
--- a/doc/fluid/api/executor.rst
+++ /dev/null
@@ -1,40 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-==============
-fluid.executor
-==============
-
-.. _api_fluid_executor_Executor:
-
-Executor
---------
-
-.. autoclass:: paddle.fluid.executor.Executor
- :members:
- :noindex:
-
-.. _api_fluid_executor_global_scope:
-
-global_scope
-------------
-
-.. autofunction:: paddle.fluid.executor.global_scope
- :noindex:
-
-.. _api_fluid_executor_scope_guard:
-
-scope_guard
------------
-
-.. autofunction:: paddle.fluid.executor.scope_guard
- :noindex:
-
-.. _api_fluid_executor__switch_scope:
-
-_switch_scope
--------------
-
-.. autofunction:: paddle.fluid.executor._switch_scope
- :noindex:
-
diff --git a/doc/fluid/api/fluid.rst b/doc/fluid/api/fluid.rst
deleted file mode 100644
index f76c7aab7be0b9703642bbf9de26cc298c849fb3..0000000000000000000000000000000000000000
--- a/doc/fluid/api/fluid.rst
+++ /dev/null
@@ -1,338 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-=====
-fluid
-=====
-
-.. _api_fluid_Program:
-
-Program
--------
-
-.. autoclass:: paddle.fluid.Program
- :members:
- :noindex:
-
-.. _api_fluid_Operator:
-
-Operator
---------
-
-.. autoclass:: paddle.fluid.Operator
- :members:
- :noindex:
-
-.. _api_fluid_Parameter:
-
-Parameter
----------
-
-.. autoclass:: paddle.fluid.Parameter
- :members:
- :noindex:
-
-.. _api_fluid_default_startup_program:
-
-default_startup_program
------------------------
-
-.. autofunction:: paddle.fluid.default_startup_program
- :noindex:
-
-.. _api_fluid_default_main_program:
-
-default_main_program
---------------------
-
-.. autofunction:: paddle.fluid.default_main_program
- :noindex:
-
-.. _api_fluid_program_guard:
-
-program_guard
--------------
-
-.. autofunction:: paddle.fluid.program_guard
- :noindex:
-
-.. _api_fluid_get_var:
-
-get_var
--------
-
-.. autofunction:: paddle.fluid.get_var
- :noindex:
-
-.. _api_fluid_Executor:
-
-Executor
---------
-
-.. autoclass:: paddle.fluid.Executor
- :members:
- :noindex:
-
-.. _api_fluid_global_scope:
-
-global_scope
-------------
-
-.. autofunction:: paddle.fluid.global_scope
- :noindex:
-
-.. _api_fluid_scope_guard:
-
-scope_guard
------------
-
-.. autofunction:: paddle.fluid.scope_guard
- :noindex:
-
-.. _api_fluid__switch_scope:
-
-_switch_scope
--------------
-
-.. autofunction:: paddle.fluid._switch_scope
- :noindex:
-
-.. _api_fluid_Trainer:
-
-Trainer
--------
-
-.. autoclass:: paddle.fluid.Trainer
- :members:
- :noindex:
-
-.. _api_fluid_BeginEpochEvent:
-
-BeginEpochEvent
----------------
-
-.. autoclass:: paddle.fluid.BeginEpochEvent
- :members:
- :noindex:
-
-.. _api_fluid_EndEpochEvent:
-
-EndEpochEvent
--------------
-
-.. autoclass:: paddle.fluid.EndEpochEvent
- :members:
- :noindex:
-
-.. _api_fluid_BeginStepEvent:
-
-BeginStepEvent
---------------
-
-.. autoclass:: paddle.fluid.BeginStepEvent
- :members:
- :noindex:
-
-.. _api_fluid_EndStepEvent:
-
-EndStepEvent
-------------
-
-.. autoclass:: paddle.fluid.EndStepEvent
- :members:
- :noindex:
-
-.. _api_fluid_CheckpointConfig:
-
-CheckpointConfig
-----------------
-
-.. autoclass:: paddle.fluid.CheckpointConfig
- :members:
- :noindex:
-
-.. _api_fluid_Inferencer:
-
-Inferencer
-----------
-
-.. autoclass:: paddle.fluid.Inferencer
- :members:
- :noindex:
-
-.. _api_fluid_DistributeTranspiler:
-
-DistributeTranspiler
---------------------
-
-.. autoclass:: paddle.fluid.DistributeTranspiler
- :members:
- :noindex:
-
-.. _api_fluid_InferenceTranspiler:
-
-InferenceTranspiler
--------------------
-
-.. autoclass:: paddle.fluid.InferenceTranspiler
- :members:
- :noindex:
-
-.. _api_fluid_memory_optimize:
-
-memory_optimize
----------------
-
-.. autofunction:: paddle.fluid.memory_optimize
- :noindex:
-
-.. _api_fluid_release_memory:
-
-release_memory
---------------
-
-.. autofunction:: paddle.fluid.release_memory
- :noindex:
-
-.. _api_fluid_DistributeTranspilerConfig:
-
-DistributeTranspilerConfig
---------------------------
-
-.. autoclass:: paddle.fluid.DistributeTranspilerConfig
- :members:
- :noindex:
-
-.. _api_fluid_ParallelExecutor:
-
-ParallelExecutor
-----------------
-
-.. autoclass:: paddle.fluid.ParallelExecutor
- :members:
- :noindex:
-
-.. _api_fluid_ExecutionStrategy:
-
-ExecutionStrategy
------------------
-
-.. autoclass:: paddle.fluid.ExecutionStrategy
- :members:
- :noindex:
-
-.. _api_fluid_BuildStrategy:
-
-BuildStrategy
--------------
-
-.. autoclass:: paddle.fluid.BuildStrategy
- :members:
- :noindex:
-
-.. _api_fluid_create_lod_tensor:
-
-create_lod_tensor
------------------
-
-.. autofunction:: paddle.fluid.create_lod_tensor
- :noindex:
-
-.. _api_fluid_create_random_int_lodtensor:
-
-create_random_int_lodtensor
----------------------------
-
-.. autofunction:: paddle.fluid.create_random_int_lodtensor
- :noindex:
-
-.. _api_fluid_LoDTensor:
-
-LoDTensor
----------
-
-.. autoclass:: paddle.fluid.LoDTensor
- :members:
- :noindex:
-
-.. _api_fluid_LoDTensorArray:
-
-LoDTensorArray
---------------
-
-.. autoclass:: paddle.fluid.LoDTensorArray
- :members:
- :noindex:
-
-.. _api_fluid_CPUPlace:
-
-CPUPlace
---------
-
-.. autoclass:: paddle.fluid.CPUPlace
- :members:
- :noindex:
-
-.. _api_fluid_CUDAPlace:
-
-CUDAPlace
----------
-
-.. autoclass:: paddle.fluid.CUDAPlace
- :members:
- :noindex:
-
-.. _api_fluid_CUDAPinnedPlace:
-
-CUDAPinnedPlace
----------------
-
-.. autoclass:: paddle.fluid.CUDAPinnedPlace
- :members:
- :noindex:
-
-.. _api_fluid_Tensor:
-
-Tensor
-------
-
-.. autoclass:: paddle.fluid.Tensor
- :members:
- :noindex:
-
-.. _api_fluid_ParamAttr:
-
-ParamAttr
----------
-
-.. autoclass:: paddle.fluid.ParamAttr
- :members:
- :noindex:
-
-.. _api_fluid_WeightNormParamAttr:
-
-WeightNormParamAttr
--------------------
-
-.. autoclass:: paddle.fluid.WeightNormParamAttr
- :members:
- :noindex:
-
-.. _api_fluid_DataFeeder:
-
-DataFeeder
-----------
-
-.. autoclass:: paddle.fluid.DataFeeder
- :members:
- :noindex:
-
-.. _api_fluid_Scope:
-
-Scope
------
-
-.. autoclass:: paddle.fluid.Scope
- :members:
- :noindex:
-
diff --git a/doc/fluid/api/gen_doc.py b/doc/fluid/api/gen_doc.py
deleted file mode 100644
index 02efce2bf8392c62a7600c272bedcadc6563f927..0000000000000000000000000000000000000000
--- a/doc/fluid/api/gen_doc.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import argparse
-import sys
-import types
-
-import paddle.fluid as fluid
-
-
-def parse_arg():
- parser = argparse.ArgumentParser()
- parser.add_argument('--submodules', nargs="*")
- parser.add_argument(
- 'module', type=str, help='Generate the documentation of which module')
- return parser.parse_args()
-
-
-class DocGenerator(object):
- def __init__(self, module_name=None, stream=sys.stdout):
- if module_name == "":
- module_name = None
- self.stream = stream
- if module_name is None:
- self.module_name = "fluid"
- else:
- self.module_name = "fluid." + module_name
- if module_name is None:
- self.module = fluid
- else:
- if not hasattr(fluid, module_name):
- raise ValueError("Cannot find fluid.{0}".format(module_name))
- else:
- self.module = getattr(fluid, module_name)
- self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-''')
-
- self._print_header_(self.module_name, dot='=', is_title=True)
-
- def print_submodule(self, submodule_name):
- submodule = getattr(self.module, submodule_name)
- if submodule is None:
- raise ValueError("Cannot find submodule {0}".format(submodule_name))
- self.print_section(submodule_name)
-
- for item in submodule.__all__:
- self.print_item(item)
-
- def print_current_module(self):
- for item in self.module.__all__:
- self.print_item(item)
-
- def print_section(self, name):
- self._print_header_(name, dot='=', is_title=False)
-
- def print_item(self, name):
- item = getattr(self.module, name, None)
- if item is None:
- return
- if isinstance(item, types.TypeType):
- self.print_class(name)
- elif isinstance(item, types.FunctionType):
- self.print_method(name)
- else:
- pass
-
- def print_class(self, name):
- self._print_ref_(name)
- self._print_header_(name, dot='-', is_title=False)
- self.stream.write('''.. autoclass:: paddle.{0}.{1}
- :members:
- :noindex:
-
-'''.format(self.module_name, name))
-
- def print_method(self, name):
- self._print_ref_(name)
- self._print_header_(name, dot='-', is_title=False)
- self.stream.write('''.. autofunction:: paddle.{0}.{1}
- :noindex:
-
-'''.format(self.module_name, name))
-
- def _print_header_(self, name, dot, is_title):
- dot_line = dot * len(name)
- if is_title:
- self.stream.write(dot_line)
- self.stream.write('\n')
- self.stream.write(name)
- self.stream.write('\n')
- self.stream.write(dot_line)
- self.stream.write('\n')
- self.stream.write('\n')
-
- def _print_ref_(self, name):
- self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join(
- self.module_name.split(".")), name))
-
-
-def main():
- args = parse_arg()
- gen = DocGenerator(args.module)
- if args.submodules is None:
- gen.print_current_module()
- else:
- for submodule_name in args.submodules:
- gen.print_submodule(submodule_name)
-
-
-if __name__ == '__main__':
- main()
diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh
deleted file mode 100755
index b14ee29873c50fd011f6c48b754767ac8918252a..0000000000000000000000000000000000000000
--- a/doc/fluid/api/gen_doc.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op tensor > layers.rst
-
-for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average profiler
-do
- python gen_doc.py ${module} > ${module}.rst
-done
-
-python gen_doc.py "" > fluid.rst
diff --git a/doc/fluid/api/index_en.rst b/doc/fluid/api/index_en.rst
deleted file mode 100644
index 359406819a993e7eaf2155c839373df44d97b103..0000000000000000000000000000000000000000
--- a/doc/fluid/api/index_en.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-=============
-API Reference
-=============
-
-.. toctree::
- :maxdepth: 1
-
- fluid.rst
- layers.rst
- data_feeder.rst
- executor.rst
- initializer.rst
- metrics.rst
- nets.rst
- clip.rst
- optimizer.rst
- param_attr.rst
- profiler.rst
- regularizer.rst
- io.rst
- data.rst
- transpiler.rst
- recordio_writer.rst
- backward.rst
- average.rst
- profiler.rst
diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst
deleted file mode 100644
index dc0b52b14fd242dfaded1cb9a8e0ab9eb66b0607..0000000000000000000000000000000000000000
--- a/doc/fluid/api/initializer.rst
+++ /dev/null
@@ -1,131 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-=================
-fluid.initializer
-=================
-
-.. _api_fluid_initializer_Constant:
-
-Constant
---------
-
-.. autoclass:: paddle.fluid.initializer.Constant
- :members:
- :noindex:
-
-.. _api_fluid_initializer_Uniform:
-
-Uniform
--------
-
-.. autoclass:: paddle.fluid.initializer.Uniform
- :members:
- :noindex:
-
-.. _api_fluid_initializer_Normal:
-
-Normal
-------
-
-.. autoclass:: paddle.fluid.initializer.Normal
- :members:
- :noindex:
-
-.. _api_fluid_initializer_Xavier:
-
-Xavier
-------
-
-.. autoclass:: paddle.fluid.initializer.Xavier
- :members:
- :noindex:
-
-.. _api_fluid_initializer_Bilinear:
-
-Bilinear
---------
-
-.. autoclass:: paddle.fluid.initializer.Bilinear
- :members:
- :noindex:
-
-.. _api_fluid_initializer_MSRA:
-
-MSRA
-----
-
-.. autoclass:: paddle.fluid.initializer.MSRA
- :members:
- :noindex:
-
-.. _api_fluid_initializer_force_init_on_cpu:
-
-force_init_on_cpu
------------------
-
-.. autofunction:: paddle.fluid.initializer.force_init_on_cpu
- :noindex:
-
-.. _api_fluid_initializer_init_on_cpu:
-
-init_on_cpu
------------
-
-.. autofunction:: paddle.fluid.initializer.init_on_cpu
- :noindex:
-
-.. _api_fluid_initializer_ConstantInitializer:
-
-ConstantInitializer
--------------------
-
-.. autoclass:: paddle.fluid.initializer.ConstantInitializer
- :members:
- :noindex:
-
-.. _api_fluid_initializer_UniformInitializer:
-
-UniformInitializer
-------------------
-
-.. autoclass:: paddle.fluid.initializer.UniformInitializer
- :members:
- :noindex:
-
-.. _api_fluid_initializer_NormalInitializer:
-
-NormalInitializer
------------------
-
-.. autoclass:: paddle.fluid.initializer.NormalInitializer
- :members:
- :noindex:
-
-.. _api_fluid_initializer_XavierInitializer:
-
-XavierInitializer
------------------
-
-.. autoclass:: paddle.fluid.initializer.XavierInitializer
- :members:
- :noindex:
-
-.. _api_fluid_initializer_BilinearInitializer:
-
-BilinearInitializer
--------------------
-
-.. autoclass:: paddle.fluid.initializer.BilinearInitializer
- :members:
- :noindex:
-
-.. _api_fluid_initializer_MSRAInitializer:
-
-MSRAInitializer
----------------
-
-.. autoclass:: paddle.fluid.initializer.MSRAInitializer
- :members:
- :noindex:
-
diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst
deleted file mode 100644
index a31930af8552a0fea51235f5e44d39e44d42d7f9..0000000000000000000000000000000000000000
--- a/doc/fluid/api/io.rst
+++ /dev/null
@@ -1,79 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-========
-fluid.io
-========
-
-.. _api_fluid_io_save_vars:
-
-save_vars
----------
-
-.. autofunction:: paddle.fluid.io.save_vars
- :noindex:
-
-.. _api_fluid_io_save_params:
-
-save_params
------------
-
-.. autofunction:: paddle.fluid.io.save_params
- :noindex:
-
-.. _api_fluid_io_save_persistables:
-
-save_persistables
------------------
-
-.. autofunction:: paddle.fluid.io.save_persistables
- :noindex:
-
-.. _api_fluid_io_load_vars:
-
-load_vars
----------
-
-.. autofunction:: paddle.fluid.io.load_vars
- :noindex:
-
-.. _api_fluid_io_load_params:
-
-load_params
------------
-
-.. autofunction:: paddle.fluid.io.load_params
- :noindex:
-
-.. _api_fluid_io_load_persistables:
-
-load_persistables
------------------
-
-.. autofunction:: paddle.fluid.io.load_persistables
- :noindex:
-
-.. _api_fluid_io_save_inference_model:
-
-save_inference_model
---------------------
-
-.. autofunction:: paddle.fluid.io.save_inference_model
- :noindex:
-
-.. _api_fluid_io_load_inference_model:
-
-load_inference_model
---------------------
-
-.. autofunction:: paddle.fluid.io.load_inference_model
- :noindex:
-
-.. _api_fluid_io_get_inference_program:
-
-get_inference_program
----------------------
-
-.. autofunction:: paddle.fluid.io.get_inference_program
- :noindex:
-
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
deleted file mode 100644
index ed0777c6ff82d58e174c12c0c6bc1c716b6e7a59..0000000000000000000000000000000000000000
--- a/doc/fluid/api/layers.rst
+++ /dev/null
@@ -1,1700 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-============
-fluid.layers
-============
-
-control_flow
-============
-
-.. _api_fluid_layers_While:
-
-While
------
-
-.. autoclass:: paddle.fluid.layers.While
- :members:
- :noindex:
-
-.. _api_fluid_layers_Switch:
-
-Switch
-------
-
-.. autoclass:: paddle.fluid.layers.Switch
- :members:
- :noindex:
-
-.. _api_fluid_layers_increment:
-
-increment
----------
-
-.. autofunction:: paddle.fluid.layers.increment
- :noindex:
-
-.. _api_fluid_layers_array_write:
-
-array_write
------------
-
-.. autofunction:: paddle.fluid.layers.array_write
- :noindex:
-
-.. _api_fluid_layers_create_array:
-
-create_array
-------------
-
-.. autofunction:: paddle.fluid.layers.create_array
- :noindex:
-
-.. _api_fluid_layers_less_than:
-
-less_than
----------
-
-.. autofunction:: paddle.fluid.layers.less_than
- :noindex:
-
-.. _api_fluid_layers_equal:
-
-equal
------
-
-.. autofunction:: paddle.fluid.layers.equal
- :noindex:
-
-.. _api_fluid_layers_array_read:
-
-array_read
-----------
-
-.. autofunction:: paddle.fluid.layers.array_read
- :noindex:
-
-.. _api_fluid_layers_array_length:
-
-array_length
-------------
-
-.. autofunction:: paddle.fluid.layers.array_length
- :noindex:
-
-.. _api_fluid_layers_IfElse:
-
-IfElse
-------
-
-.. autoclass:: paddle.fluid.layers.IfElse
- :members:
- :noindex:
-
-.. _api_fluid_layers_DynamicRNN:
-
-DynamicRNN
-----------
-
-.. autoclass:: paddle.fluid.layers.DynamicRNN
- :members:
- :noindex:
-
-.. _api_fluid_layers_StaticRNN:
-
-StaticRNN
----------
-
-.. autoclass:: paddle.fluid.layers.StaticRNN
- :members:
- :noindex:
-
-.. _api_fluid_layers_reorder_lod_tensor_by_rank:
-
-reorder_lod_tensor_by_rank
---------------------------
-
-.. autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank
- :noindex:
-
-.. _api_fluid_layers_ParallelDo:
-
-ParallelDo
-----------
-
-.. autoclass:: paddle.fluid.layers.ParallelDo
- :members:
- :noindex:
-
-.. _api_fluid_layers_Print:
-
-Print
------
-
-.. autofunction:: paddle.fluid.layers.Print
- :noindex:
-
-.. _api_fluid_layers_is_empty:
-
-is_empty
---------
-
-.. autofunction:: paddle.fluid.layers.is_empty
- :noindex:
-
-device
-======
-
-io
-==
-
-.. _api_fluid_layers_data:
-
-data
-----
-
-.. autofunction:: paddle.fluid.layers.data
- :noindex:
-
-.. _api_fluid_layers_open_recordio_file:
-
-open_recordio_file
-------------------
-
-.. autofunction:: paddle.fluid.layers.open_recordio_file
- :noindex:
-
-.. _api_fluid_layers_open_files:
-
-open_files
-----------
-
-.. autofunction:: paddle.fluid.layers.open_files
- :noindex:
-
-.. _api_fluid_layers_read_file:
-
-read_file
----------
-
-.. autofunction:: paddle.fluid.layers.read_file
- :noindex:
-
-.. _api_fluid_layers_shuffle:
-
-shuffle
--------
-
-.. autofunction:: paddle.fluid.layers.shuffle
- :noindex:
-
-.. _api_fluid_layers_batch:
-
-batch
------
-
-.. autofunction:: paddle.fluid.layers.batch
- :noindex:
-
-.. _api_fluid_layers_double_buffer:
-
-double_buffer
--------------
-
-.. autofunction:: paddle.fluid.layers.double_buffer
- :noindex:
-
-.. _api_fluid_layers_random_data_generator:
-
-random_data_generator
----------------------
-
-.. autofunction:: paddle.fluid.layers.random_data_generator
- :noindex:
-
-.. _api_fluid_layers_py_reader:
-
-py_reader
----------
-
-.. autofunction:: paddle.fluid.layers.py_reader
- :noindex:
-
-.. _api_fluid_layers_Preprocessor:
-
-Preprocessor
-------------
-
-.. autoclass:: paddle.fluid.layers.Preprocessor
- :members:
- :noindex:
-
-.. _api_fluid_layers_load:
-
-load
-----
-
-.. autofunction:: paddle.fluid.layers.load
- :noindex:
-
-nn
-==
-
-.. _api_fluid_layers_fc:
-
-fc
---
-
-.. autofunction:: paddle.fluid.layers.fc
- :noindex:
-
-.. _api_fluid_layers_embedding:
-
-embedding
----------
-
-.. autofunction:: paddle.fluid.layers.embedding
- :noindex:
-
-.. _api_fluid_layers_dynamic_lstm:
-
-dynamic_lstm
-------------
-
-.. autofunction:: paddle.fluid.layers.dynamic_lstm
- :noindex:
-
-.. _api_fluid_layers_dynamic_lstmp:
-
-dynamic_lstmp
--------------
-
-.. autofunction:: paddle.fluid.layers.dynamic_lstmp
- :noindex:
-
-.. _api_fluid_layers_dynamic_gru:
-
-dynamic_gru
------------
-
-.. autofunction:: paddle.fluid.layers.dynamic_gru
- :noindex:
-
-.. _api_fluid_layers_gru_unit:
-
-gru_unit
---------
-
-.. autofunction:: paddle.fluid.layers.gru_unit
- :noindex:
-
-.. _api_fluid_layers_linear_chain_crf:
-
-linear_chain_crf
-----------------
-
-.. autofunction:: paddle.fluid.layers.linear_chain_crf
- :noindex:
-
-.. _api_fluid_layers_crf_decoding:
-
-crf_decoding
-------------
-
-.. autofunction:: paddle.fluid.layers.crf_decoding
- :noindex:
-
-.. _api_fluid_layers_cos_sim:
-
-cos_sim
--------
-
-.. autofunction:: paddle.fluid.layers.cos_sim
- :noindex:
-
-.. _api_fluid_layers_cross_entropy:
-
-cross_entropy
--------------
-
-.. autofunction:: paddle.fluid.layers.cross_entropy
- :noindex:
-
-.. _api_fluid_layers_square_error_cost:
-
-square_error_cost
------------------
-
-.. autofunction:: paddle.fluid.layers.square_error_cost
- :noindex:
-
-.. _api_fluid_layers_chunk_eval:
-
-chunk_eval
-----------
-
-.. autofunction:: paddle.fluid.layers.chunk_eval
- :noindex:
-
-.. _api_fluid_layers_sequence_conv:
-
-sequence_conv
--------------
-
-.. autofunction:: paddle.fluid.layers.sequence_conv
- :noindex:
-
-.. _api_fluid_layers_conv2d:
-
-conv2d
-------
-
-.. autofunction:: paddle.fluid.layers.conv2d
- :noindex:
-
-.. _api_fluid_layers_conv3d:
-
-conv3d
-------
-
-.. autofunction:: paddle.fluid.layers.conv3d
- :noindex:
-
-.. _api_fluid_layers_sequence_pool:
-
-sequence_pool
--------------
-
-.. autofunction:: paddle.fluid.layers.sequence_pool
- :noindex:
-
-.. _api_fluid_layers_sequence_softmax:
-
-sequence_softmax
-----------------
-
-.. autofunction:: paddle.fluid.layers.sequence_softmax
- :noindex:
-
-.. _api_fluid_layers_softmax:
-
-softmax
--------
-
-.. autofunction:: paddle.fluid.layers.softmax
- :noindex:
-
-.. _api_fluid_layers_pool2d:
-
-pool2d
-------
-
-.. autofunction:: paddle.fluid.layers.pool2d
- :noindex:
-
-.. _api_fluid_layers_pool3d:
-
-pool3d
-------
-
-.. autofunction:: paddle.fluid.layers.pool3d
- :noindex:
-
-.. _api_fluid_layers_batch_norm:
-
-batch_norm
-----------
-
-.. autofunction:: paddle.fluid.layers.batch_norm
- :noindex:
-
-.. _api_fluid_layers_beam_search_decode:
-
-beam_search_decode
-------------------
-
-.. autofunction:: paddle.fluid.layers.beam_search_decode
- :noindex:
-
-.. _api_fluid_layers_conv2d_transpose:
-
-conv2d_transpose
-----------------
-
-.. autofunction:: paddle.fluid.layers.conv2d_transpose
- :noindex:
-
-.. _api_fluid_layers_conv3d_transpose:
-
-conv3d_transpose
-----------------
-
-.. autofunction:: paddle.fluid.layers.conv3d_transpose
- :noindex:
-
-.. _api_fluid_layers_sequence_expand:
-
-sequence_expand
----------------
-
-.. autofunction:: paddle.fluid.layers.sequence_expand
- :noindex:
-
-.. _api_fluid_layers_sequence_pad:
-
-sequence_pad
-------------
-
-.. autofunction:: paddle.fluid.layers.sequence_pad
- :noindex:
-
-.. _api_fluid_layers_lstm_unit:
-
-lstm_unit
----------
-
-.. autofunction:: paddle.fluid.layers.lstm_unit
- :noindex:
-
-.. _api_fluid_layers_reduce_sum:
-
-reduce_sum
-----------
-
-.. autofunction:: paddle.fluid.layers.reduce_sum
- :noindex:
-
-.. _api_fluid_layers_reduce_mean:
-
-reduce_mean
------------
-
-.. autofunction:: paddle.fluid.layers.reduce_mean
- :noindex:
-
-.. _api_fluid_layers_reduce_max:
-
-reduce_max
-----------
-
-.. autofunction:: paddle.fluid.layers.reduce_max
- :noindex:
-
-.. _api_fluid_layers_reduce_min:
-
-reduce_min
-----------
-
-.. autofunction:: paddle.fluid.layers.reduce_min
- :noindex:
-
-.. _api_fluid_layers_reduce_prod:
-
-reduce_prod
------------
-
-.. autofunction:: paddle.fluid.layers.reduce_prod
- :noindex:
-
-.. _api_fluid_layers_sequence_first_step:
-
-sequence_first_step
--------------------
-
-.. autofunction:: paddle.fluid.layers.sequence_first_step
- :noindex:
-
-.. _api_fluid_layers_sequence_last_step:
-
-sequence_last_step
-------------------
-
-.. autofunction:: paddle.fluid.layers.sequence_last_step
- :noindex:
-
-.. _api_fluid_layers_dropout:
-
-dropout
--------
-
-.. autofunction:: paddle.fluid.layers.dropout
- :noindex:
-
-.. _api_fluid_layers_split:
-
-split
------
-
-.. autofunction:: paddle.fluid.layers.split
- :noindex:
-
-.. _api_fluid_layers_ctc_greedy_decoder:
-
-ctc_greedy_decoder
-------------------
-
-.. autofunction:: paddle.fluid.layers.ctc_greedy_decoder
- :noindex:
-
-.. _api_fluid_layers_edit_distance:
-
-edit_distance
--------------
-
-.. autofunction:: paddle.fluid.layers.edit_distance
- :noindex:
-
-.. _api_fluid_layers_l2_normalize:
-
-l2_normalize
-------------
-
-.. autofunction:: paddle.fluid.layers.l2_normalize
- :noindex:
-
-.. _api_fluid_layers_matmul:
-
-matmul
-------
-
-.. autofunction:: paddle.fluid.layers.matmul
- :noindex:
-
-.. _api_fluid_layers_topk:
-
-topk
-----
-
-.. autofunction:: paddle.fluid.layers.topk
- :noindex:
-
-.. _api_fluid_layers_warpctc:
-
-warpctc
--------
-
-.. autofunction:: paddle.fluid.layers.warpctc
- :noindex:
-
-.. _api_fluid_layers_sequence_reshape:
-
-sequence_reshape
-----------------
-
-.. autofunction:: paddle.fluid.layers.sequence_reshape
- :noindex:
-
-.. _api_fluid_layers_transpose:
-
-transpose
----------
-
-.. autofunction:: paddle.fluid.layers.transpose
- :noindex:
-
-.. _api_fluid_layers_im2sequence:
-
-im2sequence
------------
-
-.. autofunction:: paddle.fluid.layers.im2sequence
- :noindex:
-
-.. _api_fluid_layers_nce:
-
-nce
----
-
-.. autofunction:: paddle.fluid.layers.nce
- :noindex:
-
-.. _api_fluid_layers_hsigmoid:
-
-hsigmoid
---------
-
-.. autofunction:: paddle.fluid.layers.hsigmoid
- :noindex:
-
-.. _api_fluid_layers_beam_search:
-
-beam_search
------------
-
-.. autofunction:: paddle.fluid.layers.beam_search
- :noindex:
-
-.. _api_fluid_layers_row_conv:
-
-row_conv
---------
-
-.. autofunction:: paddle.fluid.layers.row_conv
- :noindex:
-
-.. _api_fluid_layers_multiplex:
-
-multiplex
----------
-
-.. autofunction:: paddle.fluid.layers.multiplex
- :noindex:
-
-.. _api_fluid_layers_layer_norm:
-
-layer_norm
-----------
-
-.. autofunction:: paddle.fluid.layers.layer_norm
- :noindex:
-
-.. _api_fluid_layers_softmax_with_cross_entropy:
-
-softmax_with_cross_entropy
---------------------------
-
-.. autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
- :noindex:
-
-.. _api_fluid_layers_smooth_l1:
-
-smooth_l1
----------
-
-.. autofunction:: paddle.fluid.layers.smooth_l1
- :noindex:
-
-.. _api_fluid_layers_one_hot:
-
-one_hot
--------
-
-.. autofunction:: paddle.fluid.layers.one_hot
- :noindex:
-
-.. _api_fluid_layers_autoincreased_step_counter:
-
-autoincreased_step_counter
---------------------------
-
-.. autofunction:: paddle.fluid.layers.autoincreased_step_counter
- :noindex:
-
-.. _api_fluid_layers_reshape:
-
-reshape
--------
-
-.. autofunction:: paddle.fluid.layers.reshape
- :noindex:
-
-.. _api_fluid_layers_lod_reset:
-
-lod_reset
----------
-
-.. autofunction:: paddle.fluid.layers.lod_reset
- :noindex:
-
-.. _api_fluid_layers_lrn:
-
-lrn
----
-
-.. autofunction:: paddle.fluid.layers.lrn
- :noindex:
-
-.. _api_fluid_layers_pad:
-
-pad
----
-
-.. autofunction:: paddle.fluid.layers.pad
- :noindex:
-
-.. _api_fluid_layers_label_smooth:
-
-label_smooth
-------------
-
-.. autofunction:: paddle.fluid.layers.label_smooth
- :noindex:
-
-.. _api_fluid_layers_roi_pool:
-
-roi_pool
---------
-
-.. autofunction:: paddle.fluid.layers.roi_pool
- :noindex:
-
-.. _api_fluid_layers_dice_loss:
-
-dice_loss
----------
-
-.. autofunction:: paddle.fluid.layers.dice_loss
- :noindex:
-
-.. _api_fluid_layers_image_resize:
-
-image_resize
-------------
-
-.. autofunction:: paddle.fluid.layers.image_resize
- :noindex:
-
-.. _api_fluid_layers_image_resize_short:
-
-image_resize_short
-------------------
-
-.. autofunction:: paddle.fluid.layers.image_resize_short
- :noindex:
-
-.. _api_fluid_layers_resize_bilinear:
-
-resize_bilinear
----------------
-
-.. autofunction:: paddle.fluid.layers.resize_bilinear
- :noindex:
-
-.. _api_fluid_layers_gather:
-
-gather
-------
-
-.. autofunction:: paddle.fluid.layers.gather
- :noindex:
-
-.. _api_fluid_layers_random_crop:
-
-random_crop
------------
-
-.. autofunction:: paddle.fluid.layers.random_crop
- :noindex:
-
-.. _api_fluid_layers_mean_iou:
-
-mean_iou
---------
-
-.. autofunction:: paddle.fluid.layers.mean_iou
- :noindex:
-
-.. _api_fluid_layers_relu:
-
-relu
-----
-
-.. autofunction:: paddle.fluid.layers.relu
- :noindex:
-
-.. _api_fluid_layers_log:
-
-log
----
-
-.. autofunction:: paddle.fluid.layers.log
- :noindex:
-
-.. _api_fluid_layers_crop:
-
-crop
-----
-
-.. autofunction:: paddle.fluid.layers.crop
- :noindex:
-
-.. _api_fluid_layers_rank_loss:
-
-rank_loss
----------
-
-.. autofunction:: paddle.fluid.layers.rank_loss
- :noindex:
-
-.. _api_fluid_layers_prelu:
-
-prelu
------
-
-.. autofunction:: paddle.fluid.layers.prelu
- :noindex:
-
-.. _api_fluid_layers_flatten:
-
-flatten
--------
-
-.. autofunction:: paddle.fluid.layers.flatten
- :noindex:
-
-ops
-===
-
-.. _api_fluid_layers_mean:
-
-mean
-----
-
-.. autofunction:: paddle.fluid.layers.mean
- :noindex:
-
-.. _api_fluid_layers_mul:
-
-mul
----
-
-.. autofunction:: paddle.fluid.layers.mul
- :noindex:
-
-.. _api_fluid_layers_scale:
-
-scale
------
-
-.. autofunction:: paddle.fluid.layers.scale
- :noindex:
-
-.. _api_fluid_layers_sigmoid_cross_entropy_with_logits:
-
-sigmoid_cross_entropy_with_logits
----------------------------------
-
-.. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
- :noindex:
-
-.. _api_fluid_layers_elementwise_add:
-
-elementwise_add
----------------
-
-.. autofunction:: paddle.fluid.layers.elementwise_add
- :noindex:
-
-.. _api_fluid_layers_elementwise_div:
-
-elementwise_div
----------------
-
-.. autofunction:: paddle.fluid.layers.elementwise_div
- :noindex:
-
-.. _api_fluid_layers_elementwise_sub:
-
-elementwise_sub
----------------
-
-.. autofunction:: paddle.fluid.layers.elementwise_sub
- :noindex:
-
-.. _api_fluid_layers_elementwise_mul:
-
-elementwise_mul
----------------
-
-.. autofunction:: paddle.fluid.layers.elementwise_mul
- :noindex:
-
-.. _api_fluid_layers_elementwise_max:
-
-elementwise_max
----------------
-
-.. autofunction:: paddle.fluid.layers.elementwise_max
- :noindex:
-
-.. _api_fluid_layers_elementwise_min:
-
-elementwise_min
----------------
-
-.. autofunction:: paddle.fluid.layers.elementwise_min
- :noindex:
-
-.. _api_fluid_layers_elementwise_pow:
-
-elementwise_pow
----------------
-
-.. autofunction:: paddle.fluid.layers.elementwise_pow
- :noindex:
-
-.. _api_fluid_layers_clip:
-
-clip
-----
-
-.. autofunction:: paddle.fluid.layers.clip
- :noindex:
-
-.. _api_fluid_layers_clip_by_norm:
-
-clip_by_norm
-------------
-
-.. autofunction:: paddle.fluid.layers.clip_by_norm
- :noindex:
-
-.. _api_fluid_layers_logical_and:
-
-logical_and
------------
-
-.. autofunction:: paddle.fluid.layers.logical_and
- :noindex:
-
-.. _api_fluid_layers_logical_or:
-
-logical_or
-----------
-
-.. autofunction:: paddle.fluid.layers.logical_or
- :noindex:
-
-.. _api_fluid_layers_logical_xor:
-
-logical_xor
------------
-
-.. autofunction:: paddle.fluid.layers.logical_xor
- :noindex:
-
-.. _api_fluid_layers_logical_not:
-
-logical_not
------------
-
-.. autofunction:: paddle.fluid.layers.logical_not
- :noindex:
-
-.. _api_fluid_layers_uniform_random_batch_size_like:
-
-uniform_random_batch_size_like
-------------------------------
-
-.. autofunction:: paddle.fluid.layers.uniform_random_batch_size_like
- :noindex:
-
-.. _api_fluid_layers_gaussian_random:
-
-gaussian_random
----------------
-
-.. autofunction:: paddle.fluid.layers.gaussian_random
- :noindex:
-
-.. _api_fluid_layers_gaussian_random_batch_size_like:
-
-gaussian_random_batch_size_like
--------------------------------
-
-.. autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
- :noindex:
-
-.. _api_fluid_layers_scatter:
-
-scatter
--------
-
-.. autofunction:: paddle.fluid.layers.scatter
- :noindex:
-
-.. _api_fluid_layers_sum:
-
-sum
----
-
-.. autofunction:: paddle.fluid.layers.sum
- :noindex:
-
-.. _api_fluid_layers_slice:
-
-slice
------
-
-.. autofunction:: paddle.fluid.layers.slice
- :noindex:
-
-.. _api_fluid_layers_shape:
-
-shape
------
-
-.. autofunction:: paddle.fluid.layers.shape
- :noindex:
-
-.. _api_fluid_layers_maxout:
-
-maxout
-------
-
-.. autofunction:: paddle.fluid.layers.maxout
- :noindex:
-
-.. _api_fluid_layers_sigmoid:
-
-sigmoid
--------
-
-.. autofunction:: paddle.fluid.layers.sigmoid
- :noindex:
-
-.. _api_fluid_layers_logsigmoid:
-
-logsigmoid
-----------
-
-.. autofunction:: paddle.fluid.layers.logsigmoid
- :noindex:
-
-.. _api_fluid_layers_exp:
-
-exp
----
-
-.. autofunction:: paddle.fluid.layers.exp
- :noindex:
-
-.. _api_fluid_layers_tanh:
-
-tanh
-----
-
-.. autofunction:: paddle.fluid.layers.tanh
- :noindex:
-
-.. _api_fluid_layers_tanh_shrink:
-
-tanh_shrink
------------
-
-.. autofunction:: paddle.fluid.layers.tanh_shrink
- :noindex:
-
-.. _api_fluid_layers_softshrink:
-
-softshrink
-----------
-
-.. autofunction:: paddle.fluid.layers.softshrink
- :noindex:
-
-.. _api_fluid_layers_sqrt:
-
-sqrt
-----
-
-.. autofunction:: paddle.fluid.layers.sqrt
- :noindex:
-
-.. _api_fluid_layers_abs:
-
-abs
----
-
-.. autofunction:: paddle.fluid.layers.abs
- :noindex:
-
-.. _api_fluid_layers_ceil:
-
-ceil
-----
-
-.. autofunction:: paddle.fluid.layers.ceil
- :noindex:
-
-.. _api_fluid_layers_floor:
-
-floor
------
-
-.. autofunction:: paddle.fluid.layers.floor
- :noindex:
-
-.. _api_fluid_layers_cos:
-
-cos
----
-
-.. autofunction:: paddle.fluid.layers.cos
- :noindex:
-
-.. _api_fluid_layers_sin:
-
-sin
----
-
-.. autofunction:: paddle.fluid.layers.sin
- :noindex:
-
-.. _api_fluid_layers_round:
-
-round
------
-
-.. autofunction:: paddle.fluid.layers.round
- :noindex:
-
-.. _api_fluid_layers_reciprocal:
-
-reciprocal
-----------
-
-.. autofunction:: paddle.fluid.layers.reciprocal
- :noindex:
-
-.. _api_fluid_layers_square:
-
-square
-------
-
-.. autofunction:: paddle.fluid.layers.square
- :noindex:
-
-.. _api_fluid_layers_softplus:
-
-softplus
---------
-
-.. autofunction:: paddle.fluid.layers.softplus
- :noindex:
-
-.. _api_fluid_layers_softsign:
-
-softsign
---------
-
-.. autofunction:: paddle.fluid.layers.softsign
- :noindex:
-
-.. _api_fluid_layers_brelu:
-
-brelu
------
-
-.. autofunction:: paddle.fluid.layers.brelu
- :noindex:
-
-.. _api_fluid_layers_leaky_relu:
-
-leaky_relu
-----------
-
-.. autofunction:: paddle.fluid.layers.leaky_relu
- :noindex:
-
-.. _api_fluid_layers_soft_relu:
-
-soft_relu
----------
-
-.. autofunction:: paddle.fluid.layers.soft_relu
- :noindex:
-
-.. _api_fluid_layers_elu:
-
-elu
----
-
-.. autofunction:: paddle.fluid.layers.elu
- :noindex:
-
-.. _api_fluid_layers_relu6:
-
-relu6
------
-
-.. autofunction:: paddle.fluid.layers.relu6
- :noindex:
-
-.. _api_fluid_layers_pow:
-
-pow
----
-
-.. autofunction:: paddle.fluid.layers.pow
- :noindex:
-
-.. _api_fluid_layers_stanh:
-
-stanh
------
-
-.. autofunction:: paddle.fluid.layers.stanh
- :noindex:
-
-.. _api_fluid_layers_hard_sigmoid:
-
-hard_sigmoid
-------------
-
-.. autofunction:: paddle.fluid.layers.hard_sigmoid
- :noindex:
-
-.. _api_fluid_layers_swish:
-
-swish
------
-
-.. autofunction:: paddle.fluid.layers.swish
- :noindex:
-
-.. _api_fluid_layers_uniform_random:
-
-uniform_random
---------------
-
-.. autofunction:: paddle.fluid.layers.uniform_random
- :noindex:
-
-.. _api_fluid_layers_hard_shrink:
-
-hard_shrink
------------
-
-.. autofunction:: paddle.fluid.layers.hard_shrink
- :noindex:
-
-.. _api_fluid_layers_cumsum:
-
-cumsum
-------
-
-.. autofunction:: paddle.fluid.layers.cumsum
- :noindex:
-
-.. _api_fluid_layers_thresholded_relu:
-
-thresholded_relu
-----------------
-
-.. autofunction:: paddle.fluid.layers.thresholded_relu
- :noindex:
-
-tensor
-======
-
-.. _api_fluid_layers_create_tensor:
-
-create_tensor
--------------
-
-.. autofunction:: paddle.fluid.layers.create_tensor
- :noindex:
-
-.. _api_fluid_layers_create_parameter:
-
-create_parameter
-----------------
-
-.. autofunction:: paddle.fluid.layers.create_parameter
- :noindex:
-
-.. _api_fluid_layers_create_global_var:
-
-create_global_var
------------------
-
-.. autofunction:: paddle.fluid.layers.create_global_var
- :noindex:
-
-.. _api_fluid_layers_cast:
-
-cast
-----
-
-.. autofunction:: paddle.fluid.layers.cast
- :noindex:
-
-.. _api_fluid_layers_concat:
-
-concat
-------
-
-.. autofunction:: paddle.fluid.layers.concat
- :noindex:
-
-.. _api_fluid_layers_sums:
-
-sums
-----
-
-.. autofunction:: paddle.fluid.layers.sums
- :noindex:
-
-.. _api_fluid_layers_assign:
-
-assign
-------
-
-.. autofunction:: paddle.fluid.layers.assign
- :noindex:
-
-.. _api_fluid_layers_fill_constant_batch_size_like:
-
-fill_constant_batch_size_like
------------------------------
-
-.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
- :noindex:
-
-.. _api_fluid_layers_fill_constant:
-
-fill_constant
--------------
-
-.. autofunction:: paddle.fluid.layers.fill_constant
- :noindex:
-
-.. _api_fluid_layers_argmin:
-
-argmin
-------
-
-.. autofunction:: paddle.fluid.layers.argmin
- :noindex:
-
-.. _api_fluid_layers_argmax:
-
-argmax
-------
-
-.. autofunction:: paddle.fluid.layers.argmax
- :noindex:
-
-.. _api_fluid_layers_argsort:
-
-argsort
--------
-
-.. autofunction:: paddle.fluid.layers.argsort
- :noindex:
-
-.. _api_fluid_layers_ones:
-
-ones
-----
-
-.. autofunction:: paddle.fluid.layers.ones
- :noindex:
-
-.. _api_fluid_layers_zeros:
-
-zeros
------
-
-.. autofunction:: paddle.fluid.layers.zeros
- :noindex:
-
-.. _api_fluid_layers_reverse:
-
-reverse
--------
-
-.. autofunction:: paddle.fluid.layers.reverse
- :noindex:
-
-learning_rate_scheduler
-=======================
-
-.. _api_fluid_layers_exponential_decay:
-
-exponential_decay
------------------
-
-.. autofunction:: paddle.fluid.layers.exponential_decay
- :noindex:
-
-.. _api_fluid_layers_natural_exp_decay:
-
-natural_exp_decay
------------------
-
-.. autofunction:: paddle.fluid.layers.natural_exp_decay
- :noindex:
-
-.. _api_fluid_layers_inverse_time_decay:
-
-inverse_time_decay
-------------------
-
-.. autofunction:: paddle.fluid.layers.inverse_time_decay
- :noindex:
-
-.. _api_fluid_layers_polynomial_decay:
-
-polynomial_decay
-----------------
-
-.. autofunction:: paddle.fluid.layers.polynomial_decay
- :noindex:
-
-.. _api_fluid_layers_piecewise_decay:
-
-piecewise_decay
----------------
-
-.. autofunction:: paddle.fluid.layers.piecewise_decay
- :noindex:
-
-.. _api_fluid_layers_noam_decay:
-
-noam_decay
-----------
-
-.. autofunction:: paddle.fluid.layers.noam_decay
- :noindex:
-
-.. _api_fluid_layers_append_LARS:
-
-append_LARS
------------
-
-.. autofunction:: paddle.fluid.layers.append_LARS
- :noindex:
-
-detection
-=========
-
-.. _api_fluid_layers_prior_box:
-
-prior_box
----------
-
-.. autofunction:: paddle.fluid.layers.prior_box
- :noindex:
-
-.. _api_fluid_layers_multi_box_head:
-
-multi_box_head
---------------
-
-.. autofunction:: paddle.fluid.layers.multi_box_head
- :noindex:
-
-.. _api_fluid_layers_bipartite_match:
-
-bipartite_match
----------------
-
-.. autofunction:: paddle.fluid.layers.bipartite_match
- :noindex:
-
-.. _api_fluid_layers_target_assign:
-
-target_assign
--------------
-
-.. autofunction:: paddle.fluid.layers.target_assign
- :noindex:
-
-.. _api_fluid_layers_detection_output:
-
-detection_output
-----------------
-
-.. autofunction:: paddle.fluid.layers.detection_output
- :noindex:
-
-.. _api_fluid_layers_ssd_loss:
-
-ssd_loss
---------
-
-.. autofunction:: paddle.fluid.layers.ssd_loss
- :noindex:
-
-.. _api_fluid_layers_detection_map:
-
-detection_map
--------------
-
-.. autofunction:: paddle.fluid.layers.detection_map
- :noindex:
-
-.. _api_fluid_layers_rpn_target_assign:
-
-rpn_target_assign
------------------
-
-.. autofunction:: paddle.fluid.layers.rpn_target_assign
- :noindex:
-
-.. _api_fluid_layers_anchor_generator:
-
-anchor_generator
-----------------
-
-.. autofunction:: paddle.fluid.layers.anchor_generator
- :noindex:
-
-.. _api_fluid_layers_iou_similarity:
-
-iou_similarity
---------------
-
-.. autofunction:: paddle.fluid.layers.iou_similarity
- :noindex:
-
-.. _api_fluid_layers_box_coder:
-
-box_coder
----------
-
-.. autofunction:: paddle.fluid.layers.box_coder
- :noindex:
-
-.. _api_fluid_layers_polygon_box_transform:
-
-polygon_box_transform
----------------------
-
-.. autofunction:: paddle.fluid.layers.polygon_box_transform
- :noindex:
-
-metric_op
-=========
-
-.. _api_fluid_layers_accuracy:
-
-accuracy
---------
-
-.. autofunction:: paddle.fluid.layers.accuracy
- :noindex:
-
-.. _api_fluid_layers_auc:
-
-auc
----
-
-.. autofunction:: paddle.fluid.layers.auc
- :noindex:
-
-tensor
-======
-
-.. _api_fluid_layers_create_tensor:
-
-create_tensor
--------------
-
-.. autofunction:: paddle.fluid.layers.create_tensor
- :noindex:
-
-.. _api_fluid_layers_create_parameter:
-
-create_parameter
-----------------
-
-.. autofunction:: paddle.fluid.layers.create_parameter
- :noindex:
-
-.. _api_fluid_layers_create_global_var:
-
-create_global_var
------------------
-
-.. autofunction:: paddle.fluid.layers.create_global_var
- :noindex:
-
-.. _api_fluid_layers_cast:
-
-cast
-----
-
-.. autofunction:: paddle.fluid.layers.cast
- :noindex:
-
-.. _api_fluid_layers_concat:
-
-concat
-------
-
-.. autofunction:: paddle.fluid.layers.concat
- :noindex:
-
-.. _api_fluid_layers_sums:
-
-sums
-----
-
-.. autofunction:: paddle.fluid.layers.sums
- :noindex:
-
-.. _api_fluid_layers_assign:
-
-assign
-------
-
-.. autofunction:: paddle.fluid.layers.assign
- :noindex:
-
-.. _api_fluid_layers_fill_constant_batch_size_like:
-
-fill_constant_batch_size_like
------------------------------
-
-.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
- :noindex:
-
-.. _api_fluid_layers_fill_constant:
-
-fill_constant
--------------
-
-.. autofunction:: paddle.fluid.layers.fill_constant
- :noindex:
-
-.. _api_fluid_layers_argmin:
-
-argmin
-------
-
-.. autofunction:: paddle.fluid.layers.argmin
- :noindex:
-
-.. _api_fluid_layers_argmax:
-
-argmax
-------
-
-.. autofunction:: paddle.fluid.layers.argmax
- :noindex:
-
-.. _api_fluid_layers_argsort:
-
-argsort
--------
-
-.. autofunction:: paddle.fluid.layers.argsort
- :noindex:
-
-.. _api_fluid_layers_ones:
-
-ones
-----
-
-.. autofunction:: paddle.fluid.layers.ones
- :noindex:
-
-.. _api_fluid_layers_zeros:
-
-zeros
------
-
-.. autofunction:: paddle.fluid.layers.zeros
- :noindex:
-
-.. _api_fluid_layers_reverse:
-
-reverse
--------
-
-.. autofunction:: paddle.fluid.layers.reverse
- :noindex:
-
diff --git a/doc/fluid/api/metrics.rst b/doc/fluid/api/metrics.rst
deleted file mode 100644
index 0f54b2e2eb7ead353215c5dbd529293794e37123..0000000000000000000000000000000000000000
--- a/doc/fluid/api/metrics.rst
+++ /dev/null
@@ -1,88 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-=============
-fluid.metrics
-=============
-
-.. _api_fluid_metrics_MetricBase:
-
-MetricBase
-----------
-
-.. autoclass:: paddle.fluid.metrics.MetricBase
- :members:
- :noindex:
-
-.. _api_fluid_metrics_CompositeMetric:
-
-CompositeMetric
----------------
-
-.. autoclass:: paddle.fluid.metrics.CompositeMetric
- :members:
- :noindex:
-
-.. _api_fluid_metrics_Precision:
-
-Precision
----------
-
-.. autoclass:: paddle.fluid.metrics.Precision
- :members:
- :noindex:
-
-.. _api_fluid_metrics_Recall:
-
-Recall
-------
-
-.. autoclass:: paddle.fluid.metrics.Recall
- :members:
- :noindex:
-
-.. _api_fluid_metrics_Accuracy:
-
-Accuracy
---------
-
-.. autoclass:: paddle.fluid.metrics.Accuracy
- :members:
- :noindex:
-
-.. _api_fluid_metrics_ChunkEvaluator:
-
-ChunkEvaluator
---------------
-
-.. autoclass:: paddle.fluid.metrics.ChunkEvaluator
- :members:
- :noindex:
-
-.. _api_fluid_metrics_EditDistance:
-
-EditDistance
-------------
-
-.. autoclass:: paddle.fluid.metrics.EditDistance
- :members:
- :noindex:
-
-.. _api_fluid_metrics_DetectionMAP:
-
-DetectionMAP
-------------
-
-.. autoclass:: paddle.fluid.metrics.DetectionMAP
- :members:
- :noindex:
-
-.. _api_fluid_metrics_Auc:
-
-Auc
----
-
-.. autoclass:: paddle.fluid.metrics.Auc
- :members:
- :noindex:
-
diff --git a/doc/fluid/api/nets.rst b/doc/fluid/api/nets.rst
deleted file mode 100644
index 059733af18517257b6821d95fd628a9e13e6e98e..0000000000000000000000000000000000000000
--- a/doc/fluid/api/nets.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-==========
-fluid.nets
-==========
-
-.. _api_fluid_nets_simple_img_conv_pool:
-
-simple_img_conv_pool
---------------------
-
-.. autofunction:: paddle.fluid.nets.simple_img_conv_pool
- :noindex:
-
-.. _api_fluid_nets_sequence_conv_pool:
-
-sequence_conv_pool
-------------------
-
-.. autofunction:: paddle.fluid.nets.sequence_conv_pool
- :noindex:
-
-.. _api_fluid_nets_glu:
-
-glu
----
-
-.. autofunction:: paddle.fluid.nets.glu
- :noindex:
-
-.. _api_fluid_nets_scaled_dot_product_attention:
-
-scaled_dot_product_attention
-----------------------------
-
-.. autofunction:: paddle.fluid.nets.scaled_dot_product_attention
- :noindex:
-
diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst
deleted file mode 100644
index dfd587afd704ecbd17fb14d1fef0752d9313048b..0000000000000000000000000000000000000000
--- a/doc/fluid/api/optimizer.rst
+++ /dev/null
@@ -1,169 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-===============
-fluid.optimizer
-===============
-
-.. _api_fluid_optimizer_SGD:
-
-SGD
----
-
-.. autoclass:: paddle.fluid.optimizer.SGD
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_Momentum:
-
-Momentum
---------
-
-.. autoclass:: paddle.fluid.optimizer.Momentum
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_Adagrad:
-
-Adagrad
--------
-
-.. autoclass:: paddle.fluid.optimizer.Adagrad
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_Adam:
-
-Adam
-----
-
-.. autoclass:: paddle.fluid.optimizer.Adam
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_Adamax:
-
-Adamax
-------
-
-.. autoclass:: paddle.fluid.optimizer.Adamax
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_DecayedAdagrad:
-
-DecayedAdagrad
---------------
-
-.. autoclass:: paddle.fluid.optimizer.DecayedAdagrad
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_Ftrl:
-
-Ftrl
-----
-
-.. autoclass:: paddle.fluid.optimizer.Ftrl
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_SGDOptimizer:
-
-SGDOptimizer
-------------
-
-.. autoclass:: paddle.fluid.optimizer.SGDOptimizer
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_MomentumOptimizer:
-
-MomentumOptimizer
------------------
-
-.. autoclass:: paddle.fluid.optimizer.MomentumOptimizer
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_AdagradOptimizer:
-
-AdagradOptimizer
-----------------
-
-.. autoclass:: paddle.fluid.optimizer.AdagradOptimizer
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_AdamOptimizer:
-
-AdamOptimizer
--------------
-
-.. autoclass:: paddle.fluid.optimizer.AdamOptimizer
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_AdamaxOptimizer:
-
-AdamaxOptimizer
----------------
-
-.. autoclass:: paddle.fluid.optimizer.AdamaxOptimizer
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_DecayedAdagradOptimizer:
-
-DecayedAdagradOptimizer
------------------------
-
-.. autoclass:: paddle.fluid.optimizer.DecayedAdagradOptimizer
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_RMSPropOptimizer:
-
-RMSPropOptimizer
-----------------
-
-.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_FtrlOptimizer:
-
-FtrlOptimizer
--------------
-
-.. autoclass:: paddle.fluid.optimizer.FtrlOptimizer
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_Adadelta:
-
-Adadelta
---------
-
-.. autoclass:: paddle.fluid.optimizer.Adadelta
- :members:
- :noindex:
-
-.. _api_fluid_optimizer_ModelAverage:
-
-ModelAverage
-------------
-
-.. autoclass:: paddle.fluid.optimizer.ModelAverage
- :members:
- :noindex:
-
diff --git a/doc/fluid/api/param_attr.rst b/doc/fluid/api/param_attr.rst
deleted file mode 100644
index 33035bbc7ca5c8d000adeaf1cb79806a3ea64604..0000000000000000000000000000000000000000
--- a/doc/fluid/api/param_attr.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-================
-fluid.param_attr
-================
-
-.. _api_fluid_param_attr_ParamAttr:
-
-ParamAttr
----------
-
-.. autoclass:: paddle.fluid.param_attr.ParamAttr
- :members:
- :noindex:
-
-.. _api_fluid_param_attr_WeightNormParamAttr:
-
-WeightNormParamAttr
--------------------
-
-.. autoclass:: paddle.fluid.param_attr.WeightNormParamAttr
- :members:
- :noindex:
-
diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst
deleted file mode 100644
index c750a2d588df56728ac7f73051ab7a9e44dee232..0000000000000000000000000000000000000000
--- a/doc/fluid/api/profiler.rst
+++ /dev/null
@@ -1,47 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-==============
-fluid.profiler
-==============
-
-.. _api_fluid_profiler_cuda_profiler:
-
-cuda_profiler
--------------
-
-.. autofunction:: paddle.fluid.profiler.cuda_profiler
- :noindex:
-
-.. _api_fluid_profiler_reset_profiler:
-
-reset_profiler
---------------
-
-.. autofunction:: paddle.fluid.profiler.reset_profiler
- :noindex:
-
-.. _api_fluid_profiler_profiler:
-
-profiler
---------
-
-.. autofunction:: paddle.fluid.profiler.profiler
- :noindex:
-
-.. _api_fluid_profiler_start_profiler:
-
-start_profiler
---------------
-
-.. autofunction:: paddle.fluid.profiler.start_profiler
- :noindex:
-
-.. _api_fluid_profiler_stop_profiler:
-
-stop_profiler
--------------
-
-.. autofunction:: paddle.fluid.profiler.stop_profiler
- :noindex:
-
diff --git a/doc/fluid/api/recordio_writer.rst b/doc/fluid/api/recordio_writer.rst
deleted file mode 100644
index f0c12fd115478a29fbd178b533b7490b2f663717..0000000000000000000000000000000000000000
--- a/doc/fluid/api/recordio_writer.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-=====================
-fluid.recordio_writer
-=====================
-
-.. _api_fluid_recordio_writer_convert_reader_to_recordio_file:
-
-convert_reader_to_recordio_file
--------------------------------
-
-.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
- :noindex:
-
-.. _api_fluid_recordio_writer_convert_reader_to_recordio_files:
-
-convert_reader_to_recordio_files
---------------------------------
-
-.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
- :noindex:
-
diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst
deleted file mode 100644
index 5b3004a783930cc1ccac1c4db30603eee6e52769..0000000000000000000000000000000000000000
--- a/doc/fluid/api/regularizer.rst
+++ /dev/null
@@ -1,43 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-=================
-fluid.regularizer
-=================
-
-.. _api_fluid_regularizer_L1Decay:
-
-L1Decay
--------
-
-.. autoclass:: paddle.fluid.regularizer.L1Decay
- :members:
- :noindex:
-
-.. _api_fluid_regularizer_L2Decay:
-
-L2Decay
--------
-
-.. autoclass:: paddle.fluid.regularizer.L2Decay
- :members:
- :noindex:
-
-.. _api_fluid_regularizer_L1DecayRegularizer:
-
-L1DecayRegularizer
-------------------
-
-.. autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
- :members:
- :noindex:
-
-.. _api_fluid_regularizer_L2DecayRegularizer:
-
-L2DecayRegularizer
-------------------
-
-.. autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
- :members:
- :noindex:
-
diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst
deleted file mode 100644
index b6e169ff5d1bed9338745874fbc570e5be5f316b..0000000000000000000000000000000000000000
--- a/doc/fluid/api/transpiler.rst
+++ /dev/null
@@ -1,68 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-================
-fluid.transpiler
-================
-
-.. _api_fluid_transpiler_DistributeTranspiler:
-
-DistributeTranspiler
---------------------
-
-.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler
- :members:
- :noindex:
-
-.. _api_fluid_transpiler_InferenceTranspiler:
-
-InferenceTranspiler
--------------------
-
-.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler
- :members:
- :noindex:
-
-.. _api_fluid_transpiler_memory_optimize:
-
-memory_optimize
----------------
-
-.. autofunction:: paddle.fluid.transpiler.memory_optimize
- :noindex:
-
-.. _api_fluid_transpiler_release_memory:
-
-release_memory
---------------
-
-.. autofunction:: paddle.fluid.transpiler.release_memory
- :noindex:
-
-.. _api_fluid_transpiler_HashName:
-
-HashName
---------
-
-.. autoclass:: paddle.fluid.transpiler.HashName
- :members:
- :noindex:
-
-.. _api_fluid_transpiler_RoundRobin:
-
-RoundRobin
-----------
-
-.. autoclass:: paddle.fluid.transpiler.RoundRobin
- :members:
- :noindex:
-
-.. _api_fluid_transpiler_DistributeTranspilerConfig:
-
-DistributeTranspilerConfig
---------------------------
-
-.. autoclass:: paddle.fluid.transpiler.DistributeTranspilerConfig
- :members:
- :noindex:
-
diff --git a/doc/fluid/build_and_install/build_from_source_cn.rst b/doc/fluid/build_and_install/build_from_source_cn.rst
deleted file mode 120000
index ae4e8c7c48e584ec16a7be5466f83dd154ffb5fb..0000000000000000000000000000000000000000
--- a/doc/fluid/build_and_install/build_from_source_cn.rst
+++ /dev/null
@@ -1 +0,0 @@
-../../v2/build_and_install/build_from_source_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/build_from_source_en.rst b/doc/fluid/build_and_install/build_from_source_en.rst
deleted file mode 120000
index 1ac828c973826bb8374c4aa8e17fda3ea1bb939f..0000000000000000000000000000000000000000
--- a/doc/fluid/build_and_install/build_from_source_en.rst
+++ /dev/null
@@ -1 +0,0 @@
-../../v2/build_and_install/build_from_source_en.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/docker_install_cn.rst b/doc/fluid/build_and_install/docker_install_cn.rst
deleted file mode 120000
index 965b2e20559291989422938c418fadbac16941b9..0000000000000000000000000000000000000000
--- a/doc/fluid/build_and_install/docker_install_cn.rst
+++ /dev/null
@@ -1 +0,0 @@
-../../v2/build_and_install/docker_install_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/docker_install_en.rst b/doc/fluid/build_and_install/docker_install_en.rst
deleted file mode 120000
index 79d7341a7bbb9e477c773134f24983fd7607769a..0000000000000000000000000000000000000000
--- a/doc/fluid/build_and_install/docker_install_en.rst
+++ /dev/null
@@ -1 +0,0 @@
-../../v2/build_and_install/docker_install_en.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst
deleted file mode 120000
index f697fcd8fac9131862ae7f8f51c5ebe93737ad2d..0000000000000000000000000000000000000000
--- a/doc/fluid/build_and_install/index_cn.rst
+++ /dev/null
@@ -1 +0,0 @@
-../../v2/build_and_install/index_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst
deleted file mode 120000
index 502f66a41319d4f41ae1774628ca36da9dca76ce..0000000000000000000000000000000000000000
--- a/doc/fluid/build_and_install/index_en.rst
+++ /dev/null
@@ -1 +0,0 @@
-../../v2/build_and_install/index_en.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/paddleci.png b/doc/fluid/build_and_install/paddleci.png
deleted file mode 120000
index c3eb1457acc77cab9360e654240d1e8f548035b4..0000000000000000000000000000000000000000
--- a/doc/fluid/build_and_install/paddleci.png
+++ /dev/null
@@ -1 +0,0 @@
-../../v2/build_and_install/paddleci.png
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/pip_install_cn.rst b/doc/fluid/build_and_install/pip_install_cn.rst
deleted file mode 120000
index 07deca84b82ff553e0c19324695089dcfb6be90e..0000000000000000000000000000000000000000
--- a/doc/fluid/build_and_install/pip_install_cn.rst
+++ /dev/null
@@ -1 +0,0 @@
-../../v2/build_and_install/pip_install_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/pip_install_en.rst b/doc/fluid/build_and_install/pip_install_en.rst
deleted file mode 120000
index 7f39c998195b719b05443e96f1c4a6a8d44b98c9..0000000000000000000000000000000000000000
--- a/doc/fluid/build_and_install/pip_install_en.rst
+++ /dev/null
@@ -1 +0,0 @@
-../../v2/build_and_install/pip_install_en.rst
\ No newline at end of file
diff --git a/doc/fluid/design/algorithm/images/asgd.gif b/doc/fluid/design/algorithm/images/asgd.gif
deleted file mode 100644
index 4a0da7bf6df9326a2aab1638b77c5455c18b8c4e..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/algorithm/images/asgd.gif and /dev/null differ
diff --git a/doc/fluid/design/algorithm/images/theta_star.gif b/doc/fluid/design/algorithm/images/theta_star.gif
deleted file mode 100644
index dd24d33e124396be3fc410c9b12f33148f64efe2..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/algorithm/images/theta_star.gif and /dev/null differ
diff --git a/doc/fluid/design/algorithm/index_cn.rst b/doc/fluid/design/algorithm/index_cn.rst
deleted file mode 100644
index 0883a9dc9c457f393ac1bdc930cb47ebcb0a25d9..0000000000000000000000000000000000000000
--- a/doc/fluid/design/algorithm/index_cn.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-梯度更新算法
-------------
-
-.. toctree::
- :maxdepth: 1
-
- parameter_average.md
diff --git a/doc/fluid/design/algorithm/index_en.rst b/doc/fluid/design/algorithm/index_en.rst
deleted file mode 100644
index 59fe68dcf79ce2ef90b9adc829a0db45a4f0b3dc..0000000000000000000000000000000000000000
--- a/doc/fluid/design/algorithm/index_en.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Gradient Update Algorithm
---------------------------------------
-
-.. toctree::
- :maxdepth: 1
-
- parameter_average.md
diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md
deleted file mode 100644
index 28ad6495d97515442eb8af2050158829814acd33..0000000000000000000000000000000000000000
--- a/doc/fluid/design/algorithm/parameter_average.md
+++ /dev/null
@@ -1,74 +0,0 @@
-# Averaging Parameter in PaddlePaddle
-
-## Why Averaging
-In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible.
-
-Polyak and Juditsky (1992) showed that the test performance of a simple average of the parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of the parameter values obtained by training the model over and over again on the training dataset.
-
-Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of the parameters obtained by SGD is used as the estimator for the optimal parameter θ*. The averaging is simply the running mean of the iterates:
-
-θ̄_T = (1/T) · Σ_{t=1}^{T} θ_t
-
-We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
-
-### How to perform Parameter Averaging in PaddlePaddle
-
-Parameter Averaging in PaddlePaddle works in the following way during training:
-1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer
-2. The optimizer itself is responsible for updating the parameters.
-3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
- 1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches.
- 2. However, saving all N instances of the parameters in memory is not feasible.
- 3. Therefore, an approximation algorithm is used.
-
-Hence, overall we have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved.
-
-During the testing/saving phase, we perform the following steps (a sketch follows the list):
-1. Perform the delayed operations.
-2. Save current values of the parameters to a temporary variable.
-3. Replace the values of the parameters with the averaged values.
-4. Perform testing and/or save the parameters.
-5. Restore the values of the parameters once done.
-
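-A minimal Python sketch of the swap-in/swap-out procedure above (step 1, the delayed operations, is omitted); `params` and `averaged` here are hypothetical dictionaries of numpy arrays standing in for the two parameter copies, not actual framework objects:
-
-```python
-def with_averaged_params(params, averaged, eval_fn):
-    """Run eval_fn with the averaged parameters, then restore the originals."""
-    backup = {name: value.copy() for name, value in params.items()}           # step 2
-    params.update({name: value.copy() for name, value in averaged.items()})   # step 3
-    try:
-        return eval_fn(params)                                                # step 4
-    finally:
-        params.update(backup)                                                 # step 5
-```
-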
-### How to implement Averaging of Parameter in PaddlePaddle
-
-We can add the ParameterAverageOptimizer op to the graph through the Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training.
-
- **Advantages**:
- - Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op.
- - Makes it easy for the users to customize and extend the framework.
-
- **Disadvantages**:
- - Implementation requires re-writing the averaging methodology in Python.
-
-### Low-Level implementation
-
-In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input:
-- the optimizer
-- the window_size to keep the updates
-
-The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
-
-The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
-
-### Python API implementation for ParameterAverageOptimizer
-
-Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following:
-- Any optimizer (RMSProp, AdaGrad, etc.)
-- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. The averaged value is moved to a buffer when the window is full to avoid loss of precision. A sketch follows this list.
-
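-A rough sketch of the window-based accumulation described above, using plain numpy; the `WindowedAverage` name is illustrative, not part of the framework. The mean of each completed window is folded into a buffer so that the long-running sum does not lose precision (the sketch assumes at least one `update` before `average`):
-
-```python
-import numpy as np
-
-class WindowedAverage(object):
-    def __init__(self, window_size):
-        self.window_size = window_size
-        self.window = []   # parameter values of the current, unfinished window
-        self.buffer = []   # (mean, count) pairs of completed windows
-
-    def update(self, value):
-        self.window.append(np.asarray(value, dtype=np.float64))
-        if len(self.window) == self.window_size:
-            self.buffer.append((sum(self.window) / self.window_size, self.window_size))
-            self.window = []
-
-    def average(self):
-        # Weighted mean over completed windows plus the partial window.
-        total = sum(mean * count for mean, count in self.buffer) + sum(self.window)
-        count = sum(count for _, count in self.buffer) + len(self.window)
-        return total / count
-```
-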
-Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
-We will have a wrapper written in Python that supports the functionality and implements the actual core computation in the C++ core, as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.cc).
-
-#### Creation of the ParameterAverageOptimizer operator
-There are two ways for creating the ParameterAverageOptimizer op:
-1. We create the op immediately while building the computation graph.
-2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
-
-The proposal is to add the op immediately while building the computation graph.
-
-#### High-level API
-
-In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md
deleted file mode 100644
index 8ded0ad22f4013a521bf3bee260565dc5cf855ae..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/README.md
+++ /dev/null
@@ -1,174 +0,0 @@
-A few months ago when we were trying to replace CMake with Bazel, @emailweixu suggested that we rewrite those handy Bazel functions using CMake. Now it seems that it's the right time to get this done, as we are facing problems from the porting of Majel and the development of the new parameter server using Go and C++.
-
-Here are some initial thoughts. Your comments are welcome!
-
-# Required CMake Function
-
-I think we need only the following few CMake functions to make a project description mean and clean:
-
-| C++        | CUDA C++   | Go         |
-|------------|------------|------------|
-| cc_library | nv_library | go_library |
-| cc_binary  | nv_binary  | go_binary  |
-| cc_test    | nv_test    | go_test    |
-
-- The `_library` functions generate .a files from source code.
-- The `_binary` functions generate executable binary files.
-- The `_test` functions generate executable unit test files. They work like `_binary` but links `-lgtest` and `-lgtest_main`.
-
-The difference between `nv_` functions and `cc_` functions is that the former use `nvcc` instead of the system-default C++ compiler.
-
-Both `nv_` and `cc_` functions enable C++11 (-std=c++11).
-
-Also,
-
-- to describe external dependencies, we need `external_library`.
-- to build shared libraries, we need `shared_library`.
-
-## An Example Project
-
-Suppose that we have the aforementioned functions defined in our `/cmake` directory. The following example `CMakeLists.txt` describes a project including the following source files:
-
-- tensor.h
-- tensor.cc
-- tensor_test.cc
-- ops.h
-- ops.cu
-- ops_test.cu
-- api.go
-- api_test.go
-
-Suppose that ops.cu depends on CUDNN.
-
-```cmake
-# cc_binary parses tensor.cc and figures out that the target also depends
-# on tensor.h.
-cc_binary(tensor
- SRCS
- tensor.cc)
-
-# The dependency to target tensor implies that if any of
-# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built.
-cc_test(tensor_test
- SRCS
- tensor_test.cc
- DEPS
- tensor)
-
-# I don't have a clear idea what parameters external_library need to
-# have. @gangliao as a CMake expert would have better ideas.
-external_library(cudnn
- ....)
-
-# Suppose that ops.cu depends on external target CUDNN. Also, ops.cu
-# include global functions that take Tensor as their parameters, so
-# ops depend on tensor. This implies that if any of tensor.{h.cc},
-# ops.{h,cu} is changed, ops need to be re-built.
-nv_library(ops
- SRCS
- ops.cu
- DEPS
- tensor
- cudnn) # cudnn is defined later.
-
-nv_test(ops_test
- SRCS
- ops_test.cu
- DEPS
- ops)
-
-# Because api.go defines a GO wrapper to ops and tensor, it depends on
-# both. This implies that if any of tensor.{h,cc}, ops.{h,cu}, or
-# api.go is changed, api need to be re-built.
-go_library(api
- SRCS
- api.go
- DEPS
- tensor # Because ops depend on tensor, this line is optional.
- ops)
-
-go_test(api_test
- SRCS
- api_test.go
- DEPS
- api)
-
-
-# This builds libapi.so. shared_library might use CMake target
-# api_shared so to distinguish it from above target api.
-shared_library(api
- DEPS
- api)
-
-```
-
-## Implementation
-
-As the above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also uses this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
-
-## Using Package Manager For Go
-
-Building Go binaries and libraries requires satisfying their dependencies; generally
-we can run `go get ./...` to download and compile all external dependencies. The
-problems are:
-
-1. `go get` will always get the latest code from the default branch of the
-   remote repo, so changes in dependencies might break the build. This is very
-   different from what we already have in `cmake/external`, which downloads a
-   specific version or commit id of the dependency.
-1. Some locations cannot access external dependencies through the internet, as mentioned
-   in https://github.com/PaddlePaddle/Paddle/issues/2605. Package management
-   tools can package the dependencies as a "vendor" package, which can be mirrored
-   at many cloud file hosting services, so users who want to compile paddle by themselves can
-   download this "vendor" package from a mirror site.
-
-### Choose A Suitable Tool
-
-As mentioned by @wangkuiyi, [this page](https://github.com/golang/go/wiki/PackageManagementTools)
-lists dozens of Go package managers. We choose the tool using the following principles:
-
-- The most "active" projects, with more stars, pull requests, or commits
-- Widely used projects
-
-After comparing all these projects, we shall choose between the most popular
-tools: Godep and Glide.
-
-Here's a brief comparison between Godep and
-Glide: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are
-also many complaints about using `Godep`. A new "official" package
-management tool has also been started at https://github.com/golang/dep to resolve
-such problems, but it's currently at the Alpha stage. So the best choice now is
-clearly Glide.
-
-### Manage Go Packages
-
-- Dependencies: `go/glide.yaml` will store the dependencies and their versions that
-  are directly imported by paddle. `go/glide.lock` will store all dependencies recursively
-  with their commit ids. Builds will "lock" to these packages unless we `glide up`
-  them.
-- Vendor package: the `go/vendor` directory will be generated when running the `cmake` command. `cmake`
-  will download the code corresponding to `go/glide.lock`. If we put a vendor folder
-  under `go/`, cmake will just check the commit ids of the packages under the folder;
-  if the commit ids match, there will be no download at all.
diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md
deleted file mode 100644
index 3757cd055c818be1e63ee8c0f000f4dd299b59f4..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/block.md
+++ /dev/null
@@ -1,375 +0,0 @@
-# Design Doc: Block and Scope
-
-## The Representation of Computation
-
-Both deep learning systems and programming languages help users describe computation procedures. These systems use various representations of computation:
-
-- Caffe, Torch, and Paddle: sequences of layers.
-- TensorFlow, Caffe2, Mxnet: graph of operators.
-- PaddlePaddle: nested blocks, like C++ and Java programs.
-
-## Block in Programming Languages and Deep Learning
-
-In programming languages, a block is a pair of curly braces that includes local variable definitions and a sequence of instructions or operators.
-
-Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
-
-| programming languages | PaddlePaddle         |
-|-----------------------|----------------------|
-| for, while loop       | RNN, WhileOp         |
-| if, if-else, switch   | IfElseOp, SwitchOp   |
-| sequential execution  | a sequence of layers |
-
-A key difference is that a C++ program describes a one-pass computation, whereas a deep learning program describes both the forward and backward passes.
-
-## Stack Frames and the Scope Hierarchy
-
-The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
-
-| programming languages  | PaddlePaddle                     |
-|------------------------|----------------------------------|
-| stack                  | scope hierarchy                  |
-| stack frame            | scope                            |
-| push at entering block | push at entering block           |
-| pop at leaving block   | destroy when minibatch completes |
-
-1. In traditional programs:
-
- - When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables.
- - After the execution leaves the right curly brace, the runtime pops the frame.
- - The maximum number of frames in the stack is the maximum depth of nested blocks.
-
-1. In PaddlePaddle
-
- - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables.
- - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass. So it has a stack forest known as a *scope hierarchy*.
- - The height of the highest tree is the maximum depth of nested blocks.
- - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy.
-
-## Use Blocks in C++ and PaddlePaddle Programs
-
-Let us consolidate the discussion by presenting some examples.
-
-### Blocks with `if-else` and `IfElseOp`
-
-The following C++ program shows how blocks are used with the `if-else` structure:
-
-```c++
-namespace pd = paddle;
-
-int x = 10;
-int y = 1;
-int z = 10;
-bool cond = false;
-int o1, o2;
-if (cond) {
- int z = x + y;
- o1 = z;
- o2 = pd::layer::softmax(z);
-} else {
- int d = pd::layer::fc(z);
- o1 = d;
- o2 = d+1;
-}
-
-```
-
-An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](../execution/if_else_op.md) is as follows:
-
-```python
-import paddle as pd
-
-x = minibatch([10, 20, 30]) # shape=[None, 1]
-y = var(1) # shape=[1], value=1
-z = minibatch([10, 20, 30]) # shape=[None, 1]
-cond = larger_than(x, 15) # [false, true, true]
-
-ie = pd.ifelse()
-with ie.true_block():
- d = pd.layer.add_scalar(x, y)
- ie.output(d, pd.layer.softmax(d))
-with ie.false_block():
- d = pd.layer.fc(z)
- ie.output(d, d+1)
-o1, o2 = ie(cond)
-```
-
-In both examples, the left branch computes `x+y` and `softmax(x+y)`, while the right branch computes `fc(z)` and `fc(z)+1`.
-
-The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.
-
-
-### Blocks with `for` and `RNNOp`
-
-The following RNN model in PaddlePaddle from the [RNN design doc](../dynamic_rnn/rnn.md):
-
-```python
-x = sequence([10, 20, 30]) # shape=[None, 1]
-m = var(0) # shape=[1]
-W = var(0.314, param=true) # shape=[1]
-U = var(0.375, param=true) # shape=[1]
-
-rnn = pd.rnn()
-with rnn.step():
- h = rnn.memory(init = m)
- h_prev = rnn.previous_memory(h)
- a = layer.fc(W, x)
- b = layer.fc(U, h_prev)
- s = pd.add(a, b)
- act = pd.sigmoid(s)
- rnn.update_memory(h, act)
- rnn.output(a, b)
-o1, o2 = rnn()
-```
-has its equivalent C++ program as follows
-
-```c++
-float x[] = {10, 20, 30};
-float m = 0;
-float W = 0.314;
-float U = 0.375;
-
-const int n = sizeof(x) / sizeof(x[0]);
-float mem[n + 1];
-float o1[n + 1];
-float o2[n + 1];
-for (int i = 1; i <= n; ++i) {
-  float in = x[i - 1];
-  if (i == 1) mem[0] = m;
-  float a = W * in;          // fc(W, x)
-  float b = U * mem[i - 1];  // fc(U, h_prev)
-  float s = a + b;
-  float act = sigmoid(s);
-  mem[i] = act;              // update_memory(h, act)
-  o1[i] = a;                 // rnn.output(a, b)
-  o2[i] = b;
-}
-```
-
-## Compilation and Execution
-
-Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference.
-
-The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file.
-
-## The "Binary Executable File Format"
-
-The definition of the protobuf message is as follows:
-
-```protobuf
-message BlockDesc {
- repeated VarDesc vars = 1;
- repeated OpDesc ops = 2;
-}
-```
-
-The step net in the above RNN example would look like
-
-```
-BlockDesc {
- vars = {
- VarDesc {...} // x
- VarDesc {...} // h
- VarDesc {...} // fc_out
- VarDesc {...} // hidden_out
- VarDesc {...} // sum
- VarDesc {...} // act
- }
- ops = {
- OpDesc {...} // matmul
- OpDesc {...} // add_two
- OpDesc {...} // sigmoid
- }
-};
-```
-
-Also, the RNN operator in the above example is serialized into a protobuf message of type `OpDesc` and would look like:
-
-```
-OpDesc {
- inputs = {0} // the index of x in vars of BlockDesc above
- outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
- attrs {
- "states" : {1} // the index of h
- "step_net" :
- }
-};
-```
-
-This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block.
-
-
-## The Compilation of Blocks
-
-During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
-
-VarDesc in a block should have its name scope to avoid local variables affecting parent block's name scope.
-Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that is stored in the parent block. For example:
-
-```python
-a = pd.Variable(shape=[20, 20])
-b = pd.fc(a, params=["fc.w", "fc.b"])
-
-rnn = pd.create_rnn()
-with rnn.stepnet():
- x = a.as_step_input()
- # reuse fc's parameter
- fc_without_b = pd.get_variable("fc.w")
- rnn.output(fc_without_b)
-
-out = rnn()
-```
-The method `pd.get_variable` can help retrieve a Variable by the name. The Variable may be stored in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance.
-
-In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc.
-
-To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers.
-
-`SymbolTable` can do the following:
-
-- store the definitions (some names and attributes) of variables and operators,
-- verify if a variable was declared,
-- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
-
-
-```c++
-// Information in SymbolTable is enough to trace the dependency graph. So maybe
-// the Eval() interface takes a SymbolTable is enough.
-class SymbolTable {
- public:
- SymbolTable(SymbolTable* parent) : parent_(parent) {}
-
- OpDesc* NewOp(const string& name="");
-
- // TODO determine whether name is generated by python or C++.
- // Currently assume that a unique name will be generated by C++ if the
- // argument name is left default.
- VarDesc* Var(const string& name="");
-
- // find a VarDesc by name, if recursive is true, find parent's SymbolTable
- // recursively.
- // this interface is introduced to support InferShape, find protobuf messages
- // of variables and operators, pass pointers into InferShape.
- //
- // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should
- // be proposed and embedded into pybind to enable python operation on C++ pointers.
- VarDesc* FindVar(const string& name, bool recursive=true);
-
- OpDesc* FindOp(const string& name);
-
- BlockDesc Compile() const;
-
- private:
- SymbolTable* parent_;
-
- map<string, OpDesc> ops_;
- map<string, VarDesc> vars_;
-};
-```
-
-After all the description of variables and operators is added into SymbolTable,
-the block has enough information to run.
-
-The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions.
-
-
-```c++
-namespace {
-
-class Block : public OperatorBase {
-public:
- Block(const BlockDesc& desc) : desc_(desc) {}
-
- void InferShape(const framework::Scope& scope) const override {
- if (!symbols_ready_) {
- CreateVariables(scope);
- CreateOperators();
- }
- // should run InferShape first.
- for (auto& op : runtime_table_.ops()) {
- op->InferShape(scope);
- }
- }
-
- void Run(const framework::Scope& scope,
- const platform::Place& place) const override {
- PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
- for (auto& op : runtime_table_.ops()) {
- op->Run(scope, place);
- }
- }
-
- void CreateVariables(const framework::Scope& scope);
- void CreateOperators();
-
- // some other necessary interfaces of NetOp are listed below
- // ...
-
-private:
- BlockDesc desc_;
- bool symbols_ready_{false};
-};
-
-}  // anonymous namespace
-```
-
-## The Execution of Blocks
-
-Block inherits from OperatorBase, which has a Run method.
-Block's Run method will run its operators sequentially.
-
-There is another important interface called `Eval`, which takes some arguments called targets, generates a minimal graph that treats the targets as end points, and creates a new Block. After `Run`, `Eval` will get the latest values of the targets and return them.
-
-The definition of Eval is as follows:
-
-```c++
-// clean a block description by targets using the corresponding dependency graph.
-// return a new BlockDesc with minimal number of operators.
-// NOTE: The return type is not a Block but the block's description so that this can be distributed
-// to a cluster.
-BlockDesc Prune(const BlockDesc& desc, vector<std::string> targets);
-
-void Block::Eval(const vector<std::string>& targets,
- const framework::Scope& scope,
- const platform::DeviceContext& dev_ctx) {
- BlockDesc min_desc = Prune(desc_, targets);
- Block min_block(min_desc);
- min_block.Run(scope, dev_ctx);
-}
-```
diff --git a/doc/fluid/design/concepts/cpp_data_feeding.md b/doc/fluid/design/concepts/cpp_data_feeding.md
deleted file mode 100644
index aabc1ba75a67c5767d409bd6e7e6240dec86b16c..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/cpp_data_feeding.md
+++ /dev/null
@@ -1,204 +0,0 @@
-# C++ Data Feeding
-
-While using Paddle V2 API for training, data feeding completely depends on the Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required.
-
-In this document, we show the fundamental design of a C++ data feeding process, which includes data reading, shuffling and batching.
-
-## Overview
-
-![](images/readers.png)
-
-## Reader
-
-In order to handle the above-mentioned problem, a new concept called 'Reader' is introduced. `Reader` is a family of classes derived from a common base; their instances can be held by a `Variable` and are used to read or process file data.
-
-
-### ReaderBase
-
-`ReaderBase` is the abstract base class for all readers. It defines the interface for all readers.
-
-```cpp
-class ReaderBase {
- public:
- // Reads the next batch of data. (A 'batch' can be only one instance)
- // If the next batch doesn't exist, it throws an exception
- virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
-
- // Checks whether the next instance exists.
- virtual bool HasNext() = 0;
-
- // Reinitializes the reader and read the file from the beginning.
- virtual void ReInit() = 0;
-
- virtual ~ReaderBase();
-};
-```
-
-### FileReader
-
-`FileReader` is derived from the `ReaderBase`. It is still an abstract class and will further be derived by Readers of respective specific format.
-
-```cpp
-class FileReader : public ReaderBase {
- public:
- explicit FileReader(const std::vector<DDim>& dims);
-
- void ReadNext(std::vector<LoDTensor>* out) override;
-
- protected:
- virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
-
- private:
- std::vector<DDim> dims_;
-};
-```
-
-A file reader binds with a single file and reads one data instance at a time. Each type of file reader shall implement its own `ReadNextImpl()`, `HasNext()` and `ReInit()`.
-
-The `ReadNextImpl()` is invoked by `ReadNext()`. Besides invoking `ReadNextImpl()`, `ReadNext()` is also responsible for checking the output, making sure that each shape of `LoDTensor` in `*out` is consistent with the one in `dims_`.
-
-### DecoratedReader
-
-A decorated reader takes another reader (either a file reader or a decorated reader) as its 'underlying reader'. It gets data from its underlying reader, does some processing on it (shuffling, batching or something else), then yields the processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
-
-```cpp
-class DecoratedReader : public ReaderBase {
- public:
- explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
- PADDLE_ENFORCE_NOT_NULL(reader_);
- }
-
- void ReInit() override { reader_->ReInit(); }
-
- bool HasNext() override { return reader_->HasNext(); }
-
- protected:
- ReaderBase* reader_;
-};
-```
-
-Both the `FileReader` and `DecoratedReader` share exactly the same interface as defined in `ReaderBase`. So they can be decorated multiple times: we can **shuffle** a reader's outputs and then **batch** the shuffled outputs. The interface consistency also allows related ops to use readers without knowing their underlying type.
-
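-The same composition idea can be sketched with plain Python generators, independent of the C++ classes above; here a file reader is decorated first by a shuffle reader and then by a batch reader (all names below are illustrative, not the actual Fluid readers):
-
-```python
-import random
-
-def file_reader(path):
-    def reader():
-        with open(path) as f:
-            for line in f:
-                yield line.rstrip("\n")
-    return reader
-
-def shuffle(underlying, buf_size):
-    def reader():
-        buf = []
-        for item in underlying():
-            buf.append(item)
-            if len(buf) >= buf_size:
-                random.shuffle(buf)
-                for x in buf:
-                    yield x
-                buf = []
-        random.shuffle(buf)
-        for x in buf:
-            yield x
-    return reader
-
-def batch(underlying, batch_size):
-    def reader():
-        b = []
-        for item in underlying():
-            b.append(item)
-            if len(b) == batch_size:
-                yield b
-                b = []
-        if b:
-            yield b
-    return reader
-
-# Decorate twice: shuffle the file reader's output, then batch it.
-train_reader = batch(shuffle(file_reader("train.txt"), buf_size=1024), batch_size=32)
-```
-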
-### MultipleReader
-
-Each `FileReader` binds to a single file and is single-threaded. However, sometimes we need to read data from more than one file. In this case, it's not enough to only have `FileReader` and `DecoratedReader`.
-
-So `MultipleReader` is introduced. It is also derived from `ReaderBase`. A `MultipleReader` holds several prefetching `FileReaders`, and these readers run concurrently. Another pivotal part of a `MultipleReader` is a buffer channel. The channel collects the data yielded by all prefetching readers and lets subsequent OPs or decorated readers fetch data without worrying about how the multiple readers are scheduled.
-
-![](images/multiple_reader.png)
-
-This graph shows how a `MultipleReader` works with three prefetching file readers and two GPUs. There is a queue of files which are going to be read. Whenever a prefetching file reader is free (has finished reading one file), it fetches a new file from the queue. Each prefetching file reader runs in a separate prefetch thread and dumps its output to the same channel.
-
-To the subsequent two decorated readers, the `MultipleReader` is **a single reader**. They don't need to concern about how prefetch readers are scheduled. They only need to invoke `MultipleReader::ReadNext()` to get the next data from the buffer channel.
-
-### ReaderHolder
-
-Different readers belong to different class types. This leads to a problem: how can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we cannot get it by the following code:
-
-```cpp
-var->Get("batch_reader");
-```
-
-We would have to write:
-
-```cpp
-var->Get("batch_reader");
-```
-
-This requires that, in order to get a reader from a variable, we must know the reader's exact type every time. This is nearly impossible.
-
-To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which hides the reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
-
-## Related Operators
-
-To create and invoke readers, some new ops are introduced:
-
-### Operators That Create Readers
-
-Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as their output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
-
-However, direct usage of file readers' creation ops is not recommended because a file reader can only read one file via a single thread. Using `OpenFilesOp` is a better choice.
-
-### OpenFilesOp
-
-The `OpenFilesOp` is the creation op of `MultipleReader`. It takes no input but requires a list of file names as one of its attributes. The newly created `MultipleReader` then creates its own prefetching readers according to given file names.
-
-To make sure that created prefetching readers match file formats, we need a name prefix rule to append file format tags to file names, as well as a file reader registry mechanism to map file format tags to their corresponding file readers' constructors.
-
-### HasNextOp
-
-`HasNextOp` is used to check whether the next data batch exists via the reader's `HasNext()` interface.
-
-### ResetOp
-
-`ResetOp` is used to reset a reader via its `ReInit()` interface.
-
-### ReadOp
-
-A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader's `ReadNext()` function and gets a new batch of data (or only one instance of data, if we use a file reader directly). The output data of a reader are in the form of `std::vector<LoDTensor>`, so the `ReadOp` also needs to split the vector and move the LoDTensors to their respective output Variables.
-
-## Program with Readers
-
-A `Program` holds readers as its persistable variables. These variables are created by `CreateReaderOp` or `OpenFilesOp`. These ops shall run only once, so they shall be placed in the `startup_program`. `HasNextOp`, `ResetOp` and `ReadOp` are required by the training loop, so they shall be in the `main_program`.
-
-The ops of a `startup_program` with readers would be like this:
-
-```
-multiple_reader = open_files_op(...)
-batch_reader = create_batch_reader_op(multiple_reader)
-double_buffer_reader = create_double_buffer_op(batch_reader)
-... (other initializers)
-```
-
-The forwarding ops of the corresponding `main_program` would be like this:
-
-```
-not_completed = true
-pass_count = 0
-while_op(not_completed) {
- has_next = has_next_op(double_buffer_reader)
- if_else_op(has_next) {
- batch_data = read_op(double_buffer_reader)
- ... (subsequent training ops)
- } else {
- reset_op(double_buffer_reader)
- increase_op(pass_count)
- not_completed = less_than_op(pass_count, required_pass_num)
- }
-}
-```
-
-A few important considerations for these programs are as follows:
-
-1. `not_completed`, `pass_count` and other variables shown above are all Fluid Variables.
-
-2. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
-
-3. All readers exist in both `startup_program` and `main_program`. And they are persistable.
-
-### Simplify Configuration by MultiPassReader
-
-The Program configuration mentioned above is complicated. Users need to be very familiar with the concepts of Program and Block to avoid making mistakes in their code. To make the usage of C++ readers more friendly to new users, we introduce `MultiPassReader`.
-
-`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several training passes. It takes the number of passes to run as one of its attributes ('pass_num') and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches EOF, the multi-pass reader checks whether it has completed the given number of passes. If not, the underlying reader will be re-initialized and start a new pass automatically. Before the whole training completes, MultiPassReader's `HasNext()` will always return `true`.
-
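-Continuing the generator analogy from the earlier sketch, a multi-pass wrapper only needs to replay its underlying reader `pass_num` times; again, this is illustrative code, not the actual implementation:
-
-```python
-def multi_pass(underlying, pass_num):
-    def reader():
-        for _ in range(pass_num):
-            # Re-initialize the underlying reader and run one full pass.
-            for item in underlying():
-                yield item
-    return reader
-```
-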
-With `MultiPassReader`, the startup program would be like this:
-
-```
-multiple_reader = open_files_op(...)
-batch_reader = create_batch_reader_op(multiple_reader)
-multi_pass_reader = create_multi_pass_reader_op(batch_reader)
-double_buffer_reader = create_double_buffer_op(multi_pass_reader)
-... (other initializers)
-```
-
-The forwarding part of the corresponding `main_program` would be like this:
-
-```
-not_completed = true
-while_op(not_completed) {
- batch_data = read_op(double_buffer_reader)
- ... (subsequent training ops)
- not_completed = has_next_op(double_buffer_reader)
-}
-```
diff --git a/doc/fluid/design/concepts/executor.md b/doc/fluid/design/concepts/executor.md
deleted file mode 100644
index 3fcddf4dd90f826ee1a16713f4371fb010f8eac5..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/executor.md
+++ /dev/null
@@ -1,29 +0,0 @@
-# Executor Design Doc
-
-## Motivation
-In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
-[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
-
-The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and the variables that will be used; the executor explicitly executes the stored precompiled code.
-
-## Overview
-
-An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.
-
-## Executor
-
-The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
-It is very similar to how a stack frame is pushed when entering a block; it then cleans up all the temporary variables when a mini-batch is finished. It does not, however, have the stack-frame pop process.
-
-### The interface
-```c++
- Executor(places);
-```
-An executor does not own any computing resources; a user can only construct an executor using the specified places.
-
-### Running an Executor
-
-```
- void Run(ProgramDesc, Scope, block_id, create_local_scope);
-```
-An `Executor` only provides a unified way to execute a `ProgramDesc`. The `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block, and `create_local_scope` is a boolean that states whether the temporary variables will be destroyed after the execution is finished.
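-
-For reference, the Python side exposes a thin wrapper over this C++ executor; a minimal usage sketch with the classic Fluid API (the program being run and its feed/fetch lists are left empty here for brevity):
-
-```python
-import paddle.fluid as fluid
-
-place = fluid.CPUPlace()      # the "places" the executor is constructed with
-exe = fluid.Executor(place)
-
-# Run the startup program once to create and initialize variables,
-# then run the main program for each mini-batch.
-exe.run(fluid.default_startup_program())
-exe.run(fluid.default_main_program(), feed={}, fetch_list=[])
-```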
diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
deleted file mode 100644
index 1f86b99e5197c3e0b85fd76fe704520ef21b06d3..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/functions_operators_layers.md
+++ /dev/null
@@ -1,128 +0,0 @@
-# Design Doc: Functions, Operators, and Layers
-
-In a DL system, we can compose one or more fine grained operators into a coarse grained one. For example, the FC layer can be composed of a multiplication operator and an add operator.
-
-Historically, some fine grained operations are known as operators, and some coarse level ones are known as layers. But we need a well-defined separation.
-
-In general, operators are those very fine grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions:
-
-```c++
-template <typename T> T add(T x, T y) { return x + y; }
-template <typename T> T mul(T x, T y) { return x * y; }
-```
-
-Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name. A C macro can do this. For example, the following macro invocation
-
-```c++
-#define MAKE_FUNCTION_OPERATOR(mul);
-```
-
-generates
-
-```c++
-template <typename T> class mulOp : public OperatorBase {...};
-REGISTER_OP(mulOp, "mul");
-```
-
-so that in Python we can create operator mul by:
-
-```python
-X1 = Var()
-X2 = Var()
-Y = Var()
-paddle.cpp.create_operator("mul", input=[X1, X2], output=Y)
-```
-
-Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`:
-
-```c++
-template <typename T>
-class FCOp : public OperatorBase {
- public:
- void Run(...) {
- add(mul(Input("X"), Input("W")), Input("b"));
- }
-};
-REGISTER_OP(FCOp, "fc");
-```
-
-We need to support such composition in Python as well. To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`. This higher level operator API should be compatible with the layer API.
-
-Let's explain using an example. Suppose that we are going to compose the FC using mul and add in Python, we'd like to have Python functions `mul` and `add` defined in module `operator`:
-
-```python
-def operator.mul(X1, X2):
- O = Var()
- paddle.cpp.create_operator("mul", input={X1, Y1}, output=O)
- return O
-
-def operator.add(X1, X2):
- O = Var()
- paddle.cpp.create_operator("add", input={X1, X2}, output=O)
- return O
-```
-
-Above code snippets are automatically generated. Given them, users can define
-
-```python
-def layer.fc(X):
- W = Var()
- b = Var()
- return operator.add(operator.mul(X, W), b)
-```
-
-If we don't have `operator.mul` and `operator.add`, the definition of `layer.fc` would be complicated:
-
-```python
-def layer.fc(X):
- W = Var()
- b = Var()
- O1 = Var()
- paddle.cpp.create_operator("mul", input=[X, W], output=O1)
- O2 = Var()
- paddle.cpp.create_operator("add", input=[O1, b], output=O2)
- return O2
-```
-
-We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in the above illustrative example:
-
-|                        |              |              |             |          |
-|------------------------|--------------|--------------|-------------|----------|
-| C++ functions/functors | mul          | add          |             |          |
-| C++ operator class     | mulOp        | addOp        | FCOp        |          |
-| Python binding         | operator.mul | operator.add | operator.fc |          |
-| Python function        |              |              |             | layer.fc |
-
-This is how we differentiate layers and operators in PaddlePaddle:
-
-- those defined in C++ that have a lightweight Python wrapper in the module `operators` are operators; whereas
-- those that don't have a C++ implementation but are Python compositions of C++ operators are known as layers.
diff --git a/doc/fluid/design/concepts/images/multiple_reader.png b/doc/fluid/design/concepts/images/multiple_reader.png
deleted file mode 100644
index b22126b31db4982c13fc3a0827805e6aaf955046..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/concepts/images/multiple_reader.png and /dev/null differ
diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.dot b/doc/fluid/design/concepts/images/parallel_executor_overview.dot
deleted file mode 100644
index 40753cb140540c08d9d4c449b8d377e315280436..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/images/parallel_executor_overview.dot
+++ /dev/null
@@ -1,83 +0,0 @@
-digraph G {
- subgraph cluster_init {
- label="Initialization"
- startup_program [label="startup", shape=box]
- node_w_g0 [label="W\nGPU0"]
- startup_program -> node_w_g0 [label="Initialize"]
- node_w_g1 [label="W\nGPU1"]
- node_w_g0 -> node_w_g1 [label="broadcast"]
- }
-
- subgraph cluster_train {
- label="forward_backward"
-
- subgraph cluster_gpu0 {
- label="GPU0"
- fc_0 [label="fc\nGPU0", shape=box]
- hidden_0 [label="hidden\nGPU0"]
- node_w_g0 -> fc_0
- fc_0 -> hidden_0
- loss0 [label="loss\nGPU0"]
- hidden_0 -> loss0 [label="many ops omitted"]
- scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box]
- loss_g0 [label="loss_grad\nGPU0"]
- scale_loss_0->loss_g0
-
- fc_g_0 [label="w_grad\nGPU0", shape=box]
- loss0 -> fc_g_0
- loss_g0 -> fc_g_0
- hidden_0 -> fc_g_0
- }
-
- subgraph cluster_gpu1 {
- label="GPU1"
- fc_1 [label="fc\nGPU1", shape=box]
- hidden_1 [label="hidden\nGPU1"]
- node_w_g1 -> fc_1
- fc_1 -> hidden_1
- loss1 [label="loss\nGPU1"]
- hidden_1 -> loss1 [label="many ops omitted"]
- scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box]
- loss_g1 [label="loss_grad\nGPU1"]
- scale_loss_1->loss_g1
-
- fc_g_1 [label="w_grad\nGPU1", shape=box]
- loss1 -> fc_g_1
- loss_g1 -> fc_g_1
- hidden_1 -> fc_g_1
- }
- }
-
- all_reduce_w [label="Merge Gradients(AllReduce)", shape=box]
- fc_g_0 -> all_reduce_w
- fc_g_1 -> all_reduce_w
-
- fc_g_0_merged [label="w_grad\nMerged\nGPU0"]
- fc_g_1_merged [label="w_grad\nMerged\nGPU1"]
- all_reduce_w -> fc_g_0_merged
- all_reduce_w -> fc_g_1_merged
-
- subgraph cluster_optimization {
- label="Optimization"
- subgraph cluster_opt_gpu0 {
- label="GPU0"
- sgd_0 [label="SGD Op\nGPU0", shape=box]
-
- fc_g_0_merged -> sgd_0
- node_w_g0 -> sgd_0
- optimized_w_0 [label="Optimized W\nGPU0"]
- sgd_0 -> optimized_w_0
- }
- subgraph cluster_opt_gpu1 {
- label="GPU1"
- sgd_1 [label="SGD Op\nGPU1", shape=box]
-
- fc_g_1_merged -> sgd_1
- node_w_g1 -> sgd_1
- optimized_w_1 [label="Optimized W\nGPU0"]
- sgd_1 -> optimized_w_1
- }
- }
-
-
-}
diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.png b/doc/fluid/design/concepts/images/parallel_executor_overview.png
deleted file mode 100644
index d890c0ffee3b38dc7cb74a2b56c2ab4831532211..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/concepts/images/parallel_executor_overview.png and /dev/null differ
diff --git a/doc/fluid/design/concepts/images/readers.png b/doc/fluid/design/concepts/images/readers.png
deleted file mode 100644
index fd59168ce16c9e2a0ef45303c28c997cfd7740be..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/concepts/images/readers.png and /dev/null differ
diff --git a/doc/fluid/design/concepts/index_cn.rst b/doc/fluid/design/concepts/index_cn.rst
deleted file mode 100644
index dcdc894937ff328e6002623275ca3c65e87b2bb0..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/index_cn.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-核心概念
--------------
-
-.. toctree::
- :maxdepth: 1
-
- README.md
- cpp_data_feeding.md
- functions_operators_layers.md
- program.md
- variable.md
- var_desc.md
- tensor.md
- tensor_array.md
- lod_tensor.md
- block.md
- scope.md
- executor.md
- parallel_executor.md
diff --git a/doc/fluid/design/concepts/index_en.rst b/doc/fluid/design/concepts/index_en.rst
deleted file mode 100644
index b85a3055746facaa642e8fc899976b58435f1ef2..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/index_en.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-Core Concepts
---------------------------------------
-
-.. toctree::
- :maxdepth: 1
-
- README.md
- cpp_data_feeding.md
- functions_operators_layers.md
- program.md
- variable.md
- var_desc.md
- tensor.md
- tensor_array.md
- lod_tensor.md
- block.md
- scope.md
- executor.md
- parallel_executor.md
diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
deleted file mode 100644
index 748488f6d5f2f1272e87b89047570632418da8dc..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/lod_tensor.md
+++ /dev/null
@@ -1,211 +0,0 @@
-# Design Doc: LoD (Level-of-Detail) Tensor
-
-Like other deep learning systems, PaddlePaddle supports training models from sequence data. Also, like other systems, PaddlePaddle represents a mini-batch of sequences as a Tensor. What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus there is no need for padding zeros.
-
-
-|                | TensorFlow | PaddlePaddle |
-|----------------|------------|--------------|
-| RNN            | Support    | Support      |
-| recursive RNN  | Support    | Support      |
-| padding zeros  | Must       | No need      |
-| blob data type | Tensor     | LoDTensor    |
-
-PaddlePaddle achieves this flexibility by passing a new data type, the *LoD Tensor*, between operators. A LoD Tensor is a Tensor attached with a segmentation index known as the *LoD*. The LoD index doesn't only segment a tensor; it also recursively segments sub-sequences. This document presents the design of LoD and LoDTensor.
-
-
-## The Challenge: Variable-length Sequences
-
-Most deep learning systems represent a mini-batch as a Tensor. For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor. Another example is that each mini-batch contains N sentences, where each word is a D-dimensional one-hot vector. Suppose that all sentences have the same length L; then we can represent this mini-batch by an NxLxD tensor.
-
-Both examples show that the elements of sequences are usually of the same size. In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors. It doesn't make sense to allow variable-sized images, as that would require transformations like convolution to handle variable-sized Tensors.
-
-The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences. Also, sequences might consist of sub-sequences.
-
-
-## A Solution: The LoD Index
-
-To understand our solution, it is best to look at some examples.
-
-### A Mini-Batch of Sentences
-
-Let's imagine a mini-batch of 3 variable-length sentences composed of 3, 1, and 2 words, respectively. We can represent the mini-batch by a (3+1+2)xD tensor plus some index information:
-
-```
-3 1 2
-||| | ||
-```
-
-where each `|` represents a D-dimensional word vector. The numbers, 3, 1, and 2, form a 1-level LoD.
-
-### Recursive Sequences
-
-Let's check another example of a 2-level LoD Tensor. Consider a mini-batch of three articles with 3, 1, and 2 sentences, where each sentence consists of a variable number of words:
-
-```
-3 1 2
-3 2 4 1 2 3
-||| || |||| | || |||
-```
-
-### A Mini-Batch of Videos
-
-LoD tensors generalize to the case where elements are higher dimensional objects, like images. Suppose that a mini-batch contains videos of the same frame size 640x480. Here is a mini-batch of 3 videos with 3, 1, and 2 frames, respectively.
-
-```
-3 1 2
-口口口 口 口口
-```
-
-The underlying tensor is of size (3+1+2)x640x480, and each `口` represents a 640x480 image.
-
-### A Mini-Batch of Images
-
-In traditional cases, like a mini-batch of N fixed-sized images, the LoD Tensor representation is as follows:
-
-```
-1 1 1 1 1
-口口口口 ... 口
-```
-
-In this case, we don't lose any information by ignoring the many 1's in the index and simply considering this LoD Tensor as a usual Tensor:
-
-```
-口口口口 ... 口
-```
-
-### Model Parameters
-
-A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**.
-
-
-## The LoD Tensor
-
-Let us revisit above example of the 2-level LoD Tensor
-
-```
-3 1 2
-3 2 4 1 2 3
-||| || |||| | || |||
-```
-
-It is indeed a tree, where leaves are elementary sequences identified by **branches**.
-
-For example, the third sentence in the above example is identified by branch <0,2>, where 0 indicates the first article with length 3, and 2 indicates the third sentence in this article with length 4.
-
-### The LoD Index
-
-We can save the LoD index in the above example
-
-```
-3 1 2
-3 2 4 1 2 3
-```
-
-in a not-full 2D matrix:
-
-```c++
-typedef std::vector<std::vector<size_t> > LoD;
-```
-
-where
-
-- `LoD.size()` is the number of levels, or the maximum length of branches,
-- `LoD[i][j]` is the length of the j-th segment at the i-th level.
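-
-For instance, the length-based LoD index of the 2-level example above could be constructed like this (an illustrative sketch, not code from the original design):
-
-```c++
-LoD lod = {{3, 1, 2},             // level 0: sentences per article
-           {3, 2, 4, 1, 2, 3}};   // level 1: words per sentence
-```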
-
-## The Offset Representation
-
-To quickly access elementary sequences, we adopt an offset representation -- instead of saving the lengths, we save the beginning and ending elements of sequences.
-
-In the above example, we accumulate the length of elementary sequences:
-
-```
-3 2 4 1 2 3
-```
-
-into offsets
-
-```
-0 3 5 9 10 12 15
- = = = = = =
- 3 2+3 4+5 1+9 2+10 3+12
-```
-
-so we know that the first sentence is from word 0 to word 3, and the second sentence from word 3 to word 5.
-
-Similarly, the lengths in the top level LoD
-
-```
-3 1 2
-```
-
-are transformed into offsets of elements/words as follows:
-
-```
-0 3 4 6
- = = =
- 3 3+1 4+2
-```
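-
-A minimal sketch (not Paddle code) of the length-to-offset conversion described above, using a running sum:
-
-```c++
-std::vector<size_t> LengthsToOffsets(const std::vector<size_t>& lengths) {
-  std::vector<size_t> offsets(lengths.size() + 1, 0);
-  for (size_t i = 0; i < lengths.size(); ++i) {
-    offsets[i + 1] = offsets[i] + lengths[i];
-  }
-  return offsets;  // e.g. {3, 2, 4, 1, 2, 3} -> {0, 3, 5, 9, 10, 12, 15}
-}
-```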
-
-## Slicing of LoD Tensors
-
-
-When we use the above 2-level LoD Tensor as the input to a nested RNN, we need to retrieve certain sequences. Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
-
-For example, the <2>-slice of above example is
-
-```
-10 15
-10 12 15
- || |||
-```
-
-and the <2,0>-slice of above slice is
-
-```
-10 12
- ||
-```
-
-## Length Representation vs Offset Representation
-
-The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult.
-Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API.
-Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python:
-```Python
-# length representation of lod called recursive_sequence_lengths
-recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]]
-# Create a LoDTensor that has the above recursive_sequence_lengths info.
-# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood.
-tensor = fluid.LoDTensor(recursive_seq_lens)
-
-# Set/Change the recursive_sequence_lengths info of LoDTensor
-tensor.set_recursive_sequence_lengths([[3, 1, 2]])
-# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted
-# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]]
-new_recursive_seq_lens = tensor.recursive_sequence_lengths()
-```
diff --git a/doc/fluid/design/concepts/parallel_executor.md b/doc/fluid/design/concepts/parallel_executor.md
deleted file mode 100644
index 4f88e27bed722e9f2f535e368926fe49b4e72e56..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/parallel_executor.md
+++ /dev/null
@@ -1,104 +0,0 @@
-# ParallelExecutor
-
-## Background
-
-Neural network models are defined as a `ProgramDesc` in Fluid. The `ProgramDesc` can be executed by an interpreter (i.e., the `executor` concept in Fluid). The instructions or operators in a `Program` are executed, and the results are fetched on the Python side.
-
-The executor is a very naive interpreter: it runs operators one by one. We can use `Parallel.Do` to support data parallelism; however, since `ProgramDesc` lacks device information, it is not possible to optimize the performance of `Parallel.Do`.
-
-We want a `ProgramDesc` that can run on different nodes, so it is better not to embed device information in `ProgramDesc`. Instead, we can write a high-performance interpreter that holds an alternative intermediate representation of `ProgramDesc` to take full advantage of multiple GPUs.
-
-ParallelExecutor is an interpreter of `ProgramDesc` which executes the `Program` [out of order](https://en.wikipedia.org/wiki/Out-of-order_execution) in data-parallel mode and maximizes the utilization of multiple GPUs.
-
-
-## Overview of MultiGPUs logic
-
-The ParallelExecutor takes the startup program and the main program as inputs. The parameters are initialized on `GPU0` by the startup program and broadcast to the other GPUs. The main program is duplicated onto each GPU. The gradients are merged during each iteration, and each device then optimizes its parameters independently. Since the gradients on each device are merged before parameter optimization, the parameters stay the same on every device, so there is no need to broadcast them again.
-
-![alt](images/parallel_executor_overview.png)
-
-There are several optimizations for this logic.
-
-1. We use an alternative representation in ParallelExecutor because device information is critical for performance optimization.
-2. The execution is out-of-order, i.e., an operator is executed whenever its inputs are ready.
- * A GPU is a high-throughput device; a single CPU thread cannot keep one GPU busy, so a thread pool is used to execute operators.
- * Out-of-order execution also helps transpilers generate `ProgramDesc`: a transpiler does not need to worry about finding the best-performing operator order.
-3. Computation, gradient merging, and data fetching use different streams.
-
-The performance of `ResNeXt152` on `TitanX` with `batch_size=12` is shown below.
-
-| Number of GPUs | 1 | 2 | 3 | 4|
-| --- | --- | --- | --- | --- |
-| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 |
-| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 |
-
-
-## Static single assignment Graph
-
-[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form) (`SSA` for short) is a common form for compiler optimization. To implement concurrent execution, we use an `SSA` graph as an intermediate representation of `ProgramDesc`.
-
-Although a `Program` is a directed acyclic graph, a variable can be assigned multiple times. We enforce that each variable is assigned only once by adding a version number to variables, and we parse the `Program` into an `SSA` graph. ParallelExecutor also duplicates the `Program` onto multiple devices, adds a device number to variables, and inserts `NCCLAllReduce` operators into the graph.
-
-The data structure of `SSA` graph is:
-
-```c++
-struct VarHandleBase {
-  OpHandleBase* generated_op_;
-  vector<OpHandleBase*> pending_ops_;
-
-  string name;
-  Place place;
-  size_t version;
-};
-
-struct OpHandleBase {
-  vector<VarHandleBase*> inputs_;
-  vector<VarHandleBase*> outputs_;
-};
-
-struct SSAGraph {
-  // vars on each device.
-  // * the vars in each map of the vector are on a different device.
-  // * each map maps a variable name to the variable handles
-  //   with different versions.
-  vector<unordered_map<string, vector<VarHandleBase*>>> vars_;
-
-  // All ops
-  vector<OpHandleBase*> ops_;
-};
-```
-The variable handles are wrappers of `Variable`s. The operator handles are wrappers of `OperatorBase`. Some `OpHandle`s are not wrappers of an `OperatorBase`, such as `NCCLAllReduceOpHandle`, because `NCCLAllReduceOpHandle` uses new device contexts.
-
-When the `ProgramDesc` is converted into an `SSA` graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem also needs to be taken care of. Dummy variables, which represent dependencies between operators, are inserted into the SSA graph to resolve the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem.
-
-## Execute SSA Graph
-
-The SSA graph can be executed out of order by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is:
-
-1. Maintain a map from each operator to the number of its inputs that have not been generated yet.
-2. If a variable is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the pending input count of its pending operators.
-3. If an operator's pending input count drops to zero, run that operator.
-4. After running the operator, mark its output variables as generated and repeat step 2 until all variables are generated.
-
-Running an operator can be asynchronous; a thread pool executes the `SSA` graph. A sequential sketch of the loop is shown below.
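-
-The following minimal sketch assumes the `SSAGraph` structures above and hypothetical helpers `AllVars` and `RunOp`; the real ParallelExecutor submits ready operators to the thread pool instead of running them inline:
-
-```c++
-std::unordered_map<OpHandleBase*, size_t> pending;  // op -> inputs not generated yet
-std::queue<VarHandleBase*> ready;                   // variables that are already generated
-
-for (OpHandleBase* op : graph.ops_) pending[op] = op->inputs_.size();
-for (VarHandleBase* var : AllVars(graph)) {
-  if (var->generated_op_ == nullptr) ready.push(var);  // step 2: fed from outside the graph
-}
-
-while (!ready.empty()) {
-  VarHandleBase* var = ready.front();
-  ready.pop();
-  for (OpHandleBase* op : var->pending_ops_) {
-    if (--pending[op] == 0) {          // step 3: all inputs of this op are available
-      RunOp(op);                       // in practice, submitted to the thread pool
-      for (VarHandleBase* out : op->outputs_) ready.push(out);  // step 4
-    }
-  }
-}
-```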
-
-## Synchronize GPU Kernels
-
-The GPU is a non-blocking device. Different streams need to be synchronized when switching streams. In the current implementation, synchronization is based on the following algorithm:
-
-1. `OpHandle` records the `DeviceContext` that it uses.
-2. In `OpHandle::Run`, if the `DeviceContext` of the current operator is different from the `DeviceContext` of any input variable, wait for the operator that generated that input variable.
-
-The `wait` is implemented by one of two strategies:
-
-1. Invoke `DeviceContext->Wait()`, which waits until all operators on this device context have completed.
-2. Use `cudaStreamWaitEvent` to send an event to the stream. It is a non-blocking call; the wait is performed on the GPU.
-
-Generally, `cudaStreamWaitEvent` has better performance, but the `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed at runtime. A rough CUDA-level illustration of strategy 2 follows.
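-
-The sketch below uses plain CUDA runtime calls; `producer_stream` and `consumer_stream` are hypothetical streams, and this is not the actual `OpHandle` code:
-
-```c++
-cudaEvent_t event;
-cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
-cudaEventRecord(event, producer_stream);         // mark the producer stream's progress
-cudaStreamWaitEvent(consumer_stream, event, 0);  // consumer waits on the GPU; the CPU thread is not blocked
-```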
-
-## What's next?
-
-* Merging gradients of dense parameters has been done; merging gradients of sparse parameters has not.
-* The CPU version of ParallelExecutor has not been implemented. The out-of-order logic will make CPU computation faster, too.
-* A better strategy to merge gradients can be introduced. We can shrink the gradients from `float32` to `int8` or `int4` while merging. It will significantly speed up multi-GPU training without much loss of precision.
-* Combine with the multi-node implementation. Thanks to out-of-order execution, the send and recv operators can be blocking operators, and the transpiler does not need to worry about their best placement.
diff --git a/doc/fluid/design/concepts/program.md b/doc/fluid/design/concepts/program.md
deleted file mode 100644
index cfcd21ecdb9d2844bf93ed98a56db09651077c40..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/program.md
+++ /dev/null
@@ -1,139 +0,0 @@
-# Design Doc: PaddlePaddle Programs
-
-## Compile and Execution
-
-A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
-
-A simple example PaddlePaddle program can be found in [graph.md](../others/graph.md):
-
-```python
-x = layer.data("images")
-l = layer.data("label")
-y = layer.fc(x)
-cost = layer.mse(y, l)
-optimize(cost)
-train(cost, reader=mnist.train())
-```
-
-The first five lines of the above PaddlePaddle program generate, or compile, the `ProgramDesc` message. The last line runs it.
-
-## Programs and Blocks
-
-The basic structure of a PaddlePaddle program is a set of nested blocks, just like a C++ or Java program.
-
-- program: some nested blocks
-- [block](./block.md):
- - some local variable definitions, and
- - a sequence of operators
-
-The concept of block comes from usual programs. For example, the following C++ program has three blocks:
-
-```c++
-int main() { // block 0
- int i = 0;
- if (i < 10) { // block 1
- for (int j = 0; j < 10; j++) { // block 2
- }
- }
- return 0;
-}
-```
-
-The following PaddlePaddle program has three blocks:
-
-```python
-import paddle as pd  # block 0
-
-x = minibatch([10, 20, 30]) # shape=[None, 1]
-y = var(1) # shape=[1], value=1
-z = minibatch([10, 20, 30]) # shape=[None, 1]
-cond = larger_than(x, 15) # [false, true, true]
-
-ie = pd.ifelse()
-with ie.true_block():  # block 1
- d = pd.layer.add_scalar(x, y)
- ie.output(d, pd.layer.softmax(d))
-with ie.false_block():  # block 2
- d = pd.layer.fc(z)
- ie.output(d, d+1)
-o1, o2 = ie(cond)
-```
-
-## `BlockDesc` and `ProgramDesc`
-
-All protobuf messages are defined in `framework.proto`.
-
-`BlockDesc` is straightforward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
-
-```protobuf
-message BlockDesc {
- required int32 parent = 1;
- repeated VarDesc vars = 2;
- repeated OpDesc ops = 3;
-}
-```
-
-The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks.
-
-All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
-
-```protobuf
-message ProgramDesc {
- repeated BlockDesc blocks = 1;
-}
-```
-
-
-### Global Block
-
-The global block is the first one in the above array.
-
-## Operators that Use Blocks
-
-In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
-
-The definition of `OpDesc` shows that an operator could have some attributes:
-
-```protobuf
-message OpDesc {
- AttrDesc attrs = 1;
- ...
-}
-```
-
-and an attribute could be of type block, which is, in fact, a block ID as described above:
-
-```
-message AttrDesc {
- required string name = 1;
-
- enum AttrType {
- INT = 1,
- STRING = 2,
- ...
- BLOCK = ...
- }
- required AttrType type = 2;
-
- optional int32 block = 10; // when type == BLOCK
- ...
-}
-```
-
-## InferShape
-
-With this design, the InferShape function should take the following parameters:
-
-```c++
-void InferShape(int current_block,
- int current_operator,
- ProgramDesc* program // might change VarDesc values.
- ) {
- ...
-}
-```
-
-where
-
-- `current_block` indexes into `ProgramDesc::blocks`,
-- `current_operator` indexes into `BlockDesc::ops`.
diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md
deleted file mode 100644
index dffee8e02bacbc99bdfa8c54f1a146de340ad778..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/python_data_feeding.md
+++ /dev/null
@@ -1,130 +0,0 @@
-# Python Data Feeding
-
-In the former implementation of Paddle Fluid, there are two ways to feed data:
-
-- Use `reader_op` on the backend C++ side. This method only supports data feeding from RecordIO files and random data generators, but supports many kinds of `decorated_readers`. For example, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details.
-
-- Feed data directly using `DataFeeder.feed()` in Python code. It is more flexible than the first way: many kinds of preprocessing steps can be performed before feeding using Python or any other language, instead of adding many uncommon `operators` on the C++ side. But this method is less efficient: the program cannot read the next mini-batch before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance.
-
-In this document, we design a Python data feeding process that combines the efficiency of the first way and the flexibility of the second way. A data queue, `LoDTensorBlockingQueue`, is shared by the Python and C++ sides: a `LoDTensorArray` is pushed into the queue on the Python side, and a `reader_op` on the C++ side reads the data out of the queue.
-
-
-## Design of LoDTensorBlockingQueue
-`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` that accepts `std::vector<framework::LoDTensor>` items with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer the construction of `LoDTensorBlockingQueue`.
-
-```C++
-class LoDTensorBlockingQueueHolder;
-
-class LoDTensorBlockingQueue {
- friend class LoDTensorBlockingQueueHolder;
- private:
- // `LoDTensorBlockingQueue` can only be constructed by
- // `LoDTensorBlockingQueueHolder::InitOnce()`
- LoDTensorBlockingQueue(size_t capacity, const std::vector<DDim>& dims);
-
- public:
- size_t Size() const { return queue_.Size(); } // Get the current size of the queue
-
- size_t Cap() const { return queue_.Cap(); }// Get the capacity of the queue
-
- void Close() { return queue_.Close(); }
-
- bool IsClosed() const { return queue_.IsClosed(); }
-
- // Block if Size() == Cap()
- // Return false only when queue_.IsClosed() == true
- bool Push(const std::vector<framework::LoDTensor> &lod_tensor_vec);
-
- // Block if Size() == 0.
- // *Success == false when queue_.IsClosed() == true
- std::vector<framework::LoDTensor> Pop(bool *success = nullptr);
-
- private:
- // Use reader::BlockingQueue as the inner data structure
- BlockingQueue<std::vector<framework::LoDTensor>> queue_;
- std::vector<DDim> dims_;
-};
-
-class LoDTensorBlockingQueueHolder {
- public:
- // Call the constructor of `LoDTensorBlockingQueue` to create queue_
- // `InitOnce` can only called once, otherwise an exception would raise
- void InitOnce(size_t capacity, const std::vector<DDim>& dims) {
- PADDLE_ENFORCE(queue_ == nullptr);
- queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
- }
-
- const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const { return queue_; }
-
- private:
- std::shared_ptr<LoDTensorBlockingQueue> queue_;
-};
-```
-
-There are some major points that must be taken care of:
-- `LoDTensorBlockingQueueHolder` should be a `Variable` in the global scope, so that `reader_op` can find it when reading data.
-- A `Variable` of type `LoDTensorBlockingQueueHolder`, but not a `VarDesc`, must be created in Python code before `Executor::Run()`, so that `Executor::Run()` can get the feeding data when it is called.
-- `Create_reader_op` should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input.
-
-
-## Release of the GIL in pybind
-`Pybind11::gil_scoped_release` is used to release the GIL (Global Interpreter Lock) when the `LoDTensorBlockingQueue::Push()` or `Executor::Run()` methods are invoked from the Python side, allowing `LoDTensorBlockingQueue::Push()` and `Executor::Run()` to run in parallel. A minimal binding sketch is shown below.
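-
-The sketch uses assumed names (`m`, `queue_push`) and is not the actual Paddle pybind file:
-
-```C++
-m.def("queue_push", [](LoDTensorBlockingQueue &queue,
-                       const std::vector<framework::LoDTensor> &batch) {
-  pybind11::gil_scoped_release release;  // let other Python threads (e.g. one calling Executor::Run) proceed
-  return queue.Push(batch);              // may block until the queue has room
-});
-```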
-
-
-## Design of PyReader
-`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object.
-```C++
-class PyReader : public ReaderBase {
- public:
- explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue);
-
- void ReadNext(std::vector<framework::LoDTensor>* out) override {
- bool success;
- *out = queue_->Pop(&success);
- if (!success) out->clear();
- }
-
- void ReInit() override { return; }
-
- private:
- std::shared_ptr<LoDTensorBlockingQueue> queue_;
-};
-```
-
-
-## Design of CreatePyReaderOp
-`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue` which indicates the name of the `LoDTensorBlockingQueueHolder` variable.
-```C++
-class CreatePyReaderOp : public framework::OperatorBase {
- public:
- using framework::OperatorBase::OperatorBase;
- private:
- void RunImpl(const framework::Scope& scope,
- const platform::Place& dev_place) const override {
- auto* out = scope.FindVar(Output("Out"))
- ->template GetMutable<framework::ReaderHolder>();
- if (out->Get() != nullptr) return;
-
- const std::string& queue_name = Input("blocking_queue");
- auto* queue_holder_var = scope.FindVar(queue_name);
- PADDLE_ENFORCE(queue_holder_var != nullptr);
- auto* queue_holder = queue_holder_var
- ->template GetMutable<LoDTensorBlockingQueueHolder>();
- out->Reset(new PyReader(queue_holder->GetQueue()));
- }
-};
-```
-
-## Design of Python codes
-The design of the Python code is as follows. First, we construct a variable of `LoDTensorBlockingQueueHolder` and initialize it with the given parameters, obtaining the `LoDTensorBlockingQueue` object after initialization. After that, a layer of `CreatePyReaderOp` is constructed and accepts the name of the `LoDTensorBlockingQueueHolder` variable. The `LoDTensorBlockingQueue` object and the result of the layer are both returned.
-```Python
-def py_reader(capacity, shapes):
- queue_name = unique_name.generate("lod_tensor_blocking_queue")
- var = global_scope().var(queue_name) # create LoDTensorBlockingQueueHolder Variable
- feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) # init the queue
- out = create_var()
- create_py_reader_op_with_queue_name(
- inputs={'blocking_queue': queue_name},
- outputs={'Out':[out]})
- return out, feed_queue
-```
diff --git a/doc/fluid/design/concepts/scope.md b/doc/fluid/design/concepts/scope.md
deleted file mode 100644
index dcf76649357aaef80d6bc1a933ece8c4c1063547..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/scope.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# Design of Scope in Paddle
-
-## Overview
-
-Scope is an important concept in programming languages: it defines a program region in which a set of bindings between names and entities applies. In a specific scope, a valid name is uniquely associated with an entity, such as a variable; in another scope, the same name may refer to a different entity or to nothing at all. Scope clearly restricts the visibility and validity of names in a program. Hence **Scope** is introduced to PaddlePaddle to manage variables in context. But different from the original abstract concept, Scope now becomes an object with two important attributes:
-
-- Scope is an association of a name to variable.
-- Variables in a parent scope can be retrieved from local scope.
-
-A detailed explanation of these two attributes goes as following.
-
-
-## Scope is an association of a name to variable.
-
-Scope is an association of a name to a variable. All variables belong to a `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variables in those scopes.
-
-
-1. Scope only contains a map of a name to variable.
-
- All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state (momentum) etc.
-
-1. Variables can only be created by a Scope, and a variable can only be retrieved from a Scope. Users cannot create or get a variable outside a scope. This is a constraint of our framework, and it keeps our framework simple and clear.
-
-1. Scope only contains methods that are used to Create and Get Variables. Scope does not contain Operators and has no information to run them.
- `Net` is designed to drive the computation, and Scope only contains a map of variables. There is no computation logic inside a `Scope`. Scope just handles the lifetime management of variables.
- - `Create` is used to create a Variable by its name and add the mapping relation.
- - `Get` is used to find a Variable by name.
-
-1. Every variable only belongs to one certain Scope.
-
- A variable cannot belong to multiple scopes. If you want to use variables from a parent scope, retrieve them through the parent scope.
-
-1. A Scope should destroy all Variables inside it when it is itself destructed. Users should never store a `Variable` pointer somewhere else.
-
- Because a Variable can only be obtained from a Scope, destroying a Scope also destroys all the Variables in it. If a user stores a `Variable` pointer in a private data member or some global variable, the pointer becomes invalid when the associated `Scope` is destroyed.
-
-```cpp
-class Scope {
- public:
- Variable* Var(const std::string& name);
- const Variable* FindVar(const std::string& name) const;
-
- private:
- std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
-};
-```
-
-
-## Parent scope and local scope
-
-Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope.
-
-1. We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed.
-2. Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when a user gets a variable from a scope, the scope first searches for the variable locally; if there is no such variable in the local scope, the scope keeps searching in its parent, until the variable is found or there is no parent.
-
-```cpp
-class Scope {
- public:
- Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}
-
- Variable* FindVar(const std::string& name) const {
- auto it = vars_.find(name);
- if (it != vars_.end()) {
- return it->second.get();
- } else if (parent_ != nullptr) {
- return parent_->FindVar(name);
- } else {
- return nullptr;
- }
- }
-
- private:
- std::shared_ptr<Scope> parent_ {nullptr};
- std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
-};
-```
-
-In the `Scope` class, there is a private data member called `parent_`. `parent_` is a smart pointer to its parent scope. When a user `Get`s a variable by its `name`, the `name` is first searched inside the current scope. If the variable cannot be found locally and the parent scope is not `nullptr`, the variable will be searched for inside that parent scope. The default value of the `parent_` pointer is `nullptr`, which means that the scope is a global scope when `parent_` is `nullptr`.
-
-A local scope is very useful when we implement a Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of a timestep (`StepNet` for short) should use an independent local scope, just as variables in a while loop live inside a local scope in programming languages. By using a single `StepNet` and changing the local scope, we can implement an RNN easily. A small usage sketch follows.
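-
-Assuming the `Scope::Create`/`Var`/`FindVar` interface shown in the next section, local and parent scopes could be used like this:
-
-```cpp
-auto global = Scope::Create();        // parent_ == nullptr, so this is a global scope
-global->Var("W");                     // a parameter shared by all timesteps
-auto step = Scope::Create(global);    // local scope for one RNN timestep
-step->Var("hidden");                  // timestep-local state
-step->FindVar("W");                   // found by walking the parent_ chain
-global->FindVar("hidden");            // nullptr: locals are invisible to the parent
-```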
-
-## Interface Design
-
-```cpp
-class Variable {
- private:
- Variable() = default;
- friend class Scope;
-};
-
-class Scope {
- private:
- Scope(const std::shared_ptr<Scope>& parent = nullptr);
-
- public:
- static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);
-
- // return nullptr if not found.
- Variable* FindVar(const std::string& name) const;
-
- // returns an Error if a variable with the same name already exists locally.
- Variable* Var(const std::string& name);
-
- private:
- std::shared_ptr<Scope> parent_;
- std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
-};
-```
-## Only scope can create a variable
-
-To ensure that `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function and make `Scope` a friend class of `Variable`. Then only `Scope::Var` can construct a `Variable`.
-
-## When a scope is destroyed, all variables inside it should be destroyed together
-
-The scope holds unique pointers to all of its variables. A user can `FindVar` from a scope, but should not keep the returned pointer as a member variable, because when the scope is destroyed, all variables inside it are destroyed together.
-
-## Sharing a parent scope
-
-A local scope contains a `parent_` pointer, which forms a linked list of scopes. We use a `shared_ptr` because while a local scope is in use, its parents cannot be destroyed.
-
-Also, since the parent scope is held by a `shared_ptr`, we can only `Create()` a scope as a shared pointer. We cannot construct a scope as a plain variable, because it could not then be passed to another scope as the `parent` pointer.
-
-## Orthogonal interface
-
-`FindVar` will return `nullptr` when `name` is not found, so it can also serve as a `Contains` method. `Var` will return an `Error` when there is a local name conflict. By combining `FindVar` and `Var`, we can easily implement lookup-or-create behavior, as in the sketch below.
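-
-A hypothetical helper (not part of the original design) built on this orthogonal interface:
-
-```cpp
-Variable* GetOrCreateVar(Scope* scope, const std::string& name) {
-  if (Variable* v = scope->FindVar(name)) return v;  // FindVar doubles as Contains
-  return scope->Var(name);                           // otherwise create the variable locally
-}
-```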
diff --git a/doc/fluid/design/concepts/tensor.md b/doc/fluid/design/concepts/tensor.md
deleted file mode 100644
index 0a27ac9bb6b03649d42e12100fda9e80a56e7f56..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/tensor.md
+++ /dev/null
@@ -1,189 +0,0 @@
-# Tensor: A Unified Data Type in PaddlePaddle
-
-## Pain Point
-
-This week, we discussed several potential weaknesses of PaddlePaddle caused by the rapid iteration and development needed to promote new business products over the past four years. For instance, the current Matrix/Vector implementation in PaddlePaddle is long and tedious to read, which seriously hinders contributions from both new and experienced engineers. Worse, it will also become too challenging to maintain over time.
-
-
-## Learn from Majel
-
-Consequently, we decided to refactor PaddlePaddle step by step. First, refactor and replace Matrix/Vector with Tensor, a modern term in deep learning systems. Fortunately, we can learn from Majel how to define a Tensor.
-
-To simplify heterogeneous resource allocation for any dimension (1-9) and type (double, float, float16), Majel consists of several primitives such as `Dim`, `Place` and `Array`, all of which are standard C++ class templates.
-
-1. `Place`: memory location [i.e. CPU/GPU].
-2. `Allocation`: heterogeneous resource allocator [i.e. 20MB in GPU].
-3. `Dim`: size of each dimension. [i.e. Dim<4>({10, 2, 5, 1})]
-4. `Array`: dynamic array consists of `Place`, `Dim`, and a pointer to memory.
-
-If you dig deeper into the Majel source code, you will find that Majel heavily uses `boost.variant`. The variant class template is a safe, generic, stack-based discriminated union container, **offering a simple solution for manipulating an object from a heterogeneous set of types in a uniform manner**. Whereas standard containers such as std::vector may be thought of as "multi-value, single type," variant is "multi-type, single value."
-
-As a simple example, consider the following:
-
-```c++
-#include "boost/variant.hpp"
-#include <iostream>
-
-class my_visitor : public boost::static_visitor<int>
-{
-public:
- int operator()(int i) const
- {
- return i;
- }
-
- int operator()(const std::string & str) const
- {
- return str.length();
- }
-};
-
-int main()
-{
- boost::variant< int, std::string > u("hello world");
- std::cout << u; // output: hello world
-
- int result = boost::apply_visitor( my_visitor(), u );
- std::cout << result; // output: 11 (i.e., length of "hello world")
-}
-```
-
-In Majel, `DDimVar` is derived from `Dim`, `DArrayVar` is from `Array`.
-
-```c++
-template <int i>
-struct Dim {
-...
-int head;
-Dim<i-1> tail;
-}
-```
-
-```c++
-template <typename T, int D>
-class Array : public Buffer {
- ...
-private:
- Dim<D> size_;
- Dim<D> stride_;
- T* ptr_;
-};
-```
-
-```c++
-typedef boost::variant<CpuPlace, GpuPlace> Place;
-typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
-                       Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar;
-typedef boost::variant<
-    Array<float, 1>,
-    Array<float, 2>,
-    Array<float, 3>,
-    Array<float, 4>,
-
-    Array<double, 1>,
-    Array<double, 2>,
-    Array<double, 3>,
-    Array<double, 4>,
-
-    Array<float16, 1>,
-    Array<float16, 2>,
-    Array<float16, 3>,
-    Array<float16, 4> > DArrayVar;
-```
-
-Because `variant` may be thought of as "multi-type, single value", we can utilize it to implement unified interfaces for PaddlePaddle.
-
-`DDim` plays two kinds of roles in Majel. First, it is used to indicate the size of a tensor. For example, we can construct a new `DArray` in the following way:
-
- ```c++
- DArray arr = make_darray(make_ddim({2,3}), 0.0f);
- ```
- It means that `arr` will be a two-dimension tensor, or a matrix. The size of its first dimension is 2 and the second is 3. All the element value of `arr` will be initialized as 0.0 .
-
- The second meaning of `DDim` is tensor index. For example, if we want to access the value in the 1st row and 2nd column of `arr` and set it to 1.0, we can do like this:
-
- ```c++
- arr[make_ddim({0, 1})] = 1.0;
- ```
-
-## Implement Tensor in Paddle
-
-We want to create a Tensor class to replace Vector and Matrix, and to support high-dimensional data. The operations on Tensor are implemented in both CPU and GPU. We also want to make sure that the Tensor interface is friendly to its callers.
-
-Tensor is only responsible for describing computation. It does not take charge of memory allocation policy, handles of CUDA library contexts (e.g., cublasHandle, cudnnHandle), or dispatching CUDA kernels. Paddle already handles hardware initialization and resource management.
-
-Before writing code, please make sure you have already looked through the Majel source code and grasped the design philosophy of `DArray` in Majel.
-
-
-### Memory Management
-`Allocation` manages a block of memory on a device (CPU/GPU). We use `Place` to describe the memory location. The details of memory allocation and deallocation are implemented in `Allocator` and `DeAllocator`. Related low-level APIs such as `hl_malloc_device()` and `hl_malloc_host()` are provided by Paddle.
-
-### Dim and Array
-#### Dim
-
-`Dim` describes the dimension information of an array.
-
-`DDimVar` is an alias of a specialized class of the boost.variant class template.
-
-`DDim` is introduced to represent a dynamically sized dimension.
-
-For example:
-
-```
-Dim<2> d1 = make_dim(3, 3);
-DDim d2 = make_ddim({1, 2, 3});
-```
-
-You must give `Dim` a concrete number of dimensions, whereas `DDim` can represent a dynamically sized dimension.
-
-#### Array
-
-`Array` represents a tensor with a specific type and size.
-
-`DArrayVar` is an alias of a specialized class of the boost.variant class template.
-
-`DArray` is introduced to represent a dynamically typed array.
-
-For example:
-
-```
-Array<float, 2> a1(Dim<2>(2, 2));
-DArray a2 = make_darray(make_ddim({3, 4}), 0.0, CpuPlace());
-```
-
-You must specify the type and dimension of an `Array`, whereas `DArray` can represent a dynamically typed array.
-
-
-Please refer to the `Learn from Majel` section for more details.
-
-### ArrayView
-
-`ViewIterator` is a class template which implements basic iterator operations, including increment (++), decrement (--), dereference (*), equality comparison (==), and so on.
-
-`ArrayView` is an encapsulation of `Array` which introduces extra iterator methods, such as `begin()` and `end()`. The `begin()` method returns an iterator pointing to the first element in the ArrayView, and the `end()` method returns an iterator pointing to the past-the-end element in the ArrayView.
-
-`ArrayView` makes visiting and manipulating an array more efficient, flexible, and safe.
-
-
-A global function `make_view` is provided to transform an array into its corresponding ArrayView.
-
-```
-template <typename T, int D>
-ArrayView<T, D> make_view(const Array<T, D>& in) {
- return in;
-}
-```
-
-A global function `make_iterator` is provided to make an iterator for an array.
-
-```
-template <typename T, int D>
-ViewIterator<ArrayView<T, D>> make_iterator(const Array<T, D>& in, Dim<D> idx) {
- return make_iterator(make_view(in), idx);
-}
-```
-
-### Basic Operations
-
-The operations that manipulate DArray are defined as global functions, such as `ones`, `zeros`, `reshape`, `gemm` and so on.
-
-An array will be transformed into an ArrayView and then passed to the operation, which is launched on a specific device (CPU/GPU).
diff --git a/doc/fluid/design/concepts/tensor_array.md b/doc/fluid/design/concepts/tensor_array.md
deleted file mode 100644
index 37e4f7b90f94fa3eb015e733999cd84c96b2239c..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/tensor_array.md
+++ /dev/null
@@ -1,271 +0,0 @@
-# Design for TensorArray
-This design doc presents the necessity of a new C++ class `TensorArray`.
-In addition to the very simple C++ implementation
-
-```c++
-class TensorArray {
- public:
- explicit TensorArray(const LoDTensor&);
- explicit TensorArray(size_t size);
-
- private:
- vector<LoDTensor> values_;
-};
-```
-
-We also need to expose it to PaddlePaddle's Python API,
-because users would want to use it with our very flexible operators such as `WhileLoop`.
-An example of an RNN based on dynamic operators is
-
-```python
-input = pd.data(...)
-num_steps = Var(12)
-
-states = TensorArray(size=num_steps)
-step_inputs = TensorArray(unstack_from=input)
-step_outputs = TensorArray(size=num_steps)
-
-W = Tensor(...)
-U = Tensor(...)
-default_state = some_op()
-
-step = Var(1)
-
-wloop = paddle.create_whileloop(loop_vars=[step])
-with wloop.frame():
- wloop.break_if(pd.equal(step, num_steps))
- pre_state = states.read(step-1, default_state)
- step_input = step_inputs.read(step)
- state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input))
- states.write(step, state)
- step_outputs.write(step, state) # output state
- step.update(step+1)
-
-output = step_outputs.stack()
-```
-
-## Background
-Steps are one of the core concepts of RNN. In each time step of an RNN, there should be several input segments, states, and output segments; all these components act like arrays. For example, calling `states[step_id]` will get the state at the `step_id`-th time step.
-
-An RNN can be implemented with the following pseudocode
-
-```c++
-Array states;
-Array input_segments;
-Array output_segments;
-Parameter W, U;
-
-step = 1
-seq_len = 12
-while_loop {
- if (step == seq_len) break;
- states[step] = sigmoid(W * states[step-1] + U * input_segments[step]);
- output_segments[step] = states[step] // take state as output
- step++;
-}
-```
-According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support.
-
-Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`.
-
-
-Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with level of details (`LoDTensor` for short).
-Segmenting the `LoDTensor` is much more complicated than splitting a tensor, which makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support.
-
-As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences.
-
-The implementation is similar to `recurrent_op`.
-The key difference is the way **the original input `LoDTensors` and outputs are split to get the `input_segments` and the `output_segments`.**
-
-
-Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly,
-the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same.
-
-## Why `TensorArray`
-The logic behind splitting the inputs into segments, states, and outputs is similar and can be shared in a separate module.
-
-The arrays of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudocode.
-
-So there should be an array-like container, which can store the segments of a tensor or LoD tensor.
-
-**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor** .
-This is where the notion of `TensorArray` comes from.
-
-## Introduce TensorArray to unify all three RNNs
-TensorArray is a new concept borrowed from TensorFlow;
-it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`.
-
-This concept can be used to support our new design of dynamic operations, and it helps to refactor some existing variable-length-sequence-related layers,
-such as `recurrent_op` and `RecurrentGradientMachine`.
-
-In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401),
-`TensorArray` is used to segment inputs and store states in all time steps.
-By providing some methods similar to a C++ array,
-the definition of some state-based dynamic models such as RNN can be more natural and highly flexible.
-
-## Dynamic-operations on TensorArray
-
-`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented
-
-```python
-# several helper operators for TensorArray
-def tensor_array_stack(ta, tensor):
- '''
- get a tensor array `ta`, return a packed `tensor`.
- '''
- pass
-
-def tensor_array_unstack(tensor, ta):
- '''
- get a `tensor`, unstack it and get a tensor array `ta`.
- '''
- pass
-
-def tensor_array_write(ta, index, tensor, data_shared):
- '''
- get a `tensor` and a scalar tensor `index`, write `tensor` into index-th
- value of the tensor array `ta`.
- `data_shared` is an attribute that specifies whether to copy or reference the tensors.
- '''
- pass
-
-def tensor_array_read(ta, index, tensor):
- '''
- get a tensor array `ta`, a scalar tensor `index`, read the index-th value of
- `ta` and return as the `tensor`.
- '''
- pass
-
-def tensor_array_size(ta, tensor):
- '''
- get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`.
- '''
- pass
-```
-
-It is tedious for users to work with so many low-level operators directly, so some helper methods should be provided in the Python wrapper to make `TensorArray` easier to use,
-for example
-
-```python
-class TensorArray:
- def __init__(self, name):
- self.name = name
- self.desc = TensorArrayDesc()
-
- def stack(self, name=None):
- '''
- Pack the values in a `TensorArray` into a tensor with rank one higher
- than each tensor in `values`.
- `stack` can be used to merge all the time steps of an RNN or while loop into one output tensor.
-
- @name: str
- the name of the variable to output.
- '''
- tensor = Var(name)
- tensor_array_stack(self.name, tensor)
- return tensor
-
- def unstack(self, input):
- '''
- Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
- `unstack` can be used to split a tensor into time steps for an RNN or while loop.
-
- @input: str
- the name of input tensor
- '''
- tensor_array_unstack(input, self.name)
-
- def write(self, index, value, data_shared=True):
- '''
- Write value into index of the TensorArray.
- If `data_shared` is set to True, then the index-th value in the TensorArray will
- be shared with the tensor passed in.
-
- @index: str
- name of a scalar tensor
- @value: str
- name of a tensor
- @data_shared: bool
- '''
- tensor_array_write(self.name, index, value, data_shared)
-
- def read(self, index, output):
- '''
- Read the value at location `index` in the `TensorArray`.
-
- @index: str
- name of a scalar tensor
- @output:
- name of a output variable
- '''
- tensor_array_read(self.name, index, output)
-
-
- def size(self, output):
- '''
- Return the number of values.
-
- @output: str
- name of a scalar tensor
- '''
- tensor_array_size(self.name, output)
-```
-
-## LoDTensor-related Supports
-The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variable-length sequences as input and outputs sequences too.
-
-Since each step of an RNN can only take a tensor-represented batch of data as input,
-some preprocessing should be applied to the inputs, such as sorting the sentences by their length in descending order, then cutting each timestep's words and packing them into new batches.
-
-Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`;
-these two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formatted as a LoD tensor rather than a tensor.
-
-Some definitions are like
-
-```python
-def unpack(level, sort_by_length):
- '''
- Split the LoDTensor at some `level` and generate batches; if `sort_by_length`
- is set, sort the sequences by length.
-
- Returns:
- - a new `TensorArray`, whose values are LodTensors and represents batches
- of data.
- - an int32 Tensor, which stores the map from the new batch's indices to
- original LoDTensor
- '''
- pass
-
-def pack(level, indices_map):
- '''
- Recover the original LoD-arranged LoDTensor with the values in a `TensorArray`
- and `level` and `indices_map`.
- '''
- pass
-```
-
-With these two methods, an RNN that supports variable-length sentences can be implemented like
-
-```c++
-// input is the variable-length data
-LodTensor sentence_input(xxx);
-TensorArray ta;
-Tensor indice_map;
-Tensor boot_state = xxx; // to initialize rnn's first state
-TensorArray::unpack(input, 1/*level*/, true/*sort_by_length*/, &ta, &indice_map);
-TensorArray step_outputs;
-TensorArray states;
-
-for (int step = 0; step < ta.size(); step++) {
- auto state = states.read(step);
- // rnnstep is a function which acts like a step of RNN
- auto step_input = ta.read(step);
- auto step_output = rnnstep(step_input, state);
- step_outputs.write(step, step_output, true/*data_shared*/);
-}
-
-// rnn_output is the final output of an rnn
-LoDTensor rnn_output = step_outputs.pack(1/*level*/, indice_map);
-```
-The code above shows that by embedding the LoDTensor-related preprocessing operations into `TensorArray`,
-the implementation of an RNN that supports variable-length sentences is far more concise than `RecurrentGradientMachine`, because the latter mixes all the code together, which is hard to read and extend.
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
deleted file mode 100644
index 8db67f6703d142da71cf06bd4f7e2cb13556f9b0..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/var_desc.md
+++ /dev/null
@@ -1,100 +0,0 @@
-# Design Doc: Var_desc
-
-## Background
-PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
-
-PaddlePaddle uses proto messages to describe the compile-time program because:
-
-1. The computation program description must be serializable and saved in a file.
-1. During distributed training, the serialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on a different worker.
-
-The computation `Program` consists of nested `Blocks`. Each `Block` consists of data (i.e., `Variable`s) and `Operations`. The concepts used to represent them are shown in the table below.
-
-
-|           | compile time   | runtime       |
-|-----------|----------------|---------------|
-| Data      | VarDesc(proto) | Variable(cpp) |
-| Operation | OpDesc(proto)  | Operator(cpp) |
-
-
-
-## Definition of VarType
-
-A VarDesc should have a name, a type, and whether or not it is persistable. There are different kinds of variable types supported in PaddlePaddle, apart from the POD types, such as: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks like the following:
-
-```proto
-message VarDesc {
- required string name = 1;
- required VarType type = 2;
- optional bool persistable = 3 [ default = false ];
-}
-```
-
-## Definition of TensorDesc
-
-```proto
-message TensorDesc {
- // Should only be PODType. Is enforced in C++
- required Type data_type = 1;
- repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
-}
-```
-
-The `Type` here comes from the enum defined inside of `VarType` :
-
-```proto
-enum Type {
- // Pod Types
- BOOL = 0;
- INT16 = 1;
- INT32 = 2;
- INT64 = 3;
- FP16 = 4;
- FP32 = 5;
- FP64 = 6;
-
- // Other types that may need additional descriptions
- LOD_TENSOR = 7;
- SELECTED_ROWS = 8;
- FEED_MINIBATCH = 9;
- FETCH_LIST = 10;
- STEP_SCOPES = 11;
- LOD_RANK_TABLE = 12;
- LOD_TENSOR_ARRAY = 13;
- PLACE_LIST = 14;
- READER = 15;
- CHANNEL = 16;
-}
-```
-
-A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please refer to [`SelectedRows`](./selected_rows.md).
-
-## Definition of LodTensorDesc
-
-```proto
-message LoDTensorDesc {
- required TensorDesc tensor = 1;
- optional int32 lod_level = 2 [ default = 0 ];
-}
-```
-
-A LoDTensorDesc contains a tensor and a lod_level.
-
-## Definition of Variable in Python
-
-For Variable in Python, please refer to the [`Python API`](./python_api.md).
diff --git a/doc/fluid/design/concepts/variable.md b/doc/fluid/design/concepts/variable.md
deleted file mode 100644
index 442ef6b718b227d79ca73031efcbb55817558252..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concepts/variable.md
+++ /dev/null
@@ -1,52 +0,0 @@
-# Design Doc: Variable
-
-
-Variable is also known as *blob* in MxNet and Caffe2. It is the input and output type of operators, where a neural network is a graph of operators.
-
-## Requirements: Lazy Memory Allocation
-
-For the flexibility of a DL system, a variable should be able to contain any typed value -- a tensor in most cases, but could also be some integer IDs or a scope of other variables in the case of RNN.
-
-To use the minimum amount of memory, we would like a variable to allocate memory only when it has to, i.e., lazy memory allocation. Let's take the following example:
-
-```cpp
-Variable vr, v1, v2;
-
-Tensor* t1 = new Tensor();
-Tensor* t2 = new Tensor();
-
-Randomize(
- /* malloc */ v1.GetMutable<Tensor>().mutable_data(DDim(100,200)),
- /* size */ t1.Size());
-
-Randomize(
- /* malloc */ v2.GetMutable<Tensor>().mutable_data(DDim(200,300)),
- /* size */ t2.Size());
-
-Mult(
- /*result*/ vr.GetMutable<Tensor>().mutable_data(SizeOfMult(v1, v2)),
- /*input1*/ v1.Get<Tensor>().data(),
- /*input2*/ v2.Get<Tensor>().data());
-```
-
-We see that a variable holds nothing until `Variable::GetMutable<Tensor>()` allocates a tensor and puts it in the variable. Similarly, a tensor does not get its memory until `Tensor::mutable_data()` is called.
-
-This syntax achieves lazy memory allocation when we call `Randomize` and `Mult`, the functions that mutate the variable, so it saves us some lines of C++ code.
-
-
-## Implementation: Type Hiding
-
-To make memory allocation lazy, we cannot assume that we know the type held by a variable at definition time. In other words, `class Variable` cannot be a template `template <typename T> class Variable`.
-
-Because we don't know the type `T`, we cannot save a `T*` as `Variable`'s data member. Instead, we save an interface object `Placeholder`, which can return the pointer to the saved object via `Placeholder::Ptr()` as a `void*`.
-
-However, `Variable` still needs to know `T` so that it can `delete(ptr)` correctly, and so that `Variable::Get` can check the expected type against the saved object's type.
-
-We save `T` in `PlaceholderImpl`, the implementation of `Placeholder`. Please be aware that `PlaceholderImpl` is a class template and `T` is passed in as a template parameter.
-
-Because `PlaceholderImpl` knows `T`, it can save and return `typeid(T)` for the type comparison in `Variable::Get` and `Variable::GetMutable`. A minimal sketch of this idea is shown below.
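-
-The following is a self-contained illustration of the type-hiding technique, not the actual Paddle implementation:
-
-```cpp
-#include <cassert>
-#include <memory>
-#include <typeinfo>
-
-struct Placeholder {
-  virtual ~Placeholder() = default;
-  virtual void* Ptr() = 0;                          // type-erased pointer to the value
-  virtual const std::type_info& Type() const = 0;   // the erased type T
-};
-
-template <typename T>
-struct PlaceholderImpl : Placeholder {
-  void* Ptr() override { return &value_; }
-  const std::type_info& Type() const override { return typeid(T); }
-  T value_;
-};
-
-class Variable {
- public:
-  template <typename T>
-  T* GetMutable() {
-    if (holder_ == nullptr || holder_->Type() != typeid(T)) {
-      holder_.reset(new PlaceholderImpl<T>());      // lazy allocation on first use
-    }
-    return static_cast<T*>(holder_->Ptr());
-  }
-
-  template <typename T>
-  const T& Get() const {
-    assert(holder_ != nullptr && holder_->Type() == typeid(T));  // type check via RTTI
-    return *static_cast<const T*>(holder_->Ptr());
-  }
-
- private:
-  std::unique_ptr<Placeholder> holder_;             // holds nothing until GetMutable<T>()
-};
-```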
-
-
-## Conclusion
-
-The type-hiding technique utilizes C++ class templates, interfaces and derivation, and C++ RTTI (`typeid`). This combination saves us from defining something like `caffe2::TypeMeta`, which takes hundreds of lines of C++ code.
diff --git a/doc/fluid/design/concurrent/channel.md b/doc/fluid/design/concurrent/channel.md
deleted file mode 100644
index df67438bcc741ac521b00ee962fc13c93db21182..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concurrent/channel.md
+++ /dev/null
@@ -1,139 +0,0 @@
-# Channel Design
-
-## Introduction
-
-A Channel is a data structure that allows for synchronous interprocess
-communication via message passing. It is a fundamental component of CSP
-(communicating sequential processes), and it allows users to pass data
-between threads without having to worry about synchronization.
-
-## How to use it
-
-Paddle offers Python APIs to open and close channels, along with sending
-data to and receiving data from a channel.
-
-### Create a channel
-
-Creates a new channel that takes in variables of a specific dtype.
-
-- **fluid.make_channel(dtype, capacity=0)**
- - **dtype**: The data type of variables being sent/received through channel
- - **capacity**: The capacity of the channel. A capacity of 0 represents
- an unbuffered channel. Capacity > 0 represents a buffered channel
-
-```
-ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=10)
-```
-
-### Close a channel
-
-Closes a channel. Any pending senders and receivers will be awoken during
-this time. Receivers can still receive from a closed channel, but senders
-are not allowed to send any additional data to the channel (Paddle will
-raise an exception if users try to send to a closed channel.)
-
-- **fluid.channel_close(channel)**
-
-```
-fluid.channel_close(ch)
-```
-
-### Send data to a channel
-
-Sends a variable to a channel. Currently, variables of dtype `LoDTensor`,
-`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
-`ChannelHolder` are supported.
-
-By default, the data of the Variable is moved from the sender to the receiver,
-however the user can optionally copy the data before performing the send.
-
-- **channel_send(channel, variable, is_copy=False)**
- - **channel**: The channel to send the variable to
- - **variable**: The variable to send to the channel
- - **is_copy**: If set to True, channel_send will perform a variable assign
- to copy the source variable to a new variable to be sent.
-
-```
-ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=100)
-fluid.channel_send(ch, var, True)
-```
-
-### Receive data from a channel
-
-Receives a variable from a channel. The data of the variable is moved to the
-receiving variable.
-
-- **channel_recv(channel, return_variable)**
- - **channel**: The channel to receive the variable from
- - **return_variable**: The destination variable used to store the data of the
- variable received from the channel
-
-```
-ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=-1)
-fluid.channel_recv(ch, var)
-```
-
-## How it Works
-
-Channels provide a simple interface for different threads to share data.
-To support the synchronization requirements, channels utilize a series of
-internal queues, locks, and condition variables.
-
-### QueueMessage
-
-QueueMessage encapsulates the state of a channel send/receive operation to be
-put in the **sendq/recvq**. It contains a condition variable used to block the
-thread (when there are no available sends/receives). In addition, it contains
-a callback function to notify a thread when the QueueMessage is being
-processed by the channel, as sketched below.
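-
-A rough sketch of the idea, with assumed member names (not the actual Paddle code):
-
-```
-#include <condition_variable>
-#include <functional>
-#include <mutex>
-
-struct QueueMessage {
-  std::mutex mu;
-  std::condition_variable cond;                    // blocks the sending/receiving thread
-  bool completed = false;
-  std::function<void(bool /*success*/)> callback;  // invoked when the message is processed
-
-  void Wait() {
-    std::unique_lock<std::mutex> lock(mu);
-    cond.wait(lock, [this] { return completed; });
-  }
-
-  void Notify(bool success) {
-    {
-      std::lock_guard<std::mutex> lock(mu);
-      completed = true;
-    }
-    cond.notify_one();
-    if (callback) callback(success);
-  }
-};
-```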
-
-### Queues
-
-- **buff_**: This queue holds the data buffer in a buffered channel. The
-capacity is set to the capacity of the channel. This data buffer is not
-used in an unbuffered channel.
-
-- **sendq**: This queue holds the QueueMessage of any pending senders of a
-channel. When a thread performs a channel_send operation on the channel, the
-channel_send operation will put a new QueueMessage on the sendq and block the
-current thread under two conditions:
- 1. The channel is buffered and is full
- 2. The channel is unbuffered and does not have a receiver
-
-- **recvq**: This queue holds the QueueMessage of any pending receivers of a
-channel. When a thread performs a channel_recv operation on the channel, the
-channel_recv operation will put a new QueueMessage on the recvq and block the
-current thread under two conditions:
- 1. The channel is buffered and there is no data on the buff_
- 2. The channel is unbuffered and does not have a sender
-
-### State diagram
-
-#### Channel Send
-
-
-
-
-
-#### Channel Receive
-
-
-
-
-
-## Limitations and Considerations
-
-### Variable Copy
-
-In golang, variables in channels are copied from the sender to the receiver.
-In Paddle, the data of our variables is **moved** from sender to receiver.
-As a result, these variables should not be used after they are sent. We
-provide a flag in the channel_send method to allow users to copy the variable
-before it is sent.
-
-Please note that this is achieved by adding an **assign** operator and creating
-a temporary variable that is sent in place of the original variable. Please
-note that the **assign** operator has limited support for only certain variable
-datatypes.
diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md
deleted file mode 100644
index 0428e74f9e00a87f6b0972057f48479b8ae56ad6..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concurrent/concurrent_programming.md
+++ /dev/null
@@ -1,193 +0,0 @@
-# Design Doc: Concurrent Programming with Fluid
-
-With PaddlePaddle Fluid, users describe a program rather than a model. The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, not the program that trains/uses the model.
-
-Many know that when we program TensorFlow, we can specify the device on which each operator runs. This allows us to create a concurrent/parallel AI application. An interesting question is **how does a `ProgramDesc` represent a concurrent program?**
-
-The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program. So users can simply write a concurrent program as they would in any concurrent programming language, e.g., [Go](https://golang.org).
-
-## An Analogy
-
-The following table compares concepts in Fluid and Go
-
-
-
-
-## An Example Concurrent Program
-
-To review all the above concepts in an example, let us take a simple program and write its distributed version.
-
-Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors.
-
-```go
-import "fluid"
-
-func paddlepaddle() {
- X = fluid.read(...)
- W = fluid.Tensor(...)
- Y = fluid.mult(X, W)
-}
-```
-
-Please be aware that Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function defined in the above program and creates the following `ProgramDesc` message.
-
-```protobuf
-message ProgramDesc {
- block[0] = Block {
- vars = [X, W, Y],
- ops = [
- read(output = X)
- assign(input = ..., output = W)
- mult(input = {X, W}, output = Y)
- ],
- }
-}
-```
-
-Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in the above `ProgramDesc` message.
-
-The default `main` function is defined as follows:
-
-```go
-func main() {
- paddlepaddle()
- fluid.run()
-}
-```
-
-## The Concurrent Version
-
-By parallelizing the above program, we could support a very big tensor X by splitting it into small pieces {x_1, x_2, ...} and sending each piece to a worker process/node for parallel multiplication.
-
-In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for worker processes/nodes.
-
-### The Master Program
-
-The master program could look like the following:
-
-```protobuf
-message ProgramDesc {
- block[0] = Block {
- vars = [X, L, Y],
- ops = [
- read(output = X)
- kube_get_workers_addrs(output = L)
- Y = tensor_array(len(L))
- parallel_for(input = X, output = Y,
- attrs = {L, block_id(1)}) # referring to block 1
- ]
- }
-
- block[1] = Block {
- parent = 0,
- vars = [x, y, index],
- ops = [
- slice(input = [X, index], output = x) # index is initialized by parallel_for
- send(input = x, attrs = L[index])
- recv(outputs = y, attrs = L[index])
- assign(input = y, output = Y[index])
- ]
- }
-}
-```
-
-The equivalent Fluid program (calling the Go binding) is:
-
-```go
-func main() { //// block 0
- X = fluid.read(...)
- L = fluid.k8s.get_worker_addrs()
- Y = fluid.tensor_array(len(L))
- fluid.parallel_for(X, L,
- func(index int) { //// block 1
- x = X[index]
- fluid.send(L[index], x)
- y = fluid.recv(L[index])
- Y[index] = y
- })
-}
-```
-
-An explanation of the above program:
-
-- `fluid.k8s` is a package that provides access to Kubernetes API.
-- `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).
-- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
-
- 1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
- 2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread
- 1. creates an Executor instance, and
- 2. calls `Executor.Run(block)`, where `block` is block 1 as explained above.
-1. Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0.
-
-### The Worker Program
-
-The worker program looks like
-
-```go
-func main() {
- W = Tensor(...)
- x = fluid.listen_and_do(
- fluid.k8s.self_addr(),
- func(input Tensor) {
- output = fluid.mult(input, W)
- })
-}
-```
-
-where
-
-- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed,
- 1. listens on the current pod's IP address, as returned by `fluid.k8s.self_addr()`,
- 2. once a connection is established,
- 1. creates a scope of two parameters, "input" and "output",
- 2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h) and saves it into "input",
- 3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`.
-
-## Summary
-
-From the above example, we see that:
-
-1. Fluid enables the imperative programming paradigm by:
- 1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and
- 2. calling the `fluid.run` function that runs the program implicitly.
-1. The program is described as a `ProgramDesc` protobuf message.
-2. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter.
-3. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message.
-4. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and executes intrinsics/operators' `Run` methods sequentially as they appear in the `Block.ops` array.
-5. Intrinsics/operators' `Run` method might create threads. For example, the `ListenAndDo` operator creates a thread to handle each incoming request.
-6. Threads are not necessarily OS threads; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by ThreadPool. Multiple green threads might run on the same OS thread. An example of green threads is Go's [goroutines](https://tour.golang.org/concurrency/1).
diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md
deleted file mode 100644
index 66d19f44baf861c7847e81ca83f61024ec877faf..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concurrent/csp.md
+++ /dev/null
@@ -1,251 +0,0 @@
-# Design Doc: CSP in PaddlePaddle Fluid
-
-## Motivation
-
-Concurrent programming is important for deep learning. A few example applications are:
-
-1. The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
-2. The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
-
-Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
-
-## Concurrent Programming Models
-
-There are many concurrent programming models, implemented in various forms:
-
-| concurrent programming model             | implementation                            |
-|------------------------------------------|-------------------------------------------|
-| mutex                                    | types and functions in standard libraries |
-| semaphore                                | types and functions in standard libraries |
-| communicating sequential processes (CSP) | Go programming language                   |
-| actor model                              | Erlang programming language               |
-| message passing                          | MPI                                       |
-| bulk synchronous parallel (BSP)          | Pregel distributed programming framework  |
-
-Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
-
-### CSP v.s. Actor Model
-
-A well-known implementation of the Actor Model is the Erlang programming language. In the Actor Model, *processes* can send messages to and receive messages from other processes given their process IDs. We can find the three ingredients, process with ID, send, and recv, in MPI too. Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code. Our concern with the Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating system's responsibility to manage processes, and libraries like MPI should handle send/recv.
-
-## CSP in Fluid
-
-Fluid has two fundamental control-flows: *if-else* and *while*. If we are to implement CSP, we need the following:
-
-1. a new data type: *channel* and operators *send* and *recv*,
-1. *goroutine* or thread, and
-1. a new control-flow: select.
-
-We also need Python wrappers for the above components.
-
-The type *channel* is conceptually a blocking queue. In Go, its implementation is a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
-
-The `select` operation has been in OS kernels long before the Go language. All Unix kernels implement the system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll.
-
-It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.
-
-### Type Channel
-
-Fluid supports many data types:
-
-1. Tensor,
-1. Row-sparse Tensor,
-1. LoD Tensor,
-1. Tensor array, etc.
-
-Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value. To add the new *channel* type, we need to add a new type enum.
-
-To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file. [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example of how we expose the C++ class LoDTensor.
-
-## Syntax Design
-
-### Create Channel
-
-In Go, we create a channel by specifying the element type and buffer size:
-
-```go
-ch := make(chan int) // a channel without buffer
-ch1 := make(chan int, 100) // a channel that can buffer 100 ints.
-```
-
-In Fluid, we should be able to do the same:
-
-```python
-ch = fluid.make_channel(dtype=INT)
-ch1 = fluid.make_channel(dtype=INT, capacity=100)
-```
-
-In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
-
-```python
-ch = fluid.make_channel(dtype=Tensor, etype=float16)
-```
-
-or Tensors of Tensors of float16 etc.
-
-The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor<Tensor<float16> >`.
-
-### Send and Recv
-
-Go's CSP implementation depends on data type *channel*. There are two types of channels:
-
-1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. Sending to a buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty.
-1. The blocked channel, or unbuffered channel, is a blocking queue with no buffer. With unbuffered channels, both sending and receiving block until the other side is ready.
-
-There are four types of actions with a channel:
-
-1. Create a channel
-
- ```go
- ch := make(chan int) // this is an unbuffered channel
- ch1 := make(chan int, 100) // this is a buffered channel of 100 ints.
- ```
-
-1. Send
-
- ```go
- ch <- 111
- ```
-
-1. Recv
-
- ```go
- y, ok := <-ch
- ```
-
-1. Close
-
- ```go
- close(ch)
- ```
-
- Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
-
-There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
-
-1. A send to a nil channel blocks forever
-
-1. A receive from a nil channel blocks forever
-
-1. A send to a closed channel panics
-
-1. A receive from a closed channel returns the residual values and then zeros.
-
-In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h).
-
-The following program illustrates the Python syntax for accessing a Fluid buffered channel.
-
-```python
-import fluid
-
-buffer_size = 10
-ch = fluid.make_channel(dtype=INT, capacity=buffer_size)
-
-# Write buffer_size elements to the channel
-with fluid.while(steps=buffer_size):
- fluid.send(ch, step)
-
-fluid.close_channel(ch)
-
-with fluid.while(steps=buffer_size):
- fluid.print(fluid.recv(ch))
-```
-
-The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
-
-```python
-import fluid
-
-ch = fluid.make_channel(dtype=INT)
-
-with fluid.go():
- fluid.send(ch)
-
-y = fluid.recv(ch)
-
-fluid.close_channel(ch)
-```
-
-### Select
-
-In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
-
-```go
-ch1 := make(chan int)
-ch2 := make(chan int, 100)
-
-x := 0
-
-for {
-    select {
-    case ch1 <- x:
-        x = x + 1
-    case y := <-ch2:
-        fmt.Println("Received", y, "on channel")
-    default:
-        fmt.Println("Default")
-    }
-}
-```
-
-In Fluid, we should be able to do the same:
-
-```python
-ch1 = fluid.make_channel(dtype=INT)
-ch2 = fluid.make_channel(dtype=INT, capacity=100)
-
-sel = fluid.select()
-
-with sel.case(ch1, 'w', X):
- fluid.layers.increment(X)
-
-with sel.case(ch2, 'r', Y):
- fluid.print("Received on Channel")
-
-with sel.default():
- fluid.print("Default")
-
-```
-
-In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
-
-- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax feel familiar to the write mode in Python file I/O.
-
-- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax feel familiar to the read mode in Python file I/O.
-
-- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
-
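-Putting the pieces together, the sketch below combines the constructs introduced
-above: a goroutine keeps `ch2` supplied with data while the main thread selects
-between writing to `ch1`, reading from `ch2`, and the default case. It is
-illustrative only and assumes that `X` and `Y` are integer variables created
-elsewhere.
-
-```python
-import fluid
-
-ch1 = fluid.make_channel(dtype=INT)
-ch2 = fluid.make_channel(dtype=INT, capacity=100)
-
-# A producer running on a separate (go) thread feeds ch2.
-with fluid.go():
-    fluid.send(ch2, X)
-
-sel = fluid.select()
-
-with sel.case(ch1, 'w', X):
-    # ch1 was ready for writing; X has been sent, so update it.
-    fluid.layers.increment(X)
-
-with sel.case(ch2, 'r', Y):
-    # ch2 was ready for reading; the received value is now in Y.
-    fluid.print(Y)
-
-with sel.default():
-    # Neither channel was ready.
-    fluid.print("Default")
-
-fluid.close_channel(ch1)
-fluid.close_channel(ch2)
-```
-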
-## Example Programs
-
-### 1. RPC between Trainers and Parameter Servers
-
-### 2. Concurrent Minibatch Loading
diff --git a/doc/fluid/design/concurrent/go_op.md b/doc/fluid/design/concurrent/go_op.md
deleted file mode 100644
index c18b788e80f432ebb2f14b15229e7823c112001e..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concurrent/go_op.md
+++ /dev/null
@@ -1,231 +0,0 @@
-# go_op Design
-
-## Introduction
-
-The **go_op** allows users of PaddlePaddle to run program blocks on a detached
-thread. It works in conjunction with CSP operators (channel_send,
-channel_receive, channel_open, channel_close, and select) to allow users to
-concurrently process data and communicate easily between different threads.
-
-## How to use it
-
-```
-channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-
-with fluid.Go():
- # Send a tensor of value 99 to "channel" on a detached thread
- tensor = fill_constant(shape=[1], dtype='int', value=99)
- tensor.stop_gradient = True
- fluid.channel_send(channel, tensor)
-
-# Receive sent tensor from "channel" on the main thread
-result = fill_constant(shape=[1], dtype='int', value=-1)
-fluid.channel_recv(channel, result)
-```
-
-The go operator can be accessed by using the fluid.Go() control flow. This
-will create a new sub block, where the user can add additional operators
-to be run on the detached thread.
-
-**Note:** Since back propagation is currently not supported in the go_op, users
-should ensure that operators in the go block do not require gradient
-calculations.
-
-## How it Works
-
-Similar to other control blocks, go_op will create a sub block and add it
-as a child to the current block. Operators and variables defined in this
-block will be added to the go sub_block.
-
-In addition, the go operator will create a new child scope whose parent is
-the global scope. Please refer to [block captures](#block-captures) for more
-information.
-
-When Paddle executor runs go_op, go_op will take the sub_block and pass it to
-the executor.run method (along with a newly created local scope) on a detached
-thread.
-
-An example of the generated program description is shown below. Take note of
-the **go_op** in particular. It is added as an operator in the current
-block (in this example, block0). The **go_op** contains a `sub_block`
-attribute, which points to the id of the block that will be executed in a
-detached thread.
-
-```
-blocks {
- idx: 0
- parent_idx: -1
- vars {
- name: "return_value"
- type {
- type: LOD_TENSOR
- lod_tensor {
- tensor {
- data_type: INT64
- }
- }
- }
- }
- vars {
- name: "status_recv"
- type {
- type: LOD_TENSOR
- lod_tensor {
- tensor {
- data_type: BOOL
- }
- }
- }
- }
- ...
- ops {
- outputs {
- parameter: "Out"
- arguments: "channel"
- }
- type: "channel_create"
- attrs {
- name: "data_type"
- type: INT
- i: 7
- }
- attrs {
- name: "capacity"
- type: INT
- i: 0
- }
- }
- ops {
- inputs {
- parameter: "X"
- arguments: "channel"
- }
- type: "go"
- attrs {
- name: "sub_block"
- type: BLOCK
- block_idx: 1
- }
- }
- ops {
- inputs {
- parameter: "Channel"
- arguments: "channel"
- }
- outputs {
- parameter: "Out"
- arguments: "return_value"
- }
- outputs {
- parameter: "Status"
- arguments: "status_recv"
- }
- type: "channel_recv"
- }
- ...
-}
-
-blocks {
- idx: 1
- parent_idx: 0
- vars {
- name: "status"
- type {
- type: LOD_TENSOR
- lod_tensor {
- tensor {
- data_type: BOOL
- }
- }
- }
- }
- ...
-
- ops {
- outputs {
- parameter: "Out"
- arguments: "fill_constant_1.tmp_0"
- }
- type: "fill_constant"
- attrs {
- name: "force_cpu"
- type: BOOLEAN
- b: false
- }
- attrs {
- name: "value"
- type: FLOAT
- f: 99.0
- }
- attrs {
- name: "shape"
- type: INTS
- ints: 1
- }
- attrs {
- name: "dtype"
- type: INT
- i: 3
- }
- }
- ops {
- inputs {
- parameter: "Channel"
- arguments: "channel"
- }
- inputs {
- parameter: "X"
- arguments: "fill_constant_1.tmp_0"
- }
- outputs {
- parameter: "Status"
- arguments: "status"
- }
- type: "channel_send"
- attrs {
- name: "copy"
- type: BOOLEAN
- b: false
- }
- }
-```
-
-## Current Limitations
-
-#### Scopes and block captures:
-
-Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a
-block. When a block is executed, a new local scope is created from the parent
-scope (i.e., the scope derived from the parent block) and associated with the new
-child block. After the block finishes executing, the local scope and
-all associated variables in the scope are deleted.
-
-This works well in a single-threaded scenario; however, with the introduction of
-go_op, a child block may continue to execute even after the parent block has
-exited. If the go_op tries to access variables located in the parent block's
-scope, it may cause a segmentation fault because the parent scope may have
-been deleted.
-
-We need to implement block closures in order to prevent access to parent
-scope variables from causing a segmentation fault. As a temporary workaround,
-please ensure that all variables accessed in the go block are not destructed
-before they are accessed. Currently, the go_op will explicitly enforce
-this requirement and raise an exception if a variable cannot be found in
-the scope.
-
-Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502)
-for more details.
-
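-As a rough analogy (plain Python rather than Fluid), the problem resembles a
-detached thread reading from a dictionary that its parent clears on exit; the
-names below are purely illustrative.
-
-```
-import threading
-import time
-
-def parent_block():
-    scope = {"x": 42}                       # stands in for the parent block's scope
-
-    def go_block():
-        time.sleep(0.1)                     # the detached block is still running...
-        print(scope.get("x", "<deleted>"))  # ...after the parent has already exited
-
-    threading.Thread(target=go_block).start()
-    scope.clear()                           # the parent scope is destroyed on exit
-
-parent_block()                              # prints "<deleted>"
-```
-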
-#### Green Threads
-
-Golang utilizes `green threads`, which is a mechanism for the runtime library to
-manage multiple threads (instead of having them managed natively by the OS). Green
-threads usually allow for faster thread creation and switching, as there is less
-overhead when spawning these threads. For the first version of CSP, we only support
-OS threads.
-
-
-#### Backward Propagation:
-
-go_op currently does not support backward propagation. Please use go_op only with
-non-training operators.
diff --git a/doc/fluid/design/concurrent/images/channel_recv.png b/doc/fluid/design/concurrent/images/channel_recv.png
deleted file mode 100644
index c06cd15ae7b8a8c94d5742f6675e389081fcf789..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/concurrent/images/channel_recv.png and /dev/null differ
diff --git a/doc/fluid/design/concurrent/images/channel_send.png b/doc/fluid/design/concurrent/images/channel_send.png
deleted file mode 100644
index 006ebb4a5a4bcd32c97847e9fb7729a740255f7c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/concurrent/images/channel_send.png and /dev/null differ
diff --git a/doc/fluid/design/concurrent/images/select_op_workflow.png b/doc/fluid/design/concurrent/images/select_op_workflow.png
deleted file mode 100644
index 719ed76f9d542d6c4f20c30f27656bb53325aa85..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/concurrent/images/select_op_workflow.png and /dev/null differ
diff --git a/doc/fluid/design/concurrent/index_cn.rst b/doc/fluid/design/concurrent/index_cn.rst
deleted file mode 100644
index e47135e9fc42760898083710e0a6767252a0225b..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concurrent/index_cn.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-并发编程
-------------
-
-.. toctree::
- :maxdepth: 1
-
- concurrent_programming.md
- parallel_do.md
diff --git a/doc/fluid/design/concurrent/index_en.rst b/doc/fluid/design/concurrent/index_en.rst
deleted file mode 100644
index 0727e75798b2a869588f80d3cce7a886554e4ffb..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concurrent/index_en.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-Concurrent Programming
--------------------------
-
-.. toctree::
- :maxdepth: 1
-
- concurrent_programming.md
- parallel_do.md
diff --git a/doc/fluid/design/concurrent/parallel_do.md b/doc/fluid/design/concurrent/parallel_do.md
deleted file mode 100644
index 42bd136f825986d94fafaeaa5f58edb02848a74c..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concurrent/parallel_do.md
+++ /dev/null
@@ -1,163 +0,0 @@
-# Design Doc: Parallel_Do in PaddlePaddle
-
-In PaddlePaddle, we use the parallel_do primitive to represent multithreaded data-parallel processing.
-
-## Design overview
-
-The definition of a parallel_do op looks like the following
-
-```c++
-AddInput(kInputs, "Inputs needed to be split onto different devices").AsDuplicable();
-AddInput(kParameters, "Parameters are duplicated over different devices")
- .AsDuplicable();
-AddInput(kPlaces, "Devices used for parallel processing");
-AddOutput(kOutputs, "Outputs needed to be merged from different devices").AsDuplicable();
-AddOutput(kParallelScopes,
- "Scopes for all local variables in forward pass. One scope for each device");
-AddAttr(kParallelBlock,
- "List of operaters to be executed in parallel");
-```
-
-A vanilla implementation of parallel_do can be shown as the following (`|` means single thread and
-`||||` means multiple threads)
-
-```
-In the forward pass
- | Split input onto different devices
- | Copy parameter onto different devices
- |||| Compute forward pass in parallel
- | Merge output from different devices
-
-In the backward pass
- | Split output@grad onto different devices
- |||| Compute backward pass in parallel
- | accumulate param@grad from different devices to the first device
- | Merge input@grad from different devices
- | Copy param@grad to the place of parallel_do_op
-```
-
-This implementation allows us to write a mixed-device program like this
-
-```python
-W1 = fluid.tensor(size=[100,20], parameter=true)
-W2 = fluid.tensor(size=[20,15], parameter=true)
-
-data = layers.data()
-
-gpu_places = layers.get_place(use_gpu=True)
-# parallel processing on multiple GPUs
-pd = ParallelDo(gpu_places)
-with pd.do(input=data):
- prediction = softmax(fc(fc(data, W1), W2))
- write_output(prediction)
-prediction = pd()
-loss = cross_entropy(prediction, label)
-```
-
-And the ProgramDesc looks like the following
-
-```
-# start_program will be run by executor(CPUPlace), all w1, w2 will be allocated on CPU
-start_program
-{
- vars: w1, w2
- ops: init(w1), init(w2)
-}
-
-main_program
-{
-block0 {
- vars: data, places, w1, w2, w1_grad, w2_grad,
- ops: data, get_place, parallel_do(block1),
- parallel_do_grad(block2),
- sgd(w2, w2_grad),
- sgd(w1, w1_grad)
-}
-block1 { # the forward pass
- parent_block: 0
- vars: data, h1, h2, loss
- ops: fc, fc, softmax
-}
-block2 { # the backward pass
- parent_block: 1
- vars: data_grad, h1_grad, h2_grad, loss_gard, local_w1_grad, local_w2_grad
- ops: softmax_grad,
- fc_grad
- fc_grad
-}
-}
-```
-
-## Performance Improvement
-
-There are several places where we can make this parallel_do faster.
-
-### forward: split input onto different devices
-
-If the input of the parallel_do is independent of any prior operators, we can avoid this step by
-prefetching the input onto different devices in a separate background thread. And the Python code
-looks like this.
-```python
-pd = ParallelDo(gpu_places)
-with pd.do():
- feature = get_data_from_prefetch_queue(gpu_places)
- prediction = my_net(feature)
- write_output(activation)
-```
-
-### forward: Copy parameters onto different devices
-
-We can avoid this step by making each device have a copy of the parameter. This requires:
-
-1. `fluid.default_start_up_program()` to be run on all devices
-1. In the backward, allreduce param@grad at different devices, this requires
- 1. `backward.py` add `allreduce` operators at parallel_do_grad
- 1. `allreduce` operators need to be called in async mode to achieve maximum throughput
-1. apply gradient-related ops (i.e. clipping, normalization, decay, sgd) on different devices in parallel
-
-By doing so, we also avoided "backward: accumulate param@grad from different devices to the first device".
-And the ProgramDesc looks like the following
-
-```
-# w1, w2 will be allocated on all GPUs
-start_program
-{
-block0 {
- parallel_do(block1)
-}
-block1 {
- parent_block: 0
- vars: w1, w2
- ops: init(w1), init(w2)
-}
-}
-
-main_program
-{
-block0 {
- vars: data, places, w1, w2
- ops: data, get_place, parallel_do(block1),
- parallel_do_grad(block2), # append_backward
- parallel_do(block3) # append_optimization
-
-}
-block1 {
- parent_block: 0
- vars: data, h1, h2, loss
- ops: fc, fc, softmax
-}
-block2 {
- parent_block: 1
- vars: data_grad, h1_grad, h2_grad, loss_gard, w1_grad, w2_grad
- ops: softmax_grad,
- fc_grad, allreduce(places, scopes, w1_grad),
- fc_grad, allreduce(places, scopes, w2_grad)
-}
-block3 {
- parent_block: 0
- vars: lr
- ops: sgd(w2, w2_grad),
- sgd(w1, w1_grad)
-}
-}
-```
diff --git a/doc/fluid/design/concurrent/select_op.md b/doc/fluid/design/concurrent/select_op.md
deleted file mode 100644
index 4fcae57cc7932cdaebe549486e7f7cebf0bd038a..0000000000000000000000000000000000000000
--- a/doc/fluid/design/concurrent/select_op.md
+++ /dev/null
@@ -1,265 +0,0 @@
-# select_op Design
-
-## Introduction
-
-In golang, the [**select**](https://golang.org/ref/spec#Select_statements)
-statement lets a goroutine wait on multiple communication operations at the
-same time. The **select** blocks until one of its cases can run, then
-executes the case. If multiple cases are ready to run, then one case is
-chosen at random to be executed.
-
-With the introduction of CSP for Paddle, we mimic this behavior by
-creating a ***select_op***.
-
-## How to use it
-
-The **select_op** is available as a C++ operator. However, most users
-will prefer to use the much simpler Python API.
-
-- **fluid.Select()**: Creates a select operator and adds it to the current
-block within the main program. Also creates a sub block and adds it to the
-main program. This sub block is used to hold all variables and operators
-used by the case statements.
-
-Within the select block, users can add cases by
-calling **select.case** or **select.default** method.
-
-- **fluid.Select.case(channel_action, channel, result_variable)**: Represents
-a fluid channel send/recv case. This method creates a SelectCase block
-guard and adds it to the Select block. The arguments to this method tell
-the select which channel operation to listen to.
-
-- **fluid.Select.default()**: Represents the fluid default case. This default
-case is executed if none of the channel send/recv cases are available to
-execute.
-
-**Example:**
-```
-ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-
-x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
-y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
-
-while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
-while_op = While(cond=while_cond)
-
-with while_op.block():
- with fluid.Select() as select:
- with select.case(fluid.channel_send, channel, x):
- # Send x, then perform Fibonacci calculation on x and y
- x_tmp = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
- assign(input=x, output=x_tmp)
- assign(input=y, output=x)
- assign(elementwise_add(x=x_tmp, y=y), output=y)
- with select.case(fluid.channel_recv, quit_channel, result2):
- # Exit out of While loop
- while_false = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False)
- helper = layer_helper.LayerHelper('assign')
- helper.append_op(
- type='assign',
- inputs={'X': [while_false]},
- outputs={'Out': [while_cond]})
-```
-
-## How it Works
-
-### Program Description
-
-```
-blocks {
- idx: 0
- ...
- // Create "case_to_execute" variable
- ops {
- outputs {
- parameter: "Out"
- arguments: "fill_constant_110.tmp_0"
- }
- type: "fill_constant"
- attrs {
- name: "force_cpu"
- type: BOOLEAN
- b: false
- }
- attrs {
- name: "value"
- type: FLOAT
- f: -1.0
- }
- attrs {
- name: "shape"
- type: INTS
- ints: 1
- }
- attrs {
- name: "dtype"
- type: INT
- i: 2
- }
- }
- // Create "select" operator.
- // inputs:
- // X: All input variables used by operators within the select block
- // case_to_execute: Variable filled in by select_op when it determines
- // which case to execute.
- //
- // outputs:
- // Out: All output variables referenced by operators within select block.
- //
- // attrs:
- // sub_block: The block id containing the select "cases"
- // cases: Serialized list of all cases in the select op.
- // Each case is serialized as: '<index>,<type>,<channel>,<value>'
- // where type is 0 for default, 1 for send, and 2 for receive.
- // No channel and values are needed for default cases.
- ops {
- inputs {
- parameter: "X"
- arguments: "fill_constant_103.tmp_0"
- arguments: "fill_constant_104.tmp_0"
- }
- inputs {
- parameter: "case_to_execute"
- arguments: "fill_constant_110.tmp_0"
- }
- outputs {
- parameter: "Out"
- arguments: "fill_constant_110.tmp_0"
- }
- type: "select"
- attrs {
- name: "sub_block"
- type: BLOCK
- block_idx: 1
- }
- attrs {
- name: "cases"
- type: STRINGS
- strings: "0,1,channel_101,fill_constant_109.tmp_0"
- strings: "1,2,channel_102,fill_constant_108.tmp_0"
- }
- }
- ...
-}
-```
-
-The python select API will add the **select_op** to the current block. In addition, it will
-iterate through all its case statements and add any input variables required by case statements
-into **X**. It will also create a temp variable called **case_to_execute**. This variable is
-filled in by the select_op after it has completed processing the case statements.
-
-If there are no available cases to execute (ie: all cases are blocked on channel operations, and
-there is no default statement), then the select_op will block the current thread. The thread will
-unblock once there is a channel operation affecting one of the case statements, at which point, the
-**select_op** will set the **case_to_execute** variable to the index of the case to execute.
-
-Finally the select_op will call executor.run on the **sub_block**.
-
-```
-blocks {
- idx: 1
- parent_idx: 0
- ...
- // Fill a tensor with the case index (ie: 0,1,2,3,ect.)
- ops {
- outputs {
- parameter: "Out"
- arguments: "fill_constant_111.tmp_0"
- }
- type: "fill_constant"
- attrs {
- name: "force_cpu"
- type: BOOLEAN
- b: false
- }
- attrs {
- name: "value"
- type: FLOAT
- f: 0.0
- }
- attrs {
- name: "shape"
- type: INTS
- ints: 1
- }
- attrs {
- name: "dtype"
- type: INT
- i: 2
- }
- }
- // Create an "equal" operator to compare the case index with the "case_to_execute"
- // tensor (which was filled in by the select op).
- ops {
- inputs {
- parameter: "X"
- arguments: "fill_constant_111.tmp_0" // case 0
- }
- inputs {
- parameter: "Y"
- arguments: "fill_constant_110.tmp_0" // case_to_execute
- }
- outputs {
- parameter: "Out"
- arguments: "equal_0.tmp_0"
- }
- type: "equal"
- attrs {
- name: "axis"
- type: INT
- i: -1
- }
- }
- // Use the output of the "equal" operator as a condition for the "conditional_block".
- // If the condition evaluates to true, then execute the "sub_block" (which represents
- // the select case's body)
- ops {
- inputs {
- parameter: "Params"
- }
- inputs {
- parameter: "X"
- arguments: "equal_0.tmp_0"
- }
- outputs {
- parameter: "Out"
- }
- outputs {
- parameter: "Scope"
- arguments: "_generated_var_0"
- }
- type: "conditional_block"
- attrs {
- name: "is_scalar_condition"
- type: BOOLEAN
- b: true
- }
- attrs {
- name: "sub_block"
- type: BLOCK
- block_idx: 4
- }
- }
- ...
- // Repeat the above operators for each case statements inside the select body
-}
-
-```
-
-Cases are represented by a **conditional_block operator**, whose condition is set as the output of
-equal(**case_to_execute**, **case_index**). Since each case index is unique in this sub-block,
-only one case will be executed.
-
-### select_op flow
-
-
-
-
-
-The select algorithm is inspired by golang's select routine. Please refer to
-http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information.
-
-## Backward Pass
-
-TODO
diff --git a/doc/fluid/design/data_type/float16.md b/doc/fluid/design/data_type/float16.md
deleted file mode 100644
index 844d2aafcf257b85057e1ac200ed3d5cf0be2ff0..0000000000000000000000000000000000000000
--- a/doc/fluid/design/data_type/float16.md
+++ /dev/null
@@ -1,183 +0,0 @@
-# Design Doc: float16
-
-## Why float16
-Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of traditional 32-bit single precision format (float) and has lower precision and smaller range.
-
-When high precision computation is not required (which is usually the case at least in the deep learning inference stage), using float16 data type could potentially
-
-- reduce storage space, memory bandwidth, and power usages;
-- increase the chance of data fitting into a smaller cache of lower latency;
-- provide arithmetic speed up if supported by hardware.
-
-## Survey of current float16 support
-A brief survey of float16 support on different compilers, hardware, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info.
-
-The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernels. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier.
-
-### Compiler
-- nvcc supports `__half` data type after CUDA 7.5.
-- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4.
-- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9.
-
-### Hardware
-- `__half` is supported on GPU with compute capability >= 5.3.
-- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above.
-- `__fp16` is supported as arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which was announced in May 2017. There seem to be no application processors currently available on the market that adopt this architecture. It is reported that Qualcomm Snapdragon 845 uses the Cortex-A75 design and will be available in mobile devices in early 2018).
-
-### Libraries
-- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
-- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
-
-### CUDA version issue
-There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0.
-CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
-```
-typedef struct __align__(2) {
- unsigned short x;
-} __half;
-
-typedef __half half;
-```
-This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
-```
-__global__ void Add() {
- half a, b, c;
- c = __hadd(a, b); // correct
- c = a + b; // compiler error: no operator "+" matches these operands
-}
-```
-CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
-
-Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
-```
-typedef struct __CUDA_ALIGN__(2) {
- unsigned short x;
-} __half_raw;
-
-
-struct __CUDA_ALIGN__(2) __half {
-protected:
- unsigned short __x;
-public:
- // constructors and conversion operators from/to
- // __half_raw and other built-in data types
-}
-
-typedef __half half;
-
-__device__ __forceinline__
-__half operator+(const __half &lh, const __half &rh) {
- return __hadd(lh, rh);
-}
-
-// Other overloaded operators
-```
-This new design makes `c = a + b` work correctly for CUDA half data type.
-
-## Implementation
-The float16 class holds a 16-bit `uint16_t` data internally.
-```
-struct float16 {
- uint16_t x;
-};
-```
-
-float16 supports the following features:
- - constructors / assignment operators that take input from primitive data types including bool, integers of various length, float, and double.
- - constructors / assignment operators that take input from `__half` on cuda, `float16_t` on ARM, and `Eigen::half` on Eigen.
- - conversion operators to primitive data types and half precision data types on cuda, ARM and Eigen.
- - overloaded arithmetic operators for cuda, arm, and non-arm cpu, respectively. These operators will take advantage of the cuda and ARM intrinsics on the corresponding hardware.
-
-To support the above features, two fundamental conversion functions are provided:
-```
-float16 float_to_half_rn(float f); // convert to half precision in round-to-nearest-even mode
-float half_to_float(float16 h);
-```
-which provide one-to-one conversion between float32 and float16. These two functions will use different conversion routines based on the current hardware. CUDA/ARM intrinsics will be used when the corresponding hardware is available. If the hardware or compiler level does not support float32 to float16 conversion, software emulation will be performed to do the conversion.
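-
-For illustration only, the following numpy snippet mimics the same round trip in
-Python (numpy also converts float32 to float16 in round-to-nearest-even mode); it
-is an analogy to the helpers above, not their implementation.
-
-```Python
-import numpy
-
-f = numpy.float32(0.1)
-h = numpy.float16(f)          # analogous to float_to_half_rn
-g = numpy.float32(h)          # analogous to half_to_float
-
-print(h, g)                   # 0.1 is not exactly representable, so g only approximates f
-print(h.view(numpy.uint16))   # the underlying 16-bit pattern, like float16::x
-```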
-
-## float16 inference
-In Fluid, a neural network is represented as a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), whose Python wrapper is a [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program). The basic structure of a program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program desc by executing the sequence of operators in the entrance block of the program one by one.
-
-### Operator level requirement
-Each operator has many kernels for different data types, devices, and library types. The operator will select the appropriate kernel to run based on, among other things, the data type of the input variables. By default, every Fluid operator has a float data type kernel that takes float variables as input and generates float output.
-
-This means that if we provide float input to the first operator in a program, then each operator will use the float kernel to compute a float output and send it as input to the next operator to trigger the float kernel. Overall, the program will run in float mode and give us a final output of float data type.
-
-The same principle applies if we want a program to run in float16 mode. We provide input variable of float16 data type to the first operator, and then one by one, each operator in the program will run the float16 kernel (provided that each operator in this program has float16 kernels registered) until we finally obtain a float16 output variable.
-
-So the preliminary requirement for float16 inference is to add float16 kernel to operators that are needed in a specific kind of program. For example, float16 inference on an image classification neural network like Vgg or Resnet, typically requires the following operators to have float16 kernels: convolution, pooling, multiplication, addition, batch norm, dropout, relu, and softmax. Please refer to [new_op_en](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) for details of how to add new kernels to an operator.
-
-### Variable level requirement
-Operators including convolution and multiplication (used in fully-connected layers) take as input not only the variables generated by the preceding operators but also [parameter](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#parameter) variables, which contain the trained weights to apply to the input data. These weights are obtained in the Fluid training process and are by default of float data type.
-
-When these operators are running in float16 mode, the float16 kernel requires those parameter variables to contain weights of Fluid float16 data type. Thus, we need a convenient way to convert the original float weights to float16 weights.
-
-In Fluid, we use a tensor to hold the actual data of a variable on the C++ end. [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h) is used to bind C++ tensors of a certain data type with numpy arrays of the corresponding numpy data type on the Python end. Each common C++ built-in data type has a corresponding numpy data type of the same name. However, since there is no built-in float16 type in C++, we cannot directly bind the numpy float16 data type with the Fluid float16 class. Since both Fluid float16 and numpy float16 use uint16 as the internal data storage type, we use the C++ built-in type `uint16_t` and the corresponding numpy uint16 data type to bridge the gap via [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h).
-
-The following code demonstrates how to do the tensor conversion.
-```Python
-# var is the variable of float weights
-# tensor is a numpy array of data copied from the tensor data in var
-# fp16_var is the variable that will contain float16 weights converted from var
-tensor = numpy.array(var.get_tensor())
-fp16_tensor = fp16_var.get_tensor()
-
-# After the original tensor data is converted to numpy float16 data type,
-# view(numpy.uint16) is used so that the internal memory of the numpy array
-# will be reinterpreted to be of uint16 data type, which is bound to
-# Fluid float16 class via pybind with the help of uint16_t built-in c++ type
-fp16_tensor.set(tensor.astype(numpy.float16).view(numpy.uint16), GPUPlace)
-```
-
-### Consistent API requirement
-The basic inference in float16 mode requires users to feed input and obtain output both of float16 data type. However, in this way, the inference APIs are not consistent between float16 mode and float mode, and users may find it confusing and difficult to use float16 inference since they need to do extra steps to provide float16 input data and convert float16 output data back to float. To have consistent APIs for different inference modes, we need to transpile the program desc in some way so that we can run float16 inference by feeding and fetching variables of float data type.
-
-This problem can be solved by introducing a type-casting operator which takes an input variable of a certain data type, casts it to another specified data type, and puts the converted data into the output variable. Inserting cast operators where needed can make a program internally run in float16 mode.
-
-### float16 transpiler
-Keeping all the above requirements in mind, we designed a float16 inference transpiler that can transpile a float32-mode inference program desc into a float16-mode one.
-
-Given a float inference program and the corresponding variables of float32 weights in the [scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md),
-this transpiler mainly does the following modifications:
-
-1. Insert cast operators at the beginning of the program so that the input float data will be converted to float16 data type before feeding to subsequent operators to invoke the float16 kernel.
-
-2. Insert cast operators at the end of the program so that the output float16 data will be converted back to float data type before users obtain the result.
-
-3. For each parameter variable of float weights, create in the scope a corresponding variable of float16 weights which are converted from the corresponding float weights and add this new float16 variable to the program.
-
-4. Update the operator information in the program so that each relevant operator uses the newly created float16 variable instead of its float counterpart.
-
-Below is an example of usage:
-```Python
-# Get the float inference program
-[float_inference_program, feed_target_names,
- fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-# Prepare the float input data
-tensor_img = numpy.random.rand(1, 3, 32, 32).astype(numpy.float32)
-
-# Running inference_program in float mode
-float_results = exe.run(float_inference_program,
- feed={feed_target_names[0]: tensor_img},
- fetch_list=fetch_targets)
-
-# Use float16 transpiler to speedup
-float16_inference_program = float_inference_program.clone()
-t = fluid.InferenceTranspiler()
-t.float16_transpile(float16_inference_program, GPUPlace)
-
-# Running
-float16_results = exe.run(float16_inference_program,
- feed={feed_target_names[0]: tensor_img},
- fetch_list=fetch_targets)
-```
-
-As we can see from the example above, users can simply use the `float16_transpile` method provided by the inference transpiler class on an existing float inference program to run inference in float16 mode.
-
-### Speedup on GPU
-Currently, Fluid inference in float16 mode is only supported on Nvidia GPU device. There is no motivation to support float16 inference on non-ARM CPUs because float16 is not natively supported there and float16 calculation will only be slower than its float counterpart.
-
-Nvidia started to support its native float16 data type (which has the same internal memory representation as Fluid float16 class) on CUDA 7.5. Moreover, float16 speedups on common computational intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cublas 7.5 and cuDNN 5.0.
-
-Recently, the introduction of [tensor core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in volta architecture GPUs and the support of tensor core calculation in CUDA 9.0 and cuDNN 7.0 make float16 truly superior to float in certain deep learning applications. Please refer to this [benchmark report](https://github.com/kexinzhao/Paddle_benchmark/blob/master/float16_benchmark.md) for more details.
diff --git a/doc/fluid/design/data_type/index_cn.rst b/doc/fluid/design/data_type/index_cn.rst
deleted file mode 100644
index b60167b6b1599df69dfc5073ebf32bdbb0a316ec..0000000000000000000000000000000000000000
--- a/doc/fluid/design/data_type/index_cn.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-数据类型
-------------
-
-.. toctree::
- :maxdepth: 1
-
- float16.md
diff --git a/doc/fluid/design/data_type/index_en.rst b/doc/fluid/design/data_type/index_en.rst
deleted file mode 100644
index 6a88d17943f49134a2d00363845e919537ff4545..0000000000000000000000000000000000000000
--- a/doc/fluid/design/data_type/index_en.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Data Type
-------------
-
-.. toctree::
- :maxdepth: 1
-
- float16.md
diff --git a/doc/fluid/design/dist_train/README.md b/doc/fluid/design/dist_train/README.md
deleted file mode 100644
index 2dd652d8bdcb8f3b6e759347bd55b217be909386..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dist_train/README.md
+++ /dev/null
@@ -1,57 +0,0 @@
-## Distributed training overview doc
-
-Currently Paddle Fluid uses the parameter server architecture to support distributed training.
-
-For synchronous and asynchronous training, the differences are mostly in the logic of the parameter server. We already support synchronous training.
-
-### Synchronous training
-
-The training process of synchronous training is:
-
-![synchronous distributed training](./src/sync_distributed_training.png)
-
-1. Pserver
- 1. sets `barrier_condition_` to 0 and waits for trainers to send gradients.
-1. Trainer
- 1. The trainer reads a minibatch of data, runs forward-backward with its local parameter copy, and gets the gradients for the parameters.
- 1. The trainer uses the split op to split all the gradients into blocks. The split method is determined at compile time.
- 1. The trainer uses send_op to send all the split gradients to the corresponding parameter servers.
- 1. After the trainer sends all the gradients, it will send a `BATCH_BARRIER_MESSAGE` to all pservers.
- 1. The trainer calls GetVariable on the pserver and waits for `barrier_condition_` on the pserver to become 1.
-1. Pserver
- 1. The pserver will count the number of `BATCH_BARRIER_MESSAGE`s.
- 1. When the count of `BATCH_BARRIER_MESSAGE`s equals the number of trainers, the pserver knows it has received all gradients from all trainers.
- 1. The pserver will run the optimization block to optimize the parameters.
- 1. After optimization, the pserver sets `barrier_condition_` to 1.
- 1. The pserver waits for `FETCH_BARRIER_MESSAGE`.
-1. Trainer.
- 1. The trainer uses GetVariable to get all the parameters from pserver.
- 1. Trainer sends a `FETCH_BARRIER_MESSAGE` to each pserver.
-1. Pserver.
- 1. When the number of `FETCH_BARRIER_MESSAGE`s reaches the number of trainers, the pserver knows all the parameters have been fetched, and it goes back to step 1 to set `barrier_condition_` to 0.
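-
-A minimal Python sketch of the pserver-side barrier logic described above (names such as `PServerBarrier` are illustrative only; the real logic lives in the C++ RPC server):
-
-```python
-import threading
-
-class PServerBarrier(object):
-    def __init__(self, num_trainers):
-        self.num_trainers = num_trainers
-        self.batch_count = 0            # received BATCH_BARRIER_MESSAGEs
-        self.fetch_count = 0            # received FETCH_BARRIER_MESSAGEs
-        self.barrier_condition_ = 0
-        self.cond = threading.Condition()
-
-    def on_batch_barrier(self):
-        with self.cond:
-            self.batch_count += 1
-            if self.batch_count == self.num_trainers:
-                self.run_optimize_block()    # all gradients have arrived
-                self.barrier_condition_ = 1  # let trainers fetch parameters
-                self.cond.notify_all()
-
-    def wait_for_get_variable(self):
-        # A trainer's GetVariable call blocks here until optimization is done.
-        with self.cond:
-            while self.barrier_condition_ == 0:
-                self.cond.wait()
-
-    def on_fetch_barrier(self):
-        with self.cond:
-            self.fetch_count += 1
-            if self.fetch_count == self.num_trainers:
-                # Every trainer has fetched; reset for the next mini-batch.
-                self.batch_count = 0
-                self.fetch_count = 0
-                self.barrier_condition_ = 0
-
-    def run_optimize_block(self):
-        pass  # placeholder for running the optimization block
-```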
-
-### Asynchronous training
-In the above process, there are two barriers for all trainers to synchronize with each other. In asynchronous training, these two barriers are not needed. The trainer can just send gradients to the pserver and then get parameters back.
-
-The training process of asynchronous training can be:
-
-![asynchronous distributed training](./src/async_distributed_training.png)
-
-1. Pserver:
- 1. Each parameter has a queue to receive its gradient from trainers.
- 1. Each parameter has a thread to read data from the queue and run optimize block, using the gradient to optimize the parameter.
- 1. Using an independent thread to handle RPC call `GetVariable` for trainers to get parameters back.(Maybe here we should use a thread pool to speed up fetching the parameters.)
-
-1. Trainer:
- 1. The trainer reads a batch of data, runs forward and backward with its local parameter copy, and gets the gradients for the parameters.
- 1. The trainer splits all gradients into blocks and then sends these gradient blocks to the pservers (the pserver will put them into the queue).
- 1. The trainer gets all parameters back from the pservers.
-
-### Note:
-There are also some conditions that need to be considered. For example:
-
-1. Whether the trainer needs to wait for the pserver to apply its gradients before getting the parameters back.
-1. Whether we need a lock between parameter update and parameter fetch.
-1. Whether one parameter must reside on one server, or whether it can be split and sent to multiple parameter servers.
-
-The above asynchronous training architecture can support different modes; we can test these issues in detail in the future.
diff --git a/doc/fluid/design/dist_train/async_update.md b/doc/fluid/design/dist_train/async_update.md
deleted file mode 100644
index 248d2ec18dafdecac9184527638754b6ba4d85b8..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dist_train/async_update.md
+++ /dev/null
@@ -1,61 +0,0 @@
-# Design Doc: Asynchronous Update With Distributed Training
-
-## Background
-
-For the typical synchronous distributed training, some significant steps are as follows:
-
-1. A trainer process will compute the gradients and **send** them to the parameter server (PS) nodes.
-1. After the PS node has received gradients from all the trainers, it will aggregate the
-gradient variables for the same parameter into one gradient variable and then apply the aggregated
-gradient to the respective parameter, finally using an optimization algorithm (SGD, Momentum, ...)
-to update the parameters.
-1. The trainer waits for the PS to finish the optimize stage, and then GETs the parameters from the PS,
-so all the trainers get the same parameters.
-
-In synchronous distributed training, there is a **barrier** on each PS to wait until all trainer processes
-have completed running the current mini-batch. After that, all trainers can continue to run the next
-mini-batch. So the overall performance of synchronous distributed training depends
-on the slowest node.
-
-In asynchronous distributed training, we don't need to wait for a global mini-batch; the optimizer on
-the PS runs immediately when a gradient is uploaded to the PS from one trainer. This mode can
-train models with better scaling and throughput. In this design doc, we will introduce how to
-implement asynchronous distributed training based on PaddlePaddle Fluid.
-
-## Design
-
-
-
-As shown in the figure above, we describe a global view of the asynchronous update process and use
-the parameter `w1` as an example to introduce the steps:
-1. Each gradient variable may be distributed on different GPU cards; aggregate
-them once they are all calculated.
-1. Split the gradient variable into multiple blocks according to the number of PS
-instances and then send them.
-1. The PS runs an `Optimize Block`, using a specified optimization algorithm, to update
-the specified parameter.
-1. The trainer fetches the latest parameter from the PS before running the forward op which depends
-on the specified parameter.
-1. Broadcast the received variable onto multiple GPU cards and continue to run the next
-mini-batch.
-
-### Trainer
-
-- For multi-device distributed training, we first need to aggregate the gradient
-variables placed on the different devices and then schedule a `SendVars` Operator to
-send the gradient variables to the multiple PS instances.
-- Schedule a `FetchVars` operator to fetch the latest parameters from the PS before running
-the forward ops.
-- There could be a large number of gradient variables to be sent, so we need a separate
-thread pool (IO Threadpool) whose number of schedulable threads is larger than that of the
-computing thread pool, to avoid competing with computation for thread resources (see the sketch below).
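-
-Below is a minimal Python sketch of the IO-threadpool idea, not the actual Fluid implementation; `send_gradient`, the endpoints, and the pool sizes are hypothetical stand-ins for what the scheduled `SendVars` operators would do.
-
-```python
-from concurrent.futures import ThreadPoolExecutor
-
-# Hypothetical stand-in for the work a scheduled `SendVars` operator performs.
-def send_gradient(grad_block, pserver_endpoint):
-    pass  # network IO that may block for a long time
-
-# The IO pool has more schedulable threads than the computing pool, so slow
-# network sends do not compete with computation for thread resources.
-io_pool = ThreadPoolExecutor(max_workers=16)       # "IO Threadpool"
-compute_pool = ThreadPoolExecutor(max_workers=4)   # computing thread pool
-
-def schedule_sends(grad_blocks, pserver_endpoints):
-    # Fire-and-forget: the trainer keeps computing while the IO threads send.
-    return [io_pool.submit(send_gradient, g, ep)
-            for g, ep in zip(grad_blocks, pserver_endpoints)]
-```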
-
-### Parameter Server
-
-
-
-- Multiple trainer instances may want to optimize the same parameter at
-the same time; to avoid racing, we need one `BlockingQueue` per gradient
-variable so the gradients are processed one by one.
-- We need a `Map` structure that maps a gradient variable name to the `OptimizeBlock` which
-can optimize the respective parameter (see the sketch below).
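-
-As a toy illustration only (not the Fluid code), the sketch below uses Python's `queue.Queue` and a plain `dict` to mimic the per-gradient `BlockingQueue` and the gradient-name-to-`OptimizeBlock` map; the variable names and the SGD update are made up for the example.
-
-```python
-import queue
-import threading
-
-params = {"w1": 0.0}                       # toy parameter store
-lr = 0.01
-
-def sgd(name, grad):                       # the "OptimizeBlock" for one parameter
-    params[name] -= lr * grad
-
-# One BlockingQueue per gradient variable, plus a Map from the gradient
-# variable name to the optimize routine of the respective parameter.
-grad_queues = {"w1@GRAD": queue.Queue()}
-optimize_blocks = {"w1@GRAD": lambda g: sgd("w1", g)}
-
-def pserver_worker(grad_name):
-    q = grad_queues[grad_name]
-    while True:
-        grad = q.get()                     # blocks until a trainer sends a gradient
-        optimize_blocks[grad_name](grad)   # gradients are applied one by one
-
-threading.Thread(target=pserver_worker, args=("w1@GRAD",), daemon=True).start()
-grad_queues["w1@GRAD"].put(0.5)            # a trainer "sends" a gradient
-```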
diff --git a/doc/fluid/design/dist_train/dist_train_nccl2.md b/doc/fluid/design/dist_train/dist_train_nccl2.md
deleted file mode 100644
index aa7455ec5de0d46d7c2b0cef3b7ebf4754af3cb1..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dist_train/dist_train_nccl2.md
+++ /dev/null
@@ -1,35 +0,0 @@
-# Distributed Training with NCCL2
-
-We design a pattern that enables training with `ParallelExecutor`,
-using [NCCL2](https://developer.nvidia.com/nccl) as its collective
-communication library.
-
-In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast`
-to do multi-GPU training. And if we initialize NCCL2 communicators as
-ranks in a distributed environment, we can simply run the `ParallelExecutor`
-as a distributed program! The only thing that differs from the
-single-node version is that we need to broadcast the NCCL unique ID
-to all the nodes and initialize the communicators using that ID, so the NCCL2
-ranks will know each other.
-
-To achieve this, we introduce a new operator, the `gen_nccl_id` op,
-so we are ***not*** bound to running NCCL2 with MPI; we can run it on
-whatever platform you like.
-
-It has two running modes:
-
-1. Generate and broadcast mode, which should be used on trainer 0;
-1. Listen and fetch mode, which should be used on trainers other than 0.
-
-In both modes, this op saves the NCCL ID into the current scope as a
-persistable variable. We can then insert this op at the end of the
-"startup program" of fluid, so that all workers get the same ID to
-initialize their NCCL communicator objects.
-
-
-
-The above figure shows the general process of distributed training with NCCL2.
-Each trainer has as many communicators as it has GPUs, but the ranks must match the
-global rank numbering: here we have 8 GPUs in total, so `nranks==8`; the ranks
-are 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1.
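-
-As a small illustration of this rank layout (assuming the 2-trainer, 4-GPU setup in the figure; the helper below is hypothetical, not an existing API):
-
-```python
-# With 2 trainers and 4 GPUs each, nranks == 8 and the global rank of a
-# communicator is derived from the trainer id and the local GPU index.
-trainers = 2
-gpus_per_trainer = 4
-nranks = trainers * gpus_per_trainer
-
-def global_rank(trainer_id, gpu_id):
-    return trainer_id * gpus_per_trainer + gpu_id
-
-assert global_rank(0, 3) == 3   # trainer 0 owns ranks 0 ~ 3
-assert global_rank(1, 0) == 4   # trainer 1 owns ranks 4 ~ 7
-```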
diff --git a/doc/fluid/design/dist_train/distributed_architecture.md b/doc/fluid/design/dist_train/distributed_architecture.md
deleted file mode 100644
index 371bbeebf7559eccc77ba0eea4f6f87a1bc5b54a..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dist_train/distributed_architecture.md
+++ /dev/null
@@ -1,197 +0,0 @@
-# Design Doc: Fluid Distributed Training Architecture
-
-## Abstract
-
-PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has a few limitations:
-
-1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc.
-
-2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers.
-
-3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
-
-This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
-
-## Analysis
-
-The assumption is that the user writes the trainer program in either Python or C++.
-
-### Limitation 1
-
-There are two basic functionalities in the trainer program:
-
-1. The training logic such as loading / saving the model and printing out the logs.
-2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
- optimizer.
-
-When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both the
-training logic and the neural network computation logic are replicated.
-
-The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
-replicate the training logic, the limitation mentioned above can be avoided.
-
-### Limitation 2
-
-Model parallelism means that a single model is partitioned into different components and each node runs one of the components separately. This comes at the extra cost of managing the
-inter-model-shard communication between nodes.
-
-PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
-computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
-
-Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
-
-
-
-PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
-
-
-
-The IR for PaddlePaddle after refactoring is called a `Block`; it specifies the computation dependency graph and the variables used in the computation.
-
-### Limitation 3
-
-The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
-
-This could be fixed by making the parameter server also run an IR, which can be different from the trainer side.
-For a detailed explanation, refer to this document:
-[Design Doc: Parameter Server](./parameter_server.md)
-
-## Distributed Training Architecture
-
-The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
-
-
-
-The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.
-
-### Python API
-
-The Python API is the Python library that the user's Python code invokes to read the data, build the neural network topology, start training, etc.
-
-```Python
-images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
-label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-...
-predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
-cost = fluid.layers.cross_entropy(input=predict, label=label)
-avg_cost = fluid.layers.mean(x=cost)
-optimizer = fluid.optimizer.Adam(learning_rate=0.01)
-optimizer.minimize(avg_cost)
-
-train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.mnist.train(), buf_size=500),
- batch_size=BATCH_SIZE)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-
-for pass_id in range(10):
- for data in train_reader():
- loss, acc = exe.run(trainer_prog,
- feed=feeder.feed(data),
- fetch_list=[avg_cost])
-```
-
-The code above is a typical local training program. The "Training Program" is built using helper functions such as
-`fluid.layers.fc`. The training is done by calling `Executor.run`
-iteratively.
-
-For more details: the IR is implemented as [Program](../program.md), and `ProgramDesc` is its protobuf type.
-
-[Executor](../executor.md) simply runs the `ProgramDesc`. For local training you generally use
-`Executor` to run the program locally. For any kind of distributed training, you can use
-`RemoteExecutor` to specify the desired distributed training method with some optional arguments.
-
-### Distributed Transpiler
-
-The Distributed Transpiler automatically converts the IR (in protobuf format) to partitioned IRs. Then
-the Remote Executor dispatches the new IRs to Remote Executors across the cluster.
-Below are the steps that are followed:
-
-1. The user only needs to change `Executor` to `RemoteExecutor` to turn a local program into a distributed program.
-1. `RemoteExecutor` calls the `Distributed Transpiler` to "transpile" the user's program into several IRs representing a
-   distributed training program:
-   1. Parse configurations from `RemoteExecutor`.
-   1. Determine the type of distributed program, which can be DataParallelism, ModelParallelism or Streaming.
-   1. Partition the `ProgramDesc` according to the type and add `send` / `recv` OP pairs on the boundaries. Taking the
-      DataParallelism type as an example, it removes the optimization operators, adds a `send` OP to the
-      "trainer" role, and then adds the optimization operators to the parameter server role within the `recv` OP.
-1. Dispatch the partitioned graphs to the different `RemoteExecutor`s in the cluster.
-1. `RemoteExecutor` on each node runs the received `ProgramDesc` until the end. (A rough sketch of the transpile step follows below.)
-
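-The `RemoteExecutor` itself is a proposal; the transpile step, however, already exists in Fluid as `fluid.DistributeTranspiler`. The snippet below is only a rough usage sketch: the endpoints are example values, it assumes the training program from the Python API example above has been built, and the exact signature may differ between Fluid versions.
-
-```python
-import paddle.fluid as fluid
-
-# ... build the training program as in the Python API example above ...
-
-t = fluid.DistributeTranspiler()
-t.transpile(trainer_id=0,
-            pservers="127.0.0.1:6170,127.0.0.1:6171",   # example endpoints
-            trainers=2)
-trainer_prog = t.get_trainer_program()                   # runs on the trainers
-pserver_prog = t.get_pserver_program("127.0.0.1:6170")   # runs on that pserver
-```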
-
-### RemoteExecutor
-
-As shown in the graph, `RemoteExecutor.run` sends the IR to the cluster for execution.
-You can also use the `fetch_list` parameter to interactively fetch variables back locally for
-log printing.
-
-The Python `RemoteExecutor` is derived from `Executor` class.
-
-```python
-exe = RemoteExecutor(
- feed=feeder.feed(data),
- fetch_list=[avg_cost],
- job_desc=JobDesc(
- jobname,
- num_trainer,
- num_pserver,
- cpu_per_trainer,
- gpu_per_trainer,
- mem_per_trainer,
- cpu_per_pserver,
- mem_per_pserver
- ))
-for data in train_reader():
- loss, acc = exe.run(trainer_prog,
- feed=feeder.feed(data),
- fetch_list=[avg_cost])
-```
-
-The `JobDesc` object describes the distributed job resource specification to run in a
-cluster environment.
-
-
-
-`RemoteExecutor.run` sends the `ProgramDesc` and
-[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
-to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
-for starting the final Kubernetes Jobs that run the different roles of the `ProgramDesc` from the `ConfigMap`.
-
-
-### Placement Algorithm
-
-Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
-
-In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
-
-
-### Local Training Architecture
-
-The local training architecture will be the same as the distributed training architecture; the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
-
-
-
-
-### Training Data
-
-In PaddlePaddle v0.10.0, training data is typically read
-with a [data reader](./README.md) from Python. This approach is
-no longer efficient for distributed training since the Python
-process no longer runs on the same node as the trainer processes:
-the Python reader would need to read from the distributed filesystem
-(assuming it has access) and send the data to the trainers, doubling the
-network traffic.
-
-When doing distributed training, the user can still use the Python data
-reader: the training data are sent with `Executor.run`. However, this should
-be used for debugging purposes only. Users are encouraged to use
-the reader OPs instead.
-
-
-## References:
-
-[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
-
-[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf)
diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
deleted file mode 100644
index e284e1ec5cdd18d0049ce3c1a8349bbe1248cb48..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# Design Doc: Distributed Lookup Table Operator
-
-A distributed lookup table operator in PaddlePaddle, for the case where the table could be too large to fit
-in the memory of a single computer.
-
-## Background
-
-A lookup table operator is widely used in deep learning for learning the
-representation, or the
-[*embedding*](http://www.cs.toronto.edu/~fritz/absps/ieee-lre.pdf), of
-symbols.
-
-### The Forward Algorithm
-
-The forward algorithm of the lookup table is a multiplication of the
-input vector x and the lookup table matrix W:
-
-$$y = x * W$$
-
-When x is a sparse vector of symbols, the above multiplication
-simplifies into looking up rows in W that correspond to symbols in x,
-denoted by W(x). Please be aware that W could be huge and may not fit in
-memory, so we need a distributed storage service which supports the
-lookup of rows.
-
-The following figure illustrates the multiplication of x with two
-non-zero elements, or say two symbols, and a lookup table W:
-
-![lookup table](./src/lookup_table.png)
-
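-A toy numpy illustration of the forward lookup (illustrative only; the real table is distributed and far too large for one array):
-
-```python
-import numpy as np
-
-vocab_size, emb_size = 10, 4
-W = np.random.rand(vocab_size, emb_size)    # the (possibly huge) lookup table
-
-ids = np.array([2, 7])                       # the two non-zero symbols in x
-W_x = W[ids]                                 # "looking up rows in W", shape (2, 4)
-
-# Equivalent dense form: a one-hot x multiplied by W.
-x = np.zeros((2, vocab_size))
-x[np.arange(2), ids] = 1.0
-assert np.allclose(x.dot(W), W_x)
-```
-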
-### The Backward Algorithm
-
-The backward algorithm computes W'(x) using W(x). W'(x) has the same
-size as W(x) and is much smaller than W.
-
-To optimize W given W', we can do a simple SGD update:
-
-$$W = f(W') = \lambda * W'$$
-
-or some more sophisticated algorithms that rely on both W' and W:
-
-$$W = f(W, W')$$
-
-The following figure illustrates the backward pass of the lookup
-operator: ![lookup table training](./src/lookup_table_training.png)
-
-## Distributed Lookup Table
-### Problem 1: The lookup table may be very large.
-
- In scenarios like search engines and recommendation systems, the number of feature Ids can be very large, say 100,000,000,000; then, for a lookup table with 8 float values per Id, the total size of the table is:
-
- ```
- 100,000,000,000 * 8 * 4(Bytes) = 2980.23 GB
- ```
-
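-A quick sanity check of the arithmetic (assuming GB here means GiB, i.e. 1024^3 bytes):
-
-```python
-num_ids = 100_000_000_000
-emb_size = 8                  # 8 float values per feature Id
-bytes_per_float = 4
-total_gib = num_ids * emb_size * bytes_per_float / 1024**3
-print(round(total_gib, 2))    # -> 2980.23, far beyond a single machine's memory
-```
-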
-### Solution: Distributed storage
-
-1. Paddle uses [SelectedRows](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/selected_rows.md) as the storage format for the lookup table. The lookup table parameter is split across multiple machines according to the hash of the feature Id, and the input data is also split and sent to the corresponding machine to prefetch the parameters.
-
-1. For common parameters, the trainer gets the whole parameter for training, but for the big lookup table the trainer cannot store the whole parameter. Because the input features are very sparse, only a few parameters are needed each time, so we use `prefetch_op` to prefetch only the parameters the trainer needs.
-
-### Problem 2: The Ids in the lookup table are not known before training.
-
- The feature Id is calculated by a hash function because the feature data source is so large that we cannot enumerate all the Ids before training. Hence we cannot initialize the table before training.
-
-### Solution: Id auto growth
-
-At the beginning of training, Paddle only allocates the memory for the lookup table on the parameter server side; the Ids and their values are not initialized. During training, when a parameter server receives an Id, if it is already in the lookup table the server returns the existing parameter; if the Id does not exist, Paddle adds it into the lookup table and initializes its value (see the sketch below).
-
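-A toy Python sketch of this auto-growth behavior (a plain dict standing in for the parameter-server-side table; the initializer is made up for the example):
-
-```python
-import numpy as np
-
-emb_size = 8
-table = {}                                   # parameter-server-side lookup table
-
-def lookup(feature_id):
-    # A new Id is added and initialized on first access; an existing Id
-    # returns its stored parameter row.
-    if feature_id not in table:
-        table[feature_id] = np.random.normal(0.0, 0.01, size=emb_size)
-    return table[feature_id]
-
-row = lookup(123456789)                      # first access initializes the row
-assert lookup(123456789) is table[123456789]
-```
-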
-### Problem 3: parameter load and save
-
-For common parameters, Paddle uses the trainer to save and load them. But for the distributed lookup table, the trainer cannot do this because of its large size.
-
-### Solution: Parameter server side save and load
-
-Paddle supports parameter-server-side save and load for the distributed lookup table. Each parameter server machine only saves and loads its own part of the whole table.
-
-## Architecture
-The whole architecture of the distributed lookup table is as below:
-
-### Training steps:
-1. Read a batch of data; the data consists of feature ids.
-1. The input ids are split by `split_ids_op` using the same hash function as the lookup table.
-1. The `prefetch_op` uses the split result to prefetch parameters back from the lookup table.
-1. Run forward-backward to get the gradients of the lookup table.
-1. `split_ids_op` splits the gradients, which are then sent to the parameter servers with `send_op`.
-1. The parameter servers update the table with the received gradients (a schematic sketch follows the figure below).
-
-![distribute lookup table](./src/distributed_lookup_table.jpeg)
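-
-A schematic sketch of steps 2, 5 and 6 (illustrative only; `split_ids` is a stand-in for `split_ids_op`, and the hash sharding is simplified):
-
-```python
-num_pservers = 2
-
-def split_ids(ids):
-    # Shard ids with the same hash function the lookup table uses (step 2);
-    # gradients are later sharded the same way before send_op (step 5).
-    shards = [[] for _ in range(num_pservers)]
-    for i in ids:
-        shards[hash(i) % num_pservers].append(i)
-    return shards
-
-batch_ids = [3, 8, 15, 42]                   # step 1: a batch of feature ids
-print(split_ids(batch_ids))                  # each shard goes to one pserver,
-                                             # which updates only its part (step 6)
-```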
diff --git a/doc/fluid/design/dist_train/distributed_traing_review.md b/doc/fluid/design/dist_train/distributed_traing_review.md
deleted file mode 100644
index c09b7c99159ace9b3df989f803ede20bc3585d92..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dist_train/distributed_traing_review.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# Parallelism, Asynchronous, Synchronous, Codistillation
-
-
-For valuable models, it’s worth using more hardware resources to reduce the training time and improve the final model quality. This doc discusses various solutions, their empirical results, and some recent research.
-
-# Model Parallelism
-In some situations, larger and more complex models can improve the model quality. Sometimes, such models cannot fit in one device. Sometimes, parts of the model can be executed in parallel to improve speed. Model parallelism addresses these issues by partitioning a single model and placing the shards on several devices for execution.
-
-A common form of model parallelism is to partition the logic of “gradient application” onto parameter servers, while leaving the forward and backward computation on the training servers.
-
-More flexible model parallelism is challenging. For example, a multi-level single-direction LSTM can be partitioned by layers, while such a solution does not help for a bi-directional LSTM. Different models can have quite different ways of partitioning, and the benefits also depend on the underlying hardware. The framework needs to provide flexible APIs for users to define customized partition schemes. For example, in TensorFlow, users can use tf.device() to specify device placement. In MXNet, mx.AttrScope(ctx_group='dev1') does a similar thing. Recent research proposes to automatically find the optimal partition scheme with reinforcement learning, which is essentially a solution-space search algorithm that can cost a lot of extra hardware resources.
-
-# Data Parallelism
-Data parallelism runs the same model on multiple devices, each taking in a partition of the input batch. It’s more commonly used for a few reasons. It generally applies to common SGD mini-batch training. Compared with model parallelism, which requires users to carefully partition their model and tune for good performance, data parallelism usually involves no more than calling an extra API, and the speedup is more predictable.
-
-# Asynchronous Training
-Asynchronous training usually involves a set of trainers and a set of parameter servers. The parameter servers collectively hold a single copy of the shared parameters, while each trainer holds a unique copy of the model and trains it independently. Each trainer pulls parameters from the parameter servers and sends gradients to them independently. Similarly, the parameter servers apply the gradients to the parameters as soon as the gradients are received and send parameters whenever they are requested.
-
-In theory, asynchronous training is not safe and can be unstable. Each trainer is very likely using a stale copy of the parameters, and stale gradients are likely to be applied to the parameters. However, in practice, especially for large-scale nonconvex optimization, it is effective [1]. Compared with the synchronous solution, which will be discussed later, asynchronous distributed training is easier to implement and scales to a few dozen workers without losing much performance due to network communication or other overhead. Besides, asynchronous training can make progress even in the case of random trainer failures in the cluster.
-
-Many production models, such as [3], are trained with distributed asynchronous solutions due to their scalability and effectiveness in practice. However, asynchronous training has its limitations. Usually, it is not as stable as synchronous training. A warm-up phase is sometimes needed. The learning rate is usually smaller than in synchronous training, and decay is also often needed. Normally, asynchronous training doesn’t scale beyond 100 trainers; in other words, adding more trainers beyond that does not make the model converge faster.
-
-# Synchronous Training
-Unlike asynchronous training, synchronous training requires step barriers. Parameter servers need to wait for the gradients from all trainers before applying them to the parameters, and trainers always pull the latest parameters.
-
-An obvious advantage of synchronous training is that the behavior is more clearly defined. Usually, it's more stable than asynchronous training. The learning rate can be set larger, and for some vision tasks the final accuracy can be slightly higher. (In my practical experience, for some models it can actually be worse.)
-
-Synchronous training usually faces scalability and performance issues if not carefully implemented or deployed. In [2], naive synchronous training can be 20%~40% slower than asynchronous training. A common trick to avoid the slowness, discussed in [1] and [2], is to have backups: N+M replicas are scheduled while only the first N are needed for the training step to proceed.
-
-Similar to asynchronous training, the benefit of synchronous training diminishes quickly. Depending on the model, increasing the number of trainers (effectively the batch size) beyond a point does not deliver faster convergence or better final model quality.
-
-# Codistillation
-Codistillation is a technique that tries to scale the training further. A few training instances (each of which can itself be distributed) run during the same period. Each training instance has extra losses that come from the predictions of the other training instances (like teacher and student). The training process converges faster and usually converges to a better model quality. [4]
-
-
-# Reference
-
-[1] Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Andrew Senior, Paul Tucker, Ke Yang, Quoc V Le, et al. Large scale distributed deep networks.
-
-[2] Jianmin Chen, Rajat Monga, Samy Bengio, and Rafal Jozefowicz. Revisiting distributed synchronous SGD.
-
-[3] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine translation system: Bridging the gap between human and machine translation.
-
-[4] LARGE SCALE DISTRIBUTED NEURAL NETWORK TRAINING THROUGH ONLINE DISTILLATION
diff --git a/doc/fluid/design/dist_train/index_cn.rst b/doc/fluid/design/dist_train/index_cn.rst
deleted file mode 100644
index ed6f3dda271d2de58d92aa7ec804fa9e68dfc48a..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dist_train/index_cn.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-分布式训练
-------------
-
-.. toctree::
- :maxdepth: 1
-
- distributed_architecture.md
- distributed_lookup_table_design.md
- parameter_server.md
diff --git a/doc/fluid/design/dist_train/index_en.rst b/doc/fluid/design/dist_train/index_en.rst
deleted file mode 100644
index f84688f168021113bd933802709bcd787b474bca..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dist_train/index_en.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Distributed Training
----------------------
-
-.. toctree::
- :maxdepth: 1
-
- distributed_architecture.md
- distributed_lookup_table_design.md
- parameter_server.md
diff --git a/doc/fluid/design/dist_train/mpi_enabled_design.md b/doc/fluid/design/dist_train/mpi_enabled_design.md
deleted file mode 100644
index 4ad3afc7b7522c60460c6f1f387f9415d3738778..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dist_train/mpi_enabled_design.md
+++ /dev/null
@@ -1,46 +0,0 @@
-# MPI-enabled PaddlePaddle Design doc
-
-# Background
-When we do distributed multi-GPU training, the communication overhead between servers becomes the major bottleneck, for the following reasons:
-1. Data must be copied at least once from GPU to CPU memory so that it is ready to transfer, and on the pserver side, copying data from CPU back to GPU introduces more overhead.
-2. GPU->CPU data transfer is 10 times slower than data transfer between GPUs or between PCIe devices.
-3. TCP connections cannot make full use of RDMA 100Gb devices.
-
-We will add the OpenMPI API to PaddlePaddle, which brings two benefits:
-1. Enable RDMA with PaddlePaddle, which brings high-performance, low-latency networking.
-2. Enable GPUDirect with PaddlePaddle, which brings the highest throughput and lowest latency for GPU reads and writes.
-
-# Change list
-* Compile args: add compile args to enable MPI support.
-* Execute args: add execute args to specify when and how to use MPI operations.
-* New ops: add new ops ```mpi_send_op``` and ```mpi_listenandserve_op``` to support MPI send and receive.
-* Transpiler optimized: add ```mpi_send_op``` and ```mpi_listenandserve_op``` to the running graph.
-* MPI utils package: an MPI utils package is needed as the supporting low-level API.
-
-## Compile args
-Because MPI and CUDA need hardware support, we will add compile args to enable MPI support and control compiling. Add a ```WITH_MPI``` compile arg to control whether MPI is used. If ```WITH_MPI``` is ```ON```, the build system will look for the OpenMPI code during configuration. We should prepare the OpenMPI environment before compiling.
-
-## Execute args
-Launch the script using the ```mpirun``` launcher, for example: ```mpirun -np 3 -hosts node1,node2,node3 python train.py```. By doing this, we can number the actors (trainer/pserver/master) from 0 to (n-1). A node's number is the rank of the calling process in its communicator group (an integer); the MPI processes identify each other using this Rank ID. We have to create a mapping between PaddlePaddle's nodes and their Rank IDs so that we can communicate with the correct destinations when using MPI operations.
-
-## New ops
-We won't replace all the gRPC requests with MPI requests: the standard gRPC library is used for all administrative operations, and the MPI API will be used to transfer tensors or SelectedRows to the pservers. Based on this idea, we create two new operators to handle sends and receives: ```mpi_send_op``` and ```mpi_listenandserve_op```. They are somewhat similar to [send_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/send_op.cc) and [listen_and_serv_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/listen_and_serv_op.cc); we will also build a new module to package the MPI send and receive process.
-
-### mpi_send_op
-Very similar to ```send_op```: we will replace the gRPC code used to send gradients with the ```mpi_module```, and at the same time wrap it with ```framework::Async```.
-
-### mpi_listenandserve_op
-Very similar to ```listen_and_serv_op```: we will replace the gRPC code used to receive gradients with the ```mpi_module```, and at the same time wrap it with ```framework::Async```.
-
-## Transpiler optimized
-**We can check the env variables ```OMPI_COMM_WORLD_SIZE``` and ```OMPI_COMM_WORLD_RANK``` to determine whether to use MPI; if we launch with OpenMPI, these variables must exist in the environment.**
- If MPI use is confirmed, we will change ```send_op``` to ```mpi_send_op``` in the distribute_transpiler, and likewise change ```listenandserve_op``` to ```mpi_listenandserve_op``` (see the sketch below).
-
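-A small sketch of this detection logic (Open MPI does export these variables under ```mpirun```; the function name is made up):
-
-```python
-import os
-
-def launched_with_mpi():
-    # Present only when the process was started by mpirun / Open MPI.
-    return ("OMPI_COMM_WORLD_SIZE" in os.environ
-            and "OMPI_COMM_WORLD_RANK" in os.environ)
-
-if launched_with_mpi():
-    rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
-    world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
-    # the transpiler would now swap send_op -> mpi_send_op, etc.
-```
-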
-## MPI utils package
-In this package, we will wrap the low-level OpenMPI API for PaddlePaddle to use MPI.
-The APIs included in this package are:
-* MPI send and receive module: we will build a new module to package the MPI send and receive process. MPI send and receive differ from gRPC in that the MPI [receive](https://www.open-mpi.org/doc/v1.8/man3/MPI_Irecv.3.php) must know the receive buffer size and element type. For this reason, we have to communicate twice: the first communication sends the metadata about the gradient through gRPC, and the second is the real communication through MPI, which sends the gradient data to mpi_listenandserve_op.
-The detailed flow is below:
-![](https://github.com/seiriosPlus/Paddle/blob/mpi_enabled/doc/fluid/design/dist_train/src/mpi_module.png)
-* MPI global configurations, which store the Rank ID and the mapping in global variables, for example:
-gRPC client : MPI nodes : ``` 127.0.0.1:32004 : 3 ```
diff --git a/doc/fluid/design/dist_train/multi_cpu.md b/doc/fluid/design/dist_train/multi_cpu.md
deleted file mode 100644
index 38222d083084ebfca3099ce96b47868c42d55101..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dist_train/multi_cpu.md
+++ /dev/null
@@ -1,43 +0,0 @@
-# Design Doc: Execute the Program with Multi CPU
-
-## Abstract
-
-This design doc proposes an approach to run the user-defined Op graph
-on multiple CPUs: we will use an auto transpiler to convert the user-defined
-Op graph to a multi-CPU Op graph, and use the `ParallelDo` Op to run the graph.
-
-## Transpiler
-
-
-
-After conversion:
-
-
-
-## Implement
-
-- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph
-  which would be executed with multiple threads.
-- `BlockingCounter` will `Init/Decrement` an atomic counter, and block in `Wait`
-  until the atomic counter becomes `0`:
-  ```cpp
-  BlockingCounter bc(thread_count);
-  for (int i = 0; i < thread_count; ++i) {
-    thread_pool->Start([&bc] { bc.DecrementCount(); });
-  }
-  bc.Wait();
-  ```
-- `ParallelDo` Operator
-  - Initialize a thread pool, which is a singleton.
-  - Take a block id as the input, and run the specified Block on an independent scope
-    in each of the threads.
-  - Initialize a `BlockingCounter` instance and wait until all threads are done.
-- The `Split` Operator will split the input Tensor into a TensorArray.
-- `Merge` merges all the gradients calculated in the different threads
-  with a `mean/sum/max/min...` method, and then runs the Optimizer Op to optimize `W` (a rough sketch follows this list).
-
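-A rough Python sketch of the split / parallel-run / merge flow described above (a toy stand-in, not the actual `ParallelDo` operator; the gradient function is made up):
-
-```python
-from concurrent.futures import ThreadPoolExecutor
-import numpy as np
-
-def parallel_do(block_fn, inputs, thread_count=4):
-    # Split the input into pieces (the "TensorArray"), run the block on each
-    # piece in its own thread (standing in for independent scopes), then merge.
-    pieces = np.array_split(inputs, thread_count)         # Split operator
-    with ThreadPoolExecutor(max_workers=thread_count) as pool:
-        grads = list(pool.map(block_fn, pieces))          # one scope per thread
-    return np.mean(grads, axis=0)                         # Merge with `mean`
-
-# Toy block: mean gradient of 0.5 * (x - 1)^2 over its piece.
-print(parallel_do(lambda x: np.mean(x - 1.0), np.arange(8.0)))
-```
-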
-## TODO
-
-- Improve the optimizer stage with multiple threads, since we could
-  assign the parameters to different threads and execute the
-  optimizer with multiple threads.
diff --git a/doc/fluid/design/dist_train/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md
deleted file mode 100644
index 563b70bc0e852bec953eb40dda3c46b3d45d7e68..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dist_train/parameter_server.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# Design Doc: Parameter Server
-
-## Abstract
-
-We propose an approach to implement the parameter server. In this
-approach, there is no fundamental difference between the trainer and
-the parameter server: they both run subgraphs, but subgraphs of
-different purposes.
-
-## Background
-
-The previous implementations of the parameter server do not run a
-fluid sub-program. Parameter initialization, optimizer computation, network
-communication and checkpointing are implemented twice on both the
-trainer as well as the parameter server.
-
-It would be great if we could write the code once and use it on both the
-trainer and the parameter server, since this reduces code duplication and
-improves extensibility. Given that after the current refactoring we are
-representing everything as a computation graph on the
-trainer, representing everything as a computation graph on the parameter
-server becomes a natural extension.
-
-## Design
-
-### Distributed Transpiler
-
-The *Distributed Transpiler* converts the user-defined fluid program
-into sub-programs to be scheduled on different nodes with the following
-steps:
-
-1. OP placement: the OPs will be placed on different nodes according
- to a heuristic that minimizes the estimated total computation
- time. Currently we will use a simple heuristic that puts parameter
- variable on parameter server workers and everything else on trainer
- workers.
-1. Add communication OPs to enable the communication between nodes.
-
-We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
-
-Below is an example of converting the user defined graph to the
-subgraphs for the trainer and the parameter server:
-
-
-
-After converting:
-
-
-
-1. The parameter variable W and its optimizer program are placed on the parameter server.
-1. Operators are added to the program.
-   - *Send* sends data to the connected *Recv* operator. The
-     scheduler on the receiving node will only schedule the *Recv* operator
-     to run when the *Send* operator has run (the *Send* OP marks
-     the *Recv* OP runnable automatically).
-   - *Enqueue* enqueues the input variable; it can block until space
-     becomes available in the queue.
-   - *Dequeue* outputs a configurable number of tensors from the
-     queue. It will block until the queue has the required number of
-     tensors (see the sketch below).
-
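-A toy sketch of the *Enqueue*/*Dequeue* semantics using Python's `queue.Queue` (illustrative only; `min_count` and the helper names are made up):
-
-```python
-import queue
-import threading
-
-q = queue.Queue(maxsize=4)         # bounded, so Enqueue can block when full
-
-def enqueue(var):
-    q.put(var)                     # blocks until space becomes available
-
-def dequeue(min_count):
-    # Blocks until `min_count` tensors have been taken from the queue.
-    return [q.get() for _ in range(min_count)]
-
-threading.Thread(target=lambda: [enqueue(i) for i in range(3)]).start()
-print(dequeue(min_count=3))        # -> [0, 1, 2]
-```
-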
-### Sparse Update
-
-For embedding layers, the gradient may have many rows containing only zeros during training.
-If the gradient uses a dense tensor for parameter optimization,
-it wastes memory, slows down the calculations, and wastes
-bandwidth during distributed training.
-In Fluid, we introduce [SelectedRows](../modules/selected_rows.md) to represent a list of rows containing
-non-zero gradient data. So when we do parameter optimization, both locally and remotely,
-we only need to send those non-zero rows to the optimizer operators:
-
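-A toy illustration of the idea (a `(rows, values)` pair standing in for `SelectedRows`; the shapes and the SGD step are made up):
-
-```python
-import numpy as np
-
-# A SelectedRows-like gradient: only the touched rows and their values.
-rows = [2, 7]                                  # rows with non-zero gradient
-values = np.full((2, 4), 0.1)                  # gradient data for those rows
-
-W = np.zeros((10, 4))                          # the embedding parameter
-lr = 0.1
-W[rows] -= lr * values                         # sparse update touches only 2 rows
-print(W[rows])
-```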
-
-### Benefits
-
-- Model parallelism becomes easier to implement: it is an extension to
- the trainer - parameter server approach. We can have several "Transpilers"
- to achieve different goals.
-- A user-defined optimizer is easier to add: the user can now express it as
-  a sub-program.
-- No more duplicated logic inside the trainer and the parameter
-  server, as mentioned in the background section.
-
-### Challenges
-
-- It is important to balance the parameter shards on multiple
- parameter servers. If a single parameter is very big (for example: some
- word-embedding, fully connected, softmax layer), we need to
- automatically partition the single parameter onto different
- parameter servers when possible (only element-wise optimizer depends
- on the parameter variable).
-- In the "Async SGD" figure, the "W" variable on the parameter server
- could be read and written concurrently. See
- [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
- details about concurrent program in Fluid.
-
-### Discussion
-
-- Can the Enqueue OP be implemented under our current tensor design
- (put the input tensor into the queue tensor)?
-- *Dequeue* OP will have variable numbers of output (depending on the
- `min_count` attribute), does our current design support it? (similar
- question for the *Add* OP)
-
-### References
-
-[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
diff --git a/doc/fluid/design/dist_train/src/async_distributed_training.png b/doc/fluid/design/dist_train/src/async_distributed_training.png
deleted file mode 100644
index 3b53ab59c0cd7b44b2956f16f1adc47fe85909d3..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/async_distributed_training.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.graffle b/doc/fluid/design/dist_train/src/async_pserver.graffle
deleted file mode 100644
index d2301611774fcb3866473e3e6470568d1e1312cf..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/async_pserver.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.png b/doc/fluid/design/dist_train/src/async_pserver.png
deleted file mode 100644
index 7d900b0c0eb291c67537b9cf93227c671bafdc73..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/async_pserver.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/async_update.graffle b/doc/fluid/design/dist_train/src/async_update.graffle
deleted file mode 100644
index 3a631888688a0d564a873fcb16d943958c91223e..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/async_update.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/async_update.png b/doc/fluid/design/dist_train/src/async_update.png
deleted file mode 100644
index 3e8db973f45d6d9ac8dcce1dc7878067e79e6dcc..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/async_update.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/compiler.graffle b/doc/fluid/design/dist_train/src/compiler.graffle
deleted file mode 100644
index 8cc678fea3c820103e7ce81f7a5d625d6c1d92de..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/compiler.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/compiler.png b/doc/fluid/design/dist_train/src/compiler.png
deleted file mode 100644
index 65d34f841afce9756def07dd8ecb9ca44e658bfe..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/compiler.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/dist-graph.graffle b/doc/fluid/design/dist_train/src/dist-graph.graffle
deleted file mode 100644
index 941399c6ced8d5f65b6c595522b770c88259df4b..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/dist-graph.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/dist-graph.png b/doc/fluid/design/dist_train/src/dist-graph.png
deleted file mode 100644
index 3546b09f1c2ee3e4f60f519d5e47f823f08051a7..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/dist-graph.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/distributed_architecture.graffle b/doc/fluid/design/dist_train/src/distributed_architecture.graffle
deleted file mode 100644
index d1b60141342232e06227c2d430ebc60ec349a907..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/distributed_architecture.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/distributed_architecture.png b/doc/fluid/design/dist_train/src/distributed_architecture.png
deleted file mode 100644
index 29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/distributed_architecture.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle
deleted file mode 100644
index 65dfdbbacd219739db6ddfdf243cc16c3c4e8d1e..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg
deleted file mode 100644
index 5353a16fd329f62ff893d32706b9c3c0bcc46a07..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/distributed_training.graffle b/doc/fluid/design/dist_train/src/distributed_training.graffle
deleted file mode 100644
index 1168801bc1fadfce310a74cb3110695bd1629f6b..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/distributed_training.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle
deleted file mode 100644
index 96ca6d48f43bd9f49c6861dab006e2037873db87..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png
deleted file mode 100644
index afa25ab3b4e427bc595a855b12ab966478e01ed0..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/local-graph.graffle b/doc/fluid/design/dist_train/src/local-graph.graffle
deleted file mode 100644
index 19e509bd9af3c1e9a3f5e0f16ddd281457a339c5..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/local-graph.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/local-graph.png b/doc/fluid/design/dist_train/src/local-graph.png
deleted file mode 100644
index ada51200f793a9bb18911e7d63cfdb3244b967d7..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/local-graph.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/local_architecture.graffle b/doc/fluid/design/dist_train/src/local_architecture.graffle
deleted file mode 100644
index 49fcc663ebe3824aa234e3a67aadf285cb417877..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/local_architecture.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/local_architecture.png b/doc/fluid/design/dist_train/src/local_architecture.png
deleted file mode 100644
index 14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/local_architecture.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/lookup_table.png b/doc/fluid/design/dist_train/src/lookup_table.png
deleted file mode 100644
index 72dfe3547f731d0d090338afb206b0549dff472e..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/lookup_table.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/lookup_table_training.png b/doc/fluid/design/dist_train/src/lookup_table_training.png
deleted file mode 100644
index cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/lookup_table_training.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/mpi_module.png b/doc/fluid/design/dist_train/src/mpi_module.png
deleted file mode 100644
index e6b6a3e5d6f68baeeb67d7f71154bd8d85f32b6f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/mpi_module.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/multi-threads.graffle b/doc/fluid/design/dist_train/src/multi-threads.graffle
deleted file mode 100644
index e71173715fff92a0a933d0c7d83599ba948552c6..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/multi-threads.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png b/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png
deleted file mode 100644
index e40a869987dbbf5019d4cb03c1dab55b74d6c9f9..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/multi-threads/multi-threads@3x.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png b/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png
deleted file mode 100644
index 4083aebfdd45af5fbac25fa2c4176bc08c3cb44a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/multi-threads/single-thread@3x.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/ncc2_design.graffle b/doc/fluid/design/dist_train/src/ncc2_design.graffle
deleted file mode 100644
index 7d2753bbb03bc28c7a0054bb0aa424deb072ffbf..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/ncc2_design.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/ncc2_design.png b/doc/fluid/design/dist_train/src/ncc2_design.png
deleted file mode 100644
index da0d5ee81f5dfeb4ca1356601b0bb5870456e3d6..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/ncc2_design.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/paddle-compile.graffle b/doc/fluid/design/dist_train/src/paddle-compile.graffle
deleted file mode 100644
index a6348cc3dbcaca923c6e794681b2edb85cb9f8f6..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/paddle-compile.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/paddle-compile.png b/doc/fluid/design/dist_train/src/paddle-compile.png
deleted file mode 100644
index e0f13d551ac41afaec627a57dea79356464bf0bf..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/paddle-compile.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/remote_executor.graffle b/doc/fluid/design/dist_train/src/remote_executor.graffle
deleted file mode 100644
index 41b2067311694b56d211a4f32d1b76884eeffd2d..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/remote_executor.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/remote_executor.png b/doc/fluid/design/dist_train/src/remote_executor.png
deleted file mode 100644
index 744e2fb2e0f1bbe058e991ba7b2a09000965ee79..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/remote_executor.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/sparse_update.graffle b/doc/fluid/design/dist_train/src/sparse_update.graffle
deleted file mode 100644
index 08d689a58f83698d8c1158ee3990ed8abf3a7a9a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/sparse_update.graffle and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/sparse_update.png b/doc/fluid/design/dist_train/src/sparse_update.png
deleted file mode 100644
index 8c872e6ac479f7d1b818a4a207956c43155d0ad7..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/sparse_update.png and /dev/null differ
diff --git a/doc/fluid/design/dist_train/src/sync_distributed_training.png b/doc/fluid/design/dist_train/src/sync_distributed_training.png
deleted file mode 100644
index e4f9a221fea4b7238e8a1d84e609c0371f6ef7a2..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dist_train/src/sync_distributed_training.png and /dev/null differ
diff --git a/doc/fluid/design/dynamic_rnn/2_level_rnn.dot b/doc/fluid/design/dynamic_rnn/2_level_rnn.dot
deleted file mode 100644
index 5d77865061ca7bbbfcf254dd938f09aef5553505..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dynamic_rnn/2_level_rnn.dot
+++ /dev/null
@@ -1,56 +0,0 @@
-digraph G {
-
- rnn [label="1st level RNN" shape=box]
-
- subgraph cluster0 {
- label = "time step 0"
-
- sent0 [label="sentence"]
- sent1 [label="sentence"]
-
- rnn1 [label="2nd level RNN" shape=box]
-
- sent0 -> rnn1
- sent1 -> rnn1
- }
-
- subgraph cluster1 {
- label = "time step 1"
-
- sent2 [label="sentence"]
- sent3 [label="sentence"]
-
- rnn2 [label="2nd level RNN" shape=box]
-
- sent2 -> rnn2
- sent3 -> rnn2
- }
-
- subgraph cluster2 {
- label = "time step 2"
-
- sent4 [label="sentence"]
- sent5 [label="sentence"]
-
- rnn3 [label="2nd level RNN" shape=box]
-
- sent4 -> rnn3
- sent5 -> rnn3
- }
-
-
- para0 [label="paragraph info 0"]
- para1 [label="paragraph info 1"]
- para2 [label="paragraph info 2"]
-
- rnn1 -> para0
- rnn2 -> para1
- rnn3 -> para2
-
- para0 -> rnn
- para1 -> rnn
- para2 -> rnn
-
- chapter [label="chapter info"]
- rnn -> chapter
-}
diff --git a/doc/fluid/design/dynamic_rnn/2_level_rnn.png b/doc/fluid/design/dynamic_rnn/2_level_rnn.png
deleted file mode 100644
index 0537a75beb175c0c284717421f7aa908da2a5038..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dynamic_rnn/2_level_rnn.png and /dev/null differ
diff --git a/doc/fluid/design/dynamic_rnn/index_cn.rst b/doc/fluid/design/dynamic_rnn/index_cn.rst
deleted file mode 100644
index 1d224d22cf7103616f44115db01f0ae55f1cb88a..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dynamic_rnn/index_cn.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-动态RNN
-------------
-
-.. toctree::
- :maxdepth: 1
-
- rnn.md
- rnn_design.md
diff --git a/doc/fluid/design/dynamic_rnn/index_en.rst b/doc/fluid/design/dynamic_rnn/index_en.rst
deleted file mode 100644
index 568f496e4ffe21a5e730488aef905f7e2d98839e..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dynamic_rnn/index_en.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-Dynamic RNN
-------------
-
-.. toctree::
- :maxdepth: 1
-
- rnn.md
- rnn_design.md
diff --git a/doc/fluid/design/dynamic_rnn/rnn.dot b/doc/fluid/design/dynamic_rnn/rnn.dot
deleted file mode 100644
index c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dynamic_rnn/rnn.dot
+++ /dev/null
@@ -1,87 +0,0 @@
-digraph G {
- label = "simple RNN implementation"
-
- ranksep=2;
-
- //graph [nodesep=1, ranksep=1];
-
- node[nodesep=1]
-
- subgraph cluster0 {
- label = "global scope"
- rankdir = TB
- W
- boot_memory
- input
- output
- }
-
- subgraph cluster1 {
- label = "step-scope 0"
- rankdir = TB
- memory0[label="memory"]
- prememory0[label="pre-memory"]
- step_input0[label="step input"]
- step_output0[label="step output"]
- }
-
- subgraph cluster2 {
- label = "step-scope 1"
- rankdir = TB
- memory1[label="memory"]
- prememory1[label="pre-memory"]
- step_input1[label="step input"]
- step_output1[label="step output"]
- }
-
- subgraph cluster3 {
- label = "step-scope 2"
- rankdir = TB
- memory2[label="memory"]
- prememory2[label="pre-memory"]
- step_input2[label="step input"]
- step_output2[label="step output"]
- }
-
- stepnet [shape=box]
- stepnet0 [shape=box, style=dashed]
- stepnet1 [shape=box, style=dashed]
- stepnet2 [shape=box, style=dashed]
-
-
- edge[color=blue]
- boot_memory -> prememory0 [label="init" color="blue"]
- memory0 -> prememory1 [label="copy/reference" color="blue"]
- memory1 -> prememory2 [label="copy/reference" color="blue"]
-
- edge[color=black]
- W -> stepnet0[constraint=false, style=dashed]
- W -> stepnet1[constraint=false, style=dashed]
- W -> stepnet2[constraint=false, style=dashed]
-
- memory0 -> stepnet0[style=dashed]
- prememory0 -> stepnet0 -> step_output0[style=dashed]
-
- memory1 -> stepnet1[style=dashed]
- prememory1 -> stepnet1 -> step_output1[style=dashed]
-
- memory2 -> stepnet2[style=dashed]
- prememory2 -> stepnet2 -> step_output2[style=dashed]
-
- input -> step_input0
- input -> step_input1
- input -> step_input2
-
- step_input0 -> stepnet0 [style=dashed]
- step_input1 -> stepnet1[style=dashed]
- step_input2 -> stepnet2[style=dashed]
-
- step_output0 -> output
- step_output1 -> output
- step_output2 -> output
-
- stepnet0 -> stepnet[style=dashed]
- stepnet1 -> stepnet[style=dashed]
- stepnet2 -> stepnet[style=dashed]
-
-}
diff --git a/doc/fluid/design/dynamic_rnn/rnn.jpg b/doc/fluid/design/dynamic_rnn/rnn.jpg
deleted file mode 100644
index 9867e404cf959df0dce6ded5222b466c788fb840..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dynamic_rnn/rnn.jpg and /dev/null differ
diff --git a/doc/fluid/design/dynamic_rnn/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md
deleted file mode 100644
index b39ae0675c45e56852293d97f45e91861cf31667..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dynamic_rnn/rnn.md
+++ /dev/null
@@ -1,153 +0,0 @@
-# RNNOp design
-
-This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future.
-
-## RNN Algorithm Implementation
-
-
-
-
-
-The above diagram shows an RNN unrolled into a full network.
-
-There are several important concepts here:
-
-- *step-net*: the sub-graph that runs at each step.
-- *memory*, $h_t$, the state of the current step.
-- *ex-memory*, $h_{t-1}$, the state of the previous step.
-- *initial memory value*, the memory of the first (initial) step.
-
-### Step-scope
-
-There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
-
-
-
-Figure 2 illustrates the RNN's data flow
-
-
-Please be aware that every step runs the same step-net. Each step does the following:
-
-1. Creates the step-scope.
-2. Initializes the local variables including step-outputs, in the step-scope.
-3. Runs the step-net, which uses the above mentioned variables.
-
-The RNN operator will compose its output from step outputs in each of the step scopes.
-
-### Memory and Ex-memory
-
-Let's give more details about memory and ex-memory using a simple example:
-
-$$
-h_t = U h_{t-1} + W x_t
-$$,
-
-where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively.
-
-In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step,
-or copy the memory value of the previous step to the current ex-memory variable.
-
-### Usage in Python
-
-For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md).
-
-We can define an RNN's step-net using a Block:
-
-```python
-import paddle as pd
-
-X = some_op() # x is some operator's output and is a LoDTensor
-a = some_op()
-
-# declare parameters
-W = pd.Variable(shape=[20, 30])
-U = pd.Variable(shape=[20, 30])
-
-rnn = pd.create_rnn_op(output_num=1)
-with rnn.stepnet():
- x = rnn.add_input(X)
- # declare a memory (rnn's step)
- h = rnn.add_memory(init=a)
- # h.pre_state(), the previous memory of rnn
- new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state()))
- # update current memory
- h.update(new_state)
- # indicate that h variables in all step scopes should be merged
- rnn.add_outputs(h)
-
-out = rnn()
-```
-
-Python API functions in above example:
-
-- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs.
-- `rnn.add_memory`: creates a variable used as the memory.
-- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output.
-
-### Nested RNN and LoDTensor
-
-An RNN whose step-net includes other RNN operators is known as a *nested RNN*.
-
-For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level.
-
-The following figure illustrates feeding text into the lower level, one sentence per step, and feeding the step outputs into the top level. The final top-level output is about the whole text.
-
-
-
-
-
-```python
-import paddle as pd
-
-W = pd.Variable(shape=[20, 30])
-U = pd.Variable(shape=[20, 30])
-
-W0 = pd.Variable(shape=[20, 30])
-U0 = pd.Variable(shape=[20, 30])
-
-# a is output of some op
-a = some_op()
-
-# chapter_data is a set of 128-dim word vectors
-# the first level of LoD is sentence
-# the second level of LoD is a chapter
-chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)
-
-def lower_level_rnn(paragraph):
- '''
- x: the input
- '''
- rnn = pd.create_rnn_op(output_num=1)
- with rnn.stepnet():
- sentence = rnn.add_input(paragraph, level=0)
- h = rnn.add_memory(shape=[20, 30])
- h.update(
- pd.matmul(W, sentence) + pd.matmul(U, h.pre_state()))
- # get the last state as sentence's info
- rnn.add_outputs(h)
- return rnn
-
-top_level_rnn = pd.create_rnn_op(output_num=1)
-with top_level_rnn.stepnet():
- paragraph_data = top_level_rnn.add_input(chapter_data, level=1)
- low_rnn = lower_level_rnn(paragraph_data)
- paragraph_out = low_rnn()
-
- h = top_level_rnn.add_memory(init=a)
- h.update(
- pd.matmul(W0, paragraph_out) + pd.matmul(U0, h.pre_state()))
- top_level_rnn.add_outputs(h)
-
-# output the last step
-chapter_out = top_level_rnn(output_all_steps=False)
-```
-
-In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
-
-By default, the `RNNOp` will concatenate the outputs from all the time steps.
-If `output_all_steps` is set to False, it will only output the final time step.
-
-
-
-
-
diff --git a/doc/fluid/design/dynamic_rnn/rnn.png b/doc/fluid/design/dynamic_rnn/rnn.png
deleted file mode 100644
index e139e373fe8396782044cfd936fdde624f8c66fe..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dynamic_rnn/rnn.png and /dev/null differ
diff --git a/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot b/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot
deleted file mode 100644
index 1d85ae2617a915ad0ad8288d848b607cc37ad297..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot
+++ /dev/null
@@ -1,75 +0,0 @@
-digraph G {
- chapter [label="chapter"]
-
- subgraph cluster0 {
- label = "paragraph 0"
-
- top_rnn0[label="top rnn step 0" shape=box]
-
- p0 [label="paragraph 0"]
- p1 [label="paragraph 1"]
- }
-
- subgraph cluster1{
- label = "paragraph 1"
-
- top_rnn1[label="top rnn step 1" shape=box]
-
- p2 [label="paragraph 0"]
- p3 [label="paragraph 1"]
- }
-
- subgraph cluster_p0 {
- label = "sentence 0"
-
- low_rnn0 [label="low rnn step 0" shape=box]
- s00 [label="sentence 0"]
- s01 [label="sentence 1"]
-
- low_rnn0 -> s00
- low_rnn0 -> s01
- }
-
- subgraph cluster_p1 {
- label = "sentence 1"
- low_rnn1 [label="low rnn step 1" shape=box]
- s10 [label="sentence 0"]
- s11 [label="sentence 1"]
- low_rnn1 -> s10
- low_rnn1 -> s11
- }
-
- subgraph cluster_p2 {
- label = "sentence 1"
- low_rnn2 [label="low rnn step 0" shape=box]
- s20 [label="sentence 0"]
- s21 [label="sentence 1"]
- low_rnn2 -> s20
- low_rnn2 -> s21
- }
-
- subgraph cluster_p3 {
- label = "sentence 1"
- low_rnn3 [label="low rnn step 1" shape=box]
- s30 [label="sentence 0"]
- s31 [label="sentence 1"]
- low_rnn3 -> s30
- low_rnn3 -> s31
- }
-
-
- chapter -> top_rnn0
- chapter -> top_rnn1
-
- top_rnn0 -> p0
- top_rnn0 -> p1
- top_rnn1 -> p2
- top_rnn1 -> p3
-
-
- p0 -> low_rnn0
- p1 -> low_rnn1
- p2 -> low_rnn2
- p3 -> low_rnn3
-
-}
diff --git a/doc/fluid/design/dynamic_rnn/rnn_2level_data.png b/doc/fluid/design/dynamic_rnn/rnn_2level_data.png
deleted file mode 100644
index 4be81b2430717a6a506342a09fc26899568574c6..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/dynamic_rnn/rnn_2level_data.png and /dev/null differ
diff --git a/doc/fluid/design/dynamic_rnn/rnn_design.md b/doc/fluid/design/dynamic_rnn/rnn_design.md
deleted file mode 100644
index cecfcd3307ae4c4fa603220a360e9e124069fa58..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dynamic_rnn/rnn_design.md
+++ /dev/null
@@ -1,242 +0,0 @@
-# Design of Variable-Length Input Support for RNN
-For learning from variable-length sequences, the existing mainstream frameworks such as tensorflow, pytorch, caffe2, and mxnet all use padding,
-i.e., sequences of different lengths within a mini-batch are padded with zeros to a fixed length before computation.
-
-The existing RNNs in Paddle, including `RecurrentLayerGroup`, already support variable-length sequences without padding. Based on the approach of that module, this doc designs the variable-length sequence support of the refactored framework.
-
-## Background
-Since a tensor must have a definite shape, tensor-based mainstream frameworks have to store variable-length sequences
-as fixed-shape tensors by zero-padding.
-
-Padding is a compromise that frameworks make to handle variable-length sequences. From the user's point of view, the presence of padding is naturally a concern when using RNN-like models,
-which is why there are long discussions in pytorch about supporting variable-length sequences without padding [3].
-
-Because padding incurs extra memory and computation cost, tensorflow and mxnet both use bucketing as an optimization [1][2],
-but whether padding or bucketing, both are an extra burden for users.
-
-Therefore, **native support for variable-length sequences in paddle directly satisfies users' most basic needs for variable-length sequences, which can be regarded as a major advantage among current mainstream platforms**.
-
-However, supporting variable-length sequences requires some modifications to the current framework. The following discusses how to support them with minimal changes.
-
-## Multi-level sequence data format `LODTensor`
-At present, Paddle stores the data of a mini-batch in one-dimensional memory,
-and additionally uses `Argument.sequenceStartPositions` to store the information of each sentence.
-
-Paddle uses `Argument.subSequenceStartPositions` to store 2-level sequence information; sequences with more levels cannot be supported directly.
-
-In order to support the storage of `N-level` sequences, this doc defines the sequence information as the following data structure:
-
-```c++
-std::shared_ptr<std::vector<std::vector<size_t>>> lod_start_pos_;
-```
-
-Or, more explicitly,
-
-```c++
-typedef std::vector<size_t> level_t;
-std::vector<level_t> lod_start_pos;
-```
-
-Each `level_t` here stores the offset information of one granularity (level), consistent with Paddle's current practice.
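-
-As a purely illustrative example (not part of the design itself), the offsets of a 2-level sequence can be written down in Python as follows; the helper names below are hypothetical and only mirror the accessors sketched later:
-
-```python
-# A hypothetical 2-level LoD for a mini-batch of 2 paragraphs.
-# Level 0: paragraph boundaries, counted in sentences -> 3 and 2 sentences.
-# Level 1: sentence boundaries, counted in words.
-lod_start_pos = [
-    [0, 3, 5],
-    [0, 2, 5, 7, 10, 12],
-]
-
-def num_elements(lod, level=0):
-    """Number of sequences at a given level (cf. LODTensor::Elements)."""
-    return len(lod[level]) - 1
-
-def slice_level0(lod, begin, end):
-    """Sentence range covered by paragraphs [begin, end) (cf. LODSlice)."""
-    return lod[0][begin], lod[0][end]
-
-assert num_elements(lod_start_pos, 0) == 2
-assert num_elements(lod_start_pos, 1) == 5
-assert slice_level0(lod_start_pos, 1, 2) == (3, 5)
-```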
-
-To pass sequence information more transparently, we introduce a new tensor called `LODTensor`[4].
-Its tensor-related interfaces are all inherited directly from `Tensor`, with additional sequence-related interfaces added.
-In this way, an ordinary `Op` treats an `LODTensor` simply as a `Tensor`,
-while an `Op` that operates on sequences additionally uses the variable-length sequence interfaces of `LODTensor`.
-
-`LODTensor` is defined as follows:
-
-```c++
-class LODTensor : public Tensor {
-public:
-  size_t Levels() const { return lod_start_pos_->size(); }
-  size_t Elements(int level = 0) const {
-    return (*lod_start_pos_)[level].size();
-  }
-  // slice of level[elem_begin: elem_end]
-  // NOTE low performance in slicing lod_start_pos_.
-  // TODO should call Tensor's Slice.
-  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
-
-  // slice with tensor's data shared with this.
-  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
-
-  // copy other's lod_start_pos_, to share LOD info.
-  // NOTE the LOD info should not be changed.
-  void ShareConstLODFrom(const LODTensor &other) {
-    lod_start_pos_ = other.lod_start_pos_;
-  }
-  // copy other's lod_start_pos_'s content, free to mutate.
-  void ShareMutableLODFrom(const LODTensor &other) {
-    lod_start_pos_ = std::make_shared<std::vector<std::vector<size_t>>>(
-        other.lod_start_pos_->begin(), other.lod_start_pos_->end());
-  }
-
-private:
-  std::shared_ptr<std::vector<std::vector<size_t>>> lod_start_pos_;
-};
-```
-
-Here `lod_start_pos_` uses a `shared_ptr` to reduce the cost of storage and copying.
-`LODTensor` can be regarded as an extension of `Tensor` and is almost completely compatible with the original `Tensor`.
-
-## Framework support
-### Replace the framework's existing `Tensor` usages with `LODTensor`
-To pass `LODTensor` around, many `Tensor`s in the framework need to become `LODTensor`s.
-A simple implementation is to directly **replace all previous `Tensor`s with `LODTensor`, which only requires modifying the `Tensor` creation interface in `pybind.cc`**.
-
-In addition, users may need to be aware of the sequences (for example, visualizing sequences requires parsing the sequences output by the model), so some sequence-manipulation APIs also need to be exposed to the python layer.
-
-### Pass `lod_start_pos` along the Op call chain
-The framework needs to support the following features to implement the passing of `lod_start_pos`:
-
-1. Implement the passing via `shared_ptr`
-   - Ops that do not modify the content of `lod_start_pos` act as consumers
-   - Ops that modify `lod_start_pos` act as producers
-   - By convention, a consumer only copies the `shared_ptr` passed to it
-   - A producer creates its own independent memory to store its own modifications, and exposes a `shared_ptr` to subsequent consumers
-   - Since the passing is implemented by copying a `shared_ptr`, the framework only needs to pass `lod_start_pos` once
-
-2. Be sufficiently transparent to Ops that are unaware of `lod_start_pos`
-3. A producer Op that needs to modify `lod_start_pos` can update its own `lod_start_pos` data in `Run`
-
-The concrete design is divided into the following 3 subsections.
-
-#### Passing of `lod_start_pos`
-
-- When `lod_start_pos` does not need to be modified, call `LODTensor`'s `ShareConstLODFrom` interface to copy it
-- When it needs to be modified, call the `ShareMutableLODFrom` interface, which allocates its own memory to store the modifications
-
-#### Framework transparency
-This passing step needs to be added to the initialization performed before the network runs, and it only needs to be done once. A preliminary plan based on the current framework design is as follows:
-
-- Add an attribute `do_mutate_lod_info` to the Op's `attrs`, which defaults to `false`
-  - Ops that need to modify `lod_start_pos` must set it to `true` when defining their `OpProto`
-- `OperatorBase::InferShape` reads `do_mutate_lod_info` and calls the relevant `LODTensor` methods to copy `lod_start_pos`.
-- Add a member `is_lod_inited{false}` to `OperatorBase` to ensure the passing happens only once
-
-Part of the logic is sketched below:
-
-```c++
-class OperatorBase {
-public:
-  // ...
-  void InferShape() {
-    if (!is_lod_inited) {
-      bool do_mutate_lod_info = GetAttr("do_mutate_lod_info");
-      // find an input having LOD to copy
-      auto lod_input = ValidLODInput();
-      for (auto &output : outputs) {
-        if (do_mutate_lod_info) {
-          output.ShareMutableLODFrom(lod_input);
-        } else {
-          output.ShareConstLODFrom(lod_input);
-        }
-      }
-      is_lod_inited = true;
-    }
-
-    // call op's InferShape
-    // ...
-  }
-
-private:
-  // ...
-  bool is_lod_inited{false};
-};
-```
-
-In this way, the passing of `lod_start_pos` information is completely transparent to the implementation of non-LOD Ops.
-
-#### Updating `lod_start_pos`
-As described in the previous subsection, for an Op that needs to modify `lod_start_pos`, `OperatorBase` allocates a piece of its own memory to store the modification.
-The Op then updates its own `lod_start_pos` in its `Run` implementation,
-and all Ops that depend on its outputs automatically obtain the update through the shared pointer.
-
-## Sorting by length
-After sorting by length, the batch size of successive time steps naturally decreases, and the sorted sequences can be fed directly into the Net for batch computation.
-
-For example, the original input:
-
-```
-origin:
-xxxx
-xx
-xxx
-
--> sorted:
-xxxx
-xxx
-xx
-```
-
-After `SegmentInputs`, there will be 4 time steps; the input of each time step is as follows (arranged vertically):
-
-```
-0 1 2 3
-x x x x
-x x x
-x x
-```
-
-To track how the sequences change before and after sorting, we use
-```c++
-struct SortedSeqItem {
-  void *start{nullptr};
-  void *end{nullptr};
-};
-
-std::vector<SortedSeqItem> sorted_seqs;
-```
-to track the positions of the sequences after sorting, and add a new interface
-
-```c++
-std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
-```
-
-Because the order of the input sequences changes, the following existing interfaces need corresponding modifications:
-
-- InitMemories: the memory needs to be rearranged according to `sorted_seqs`
-- SegmentInputs
-- ConcatOutputs
-
-In addition, since `sorted_seqs` needs to be reused by `RecurrentGradientOp`, it becomes a new output of `RecurrentOp`,
-and is later passed in as an input of `RecurrentGradientOp`.
-
-## InitMemories
-Because the sequence order changes, the order of the elements in the `boot_memories` batch also needs to be rearranged accordingly.
-
-## SegmentInputs
-`SegmentInputs` relies on the information in `sorted_seqs` to slice the original sequences horizontally, in the sorted order, into the inputs of each step.
-
-That is, the following transformation:
-```
-origin:
-xxxx
-xx
-xxx
-
- |
- |
- \ /
- !
-0 1 2 3
-x x x x
-x x x
-x x
-```
-## ConcatOutputs
-`ConcatOutputs` needs to
-
-- restore the output of each time step back to the original input sequence order (so that the order is not shuffled in the Infer phase)
-- concatenate each sequence into a regular mini-batch representation
-
-## References
-1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
-
-2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
-
-3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
-
-4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/doc/fluid/design/dynamic_rnn/rnn_design_en.md b/doc/fluid/design/dynamic_rnn/rnn_design_en.md
deleted file mode 100644
index 9493908f4f73b3e7d91f5f6364a2a3660257d508..0000000000000000000000000000000000000000
--- a/doc/fluid/design/dynamic_rnn/rnn_design_en.md
+++ /dev/null
@@ -1,175 +0,0 @@
-# Variable Length Supported RNN Design
-For learning from variable-length sequences, the existing mainstream frameworks such as tensorflow, pytorch, caffe2, mxnet and so on all use padding.
-
-Sequences of different lengths in a mini-batch are padded with zeros and transformed to the same length.
-
-The existing RNN implementation of PaddlePaddle, `RecurrentLayerGroup`,
-supports variable-length sequences without padding.
-This doc designs Fluid's RNN based on the same idea.
-
-## Multi-level sequence data format `LODTensor`
-At present, Paddle stores the data of one mini-batch in a one-dimensional array.
-
-`Argument.sequenceStartPositions` is used to store the information of each sentence.
-
-In Paddle, `Argument.subSequenceStartPositions` is used to store 2 levels of sequence information, while sequences with more levels cannot be supported.
-
-In order to support the storage of `N-level` sequences, we define the sequence information as the following data structure.
-
-```c++
-std::shared_ptr<std::vector<std::vector<size_t>>> lod_start_pos_;
-```
-
-Or, more explicitly,
-
-```c++
-typedef std::vector<size_t> level_t;
-std::vector<level_t> lod_start_pos;
-```
-Each `level_t` here stores the offset information of one level, consistent with Paddle's current practice.
-
-In order to transmit sequence information more transparently, we introduce a new tensor called `LODTensor`[1].
-Its tensor-related interfaces are all inherited directly from `Tensor`, and sequence-related interfaces are added.
-Thus, an ordinary `Op` treats an `LODTensor` simply as a `Tensor`,
-while an `Op` that operates on sequences additionally uses the variable-length sequence interfaces of `LODTensor`.
-
-The definition of `LODTensor` is as follows:
-
-
-```c++
-class LODTensor : public Tensor {
-public:
-  size_t Levels() const { return lod_start_pos_->size(); }
-  size_t Elements(int level = 0) const {
-    return (*lod_start_pos_)[level].size();
-  }
-  // slice of level[elem_begin: elem_end]
-  // NOTE low performance in slicing lod_start_pos_.
-  // TODO should call Tensor's Slice.
-  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
-
-  // slice with tensor's data shared with this.
-  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
-
-  // copy other's lod_start_pos_, to share LOD info.
-  // NOTE the LOD info should not be changed.
-  void ShareConstLODFrom(const LODTensor &other) {
-    lod_start_pos_ = other.lod_start_pos_;
-  }
-  // copy other's lod_start_pos_'s content, free to mutate.
-  void ShareMutableLODFrom(const LODTensor &other) {
-    lod_start_pos_ = std::make_shared<std::vector<std::vector<size_t>>>(
-        other.lod_start_pos_->begin(), other.lod_start_pos_->end());
-  }
-
-private:
-  std::shared_ptr<std::vector<std::vector<size_t>>> lod_start_pos_;
-};
-```
-Here, `lod_start_pos_` uses a `shared_ptr` to reduce the cost of storage and copying.
-`LODTensor` can be thought of as an extension of `Tensor`, and is almost completely compatible with the original `Tensor`.
-
-## How to support the framework
-### Replace `Tensor` with `LODTensor`
-To implement the passing of `LODTensor`, most `Tensor`s in the framework need to be replaced with `LODTensor`.
-A simple implementation is to directly **replace all previous `Tensor`s with `LODTensor`**, which only requires modifying the `Tensor` creation interface in `pybind.cc`.
-
-In addition, users may need to be aware of the sequences (for example, visualizing sequences requires parsing the sequences output by the model), so some sequence-manipulation APIs also need to be exposed to the python layer.
-
-### Transmit `lod_start_pos` along with the Op call chain
-The framework needs to support the following features to implement the transmission of `lod_start_pos`:
-
-1. Implement the transfer as a `shared_ptr`
-   - Ops that do not modify the contents of `lod_start_pos` act as consumers
-   - Ops that modify `lod_start_pos` act as producers
-   - By convention, a consumer only copies the `shared_ptr` passed to it
-   - A producer creates its own independent memory to store its modifications and exposes a `shared_ptr` to subsequent consumers
-   - Since the transfer is implemented by copying a `shared_ptr`, the framework only needs to pass `lod_start_pos` once.
-
-2. The mechanism is transparent to Ops that are unaware of `lod_start_pos`
-3. A producer Op that needs to modify `lod_start_pos` can update its own `lod_start_pos` data in `Run`
-
-## Sorting by length
-After sorting by length, the batch size of successive time steps naturally decreases, and the sorted sequences can be fed directly into the Net for batch computation.
-
-For example, the original input:
-
-```
-origin:
-xxxx
-xx
-xxx
-
--> sorted:
-xxxx
-xxx
-xx
-```
-
-After `SegmentInputs`, there will be 4 time steps; the input of each time step is as follows (arranged vertically):
-
-```
-0 1 2 3
-x x x x
-x x x
-x x
-```
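-
-The following Python sketch (illustrative only, not the actual `SegmentInputs` implementation) shows the sorting and the per-step segmentation on the same example:
-
-```python
-def sort_by_len(seqs):
-    """Sort sequences by length, descending; keep the original indices."""
-    order = sorted(range(len(seqs)), key=lambda i: len(seqs[i]), reverse=True)
-    return order, [seqs[i] for i in order]
-
-def segment_inputs(sorted_seqs):
-    """Cut sorted sequences into per-step batches; the batch size shrinks over time."""
-    max_len = len(sorted_seqs[0])
-    return [[s[t] for s in sorted_seqs if len(s) > t] for t in range(max_len)]
-
-order, sorted_seqs = sort_by_len(["xxxx", "xx", "xxx"])
-steps = segment_inputs(sorted_seqs)
-# steps -> [['x','x','x'], ['x','x','x'], ['x','x'], ['x']]
-# `order` can later be used (e.g. by ConcatOutputs) to restore the original order.
-```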
-
-In order to track the changes of the sequences before and after sorting, we use
-
-```c++
-struct SortedSeqItem {
-  void *start{nullptr};
-  void *end{nullptr};
-};
-
-std::vector<SortedSeqItem> sorted_seqs;
-```
-to track the positions of the sequences after sorting, and add a new interface
-
-```c++
-std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
-```
-Because the order of the input sequences changes, the following existing interfaces need to be modified:
-
-- InitMemories: the memory needs to be rearranged according to `sorted_seqs`
-- SegmentInputs
-- ConcatOutputs
-
-In addition, because `sorted_seqs` needs to be reused by `RecurrentGradientOp`, it will become a new output of `RecurrentOp`.
-It is later passed in as an input to `RecurrentGradientOp`.
-
-## InitMemories
-Because the sequence order changes, the order of the elements in the `boot_memories` batch also needs to be rearranged accordingly.
-
-## SegmentInputs
-
-`SegmentInputs` relies on the information in `sorted_seqs` to slice the original sequences horizontally, in the sorted order, into the inputs of each step.
-
-The transformation is as follows:
-```
-origin:
-xxxx
-xx
-xxx
-
- |
- |
- \ /
- !
-0 1 2 3
-x x x x
-x x x
-x x
-```
-## ConcatOutputs
-`ConcatOutputs` needs to
-
-- restore the output of each time step back to the original input sequence order (so that the order is not shuffled in the Infer phase)
-- concatenate each sequence into a regular mini-batch representation
-
-## References
-1. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/doc/fluid/design/execution/if_else_op.md b/doc/fluid/design/execution/if_else_op.md
deleted file mode 100644
index 26d140f06db4ecefa86be015eaa731ffddc6910c..0000000000000000000000000000000000000000
--- a/doc/fluid/design/execution/if_else_op.md
+++ /dev/null
@@ -1,51 +0,0 @@
-# The `IfElse` Operator
-
-PaddlePaddle's `IfElse` operator differs from TensorFlow's:
-
-- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas
-- the PaddlePaddle version takes a vector of boolean values as the condition, and instances corresponding to true values go to the true branch, while those corresponding to false values go to the false branch.
-
-## Example
-
-The following PaddlePaddle program shows the usage of the IfElse operator:
-
-```python
-import paddle as pd
-
-x = minibatch([10, 20, 30]) # shape=[None, 1]
-y = var(1) # shape=[1], value=1
-z = minibatch([10, 20, 30]) # shape=[None, 1]
-cond = larger_than(x, 15) # [false, true, true]
-
-ie = pd.ifelse()
-with ie.true_block():
- d = pd.layer.add(x, y)
- ie.output(d, pd.layer.softmax(d))
-with ie.false_block():
- d = pd.layer.fc(z)
- ie.output(d, d+1)
-o1, o2 = ie(cond)
-```
-
-A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch.
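-
-The per-instance routing semantics can be illustrated with a small NumPy sketch, where `true_fn` and `false_fn` stand in for the two blocks (illustrative only, not the operator implementation):
-
-```python
-import numpy as np
-
-def if_else(cond, x, true_fn, false_fn):
-    """Route each instance of x to one branch by cond, then merge in order."""
-    out = np.empty_like(x)
-    out[cond] = true_fn(x[cond])       # instances with cond == True
-    out[~cond] = false_fn(x[~cond])    # instances with cond == False
-    return out
-
-x = np.array([10., 20., 30.])
-cond = x > 15                          # [False, True, True]
-o = if_else(cond, x, lambda a: a + 1, lambda a: a - 1)
-# o -> [ 9., 21., 31.]
-```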
-
-An equivalent C++ program is as follows:
-
-```c++
-namespace pd = paddle;
-
-int x = 10;
-int y = 1;
-int z = 10;
-bool cond = false;
-int o1, o2;
-if (cond) {
- int d = x + y;
- o1 = d;
- o2 = pd::layer::softmax(d);
-} else {
- int d = pd::layer::fc(z);
- o1 = d;
- o2 = d+1;
-}
-```
diff --git a/doc/fluid/design/execution/index_cn.rst b/doc/fluid/design/execution/index_cn.rst
deleted file mode 100644
index ed31b017429d168b2466d8f6b423f48bd5d78d1f..0000000000000000000000000000000000000000
--- a/doc/fluid/design/execution/index_cn.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-执行流程
--------------
-
-.. toctree::
- :maxdepth: 1
-
- switch.md
- if_else_op.md
diff --git a/doc/fluid/design/execution/index_en.rst b/doc/fluid/design/execution/index_en.rst
deleted file mode 100644
index fcf846da348ff0bed707c42718e08314998fbac0..0000000000000000000000000000000000000000
--- a/doc/fluid/design/execution/index_en.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-Execution Process
---------------------------------------
-
-.. toctree::
- :maxdepth: 1
-
- switch.md
- if_else_op.md
diff --git a/doc/fluid/design/execution/switch.md b/doc/fluid/design/execution/switch.md
deleted file mode 100644
index 1c337bd7159b25e594c2f91f9a143b3f4bc3c8e8..0000000000000000000000000000000000000000
--- a/doc/fluid/design/execution/switch.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# Design Doc: Switch
-
-## Background
-
-Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
-
-The following example shows the usage of `fluid.switch`.
-
-```python
-a = fluid.Var(10)
-b = fluid.Var(0)
-
-with fluid.switch() as switch:
- with switch.case(fluid.less_equal(a, 10)):
- fluid.print("Case 1")
- with switch.case(fluid.larger(a, 0)):
- fluid.print("Case 2")
- with switch.default():
- fluid.print("Case 3")
-```
-
-## The Semantics
-
-1. A `switch` control-flow checks cases one-by-one.
-1. The condition of each case is a boolean value, which is a scalar, unlike the `fluid.if_else` control-flow, whose condition can be a vector of boolean values.
-1. It runs the first matched case, or the default case if there is one.
-1. Once it matches a case, it runs the corresponding branch and only that branch. It is as if there were a C `break` at the end of each case.
-
-The above program should print and print only "Case 1".
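-
-In plain Python, these semantics amount to a chain of conditions evaluated in declaration order, with at most one branch executed (an illustrative sketch, not the Fluid implementation; conditions here are plain booleans):
-
-```python
-def run_switch(cases, default=None):
-    """cases: list of (condition, branch_fn); run only the first matching branch."""
-    for cond, branch in cases:
-        if cond:
-            return branch()
-    return default() if default else None
-
-a = 10
-run_switch(
-    [(a <= 10, lambda: print("Case 1")),
-     (a > 0,  lambda: print("Case 2"))],
-    default=lambda: print("Case 3"))
-# prints "Case 1" only
-```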
-
-The implementation of the backward pass of the `switch` control-flow is easier than that of `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branch.
diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst
deleted file mode 100644
index 31b62a5eb3cd9b5b68d51abcd001fd5b8c39a914..0000000000000000000000000000000000000000
--- a/doc/fluid/design/index_cn.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-设计思想
-------------
-
-.. toctree::
- :maxdepth: 1
-
- motivation/index_cn.rst
- execution/index_cn.rst
- concepts/index_cn.rst
- data_type/index_cn.rst
- memory/index_cn.rst
- multi_devices/index_cn.rst
- dynamic_rnn/index_cn.rst
- concurrent/index_cn.rst
- algorithm/index_cn.rst
- network/index_cn.rst
- modules/index_cn.rst
- interface/index_cn.rst
- dist_train/index_cn.rst
diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst
deleted file mode 100644
index 2bfee02ad4626633b08ddff747e2886faf9ba99f..0000000000000000000000000000000000000000
--- a/doc/fluid/design/index_en.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-Design
-------------
-
-.. toctree::
- :maxdepth: 1
-
- motivation/index_en.rst
- execution/index_en.rst
- concepts/index_en.rst
- data_type/index_en.rst
- memory/index_en.rst
- multi_devices/index_en.rst
- dynamic_rnn/index_en.rst
- concurrent/index_en.rst
- algorithm/index_en.rst
- network/index_en.rst
- modules/index_en.rst
- interface/index_en.rst
- dist_train/index_en.rst
diff --git a/doc/fluid/design/interface/index_cn.rst b/doc/fluid/design/interface/index_cn.rst
deleted file mode 100644
index 69a8d9bad4fe88935b9fa87757abf0105ca8eb75..0000000000000000000000000000000000000000
--- a/doc/fluid/design/interface/index_cn.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-多语言接口
-------------
-
-TBD
diff --git a/doc/fluid/design/interface/index_en.rst b/doc/fluid/design/interface/index_en.rst
deleted file mode 100644
index 22abc71f984aa5da7151d5ebf0c3bdbcc69a3624..0000000000000000000000000000000000000000
--- a/doc/fluid/design/interface/index_en.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-Multi-Language Interface
------------------------
-
-TBD
diff --git a/doc/fluid/design/ir/overview.md b/doc/fluid/design/ir/overview.md
deleted file mode 100644
index 83ef97c99efeaf27a27f93f0cd3857c0f1bc812e..0000000000000000000000000000000000000000
--- a/doc/fluid/design/ir/overview.md
+++ /dev/null
@@ -1,185 +0,0 @@
-## Motivation
-
-There is a `gap` between the `Program` defined by
-the user and the `Executable` that can be scheduled
-efficiently on heterogeneous hardware, either locally
-or in a distributed setting.
-
-Usually, the `gap` is bridged by
-
-* A series of transformations with a defined order.
-
-* These transformations usually involve
-`insert, delete, clustering, split, dependency analysis`.
-
-* A simple way to verify and debug each transformation.
-
-* The flexibility to add, remove, or customize transformations to fit
-the requirements of various algorithms (models) and hardware scenarios.
-
-Some other trends also push us toward a better unified pattern.
-
-* The deep learning framework is built around the concept of graphs.
-To leverage tools such as compilation (e.g. TVM and nGraph) or
-cross-framework conversion (e.g. ONNX), we also need an intermediate
-representation that can be connected to the rest of the ecosystem.
-
-
-We need a unified pattern to naturally support the requirements
-described above. The pattern should fit training, inference,
-and other offline serialized model transformations.
-Learning from LLVM and other deep learning frameworks, we draft the
-design below.
-
-
-## Design
-
-### Major Concepts
-
-#### Node
-
-`Node` represents an operation that performs some computation, or
-a variable that is an input or output of an operation.
-
-`Node`s are connected to other `Node`s via inputs and outputs.
-
-Other properties (maybe device placement information) can be added
-to `Node` in the future if it's a
-common requirement of many other `Pass`es. Otherwise, it should live
-in a `Node` wrapper class that is private to some `Pass` or be
-a local member of a `Pass`.
-
-#### Graph
-
-`Graph` contains a list of `Node`s, which are connected to
-each other via inputs and outputs.
-
-TODO: Better definitions for the graph.
-
-`Graph` can also contain `Attribute`s. `Attribute`s
-can be anything. For example, an attribute can be a list of "wrapper"
-nodes. The `wrapper` nodes compose `Node`s and provide
-helper methods for execution or transformation. `Attribute`s
-can also contain other things that describe some properties of
-the `Graph` or the `Graph`'s nodes. `Attribute`s can be passed
-across `Pass`es. However, they should be used with care.
-
-```cpp
-class Graph {
- public:
- explicit Graph(const ProgramDesc &program);
-
- bool Has(const std::string &attr_name) const;
-
- template <typename AttrType>
- AttrType &Get(const std::string &attr_name) const;
-
- template <typename AttrType>
- void Set(const std::string &attr_name, AttrType *attr);
- const std::unordered_set<ir::Node *> &Nodes() const;
-
- // Create a normal variable with non-null VarDesc.
- ir::Node *CreateVarNode(VarDesc *var_desc);
-
- // Create a normal runnable operator with OpDesc.
- ir::Node *CreateOpNode(OpDesc *op_desc);
-
- // Create a control dependency var that connects 2 operations. The
- // var doesn't hold any data. Other than that, it's no different from
- // other var, considering dependency analysis.
- ir::Node *CreateControlDepVar();
-
- // A more free style way of creating a graph node. Mostly use for test
- // or "copy" from another node. Avoid using it if possible.
- ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type);
-
- // Clear all node information of the graph and return the ownership of the
- // nodes.
- std::vector<std::unique_ptr<ir::Node>> ReleaseNodes();
-};
-```
-
-#### Pass
-
-`Pass` represents a transformation of `Graph`. Its input
-is a `Graph` and its output is also a `Graph`. For example,
-a `Pass` can simply print out the `Graph`. A `Pass`
-can also fuse some `Graph`'s `Node`s.
-
-```cpp
-class Pass {
- public:
-
- std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const {
- // Some correctness check.
- auto new_graph = ApplyImpl(std::move(graph));
- // Some correctness check.
- return new_graph;
- }
-
- // Get a reference to the attribute previously set.
- template <typename AttrType>
- AttrType &Get(const std::string &attr_name) const;
-
- // Set a pointer to the attribute. Pass takes ownership of the attribute.
- template <typename AttrType>
- void Set(const std::string &attr_name, AttrType *attr);
-
- // Set a pointer to the attribute. Pass doesn't take ownership. Caller
- // should delete the attribute.
- template <typename AttrType>
- void SetNotOwned(const std::string &attr_name, AttrType *attr);
-
- protected:
- virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const = 0;
-};
-
-// In my_pass.cc
-class MyPass : public Pass {
- protected:
- std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override {
- // do something.
- return graph;
- }
-};
-REGISTER_PASS(my_pass, MyPass)
-.RequirePassAttr("places")
-.RequireGraphAttr("dep_vars");
-
-
-// To use the pass.
-auto my_pass = ir::PassRegistry::Instance().Get("my_pass");
-graph = my_pass->Apply(std::move(graph));
-// Note: to force link my_pass.cc, in the code:
-USE_PASS(my_pass);
-```
-
-#### Optimize
-
-`Optimize` contains a series of `Pass` with defined order.
-`Optimize` transforms a `Graph` that only contains raw
-modeling logic to a `Graph` that can be run efficiently while
-maintaining the original modeling logic.
-
-
-### Optimize Process
-
-* The Program is first converted to a Graph.
-* The Graph goes through a series of `Pass`es.
-* The Graph is transformed from raw model logic to a
-form that is efficient to execute.
-
-```
-// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
-auto graph = Graph(program);
-graph = PassRegistry::Instance().Get("op_fuse_pass")->Apply(std::move(graph));
-// For a more complex Pass, the Optimize Process can provide Pass attributes.
-auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass");
-mem_opt_pass->SetNotOwned("optimize_level", 1);
-graph = mem_opt_pass->Apply(std::move(graph));
-graph = PassRegistry::Instance().Get("multi_devices_pass")->Apply(std::move(graph));
-graph = PassRegistry::Instance().Get("multi_devices_check_pass")->Apply(std::move(graph));
-Executor exe;
-exe.Run(graph);
-
-```
diff --git a/doc/fluid/design/memory/README.md b/doc/fluid/design/memory/README.md
deleted file mode 100644
index 7cf61d089b39041b7a15184e0ea9211d14a66f5e..0000000000000000000000000000000000000000
--- a/doc/fluid/design/memory/README.md
+++ /dev/null
@@ -1,141 +0,0 @@
-# Region-based Heterogeneous Memory Management
-## Design
-
-### Usage
-
-To allocate 4KB CPU memory:
-
-```cpp
-p = memory::Alloc(platform::CPUPlace(), 4*1024);
-```
-
-To allocate 4KB memory on the 3rd GPU:
-
-```cpp
-p = memory::Alloc(platform::CUDAPlace(2), 4*1024);
-```
-
-To free memory and check the so-far used amount of memory on a place:
-
-```cpp
-auto pl = platform::CUDAPlace(0);
-p = memory::Alloc(pl, 4*1024);
-cout << memory::Used(pl);
-memory::Free(pl, p);
-```
-
-### API
-
-In `paddle/memory/memory.h` we have:
-
-```cpp
-namespace memory {
-template <typename Place> void* Alloc(Place, size_t);
-template <typename Place> void Free(Place, void*);
-template <typename Place> size_t Used(Place);
-} // namespace memory
-```
-
-These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`:
-
-```cpp
-template<>
-void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
- return GetCPUBuddyAllocator()->Alloc(size);
-}
-```
-
-and
-
-```cpp
-template<>
-void* Alloc<CUDAPlace>(CUDAPlace p, size_t size) {
- return GetGPUBuddyAllocator(p.id)->Alloc(size);
-}
-```
-
-Similar specializations exist for `Free` and `Used`.
-
-### Implementation
-
-`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletons.
-
-```cpp
-BuddyAllocator* GetCPUBuddyAllocator() {
- static BuddyAllocator* a = NULL;
- if (a == NULL) {
- a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
- }
- return a;
-}
-
-BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
- static BuddyAllocator** as = NULL;
- if (as == NULL) {
- as = new BuddyAllocator*[platform::NumGPUs()];
- for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
- as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
- }
- }
- return as[gpu_id];
-}
-```
-
-#### `BuddyAllocator`
-
-`BuddyAllocator` implements the buddy allocation algorithm. Its constructor takes parameters only related with the algorithm:
-
-```cpp
-BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
- ...
-}
-```
-
-Please be aware that **`BuddyAllocator` always allocates aligned memory**, aligned on 32-byte boundaries, which can hold a `BuddyAllocator::Block` object:
-
-```cpp
-class BuddyAllocator {
- private:
- struct Block {
- size_t size;
- Block *left, *right;
- size_t index; // allocator id
- };
- ...
-};
-```
-
-Because `BuddyAllocator` has the meta-data of each block, it can trace the used memory -- it records the amount returned by `Alloc` and freed in `Free`. In contrast, `CPUAllocator` and `GPUAllocator` do not know the size of a freed memory block and cannot do the tracing.
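-
-The buddy algorithm itself is independent of Paddle. The following compact Python sketch illustrates its core idea -- rounding requests up to powers of two, splitting larger blocks, and coalescing freed buddies -- and is not the actual `BuddyAllocator` implementation:
-
-```python
-class ToyBuddyAllocator:
-    def __init__(self, pool_size):
-        assert pool_size & (pool_size - 1) == 0, "pool size must be a power of two"
-        self.pool_size = pool_size
-        self.free = {pool_size: {0}}                  # block size -> set of free offsets
-
-    def alloc(self, size):
-        size = max(1, 1 << (size - 1).bit_length())   # round up to a power of two
-        cand = size
-        while cand <= self.pool_size and not self.free.get(cand):
-            cand *= 2                                 # look for a larger free block
-        if cand > self.pool_size:
-            return None                               # out of memory
-        offset = self.free[cand].pop()
-        while cand > size:                            # split down to the request
-            cand //= 2
-            self.free.setdefault(cand, set()).add(offset + cand)
-        return offset
-
-    def free_block(self, offset, size):
-        size = max(1, 1 << (size - 1).bit_length())
-        while size < self.pool_size:
-            buddy = offset ^ size                     # the buddy block's offset
-            if buddy in self.free.get(size, set()):
-                self.free[size].remove(buddy)         # coalesce with the free buddy
-                offset = min(offset, buddy)
-                size *= 2
-            else:
-                break
-        self.free.setdefault(size, set()).add(offset)
-
-a = ToyBuddyAllocator(1024)
-p = a.alloc(200)        # request is rounded up to 256
-a.free_block(p, 200)    # coalesces back into the full 1024-byte pool
-```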
-
-#### System Allocators
-
-The `GPUAllocator` and `CPUAllocator` are called *system allocators*. They work as the fallback allocators of `BuddyAllocator`.
-
-## Justification
-
-I got inspiration from Majel and Caffe2, though the above design looks different from both.
-
-### Caffe2
-
-In Caffe2, `Tensor::mutable_data()` allocates the memory. In particular, [`Tensor::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).
-
-There are two implementations of `Context`:
-
-1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
-
-1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
-
-### Majel
-
-In Majel, there are basically two allocator types:
-
-1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
-1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.
-
-However, memory allocation is not via these two allocators. Instead, these two allocators are defined in hidden namespaces.
-
-In Majel there are hidden global variables like:
-
-1. `cpu::SystemAllocator g_cpu_allocator`, and
-1. `vector<gpu::SystemAllocator> g_gpu_allocators(NUM_GPUS)`.
-
-Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.
diff --git a/doc/fluid/design/memory/images/control_flow_graph.png b/doc/fluid/design/memory/images/control_flow_graph.png
deleted file mode 100644
index 3579998e58d07abc50bd3332128d4733a391cb3b..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/memory/images/control_flow_graph.png and /dev/null differ
diff --git a/doc/fluid/design/memory/images/dataflow_equations.png b/doc/fluid/design/memory/images/dataflow_equations.png
deleted file mode 100644
index c10f7f69f4007952e5b0394edaa04efa1cfbb658..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/memory/images/dataflow_equations.png and /dev/null differ
diff --git a/doc/fluid/design/memory/images/deep_learning.png b/doc/fluid/design/memory/images/deep_learning.png
deleted file mode 100644
index 026becc4d94e01e407dacb2a5314a0e5723334ff..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/memory/images/deep_learning.png and /dev/null differ
diff --git a/doc/fluid/design/memory/index_cn.rst b/doc/fluid/design/memory/index_cn.rst
deleted file mode 100644
index c507c638bd1a6eb428175ed2756a6ecfc6cca198..0000000000000000000000000000000000000000
--- a/doc/fluid/design/memory/index_cn.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-内存管理
-------------
-
-.. toctree::
- :maxdepth: 1
-
- memory_optimization.md
diff --git a/doc/fluid/design/memory/index_en.rst b/doc/fluid/design/memory/index_en.rst
deleted file mode 100644
index f7526437a73a09b300f05e138084755f5528b242..0000000000000000000000000000000000000000
--- a/doc/fluid/design/memory/index_en.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Memory Management
--------------------
-
-.. toctree::
- :maxdepth: 1
-
- memory_optimization.md
diff --git a/doc/fluid/design/memory/memory_optimization.md b/doc/fluid/design/memory/memory_optimization.md
deleted file mode 100644
index 285464ada728d8f7a086a26beca6cfa4418e98e4..0000000000000000000000000000000000000000
--- a/doc/fluid/design/memory/memory_optimization.md
+++ /dev/null
@@ -1,217 +0,0 @@
-# Memory Optimization
-
-
-## Problem
-
-In a lecture, Andrew Ng attributes the recent success of AI to a combination of:
-
-- Availability of Big Data
-- Supercomputing power to process this Big Data over very large neural networks
-- Modern algorithms
-
-The following graph shows the details:
-
-![](images/deep_learning.png)
-
-Larger models usually bring better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary for both online and mobile inference.
-
-## Solution
-
-### Basic Strategy
-
-There are some basic strategies to improve memory usage, including in-place operations and memory sharing.
-
-#### In-place Operation
-In a relu activation operator:
-
-$y = \max(x, 0)$
-
-If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x will be the same. In-place operations will save 50% memory occupancy immediately.
-
-#### Memory Sharing
-
-Not all operators support in-place operations. Memory sharing is a more general strategy.
-
-Following is an example:
-
-```
-a = op1(b, c);
-d = op2(a)
-e = op3(d, f)
-```
-
-In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a to a memory pool. Then, variable e can share the memory of variable a from the pool.
-
-
-### Live Variable Analysis
-
-Having some basic strategies is not enough. The prerequisite of memory optimization is to know whether a variable is still "live" after an operation.
-
-In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation.
-
-In compilers, the front end of the compiler translates programs into an intermediate language with an unbounded number of temporary variables. This program must run on a machine with a bounded number of registers. Two temporary variables a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporary variables can fit in few registers; if they don't all fit, the excess temporary variables can be kept in memory.
-
-Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporary variables are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis.
-
-We can learn these techniques from compilers. There are mainly two stages in live variable analysis:
-
-- construct a control flow graph
-- solve the dataflow equations
-
-
-#### Control Flow Graph
-To perform analysis on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statement x can be followed by statement y, there is an edge from x to y.
-
-Following is the flow graph for a simple loop.
-
-![](images/control_flow_graph.png)
-
-#### Dataflow Analysis
-
-Liveness of variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
-
-A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
-
-- Flow Graph Terminology
-
-A flow graph node has out-edges that lead to successor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of successors.
-In the former control flow graph, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}.
-
-- Uses and Defs
-
-An assignment to a variable or temporary defines that variable. An occurrence of a variable on the right-hand side of an assignment (or in other expressions) uses the variable. We can define the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and similarly for the *use* of a variable or graph node. In the former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}.
-
-- Liveness
-
-A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node.
-
-
-The calculation of liveness can be solved by iteration until a fixed point is reached. The following is the recursive formula:
-
-![](images/dataflow_equations.png)
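-
-For reference, the standard textbook form of these liveness equations (cf. Appel's book in the references) is:
-
-$$
-in[n] = use[n] \cup (out[n] - def[n])
-$$
-
-$$
-out[n] = \bigcup_{s \in succ[n]} in[s]
-$$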
-
-### Memory optimization transpiler
-
-Finally, we combine the basic strategies and the liveness analysis techniques learned from compilers to implement our memory optimization transpiler.
-
-#### add in-place attribute
-
-In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator.
-
-
-#### construct control flow graph
-
-Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py) example.
-
-- Block0:
-
-```
-lookup_table
-mul
-...
-while(sub-block idx 1)
-...
-array_to_lod_tensor
-cross_entropy
-...
-while_grad(sub-block idx 2)
-read_from_array
-array_to_lod_tensor
-...
-```
-
-- Block1
-
-```
-read_from_array
-read_from_array
-...
-write_to_array
-increment
-write_to_array
-less_than
-```
-
-- Block2
-
-```
-read_from_array
-increment
-...
-write_to_array
-write_to_array
-```
-
-We can traverse all the operators and variables in the ProgramDesc to build a control flow graph.
-
-```python
-class ControlFlowGraph(object):
-    def __init__(self, program):
-        self._successors = defaultdict(set)
-        self._predecessors = defaultdict(set)
-        self._uses = defaultdict(set)
-        self._defs = defaultdict(set)
-        self._live_in = defaultdict(set)
-        self._live_out = defaultdict(set)
-        self._program = program
-
-    def build(self):
-        pass
-
-    def dataflow_analysis(self):
-        pass
-
-    def memory_optimization(self):
-        pass
-
-    def get_program(self):
-        return self._program
-```
-
-#### Make dataflow analysis
-
-We follow the approach used in compilers and solve the dataflow equations to get the liveness of every variable. If the live-in of an operator node differs from its live-out, then we can apply memory sharing.
-
-For example:
-
-```
-a = op1(b, c);
-d = op2(a)
-e = op3(d, f)
-```
-
-The dataflow analysis result is:
-
-```
-live_in(op1) = {b, c, f}
-live_out(op1) = {a, f}
-
-live_in(op2) = {a, f}
-live_out(op2) = {d, f}
-
-live_in(op3) = {d, f}
-live_out(op3) = {}
-```
-
-After op1, variables b and c are no longer live and their memory can be released to the pool; after op2, variable a can be released; after op3, variables d and f can be released.
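-
-As a sanity check, the liveness sets above can be reproduced with a tiny iterative solver (a Python sketch of the textbook algorithm, not the transpiler code):
-
-```python
-ops = [  # (name, use set, def set), in program order
-    ("op1", {"b", "c"}, {"a"}),
-    ("op2", {"a"}, {"d"}),
-    ("op3", {"d", "f"}, {"e"}),
-]
-
-live_in = {name: set() for name, _, _ in ops}
-live_out = {name: set() for name, _, _ in ops}
-
-changed = True
-while changed:                      # iterate until a fixed point is reached
-    changed = False
-    for i, (name, use, defs) in reversed(list(enumerate(ops))):
-        out = set(live_in[ops[i + 1][0]]) if i + 1 < len(ops) else set()
-        new_in = use | (out - defs)
-        if out != live_out[name] or new_in != live_in[name]:
-            live_out[name], live_in[name] = out, new_in
-            changed = True
-
-assert live_in["op1"] == {"b", "c", "f"} and live_out["op1"] == {"a", "f"}
-assert live_in["op2"] == {"a", "f"} and live_out["op2"] == {"d", "f"}
-assert live_in["op3"] == {"d", "f"} and live_out["op3"] == set()
-```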
-
-#### memory sharing policy
-
-A memory pool will be maintained during the memory optimization stage. Each operator node will be scanned to determine whether memory optimization can be applied. If an operator satisfies the requirement, the following policy will be taken to handle its input/output variables.
-
-```
-if op.support_inplace():
- i --> pool
- pool --> o
-else:
- pool --> o
- i --> pool
-```
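-
-A Python sketch of this policy (illustrative only; variables are reduced to names and sizes are ignored) could look like this:
-
-```python
-pool = set()   # names of variables whose memory can be reused
-
-def share_memory(op, dead_inputs):
-    """Apply the sharing policy to one operator.
-
-    dead_inputs: inputs of `op` that are not live after `op` (from liveness analysis).
-    Returns a dict mapping each output to the variable whose memory it reuses.
-    """
-    reuse = {}
-    if op["support_inplace"]:
-        pool.update(dead_inputs)                   # i --> pool
-        for o in op["outputs"]:                    # pool --> o
-            if pool:
-                reuse[o] = pool.pop()
-    else:
-        for o in op["outputs"]:                    # pool --> o
-            if pool:
-                reuse[o] = pool.pop()
-        pool.update(dead_inputs)                   # i --> pool
-    return reuse
-
-op2 = {"support_inplace": False, "outputs": ["d"]}
-print(share_memory(op2, dead_inputs={"a"}))        # {} : pool was empty, 'a' enters the pool
-op3 = {"support_inplace": False, "outputs": ["e"]}
-print(share_memory(op3, dead_inputs={"d", "f"}))   # {'e': 'a'} : e reuses a's memory
-```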
-
-
-
-## Reference
-
-- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5)
-- Modern compiler implementation in ML, by Andrew W. Appel
-- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html)
diff --git a/doc/fluid/design/modules/backward.md b/doc/fluid/design/modules/backward.md
deleted file mode 100644
index 20fda7a98f514a3f1c1c2d0ba7447ec954b21d5a..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/backward.md
+++ /dev/null
@@ -1,158 +0,0 @@
-# Backward Building
-
-## Motivation
-
-In neural networks, most models are currently solved by the backpropagation algorithm (known as **BP**). Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So a mechanism is required by the framework which can complete the model's backward part automatically according to the given forward part.
-
-When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes the gradients of its corresponding `op`'s outputs and calculates the gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op` and then strings them together in the reverse order of the forward part. In this way, gradients spread from the end to the beginning of the model, in other words, from the loss to the parameters.
-
-## Challenges
-
-The motivation of backward building is apparent. However, implementing it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested. It means that the `op`s and `variable`s are scattered across different blocks rather than all being gathered in a single graph. Our backward building algorithm shall visit blocks in recursive order and be able to insert `grad_op`s and newly created `variable`s into the right places.
-
-## Usage
-
-Although the whole algorithm is comprised of many functions, only one is exposed as API:
-
-```python
-def append_backward(loss, parameter_list=None, no_grad_set=None):
- """
- Append backward part to main_program
-
- Args:
- loss(Variable): The variable generated by the cost function.
- parameter_list(list): Parameters that need to be updated by optimizers.
- If None, it means all parameters need to be updated.
-
- no_grad_set(set): Variables that have no gradients in Block 0.
- If None, the set will be generated inside the function and
- contains all variables with `stop_gradient=True` from all blocks.
-
- Return:
- (list[Variable]): list of (parameter, gradient) pairs.
- """
-```
-
-By invoking this API, the framework appends the backward part of the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient is generated and backpropagation starts. `parameter_list` marks all parameters that need updating. If it is `None`, all parameters will be updated by optimizers. `no_grad_set` marks variables without gradients. If all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run.
-
-This API will be invoked automatically before optimizer building.
-As a result, in most cases, users do not need to invoke the API by themselves to append the backward part.
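-
-For completeness, a minimal explicit invocation might look like the snippet below; the layer names follow the Fluid Python API at the time of writing and may differ across versions:
-
-```python
-import paddle.fluid as fluid
-
-x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-y_pred = fluid.layers.fc(input=x, size=1)
-loss = fluid.layers.mean(fluid.layers.square_error_cost(input=y_pred, label=y))
-
-# Usually an optimizer's minimize() calls this internally; here we call it directly.
-params_grads = fluid.backward.append_backward(loss)
-for param, grad in params_grads:
-    print(param.name, grad.name)
-```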
-
-## Implementation
-
-The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables.
-
-### Creating `grad_op`s
-
-The creating of `grad_op`s is implemented by:
-
-```python
-def _append_backward_ops_(target,
- block,
- target_block,
- no_grad_dict,
- grad_to_var):
- """
- Create all grad ops, and insert them into given block
-
- Args:
- target(Variable): the target variable of forward pass
- block(Block): the block where forward ops are
- target_block(Block): the block which is going to hold new generated grad ops
- no_grad_dict(dict):
- key(int) block index
- val(set) a set of varibale names. These varibales have no gradient
- grad_to_var(dict)(output argument):
- key(str): grad variable name
- val(str): corresponding forward variable name
- """
-```
-
-Given a `block`, the function traverses all `op`s in this block in reverse order, gets the corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, and then appends it to `target_block`.
-
-However, some specific `op`s (e.g. `while_op`, `if_else_op`) can hold their own sub-blocks. Since these sub-blocks contain `op`s as well, the `grad_op` creation should be recursive.
-
-During the reverse traversal, we check each `op` whether it has an attribute named `sub_block`. If so, it means there is a sub-block and we need to deal with it first. After creating a new block whose father is the one in `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to parameter `target_block` and the one in `op`'s attribute to `block`. The *pseudo-code* shows this process:
-
-```
-******* pseudo-code ********
-for op in reversed(block.ops):
- if op has an attribute named 'sub_block':
- Get the sub-block(`s_block`) from op's attribute.
- Create a new block(`grad_s_block`), whose father is `s_block`.
- Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block`
-
- Invoke `core.get_grad_op_desc()` to get op's grad_op.
- Insert the name correspondences between variables and their gradients of the grad_op into grad_to_var
- Assign grad_s_block to grad_op as its 'sub_block' attribute.
- Append grad_op to current target_block.
-```
-
-The first invocation of `_append_backward_ops_()` is initiated by `append_backward()`, in which the parameters `block` and `target_block` are both assigned the root block (the block with index 0).
-
-### Corner Cases of `grad_op` Creating
-
-In the previous section, we show the regular process of `grad_op` creation. However, in some corner cases, the conventional algorithm is not enough to get the correct result, and additional handling is required. These additional processes run after the algorithm mentioned above and make some special adjustments to its output `grad_op`s.
-
-#### Shared Variables
-
-If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op` in the backward pass. To make the gradient result the sum of all `grad_op`s' outputs instead of that of the last one run, we assign each output a temporary variable and then add a `sum_op` to add them up.
-
-For debugging convenience, if the final gradient name is `w@GRAD`, its corresponding temporary variables will be named `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`, and so on.
-
-See function `_addup_repetitive_outputs_` in `backward.py` for implementation details.
-
-#### No Gradient Variables
-
-In our framework, variables can be marked as *no_gradient*, which means that the gradient of the variable is unnecessary and can be considered as zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in the backward pass.
-
-Another situation is that all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered as zeros. Since `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all the gradient inputs are zeros. Therefore, the `grad_op` can also be skipped.
-
-It should be noted that all these zero gradients still need to be created and initialized by something; otherwise, the following `grad_op`s that take these gradients as inputs risk using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros.
-
-These features are implemented in the function `_remove_no_grad_branch_`. It checks newly created `grad_op`s one by one, removes those that can be skipped, and inserts `fill_zeros_like_op` when necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict`, or generate it on the fly by scanning all variables' `no_gradient` attribute (True or False).
-
-### Creating Backward Variables
-
-Up to now, we have completed all the creation and adjustment of `grad_op`s. However, backward variables have not been created yet; they are only represented by the `grad_op`s' input and output arguments. The backward variable creation will be done by:
-
-```python
-def _append_backward_vars_(block,
- start_op_idx,
- grad_to_var,
- grad_info_map):
- """
- Create new variables required by backward pass.
-
- Args:
- block(Block): the block where new variables will be created
- start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
- grad_to_var(dict):
- key(str): grad variable name
- val(str): corresponding forward variable name
- In most cases, this dict is generated by _append_backward_ops_()
- grad_info_map(dict)(output argument):
- key(str): forward variable name
- val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
- """
-```
-
-Given a `block`, this function traverses all the `grad_op`s in it (the argument `start_op_idx` indicates where the `grad_op` sequence starts) and creates all the uncreated outputs. The *pseudo-code* shows this process:
-
-```
-for op in block.ops[start_op_idx : ]:
-
- if op has an attribute named 'sub_block':
- Get the sub-block(`s_block`) from op's attribute.
- Invoke _append_backward_vars_(), with `block=s_block`
-
- for var_name in op.all_output_names():
- if block.has_var_recursive(var_name) or var_name is the name of empty variable:
- continue
- create a new variable named 'var_name' in block
- if grad_to_var.has_key(var_name):
- set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name, block)
-
- do op's var type inference
- do op's shape inference
-```
diff --git a/doc/fluid/design/modules/batch_norm_op.md b/doc/fluid/design/modules/batch_norm_op.md
deleted file mode 100644
index e451ffcc73b5de2b911e1c6de54b42a5d1d54c37..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/batch_norm_op.md
+++ /dev/null
@@ -1,134 +0,0 @@
-# Batch Normalization
-
-## What is batch normalization
-
-Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and makes the data distribution easier for the next layer's training.
-
-The principle of batch normalization can be summarized into a simple function:
-
-```
-y = (x - E[x]) / STD[x] * scale + bias
-```
-
-`x` is a batch of output data of a certain layer. `E[x]` and `STD[x]` are the mean and standard deviation of `x`, respectively. `scale` and `bias` are two trainable parameters. Training a batch normalization layer amounts to learning the best values of `scale` and `bias`.
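-
-The function above can be written out in a few lines of NumPy (a reference sketch of the math only, not the `batch_norm_op` kernel):
-
-```python
-import numpy as np
-
-def batch_norm_forward(x, scale, bias, running_mean, running_var,
-                       momentum=0.99, epsilon=1e-6):
-    """x: [batch, channels]; returns y and the updated running statistics."""
-    mean = x.mean(axis=0)                      # E[x] over the batch
-    var = x.var(axis=0)                        # Var[x] over the batch
-    y = (x - mean) / np.sqrt(var + epsilon) * scale + bias
-    running_mean = running_mean * momentum + (1.0 - momentum) * mean
-    running_var = running_var * momentum + (1.0 - momentum) * var
-    return y, running_mean, running_var
-
-x = np.random.randn(32, 8).astype('float32')
-y, rm, rv = batch_norm_forward(x, np.ones(8), np.zeros(8), np.zeros(8), np.ones(8))
-```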
-
-In our design, we use a single operator(`batch_norm_op`) to implement the whole batch normalization in C++, and wrap it as a layer in Python.
-
-## Differences with normal operators
-
-`batch_norm_op` is a single operator. However, there are a few differences between `BatchNormOp` and normal operators, which we shall take into consideration in our design.
-
-1. `batch_norm_op` shall behave differently in training and inferencing. For example, during inferencing, there is no batch data and it's impossible to compute `E[x]` and `STD[x]`, so we have to use an `estimated_mean` and an `estimated_variance` instead. This requires our framework to be able to inform operators of the current running mode (training/inferencing), so that operators can switch their behaviors.
-
-2. `batch_norm_op` shall have the ability to maintain `estimated_mean` and `estimated_variance` across mini-batches. In each mini-batch, `estimated_mean` is updated by the following equations:
-
-```
-if batch_id == 0
- estimated_mean = E[x]
-else
- estimated_mean = estimated_mean * momentum + (1.0 - momentum_) * E[x]
-```
-
-The update of `estimated_variance` is similar. `momentum` is an attribute that controls how fast `estimated_mean` is updated.
-
-## Implementation
-
-Batch normalization is designed as a single operator in C++, and then wrapped as a layer in Python.
-
-### C++
-
-As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attributes and compute kernels.
-
-#### Inputs
-
-- `x`: The input data, which is generated by the previous layer.
-- `estimated_mean`: The estimated mean of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `E[x]`.
-- `estimated_var`: The estimated standard deviation of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `STD[x]`.
-- `scale`: trainable parameter 'scale'
-- `bias`: trainable parameter 'bias'
-
-#### Outputs
-
-- `y`: The output data.
-- `batch_mean`: The mean value of batch data.
-- `batch_var`: The standard deviation value of batch data.
-- `saved_mean`: Updated `estimated_mean` with current batch data. It's supposed to share the memory with input `estimated_mean`.
-- `saved_var`: Updated `estimated_var` with current batch data. It's supposed to share the memory with input `estimated_var`.
-
-#### Attributes
-
-- `is_infer`: *bool*. If true, run `batch_norm_op` in inferencing mode.
-- `use_global_est`: *bool*. If true, use `saved_mean` and `saved_var` instead of `E[x]` and `STD[x]` in training.
-- `epsilon`: *float*. The epsilon value to avoid division by zero.
-- `momentum`: *float*. Factor used in `estimated_mean` and `estimated_var` updating. The usage is shown above.
-
-#### Kernels
-
-The following graph shows the training computational process of `batch_norm_op`:
-
-*(figure: `images/batch_norm_op_kernel.png`, the training kernel of `batch_norm_op`)*
-
-cuDNN provides APIs that cover this whole series of computations, so we can use them in our GPU kernel.
-
-### Python
-
-`batch_norm_op` is wrapped as a layer in Python:
-
-```python
-def batch_norm_layer(net,
- input,
- output,
- scale,
- bias,
- use_global_est = False,
- epsilon = 1e-6,
- momentum = 0.99):
- mean_cache = scope.new_var(name = 'estimated_mean', trainable = False)
- var_cache = scope.new_var(name = 'estimated_var', trainable = False)
- batch_mean = scope.new_var(name = 'batch_mean')
- batch_var = scope.new_var(name = 'batch_var')
- batch_norm_op = Operator('batch_norm_op',
- x = input,
- estimated_mean = mean_cache,
- estimated_var = var_cache,
- scale = scale,
- bias = bias,
- y = output,
- batch_mean = batch_mean,
- batch_var = batch_var,
- saved_mean = mean_cache,
- saved_var = var_cache,
- is_infer = False,
- use_global_est = use_global_est,
- epsilon = epsilon,
- momentum = momentum)
- net.append_op(batch_norm_op)
- return output
-```
-
-Because the Python API has not been finalized, the code above should be regarded as pseudo code. There are a few key points to note:
-
-1. `estimated_mean` and `estimated_var` are assigned the same variables as `saved_mean` and `saved_var` respectively, so they share the same memory. The output mean and variance values (`saved_mean` and `saved_var`) of a certain batch will be the inputs (`estimated_mean` and `estimated_var`) of the next batch.
-
-2. `is_infer` decides whether `batch_norm_op` runs in training mode or inference mode. However, a network may contain both training and inference parts, and users may switch `batch_norm_op`'s running mode in a Python `for` loop like this:
-
-```python
-for pass_id in range(PASS_NUM):
- # ...
- net.train() # run training model
- if pass_id % 100 == 0:
- net.infer(test_image) # run inferencing model
- # ...
-```
-
-`is_infer` is an attribute. Once an operator is created, its attributes cannot be changed. This suggests that we should maintain two `batch_norm_op`s in the model, one whose `is_infer` is `True` (call it `infer_batch_norm_op`) and the other whose `is_infer` is `False` (call it `train_batch_norm_op`). They share all parameters and variables but are placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it forks into two branches: one goes through `train_batch_norm_op` and the other through `infer_batch_norm_op`:
-
-
-*(figure: `images/batch_norm_fork.png`, the network forking before `batch_norm_op`)*
-
-Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will be duplicated.
-
-When the net runs in training mode, the end of the left branch will be set as the running target, so the dependency tracking process will ignore the right branch automatically. When the net runs in inference mode, the process is reversed.
-
-How to set a target is related to the Python API design, so I will leave it open here for further discussion.
diff --git a/doc/fluid/design/modules/evaluator.md b/doc/fluid/design/modules/evaluator.md
deleted file mode 100644
index de9605b0e67a035ab1ef1e4cafbe838f83bc5807..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/evaluator.md
+++ /dev/null
@@ -1,58 +0,0 @@
-# Evaluator Design
-
-## Problem Statement
-
-During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator-based framework design, the data passes through the network pipeline batch by batch. As a result, inside an operator we only calculate the metrics for one mini-batch. Thus, we need to provide a mechanism to calculate the metrics over every N passes/batches the user wants.
-
-## Evaluator Design
-Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
-
-1. Initialize the metric state and add it into the block.
-
-2. Calculate the concerned metrics for every mini-batch. A single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy of one mini-batch of data per run.
-
-
-3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
-
-## Implementation
-This design is shown in the Python API.
-Each metric operator needs to calculate the metric statistics and return the batch-aware states. The Python side is responsible for accumulating the states over each pass.
-
-
-```python
-class Evaluator(object):
- """
- Evaluator Base class.
- """
- def __init__(self, name, **kwargs):
- """
- Different evaluators may have different metric states. E.g., Accuracy needs two variables, the total and correct sample counts.
- AUC needs four variables, `true_positives`,
- `true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create the variables it needs and append them to main_program.
-
- The initialization of Evaluator should be responsible for:
- create metric states and append to the main_program
- """
- pass
-
- def _update_ops(self, input, label, **kwargs):
- """
- Add mini-batch evaluator calculation operators to the main_program.
- Add increment operator to accumulate the metric states.
- """
-
-
- def reset(self, executor, reset_program=None):
- """
- Reset metric states at the beginning of each pass or at a user-specified batch number.
- Execute the reset_program to reset the states.
- """
-
-
- def eval(self, executor, eval_program=None):
- """
- Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
- Execute the eval_program and return the result.
- """
- return eval_result
-```
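-
-As a rough plain-Python illustration of the reset/update/eval cycle above (independent of the operator framework; all names here are made up for the example):
-
-```python
-class AccuracyAccumulator(object):
-    """Accumulates correct/total counts across mini-batches."""
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        # Analogous to Evaluator.reset(): clear the metric states.
-        self.correct = 0
-        self.total = 0
-
-    def update(self, predictions, labels):
-        # Analogous to _update_ops(): per mini-batch statistics.
-        self.correct += sum(1 for p, l in zip(predictions, labels) if p == l)
-        self.total += len(labels)
-
-    def eval(self):
-        # Analogous to Evaluator.eval(): merge mini-batch statistics.
-        return float(self.correct) / max(self.total, 1)
-```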
diff --git a/doc/fluid/design/modules/images/batch_norm_fork.dot b/doc/fluid/design/modules/images/batch_norm_fork.dot
deleted file mode 100644
index 4bc47713cba2cb23f1b34fffe6426ef10ac3a9df..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/images/batch_norm_fork.dot
+++ /dev/null
@@ -1,25 +0,0 @@
-digraph ImageBatchNormForkGraph {
- subgraph cluster_before {
- Prev [label="...", shape=plaintext];
- Rnn [label="rnn_op", shape=box];
- BatchNorm [label="batch_norm_op", shape=box];
- Fc [label="fc_op", shape=box];
- After [label="...", shape=plaintext];
- Prev -> Rnn -> BatchNorm -> Fc -> After;
- label="original";
- }
-
- subgraph cluster_after {
- Prev2 [label="...", shape=plaintext];
- Rnn2 [label="rnn_op", shape=box];
- BatchNorm2_1 [label="train_batch_norm_op", shape=box];
- BatchNorm2_2 [label="infer_batch_norm_op", shape=box];
- Fc2_1 [label="fc_op", shape=box];
- Fc2_2 [label="fc_op", shape=box];
- After2_1 [label="...", shape=plaintext];
- After2_2 [label="...", shape=plaintext];
- Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1;
- Rnn2 -> BatchNorm2_2 -> Fc2_2 -> After2_2;
- label="forked";
- }
-}
diff --git a/doc/fluid/design/modules/images/batch_norm_fork.png b/doc/fluid/design/modules/images/batch_norm_fork.png
deleted file mode 100644
index aded62bce5bc268b7a3ef4dc96c89fe21d6ea955..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/modules/images/batch_norm_fork.png and /dev/null differ
diff --git a/doc/fluid/design/modules/images/batch_norm_op_kernel.png b/doc/fluid/design/modules/images/batch_norm_op_kernel.png
deleted file mode 100644
index a99ce81ff3bf42880ebbd6a1297de3bf038e09b2..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/modules/images/batch_norm_op_kernel.png and /dev/null differ
diff --git a/doc/fluid/design/modules/images/feed_forward.png b/doc/fluid/design/modules/images/feed_forward.png
deleted file mode 100644
index d312371a04c26aa6cd196e0bd1f51becb425180b..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/modules/images/feed_forward.png and /dev/null differ
diff --git a/doc/fluid/design/modules/images/feed_forward_regularized.png b/doc/fluid/design/modules/images/feed_forward_regularized.png
deleted file mode 100644
index 677e99bfd9f8e72ed9fe4b27127af2ced202f447..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/modules/images/feed_forward_regularized.png and /dev/null differ
diff --git a/doc/fluid/design/modules/images/l1_regularization.png b/doc/fluid/design/modules/images/l1_regularization.png
deleted file mode 100644
index e1b9c7a44f94dc027598a98da93ddb8133190972..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/modules/images/l1_regularization.png and /dev/null differ
diff --git a/doc/fluid/design/modules/images/l2_regularization.png b/doc/fluid/design/modules/images/l2_regularization.png
deleted file mode 100644
index d5c2fcbc2ccae75ad083162e5a2dceb0210be298..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/modules/images/l2_regularization.png and /dev/null differ
diff --git a/doc/fluid/design/modules/images/loss_equation.png b/doc/fluid/design/modules/images/loss_equation.png
deleted file mode 100644
index 14212ec8d36c803de96bde8a9a4b5591bd20434e..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/modules/images/loss_equation.png and /dev/null differ
diff --git a/doc/fluid/design/modules/index_cn.rst b/doc/fluid/design/modules/index_cn.rst
deleted file mode 100644
index b25783f0f5120991c29ba31b7b512bd4c183eecf..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/index_cn.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Code Structure and Important Modules
---------------------------------------
-
-.. toctree::
- :maxdepth: 1
-
- backward.md
- python_api.md
- regularization.md
- infer_var_type.md
- optimizer.md
- prune.md
- register_grad_op.md
- net_op_design.md
diff --git a/doc/fluid/design/modules/index_en.rst b/doc/fluid/design/modules/index_en.rst
deleted file mode 100644
index 2108156e080996916f2650448f0a56f998757204..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/index_en.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Code Structure and Important Modules
--------------------------------------
-
-.. toctree::
- :maxdepth: 1
-
- backward.md
- python_api.md
- regularization.md
- infer_var_type.md
- optimizer.md
- prune.md
- register_grad_op.md
- net_op_design.md
diff --git a/doc/fluid/design/modules/infer_var_type.md b/doc/fluid/design/modules/infer_var_type.md
deleted file mode 100644
index d9d5397becba2ef1806d9341cd49cd9aabbf4a6a..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/infer_var_type.md
+++ /dev/null
@@ -1,78 +0,0 @@
-# Design Doc: InferVarType
-
-## The Problem Posed
-
-The variables in our design can hold various types, such as `LoDTensor` and `SelectedRows`. An operator should be able to infer the variable types of its outputs.
-
-For example, a `lookup table` operator takes two `LoDTensor`; one is a float tensor as the embedding table, the other is an int tensor as word ID. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs and will generate a `LoDTensor` if any of its inputs is `LoDTensor`, otherwise, the `sum` operator will generate `SelectedRows` as its output.
-
-The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameters) or be inferred by the operator at compile time.
-
-## Proposed Solution
-
-The `InferVarType` is a compile-time function which is registered to each operator. The interface of that function is:
-
-
-```c++
-using InferVarTypeFN = std::function<
- void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>;
-```
-
-It takes an operator description as its input, infers the output variable types, and stores them in the block description.
-
-The `InferVarTypeFN` will be registered in `OpInfo` as the `infer_var_type_` field. The `OpInfo` should be
-
-```cpp
-struct OpInfo {
- InferVarTypeFN infer_var_type_;
- ...
-};
-```
-
-The default `InferVarType` sets the output type to `LoDTensor`. It is returned by `GetInferVarType()` when no custom function is registered.
-
-```cpp
-void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) {
- // set the output type of variable as `LoDTensor`.
- // ...
-}
-
-struct OpInfo {
- InferVarTypeFN infer_var_type_;
- InferVarTypeFN GetInferVarType() const {
- if (infer_var_type_) {
- return infer_var_type_;
- } else {
- return DefaultInferVarType;
- }
- }
-};
-```
-
-## Register InferVarType
-
-We provide a thin base class for registering an `InferVarTypeFN`. Using a base class eases the implementation of the registry since we can detect whether a registry entry is an `InferVarTypeFN` or not.
-
-```cpp
-class VarTypeInferer {
-public:
- virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0;
-};
-```
-
-Operator developers can write a specialized `VarTypeInferer` as follows.
-
-```cpp
-class SpecialVarTypeInferer : public VarTypeInferer {
-public:
- virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const {
- // .. own logic
- }
-};
-```
-
-Then users can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`.
-
-```cpp
-REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...);
-```
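-
-The same dispatch idea can be sketched in Python; the dictionary `VAR_TYPE_INFERERS` and the op names below are purely illustrative:
-
-```python
-# Map from operator type to its var-type inference function.
-VAR_TYPE_INFERERS = {}
-
-def default_infer_var_type(op_desc, block):
-    # Default behavior: every output variable is a LoDTensor.
-    for name in op_desc["outputs"]:
-        block[name] = "LOD_TENSOR"
-
-def register_var_type_inferer(op_type, fn):
-    VAR_TYPE_INFERERS[op_type] = fn
-
-def infer_var_type(op_desc, block):
-    fn = VAR_TYPE_INFERERS.get(op_desc["type"], default_infer_var_type)
-    fn(op_desc, block)
-
-# A specialized inferer, e.g. for a lookup-table gradient op.
-register_var_type_inferer(
-    "lookup_table_grad",
-    lambda op_desc, block: block.update(
-        {name: "SELECTED_ROWS" for name in op_desc["outputs"]}))
-```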
diff --git a/doc/fluid/design/modules/net_op_design.md b/doc/fluid/design/modules/net_op_design.md
deleted file mode 100644
index e64ac2fb1c6898bfeb883250347da3d9a4757b97..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/net_op_design.md
+++ /dev/null
@@ -1,250 +0,0 @@
-# Network Design
-
-`Network` is the container and controller of a set of operators;
-users can build a real network from a `NetDesc`, which is a protobuf message,
-and use `Network.Run()` to run all the operators in the network.
-
-A network object knows all Operators belonging to this network. Variables,
-which are inputs and outputs of these operators,
-are created and managed by a hierarchy of Scope objects.
-
-## API
-
-### Net
-To make the `Network` extendable, a base class is defined like this
-
-```c++
-// operator's index stored in a network.
-typedef int OpIndex;
-
-// The minimum a network should be implemented.
-class Net {
- public:
- // run all the operators and return success(true) or not, with all the
- // variables are located in `scope`. `context` describes the detail execution
- // environment for ops. `begin` and `end` specify the scope of `ops_` to run,
- // If no positive indexes are provided, all operators in `ops_` will run.
- virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1,
- OpIndex end = -1) const = 0;
-
- // Add an Operator according to `def`.
- virtual OpIndex AddOp(const proto::OpDef &def) = 0;
-
- // Add optimizer operators according to `attrs`.
- virtual Error AddOptimizerOps(const OptAttrs &attrs) = 0;
-
- // Add backward operators.
- virtual Error AddBackwardOps() = 0;
-
- // Infer the shapes of variables required by operators in the network. The
- // `scope` will be mutated according to the inferred shapes.
- virtual Error InferShape(Scope *scope) = 0;
-
- static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
-};
-```
-
-All network implementations should build networks from a protobuf message which
-describes the structure of a real network; `Run` method should be implemented by
-all implementations to offer a universal method to forward or backward compute a network.
-
-`Net::Create` is a method of factory pattern and can be implemented like
-
-```c++
-std::unique_ptr<Net> Net::Create(const NetDesc& def) {
- switch (def.model_type()) {
- case NN:
- return std::unique_ptr<Net>(new Network(def));
- case Recursive:
- return std::unique_ptr<Net>(new RecursiveNet(def));
- case Recurrent:
- return std::unique_ptr<Net>(new RecurrentNet(def));
- }
- return nullptr;
-}
-```
-
-Network is designed as the container of operators. To make it more extendable,
-we decouple it from the related variable resources.
-
-`Run(Scope* scope)` takes the scope as an argument so that it can run in different scopes.
-
-Finally, `Net` can be used as follows
-
-```c++
-Scope default_scope;
-OpContext default_context;
-auto net = Net::Create(def);
-
-if (net) {
- net->Run(&default_scope, &default_context);
-}
-```
-
-### `PlainNet` as a simple implementation of `Net`
-
-A very basic implementation is as follows. All it does is simply run every operator in sequence.
-
-```c++
-class PlainNet : public Net {
- public:
- // Create a network describe by `def`. NetDesc is the definition of a network.
- PlainNet(const NetDesc &def);
-
- // Infer all the operators' input and output variables' shapes. This will be
- // called before every mini-batch training.
- virtual Error InferShape(Scope *scope) override;
-
- // Run all the operators with the `scope`, if no scope is provided, default
- // scope will be used instead. If no OpContext is provided, the default context will be used.
- virtual Error Run(Scope *scope = nullptr, OpContext *context=nullptr, OpIndex begin = -1,
- OpIndex end = -1) const override;
-
- virtual OpIndex AddOp(const proto::OpDef &def) override;
-
- virtual Error AddOptimizerOps(const OptAttrs &attrs) override;
-
- virtual Error AddBackwardOps() override;
-
- protected:
- // Create operators according to `def`, will be called by the constructor.
- Error BuildNet(const NetDesc &def);
-
- // Add an operator which is identified as `type` and has attributes described
- // in `attrs`, the `inputs` are the keys of readonly input variables,
- // `outputs` are keys of mutable output variables. An `OpIndex` will be
- // returned to indicate the offset of the new operator in `ops_`.
- OpIndex AddOp(const std::string &type, const std::vector<std::string> &inputs,
- const std::vector<std::string> &outputs,
- const OprAttr &attrs = OprAttr());
-
- private:
- // the operators owned by `Network`.
- std::vector ops_;
-};
-```
-
-`PlainNet` keeps its operators in the private member `ops_`;
-the operators are created when the network is built from a `NetDesc` (`BuildNet`, called by the constructor), and each individual operator is added by `AddOp`.
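-
-The core behavior, running a list of operators over a shared scope, can be sketched in a few lines of Python (class and method names here are illustrative, not the C++ API):
-
-```python
-class SequentialNet(object):
-    """Runs a list of callables (operators) in order over a shared scope."""
-    def __init__(self):
-        self.ops = []
-
-    def add_op(self, op):
-        self.ops.append(op)
-        return len(self.ops) - 1   # index of the new op, like OpIndex
-
-    def run(self, scope, begin=0, end=None):
-        for op in self.ops[begin:end]:
-            op(scope)
-
-# Each "operator" reads and writes variables stored in `scope`.
-net = SequentialNet()
-net.add_op(lambda scope: scope.update(b=scope["a"] + 1))
-net.add_op(lambda scope: scope.update(c=scope["b"] * 2))
-scope = {"a": 1}
-net.run(scope)   # scope is now {"a": 1, "b": 2, "c": 4}
-```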
-
-
-## PlainNet Usage
-`PlainNet` can be used to define and run a network as follows
-
-```c++
-// create an empty scope located on CPU device.
-Scope scope(CPUPlace());
-
-// create and init variables described in `net_desc`.
-scope.CreateVariables(net_desc);
-scope.InitVariables(net_desc);
-
-// create a network according to `net_desc`
-auto net = Net::Create(net_desc);
-// Add more operators if needed.
-net->AddOp(add...);
-net->AddOp(fc...);
-
-net->AddBackwardOps();
-net->AddOptimizerOps();
-
-// run the network providing the `scope`.
-net->Run(&scope);
-```
-
-## `NetBuilder` as a C++ syntax wrapper
-This is a detailed description of the user-related C++ network API, and may not be needed in the prototype development stage.
-
-The `NetBuilder` will give users a much simpler syntax as follows to create a network, and demonstrates how to use `Net`'s raw interfaces.
-
-```c++
-Variable* fc_out = builder.AddOp("fc", input=image, size=100, activation="Sigmoid");
-Variable* prediction = builder.AddOp("fc", input=fc_out, size=10, activation="Sigmoid");
-Variable* loss = builder.AddOp("cross_entropy", input=prediction, label=label);
-Variable* avg_loss = builder.AddOp("mean", loss);
-
-builder.BackwardFrom(avg_loss);
-builder.AddOptimization(1e-4, "adam");
-builder.Run();
-```
-
-`NetBuilder` will call `Net`'s virtual functions to change the real network structure. Here is a sample definition
-
-```c++
-class NetBuilder final {
- public:
- NetBuilder(Net* net) : net_(net) {}
-
- Variable* AddOp(const string& type, const vector<Variable*>& inputs,
- size_t size, Activation act) {
- // much code here.
- // ...
- net_->AddOp(def);
- need_rebuild_net_ = true;
- net_->InferShape();
- // ...
- }
-
- Error BackwardFrom(const Variable& cost);
-
- Error Run(Scope* scope, OpContext* context, bool need_backward = true) {
- // backward.
- if (need_backward) {
- if (need_rebuild_net_) {
- AddBackwardOps();
- AddOptimizerOps();
- }
- net_->Run(scope, context);
- return;
- }
- // just forward.
- net_->Run(scope, context, 0, last_forward_op_);
- }
-
- protected:
- Error AddBackwardOps();
- Error AddOptimizerOps();
-
- private:
- Net* net_;
- OpIndex last_forward_op_{-1};
- bool need_rebuild_net_{true};
-};
-```
-
-### Compatibility with RNN
-
-Benefiting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with the future RNN design,
-for example we can implement a simple recurrent neural network as follows
-
-```c++
-// copy some `vars` from `source` to `target`
-void Copy(const Scope &source, Scope &target,
- const std::vector<std::string> &vars);
-
-Scope default_scope;
-// some initial mutations on `default_scope` here.
-
-auto rnn_step_net = PlainNet(rnn_step_net_def);
-
-// Create rnn's states, the last scope is used to store rnn outputs.
-Scope *rnn_states = new Scope[num_states + 1];
-
-for (int i = 0; i < num_states + 1; i++) {
- // Initialize all rnn state scopes, copy parameters and so on.
- rnn_states[i].CreateVars(rnn_step_net_def);
- Copy(default_scope, rnn_states[i], rnn_related_vars);
- // Prepare rnn's inlinks, just copy inlink variables to each state.
- Copy(default_scope, rnn_states[i], inlink_vars);
-}
-
-// Run the rnn.
-for (int i = 0; i < num_states; i++) {
- rnn_step_net.Run(rnn_states[i]);
- // Copy current state's state variables to next state, the related variables
- // are named like "previous_state_xxx".
- Copy(rnn_states[i], rnn_states[i + 1], pre_state_vars);
-}
-
-// Copy rnn's final outputs to `default_scope`.
-Copy(rnn_states[num_states], default_scope, outlink_vars);
-```
diff --git a/doc/fluid/design/modules/optimizer.md b/doc/fluid/design/modules/optimizer.md
deleted file mode 100644
index 1c25fde9cafb322f789662077d3fc6cc1d64ce38..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/optimizer.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Optimizer Design
-
-## The Problem
-
-A PaddlePaddle program, or a block, is a sequence of operators operating on variables. A training program needs to do three kinds of work:
-
-1. the forward pass, which computes intermediate results and the cost(s),
-1. the backward pass, which derives gradients from intermediate results and costs, and
-1. the optimization pass, which updates model parameters to optimize the cost(s).
-
-These works rely on three kinds of operators:
-
-1. forward operators,
-1. gradient operators, and
-1. optimization operators.
-
-It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they only had to describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically.
-
-In this design, we propose a high-level API that automatically derives the optimization pass and operators from the forward pass.
-
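-For intuition, the three passes for a one-layer linear model can be written out directly in NumPy (this only illustrates forward/backward/optimization, it is not the PaddlePaddle API):
-
-```python
-import numpy as np
-
-np.random.seed(0)
-x = np.random.randn(8, 3)                          # a mini-batch of inputs
-y = x.dot(np.array([1.0, -2.0, 0.5])) + 0.3        # targets from a known linear rule
-w, b, lr = np.zeros(3), 0.0, 0.1
-
-for _ in range(100):
-    pred = x.dot(w) + b                            # forward pass
-    grad = (pred - y) / len(x)                     # backward pass (d cost / d pred)
-    dw, db = x.T.dot(grad), grad.sum()
-    w, b = w - lr * dw, b - lr * db                # optimization pass
-```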
-
-## High-level Python API to describe the training process
-
-1. Users write code to describe the network:
-
- ```python
- images = layer.data("images")
- labels = layer.data("labels")
- w1 = pd.var("w1")
- b1 = pd.var("b1")
- hidden = layer.fc(images, w=w1, b=b1)
- cost = layer.mse(hidden, labels)
- ```
-
- The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
-
-
-2. Users create a certain kind of Optimizer with some argument.
-
- ```python
- optimizer = AdagradOptimizer(learning_rate=0.001)
- ```
-
-3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list.
-
- ```python
- opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])
- ```
- The above code snippet will create gradient and optimization operators in the Block. The return value of `minimize()` is a list of optimization operators that will be run by the session.
-
-4. Users use Session/Executor to run this opt_op_list as the target to do training.
-
- ```python
- sess.run(target= opt_op_list, ...)
- ```
-
-### Optimizer Python interface:
-
-```python
-class Optimizer(object):
- """Optimizer Base class.
-
- """
-
- def __init__(self):
- pass
-
- def create_optimization_pass(self, parameters_and_grads):
- """Add optimization operators to update gradients to variables.
-
- Args:
- parameters_and_grads: a list of (variable, gradient) pair to update.
-
- Returns:
- optimization_op_list: a list of optimization operators that will update parameters using gradients.
- """
- return None
-
- def minimize(self, loss, parameter_list):
- """Add operations to minimize `loss` by updating `parameter_list`.
-
- This method combines interface `append_backward()` and
- `create_optimization_pass()` into one.
- """
- params_grads = self.create_backward_pass(loss, parameter_list)
- update_ops = self.create_optimization_pass(params_grads)
- return update_ops
-
-```
-
-Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer.
diff --git a/doc/fluid/design/modules/prune.md b/doc/fluid/design/modules/prune.md
deleted file mode 100644
index 4a5cf10c79a554779137f0cce5494fdd96ef6b7a..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/prune.md
+++ /dev/null
@@ -1,63 +0,0 @@
-# Prune
-
-## Motivation
-
-We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement
-a `void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc`
-and generates a pruned `ProgramDesc`.
-
-## Challenge
-
-Pruning needs to support both variables and operators being evaluation targets. Consider the following
-different situations.
-
-```python
-# Case 1: run forward pass.
-cost_np = session.run(target=cost)
-# Case 2: run backward pass.
-opts_np, _ = session.run(target=[cost, opt])
-# Case 3: run checkpointing
-_ = session.run(target=checkpoint)
-```
-
-## Solution
-
-To support evaluation of operators, we add `is_target` field in the `OpDesc`.
-
-```proto
-message OpDesc {
- required string type = 3;
- repeated Var inputs = 1;
- repeated Var outputs = 2;
- repeated Attr attrs = 4;
- optional bool is_target = 5 [ default = false ];
-};
-```
-
-To support evaluation of variables, we add [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599).
-For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with `variable` being
-`fetch_op`'s input. Then we also mark the `fetch_op` as a target.
-
-### Algorithm
-
-If an operator needs to be run, it must fall into one of the following cases:
-
-1. It is the target.
-2. It is depended on by some other ops, meaning its output is some other op's input.
-
-The first case can be checked by `op_desc.is_target()`. The second case can be implemented as
-
-```c++
-bool HasDependentVar(const OpDesc& op_desc, const std::set<std::string>& dependent_vars) {
- for (auto& var : op_desc.outputs()) {
- for (auto& argu : var.arguments()) {
- if (dependent_vars.count(argu) != 0) {
- return true;
- }
- }
- }
- return false;
-}
-```
-
-Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc).
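-
-A stripped-down Python version of this dependency walk, operating on plain dicts instead of `ProgramDesc` (purely for illustration), could look like:
-
-```python
-def prune(ops, target_vars):
-    """Keep an op if it is a target or a later kept op consumes one of its outputs.
-
-    `ops` is a topologically ordered list of dicts like
-    {"type": ..., "inputs": [...], "outputs": [...], "is_target": bool}.
-    """
-    dependent_vars = set(target_vars)
-    kept = []
-    for op in reversed(ops):
-        if op.get("is_target") or dependent_vars & set(op["outputs"]):
-            kept.append(op)
-            dependent_vars.update(op["inputs"])
-    kept.reverse()
-    return kept
-```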
diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md
deleted file mode 100644
index 83af4e55485c079265d3f2b1e15070825b532c02..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/python_api.md
+++ /dev/null
@@ -1,325 +0,0 @@
-# Design Doc: Python API
-
-Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
-
-
-| Python classes | Protobuf messages |
-| -------------- | ----------------- |
-| Program        | ProgramDesc       |
-| Block          | BlockDesc         |
-| Operator       | OpDesc            |
-| Variable       | VarDesc           |
-
-Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
-
-## Core Concepts
-
-### Program
-
-A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), which is composed of an array of `BlockDesc`s. The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` only stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
-
-Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member that records the current block (`current_block_idx` in the code below).
-
-```python
-class Program(object):
- def __init__(self):
- self.desc = core.NewProgram() # a C++ ProgramDesc pointer.
- self.blocks = vector()
- self.blocks.append(Block(self, -1)) # the global block
- self.current_block_idx = 0 # initialized to the global block
-
- def global_block(self):
- return self.blocks[0]
-
- def current_block(self):
- return self.blocks[self.current_block_idx]
-
- def rollback(self):
- self.current_block_idx = self.current_block().parent_idx
-
- def create_block(self):
- new_block_idx = len(self.blocks)
- self.blocks.append(Block(self, self.current_block_idx))
- self.current_block_idx = new_block_idx
- return self.current_block()
-```
-
-`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
-
-`Program` creates the first block as the global block in its constructor. All parameters and their initializer operators are in the global block.
-
-### Block
-
-A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md) includes
-
-1. a map from variable names to an instance of the Python `Variable` class, and
-1. a list of `Operator` instances.
-
-```python
-class Block(object):
- def __init__(self, program, parent_idx):
- self.desc = core.NewBlock(program.desc)
- self.program = program
- self.vars = map()
- self.ops = vector()
- self.parent_idx = parent_idx
-
- def create_var(self, ...):
- return Variable(self, ...)
-
- def _create_global_var(self, ...):
- program.global_block().create_var(...)
-
- def create_parameter(self, name, ...):
- # Parameter is a subclass of variable. See Parameter section for details.
- self.vars[name] = Parameter(self._create_global_var(...), ...)
- return self.vars[name]
-
- def append_operator(self, ...):
- self.ops.append(Operator(self, ...))
-
- def _prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
- self.ops.prepend(Operator(self, ...))
-```
-
-`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
-
-`_prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
-
-### Operator
-
-The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes.
-
-```python
-class Operator(object):
- def __init__(self,
- block, # Block
- type, # string
- inputs, # dict
- outputs,# dict
- attrs # dict
- ):
- self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs)
- core.infer_shape(self.desc, inputs, outputs)
-
- def type(self):
- return self.desc.type()
-```
-
-`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
-
-### Variable
-
-Operators take Variables as their inputs and outputs.
-
-```python
-class Variable(object):
- def __init__(self,
- block=None, # Block
- name=None, # string
- shape=None, # tuple
- dtype="float32", # string
- lod_level=None # int
- ):
- if name is None:
- name = unique_name_generator()
- self.name = name
- self.block = block
- self.desc = core.NewVarDesc(block.desc, name, shape, lod_level)
- self.writer = None
-```
-
-Please be aware of `self.writer`, which tracks the operator that creates the variable. It is possible that more than one operator writes a variable, but in Python space, each write to a variable is represented by a Variable instance. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
-
-### Parameter
-
-A parameter is a global variable with an initializer (or load) operator.
-
-```python
-class Parameter(Variable):
- def __init__(self,
- block=None, # Block
- name=None, # string
- shape=None, # tuple
- dtype="float32", # string
- lod_level=None, # int
- trainable=None, # bool
- initialize_op_attrs=None,
- optimize_op_attrs=None):
- super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
- self.trainable = trainable
- self.optimize_op_attrs = optimize_op_attrs
- block.prepend(Operator(block, # Block
- initialize_op_attrs['type'], # string
- None, # no inputs
- self, # output is the parameter
- initialize_op_attrs))
-```
-
-When users create a parameter, they can call
-
-```python
-program.create_parameter(
- ...,
- init_attr={
- type: "uniform_random",
- min: -1.0,
- max: 1.0,
- })
-```
-
-In the above example, `init_attr.type` names an initialize operator. It can also name the load operator
-
-```python
-init_attr={
- type: "load",
- filename: "something.numpy",
-}
-```
-
-`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
-
-## Layer Function
-
-A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers.
-
-Layer functions take `Variable`s and configuration parameters as their input and return the output variable(s).
-
-For example, `FullyConnected` takes one or more variables as its input. The input could be the input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable.
-
-
-### Necessity for reusing code between layer functions
-
-There is a lot of code that can be reused, such as:
-
-* Giving default configuration values, e.g., the default initialization strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`, and the default initialization strategy for bias is to fill zeros.
-* Append the activation operator.
-* Create a temporary variable.
-* Create parameter.
-* Generate a unique name.
-* Add a bias.
-* ...
-
-A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions.
-
-
-
-### Comparison between global functions and a helper class
-
-The `FullyConnected` layer will be as follows when we provide global functions:
-
-```python
-def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
- if name is None:
- name = unique_name("fc")
- input = multiple_input(input)
- param_attr = default_param_attr(param_attr)
- param_attr = multiple_param_attr(param_attr, len(input))
-
- # mul
- mul_results = []
- for ipt, attr in zip(input, param_attr):
- shape = ipt.shape[1:] + [size]
- w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr)
- tmp = create_tmp_var(name)
- g_program.current_block().append_op("mul", {ipt, w}, {tmp})
- mul_results.append(tmp)
-
- # add sum
- ...
- # add bias
- ...
- # add activation
- ...
- return out
-```
-
-We can provide many helper functions for layer developers. However, there are several disadvantages of global helper functions:
-
-1. We need a namespace for these methods so that layer developers can quickly figure out which methods they can use.
-2. Global functions force layer developers to pass their parameters around again and again.
-
-So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` layer will be as follows.
-
-```python
-def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
- helper = LayerHelper(**locals()) # pass all parameters to LayerHelper
-
- mul_results = []
- for ipt, param in helper.iter_multiple_input_and_param():
- w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype)
- tmp = helper.create_tmp_variable()
- helper.append_op('mul', {ipt, w}, {tmp})
- mul_results.append(tmp)
-
- pre_bias = helper.add_sum(mul_results)
- pre_activation = helper.add_bias(pre_bias)
- return helper.add_activation(pre_activation)
-```
-
-We not only use fewer lines of code to write `fc_layer`, but also make the code clearer and easier to understand. At the same time, layer developers can figure out which functions they can invoke by typing `helper.` in a Python editor.
-
-
-### Implementation of layer helper
-
-We just keep all parameters of a layer function as a dictionary in the layer helper as a private data member. Every method of the layer helper looks up this dictionary when it is invoked. In that way, we can implement one layer helper for all layer functions, even if some layers do not contain some operators. For example, the `activation` is used by fully connected and convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` is:
-
-```python
-class LayerHelper(object):
- def __init__(self, **kwargs): # kwargs is short for `keyword arguments`
- self.kwargs = kwargs
-
- def add_activation(self, input_var):
- act = self.kwargs.get("act", None) # default value is None
- if act is None: # do nothing if no act
- return input_var
-
- tmp = self.create_tmp_var()
- self.append_op(type=act, input=input_var, output=tmp)
- return tmp
-```
-
-### Return value of layer functions
-
-A layer will return a Variable, which is also the output of an operator. However, the outputs of a layer function carry more information than an operator's outputs: the parameter variables and their gradient variables also need to be returned. Returning them is useful. For example,
-
-1. Users can debug the network by printing parameter gradients.
-2. Users can append attributes to a parameter, e.g., `param.stop_gradient=True` will make a parameter stop generating gradients. We can fix the parameter value during training by using this attribute.
-
-However, it is good for a layer to return a Variable, since all layers and operators use Variables as their parameters. We can just append a `param` field and a `grad` field to the returned variable of a layer function, since Python is dynamically typed.
-
-The sample usage is
-
-```python
-data = fluid.layers.data(...)
-hidden = fluid.layers.fc(data, ...)
-...
-
-executor.run(fetch_list=[hidden.param, hidden.param.grad], ...)
-```
-
-
-## Optimizer
-
-[Optimizer Design Doc](./optimizer.md)
diff --git a/doc/fluid/design/modules/register_grad_op.md b/doc/fluid/design/modules/register_grad_op.md
deleted file mode 100644
index 8d973eb53178c3e889c845144553a453e11f067c..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/register_grad_op.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# Design Doc: Gradient Operators Registration
-
-
-## The Problem Posed
-
-Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance.
-
-However, we noticed two problems with the current design:
-
-1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and insert corresponding `OpDesc` messages into the `ProgramDesc` message.
-
-1. For some operators, the gradient computation can be written in terms of existing operators. For example, the gradient of *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator. Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation.
-
-## The Current Implementation
-
-Instances of the C++ class `OpInfo` are stored in an associative map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create its gradient operator by invoking `OpInfo::creator_` of the gradient operator. The pseudo code is as follows
-
-```cpp
-struct OpInfo {
- std::function creator_;
- std::string grad_op_type_;
- ...
-};
-
-std::map<std::string, OpInfo> OpInfoMap;
-
-OperatorBase* CreateGradientOperator(const OperatorBase& op) {
- return OpInfoMap.at(op.Type()).creator_(...);
-}
-```
-
-## Proposed Solution
-
-The mapping relationship between an operator and its gradient operators is a function. The interface of this function is:
-
-```cpp
-// (OpDesc) --> vector<OpDesc>
-std::function<std::vector<OpDescBind>(const OpDescBind&)>;
-```
-
-The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for the protobuf message `OpDesc` for rapid manipulation of `OpDesc`.
-
-The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like
-
-```cpp
-struct OpInfo {
- std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> grad_op_maker_;
- ...
-};
-```
-
-The `grad_op_maker_` is a `nullptr` if the operator does not have any associated gradient operators.
-
-We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
-
-```cpp
-class GradOpDescMakerBase {
-public:
- GradOpDescMakerBase(const OpDescBind& );
- virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
-};
-```
-
-We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
-
-```cpp
-using GradOpMaker = ...;
-std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> func;
-func = [] (const OpDescBind& fwd_op) {
- GradOpMaker maker(fwd_op);
- return maker();
-};
-```
-
-We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator.
-
-We should change the registration macros at the same time. In the current solution, there is no difference between forward operators and backward operators, so `REGISTER_OP` just registers one operator. If `REGISTER_OPERATOR` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro containing `__VA_ARGS__`.
-
-The user interface should be
-
-```cpp
-vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
-REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, MinusOpGradMaker);
-// Developers can still manually implement gradient operator.
-REGISTER_OPERATOR(minus_grad, MinusGradOp);
-```
-
-The interface of the current `REGISTER_OP` macro does not need to change. Internally, `REGISTER_OP` will invoke `REGISTER_OPERATOR` twice and generate the `GradOpDescMaker` inside.
-
-```cpp
-REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
-```
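-
-To make the mapping concrete, the same registration idea can be mimicked in Python (a sketch only; names such as `GRAD_OP_MAKERS` and the `"@GRAD"` suffix convention are illustrative):
-
-```python
-# Map from forward op type to a function producing its gradient op descs.
-GRAD_OP_MAKERS = {}
-
-def register_grad_op_maker(op_type, maker):
-    GRAD_OP_MAKERS[op_type] = maker
-
-def minus_grad_maker(fwd_op):
-    # The gradient of `minus` is an identity op (for the first input)
-    # followed by a scale(-1) op (for the second input).
-    out_grad = fwd_op["outputs"][0] + "@GRAD"
-    return [
-        {"type": "identity", "inputs": [out_grad],
-         "outputs": [fwd_op["inputs"][0] + "@GRAD"]},
-        {"type": "scale", "scale": -1.0, "inputs": [out_grad],
-         "outputs": [fwd_op["inputs"][1] + "@GRAD"]},
-    ]
-
-register_grad_op_maker("minus", minus_grad_maker)
-
-fwd = {"type": "minus", "inputs": ["x", "y"], "outputs": ["out"]}
-grad_ops = GRAD_OP_MAKERS[fwd["type"]](fwd)   # two gradient op descriptions
-```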
diff --git a/doc/fluid/design/modules/regularization.md b/doc/fluid/design/modules/regularization.md
deleted file mode 100644
index 519a9143033386678351ff78a465e5ba6e220c52..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/regularization.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# Regularization in PaddlePaddle
-
-## Introduction to Regularization
-A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. A frequently faced problem is the problem of **overfitting**, where the model does not make reliable predictions on new unseen data. **Regularization** is the process of introducing additional information in order to prevent overfitting. This is usually done by adding extra penalties to the loss function that restrict the parameter space an optimization algorithm can explore.
-
-### Parameter Norm Penalties
-Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
-
-    J_regularized(theta; X, y) = J(theta; X, y) + alpha * omega(theta)
-
-The parameter `alpha` is a hyperparameter that weights the contribution of the norm penalty term `omega` relative to the standard objective function `J`.
-
-The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
-
-##### L2 Regularization:
-
-    omega(w) = 1/2 * ||w||_2^2 = 1/2 * sum_i(w_i^2)
-
-##### L1 Regularization
-
-    omega(w) = ||w||_1 = sum_i(|w_i|)
-
-A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
-
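-In NumPy terms, the two penalties and their contribution to the objective look like this (a plain illustration of the formulas above, not the proposed operators):
-
-```python
-import numpy as np
-
-def l2_penalty(w, alpha):
-    # alpha * (1/2 * ||w||_2^2); its gradient is alpha * w
-    return 0.5 * alpha * np.sum(w ** 2), alpha * w
-
-def l1_penalty(w, alpha):
-    # alpha * ||w||_1; its (sub)gradient is alpha * sign(w)
-    return alpha * np.sum(np.abs(w)), alpha * np.sign(w)
-
-w = np.array([0.5, -1.0, 2.0])
-data_loss = 0.1                      # J: loss computed from the data
-penalty, grad_w = l2_penalty(w, alpha=0.01)
-total_loss = data_loss + penalty     # J_regularized = J + alpha * omega
-```
-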
-## Regularization Survey
-
-A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey).
-
-## Proposal for Regularization in PaddlePaddle
-
-### Low-Level implementation
-
-In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations:
-- L2_regularization_op
-- L1_regularization_op
-
-These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
-
-The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
-
-### Computation Graph
-
-Below is an example of a really simple feed forward neural network.
-
-*(figure: `images/feed_forward.png`, a simple feed forward network)*
-
-The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
-
-*(figure: `images/feed_forward_regularized.png`, the same network with regularization operators added)*
-
-### Python API implementation for Regularization
-
-Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
-
-#### Creation of Regularization ops
-There are two possibilities for creating the regularization ops:
-1. We create these ops immediately while building the computation graph.
-2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added.
-
-The proposal is to add these ops in a lazy manner just before the backward pass.
-
-#### Storage of Regularization attributes
-
-Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters.
-
-#### High-level API
-
-In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
diff --git a/doc/fluid/design/modules/selected_rows.md b/doc/fluid/design/modules/selected_rows.md
deleted file mode 100644
index 1a98839a957612b91b2276b58818623ecc62d1d5..0000000000000000000000000000000000000000
--- a/doc/fluid/design/modules/selected_rows.md
+++ /dev/null
@@ -1,74 +0,0 @@
-# Design Doc: Selected Rows
-
-`SelectedRows` is a type of sparse tensor data type, which is designed to support `embedding` operators. The gradient of embedding table is a sparse tensor. Only a few rows are non-zero values in this tensor. It is straight-forward to represent a sparse tensor by the following sparse tensor data structure:
-
-```cpp
-class SelectedRows {
- private:
- vector<int64_t> rows_;
- Tensor value_;
- int height_;
-};
-```
-
-The field `height_` is the first dimension of `SelectedRows`. The `rows_` are the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The full dense shape of the `SelectedRows` is `[height_] + value_.shape[1:]`.
-
-Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`; the `SelectedRows` representation would be:
-
-```
-x = SelectedRows {
- rows = [73, 84],
- value = [[1, 2], [3,4]]
-}
-```
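-
-A toy Python version of this structure makes the layout concrete (illustrative only, not the C++ class):
-
-```python
-import numpy as np
-
-class SelectedRowsToy(object):
-    def __init__(self, rows, value, height):
-        self.rows = rows        # indices of the non-zero rows
-        self.value = value      # shape: [len(rows)] + dense_shape[1:]
-        self.height = height    # first dimension of the dense tensor
-
-    def to_dense(self):
-        dense = np.zeros((self.height,) + self.value.shape[1:])
-        dense[self.rows] = self.value
-        return dense
-
-x = SelectedRowsToy(rows=[73, 84], value=np.array([[1, 2], [3, 4]]), height=100)
-# x.to_dense() is a 100 x 2 matrix, zero everywhere except rows 73 and 84.
-```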
-
-
-## SelectedRows in Protobuf
-
-`SelectedRows` is a type of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimensions of a `SelectedRows` will be described at compile time because the `rows_` and `value_` depend on the training data.
-So we use `TensorDesc` to unify `data_type` and `dims`. A `LodTensorDesc` contains a `TensorDesc` and a `lod_level`. The description of `SelectedRows` is a Tensor description.
-
-```proto
-message TensorDesc {
- required DataType data_type = 1;
- repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
-}
-
-message LodTensorDesc {
- required TensorDesc tensor = 1;
- optional int lod_level = 2;
-}
-
-message VarDesc {
- required string name = 1;
- enum VarType {
- LOD_TENSOR = 0;
- SELECTED_ROWS = 1;
- }
- required VarType type = 2;
- optional LodTensorDesc lod_desc = 3;
- optional TensorDesc selected_rows_desc = 4;
- optional bool persistable = 5 [ default = false ];
-}
-```
-
-## InferShape for Selected Rows
-
-Just like the `LoD` information, the `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or a dense tensor.
-
-For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should be like the following
-
-```cpp
-void TableLookupGrad::InferShape(context) {
- ...
- context.SetDataType("Embedding.Grad", kSelectedRows);
-}
-```
-
-
-## Sparse Operators
-
-There are several operators that need to be written to support `SelectedRows`. These are:
-
-1. Operators which generate `SelectedRows` gradient. e.g. Gradient of `TableLookupOp`.
-2. Optimization operators which support `SelectedRows` gradients, e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator. `OpWithKernel::Run` should select a suitable kernel for either a dense tensor or `SelectedRows`.
diff --git a/doc/fluid/design/motivation/api.md b/doc/fluid/design/motivation/api.md
deleted file mode 100644
index bc222564e3ec28e306ca0572b6a23104f6e9cbc5..0000000000000000000000000000000000000000
--- a/doc/fluid/design/motivation/api.md
+++ /dev/null
@@ -1,261 +0,0 @@
-# PaddlePaddle Design Doc
-
-## Ingredients
-
-Our design principle is to start from the essence: how can we
-allow users to express and solve their problems as neural networks?
-Some essential concepts that our API has to provide include:
-
-1. A *topology* is an expression of *layers*.
-
-1. A layer could be any kind of computation, including *cost*.
-
-1. Some layers have parameters, some don't. Most costs don't have
- parameters.
-
-1. In some topologies, layers share parameters. For
- example,
- [the network for training a ranking model](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850).
-
-1. At programming time, users specify topologies and possible sharing
- of parameters. PaddlePaddle can figure out and create parameters
- required (and possibly shared) by one or more topologies.
-
-
-## Starting from Examples
-
-As a summary
-of
-[our discussion](https://github.com/PaddlePaddle/Paddle/issues/1315),
-let us present two examples here:
-
-
-### Example 1. Sharing Parameters between Layers
-
-We use
-the
-[3-branch ranking](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850) model
-in this example. For your convenience, I copy-paste the model's
-topology as follows:
-
-```
-A -> f -\
-Q -> f --> cost
-B -> f -/
-```
-
-The following program trains the topology including the cost, and then
-uses the sub-network in the trained topology for inference:
-
-```python
-def f(x):
- e = paddle.layer.embedding(x, parameter_name="embedding")
- o = paddle.layer.softmax(e, parameter_name="semantic")
- return o
-
-# Create 3 topologies (subnets), they share parameters because all
-# corresponding layers have the same parameter names.
-fA = f(paddle.layer.data(input_name="A"))
-fB = f(paddle.layer.data(input_name="B"))
-fQ = f(paddle.layer.data(input_name="Q"))
-
-topology = paddle.layer.less_than(
- paddle.layer.cross_entropy(fA, fQ),
- paddle.layer.cross_entropy(fB, fQ))
-
-# Derive parameters required in topology and create them in model.
-parameters = paddle.parameters.create(topology)
-
-# Estimate parameters used in topology from data.
-paddle.train(topology, parameters, reader=read_ranking_model_data)
-
-# Inference using fA (or fB or fQ, as they share their parameters).
-[testA, testB, testQ] = read_ranking_model_data()
-print "The sematic-vector of testA: ", paddle.infer(fA, parameters, testA)
-```
-
-
-### Example 2. Sharing Parameters between "Models"
-
-We use GAN in this example. In the following example program, `d0` and `d1`
-correspond to the two networks in the following figure:
-
-*(figure omitted: the two networks, d0 containing both D and G, and d1 containing only D)*
-
-```python
-def G(x):
- # over-simplified example as G has only one layer:
- return paddle.layer.fc(x, parameter_name="G")
-
-def D(x):
- # again, over-simplified:
- return paddle.layer.fc(x, parameter_name="D")
-
-# Construct the first topology, which contains both D and G.
-# By learning this topology, we update parameters of G.
-d0 = paddle.layer.should_be_false(D(G(paddle.layer.data())))
-
-# Construct a second topology d1, which contains only D. By
-# training this topology, we update parameters of D. Note
-# that d1 share parameters with d0.
-d1 = paddle.layer.should_be_true(D(paddle.layer.data()))
-
-# Create parameters from a list of multiple topologies (models) for
-# the chance to share parameters between these topologies.
-parameters = paddle.parameters.create([d0, d1])
-
-# Iterative training of GAN.
-for ...:
- train(d0, parameters, reader=read_from_rng, immutable_parameters={"D"})
- train(d1, parameters, reader=read_from_realistic_images)
-
-# Use d1 for inference:
-print "D thinks a batch of images are realistic ", infer(d1, parameters, read_mnist_images)
-```
-
-
-### Summarization
-
-
-Above two programs reveal some important design concerns:
-
-1. Users describe a topology as an expression of layers. Every layer
- has a *parameter name*. If the users don't specify it explicitly, it's automatically generated as a unique name. By
- specifying the parameter name, users can specify the sharing of
- parameters between layers and even between topologies.
-
-1. `paddle.parameters.create` figures out parameters required by one
- or more topologies from parameter names of layers. It creates these
- parameters and returns a `ParameterSet` object, which is in essence
- a map from *parameter names* to *parameters*.
-
-1. At training and inference time, `paddle.train` and `paddle.infer`
- requires both a topology and the parameter set that holds the parameters of that topology. There are some reasons:
-
- 1. This prevents users from forgetting to call
- `paddle.parameters.create`.
- 1. `paddle.train` needs to know which parameter set to update.
-    1. Users could load another (pre-trained) parameter set and use it
-       with a topology in `paddle.train` or `paddle.infer`.
-
-1. By specifying the `immutable_parameters` parameter of
- `paddle.train`, we can forbid the update of these parameters.
-
-
-## Reader
-
-Not all programming frameworks allow users to define I/O functions.
-An example is Google MapReduce, which can only read from text,
-SSTable, and RecordIO files. Hadoop MapReduce allows users to define
-readers and writers by deriving from base classes `Reader` and
-`Writer`. The former is less flexible but also less error-prone. We
-decided to give users the flexibility to define their own readers.
-
-
-There are some open questions here:
-
-1. **Should a reader return a Python dictionary?**
-
-1. **How to map multiple outputs from a reader to multiple data layers?**
-
-1. **How to easily compose some existing readers to read more data and
- feed a topology with more data layers?**
-
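-One possible direction, sketched below in plain Python, is to let a
-reader yield one dictionary per instance and to compose readers by
-merging their dictionaries. The helper names here are illustrative
-only; they are not part of the proposed API.
-
-```python
-import numpy as np
-
-def image_reader():
-    # A hypothetical reader: one dictionary per training instance.
-    for label in [3, 7, 1]:
-        yield {"image": np.random.rand(784).astype("float32"), "label": label}
-
-def query_reader():
-    for query in ["q0", "q1", "q2"]:
-        yield {"query": query}
-
-def compose(*readers):
-    # Merge the dictionaries yielded by several readers, so a topology
-    # with more data layers can be fed from one composed reader.
-    def composed():
-        for items in zip(*(r() for r in readers)):
-            merged = {}
-            for d in items:
-                merged.update(d)
-            yield merged
-    return composed
-
-# Each yielded dictionary maps data-layer names to one instance's data.
-for instance in compose(image_reader, query_reader)():
-    pass
-```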
-
-## Training
-
-The recommended way to train a model is to call `paddle.train`,
-which simply calls `paddle.trainer.Default`, a global variable of
-type `paddle.trainer.SGD`. Equivalently, we can do
-
-```python
-opt = paddle.trainer.SGD(..., paddle.updater.Adam(...))
-opt.train(topology, parameters, reader=read, ...)
-```
-
-### Updater
-
-Please be aware that a trainer can accept an updater as its data
-member, where an updater is a class derived from
-`paddle.trainer.Updater`. This is to make it easier to customize
-trainers, as discussed
-[here](https://github.com/PaddlePaddle/Paddle/issues/1319).
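-
-As a rough illustration (the base-class interface is not specified in
-this document, so the `update` hook below is a hypothetical name), a
-custom updater might look like this:
-
-```python
-class ClippedAdam(paddle.trainer.Updater):
-    """Hypothetical custom updater: Adam-like update with gradient clipping."""
-
-    def __init__(self, clip=1.0, **kwargs):
-        super(ClippedAdam, self).__init__(**kwargs)
-        self.clip = clip
-
-    def update(self, parameter, gradient):
-        # `update` is a hypothetical hook; the real interface would be
-        # defined by paddle.trainer.Updater.
-        gradient = max(-self.clip, min(self.clip, gradient))
-        ...
-
-opt = paddle.trainer.SGD(..., ClippedAdam(clip=1.0))
-```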
-
-### Event Handler
-
-`paddle.train` and `paddle.trainer.XXX.train` take an optional
-parameter `event_handler`, which should be either `None` or a function
-that handles the following events:
-
-1. BeginTraining
-1. EndTraining
-1. BeginIteration
-1. EndIteration
-1. BeginPass
-1. EndPass
-
-where EndPass is sent if and only if the reader yields
-`end_pass=True`.
-
-An example as follows:
-
-```python
-def event_handler(event):
-    if isinstance(event, paddle.event.EndIteration):
- print paddle.test(...)
-
-paddle.train(topology, parameters, reader, event_handler)
-```
-
-If we are writing a PaddlePaddle program in and for IPython/Jupyter,
-we can use matplotlib in the event handler to plot a curve of
-cost/error versus iterations, as shown
-[here](https://blog.dominodatalab.com/interactive-dashboards-in-jupyter/).
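-
-A minimal sketch of such a handler is shown below; it assumes the
-`EndIteration` event carries a `cost` field, which is a hypothetical
-attribute since the event payloads are not specified in this document.
-
-```python
-import matplotlib.pyplot as plt
-from IPython import display
-
-costs = []
-
-def plotting_event_handler(event):
-    # `event.cost` is a hypothetical field on the proposed EndIteration event.
-    if isinstance(event, paddle.event.EndIteration):
-        costs.append(event.cost)
-        display.clear_output(wait=True)
-        plt.plot(costs)
-        plt.xlabel("iteration")
-        plt.ylabel("cost")
-        display.display(plt.gcf())
-        plt.clf()
-
-paddle.train(topology, parameters, reader, plotting_event_handler)
-```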
-
-### Distributed Training
-
-If a user wants to do distributed training on a cluster, s/he should
-call `paddle.dist_train` and provide access tokens to the cluster as
-a parameter.
-
-For example, if the user has a TLS certificate that grants access to
-a Kubernetes cluster, s/he should be able to call
-
-```python
-paddle.dist_train(model,
- trainer=paddle.trainer.SGD(...,
- paddle.updater.Adam(...)),
- reader=read,
- k8s_user="yi",
- k8s_token="kube_cluster_tls.pem",
- k8s_job="hello",
- num_parameter_servers=15)
-```
-
-The pseudo code of `paddle.dist_train` is as follows:
-
-```python
-def dist_train(topology, parameters, trainer, reader, ...):
-    if os.getenv("KUBERNETES_SERVICE_HOST") is None:
-        # Running on the user's personal computer: act as the launcher.
-        image_name = k8s_user + '/' + k8s_job
-        docker_build(image_name)
-        docker_push()
-        kube_ctrl_start_job(image_name, k8s_user, k8s_token)
-    else:
-        # Running on the cluster: figure out this process's role by rank.
-        rank = kube_list_containers_in_job_and_return_current_containers_rank()
-        if rank == 0:
-            master()
-        elif rank < 15:  # 15 == num_parameter_servers in this example
-            parameter_server()
-        else:
-            trainer.train(topology, parameters, reader=reader)
-```
-
-Please be aware that if a process is running on the Kubernetes
-cluster, it will have some environment variables pre-defined.
-
-If `dist_train` doesn't see these environment variables, it knows
-that it's running on a user's personal computer, and it should work as a
-*launcher*. Otherwise, it knows that it's running on the cluster and
-needs to figure out its role as the master, a trainer, or a
-parameter server.
diff --git a/doc/fluid/design/motivation/fluid-compiler.graffle b/doc/fluid/design/motivation/fluid-compiler.graffle
deleted file mode 100644
index c933df2cb855462c52b2d25f7f9a99b95652961d..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/motivation/fluid-compiler.graffle and /dev/null differ
diff --git a/doc/fluid/design/motivation/fluid-compiler.png b/doc/fluid/design/motivation/fluid-compiler.png
deleted file mode 100644
index 1b0ffed2039c91a3a00bbb719da08c91c3acf7bb..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/motivation/fluid-compiler.png and /dev/null differ
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
deleted file mode 100644
index 4b7696cc1bbf57ace72c4d31ffc2bfe6c1071939..0000000000000000000000000000000000000000
--- a/doc/fluid/design/motivation/fluid.md
+++ /dev/null
@@ -1,140 +0,0 @@
-# Design Doc: PaddlePaddle Fluid
-
-## Why Fluid
-
-When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe. However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework?
-
-Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution in that it describes the "process" of training or inference instead of describing a model. In fact, in PyTorch, TensorFlow Eager Execution, and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid currently takes this idea further than PyTorch and Eager Execution, and we are trying to push Fluid towards the direction of a compiler and a new programming language for deep learning.
-
-## The Evolution of Deep Learning Systems
-
-Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
-
-| Existed since | model as sequence of layers | model as graph of operators | No model |
-|---|---|---|---|
-| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
-| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
-| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
-
-From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
-
-## Deep Learning Programming Paradigms
-
-With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
-
-```python
-x = layer.data("image")
-l = layer.data("label")
-f = layer.fc(x, W)
-s = layer.softmax(f)
-c = layer.mse(l, s)
-
-for i in xrange(1000): # train for 1000 iterations
- m = read_minibatch()
- forward({input=x, data=m}, minimize=c)
- backward(...)
-
-print W # print the trained model parameters.
-```
-
-The above program includes two parts:
-
-1. The first part describes the model, and
-2. The second part describes the training process (or inference process) for the model.
-
-This paradigm has a well-known problem that limits the productivity of programmers. If the programmer makes a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that sits blocks away from the actual error prompt.
-
-This problem of being hard to debug and re-iterate fast on a program is the primary reason that programmers, in general, prefer PyTorch over the older systems. Using PyTorch, we would write the above program as follows:
-
-```python
-W = tensor(...)
-
-for i in xrange(1000): # train for 1000 iterations
- m = read_minibatch()
- x = m["image"]
- l = m["label"]
- f = layer.fc(x, W)
- s = layer.softmax(f)
- c = layer.mse(l, s)
- backward()
-
-print W # print the trained model parameters.
-```
-
-We can see that the main difference is moving the model configuration part (the first step) into the training loop. This change allows mistakes in model configuration to be reported where they actually appear in the program. It also represents the model, or rather its forward pass, more directly by keeping the configuration process inside the training loop.
-
-## Describe Arbitrary Models for the Future
-
-Describing the process instead of the model also gives Fluid the flexibility to define non-standard models that haven't been invented yet.
-
-As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator. A PyTorch example would look like the following:
-
-```python
-for i in xrange(1000):
-    m = read_minibatch()
-    x = m["sentence"]
-    for t in xrange(len(x)):
-        h[t] = the_step(x[t])
-```
-
-With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following:
-
-```python
-train_loop = layers.While(cond)
-with train_loop.block():
-    m = read_minibatch()
-    x = m["sentence"]
-    rnn = layers.While(...)
-    with rnn.block():
-        h[t] = the_step(input[t])
-```
-
-An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/bde090a97564b9c61a6aaa38b72ccc4889d102d9/python/paddle/fluid/tests/unittests/test_while_op.py#L50-L58).
-
-From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
-
-We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.
-
-## Turing Completeness
-
-In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine. For a programming language, if it provides if-then-else and loop, it is Turing complete. From the above examples, Fluid seems to be Turing complete; however, it is worth noting that there is a slight difference between the `if-then-else` of Fluid and that of a programming language. The difference is that the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. It hasn't been researched in depth whether this is equivalent to the `if-then-else` in programming languages that makes them Turing-complete. Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case, but this needs to be looked into in depth.
-
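-To make the mini-batch-splitting behavior concrete, here is a small
-NumPy sketch (not Fluid code) of an `if-then-else` that runs both
-branches, each on its own partition of the batch, and merges the
-results back:
-
-```python
-import numpy as np
-
-def batch_if_else(cond, true_fn, false_fn, x):
-    # Run both branches on complementary partitions of the mini-batch,
-    # then scatter the partial results back into the original order.
-    out = np.empty_like(x)
-    out[cond] = true_fn(x[cond])
-    out[~cond] = false_fn(x[~cond])
-    return out
-
-x = np.array([-2.0, 1.0, -0.5, 3.0])
-y = batch_if_else(x > 0, lambda v: v, lambda v: -v, x)  # element-wise absolute value
-```
-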
-## The Execution of a Fluid Program
-
-There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
-
-There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
-
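-For illustration, a minimal end-to-end run of the interpreted path
-might look like the following sketch, assuming the Fluid Python API of
-this period (`fluid.layers.*`, `fluid.Executor`):
-
-```python
-import numpy as np
-import paddle.fluid as fluid
-
-# Calling layer functions builds up the ProgramDesc.
-x = fluid.layers.data(name="x", shape=[1], dtype="float32")
-y = fluid.layers.fc(input=x, size=1)
-loss = fluid.layers.mean(y)
-
-# The C++ Executor then runs the ProgramDesc, like an interpreter.
-exe = fluid.Executor(fluid.CPUPlace())
-exe.run(fluid.default_startup_program())
-out, = exe.run(fluid.default_main_program(),
-               feed={"x": np.ones((8, 1), dtype="float32")},
-               fetch_list=[loss])
-```
-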
-Fluid is moving towards the direction of a compiler, which is explained in [fluid_compiler.md](fluid_compiler.md).
-
-## Backward Compatibility of Fluid
-
-Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference. For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph). Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators. The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators.
-
-For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
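-
-As a rough sketch of such a converter, one could walk the first block
-of the `ProgramDesc` and emit one ONNX node per operator. The traversal
-below assumes the `OpDesc` layout of `framework.proto` (inputs/outputs
-as named lists of `arguments`), and the one-to-one op-type mapping is a
-deliberate over-simplification; only the `onnx.helper` calls are real.
-
-```python
-from onnx import helper
-
-def program_desc_to_onnx(program_desc):
-    block = program_desc.blocks[0]
-    nodes = []
-    for op in block.ops:
-        # A real converter needs per-operator name/attribute mapping;
-        # here the Fluid op type is naively reused as the ONNX op type.
-        in_names = [arg for var in op.inputs for arg in var.arguments]
-        out_names = [arg for var in op.outputs for arg in var.arguments]
-        nodes.append(helper.make_node(op.type, in_names, out_names))
-    graph = helper.make_graph(nodes, "fluid_program", inputs=[], outputs=[])
-    return helper.make_model(graph)
-```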
diff --git a/doc/fluid/design/motivation/fluid_compiler.md b/doc/fluid/design/motivation/fluid_compiler.md
deleted file mode 100644
index 6dd3840a0734e8593890dcf8044746197350c6f5..0000000000000000000000000000000000000000
--- a/doc/fluid/design/motivation/fluid_compiler.md
+++ /dev/null
@@ -1,110 +0,0 @@
-# PaddlePaddle Fluid: Towards a Compiled Programming Language
-
-As described in [fluid.md](fluid.md), when a Fluid application program
-runs, it generates a `ProgramDesc` protobuf message as an intermediate
-representation of itself. The C++ class `Executor` can run this
-protobuf message as an interpreter. This article describes the Fluid
-compiler.
-
-![](fluid-compiler.png)
-
-## ProgramDesc
-
-Before we go deeper into the idea of compiled language, let us take a
-look at a simple example Fluid application.
-
-```python
-import "fluid"
-
-func paddlepaddle() {
- X = fluid.read(...)
- W = fluid.Tensor(...)
- Y = fluid.mult(X, W)
-}
-```
-
-This program consists of a [block](../concepts/block.md) of three operators --
-`read`, `assign`, and `mult`. Its `ProgramDesc` message looks like
-the following
-
-```protobuf
-message ProgramDesc {
- block[0] = Block {
- vars = [X, W, Y],
- ops = [
- read(output = X)
- assign(input = ..., output = W)
- mult(input = {X, W}, output = Y)
- ],
- }
-}
-```
-
-## Transpilers
-
-We can write a transpiler program that takes a `ProgramDesc`, e.g.,
-the above one, and outputs another `ProgramDesc`. Let us take some
-examples:
-
-1. *Memory optimization transpiler*: We can write a transpiler that
-   inserts some `FreeMemoryOp`s into the above example `ProgramDesc` so
-   as to free memory early, before the end of an iteration, thereby
-   keeping a small memory footprint (see the sketch after this list).
-
-1. *Distributed training transpiler*: We can write a transpiler that
-   converts a `ProgramDesc` into its distributed version of two
- `ProgramDesc`s -- one for running by the trainer processes and the
- other for the parameter server.
-
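-As a rough illustration of the first example, a memory-optimization
-transpiler could insert a free-memory op right after the last use of
-each variable. The sketch below uses hypothetical, simplified Python
-stand-ins for `ProgramDesc`, `Block`, and `OpDesc`:
-
-```python
-from collections import namedtuple
-
-# Hypothetical, simplified stand-ins for the protobuf messages.
-Op = namedtuple("Op", ["type", "inputs", "outputs"])
-Block = namedtuple("Block", ["vars", "ops"])
-
-def memory_optimize(block):
-    # Record the index of the last op that touches each variable.
-    last_use = {}
-    for i, op in enumerate(block.ops):
-        for var in list(op.inputs) + list(op.outputs):
-            last_use[var] = i
-    # Insert a hypothetical `free_memory` op right after that last use.
-    # (A real pass would keep the block's final outputs alive.)
-    new_ops = []
-    for i, op in enumerate(block.ops):
-        new_ops.append(op)
-        freed = sorted(v for v, j in last_use.items() if j == i)
-        if freed:
-            new_ops.append(Op("free_memory", inputs=freed, outputs=[]))
-    return Block(block.vars, new_ops)
-
-block = Block(vars=["X", "W", "Y"],
-              ops=[Op("read", [], ["X"]),
-                   Op("assign", [], ["W"]),
-                   Op("mult", ["X", "W"], ["Y"])])
-optimized = memory_optimize(block)
-```
-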
-In the rest of this article, we talk about a special kind of
-transpiler, *Native code generator*, which takes a `ProgramDesc` and
-generates a `.cu` (or `.cc`) file, which could be built by C++
-compilers (gcc, nvcc, icc) into binaries.
-
-## Native Code Generator
-
-For the above example, the native code generator transpiler, say, the
-CUDA code generator, should generate a `main` function:
-
-```c++
-int main() {
- auto X = fluid_cuda_read(...);
- auto W = fluid_cuda_create_tensor(...);
- auto Y = fluid_cuda_mult(X, W);
-}
-```
-
-and the definitions of functions `fluid_cuda_read`,
-`fluid_cuda_create_tensor`, and `fluid_cuda_mult`. Please be aware
-that each function could just define a C++ instance of an operator and
-run it. For example
-
-```c++
-paddle::Tensor fluid_cuda_read(...) {
- paddle::Tensor t;
-  paddle::operators::Read r(&t, ...);
- r.Run();
- return t;
-}
-```
-
-For computational operators that have multiple *kernels*, each for a
-specific hardware platform, for example, the `mult` operator, the
-generated code should call its CUDA kernel:
-
-```c++
-paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a,
-                               const paddle::Tensor& b) {
-  paddle::Tensor t;
-  paddle::operators::Mult m(a, b, ...);
-  m.Run(cuda_context);
-  return t;
-}
-```
-
-where `cuda_context` could be a global variable of type
-`paddle::CUDADeviceContext`.
-
-## Multi-Block Code Generation
-
-Most Fluid application programs may have more than one block. To
-execute them, we need to trace [scopes](../concepts/scope.md).
diff --git a/doc/fluid/design/motivation/index_cn.rst b/doc/fluid/design/motivation/index_cn.rst
deleted file mode 100644
index 7706e73eca644ed6db772fd77da947395313237f..0000000000000000000000000000000000000000
--- a/doc/fluid/design/motivation/index_cn.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-设计动机和目标
--------------
-
-.. toctree::
- :maxdepth: 1
-
- api.md
- refactorization.md
- fluid.md
- fluid_compiler.md
diff --git a/doc/fluid/design/motivation/index_en.rst b/doc/fluid/design/motivation/index_en.rst
deleted file mode 100644
index 10b64b257c604ced6b957d6d6018e8a363f00fac..0000000000000000000000000000000000000000
--- a/doc/fluid/design/motivation/index_en.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-Design Motivations and Goals
---------------------------------------
-
-.. toctree::
- :maxdepth: 1
-
- api.md
- refactorization.md
- fluid.md
- fluid_compiler.md
diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md
deleted file mode 100644
index ad9d0f6d3f3ad9884f108826e8410871fffd51bf..0000000000000000000000000000000000000000
--- a/doc/fluid/design/motivation/refactorization.md
+++ /dev/null
@@ -1,275 +0,0 @@
-# Design Doc: Refactorization Overview
-
-The goals of refactoring include:
-
-1. Making it easy for external contributors to write new elementary computation operations.
-1. Making the codebase clean and readable.
-1. Designing a new computation representation -- a computation graph of operators and variables.
-1. Implementing auto-scalable and auto fault-recoverable distributed computing with the help of computation graphs.
-
-## Computation Graphs
-
-1. PaddlePaddle represents the computation (training and inference) of deep learning models by computation graphs.
-
- 1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/others/graph.md) for a concrete example.
-
-1. Users write Python programs to describe the graphs and run them (locally or remotely).
-
-1. A graph is composed of *variables* and *operators*.
-
-1. The description of graphs must be serializable/deserializable, so that:
-
- 1. It can be sent to the cloud for distributed execution, and
- 1. It can be sent to clients for mobile or enterprise deployment.
-
-1. The Python program does two things
-
- 1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to
- 1. the C++ library `libpaddle.so` for local execution,
- 1. the master process of a distributed training job for training, or
- 1. the server process of a Kubernetes serving job for distributed serving.
- 1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L70), according to the protobuf message.
-
-## Description and Realization of Computation Graph
-
-At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph.
-
-At runtime, the C++ program realizes the graph and runs it.
-
-| | Representation (protobuf messages) | Realization (C++ class objects) |
-|---|---|---|
-| Data | VarDesc | Variable |
-| Operation | OpDesc | Operator |
-| Block | BlockDesc | Block |
-
-The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables, similar to a C++/Java program block, or a pair of curly braces (`{` and `}`).
-
-## Compilation and Execution
-
-1. Run a Python program to describe the graph. In particular, the Python application program does the following:
-
- 1. Create `VarDesc` to represent local/intermediate variables,
- 1. Create operators and set attributes,
- 1. Validate attribute values,
- 1. Infer the type and the shape of variables,
- 1. Plan memory-reuse for variables,
- 1. Generate the backward graph
- 1. Add optimization operators to the computation graph.
- 1. Optionally, split the graph for distributed training.
-
-1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following:
-
- 1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md) for each run of a block,
- 1. realize local variables defined in the BlockDesc message in the new scope,
- 1. a scope is similar to the stack frame in programming languages,
-
- 1. Create an instance of class `Block`, in which,
- 1. realize operators in the BlockDesc message,
-
- 1. Run the Block by calling
- 1. `Block::Eval(vector* targets)` for forward and backward computations, or
- 1. `Block::Eval(vector* targets)` for optimization.
-
-
-## Intermediate Representation (IR)
-
-```text
-Compile Time -> IR -> Runtime
-```
-
-### Benefits of IR
-
-- Optimization
- ```text
- Compile Time -> IR -> Optimized IR -> Runtime
- ```
-- Automatically send partitioned IR to different nodes.
- - Automatic Data Parallelism
- ```text
- Compile Time
- |-> Single GPU IR
- |-> [trainer-IR-0, trainer-IR-1, pserver-IR]
- |-> Node-0 (runs trainer-IR-0)
- |-> Node-1 (runs trainer-IR-1)
- |-> Node-2 (runs pserver-IR)
- ```
- - Automatic Model Parallelism (planned for future)
-
----
-
-## Operator/OpWithKernel/OpKernel
-
-![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_op_with_kern_class_diagram.dot)
-
----
-
-## Operator
-![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op.dot)
-
-* `Operator` is the fundamental building block of the user interface.
- * Operator stores input/output variable names and attributes.
- * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables.
- * Use `Run` to compute the `output` variables from the `input` variables.
-
----
-
-## OpWithKernel/Kernel
-
-![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_with_kernel.dot)
-
-* `OpWithKernel` inherits `Operator`.
-* `OpWithKernel` contains a Kernel map.
-  * `OpWithKernel::Run` gets the device's kernel and invokes `OpKernel::Compute`.
-  * `OpKernelKey` is the map key. Currently it contains only the device place, but it may include the data type later.
-
----
-
-## Why separate Kernel and Operator
-
-* Separate GPU and CPU code.
- * Make Paddle capable of running without GPU.
-* Make one operator (which is a user interface) and create many implementations.
-  * For example, the same multiplication op can have different kernel implementations, such as an FP16 kernel, an FP32 kernel, an MKL kernel, and an Eigen kernel.
----
-
-## Libraries for Kernel development
-
-* `Eigen::Tensor` contains basic math and element-wise functions.
- * Note that `Eigen::Tensor` has broadcast implementation.
- * Limit the number of `tensor.device(dev) = ` in your code.
-* `thrust::transform` and `std::transform`.
- * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
- * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
-* Hand-writing `GPUKernel` and `CPU` code
-  * Do not write them in header (`.h`) files. CPU kernels should be in C++ source (`.cc`) files and GPU kernels in CUDA (`.cu`) files. (GCC cannot compile GPU code.)
----
-## Operator Registration
-
-### Why is registration necessary?
-We need a method to build mappings between Op type names and Op classes.
-
-### How is registration implemented?
-By maintaining a map whose key is the type name and whose value is the corresponding Op constructor.
-
----
-## The Registry Map
-
-### `OpInfoMap`
-
-`op_type(string)` -> `OpInfo`
-
-`OpInfo`:
-
-- **`creator`**: The Op constructor.
-- **`grad_op_type`**: The type of the gradient Op.
-- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes.
-- **`checker`**: Used to check attributes.
-
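-A conceptual Python analogue of this registry (illustrative only, not
-the real C++ implementation):
-
-```python
-OP_INFO_MAP = {}  # op_type (string) -> OpInfo-like dict
-
-def register_op(op_type, creator, grad_op_type=None, proto=None, checker=None):
-    # Conceptually what REGISTER_OP does: fill one entry of the registry.
-    OP_INFO_MAP[op_type] = {
-        "creator": creator,
-        "grad_op_type": grad_op_type,
-        "proto": proto,
-        "checker": checker,
-    }
-
-def create_op(op_type, *args, **kwargs):
-    # Look up the constructor by type name and build the operator.
-    return OP_INFO_MAP[op_type]["creator"](*args, **kwargs)
-```
-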
----
-## Related Concepts
-
-### Op_Maker
-Its constructor takes `proto` and `checker`; they are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L37))
-
-### Register Macros
-```cpp
-REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
-REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
-```
-
----
-## Registration Process
-1. Write an Op class and its gradient Op class, if required.
-2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
-3. Invoke the macro `REGISTER_OP`. This macro will
- 1. Call maker class to complete `proto` and `checker`
- 2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
-
----
-## Backward Module (1/2)
-### Create Backward Operator
-- Mapping from forward Op to backward Op
-![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
-
----
-## Backward Module (2/2)
-### Build Backward Network
-- **Input**: a graph of forward operators
-- **Output**: a graph of backward operators
-- **Corner cases in construction**
- - Shared Variables => insert an `Add` operator to combine gradients
- - No Gradient => insert a `fill_zero_grad` operator
- - Recursive NetOp => call `Backward` recursively
- - RNN Op => recursively call `Backward` on stepnet
-
-
----
-## Scope, Variable, Tensor
-
-* `Tensor` is an n-dimension array with type.
- * Only dims and data pointers are stored in `Tensor`.
- * All operations on `Tensor` are written in `Operator` or global functions.
- * Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md)
-* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`.
- * `step_scopes` in RNN is a variable and not a tensor.
-* `Scope` is where variables are stored.
-  * Essentially a map from variable names to `Variable` instances.
- * `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
-
----
-## Block (in design)
-### The difference between the original RNNOp and Block
-- As an operator, a block is more intuitive than `RNNOp`.
-- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`.
-- Fits the compile-time/runtime separation design paradigm.
-  - During compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serializes them into a `BlockDesc`.
-  - When the graph executes, a `Block` with the `BlockDesc` is passed; it then creates `Op` and `Var` instances and invokes `Run`.
-
----
-## Milestone
-- Take Paddle/books as the main line; the requirements of those models motivate the framework refactoring.
-- Model migration
- - Framework development gives **priority support** to model migration, for example,
- - the MNIST demo needs a Python interface,
- - the RNN models require the framework to support `LoDTensor`.
- - Determine some timelines,
- - Frequently used Ops need to be migrated first,
- - Different models can be migrated in parallel.
-- Improve the framework at the same time
-- Accept imperfection, concentrate on solving the specific problem at the right price.
-
----
-## Control the migration quality
-- Compare the performance of migrated models with old ones.
-- Follow the google C++ style guide.
-- Build the automatic workflow of generating Python/C++ documentations.
- - The documentation of layers and ops should be written inside the code.
- - Take the documentation quality into account when submitting pull requests.
- - Preview the documentations, read and improve them from a user's perspective.
diff --git a/doc/fluid/design/multi_devices/index_cn.rst b/doc/fluid/design/multi_devices/index_cn.rst
deleted file mode 100644
index 1f8439e8623e1c1ae9a12c24d08079f0ec3d761f..0000000000000000000000000000000000000000
--- a/doc/fluid/design/multi_devices/index_cn.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-多设备支持
-------------
-
-.. toctree::
- :maxdepth: 1
-
- operator_kernel_type.md
- kernel_selection.md
- kernel_hint_design.md
diff --git a/doc/fluid/design/multi_devices/index_en.rst b/doc/fluid/design/multi_devices/index_en.rst
deleted file mode 100644
index 819e9c5d77b2abf8da0e2ce6f494ea5174c1d0a2..0000000000000000000000000000000000000000
--- a/doc/fluid/design/multi_devices/index_en.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Multi-Device Support
-----------------------
-
-.. toctree::
- :maxdepth: 1
-
- operator_kernel_type.md
- kernel_selection.md
- kernel_hint_design.md
diff --git a/doc/fluid/design/multi_devices/kernel_hint_design.md b/doc/fluid/design/multi_devices/kernel_hint_design.md
deleted file mode 100644
index 6edc14ca73b1abf824981b59511a9aca4e0f3b47..0000000000000000000000000000000000000000
--- a/doc/fluid/design/multi_devices/kernel_hint_design.md
+++ /dev/null
@@ -1,59 +0,0 @@
-# Kernel Hint Design
-
-## Problem
-In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md), one Operator may have multiple kernels. Users may have a personal preference for a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel, so we need to provide a way for users to express this.
-
-In the current design, we use KernelType to describe one kernel.
-
-```cpp
-struct KernelType {
- Place place_;
- DataType data_type_;
- LayoutType layout_;
-};
-```
-`place_`, `data_type_`, and `layout_` can be obtained from the input tensors of the operator; `GetActualKernelType(inputs)` uses the inputs to infer the proper kernel key that fits the incoming data, but users cannot configure it directly.
-
-The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md) also provides a virtual method `GetExpectedKernelType` that users can override to choose the kernel type they want to use.
-
-So we should pass the user-defined information in the proto to `GetExpectedKernelType` for choosing a kernel.
-
-The problem is, how should we define and send the information for `GetExpectedKernelType` to use?
-
-## Solution
-
-### Potential choice
-1. Do nothing: let the user add the information they want to the operator's attributes and read it inside `GetExpectedKernelType`. This can work, but users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, and `cpu_kernel` to choose a CPU kernel, and `use_cudnn`, `force_cudnn`, and `cudnn_kernel` to choose a CUDNN kernel.
-
-2. Pre-define all the needed options and use a single attribute key such as `kernel_hint`. This is not flexible enough if the user wants to define more kinds of hints.
-
-### Final choice
-To provide enough flexibility while avoiding confusing definitions, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, and `use_mkldnn`, for users to choose from.
-
-In C++
-
-```cpp
-const std::string kForceCPU = "force_cpu";
-const std::string kUseCUDNN = "use_cudnn";
-const std::string kUseMKLDNN = "use_mkldnn";
-
-KernelType GetExpectedKernelType() {
- if (Attr(kForceCPU)) {
- return KernelType(CPUPlace, ...)
- } else {
- ...
- }
-}
-```
-
-In Python code
-
-```python
-FORCE_CPU = core.kForceCPU()
-
-def xx_layer(..., force_cpu=False):
- layer_helper = LayerHelper(...)
- layer_helper.append_op(
- type="xx",
- attr={FORCE_CPU: force_cpu})
-```
diff --git a/doc/fluid/design/multi_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md
deleted file mode 100644
index 4d2aab87b8cf30d03075e96cc4c67070efaf963a..0000000000000000000000000000000000000000
--- a/doc/fluid/design/multi_devices/kernel_selection.md
+++ /dev/null
@@ -1,101 +0,0 @@
-# Kernel Selection
-
-## Background
-Every operator has many kernels because Fluid supports multiple data types, places, data layouts, and library types. We use `OpKernelType` to describe the kernel types that operators can hold.
-
-`OpKernelType` is defined as follows:
-
-```cpp
-struct OpKernelType {
- Place place_;
- DataType data_type_;
- DataLayout data_layout_;
- LibraryType library_type_;
-};
-```
-
-- The `place_` is a descriptor of the device, e.g., CPUPlace, CUDAPlace.
-
-- The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types; however, there will be a major `data_type`. For example, `cross_entropy` takes `int64` as its label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float` or `double`.
-
-- The `data_layout_` is useful for some computational libraries. For example, MKLDNN uses many kinds of layouts, such as `nChw8c`. Each kind of layout will invoke a different kernel.
-
-- The `library_type_` describes the computational library, e.g., `MKLDNN`, `CUDNN`.
-
-## Problem
-
-Ideally, we would register a kernel for every operator and every kernel type. However, this is impractical in the following situations.
-
-1. Some operators, like CRF, are complicated and inefficient to implement on GPU. The CRF operator will only have a CPU kernel.
-2. Some operators consume too much memory, so it is better to force them onto the CPU. However, the rest of the operators in the neural network will be performed on GPU, i.e., the model parallelism problem.
-3. Some layouts and places are particular. For example, MKLDNN uses `nChw8c`, and no other library uses `nChw8c`.
-
-To explain one situation in detail: suppose we have two operators, OP1 and OP2; OP1 has one output `op1_2_op2`, and `op1_2_op2` is the input of OP2.
-
-If OP1 and OP2 run on the same place (for example, CPUPlace), then `op1_2_op2` can be used directly by OP2.
-
-```
-OP1(CPUPlace)
- |
- op1_2_op2
- |
-OP2(CPUPlace)
-```
-
-If OP1 and OP2 run on different places, then OP2 cannot use `op1_2_op2` directly.
-
-Problems under these situations are similar. We can formalize this problem as follows.
-
-We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, where $kt_{?} \notin KT$. How do we cast the input of this operator from $kt_{?}$ to any kernel type in $KT$?
-
-## Solution: data transform
-
-It is clear that transforming the inputs of an operator to adapt to another kernel type is not specific to any particular operator, so we should register these transformation methods as global methods.
-
-We can infer a kernel type for each input of an operator. We call it the `actual kernel type for var`, meaning the kernel type that can process this input variable.
-
-We can also get a kernel type from 1) the configuration in the operator description (users may want to force the use of `MKL` for the `conv` operator), and 2) the place of the current executor (e.g., the executor is running on GPU). This kernel type is what we expect the operator to be performed on. We call it the `expected kernel type`.
-
-We transform the input data from `actual` to `expected` if the actual kernel type is not the same as the expected kernel type.
-
-The algorithm is described as follows:
-
-```cpp
-void OperatorWithKernel::Run(
- const Scope& scope,
- const platform::Place& place) const {
- ExecutionContext ctx(...);
- auto expected_kernel_key = this->GetExpectedKernelType(ctx);
-
- Scope& new_scope = scope.NewScope();
-
- for (auto& var_name : this->Inputs()) {
- auto* tensor_in = GetTensor(var_name);
- auto kernel_type_for_var = this->GetKernelTypeForVar(...);
- if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
- auto* trans_var = new_scope.Var(var_name);
- auto* out = TransformData(expected_kernel_key,
- kernel_type_for_var,
- *tensor_in);
- SetTensorToVariable(...);
- }
- }
-
- auto kernel = kernels.find(expected_kernel_key);
- kernel->Compute(ExecutionContext(...));
-}
-```
-
-Then the actual process for the multi-device case above will be:
-
-```
-OP1(CPUPlace)
- |
-op1_2_op2(on CPU)
- |
-[transform](from CPU to GPU)
- |
-op1_2_op2(on GPU)
- |
-OP2(CUDAPlace)
-```
diff --git a/doc/fluid/design/multi_devices/operator_kernel_type.md b/doc/fluid/design/multi_devices/operator_kernel_type.md
deleted file mode 100644
index 5e391bd62b4f4e123a9a6f35b7adf5726f205635..0000000000000000000000000000000000000000
--- a/doc/fluid/design/multi_devices/operator_kernel_type.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Design Doc: The Keys of Operator Kernel Type
-## Problem
-An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
-
-```cpp
-struct OpKernelType {
- platform::Place place_;
- proto::DataType data_type_;
-};
-```
-For more details, please refer to the [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L348-L374) on GitHub.
-
-It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`.
-
-We often implement a kernel of an operator with some computing library on a certain device (place). Please note that computing libraries and devices do not have a one-to-one correspondence. A device can have a lot of computing libraries, and a computing library can also support different devices.
-
-For example, Eigen library supports Nvidia GPU/AMD GPU/CPU and MKLDNN library supports Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`.
-
-Different DataTypes, such as fp64/fp32/int8, will obviously have different kernels. But different data layouts of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration.
-
-## Solution
-
-There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
-
-```cpp
-struct OpKernelType {
- platform::Place place_;
- platform::Library library_;
- proto::DataType data_type_;
- framework::Layout layout_;
-};
-```
-
-The details are as follows:
-
-### Place
-
-`Place` is defined as:
-
-```cpp
-typedef boost::variant<CUDAPlace, CPUPlace> Place;
-```
-
-`Place` represents the device memory where data is located.
-
-
-### Library
-
-One operator kernel is usually implemented based on one library. `Library` is defined as an enum variable:
-
-```cpp
-enum Library { Plain, MKLDNN, CUDNN };
-```
-
-We use the `Plain` enumerator to represent the default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take the `Eigen` library as the `Plain` enumerator.
-A library usually has a corresponding `DeviceContext` which contains some handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle, and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle.
-
-If we want to support a new library, a new enumerator needs to be added to `Library` and a corresponding new `LibraryDeviceContext` needs to be created.
-
-
-### DataType
-
-
-`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
-
-### Layout
-
-Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also need some other descriptions of this block of memory, such as shape (ddim), stride, and layout.
-
-Different layouts lead to different implementations of the operator kernel. There are mainly 4 principles we have to follow to support layouts in our Fluid framework.
-
-- We take layout as a data member of Tensor. Layout is actually an enum variable. If Fluid is built with MKLDNN, then the memory formats in MKLDNN will also be added into this enum variable.
-
-- Users have to set the layout for input data, and some operators, like fill_constant/random, also have to set the layout for the data they generate. Of course, we can have a default layout, like NCHW.
-
-- The inference of Layout is at run-time, not at compile-time.
-
-- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://intel.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators.
-
-`Layout` is also defined as an enum variable:
-
-```cpp
-enum Layout {
- kNCHW,
- kNHWC,
-#ifdef PADDLE_WITH_MKLDNN
- knChw8c
- ...
-#endif
-};
-```
diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md
deleted file mode 100644
index f32a5b7e8a4d820319a666dab4c3129360e2c924..0000000000000000000000000000000000000000
--- a/doc/fluid/design/network/deep_speech_2.md
+++ /dev/null
@@ -1,235 +0,0 @@
-# DeepSpeech2 on PaddlePaddle: Design Doc
-
-We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine, on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
-
-- Release a basic distributed implementation of DS2 on PaddlePaddle.
-- Contribute a chapter of Deep Speech to PaddlePaddle Book.
-
-Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan.
-
-## Table of Contents
-
-- [Tasks](#tasks)
-- [Task Dependency](#task-dependency)
-- [Design Details](#design-details)
- - [Overview](#overview)
- - [Row Convolution](#row-convolution)
- - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm)
-- [Future Work](#future-work)
-- [References](#references)
-
-## Tasks
-
-We roughly break down the project into 14 tasks:
-
-1. Develop an **audio data provider**:
- - Json filelist generator.
- - Audio file format transformer.
- - Spectrogram feature extraction, power normalization etc.
- - Batch data reader with SortaGrad.
- - Data augmentation (optional).
- - Prepare (one or more) public English data sets & baseline.
-2. Create a **simplified DS2 model configuration**:
- - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*).
- - With only bidirectional-GRU (otherwise need *Task 4*).
- - With only greedy decoder (otherwise need *Task 5, 6*).
-3. Develop to support **variable-shaped** dense-vector (image) batches of input data.
- - Update `DenseScanner` in `dataprovider_converter.py`, etc.
-4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details):
- - Lookahead convolution windows.
- - Within-row convolution, without kernels shared across rows.
-5. Build KenLM **language model** (5-gram) for beam search decoder:
- - Use KenLM toolkit.
- - Prepare the corpus & train the model.
-   - Create inference interfaces (for Task 6).
-6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT:
- - Beam search with CTC.
- - Beam search with external custom scorer (e.g. LM).
- - Try to design a more general beam search interface.
-7. Develop a **Word Error Rate evaluator**:
- - update `ctc_error_evaluator`(CER) to support WER.
-8. Prepare internal dataset for Mandarin (optional):
- - Dataset, baseline, evaluation details.
- - Particular data preprocessing for Mandarin.
-   - Might need cooperation with the Speech Department.
-9. Create **standard DS2 model configuration**:
- - With variable-length audio sequences (need *Task 3*).
- - With unidirectional-GRU + row-convolution (need *Task 4*).
- - With CTC-LM beam search decoder (need *Task 5, 6*).
-10. Make it run perfectly on **clusters**.
-11. Experiments and **benchmarking** (for accuracy, not efficiency):
- - With public English dataset.
- - With internal (Baidu) Mandarin dataset (optional).
-12. Time **profiling** and optimization.
-13. Prepare **docs**.
-14. Prepare PaddlePaddle **Book** chapter with a simplified version.
-
-## Task Dependency
-
-Tasks parallelizable within phases:
-
-| Roadmap | Description | Parallelizable Tasks |
-|---|---|---|
-| Phase I | Simplified model & components | Task 1 ~ Task 8 |
-| Phase II | Standard model & benchmarking & profiling | Task 9 ~ Task 12 |
-| Phase III | Documentations | Task 13 ~ Task 14 |
-
-Issues for each task will be created later. Contributions, discussions, and comments are all highly appreciated and welcome!
-
-## Design Details
-
-### Overview
-
-Traditional **ASR** (Automatic Speech Recognition) pipelines require great human effort devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, acoustic model, pronunciation model, language model, etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with a single deep network architecture. By scaling up both the data and model sizes, DS2 achieves a very significant performance boost.
-
-Please read Deep Speech 2 \[[1](#references),[2](#references)\] paper for more background knowledge.
-
-The classical DS2 network contains 15 layers (from bottom to top):
-
-- **Two** data layers (audio spectrogram, transcription text)
-- **Three** 2D convolution layers
-- **Seven** uni-directional simple-RNN layers
-- **One** lookahead row convolution layer
-- **One** fully-connected layer
-- **One** CTC-loss layer
-
-
-
-Figure 1. Architecture of the Deep Speech 2 Network.
-
-
-We don't have to stick to this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar networks with different depths might also work well. As in \[[1](#references)\], the authors use a different depth (e.g. 2-2-3-1-1-1) for the final experiments.
-
-Key ingredients about the layers:
-
-- **Data Layers**:
- - Frame sequences data of audio **spectrogram** (with FFT).
- - Token sequences data of **transcription** text (labels).
-  - These two types of sequences do not have the same lengths; thus, a CTC-loss layer is required.
-- **2D Convolution Layers**:
- - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
-  - With striding for only the first convolution layer.
- - No pooling for all convolution layers.
-- **Uni-directional RNNs**
- - Uni-directional + row convolution: for low-latency inference.
-  - Bi-directional, without row convolution: if we don't care about inference latency.
-- **Row convolution**:
- - For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs.
-  - Not necessary with bi-directional RNNs.
-  - "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels are shared across rows.
-- **Batch Normalization Layers**:
- - Added to all above layers (except for data and loss layer).
- - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
-
-
-
-| Required Components | PaddlePaddle Support | Need to Develop |
-|---|---|---|
-| Data Layer I (Spectrogram) | Not supported yet. | TBD (Task 3) |
-| Data Layer II (Transcription) | paddle.data_type.integer_value_sequence | - |
-| 2D Convolution Layer | paddle.layer.image_conv_layer | - |
-| DataType Converter (vec2seq) | paddle.layer.block_expand | - |
-| Bi-/Uni-directional RNNs | paddle.layer.recurrent_group | - |
-| Row Convolution Layer | Not supported yet. | TBD (Task 4) |
-| CTC-loss Layer | paddle.layer.warp_ctc | - |
-| Batch Normalization Layer | paddle.layer.batch_norm | - |
-| CTC-Beam search | Not supported yet. | TBD (Task 6) |
-
-### Row Convolution
-
-TODO by Assignees
-
-### Beam Search with CTC and LM
-
-
-
-Figure 2. Algorithm for CTC Beam Search Decoder.
-
-
-- The **Beam Search Decoder** for the DS2 CTC-trained network follows a similar approach to that in \[[3](#references)\], as shown in Figure 2, with two important modifications for the ambiguous parts:
-  - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, because one prefix may come from different paths;
-  - 2) the if condition ```if l^+ not in A_prev then``` after the probability computation is dropped, because it is hard to understand and seems unnecessary.
-- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space is appended in English decoding or any character is appended in Mandarin decoding.
-- Such an external scorer consists of a language model, a word count, or any other custom scorers.
-- The **language model** is built in Task 5, and its parameters should be carefully tuned to achieve the minimum WER/CER (c.f. Task 7).
-- This decoder needs to perform with **high efficiency** for the convenience of parameter tuning and real-world speech recognition.
-
-
-## Future Work
-
-- Efficiency Improvement
-- Accuracy Improvement
-- Low-latency Inference Library
-- Large-scale benchmarking
-
-## References
-
-1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
-2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595.
-3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
diff --git a/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg
deleted file mode 100644
index 8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg and /dev/null differ
diff --git a/doc/fluid/design/network/images/beam_search.png b/doc/fluid/design/network/images/beam_search.png
deleted file mode 100644
index 7f7e35f34223162d0f7f0ed97375909c43b830ae..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/network/images/beam_search.png and /dev/null differ
diff --git a/doc/fluid/design/network/images/ds2_network.png b/doc/fluid/design/network/images/ds2_network.png
deleted file mode 100644
index 1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/network/images/ds2_network.png and /dev/null differ
diff --git a/doc/fluid/design/network/index_cn.rst b/doc/fluid/design/network/index_cn.rst
deleted file mode 100644
index 3557d55fe4dbae1f712e0760ca15111ec6f6792d..0000000000000000000000000000000000000000
--- a/doc/fluid/design/network/index_cn.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-复杂网络设计
-------------
-
-.. toctree::
- :maxdepth: 1
-
- sequence_decoder.md
diff --git a/doc/fluid/design/network/index_en.rst b/doc/fluid/design/network/index_en.rst
deleted file mode 100644
index 73a7137236bdf0548d35721609351d6deca3013b..0000000000000000000000000000000000000000
--- a/doc/fluid/design/network/index_en.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Complex Network Design
-------------------------
-
-.. toctree::
- :maxdepth: 1
-
- sequence_decoder.md
diff --git a/doc/fluid/design/network/sequence_decoder.md b/doc/fluid/design/network/sequence_decoder.md
deleted file mode 100644
index b95773c50ca0dcbd1b93529332e035d4de90faa8..0000000000000000000000000000000000000000
--- a/doc/fluid/design/network/sequence_decoder.md
+++ /dev/null
@@ -1,229 +0,0 @@
-# Design: Sequence Decoder Generating LoDTensors
-In tasks such as machine translation and visual captioning,
-a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time.
-
-This documentation describes how to implement the sequence decoder as an operator.
-
-## Beam Search based Decoder
-The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
-
-In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search. Due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard for users to customize.
-
-There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users.
-
-During the refactoring of PaddlePaddle, some new concepts are proposed such as: [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** .
-
-For example, the RNN states, candidate IDs, and probabilities of beam search can all be represented as `LoDTensors`;
-the selected candidates' IDs in each time step can be stored in a `TensorArray` and `Packed` into the translated sentences.
-
-## Changing LoD's absolute offset to relative offsets
-The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level.
-
-The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
-let's call this format the **absolute-offset LoD** for clarity.
-
-The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences. For example, a two-level LoD is as follows:
-```python
-[[0, 3, 9]
- [0, 2, 3, 3, 3, 9]]
-```
-The first level tells that there are two sequences:
-- the first's offset is `[0, 3)`
-- the second's offset is `[3, 9)`
-
-while on the second level, there are several empty sequences that both begin and end at `3`.
-It is impossible to tell how many empty second-level sequences exist in the first-level sequences.
-
-There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix.
-
-So let's introduce another format of LoD.
-It stores **the offsets of the lower-level sequences** and is called **relative-offset** LoD.
-
-For example, to represent the same sequences of the above data
-
-```python
-[[0, 3, 6]
- [0, 2, 3, 3, 3, 9]]
-```
-
-the first level represents that there are two sequences;
-their offsets in the second-level LoD are `[0, 3)` and `[3, 5)`.
-
-The second level is the same as in the absolute-offset example because the lower level is a tensor.
-It is easy to see that the second sequence in the first-level LoD contains two empty sequences.
-
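-As a small illustration in plain Python (not PaddlePaddle code), the
-lengths of the lowest-level sequences can be recovered directly from
-the offsets, which makes the empty sequences explicit:
-
-```python
-# Offsets of the lowest level from the example above.
-offsets = [0, 2, 3, 3, 3, 9]
-
-# Consecutive differences give the length of each sequence.
-lengths = [offsets[i + 1] - offsets[i] for i in range(len(offsets) - 1)]
-print(lengths)  # [2, 1, 0, 0, 6] -- the two zeros are the empty sequences
-```
-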
-The following examples are based on relative-offset LoD.
-
-## Usage in a simple machine translation model
-Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it.
-
-The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences.
-
-**Encoder**
-```python
-import paddle as pd
-
-dict_size = 8000
-source_dict_size = dict_size
-target_dict_size = dict_size
-word_vector_dim = 128
-encoder_dim = 128
-decoder_dim = 128
-beam_size = 5
-max_length = 120
-
-# encoder
-src_word_id = pd.data(
-    name='source_language_word',
-    type=pd.data.integer_value_sequence(source_dict_size))
-src_embedding = pd.embedding(size=source_dict_size, dim=word_vector_dim)  # `dim` is a stand-in keyword for the embedding width
-
-src_word_vec = pd.lookup(src_embedding, src_word_id)
-
-encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
-
-encoder_ctx = pd.last_seq(encoder_out_seq)
-# encoder_ctx_proj is the learned semantic vector
-encoder_ctx_proj = pd.fc(
- encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
-```
-
-**Decoder**
-
-```python
-def generate():
- decoder = pd.while_loop()
- with decoder.step():
- decoder_mem = decoder.memory(init=encoder_ctx) # mark the memory
- generated_ids = decoder.memory() # TODO init to batch_size s
- generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s
-
-    target_word = pd.lookup(trg_embedding, generated_ids)
- # expand encoder_ctx's batch to fit target_word's lod
- # for example
- # decoder_mem.lod is
- # [[0 1 3],
- # [0 1 3 6]]
- # its tensor content is [a1 a2 a3 a4 a5]
- # which means there are 2 sentences to translate
- # - the first sentence has 1 translation prefixes, the offsets are [0, 1)
- # - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
- # the target_word.lod is
- # [[0, 1, 6]
-    #  [0, 2, 4, 7, 9, 12]]
-    # which means there are 2 sentences to translate, with 1 and 5 prefixes respectively
-    # the first prefix has 2 candidates
-    # the following prefixes have 2, 3, 2, 3 candidates
- # the encoder_ctx_expanded's content will be
- # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
- encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
- decoder_input = pd.fc(
- act=pd.activation.Linear(),
- input=[target_word, encoder_ctx_expanded],
- size=3 * decoder_dim)
- gru_out, cur_mem = pd.gru_step(
- decoder_input, mem=decoder_mem, size=decoder_dim)
- scores = pd.fc(
- gru_out,
-        size=target_dict_size,
- bias=None,
- act=pd.activation.Softmax())
-    # K is a configuration parameter
- topk_scores, topk_ids = pd.top_k(scores, K)
- topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
-
- selected_ids, selected_generation_scores = decoder.beam_search(
- topk_ids, topk_generated_scores)
-
- # update the states
- decoder_mem.update(cur_mem) # tells how to update state
- generated_ids.update(selected_ids)
- generated_scores.update(selected_generation_scores)
-
- decoder.output(selected_ids)
- decoder.output(selected_generation_scores)
-
-translation_ids, translation_scores = decoder()
-```
-`decoder.beam_search` is an operator that, given the candidates and the scores of the translations including those candidates,
-returns the result of the beam search algorithm.
-
-In this way, users can customize anything on the input or output of beam search, for example:
-
-1. Set the corresponding elements in `topk_generated_scores` to zero or some very small values so that beam_search will discard those candidates (see the sketch below).
-2. Remove specific candidates from `selected_ids`.
-3. Get the final `translation_ids` and remove unwanted translation sequences from it.
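-
-A runnable toy version of customization 1, using plain numpy arrays instead of LoDTensors (the `UNK_ID` and the scores below are made up for illustration):
-
-```python
-import numpy as np
-
-# push the score of an unwanted candidate (say an UNK token) to -inf so that
-# beam search never keeps it
-UNK_ID = 2
-topk_ids = np.array([[2, 7], [1, 2]])
-topk_generated_scores = np.array([[-0.5, -0.9], [-0.4, -1.2]])
-
-topk_generated_scores[topk_ids == UNK_ID] = -np.inf
-print(topk_generated_scores)
-# [[-inf -0.9]
-#  [-0.4 -inf]]
-```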
-
-The implementation of sequence decoder can reuse the C++ class: [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
-so the python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
-
-Both of them are two-level `LoDTensors`:
-
-- The first level represents `batch_size` of (source) sentences.
-- The second level represents the candidate ID sets for each translation prefix.
-
-For example, there may be 3 source sentences to translate, with 2, 3 and 1 candidates respectively.
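-
-A hypothetical relative-offset LoD for this case could look as follows (each source sentence owns one candidate set here; the sizes 2, 3 and 1 come from the second level):
-
-```python
-lod = [[0, 1, 2, 3],        # first level: 3 source sentences
-       [0, 2, 5, 6]]        # second level: candidate sets of sizes 2, 3 and 1
-```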
-
-Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
-
-For example, the previous state:
-
-* LoD is `[0, 1, 3][0, 2, 5, 6]`
-* content of tensor is `a1 a2 b1 b2 b3 c1`
-
-the current state is stored in `encoder_ctx_expanded`:
-
-* LoD is `[0, 2, 6][0 3 5 8 9 11 11]`
-* the content is
-  - a1 a1 a1 (a1 has 3 candidates, so its state is copied 3 times, once for each candidate)
- - a2 a2
- - b1 b1 b1
- - b2
- - b3 b3
- - None (c1 has 0 candidates, so c1 is dropped)
-
-The benefit of the relative-offset LoD is that empty candidate sets can be represented naturally.
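-
-A minimal numpy sketch of the expansion itself (the real `lod_expand` operator also rebuilds the LoD, which is omitted here; the candidate counts come from the example above):
-
-```python
-import numpy as np
-
-prev_state = np.array(["a1", "a2", "b1", "b2", "b3", "c1"])
-num_candidates = [3, 2, 3, 1, 2, 0]        # c1 has no candidates and is dropped
-
-expanded = np.repeat(prev_state, num_candidates)
-print(expanded)
-# ['a1' 'a1' 'a1' 'a2' 'a2' 'b1' 'b1' 'b1' 'b2' 'b3' 'b3']
-```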
-
-The state at each time step can be stored in a `TensorArray` and `Pack`ed into a final `LoDTensor`. The corresponding syntax is:
-
-```python
-decoder.output(selected_ids)
-decoder.output(selected_generation_scores)
-```
-
-The `selected_ids` are the candidate ids for the prefixes, and will be `Pack`ed by `TensorArray` into a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents the generated sequences.
-
-Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate.
-
-Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail element is the accumulated score of the corresponding complete translation.
-
-## LoD and shape changes during decoding
-
-
-
-
-According to the image above, the only phase that changes the LoD is beam search.
-
-## Beam search design
-The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs:
-
-1. `topk_ids`, the top K candidate ids for each prefix.
-2. `topk_scores`, the corresponding scores for `topk_ids`
-3. `generated_scores`, the score of the prefixes.
-
-All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
-
-It will return three variables (a toy sketch of one pruning step follows this list):
-
-1. `selected_ids`, the candidates finally selected by the beam search function for the next step.
-2. `selected_scores`, the scores for the candidates.
-3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
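-
-A toy sketch of one pruning step for a single source sentence, using flat numpy arrays instead of LoDTensors (the shapes, names and `beam_size` are illustrative only; the real operator additionally keeps the per-prefix LoD grouping):
-
-```python
-import numpy as np
-
-def beam_search_step(topk_ids, topk_scores, generated_scores, beam_size):
-    # total (log-)score of every (prefix, candidate) pair
-    total = (generated_scores[:, None] + topk_scores).reshape(-1)
-    keep = np.argsort(-total)[:beam_size]          # keep the best beam_size pairs
-    selected_ids = topk_ids.reshape(-1)[keep]
-    selected_scores = total[keep]
-    return selected_ids, selected_scores
-
-topk_ids = np.array([[3, 7], [1, 9]])              # 2 prefixes, top-2 candidates each
-topk_scores = np.log([[0.6, 0.4], [0.7, 0.3]])
-generated_scores = np.log([0.5, 0.5])              # scores of the 2 prefixes
-print(beam_search_step(topk_ids, topk_scores, generated_scores, beam_size=2))
-```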
-
-## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
-The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step,
-so it is natural to store them in arrays.
-
-Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`.
-
-The `Pack` and `UnPack` in `TensorArray` are used to pack the tensors in the array into one `LoDTensor` or split one `LoDTensor` into an array of tensors.
-Some extensions are needed to support packing or unpacking an array of `LoDTensors`.
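-
-A toy illustration of what `Pack` does conceptually: per-step results are regrouped into per-source-sentence sequences (the real implementation works on `LoDTensors` and also emits the resulting LoD; the dictionaries below are purely illustrative):
-
-```python
-# ids selected at each decoding step, keyed by source sentence
-steps = [
-    {"s0": [3], "s1": [7]},    # step 0
-    {"s0": [5], "s1": [2]},    # step 1
-]
-
-packed = {}
-for step in steps:
-    for sentence, ids in step.items():
-        packed.setdefault(sentence, []).extend(ids)
-
-print(packed)   # {'s0': [3, 5], 's1': [7, 2]}
-```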
diff --git a/doc/fluid/design/onnx/images/project_structure.png b/doc/fluid/design/onnx/images/project_structure.png
deleted file mode 100644
index ab1c2ff23cfff586516876684348bb15bd2084fc..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/onnx/images/project_structure.png and /dev/null differ
diff --git a/doc/fluid/design/onnx/onnx_convertor.md b/doc/fluid/design/onnx/onnx_convertor.md
deleted file mode 100644
index bc1665d7c33eb54cb63e5306a439c1ca67016d1e..0000000000000000000000000000000000000000
--- a/doc/fluid/design/onnx/onnx_convertor.md
+++ /dev/null
@@ -1,131 +0,0 @@
-# Background
-
-[ONNX (Open Neural Network Exchange)](https://github.com/onnx/onnx) bridges different deep learning frameworks by providing an open source graph format for models. The models trained in other frameworks can be converted into the ONNX format to execute inference by utilizing the built-in operators in ONNX - this is called a **frontend**. With the inverse conversion (called a **backend**), different frameworks can share any models supported by ONNX in principle. Now most mainstream frameworks have joined the ONNX community, e.g. Caffe2, PyTorch, and MXNet etc. And there is a momentum driving more and more vendors to begin supporting ONNX or even choose ONNX as the only machine learning runtime in their devices.
-
-Therefore, it is necessary to enable the conversion between PaddlePaddle and ONNX. This design doc is aimed at implementing a convertor, mainly for converting between **Fluid** models and ONNX (it is very likely that we may support older v2 models in the future). A complete convertor should be bidirectional - with a frontend AND a backend, but considering the importance, we will start with the frontend, i.e., converting Fluid models to ONNX models.
-
-
-# How it works
-
-ONNX has a [working list of operators](https://github.com/onnx/onnx/blob/master/docs/Operators.md) which is versioned.
-
-When prioritizing the implementation of a frontend over a backend, the choice of which Fluid -> ONNX operators to cover comes down to the choice of models to be supported (see the section `Supported models`). Eventually, this will allow us to reach a really wide coverage of all operators.
-
-Here are a few major considerations when it comes to converting models:
-
-- **Op-level conversion**: How to map the inputs, attributes, and outputs of each Paddle operator to those of the ONNX operator. In several cases, these require transformations. For each direction (frontend vs. backend), a different conversion mapping is needed.
-- **Parameters (weights) initialization**: Setting initial parameters on different nodes.
-- **Tensor data type mapping** (Note: Some ONNX data types are not supported in Fluid)
-- **Network representation adaption**: Fluid `ProgramDesc` include nested blocks. Since ONNX is free of nesting, the `ProgramDesc` ops need to be traversed to only include ops from the global scope in the root block. The variables used as inputs and outputs should also be in this scope.
-- **Model validation**: There are two kinds of validations that are necessary:
-  1. We need to ensure that the inference outputs of the ops run inside a model are the same as those produced when running the ONNX-converted ops through an alternative ONNX backend.
- 2. Checking to see if the generated nodes on the graph are validated by the internal ONNX checkers.
-- **Versioning**: ONNX versions its op listing. In fact, it has versioning on 3 different levels: ops, graphs, and ONNX models. This requires that we are conscious about versioning the convertor and updating tests and op convertor logic for each release. It also implies that we release pre-trained ONNX models upon each version release.
-
-One thing that makes this conversion more feasible in Fluid's case is the use of a static IR - the `ProgramDesc` - as opposed to a dynamic graph, as created in the cases of frameworks like PyTorch.
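-
-To make the op-level conversion mapping mentioned above concrete, here is a hedged sketch of what a frontend mapping entry could look like; the `FLUID_TO_ONNX` table, `convert_relu` and the way the Fluid op is queried are illustrative only and are not the convertor's actual API:
-
-```python
-from onnx import helper
-
-def convert_relu(fluid_op):
-    """Map a Fluid `relu` op to an ONNX `Relu` node (a simple 1:1 case)."""
-    return helper.make_node(
-        "Relu",
-        inputs=fluid_op.input("X"),      # list of input variable names
-        outputs=fluid_op.output("Out"))  # list of output variable names
-
-# ops with attribute or layout differences get their own modifier function
-FLUID_TO_ONNX = {
-    "relu": convert_relu,
-    # "mul": convert_mul,  # e.g. maps to ONNX MatMul plus extra handling
-}
-```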
-
-
-# Project structure
-
-
-
-
-
-The project contains four important parts:
-
-* **fluid**: The directory that contains wrappers for fluid related APIs. Fluid has provided some low-level APIs to parse or generate the inference model. However, directly using these low-level APIs makes the code tediously long. This module wraps low-level APIs to provide simplified interfaces.
-
-* **onnx**: This is a Python package provided by ONNX containing helpers for creating nodes, graphs, and eventually binary protobuf models with initializer parameters.
-
-* **onnx_fluid**: Contains two-way mapping (Fluid -> ONNX ops and ONNX -> Fluid ops). Called from `convert.py`, the program uses this mapping along with modifier functions to construct ONNX nodes with the help of ONNX's `make_node` helper. It also contains mapping between datatypes and tensor deprecation / amplification logic.
-
-* **convert.py**: The interface exposed to users. This will traverse the global program blocks/variables and construct the write-able model.
-
-
-# Usage
-The converter should be designed to be very easy to use. Bidirectional conversion between a Fluid inference model and an ONNX binary model will be supported. Model validation will also be provided to verify the correctness of the converted model.
-
-* Convert Fluid inference model to ONNX binary model
-
- ```
- python convert.py --fluid_model --onnx_model validate True
- ```
-
-* Validate the converted model
-
- ```
- python validate.py --fluid_model --onnx_model
- ```
-
-The conversion and model validation will be completed consecutively, finally outputting a readable description of the model structure. For the reverse conversion, users only need to exchange the input and output.
-
-
-# Challenges and mitigation
-
-## Cycles
-
-Cycles are unsupported in ONNX. In Paddle, the `while` op is the most prominent example of a cycle.
-
-*Resolution*: We won't support models with `while`s which can't be substituted until ONNX adds support for such ops.
-
-## Sequences
-
-Sequence processing operators like `sequence_expand`, `sequence_reshape`, `sequence_concat`, and `sequence_pool` are not supported by ONNX either, because ONNX does not support non-padded datatypes like LoDTensors.
-
-*Resolution*: Since the runtimes using our ONNX exported graphs won't be using LoDTensors in the first place, such sequence operators should be mapped to ONNX ops that will do the necessary transposing ops with the knowledge of the padding and shape of the Tensors.
-
-## Ops that can't easily be mapped
-
-There are ops that just aren't possible to map today:
-
-**Control flow operators**
-
-Paddle supports control flow ops like `If/Else` and `Switch` (if we ignore the CSP operations like `select` for now). ONNX has `If` support in the experimental phase.
-
-*Resolution*: Map Paddle's `If/Else` to ONNX's `If`, but ignore other control flow operators until ONNX brings support for them.
-
-
-**Non-existent in Fluid**
-
-There are several ONNX operators that are not available in Fluid today, e.g. `InstanceNormalization`, `RandomUniform`, `Unsqueeze`, etc.
-
-*Resolution*: For the initial phase, we can choose to not support ops that our models don't care for and are subsequently not available in Fluid. However, for ops that we think might be necessary for Fluid users also, we must implement them on our side and support the ONNX conversion to them. This list is TBD.
-
-
-**Concurrency**
-
-ONNX does not have any considerations for concurrency right now.
-
-*Resolution*: There are two ways to approach this:
-
-a. We choose to not support concurrent models.
-b. We only support `go_op`s (basically threads) shallowly. This could mean that we enqueue `go_op` ops prior to gradient calculations OR even prior to the entire graph, and that's it - since `go_op`s do not have support for backprop anyways. One of the core target use cases of `go_op`: batch reading - can be handled through this approach.
-
-
-**Overloaded in Fluid**
-
-There are ops in ONNX whose job can't be accomplished by a single corresponding Paddle operator (e.g. ), but only by a collection of operators.
-
-*Resolution*: Chain multiple Paddle operators.
-
-
-## Lack of LoDTensors
-
-As stated above, ONNX only supports simple Tensor values.
-
-*Resolution*: Deprecate to plain old numpy-able tensors.
-
-
-## Reconstruction from deprecated ONNX ops
-
-For higher-level Fluid ops, such as a few offered by the `nn` layer that do not have direct corresponding mappings but can be converted to ONNX by chaining a series of ops without cycles, it would be useful to map them back to the higher-level Fluid ops once converted back from the deprecated ONNX graphs.
-
-*Resolution*: Keep a record of the subgraphs produced by the Paddle -> ONNX deprecation. When converting back from ONNX, if we encounter identical subgraphs by doing a forward search, we can replace them with the matching higher-level Fluid op.
-
-
-# Supported models
-
-As mentioned above, potential risks may come from the conversion of sequence-related models, including the LoDTensor and the `if/else` and `while` operators. So a good choice is to focus on some important feedforward models first, then implement some simple recurrent models.
-
-- Feedforward models: common models selected in PaddleBook, e.g. VGG, ResNet and some other models proposed by application teams.
-- Recurrent models: language model, stacked LSTMs etc.
diff --git a/doc/fluid/design/others/auto_gradient_check.md b/doc/fluid/design/others/auto_gradient_check.md
deleted file mode 100644
index 773b7b6a767541f28c27f247c1ad8c9a8a2d0ccf..0000000000000000000000000000000000000000
--- a/doc/fluid/design/others/auto_gradient_check.md
+++ /dev/null
@@ -1,150 +0,0 @@
-## Auto Gradient Check Design
-
-## Background:
-- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges:
-  1. The backpropagation formula should be correct according to the forward computation.
-  2. The implementation of the above should be correct in C++.
-  3. It is difficult to prepare unbiased test data.
-
-- Auto gradient checking gets a numerical gradient using the forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
-  1. The numerical gradient checker only needs the forward operator.
-  2. The user only needs to prepare the input data for the forward Operator and does not need to worry about the backward Operator.
-
-## Mathematical Theory
-The following documents from Stanford have a detailed explanation of how to compute the numerical gradient and why it is useful.
-
-- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
-- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
-
-
-## Numerical Gradient Implementation
-### Python Interface
-```python
-def get_numerical_gradient(op,
- input_values,
- output_name,
- input_to_check,
- delta=0.005,
- local_scope=None):
- """
- Get Numerical Gradient for the input of an operator.
-
-    :param op: C++ operator instance; it can also be a network.
-    :param input_values: The input variables. Should be a dictionary whose key is
-                   the variable name and whose value is a numpy array.
- :param output_name: The final output variable name.
- :param input_to_check: The input variable with respect to which the gradient has to be computed.
- :param delta: The perturbation value for numerical gradient method. The
- smaller the delta, the more accurate the result. But if the delta is too
- small, it will suffer from the numerical stability problem.
- :param local_scope: The local scope used for get_numeric_gradient.
- :return: The gradient array in numpy format.
- """
-```
-
-### Explanation:
-
-- Why do we need an `output_name`
- - An Operator may have multiple Outputs, one can compute an independent gradient from each Output. So the caller should specify the name of the output variable.
-
-- Why do we need `input_to_check`
- - One operator can have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numerical Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times each with a different input.
-
-
-### Core Algorithm Implementation
-
-
-```python
- # we only compute the gradient of one element a time.
- # we use a for loop to compute the gradient of each element.
- for i in xrange(tensor_size):
- # get one input element using the index i.
- original = tensor_to_check.get_float_element(i)
-
- # add delta to it, run the forward op and then
- # get the new value of the result tensor.
- x_pos = original + delta
- tensor_to_check.set_float_element(i, x_pos)
- y_pos = get_output()
-
- # Subtract delta from this element, run the op again
- # and get the new value of the result tensor.
- x_neg = original - delta
- tensor_to_check.set_float_element(i, x_neg)
- y_neg = get_output()
-
- # restore old value
- tensor_to_check.set_float_element(i, original)
-
- # compute the gradient of this element and store
- # it into a numpy array.
- gradient_flat[i] = (y_pos - y_neg) / delta / 2
-
- # reshape the gradient result to the shape of the source tensor.
- return gradient_flat.reshape(tensor_to_check.get_dims())
-```
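-
-The same central-difference formula can be illustrated with a self-contained numpy snippet, using a plain Python function instead of a C++ operator (the names here are illustrative, not part of the framework):
-
-```python
-import numpy as np
-
-def numeric_gradient(f, x, delta=0.005):
-    """Approximate df/dx element-wise with central differences."""
-    grad = np.zeros_like(x)
-    flat_x, flat_g = x.reshape(-1), grad.reshape(-1)   # views into x and grad
-    for i in range(flat_x.size):
-        original = flat_x[i]
-        flat_x[i] = original + delta
-        y_pos = f(x)
-        flat_x[i] = original - delta
-        y_neg = f(x)
-        flat_x[i] = original                            # restore the old value
-        flat_g[i] = (y_pos - y_neg) / delta / 2
-    return grad
-
-x = np.random.rand(3).astype("float64")
-f = lambda t: np.sum(t ** 2)                            # analytic gradient: 2 * t
-print(np.allclose(numeric_gradient(f, x), 2 * x, atol=1e-4))   # True
-```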
-
-## Auto Gradient Check Framework
-
-Each Operator Kernel has three kinds of Gradient:
-
-1. Numerical gradient
-2. CPU kernel gradient
-3. GPU kernel gradient (if supported by the device)
-
-The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps:
-
-1. Calculate the numerical gradient
-2. Calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient.
-3. Calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient. (if supported)
-
-#### Python Interface
-
-```python
- def check_grad(self,
- forward_op,
- input_vars,
- inputs_to_check,
- output_name,
- no_grad_set=None,
- only_cpu=False,
- max_relative_error=0.005):
- """
- :param forward_op: used to create backward_op
- :param input_vars: numpy value of input variable. The following
- computation will use these variables.
- :param inputs_to_check: the input variable with respect to which the
- gradient will be computed.
- :param output_name: The final output variable name.
- :param max_relative_error: The relative tolerance parameter.
- :param no_grad_set: used to create backward ops
- :param only_cpu: only compute and check gradient on cpu kernel.
- :return:
- """
-```
-
-### How to check if two numpy arrays are close enough?
-If `abs_numerical_grad` is nearly zero, we use the absolute error for `numerical_grad` instead of the relative error.
-
-```python
-numerical_grad = ...
-operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
-
-abs_numerical_grad = numpy.abs(numerical_grad)
-# if abs_numerical_grad is nearly zero, then use abs error for
-# numeric_grad, instead of relative error.
-abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
-
-diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad
-max_diff = numpy.max(diff_mat)
-```
-
-
-#### Notes:
-The input data for the auto gradient checker should be reasonable to avoid numerical stability problems.
-
-
-#### References:
-
-- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
-- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
diff --git a/doc/fluid/design/others/dcgan.png b/doc/fluid/design/others/dcgan.png
deleted file mode 100644
index 15e8e290a111ff43900934341365cb4360d87d28..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/others/dcgan.png and /dev/null differ
diff --git a/doc/fluid/design/others/gan_api.md b/doc/fluid/design/others/gan_api.md
deleted file mode 100644
index 7167470088766985fa5ad31657410309330fd725..0000000000000000000000000000000000000000
--- a/doc/fluid/design/others/gan_api.md
+++ /dev/null
@@ -1,253 +0,0 @@
-# Design for GAN
-
-GAN (Generative Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and is widely used in many areas.
-
-It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth.
-
-In our GAN design, we wrap it as a user-friendly, easily customized Python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
-
-
-
-Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
-
-
-The operators, layers and functions required/optional to build a GAN demo are summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
-
-
-
-Figure 2. Photo borrowed from the original DC-GAN paper.
-
-
-## The Conditional-GAN might be a class.
-In this design, we adopt the popular open source designs in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains the following data structures:
-
-- DCGAN(object): contains everything required to build a GAN model. It provides the following member functions as its API:
-
-- __init__(...): Initialize hyper-parameters (like conv dimension and so forth), and declare model parameters of discriminator and generator as well.
-
-- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen.
-Returns a generated image.
-
-- discriminator(image):
-Given an image, decide if it is from a real source or a fake one.
-Returns a 0/1 binary label.
-
-- build_model(self):
-Build the whole GAN model and define the training losses for both the generator and the discriminator.
-
-## Discussion on Engine Functions required to build GAN
-- Trace the tensor and variable dependency in the engine executor. (Very critical, otherwise GAN can't be trained correctly.)
-- Different optimizers are responsible for optimizing different losses.
-
-To be more detailed, we introduce our design of DCGAN as follows:
-
-### Class member Function: Initializer
-- Set up hyper-parameters, including the conditional dimension, noise dimension, batch size and so forth.
-- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G.
-```python
-class DCGAN(object):
- def __init__(self, y_dim=None):
-
- # hyper parameters
- self.y_dim = y_dim # conditional gan or not
- self.batch_size = 100
- self.z_dim = z_dim # input noise dimension
-
- # define parameters of discriminators
- self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer())
- self.D_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data
- self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
- self.D_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data
-    self.D_W2 = pd.Variable(np.random.rand(128, 1))
- self.D_b2 = pd.Variable(np.zeros(128))
- self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2]
-
- # define parameters of generators
- self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
- self.G_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data
- self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
- self.G_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data
-    self.G_W2 = pd.Variable(np.random.rand(128, 1))
- self.G_b2 = pd.Variable(np.zeros(128))
- self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2]
-```
-
-### Class member Function: Generator
-- Given a noisy input z, returns a fake image.
-- Concatenation, batch-norm, FC operations required;
-- Deconv layer required, which is missing now...
-```python
-class DCGAN(object):
- def generator(self, z, y = None):
- # input z: the random noise
- # input y: input data label (optional)
- # output G_im: generated fake images
-
-    if self.y_dim:  # concatenate the label for the conditional GAN
-      z = pd.layer.concat(1, [z, y])
-
-    G_h0 = pd.layer.fc(z, self.G_W0, self.G_b0)
- G_h0_bn = pd.layer.batch_norm(G_h0)
- G_h0_relu = pd.layer.relu(G_h0_bn)
-
-    G_h1 = pd.layer.deconv(G_h0_relu, self.G_W1, self.G_b1)
- G_h1_bn = pd.layer.batch_norm(G_h1)
- G_h1_relu = pd.layer.relu(G_h1_bn)
-
-    G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2)
-    G_im = pd.layer.tanh(G_h2)
- return G_im
-```
-
-### Class member function: Discriminator
-- Given an input image, returns a binary logit indicating whether it is real or fake.
-- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required;
-```python
-class DCGAN(object):
- def discriminator(self, image):
- # input image: either generated images or real ones
- # output D_h2: binary logit of the label
-
-    D_h0 = pd.layer.conv2d(image, w=self.D_W0, b=self.D_b0)
-    D_h0_bn = pd.layer.batchnorm(D_h0)
-    D_h0_relu = pd.layer.lrelu(D_h0_bn)
-
-    D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_W1, b=self.D_b1)
- D_h1_bn = pd.layer.batchnorm(D_h1)
- D_h1_relu = pd.layer.lrelu(D_h1_bn)
-
-    D_h2 = pd.layer.fc(D_h1_relu, w=self.D_W2, b=self.D_b2)
- return D_h2
-```
-
-### Class member function: Build the model
-- Define data readers as placeholders to hold the data;
-- Build generator and discriminators;
-- Define two training losses for discriminator and generator, respectively.
-If we have an execution dependency engine to back-trace all tensors, the module that builds our GAN model will look like this:
-```python
-class DCGAN(object):
- def build_model(self):
- if self.y_dim:
- self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
- self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
- self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
- self.z = pd.data(tf.float32, [None, self.z_size])
-
- # step 1: generate images by generator, classify real/fake images with discriminator
- if self.y_dim: # if conditional GAN, includes label
- self.G = self.generator(self.z, self.y)
- self.D_t = self.discriminator(self.images)
- # generated fake images
- self.sampled = self.sampler(self.z, self.y)
- self.D_f = self.discriminator(self.G)
- else: # original version of GAN
- self.G = self.generator(self.z)
- self.D_t = self.discriminator(self.images)
- # generate fake images
- self.sampled = self.sampler(self.z)
-      self.D_f = self.discriminator(self.G)
-
- # step 2: define the two losses
-    self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)))
-    self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)))
-    self.d_loss = self.d_loss_real + self.d_loss_fake
-
-    self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_size)))
-```
-
-If we do not have dependency engine but blocks, the module building our GAN model will be like this:
-```python
-class DCGAN(object):
- def build_model(self, default_block):
- # input data in the default block
- if self.y_dim:
- self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
- self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
- # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
- self.z = pd.data(tf.float32, [None, self.z_size])
-
- # step 1: generate images by generator, classify real/fake images with discriminator
- with pd.default_block().g_block():
- if self.y_dim: # if conditional GAN, includes label
- self.G = self.generator(self.z, self.y)
- self.D_g = self.discriminator(self.G, self.y)
- else: # original version of GAN
- self.G = self.generator(self.z)
- self.D_g = self.discriminator(self.G, self.y)
-      self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_size)))
-
- with pd.default_block().d_block():
- if self.y_dim: # if conditional GAN, includes label
- self.D_t = self.discriminator(self.images, self.y)
- self.D_f = self.discriminator(self.G, self.y)
- else: # original version of GAN
- self.D_t = self.discriminator(self.images)
- self.D_f = self.discriminator(self.G)
-
- # step 2: define the two losses
-      self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)))
-      self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)))
-      self.d_loss = self.d_loss_real + self.d_loss_fake
-```
-Some small confusions and problems with this design:
-- D\_g and D\_f are actually the same thing, but have to be written twice; i.e., if we want to run two sub-graphs that conceptually share code, the same code has to be written twice.
-- It requires the ability to create a block anywhere, rather than only inside if-else or RNN constructs.
-
-## Main function for the demo:
-Generally, the user of GAN just needs to do the following things:
-- Define a DCGAN object;
-- Build the DCGAN model;
-- Specify two optimizers for two different losses with respect to different parameters.
-```python
-# pd for short, should be more concise.
-import paddle.v2 as pd
-import numpy as np
-import logging
-
-if __name__ == "__main__":
- # dcgan class in the default graph/block
- # if we use dependency engine as tensorflow
- # the codes, will be slightly different like:
- # dcgan = DCGAN()
- # dcgan.build_model()
- with pd.block() as def_block:
- dcgan = DCGAN()
- dcgan.build_model(def_block)
-
- # load mnist data
-  data_X, data_y = load_mnist()
-
- # Two subgraphs required!!!
- with pd.block().d_block():
- d_optim = pd.train.Adam(lr = .001, beta= .1)
- d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D)
-  with pd.block().g_block():
-    g_optim = pd.train.Adam(lr = .001, beta= .1)
-    g_step = g_optim.minimize(dcgan.g_loss, dcgan.theta_G)
-
- # executor
- sess = pd.executor()
-
- # training
- for epoch in xrange(10000):
- for batch_id in range(N / batch_size):
- idx = ...
- # sample a batch
- batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size]
- # sample z
- batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
-
- if batch_id % 2 == 0:
- sess.run(d_step,
- feed_dict = {dcgan.images: batch_im,
- dcgan.y: batch_label,
- dcgan.z: batch_z})
- else:
- sess.run(g_step,
- feed_dict = {dcgan.z: batch_z})
-```
-
-# More thinking about dependency engine vs. block design:
-- What if we just want to run an intermediate result? Do we need to run the whole block/graph?
-- Should we call eval() to get the fake images in the first stage? And then train the discriminator in the second stage?
diff --git a/doc/fluid/design/others/graph.md b/doc/fluid/design/others/graph.md
deleted file mode 100644
index 7519a65df835a39fe14f6ef45530afff170191ff..0000000000000000000000000000000000000000
--- a/doc/fluid/design/others/graph.md
+++ /dev/null
@@ -1,70 +0,0 @@
-# Design Doc: Computations as a Graph
-
-A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before.
-
-This document explains the construction of a graph in three steps:
-
-- construct the forward part
-- construct the backward part
-- construct the optimization part
-
-## The Construction of a Graph
-
-Let us take the problem of image classification as a simple example. The application program that trains the model looks like:
-
-```python
-x = layer.data("images")
-l = layer.data("label")
-y = layer.fc(x)
-cost = layer.mse(y, l)
-optimize(cost)
-train(cost, reader=mnist.train())
-```
-
-### Forward Part
-
-The first four lines of above program build the forward part of the graph.
-
-![](images/graph_construction_example_forward_only.png)
-
-In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x. `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators.
-
-Initialization operators are kind of "run-once" operators -- the `Run` method increments a class data member counter so that it runs at most once. By doing so, a parameter wouldn't be initialized repeatedly, say, in every minibatch.
-
-In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`. These protobuf messages are saved in a `BlockDesc` protobuf message.
-
-### Backward Part
-
-The fifth line `optimize(cost)` calls two functions, `ConstructBackwardGraph` and `ConstructOptimizationGraph`.
-
-`ConstructBackwardGraph` traverses the forward graph in the `BlockDesc` protobuf message and builds the backward part.
-
-![](images/graph_construction_example_forward_backward.png)
-
-According to the chain rule of gradient computation, `ConstructBackwardGraph` would (see the sketch after this list)
-
-1. create a gradient operator G for each operator F,
-1. make all inputs, outputs, and outputs' gradient of F as inputs of G,
-1. create gradients for all inputs of F, except for those who don't have gradients, like x and l, and
-1. make all these gradients as outputs of G.
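-
-A hedged sketch of these four rules on dictionary-based op descriptions (the `@GRAD` suffix and the dictionary schema are illustrative, not the real `OpDesc` format):
-
-```python
-def grad_name(var):
-    return var + "@GRAD"
-
-def construct_backward_graph(forward_ops, no_grad_vars):
-    backward_ops = []
-    for op in reversed(forward_ops):                     # rule 1: one G per F
-        backward_ops.append({
-            "type": op["type"] + "_grad",
-            # rule 2: inputs, outputs and output gradients of F feed G
-            "inputs": op["inputs"] + op["outputs"]
-                      + [grad_name(v) for v in op["outputs"]],
-            # rules 3 and 4: gradients of F's inputs are outputs of G,
-            # except for variables that need no gradient (like x and l)
-            "outputs": [grad_name(v) for v in op["inputs"]
-                        if v not in no_grad_vars],
-        })
-    return backward_ops
-
-fc = {"type": "fc", "inputs": ["x", "W", "b"], "outputs": ["y"]}
-mse = {"type": "mse", "inputs": ["y", "l"], "outputs": ["cost"]}
-print(construct_backward_graph([fc, mse], no_grad_vars={"x", "l"}))
-```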
-
-### Optimization Part
-
-For each parameter, like W and b created by `layer.fc`, marked as double circles in the graphs above, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient. This results in the complete graph:
-
-![](images/graph_construction_example_all.png)
-
-## Block and Graph
-
-The words block and graph are interchangeable in the design of PaddlePaddle. A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphor for the code and local variables inside a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block.
-
-A Block keeps operators in an array `BlockDesc::ops`
-
-```protobuf
-message BlockDesc {
- repeated OpDesc ops = 1;
- repeated VarDesc vars = 2;
-}
-```
-
-in the order that they appear in user programs, like the Python program at the beginning of this article. We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators.
diff --git a/doc/fluid/design/others/graph_survey.md b/doc/fluid/design/others/graph_survey.md
deleted file mode 100644
index 6c6db08f463ae0a2b94fc4546f123a1d7c151870..0000000000000000000000000000000000000000
--- a/doc/fluid/design/others/graph_survey.md
+++ /dev/null
@@ -1,232 +0,0 @@
-## Survey on Graph
-
-Neural network frameworks often provide a symbolic API for users to write network topologies conveniently. This doc mainly focuses on the symbolic APIs of the most popular neural network frameworks, and tries to find out how to parse a symbolic configuration into a portable file, such as protobuf or JSON.
-
-### Mxnet
-
-The core concept of the symbolic API is `Symbol`. MXNet implements the `Symbol` class in C++ and exports it to Python using the C API. Please refer to the comments in MXNet:
-
-
-`Symbol` is a helper class used to represent the operator node in a Graph.
-`Symbol` acts as an interface for building graphs from different components like Variable, Functor and Group. `Symbol` is also exported to the Python front-end (while Graph is not) to enable quick testing and deployment. Conceptually, a symbol is the final operation of a graph and thus includes all the information (the graph) required to evaluate its output value.
-
-
-A simple network topology written with Symbol is as follows:
-
-```python
-def get_symbol(num_classes=10, **kwargs):
- data = mx.symbol.Variable('data')
- data = mx.symbol.Flatten(data=data)
- fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
- act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
- fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
- act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
- fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes)
- mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax')
- return mlp
-```
-
-
-
-A Variable here is actually a Symbol. Every basic Symbol corresponds to one Node, and every Node has its own NodeAttr. There is an op field in the NodeAttr class; when a Symbol represents a Variable (often input data), the op field is null.
-
-Symbol contains a data member, `std::vector<NodeEntry> outputs`, and NodeEntry contains a pointer to Node. We can follow the Node pointers to traverse the whole Graph.
-
-And a Symbol can be saved to a JSON file.
-
-Here is a detailed example:
-
-```
->>> import mxnet as mx
->>> data = mx.symbol.Variable('data')
->>> print data.debug_str()
-Variable:data
-
->>> data = mx.symbol.Flatten(data=data)
->>> print data.debug_str()
-Symbol Outputs:
- output[0]=flatten0(0)
-Variable:data
---------------------
-Op:Flatten, Name=flatten0
-Inputs:
- arg[0]=data(0) version=0
-
->>> fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
->>> print fc1.debug_str()
-Symbol Outputs:
- output[0]=fc1(0)
-Variable:data
---------------------
-Op:Flatten, Name=flatten0
-Inputs:
- arg[0]=data(0) version=0
-Variable:fc1_weight
-Variable:fc1_bias
---------------------
-Op:FullyConnected, Name=fc1
-Inputs:
- arg[0]=flatten0(0)
- arg[1]=fc1_weight(0) version=0
- arg[2]=fc1_bias(0) version=0
-Attrs:
- num_hidden=128
-
-```
-
-
-### TensorFlow
-
-
-The core concept of symbolic API is `Tensor`. Tensorflow defines `Tensor` in Python. Please refer to the comments in TensorFlow:
-
-A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. It does not hold the values of that operation's output, but instead provides a means of computing those values in a TensorFlow [Session](https://www.tensorflow.org/api_docs/python/tf/Session).
-
-A simple example is as follows:
-
-```python
- # Build a dataflow graph.
- c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
- d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
- e = tf.matmul(c, d)
-
- # Construct a `Session` to execute the graph.
- sess = tf.Session()
-
- # Execute the graph and store the value that `e` represents in `result`.
- result = sess.run(e)
-```
-
-
-The main method of `Tensor` is as follows:
-
-
-```python
-@property
-def op(self):
- """The `Operation` that produces this tensor as an output."""
- return self._op
-
-@property
-def dtype(self):
- """The `DType` of elements in this tensor."""
- return self._dtype
-
-@property
-def graph(self):
- """The `Graph` that contains this tensor."""
- return self._op.graph
-
-@property
-def name(self):
- """The string name of this tensor."""
- if not self._op.name:
- raise ValueError("Operation was not named: %s" % self._op)
- return "%s:%d" % (self._op.name, self._value_index)
-
-@property
-def device(self):
- """The name of the device on which this tensor will be produced, or None."""
- return self._op.device
-```
-
-
-A Tensor can be taken as a target to run by a session. A Tensor contains all the information of the Graph, and it tracks data dependencies.
-
-
-Here is a detailed example:
-
-
-```
->>> import tensorflow as tf
->>> c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
->>> print c.graph
-
->>> d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
->>> print d.graph
-
->>> e = tf.matmul(c, d)
->>> print e.graph
-
-```
-
-### Dynet
-
-
-The core concept of symbolic API is `Expression`, and Dynet defines `Expression` class in C++.
-
-
-A simple example is as follows:
-
-```cpp
-ComputationGraph cg;
-Expression W = parameter(cg, pW);
-
-Expression in = input(cg, xs[i]);
-Expression label = input(cg, ys[i]);
-Expression pred = W * in;
-Expression loss = square(pred - label);
-```
-
-The input data and parameters are also represented by Expressions. Every basic Expression corresponds to a Node, and input data is also a Node.
-
-Expression has a data member ComputationGraph, and the ComputationGraph is modified as the user configures the network. An Expression can be a running target, because an Expression contains all of its dependencies.
-
-
-Here is a detailed example:
-
-Write the topology in C++:
-
-```
-ComputationGraph cg;
-Expression W = parameter(cg, pW);
-cg.print_graphviz();
-
-Expression pred = W * xs[i];
-cg.print_graphviz();
-
-Expression loss = square(pred - ys[i]);
-cg.print_graphviz();
-```
-
-Compile and print:
-
-```
-# first print
-digraph G {
- rankdir=LR;
- nodesep=.05;
- N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
-}
-# second print
-digraph G {
- rankdir=LR;
- nodesep=.05;
- N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
- N1 [label="v1 = v0 * -0.98"];
- N0 -> N1;
-}
-# third print
-digraph G {
- rankdir=LR;
- nodesep=.05;
- N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
- N1 [label="v1 = v0 * -0.98"];
- N0 -> N1;
- N2 [label="v2 = -1.88387 - v1"];
- N1 -> N2;
- N3 [label="v3 = -v2"];
- N2 -> N3;
- N4 [label="v4 = square(v3)"];
- N3 -> N4;
-}
-```
-
-### Conclusion
-
-
-Actually, Symbol/Tensor/Expression in MXNet/TensorFlow/Dynet are concepts at the same level. We use the unified name Expression here; this concept has the following features:
-
-- Users write the topology with a symbolic API, and all return values are Expressions, including input data and parameters.
-- An Expression corresponds to a global Graph, and Expressions can also be composed.
-- An Expression tracks all of its dependencies and can be taken as a run target.
diff --git a/doc/fluid/design/others/images/graph_construction_example.bash b/doc/fluid/design/others/images/graph_construction_example.bash
deleted file mode 100755
index 35e6997abd17588e17a82d448918fc1b3bd7220e..0000000000000000000000000000000000000000
--- a/doc/fluid/design/others/images/graph_construction_example.bash
+++ /dev/null
@@ -1,11 +0,0 @@
-cat ./graph_construction_example.dot | \
- sed 's/color=red/color=red, style=invis/g' | \
- sed 's/color=green/color=green, style=invis/g' | \
- dot -Tpng > graph_construction_example_forward_only.png
-
-cat ./graph_construction_example.dot | \
- sed 's/color=green/color=green, style=invis/g' | \
- dot -Tpng > graph_construction_example_forward_backward.png
-
-cat ./graph_construction_example.dot | \
- dot -Tpng > graph_construction_example_all.png
diff --git a/doc/fluid/design/others/images/graph_construction_example.dot b/doc/fluid/design/others/images/graph_construction_example.dot
deleted file mode 100644
index e115f9844bae6ad24f638c8ed4749cea8aff06a9..0000000000000000000000000000000000000000
--- a/doc/fluid/design/others/images/graph_construction_example.dot
+++ /dev/null
@@ -1,68 +0,0 @@
-digraph ImageClassificationGraph {
- ///////// The forward part /////////
- FeedX [label="Feed", color=blue, shape=box];
- FeedY [label="Feed", color=blue, shape=box];
- InitW [label="Init", color=blue, shape=diamond];
- Initb [label="Init", color=blue, shape=diamond];
- FC [label="FC", color=blue, shape=box];
- MSE [label="MSE", color=blue, shape=box];
-
- x [label="x", color=blue, shape=oval];
- l [label="l", color=blue, shape=oval];
- y [label="y", color=blue, shape=oval];
- W [label="W", color=blue, shape=doublecircle];
- b [label="b", color=blue, shape=doublecircle];
- cost [label="cost", color=blue, shape=oval];
-
- FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
- FeedY -> l [color=blue];
- InitW -> W [color=blue];
- Initb -> b [color=blue];
- W -> FC [color=blue];
- b -> FC [color=blue];
- l -> MSE [color=blue];
-
- ////////// The backward part /////////
- MSE_Grad [label="MSE_grad", color=red, shape=box];
- FC_Grad [label="FC_grad", color=red, shape=box];
-
- d_cost [label="d cost", color=red, shape=oval];
- d_y [label="d y", color=red, shape=oval];
- d_b [label="d b", color=red, shape=oval];
- d_W [label="d W", color=red, shape=oval];
-
- cost -> MSE_Grad [color=red];
- d_cost -> MSE_Grad [color=red];
- l -> MSE_Grad [color=red];
- y -> MSE_Grad -> d_y [color=red];
-
- x -> FC_Grad [color=red];
- y -> FC_Grad [color=red];
- d_y -> FC_Grad [color=red];
- W -> FC_Grad -> d_W [color=red];
- b -> FC_Grad -> d_b [color=red];
-
- ////////// The optimizaiton part //////////
-
- OPT_W [label="SGD", color=green, shape=box];
- OPT_b [label="SGD", color=green, shape=box];
-
- W -> OPT_W [color=green];
- b -> OPT_b [color=green];
- d_W -> OPT_W -> W [color=green];
- d_b -> OPT_b -> b [color=green];
-
- ////////// Groupings //////////
-
- subgraph clusterMSE {
- style=invis;
- MSE;
- MSE_Grad;
- }
-
- subgraph clusterFC {
- style=invis;
- FC;
- FC_Grad;
- }
-}
diff --git a/doc/fluid/design/others/images/graph_construction_example_all.png b/doc/fluid/design/others/images/graph_construction_example_all.png
deleted file mode 100644
index 261611a5721f9aa97874f7e6d897fe48cf667db2..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/others/images/graph_construction_example_all.png and /dev/null differ
diff --git a/doc/fluid/design/others/images/graph_construction_example_forward_backward.png b/doc/fluid/design/others/images/graph_construction_example_forward_backward.png
deleted file mode 100644
index 4c69687f4a6a181138f3df72ce5e8aa48487b5be..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/others/images/graph_construction_example_forward_backward.png and /dev/null differ
diff --git a/doc/fluid/design/others/images/graph_construction_example_forward_only.png b/doc/fluid/design/others/images/graph_construction_example_forward_only.png
deleted file mode 100644
index e668c16e0cac73acb4e5dc2b1827557ae77126b4..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/others/images/graph_construction_example_forward_only.png and /dev/null differ
diff --git a/doc/fluid/design/others/parameters_in_cpp.md b/doc/fluid/design/others/parameters_in_cpp.md
deleted file mode 100644
index a7ac3f17c44ca94a669a8f1e283b291bceb42317..0000000000000000000000000000000000000000
--- a/doc/fluid/design/others/parameters_in_cpp.md
+++ /dev/null
@@ -1,41 +0,0 @@
-# Design Doc: The C++ Class `Parameters`
-
-`Parameters` is a concept we designed in PaddlePaddle V2 API. `Parameters` is a container of parameters, which makes PaddlePaddle capable of sharing parameter between topologies. We described usages of `Parameter` in [api.md](./api.md).
-
-We used Python to implement Parameters when designing the V2 API before. There are several defects in the current implementation:
-* We just use `memcpy` to share Parameters between topologies, but this is very inefficient.
-* We do not support sharing Parameters while training. We just trigger `memcpy` when training starts.
-
-It is necessary that we implement Parameters on the C++ side. However, it could result in a code refactoring of PaddlePaddle, because PaddlePaddle was designed to train only one topology before, i.e., each GradientMachine contains its Parameters as a data member. In the current PaddlePaddle implementation, there are three concepts associated with `Parameters`:
-
-1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`.
-It is evident that we should use `paddle::Parameter` when developing `Parameters`.
-However, the `Parameter` class contains many functions and does not have a clear interface.
-It contains `create/store Parameter`, `serialize/deserialize`, `optimize (i.e., SGD)`, and `randomize/zero`.
-When developing `Parameters`, we only use the `create/store Parameter` functionality.
-We should extract the functionalities of Parameter into several classes to clean up the PaddlePaddle C++ implementation.
-
-2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`.
-We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies.
-Also, we should handle multi-GPU/CPU training, because `forward` and `backward` would run on multiple GPUs and CPUs.
-`Parameters` should dispatch the parameter value to each device, and gather the parameter gradient from each device.
-
-3. `paddle::ParameterUpdater`. The ParameterUpdater is used to update parameters in Paddle.
-So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD).
-
-
-The step-by-step approach for implementing Parameters in the PaddlePaddle C++ core is listed below. Each step should be a PR and can be merged into PaddlePaddle one by one.
-
-1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters.
-
-2. Implement a `Parameters` class. It just stores the `paddle::Parameter`s inside. Make `GradientMachine` use `Parameters` as a class member.
-
-3. Make `Parameters` support Multi-CPU and Multi-GPU training to prepare for sharing `Parameter` between topologies.
-Because we need to share `Parameters` between topologies, it is `Parameters`' responsibility to exchange Parameters between GPUs.
-`GradientMachine` should not handle how to exchange Parameters because `GradientMachine` is only used to train one topology and we need to support training many topologies in Paddle, i.e., there could be many GradientMachines using one `Parameters`.
- * We should use a global function to exchange Parameters between GPUs, not a member function in `Parameters`. The `MultiGradientMachine` invoke this function, which uses `Parameters` as this function inputs.
- * The MultiGradientMachine contains many functionalities. Extracting the Parameters exchanging logic could make MultiGradientMachine clearer and simpler.
-
-4. Make `Parameters` as an argument for `forward/backward` function, not a data member for `GradientMachine`. For example, `forward` could be `forward(const Parameters& params, ...)` and `backward` could be `backward(Parameters* params, ...)`. After this step, Paddle could share `Parameters` between topologies.
-
-5. `ParameterUpdater` is invoked by `GradientMachine` and `Trainer`, but it updates `Parameters`. At the end of this code refactoring, we could change `ParameterUpdater` to use `Parameters` directly, to make `ParameterUpdater`'s implementation clear.
diff --git a/doc/fluid/design/others/simple_op_design.md b/doc/fluid/design/others/simple_op_design.md
deleted file mode 100644
index c7aeed7f9b4637e1c29d530f37b42d12500af82f..0000000000000000000000000000000000000000
--- a/doc/fluid/design/others/simple_op_design.md
+++ /dev/null
@@ -1,202 +0,0 @@
-## Interaction between C++ and Python
-
-Users employ the Python API to describe their own networks; however, the network construction actually happens in C++, so Protobuf is introduced to send messages between Python and C++.
-
-The Interaction between Python and C++ can be simplified as two steps:
-
-1. C++ tells Python how many Ops there are and what parameters users need to offer to initialize a new Op. Python then builds an API for each Op at compile time.
-
-2. Users invoke APIs built by Python and provide necessary parameters. These parameters will be sent to C++ for finishing the Op construction task.
-
-### Message from C++ to Python
-
-We define a Protobuf message class `OpProto` to hold the message needed in the first step. What should an `OpProto` contain? This question is equivalent to “What message do we need to offer, to build a Python API which is legal, user oriented, and can be used to describe a whole Op?”
-
-The following messages are necessary:
-
-1. Op's name, and its simple comment.
-2. Input and output variable number; each variable's name, type, and comment.
-3. Op's attributes; each attribute includes name, type, comment, **default value** and **value range**.
-
-So `OpProto` can be defined as follows:
-
-```proto
-enum AttrType {
- INT = 1;
- FLOAT = 2;
- STRING = 3;
- INTS = 4;
- FLOATS = 5;
- STRINGS = 6;
-};
-
-message AttrValue {
- AttrType type = 1;
- optional int iv = 2;
- optional float fv = 3;
- optional string sv = 4;
- repeated int ivs = 5;
- repeated float fvs = 6;
- repeated string svs = 7;
-};
-
-message AttrProto {
- required string name = 1;
- required string comment = 2;
- required AttrType type = 3;
-};
-
-message VarProto {
- required string name = 1;
- required string comment = 2;
- required bool is_tensor = 3;
-};
-
-message OpProto {
- repeated VarProto inputs = 1;
- repeated VarProto outputs = 2;
- repeated AttrProto attrs = 3;
- required string type = 4;
- required string comment = 5;
-};
-```
-
-To generate Python code automatically:
-
-```python
-def create_python_ops_creatation_functions():
- op_protos = paddle.framework.OpRegistry.get_all_op_proto()
- for type_name in op_protos:
- op_proto = op_protos[type_name]
- def __impl__(**kwargs): # User must use key word args in Paddle API
- inputs = [kwargs.get(ipt.name, "") for ipt in op_proto.inputs]
- outputs = [kwargs.get(opt.name, "") for opt in op_proto.outputs]
- attrs = [cast_to_op_attr(attr, kwargs.get(attr.name, None)) for attr in op_proto.attrs]
- opdesc = (input, outputs, type_name, attrs)
- return paddle.framework.OpRegistry.CreateOp(opdesc)
- __impl__.__doc__ = create_doc_string(op_proto)
- globals()[type_name] = __impl__
-
-create_python_ops_creatation_functions()
-```
-
-### Message from Python to C++
-
-To hold message needed in the above second step, we define Protobuf message class `OpDesc`. It is used to hold user-specified parameters in Op describing.
-
-```proto
-message OpDesc {
- required string type = 1;
- repeated string inputs = 2;
- repeated string outputs = 3;
-  map<string, AttrValue> attrs = 4;
-};
-```
-
-## OpProto Register
-
-Every Op has its own `OpProto`. For convenience of use, we need to register them and record all their messages. For each `Op` class, we define a corresponding `OpMaker` class, in whose constructor we implement the `OpProto`'s building process. `OpMaker`'s constructor will be invoked by another function, `OpRegistry::RegisterOp()`.
-
-```cpp
-class OpProtoMaker {
-public:
- OpProtoMaker(OpProto* proto): proto_(proto) {}
-protected:
- OpProto* proto_;
- void AddInput(const std::string& name, const std::string& desc) {...}
- void AddAttr(const std::string& name, const std::string& desc, TypeId type) {...}
- void AddComment(const std::string& comment) { ... }
-};
-
-class OpRegistry {
-public:
- using OpCreator = std::function<OperatorBase* (const OpDesc& desc)>;
-
- template <typename OpType, typename OpMaker>
- static void RegisterOp(const std::string& name) {
- gCreators_[name] = [](const OpDesc& desc) {
- return new OpType(desc);
- };
- OpProto& opProto = gProtos_[name];
- OpMaker()(&opProto);
- }
-
- static map<std::string, OpCreator> gCreators_;
- static map<std::string, OpProto> gProtos_;
-};
-
-template <typename OpType, typename OpMaker>
-class OpRegister {
- public:
- OpRegister(std::string type) {
- OpRegistry::RegisterOp<OpType, OpMaker>(type);
- }
-};
-
-#define REGISTER_OP(op_class, op_maker_class, type_name)       \
- class op_class##Register {                                    \
-  private:                                                     \
-   const static OpRegister<op_class, op_maker_class> reg;      \
- };                                                            \
- const OpRegister<op_class, op_maker_class> op_class##Register::reg(#type_name);
-
-class CosineOp {
-// ...
-};
-
-struct CosineOpProtoMaker : public OpProtoMaker {
- CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) {
- AddInput("input", "input of cosine op");
- AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0);
- AddType("cos");
- AddComment("This is cos op");
- }
-};
-
-REGISTER_OP(CosineOp, CosineOpProtoMaker, cos);
-```
-
-In `REGISTER_OP(CosineOp, CosineOpProtoMaker, cos)`, we register not only `CosineOp` but also `CosineOpProto`. As fields of `CosineOpProto`, the default value and value range of `scale` are also registered here.
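-
-Once registered, the Python function generated by `create_python_ops_creation_functions()` can be called directly. A minimal usage sketch, assuming the generated `cos` function is in scope and that an output slot named `output` exists (the output name is an assumption, since the maker above does not declare one):
-
-```python
-# Hedged sketch of calling the generated API; keyword arguments only.
-op1 = cos(input="X", output="Y")             # "scale" falls back to its registered default 1.0
-op2 = cos(input="X", output="Y", scale=2.0)  # override the default within the registered range
-```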
-
-## Python API
-
-Python APIs are divided into two types, high-level API and low-level API.
-
-### High-Level API
-
-High-level API is called by users directly, so it should keep its style consistent with existing V2 APIs.
-
-Here is a sample showing how to define an fc layer:
-
-```python
-hd = fc_layer(input=data, size=56, with_bias=True, activation="sigmoid");
-```
-
-`hd` is the output of `fc_layer` and it's a `variable`. It can be further sent into other layers as input.
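-
-For example (illustrative only), the returned variable can be fed directly into another `fc_layer` call:
-
-```python
-# `hd` from the sample above is itself a variable, so it can be chained:
-hd2 = fc_layer(input=hd, size=28, with_bias=True, activation="sigmoid")
-```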
-
-The definition of `fc_layer()`:
-
-```python
-def fc_layer(input, size, with_bias, activation):
-    attr_map = {"size": size}
-    check_attrs(attr_map)
-    w = make_variable('w')
-    if with_bias:
-        b = make_variable('b')
-    else:
-        b = None
-    fc_output = make_variable('fc_output')
-    fc_op(input, w, b, fc_output, attr_map)
-    act_output = make_variable('sigmoid_output')
-    if activation == "sigmoid":
-        sigmoid_op(fc_output, act_output)
-    else:
-        # other activations ...
-        pass
-    return act_output
-```
-
-### Low-Level API
-
-In the above sample, `fc_op` and `sigmoid_op` are low-level APIs. They build an `OpDesc` and invoke the corresponding C++ code.
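-
-A minimal sketch of what such a low-level wrapper could look like, assuming the creation functions generated earlier and the `make_variable` helper from the sample above (the exact argument and slot names are assumptions, not the confirmed API):
-
-```python
-def fc_op(input, w, b, output, attr_map):
-    # Build the OpDesc-style arguments and hand them to the registered C++ Op.
-    # `fc` stands for the creation function generated from the fc OpProto.
-    kwargs = {"input": input, "W": w, "Out": output}
-    if b is not None:
-        kwargs["b"] = b
-    kwargs.update(attr_map)  # e.g. {"size": 56}
-    return fc(**kwargs)      # fills an OpDesc and calls OpRegistry::CreateOp
-```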
-
-*TODO*
diff --git a/doc/fluid/design/others/test.dot b/doc/fluid/design/others/test.dot
deleted file mode 100644
index 62c69b8fc8010a26a54a6ee8ef1488aad94d747a..0000000000000000000000000000000000000000
--- a/doc/fluid/design/others/test.dot
+++ /dev/null
@@ -1,35 +0,0 @@
-
-digraph Test {
- z -> generator -> G_img;
- G_img -> discriminator -> D_f -> d_loss_f;
- label0 -> d_loss_f -> d_loss;
-
- img -> discriminator -> D_t -> d_loss_t;
- label1 -> d_loss_t -> d_loss;
-
- d_loss -> d_loss_t[color=red, style=dashed];
- d_loss -> d_loss_f[color=red, style=dashed];
- d_loss_t -> D_t[color=red, style=dashed];
- d_loss_f -> D_f[color=red, style=dashed];
- D_t -> discriminator[color=red, style=dashed];
- D_f -> discriminator[color=red, style=dashed];
-
- D_f -> g_loss;
- label2 -> g_loss;
-
- g_loss -> D_f[color=green, style=dashed];
- D_f -> discriminator[color=green, style=dashed];
- discriminator -> G_img[color=green, style=dashed];
- G_img -> generator[color=green, style=dashed];
-
- discriminator [color=red, shape=box];
- generator [color=green, shape=box];
- z [shape=diamond];
- img [shape=diamond];
- label0 [shape=diamond];
- label1 [shape=diamond];
- label2 [shape=diamond];
-
- d_loss [color=red];
- g_loss [color=green];
-}
diff --git a/doc/fluid/design/others/test.dot.png b/doc/fluid/design/others/test.dot.png
deleted file mode 100644
index 4e121a40b9f7b2232d7cdda315bad15926446f55..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/others/test.dot.png and /dev/null differ
diff --git a/doc/fluid/design/quantization/fixed_point_quantization.md b/doc/fluid/design/quantization/fixed_point_quantization.md
deleted file mode 100644
index 085352fc5614d693e63a2f7241e868a9649456af..0000000000000000000000000000000000000000
--- a/doc/fluid/design/quantization/fixed_point_quantization.md
+++ /dev/null
@@ -1,110 +0,0 @@
-Fixed-point quantization uses lower bits, for example 2-bit, 3-bit or 8-bit fixed point, to represent weights and activations, which are usually stored in 32-bit single-precision floating point. The fixed-point representation has advantages in reducing memory bandwidth, power consumption, computational resources and model storage requirements. It is especially important for inference in embedded-device deployment.
-
-According to some experiments, directly quantizing a model trained in floating point works well for large models, such as VGG, which have many parameters, but the accuracy drops a lot for small models. In order to improve the tradeoff between accuracy and latency, many quantized training approaches have been proposed.
-
-This document describes the design of a quantized training framework on Fluid. The first part introduces how to quantize, the second part describes the quantized training framework, and the last part illustrates how to calculate the quantization scale.
-
-
-### How to quantize
-
-There are many ways to quantize the float value to fixed-point value. For example:
-
-$$ r = min(max(x, a), b)$$
-$$ s = \frac{b - a}{n - 1} $$
-$$ q = \left \lfloor \frac{r - a}{s} \right \rceil $$
-
-where, $x$ is the float value to be quantized, $[a, b]$ is the quantization range, $a$ is the minimum value and $b$ is the maximal value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer. If the quantization level is $k$, $n$ is $2^k$, for example, $k$ is 8 and $n$ is 256. $q$ is the quantized integer.
-
-
-The quantization we applied is parameterized by the number of quantization levels and maximum absolute value:
-
-$$ M = max(abs(x)) $$
-$$ q = \left \lfloor \frac{x}{M} * (n - 1) \right \rceil $$
-
-where, $x$ is the float value to be quantized, $M$ is maximum absolute value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer. For 8 bit quantization, $n=2^{8}=256$. $q$ is the quantized integer.
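-
-A small NumPy sketch of this *max-abs* quantization (illustrative only; the helper names are not part of Fluid):
-
-```python
-import numpy as np
-
-def quantize_max_abs(x, k=8):
-    """Quantize a float array to k-bit integers using the max-abs scale."""
-    n = 2 ** k
-    m = np.max(np.abs(x))                             # M in the formula above
-    q = np.round(x / m * (n - 1)).astype(np.int32)    # q = round(x / M * (n - 1))
-    return q, m
-
-def dequantize_max_abs(q, m, k=8):
-    n = 2 ** k
-    return q.astype(np.float32) / (n - 1) * m
-```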
-
-
-Whether *min-max* quantization or *max-abs* quantization is used, both can be represented as:
-
-$q = scale * r + b$
-
-We call *min-max* and *max-abs* the quantization arguments; they are also called the quantization scale or quantization range.
-
-
-How to calculate the quantization scale (or maximum absolute value) for inference will be described in the last part.
-
-
-### Training Framework
-
-#### Forward pass
-
-The forward pass uses simulated quantization; the training framework is shown in Figure 1 below.
-
-
-
-Figure 1. Forward in training with simulated quantization.
-
-
-- First, both the input and the weight are quantized to 8-bit integers.
-- Second, do the multiplication (or convolution) operation with the integers.
-- Third, dequantize the multiplication (or convolution) results to 32-bit floating point.
-- Finally, do the bias addition in 32-bit floating point. Here, the bias is not quantized.
-
-For general matrix multiplication (GEMM), quantize for $X$ and $W$:
-
-$$ X_q = \left \lfloor \frac{X}{X_m} * (n - 1) \right \rceil $$
-$$ W_q = \left \lfloor \frac{W}{W_m} * (n - 1) \right \rceil $$
-
-Do GEMM:
-
-$$ Y = X_q * W_q $$
-
-
-Dequantize $Y$:
-
-$$
-\begin{align}
-Y_{dq} &=\frac{Y}{(n - 1) * (n - 1)} * X_m * W_m \\\
- &=\frac{X_q * W_q}{(n - 1) * (n - 1)} * X_m * W_m \\\
- &=(\frac{X_q}{n - 1} * X_m) * (\frac{W_q}{n - 1} * W_m)
-\end{align}
-$$
-
-From these formulas, the dequantization can also be moved before the GEMM: first dequantize $X_q$ and $W_q$, then do the GEMM. The forward workflow in training is equivalent to the following framework.
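-
-The equivalence can be checked numerically with a small NumPy sketch (illustrative only):
-
-```python
-import numpy as np
-
-n = 256                                      # 8-bit quantization
-X = np.random.randn(4, 8).astype(np.float32)
-W = np.random.randn(8, 3).astype(np.float32)
-X_m, W_m = np.max(np.abs(X)), np.max(np.abs(W))
-X_q = np.round(X / X_m * (n - 1))
-W_q = np.round(W / W_m * (n - 1))
-
-# Dequantize after the GEMM ...
-Y_dq1 = (X_q @ W_q) / ((n - 1) * (n - 1)) * X_m * W_m
-# ... or dequantize before the GEMM; both give the same result.
-Y_dq2 = (X_q / (n - 1) * X_m) @ (W_q / (n - 1) * W_m)
-assert np.allclose(Y_dq1, Y_dq2, atol=1e-4)
-```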
-
-
-
-Figure 2. Equivalent forward in training with simulated quantization.
-
-
-We use this equivalent workflow in training. In our design, a quantization transpiler inserts the quantization operator and the de-quantization operator into the Fluid `ProgramDesc`. Since the outputs of the quantization and de-quantization operators are still in floating point, they are called fake quantization and de-quantization operators, and the training framework is called simulated quantization.
-
-#### Backward pass
-
-See Figure 3. The gradients are calculated from the dequantized weights and activations. All inputs and outputs are 32-bit floating point. In the weight-updating process, the gradients are added to the original weights, not to the quantized or dequantized weights.
-
-
-
-Figure 3. Backward and weight updating in training with simulated quantization.
-
-
-So the quantization transpiler will change some inputs of the corresponding backward operators.
-
-### How to calculate quantization scale
-
-There are two strategies to calculate the quantization scale; we call them the dynamic and the static strategy. The dynamic strategy recalculates the quantization scale in each iteration, while the static strategy keeps the same quantization scale across different inputs.
-
-For weights, we apply the dynamic strategy in training, that is to say, the quantization scale is recalculated in each iteration until training is finished.
-
-For activations, the quantization scales are estimated during training, then used in inference. There are several different ways to estimate them:
-
-
-1. Calculate the mean of the maximum absolute values over a window.
-2. Calculate the max of the maximum absolute values over a window.
-3. Calculate the running mean of the maximum absolute value, as follows:
-
- $$ V_t = (1 - k) * V + k * V_{t-1} $$
-
- where, $V$ is the maximum absolute value of the current batch, $V_t$ is the running mean value, and $k$ is a factor, such as 0.9.
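-
-A minimal sketch of the running-mean strategy (the helper below is illustrative, not the Fluid implementation):
-
-```python
-import numpy as np
-
-def update_running_scale(batch, running_scale, k=0.9):
-    """Update the activation scale with a running mean of the per-batch max-abs value."""
-    v = np.max(np.abs(batch))                # maximum absolute value V of the current batch
-    if running_scale is None:                # first batch: initialize with V
-        return v
-    return (1 - k) * v + k * running_scale   # V_t = (1 - k) * V + k * V_{t-1}
-```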
diff --git a/doc/fluid/design/quantization/quantization_backward_and_optimization.png b/doc/fluid/design/quantization/quantization_backward_and_optimization.png
deleted file mode 100644
index 84f8235ab87cb631992b691f8e05b9c0b6c93da2..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/quantization/quantization_backward_and_optimization.png and /dev/null differ
diff --git a/doc/fluid/design/quantization/quantization_equivalent_forward.png b/doc/fluid/design/quantization/quantization_equivalent_forward.png
deleted file mode 100644
index df49c864537c047c785da12d24893e54ce0a5341..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/quantization/quantization_equivalent_forward.png and /dev/null differ
diff --git a/doc/fluid/design/quantization/quantization_forward.png b/doc/fluid/design/quantization/quantization_forward.png
deleted file mode 100644
index 0913f61621bb6533bcb10bd1d18120ccaaa96cff..0000000000000000000000000000000000000000
Binary files a/doc/fluid/design/quantization/quantization_forward.png and /dev/null differ
diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md
deleted file mode 100644
index 7d39b8de1e6dc502ffea5f7882bd6a42b1ed6549..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/api_doc_std_cn.md
+++ /dev/null
@@ -1,221 +0,0 @@
-# API注释撰写标准
-
-- [API注释撰写标准](#api)
- - [API注释模块](#api)
- - [格式及示例](#)
- - [完整示例](#)
-
-
-## API注释模块
-
-API文档须包含以下几个模块(排列顺序为文档撰写顺序):
-
-- Python API Definition
-
- API的代码定义。
-
-- Function Description
-
- API的功能描述。描述该API的含义、作用或对输入所做的操作,及参考文献和对应链接(如果有),必要时给出公式,并解释公式中关键变量的含义。
-
-- Args Description
-
- API参数介绍。按代码定义中的参数顺序逐个介绍,介绍内容包含数据类型、默认值(如果有)、含义等。
-
-- Returns
-
- API返回值介绍。介绍返回值含义,必要时给出对应的形状。若返回值为包含多个参数的tuple,则按顺序逐个介绍各参数。
-
-- Raises(如果有)
-
- 可能抛出的异常或错误及可能的产生原因,当可能抛出多种异常或错误时应分条列出。
-
-- Note(如果有)
-
- 注意事项。当有多条注意事项时,应分条列出。
-
-- Examples
-
- API的使用示例。
-
-
-## 格式及示例
-
-API文档须使用reStructuredText格式撰写,该格式详情请参考[链接](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html)。API文档各模块的内容格式及示例如下(以下以fc为例进行说明):
-
-- Python API Definition
-
- - 格式:
-
- [Python API Definition]
-
- - 示例
-
- ```
- fc(input,
- size,
- num_flatten_dims=1,
- param_attr=None,
- bias_attr=None,
- act=None,
- name=None,
- main_program=None,
- startup_program=None)
- ```
-
-- Function Description
-
- - 格式
-
- 本模块应包含以下内容(排列顺序为文档撰写顺序):
-
- [Function Description]
-
- [Formula]
-
- [Symbols' Descriptions if necessary]
-
- [References if necessary]
-
- - 示例
-
- [Function Description]
-
- ```
- **Fully Connected Layer**
-
- The fully connected layer can take multiple tensors as its inputs. It
- creates a variable called weights for each input tensor, which represents
- a fully connected weight matrix from each input unit to each output unit.
- The fully connected layer multiplies each input tensor with its corresponding
- weight to produce an output Tensor. If multiple input tensors are given,
- the results of multiple multiplications will be summed up. If bias_attr is
- not None, a bias variable will be created and added to the output. Finally,
- if activation is not None, it will be applied to the output as well.
- ```
-
- [Formula]
-
- ```
- This process can be formulated as follows:
-
- .. math::
-
- Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
- ```
-
- [Symbols' Descriptions if necessary]
-
- ```
- In the above equation:
-
- * :math:`N`: Number of the input.
- * :math:`X_i`: The input tensor.
- * :math:`W`: The weights created by this layer.
- * :math:`b`: The bias parameter created by this layer (if needed).
- * :math:`Act`: The activation function.
- * :math:`Out`: The output tensor.
- ```
-
- [References if necessary]
-
- 因fc没有必要列出的参考文献,故该内容省略。其他情况下需明确给出对应的参考文献和对应连接,以 layer_norm 为例:
-
- ```
- Refer to `Layer Normalization `_ for more details.
- ```
-
-
-- Args Description
-
- - 格式
-
- \[Arg's Name\][(Data Type, Default Value)][Description]
-
- - 示例
-
- fc的部分参数注释如下:
-
- ```
- Args:
- input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
- the input tensor(s) is at least 2.
- param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
- parameters/weights of this layer.
- name (str, default None): The name of this layer.
- ```
-
-- Returns
-
- - 格式
-
- [Name][Shape]
-
- - 示例
-
- ```
- Returns:
- A tensor variable storing the transformation result.
- ```
-
- 当返回值为包含多个参数的tuple时,应按顺序逐个介绍各参数,以dynamic_lstm为例:
-
- ```
- Returns:
- A tuple containing:
- The hidden state of LSTM whose shape is (T X D).
- The cell state of LSTM whose shape is (T X D).
- ```
-
-- Raises
-
- - 格式
-
- [Exception Type][Condition]
-
- - 示例
-
- ```
- Raises:
- ValueError: If the rank of the input is less than 2.
- ```
-
-- Note
-
- - 格式
-
- [Note]
-
- - 示例
-
- fc没有注意事项,故该模块省略不写。如有注意事项应明确给出,当有多条注意事项,须分条列出,以scaled\_dot\_product\_attention为例:
-
- ```
- Note:
- 1. When num_heads > 1, three linear projections are learned respectively
- to map input queries, keys and values into queries', keys' and values'.
- queries', keys' and values' have the same shapes with queries, keys
- and values.
- 2. When num_heads == 1, scaled_dot_product_attention has no learnable
- parameters.
- ```
-
-- Examples
-
- - 格式
-
- \[Python Code Snippet]
-
- - 示例
-
- ```
- Examples:
- .. code-block:: python
-
- data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
- fc = fluid.layers.fc(input=data, size=1000, act="tanh")
- ```
-
-## 完整示例
-
-fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。
diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md
deleted file mode 100644
index f175b219750d1c765a6a111c2ec3aa732fa46175..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/api_doc_std_en.md
+++ /dev/null
@@ -1,227 +0,0 @@
-# API Doc Standard
-
-- [API Doc Standard](#api-doc-standard)
- - [API Doc Structure](#api-doc-structure)
- - [Format and Examples](#format-and-examples)
- - [Complete Example](#complete-example)
-
-
-## API Doc Structure
-
-API Doc should contain the following parts (please write them in order):
-
-- Python API Definition
-
- The definition of API
-
-- Function Description
-
- Description of the API's function.
- The description includes: meaning, purpose and the operation performed on the input, reference and corresponding link (if any), formula (if necessary) and explanations of the key variables in the formula.
-
-- Args Description
-
- Description of API parameters.
- Introduce parameters one by one according to the order in API definition.
- The introduction includes: data type, default value(if any), meaning, etc.
-
-- Returns
-
- Introduction of API returned value.
- Introduce the meaning of the returned value, and provide the corresponding format if necessary.
- If returned value is a tuple containing multiple parameters, then introduce parameters one by one in order.
-
-- Raises(if any)
-
- Exceptions or errors that may occur, and their possible causes. If more than one exception or error may occur, they should be listed in order.
-
-- Note(if any)
-
- Matters needing attention. If there is more than one, they should be listed in order.
-
-- Examples
-
- Examples of how to use API.
-
-
-## Format and Examples
-
-API documentation must follow the reStructuredText format; please refer to [here](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html).
-The format and examples of each part of the API documentation are as follows (taking fc as an example):
-
-- Python API Definition
-
- - Format
-
- [Python API Definition]
-
- - Example
-
- ```
- fc(input,
- size,
- num_flatten_dims=1,
- param_attr=None,
- bias_attr=None,
- act=None,
- name=None,
- main_program=None,
- startup_program=None)
- ```
-
-- Function Description
-
- - Format
-
- This part contains (please write them in order):
-
- [Function Description]
-
- [Formula]
-
- [Symbols' Descriptions if necessary]
-
- [References if necessary]
-
- - Example
-
- [Function Description]
-
- ```
- **Fully Connected Layer**
-
- The fully connected layer can take multiple tensors as its inputs. It
- creates a variable called weights for each input tensor, which represents
- a fully connected weight matrix from each input unit to each output unit.
- The fully connected layer multiplies each input tensor with its corresponding
- weight to produce an output Tensor. If multiple input tensors are given,
- the results of multiple multiplications will be summed up. If bias_attr is
- not None, a bias variable will be created and added to the output. Finally,
- if activation is not None, it will be applied to the output as well.
- ```
-
- [Formula]
-
- ```
- This process can be formulated as follows:
-
- .. math::
-
- Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
- ```
-
- [Symbols' Descriptions if necessary]
-
- ```
- In the above equation:
-
- * :math:`N`: Number of the input.
- * :math:`X_i`: The input tensor.
- * :math:`W`: The weights created by this layer.
- * :math:`b`: The bias parameter created by this layer (if needed).
- * :math:`Act`: The activation function.
- * :math:`Out`: The output tensor.
- ```
-
- [References if necessary]
-
- Since fc needs no reference, we omit it here. Under other circumstances, please provide an explicit reference and link; take layer_norm for example:
-
- ```
- Refer to `Layer Normalization `_ for more details.
- ```
-
-
-- Args Description
-
- - Format
-
- \[Arg's Name\][(Data Type, Default Value)][Description]
-
- - Example
-
- Part of the fc parameters are documented as follows:
-
- ```
- Args:
- input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
- the input tensor(s) is at least 2.
- param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
- parameters/weights of this layer.
- name (str, default None): The name of this layer.
- ```
-
-- Returns
-
- - Format
-
- [Name][Shape]
-
- - Example
-
- ```
- Returns:
- A tensor variable storing the transformation result.
- ```
-
- When the returned value is a tuple containing multiple parameters, please introduce every parameter in order; take dynamic_lstm for example:
-
- ```
- Returns:
- A tuple containing:
- The hidden state of LSTM whose shape is (T X D).
- The cell state of LSTM whose shape is (T X D).
- ```
-
-- Raises
-
- - Format
-
- [Exception Type][Condition]
-
- - Example
-
- ```
- Raises:
- ValueError: If the rank of the input is less than 2.
- ```
-
-- Note
-
- - Format
-
- [Note]
-
- - Example
-
- There is no Note for fc, so we omit this part. If there are any notes, please state them clearly; if there is more than one, list them in order. Take scaled\_dot\_product\_attention for example:
-
- ```
- Note:
- 1. When num_heads > 1, three linear projections are learned respectively
- to map input queries, keys and values into queries', keys' and values'.
- queries', keys' and values' have the same shapes with queries, keys
- and values.
- 2. When num_heads == 1, scaled_dot_product_attention has no learnable
- parameters.
- ```
-
-- Examples
-
- - Format
-
- \[Python Code Snippet]
-
- - Example
-
- ```
- Examples:
- .. code-block:: python
-
- data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
- fc = fluid.layers.fc(input=data, size=1000, act="tanh")
- ```
-
-## Complete Example
-
-For a complete example of fc, please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py).
diff --git a/doc/fluid/dev/ci_build_whl.png b/doc/fluid/dev/ci_build_whl.png
deleted file mode 100644
index 232762b82a9ae3e979a1f38a7beb715c87438f40..0000000000000000000000000000000000000000
Binary files a/doc/fluid/dev/ci_build_whl.png and /dev/null differ
diff --git a/doc/fluid/dev/contribute_to_paddle_cn.md b/doc/fluid/dev/contribute_to_paddle_cn.md
deleted file mode 120000
index 955216ca62e71b4d3666e1662aa86c9495d2e7d6..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/contribute_to_paddle_cn.md
+++ /dev/null
@@ -1 +0,0 @@
-../../v2/dev/contribute_to_paddle_cn.md
\ No newline at end of file
diff --git a/doc/fluid/dev/contribute_to_paddle_en.md b/doc/fluid/dev/contribute_to_paddle_en.md
deleted file mode 120000
index f9fc68c37e17a8a365b0d7fae86c16b0d094631f..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/contribute_to_paddle_en.md
+++ /dev/null
@@ -1 +0,0 @@
-../../v2/dev/contribute_to_paddle_en.md
\ No newline at end of file
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
deleted file mode 100644
index 37e608160db0ad5a92297987937bbbfa8f842ea8..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/index_cn.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-开发标准
-------------
-
-.. toctree::
- :maxdepth: 1
-
- contribute_to_paddle_cn.md
- write_docs_cn.md
- api_doc_std_cn.md
- new_op_cn.md
- new_op_kernel.md
- use_eigen_cn.md
- name_convention.md
- support_new_device.md
- releasing_process_cn.md
- op_markdown_format.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
deleted file mode 100644
index d7f83035010f13c30514673ecbee301f194dc175..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/index_en.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-Development
-------------
-
-.. toctree::
- :maxdepth: 1
-
- contribute_to_paddle_en.md
- write_docs_en.md
- api_doc_std_en.md
- new_op_en.md
- new_op_kernel.md
- use_eigen_en.md
- name_convention.md
- support_new_device.md
- releasing_process_en.md
- op_markdown_format.md
diff --git a/doc/fluid/dev/name_convention.md b/doc/fluid/dev/name_convention.md
deleted file mode 100644
index 6b4244d0f506c8cd6c08739141eabad27c581ca7..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/name_convention.md
+++ /dev/null
@@ -1,65 +0,0 @@
-# Operator's Parameter Name Convention
-
-To make the operator documentation clearer, we recommend that operator names obey the following conventions.
-
-## OpProtoMaker names
-
-When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L170) (TODO: OpProtoMaker Doc) needs to be defined. All the Inputs/Outputs and Attributes will be written into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L61), and will be used by the client language to create the operator.
-
-- Input/Output.
-  - Input/Output names follow **CamelCase**, e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Inputs/Outputs are much like Variables, so prefer meaningful English words.
-  - If an operator's Inputs/Outputs are tensors in math that do not match any meaningful words, input names should start from `X`, e.g. `X`, `Y`, and output names should start from `Out`, e.g. `Out`. This rule intends to unify operators that have few inputs/outputs.
-
-- Attribute.
-  - Attribute names follow **snake_case**, e.g. `x`, `y`, `axis`, `rowwise_matrix`. Again, prefer meaningful English words.
-
-- Comments.
-  - Input/Output/Attr comments follow the format of **(type, default value) usage**, stating which type the item can be and how it will be used in the operator, e.g. the attribute `"gamma"` in Accumulator: `(float, default 1.0) Accumulation multiplier`.
-  - Operator comments use the format `R"DOC(your comment here)DOC"`. You should explain the input/output of the operator first. If there is a math calculation in this operator, you should write the equation in the comment, e.g. `Out = X + Y`.
-
-- Order.
- - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice.
-
-## Best Practice
-
-Here we give some examples to show how these rules will be used.
-
-- An operator with one input and one output, e.g. `relu`: inputs: `X`, outputs: `Out`.
-
-- An operator with two inputs and one output, e.g. `rowwise_add`: inputs: `X`, `Y`, outputs: `Out`.
-
-- An operator with an attribute, e.g. `cosine`: inputs: `X`, attributes: `axis`, outputs: `Out`.
-
- We give a full example of Accumulator Operator.
-
-```c++
-class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker {
-public:
- AccumulateOpMaker(OpProto *proto,
- OpAttrChecker *op_checker)
- : OpProtoAndCheckerMaker(proto, op_checker) {
- AddInput("X",
-          "(Tensor) The input tensor that has to be accumulated to the output tensor. "
-          "If the output size is not the same as input size, "
-          "the output tensor is first reshaped and initialized to zero, and only then, "
-          "accumulation is done.");
- AddOutput("Out", "(Tensor) Accumulated output tensor");
- AddAttr<float>("gamma", "(float, default 1.0) Accumulation multiplier").SetDefault(1.0f);
- AddComment(R"DOC(
-Accumulate Operator.
-
-This operator accumulates the input tensor to the output tensor. If the
-output tensor already has the right size, we add to it; otherwise, we first
-initialize the output tensor to all zeros, and then do accumulation. Any
-further calls to the operator, given that no one else fiddles with the output
-in the interim, will do simple accumulations.
-
-Accumulation is done as follows:
-
-Out = 1*X + gamma*Out
-
-where X is the input tensor, Out is the output tensor and gamma is the multiplier
-argument.
-
-)DOC");
- }
-};
-```
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
deleted file mode 100644
index 587d819f79fcf82549826359fbf04ad3af404446..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/new_op_cn.md
+++ /dev/null
@@ -1,336 +0,0 @@
-# 如何写新的Operator
-
- - [概念简介](#概念简介)
- - [实现C++类](#实现c类)
- - [定义ProtoMaker类](#定义protomaker类)
- - [定义Operator类](#定义operator类)
- - [定义OpKernel类](#定义opkernel类)
- - [注册Operator](#注册operator)
- - [编译](#编译)
- - [绑定Python](#绑定python)
- - [实现单元测试](#实现单元测试)
- - [前向Operator单测](#前向operator单测)
- - [反向Operator单测](#反向operator单测)
- - [编译和执行](#编译和执行)
- - [注意事项](#注意事项)
-
-
-## 概念简介
-
-简单介绍需要用到基类,详细介绍请参考设计文档。
-
-- `framework::OperatorBase`: Operator(简写,Op)基类。
-- `framework::OpKernel`: Op计算函数的基类,称作Kernel。
-- `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。
-- `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
-
-依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下:
-
-
-
-
-内容 | 定义位置
---- | ---
-OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake
-Op定义 | `.cc`文件
-Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。
-注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中
-
-
-
-
-
-实现新的op都添加至目录[paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
-
-
-下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
-
-
-## 实现C++类
-
-
-### 定义ProtoMaker类
-
-矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。
-
-首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释:
-
-```cpp
-class MulOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
- MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
- : OpProtoAndCheckerMaker(proto, op_checker) {
- AddInput("X", "(Tensor), 2D tensor of size (M x K)");
- AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
- AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
- AddComment(R"DOC(
-Two Element Mul Operator.
-The equation is: Out = X * Y
-)DOC");
- }
-};
-```
-
-[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)继承自`framework::OpProtoAndCheckerMaker`,构造函数含有2个参数:
-
- - `framework::OpProto` : 前者存储Op的输入输出和参数属性,将用于Python API接口的生成。
- - `framework::OpAttrChecker` :后者用于检查参数属性的合法性。
-
-构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。
-
-上面的代码在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md)。
-
-
-再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55)为例:
-
-```cpp
-template <typename AttrType>
-class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
- ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
- : OpProtoAndCheckerMaker(proto, op_checker) {
- AddInput("X", "(Tensor) Input tensor of scale operator.");
- AddOutput("Out", "(Tensor) Output tensor of scale operator.");
- AddComment(R"DOC(
-Scale operator
-$$Out = scale*X$$
-)DOC");
- AddAttr<AttrType>("scale",
- "(float, default 1.0)"
- "The scaling factor of the scale operator.")
- .SetDefault(1.0);
- }
-};
-```
-
-这个例子有`AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。
-
-
-### 定义Operator类
-
-下面的点实现了MulOp的定义:
-
-```cpp
-class MulOp : public framework::OperatorWithKernel {
- public:
- using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
- void InferShape(const framework::InferShapeContext &ctx) const override {
- auto dim0 = ctx.Input<Tensor>("X")->dims();
- auto dim1 = ctx.Input<Tensor>("Y")->dims();
- PADDLE_ENFORCE_EQ(dim0.size(), 2,
- "input X(%s) should be a tensor with 2 dims, a matrix",
- ctx.op_.Input("X"));
- PADDLE_ENFORCE_EQ(dim1.size(), 2,
- "input Y(%s) should be a tensor with 2 dims, a matrix",
- ctx.op_.Input("Y"));
- PADDLE_ENFORCE_EQ(
- dim0[1], dim1[0],
- "First matrix's width must be equal with second matrix's height.");
- ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
- }
-};
-```
-
-[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员:
-
-```cpp
-using framework::OperatorWithKernel::OperatorWithKernel;
-```
-
-这句表示使用基类`OperatorWithKernel`的构造函数,也可写成:
-
-```cpp
-MulOp(const std::string &type, const framework::VariableNameMap &inputs,
- const framework::VariableNameMap &outputs,
- const framework::AttributeMap &attrs)
- : OperatorWithKernel(type, inputs, outputs, attrs) {}
-```
-
-还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是:
-
- - 1). 做检查, 尽早报错:检查输入数据维度、类型等是否合法。
- - 2). 设置输出Tensor的形状。
-
-通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`中
-
-### 定义OpKernel类
-
-`MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数:
-
-- `typename DeviceContext`: 表示设备类型,不同设备(CPU、CUDA)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
-
-- `typename T` : 表示数据类型,如`float`, `double`等。
-
-需要为`MulKernel`类重写`Compute`接口。
-- `Compute`接受一个输入参数:`const framework::ExecutionContext& context`。
-- 与`InferShapeContext`相比,`ExecutionContext`增加了设备类型,同样可获取到输入输出和属性参数。
-- `Compute`函数里实现`OpKernel`的具体计算逻辑。
-
-下面是 `MulKernel` `Compute`的实现:
-
- ```cpp
- template <typename DeviceContext, typename T>
- class MulKernel : public framework::OpKernel<T> {
- public:
- void Compute(const framework::ExecutionContext& context) const override {
- auto* X = context.Input<Tensor>("X");
- auto* Y = context.Input<Tensor>("Y");
- auto* Z = context.Output<Tensor>("Out");
- Z->mutable_data<T>(context.GetPlace());
- auto& device_context = context.template device_context<DeviceContext>();
- math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
- }
- };
- ```
-
-需要注意:**不同设备(CPU、CUDA)共享一个Op定义,是否共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。**
-
-`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
-
-为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。
-
-到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。
-反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。
-
-### 注册Operator
-
-- 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。
-
- ```cpp
- namespace ops = paddle::operators;
- REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
-                   paddle::framework::DefaultGradOpDescMaker<true>)
- REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
- REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
- REGISTER_OP_CPU_KERNEL(mul_grad,
-                        ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
- ```
-
- 在上面的代码中:
-
- - `REGISTER_OPERATOR` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad`。
- - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulGradKernel`类。
-
-
-- 在 `.cu`文件中注册CUDA Kernel。
- - 请注意,如果CUDA Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下:
-
- ```cpp
- // if use Eigen unsupported module before include head files
- #define EIGEN_USE_GPU
-
- namespace ops = paddle::operators;
- REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
- REGISTER_OP_CUDA_KERNEL(mul_grad,
-                         ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
- ```
-
-### 编译
-
-运行下面命令可以进行编译:
-
-```
-make mul_op
-```
-
-## 绑定Python
-
-系统会对新增的op自动绑定Python,并链接到生成的lib库中。
-
-## 实现单元测试
-
-单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py)。
-
-### 前向Operator单测
-
-Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator,需要:
-
-1. 在`setUp`函数定义输入、输出,以及相关的属性参数。
-2. 生成随机的输入数据。
-3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。
-4. 反向计算已经自动集成进测试框架,直接调用相应接口即可。
-
-
- ```python
- import unittest
- import numpy as np
- from op_test import OpTest
-
-
- class TestMulOp(OpTest):
- def setUp(self):
- self.op_type = "mul"
- self.inputs = {
- 'X': np.random.random((32, 84)).astype("float32"),
- 'Y': np.random.random((84, 100)).astype("float32")
- }
- self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
-
- def test_check_output(self):
- self.check_output()
-
- def test_check_grad_normal(self):
- self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-
- def test_check_grad_ingore_x(self):
- self.check_grad(
- ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-
- def test_check_grad_ingore_y(self):
- self.check_grad(
- ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
- ```
-
-上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释:
-
-- `self.op_type = "mul" ` : 定义类型,与operator注册时注册的类型一致。
-- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。
-- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。
-
-### 反向operator单测
-
-而反向测试中:
-- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
- - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
- - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
- - 第三个参数`max_relative_error`:指定检测梯度时能容忍的最大错误值。
-- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
-
-
-### 编译和执行
-
-`python/paddle/fluid/tests/unittests/` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
-
-请注意,**不同于Op的编译测试,运行单元测试测时需要编译整个工程**,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后,执行下面的命令来运行单元测试:
-
-```bash
-make test ARGS="-R test_mul_op -V"
-```
-
-或者:
-
-```bash
-ctest -R test_mul_op
-```
-
-## 注意事项
-
-- 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OPERATOR(B, ...)`等,这将会导致单元测试出错。
-- 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。
-- 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。
diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md
deleted file mode 100644
index f8de271ed4e5e0fb4018478bffd4b525d4319738..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/new_op_en.md
+++ /dev/null
@@ -1,352 +0,0 @@
-# How to write a new operator
-
- - [Background](#background)
- - [Implementing C++ Types](#implementing-c-types)
- - [Defining ProtoMaker](#defining-protomaker)
- - [Defining Operator](#defining-operator)
- - [Defining OpKernel](#defining-opkernel)
- - [Registering Operator and OpKernel](#registering-operator-and-opkernel)
- - [Compilation](#compilation)
- - [Python Binding](#python-binding)
- - [Unit Tests](#unit-tests)
- - [Testing Forward Operators](#testing-forward-operators)
- - [Testing Backward Operators](#testing-backward-operators)
- - [Compiling and Running](#compiling-and-running)
- - [Remarks](#remarks)
-## Background
-
-Here are the base types needed. For details, please refer to the design docs.
-
-- `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API.
-- `framework::OperatorBase`: Operator (Op)base class.
-- `framework::OpKernel`: Base class for Op computation kernel.
-- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation kernels.
-
-
-Operators can be categorized into two groups: operator with kernel(s) and operator without kernel(s). An operator with kernel(s) inherits from `OperatorWithKernel` while the one without kernel(s) inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
-
-
-
-
-
-Information | Where is it defined
---- | ---
-OpProtoMake definition | `.cc` files; Backward Op does not need an OpProtoMake interface.
-Op definition | `.cc` files
-Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu` files.
-Registering the Op | Ops are registered in `.cc` files; for Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
-
-
-
-
-
-New Operator implementations are added to the directory [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable). **The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
-
-
-Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
-
-
-## Implementing C++ Types
-
-
-### Defining ProtoMaker
-
-Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and one output.
-
-First, define `ProtoMaker` to describe the Operator's input, output, and additional comments:
-
-```cpp
-class MulOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
- MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
- : OpProtoAndCheckerMaker(proto, op_checker) {
- AddInput("X", "(Tensor), 2D tensor of size (M x K)");
- AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
- AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
- AddComment(R"DOC(
-Two Element Mul Operator.
-The equation is: Out = X * Y
-)DOC");
- }
-};
-```
-
-[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127) is inherited from `framework::OpProtoAndCheckerMaker`, and its constructor takes 2 parameters:
-
- - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces.
- - `framework::OpAttrChecker` is used to validate variable attributes.
-
-The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`.
-
-The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md).
-
-
-An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55) is implemented as follows:
-
-```cpp
-template <typename AttrType>
-class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
- ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
- : OpProtoAndCheckerMaker(proto, op_checker) {
- AddInput("X", "The input tensor of scale operator.").NotInGradient();
- AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
- AddComment(R"DOC(Scale operator
-The equation is: Out = scale*X
-)DOC");
- AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
- }
-};
-```
-
-Note that `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` adds `scale` as an attribute and sets its default value to 1.0.
-
-
-### Defining Operator
-
-The following code defines the interface for MulOp:
-
-```cpp
-class MulOp : public framework::OperatorWithKernel {
- public:
- using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
- void InferShape(const framework::InferShapeContext &ctx) const override {
- auto dim0 = ctx.Input<Tensor>("X")->dims();
- auto dim1 = ctx.Input<Tensor>("Y")->dims();
- PADDLE_ENFORCE_EQ(dim0.size(), 2,
- "input X(%s) should be a tensor with 2 dims, a matrix",
- ctx.op_.Input("X"));
- PADDLE_ENFORCE_EQ(dim1.size(), 2,
- "input Y(%s) should be a tensor with 2 dims, a matrix",
- ctx.op_.Input("Y"));
- PADDLE_ENFORCE_EQ(
- dim0[1], dim1[0],
- "First matrix's width must be equal with second matrix's height.");
- ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
- }
-};
-```
-
-[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L24) is inherited from `OperatorWithKernel`. Its `public` member
-
-```cpp
-using framework::OperatorWithKernel::OperatorWithKernel;
-```
-
-expresses an operator constructor using base class `OperatorWithKernel`, alternatively written as
-
-```cpp
-MulOp(const std::string &type, const framework::VariableNameMap &inputs,
- const framework::VariableNameMap &outputs,
- const framework::AttributeMap &attrs)
- : OperatorWithKernel(type, inputs, outputs, attrs) {}
-```
-
-The `InferShape` interface needs to be re-written. `InferShape` is a constant method and cannot modify the Op's member variables. Its parameter `const framework::InferShapeContext &ctx` can be used to extract the inputs, outputs, and attributes. It functions to
-
- - 1). validate and error out early: it checks input data dimensions and types.
- - 2). configures the tensor shape in the output.
-
-Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later.
-
-### Defining OpKernel
-
-`MulKernel` inherits `framework::OpKernel`, which includes the following templates:
-
-- `typename DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43).
-
-- `typename T` denotes data type, such as `float` or `double`.
-
-`MulKernel` types need to rewrite the interface for `Compute`.
-
-- `Compute` takes one input parameter: `const framework::ExecutionContext& context`.
-- Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables.
-- `Compute` implements the computation logics of an `OpKernel`.
-
-`MulKernel`'s implementation of `Compute` is as follows:
-
- ```cpp
- template <typename DeviceContext, typename T>
- class MulKernel : public framework::OpKernel<T> {
- public:
- void Compute(const framework::ExecutionContext& context) const override {
- auto* X = context.Input<Tensor>("X");
- auto* Y = context.Input<Tensor>("Y");
- auto* Z = context.Output<Tensor>("Out");
- Z->mutable_data<T>(context.GetPlace());
- auto& device_context = context.template device_context<DeviceContext>();
- math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
- }
- };
- ```
-
-Note that **different devices (CPU, CUDA) share one Op definition; whether or not they share the same `OpKernel` depends on whether the functions called by `Compute` can support both devices.**
-
-`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.cc).
-
-To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_en.md).
-
-
-This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file.
-
-The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
-
-### Registering Operator and OpKernel
-
-- In `.cc` files, register forward and backward operator classes and the CPU kernel.
-
- ```cpp
- namespace ops = paddle::operators;
- REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
-                   paddle::framework::DefaultGradOpDescMaker<true>)
- REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
-
- REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
- REGISTER_OP_CPU_KERNEL(mul_grad,
-                        ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
- ```
-
- In that code block,
-
-  - `REGISTER_OPERATOR` registers the `ops::MulOp` class with type name `mul`, whose `ProtoMaker` is `ops::MulOpMaker`, and registers `ops::MulGradOp` with type name `mul_grad`.
- - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
- - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
-
-
-- Registering CUDA Kernel in `.cu` files
- - Note that if CUDA Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as
-
- ```cpp
- // if use Eigen unsupported module before include head files
- #define EIGEN_USE_GPU
-
- namespace ops = paddle::operators;
- REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
- REGISTER_OP_CUDA_KERNEL(mul_grad,
-                         ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
- ```
-
-### Compilation
-
-Run the following commands to compile.
-
-```
-# maybe you need to rerun cmake
-make mul_op
-```
-
-## Python Binding
-
-The system will automatically bind to Python and link it to a generated library.
-
-## Unit Tests
-
-Unit tests for an operator include
-
-1. comparing a forward operator's implementations on different devices,
-
-2. comparing a backward operator's implementation on different devices, and
-
-3. a gradient check for the backward operator.
-
-Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py).
-
-### Testing Forward Operators
-
-A forward operator unit test inherits from `OpTest`. More specific tests are performed in the `TestMulOp` class. Testing a forward operator requires the following:
-
-1. Defining input, output and relevant attributes in `setUp` method.
-
-2. Generating random input data.
-
-3. Implementing the same computation logic in a Python script.
-
-4. Call check gradient function to check the backward operator.
-
- ```python
- import unittest
- import numpy as np
- from op_test import OpTest
-
-
- class TestMulOp(OpTest):
- def setUp(self):
- self.op_type = "mul"
- self.inputs = {
- 'X': np.random.random((32, 84)).astype("float32"),
- 'Y': np.random.random((84, 100)).astype("float32")
- }
- self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
-
- def test_check_output(self):
- self.check_output()
-
- def test_check_grad_normal(self):
- self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-
- def test_check_grad_ingore_x(self):
- self.check_grad(
- ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-
- def test_check_grad_ingore_y(self):
- self.check_grad(
- ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
- ```
-The expected output computed in the Python script is then compared with the forward operator's own output.
-
-The code above first loads required packages. In addition, we have
-
-- `self.op_type = "mul" ` defines the type, which must be identical to the operator's registered type.
-- `self.inputs` defines input, with type `numpy.array` and initializes it.
-- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script.
-
-### Testing Backward Operators
-
-Some key points in checking gradient above include:
-
-- `test_check_grad_normal` calls `check_grad` to validate the correctness and stability of the computed gradients through numerical methods.
-  - The first argument `["X", "Y"]` specifies that the gradients of inputs `X` and `Y` are checked.
-  - The second argument `"Out"` points to the network's final output target `Out`.
-  - The third argument `max_relative_error` specifies the maximum relative error tolerated during the gradient check.
-- The `test_check_grad_ingore_x` and `test_check_grad_ingore_y` branches test the cases where only one input's gradient needs to be computed.
-
-### Compiling and Running
-
-
-Any new unit testing file of the format `test_*.py` added to the directory `python/paddle/fluid/tests/unittests/` is automatically added to the project to compile.
-
-Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project**, and compilation requires the flag `WITH_TESTING` to be on, i.e. `cmake paddle_dir -DWITH_TESTING=ON`.
-
-After successfully compiling the project, run the following command to run unit tests:
-
-```bash
-make test ARGS="-R test_mul_op -V"
-```
-
-Or,
-
-```bash
-ctest -R test_mul_op
-```
-
-## Remarks
-
-- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OPERATOR(B, ...)` in `A_op.cc` will cause unit testing failures.
-- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
-- If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`.
diff --git a/doc/fluid/dev/new_op_kernel.md b/doc/fluid/dev/new_op_kernel.md
deleted file mode 100644
index 87e617d44041bde9c9051151878ffb4304689b3c..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/new_op_kernel.md
+++ /dev/null
@@ -1,121 +0,0 @@
-# Add Kernels for a New Device
-
-## Background
-
-PaddlePaddle Fluid has hundreds of operators. Each operator could have one or more kernels. A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
-
-[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) explains how to add an operator, and its kernels. The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md). An operator chooses the right kernel at runtime. This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md).
-
-## Write Kernels for A New Device
-
-### Add A New Device
-
-For some historical reasons, we misuse the word *library* for *device*. For example, we call the device type the *library type*. An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/library_type.h#L24). We will correct this ASAP.
-
-To register a new device, we need to add an enum value to `LibraryType`:
-
-```
-enum class LibraryType {
- kPlain = 0,
- kMKLDNN = 1,
- kCUDNN = 2,
-};
-```
-
-
-### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53)
-
-If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53). For example `CUDAPlace`:
-
-```cpp
-struct CUDAPlace {
- CUDAPlace() : CUDAPlace(0) {}
- explicit CUDAPlace(int d) : device(d) {}
-
- inline int GetDeviceId() const { return device; }
- // needed for variant equality comparison
- inline bool operator==(const CUDAPlace &o) const {
- return device == o.device;
- }
- inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
-
- int device;
-};
-
-typedef boost::variant<CUDAPlace, CPUPlace> Place;
-```
-
-### Add [device context](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37)
-After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37) for it.
-
-```cpp
-class DeviceContext {
- public:
- virtual ~DeviceContext() {}
- virtual Place GetPlace() const = 0;
-
- virtual void Wait() const {}
-};
-```
-
-### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L351) for your Device.
-
-A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md)
-
-```cpp
-class OpKernelBase {
- public:
- /**
- * ExecutionContext is the only parameter of Kernel Run function.
- * Run will get input/output variables, state such as momentum and
- * device resource such as CUDA stream, cublas handle, etc. from
- * ExecutionContext. User should construct it before run the Operator.
- */
-
- virtual void Compute(const ExecutionContext& context) const = 0;
-
- virtual ~OpKernelBase() = default;
-};
-
-template <typename T>
-class OpKernel : public OpKernelBase {
- public:
- using ELEMENT_TYPE = T;
-};
-```
-
-
-### Register the OpKernel to framework
-
-After writing the components described above, we should register the kernel to the framework.
-
-We use `REGISTER_OP_KERNEL` to do the registration.
-
-```cpp
-REGISTER_OP_KERNEL(
- op_type,
- library_type,
- place_type,
- kernel0, kernel1, ...)
-```
-
-kernel0, kernel1 are kernels that have the same `op_type`, `library_type`, `place_type` but different `data_types`.
-
-Take [`conv2d`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/conv_cudnn_op.cu.cc#L318) as an example:
-
- ```cpp
- REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
-                    paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
-                    paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
-
- REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
-                    paddle::operators::CUDNNConvOpKernel<float>,
-                    paddle::operators::CUDNNConvOpKernel<double>);
- ```
-
-In the code above:
-
- - `conv2d` is the type/name of the operator
- - `CUDNN/CPU` is `library`
- - `paddle::platform::CUDAPlace/CPUPlace` is `place`
- - template parameter `float/double` on `CUDNNConvOpKernel` is `data_type`.
diff --git a/doc/fluid/dev/op_markdown_format.md b/doc/fluid/dev/op_markdown_format.md
deleted file mode 100644
index 4e539d7992e5f67ee7b07193b59b6b425b73c9e5..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/op_markdown_format.md
+++ /dev/null
@@ -1,64 +0,0 @@
-# Standard Markdown Format for Operators
-The following should be the standard format for documentation for all the operators that will get rendered in the `html`:
-
-```
-Operator Name (In PaddlePaddle)
-
-Operator Name (Standard)
-
-Operator description.
-
-LaTeX equation of how the operator performs an update.
-
-The signature of the operator.
-```
-
-Each section mentioned above has been covered in further detail in the rest of the document.
-
-## PaddlePaddle Operator Name
-This should be in all lowercase letters; if the name has multiple words, we separate them with underscores. For example:
-`array to lod tensor` should be written as `array_to_lod_tensor`.
-
-This naming convention should be standard across all PaddlePaddle operators.
-
-## Standard Operator Name
-This is the standard name of the operator as used in the community. The general standard is usually:
-- Standard abbreviations like `SGD` are written in all capital letters.
-- Operator names that combine multiple words into a single word use `camelCase` (capitalize each word boundary within the name).
-- Keep numbers inside a word as is, with no boundary delimiters.
-- Follow the name of the operator with the keyword: `Activation Operator.`
-
-## Operator description
-This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section.
-
-## LaTeX equation
-This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Words combined into a single variable name should be separated by an underscore (`_`).
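-
-For instance, the update of the `sgd` operator shown later in this document would be written as:
-
-```
-param\_out = param - learning\_rate \cdot grad
-```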
-
-## The signature
-This section describes the signature of the operator: a list of Inputs and Outputs, each of which has a brief description of what the variable represents and its type. The variable names follow the `CamelCase` naming convention. The proposed format is:
-`Section :
-VariableName : (VariableType) VariableDescription
-...
-...
-`
-
-
-The following example for an `sgd` operator covers the above-mentioned sections as they would ideally look in the `html`:
-
-```
-sgd
-
-SGD operator
-
-This operator implements one step of the stochastic gradient descent algorithm.
-
-param_out = param - learning_rate * grad
-
-Inputs:
-Param : (Tensor) Input parameter
-LearningRate : (Tensor) Learning rate of SGD
-Grad : (Tensor) Input gradient
-
-Outputs:
-ParamOut : (Tensor) Output parameter
-```
diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md
deleted file mode 100644
index 4c6728fba7150b0f1e180e57590f18a5b677c70d..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/releasing_process_cn.md
+++ /dev/null
@@ -1,199 +0,0 @@
-# PaddlePaddle Release Process
-
-PaddlePaddle manages its branches with the git-flow branching model and uses the [Semantic Versioning](http://semver.org/) standard for PaddlePaddle version numbers.
-
-Each time PaddlePaddle releases a new version, the following process is followed:
-
-1. Fork a new branch from the `develop` branch, named `release/[version]`, e.g. `release/0.10.0`.
-1. Tag the new branch with `[version]rc.[patch]`. The first tag is `0.10.0rc1`, the second `0.10.0rc2`, and so on.
-1. For this release, perform the following steps:
- * Use the Regression Test List as a checklist to test the correctness of this release.
- * If any test fails, record all failing cases, fix all bugs on this `release/[version]` branch, increase the patch number by one, and go back to step 2.
- * Update the version information in `python/setup.py.in` and set the `istaged` field to `True`.
- * Publish the Python wheel packages of this version to pypi.
- * Update the Docker images (see the detailed steps below).
-1. After step 3 is done, merge the `release/[version]` branch into the master branch and tag the merge commit on master with `[version]`. Then merge `master` back into `develop`.
-1. Write the Release Note collaboratively.
-
-Note that:
-
-* Once a `release/[version]` branch is created, merging from `develop` into it is generally not allowed. This keeps the feature set of the `release/[version]` branch closed, which makes it easier for testers to verify PaddlePaddle's behavior.
-* While a `release/[version]` branch exists, any bugfix branch must be merged into all three branches: `master`, `develop`, and `release/[version]`.
-
-## Publish Wheel Packages to pypi
-
-1. Use [PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
-to build the binaries automatically. As shown in the figure below, choose the versions to publish (usually one CPU
-version and one GPU version), click the "..." button to the right of "run" to open the dialog box, select the branch to release in the second tab (Changes), here 0.11.0, and then click the "Run Build" button.
-
-1. After the build finishes, the three generated binaries can be found in the "Artifacts" dropdown on this page, corresponding to the CAPI, `cp27m`, and `cp27mu` versions.
-1. Since pypi.python.org currently follows the [strict naming convention PEP 513](https://www.python.org/dev/peps/pep-0513), the platform-related suffix of the wheel package must be renamed before uploading with twine, e.g. change `linux_x86_64` to `manylinux1_x86_64`.
-1. Upload:
-```
-cd build/python
-pip install twine
-twine upload dist/[package to upload]
-```
-
-* Note: The CI environment uses the Docker images from https://github.com/PaddlePaddle/buildtools as the build environment to support more Linux
- distributions. If you need to build manually, you can use these images as well. They can also be downloaded from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/.
-* pypi does not support overwriting uploads, so once a wheel package of a given version is published it cannot be changed. The next wheel package must use a new version number before it can be uploaded.
-
-## Publish Docker Images
-
-After the PaddlePaddle CI above finishes building the wheels, it automatically pushes Docker images to DockerHub, so publishing Docker images only requires tagging
-the automatically pushed images with the corresponding version number:
-
-```
-docker pull [image]:latest
-docker tag [image]:latest [image]:[version]
-docker push [image]:[version]
-```
-
-The image tags that need to be updated include:
-
-* `[version]`: CPU version
-* `[version]-openblas`: openblas version
-* `[version]-gpu`: GPU version (CUDA 8.0, cuDNN 5)
-* `[version]-gpu-[cudaver]-[cudnnver]`: images for different CUDA and cuDNN versions
-
-Afterwards, visit https://hub.docker.com/r/paddlepaddle/paddle/tags/ to check whether the publication succeeded.
-
-## PaddlePaddle Branching Model
-
-PaddlePaddle development follows the [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) branching model, with some adaptations for GitHub's features.
-
-* PaddlePaddle's main repository follows the [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) branching model. Specifically:
- * The `master` branch is the stable branch. Every version on the `master` branch has passed both unit tests and regression tests.
- * The `develop` branch is the development branch. Every version on the `develop` branch has passed unit tests, but not regression tests.
- * A `release/[version]` branch is a temporary branch created for each release. Code at this stage is undergoing regression testing.
-
-* Forked repositories of other users do not need to strictly follow the [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) branching model, but every branch in a forked repository is effectively a feature branch.
- * Recommendation: use the `develop` branch of the developer's fork to sync with the main repository's `develop` branch.
- * Recommendation: in the developer's fork, create feature branches based on `develop`.
- * When a feature branch is finished, submit a `Pull Request` to PaddlePaddle's main repository for code review.
- * During review, developers can keep pushing commits to their feature branch as they revise the code.
-
-* Bugfix branches are also maintained in the developer's own fork. Unlike feature branches, a bugfix branch must open `Pull Request`s against the main repository's `master`, `develop`, and, if present, `release/[version]` branches at the same time.
-
-## PaddlePaddle Regression Test List
-
-This list describes the features that must be tested before each PaddlePaddle release.
-
-### All Chapters of the PaddlePaddle Book
-
-Every PaddlePaddle release must first guarantee that all chapters of the PaddlePaddle Book work correctly. Correctness includes verifying the models trained both with the current `paddle_trainer` and with pure `Python` training (V2 and Fluid).
-
-
-| | Quick Start | Recognize Digits | Image Classification | Word2Vec | Sentiment Analysis | Semantic Role Labeling | Machine Translation | Personalized Recommendation |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| API.V2 + Docker + GPU | | | | | | | | |
-| API.V2 + Docker + CPU | | | | | | | | |
-| `paddle_trainer` + Docker + GPU | | | | | | | | |
-| `paddle_trainer` + Docker + CPU | | | | | | | | |
-| API.V2 + Ubuntu + GPU | | | | | | | | |
-| API.V2 + Ubuntu + CPU | | | | | | | | |
-| `paddle_trainer` + Ubuntu + GPU | | | | | | | | |
-| `paddle_trainer` + Ubuntu + CPU | | | | | | | | |
-
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
deleted file mode 100644
index f989b964d6d1a329bbe31adc7ec10db017acaefa..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/releasing_process_en.md
+++ /dev/null
@@ -1,210 +0,0 @@
-# PaddlePaddle Releasing Process
-
-PaddlePaddle manages its branches using the "git-flow branching model" and uses [Semantic Versioning](http://semver.org/) for its version numbers.
-
-Each time we release a new PaddlePaddle version, we should follow the below steps:
-
-1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
-1. Push a new tag on the release branch; the tag name should be like `[version]rc.patch`. The
- first tag should be `0.10.0rc1`, the second `0.10.0rc2`, and so on.
-1. After that, we should do:
- * Run all regression tests on the Regression Test List (see PaddlePaddle TeamCity CI) to confirm
- that this release has no major bugs.
- * If regression tests fail, we must fix those bugs and create a new `release/[version]`
- branch from the previous release branch.
- * Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
- * Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail).
- * Update the Docker images (see below instructions for detail).
-1. After above step, merge `release/[version]` branch to master and push a tag on the master commit,
- then merge `master` to `develop`.
-1. Update the Release Note.
-
-***NOTE:***
-
-* Do ***NOT*** merge commits from the develop branch into release branches, so that the release branch contains
-  features only for the current release and we can test exactly that version.
-* If we want to fix bugs on release branches, we must merge the fix into the master, develop, and release branches.
-
-## Publish Wheel Packages to pypi
-
-1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
- to build all wheel packages needed for publishing. As shown in the following picture, choose a build
- version, click the "..." button on the right side of the "Run" button, and switch to the second tab in the
- pop-up box, choose the current release branch, and click the "Run Build" button. You may repeat this
- step to start builds of different versions.
-
-1. After the build succeeds, download the outputs under "Artifacts" including capi, `cp27m` and `cp27mu`.
-1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we
- upload the package using `twine`, we need to rename the package from `linux_x86_64` to
- `manylinux1_x86_64`.
-1. Start the upload:
- ```
- cd build/python
- pip install twine
- twine upload dist/[package to upload]
- ```
-
-* NOTE: We use a special Docker image to build our releases so that we can support more Linux distributions. You can
-  download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/, or build it using the
-  scripts under `tools/manylinux1`.
-* pypi does not allow overwriting an already uploaded wheel package, even if you delete the
-  old version. You must change the version number before uploading a new one.
-
-## Publish Docker Images
-
-Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:
-
-```
-docker pull [image]:latest
-docker tag [image]:latest [image]:[version]
-docker push [image]:[version]
-```
-
-Tags that need to be updated are:
-* `[version]`: CPU only version image
-* `[version]-openblas`: openblas version image
-* `[version]-gpu`: GPU version(using CUDA 8.0 cudnn 5)
-* `[version]-gpu-[cudaver]-[cudnnver]`: tag for different cuda, cudnn versions
-
-You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
-
-## Branching Model
-
-We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
-with some modifications:
-
-* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
-* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no
- regression tests are run.
-* A `release/[version]` branch is used to publish each release. The latest release branch receives
- bug fixes only for that version, but no feature updates.
-* Developer forks are not required to follow the
- [git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
- branching model; every branch in a fork acts like a feature branch.
- * Advice: use the fork's develop branch to sync up with the main repo's develop branch.
- * Advice: create new feature branches in the fork based on its develop branch.
- * Use that branch on the developer's fork to create pull requests and start reviews.
- * Developers can push new commits to that branch while the pull request is open.
-* Bug fixes also start from the developer's forked repo, and a bugfix branch can be merged into
- `master`, `develop`, and the `release` branches.
-
-## PaddlePaddle Regression Test List
-
-### All Chapters of PaddlePaddle Book
-
-We need to guarantee that all chapters of the PaddlePaddle Book run correctly, including
-V1 (`paddle_trainer`) training, V2 training, and Fluid training.
-
-
-| | Linear Regression | Recognize Digits | Image Classification | Word2Vec | Personalized Recommendation | Sentiment Analysis | Semantic Role Labeling | Machine Translation |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| API.V2 + Docker + GPU | | | | | | | | |
-| API.V2 + Docker + CPU | | | | | | | | |
-| `paddle_trainer` + Docker + GPU | | | | | | | | |
-| `paddle_trainer` + Docker + CPU | | | | | | | | |
-| API.V2 + Ubuntu + GPU | | | | | | | | |
-| API.V2 + Ubuntu + CPU | | | | | | | | |
-| `paddle_trainer` + Ubuntu + GPU | | | | | | | | |
-| `paddle_trainer` + Ubuntu + CPU | | | | | | | | |
-
-
diff --git a/doc/fluid/dev/src/fc.py b/doc/fluid/dev/src/fc.py
deleted file mode 100644
index 3b074821cc2276a29b2a8639e82199fcf4d72020..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/src/fc.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-def fc(input,
- size,
- num_flatten_dims=1,
- param_attr=None,
- bias_attr=None,
- act=None,
- name=None):
- """
- **Fully Connected Layer**
-
- The fully connected layer can take multiple tensors as its inputs. It
- creates a variable called weights for each input tensor, which represents
- a fully connected weight matrix from each input unit to each output unit.
- The fully connected layer multiplies each input tensor with its corresponding
- weight to produce an output Tensor. If multiple input tensors are given,
- the results of the multiplications will be summed up. If bias_attr is
- not None, a bias variable will be created and added to the output. Finally,
- if activation is not None, it will be applied to the output as well.
-
- This process can be formulated as follows:
-
- .. math::
-
- Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
-
- In the above equation:
-
- * :math:`N`: Number of the input.
- * :math:`X_i`: The input tensor.
- * :math:`W`: The weights created by this layer.
- * :math:`b`: The bias parameter created by this layer (if needed).
- * :math:`Act`: The activation function.
- * :math:`Out`: The output tensor.
-
- Args:
- input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
- the input tensor(s) is at least 2.
- size(int): The number of output units in this layer.
- num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
- two dimensions. If this happens, the multidimensional tensor will first be flattened
- into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
- tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
- dimensions will be flattened to form the first dimension of the final matrix (height of
- the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
- form the second dimension of the final matrix (width of the matrix). For example, suppose
- `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
- Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
- param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
- parameters/weights of this layer.
- bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
- of this layer. If it is set to None, no bias will be added to the output units.
- act (str, default None): Activation to be applied to the output of this layer.
- name (str, default None): The name of this layer.
-
- Returns:
- A tensor variable storing the transformation result.
-
- Raises:
- ValueError: If rank of the input tensor is less than 2.
-
- Examples:
- .. code-block:: python
-
- data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
- fc = fluid.layers.fc(input=data, size=1000, act="tanh")
- """
diff --git a/doc/fluid/dev/support_new_device.md b/doc/fluid/dev/support_new_device.md
deleted file mode 100644
index 051a463cfcf97df2e2d5b6a880923ca70fefbd6e..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/support_new_device.md
+++ /dev/null
@@ -1,240 +0,0 @@
-# Design Doc: Supporting new Device/Library
-
-## Background
-
-Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.
-
-On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
-
-On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
-
-So, how to support a new Device/Library in Fluid becomes a challenge.
-
-
-## Basic: Integrate A New Device/Library
-
-For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/read_source.md).
-
-There are mainly three parts that we have to consider while integrating a new device/library:
-
-- Place and DeviceContext: indicate the device id and manage hardware resources
-
-- Memory and Tensor: malloc/free data on certain device
-
-- Math Functor and OpKernel: implement computing unit on certain devices/libraries
-
-### Place and DeviceContext
-
-Please note that devices and computing libraries do not correspond one-to-one. A device can support many computing libraries, and a computing library can also support several devices.
-
-#### Place
-Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
-
-```
-        | CPUPlace
-Place --| CUDAPlace
-        | FPGAPlace
-```
-
-And `Place` is defined as follows:
-
-```
-typedef boost::variant<CUDAPlace, CPUPlace> Place;
-```
-
-#### DeviceContext
-
-Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L30) to manage the resources in different libraries, such as the CUDA stream in `CUDADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
-
-
-```
-                /-> CPUDeviceContext
-DeviceContext ----> CUDADeviceContext
-                \-> FPGADeviceContext
-```
-
-An example of Nvidia GPU is as follows:
-
-- DeviceContext
-
-
-```
-class DeviceContext {
- virtual Place GetPlace() const = 0;
-};
-```
-
-
-- CUDADeviceContext
-
-
-```
-class CUDADeviceContext : public DeviceContext {
- Place GetPlace() const override { return place_; }
-private:
- CUDAPlace place_;
- cudaStream_t stream_;
- cublasHandle_t cublas_handle_;
- std::unique_ptr<Eigen::GpuDevice> eigen_device_;  // binds with stream_
-};
-```
-
-### Memory and Tensor
-
-
-#### memory module
-
-Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/memory/memory.h#L36):
-
-```
-template <typename Place>
-void* Alloc(Place place, size_t size);
-
-template <typename Place>
-void Free(Place place, void* ptr);
-
-template <typename Place>
-size_t Used(Place place);
-```
-
-To implement these interfaces, we have to implement MemoryAllocator for different Devices.
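-
-A minimal sketch of what a per-place allocator boils down to is shown below; `FPGAPlace` is a hypothetical device, and plain `malloc` stands in for the vendor runtime calls a real implementation would make:
-
-```cpp
-#include <cstddef>
-#include <cstdlib>
-#include <iostream>
-
-struct CPUPlace {};
-struct FPGAPlace { int device; };  // hypothetical new device
-
-// Each place provides its own allocation/free routines.
-void* Alloc(CPUPlace, std::size_t size) { return std::malloc(size); }
-void Free(CPUPlace, void* ptr) { std::free(ptr); }
-
-void* Alloc(FPGAPlace place, std::size_t size) {
-  // A real implementation would ask the vendor runtime for memory on
-  // `place.device`; malloc is used here only so the sketch runs anywhere.
-  (void)place;
-  return std::malloc(size);
-}
-void Free(FPGAPlace, void* ptr) { std::free(ptr); }
-
-int main() {
-  CPUPlace cpu;
-  void* p = Alloc(cpu, 1024);
-  std::cout << "allocated 1 KB on CPU at " << p << "\n";
-  Free(cpu, p);
-}
-```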
-
-
-#### Tensor
-
-[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h#L36) holds data with some shape in a specific Place.
-
-```cpp
-class Tensor {
- public:
- /*! Return a pointer to mutable memory block. */
- template <typename T>
- inline T* data();
-
- /**
- * @brief Return a pointer to mutable memory block.
- * @note If not exist, then allocation.
- */
- template <typename T>
- inline T* mutable_data(platform::Place place);
-
- /**
- * @brief Return a pointer to mutable memory block.
- *
- * @param[in] dims The dimensions of the memory block.
- * @param[in] place The place of the memory block.
- *
- * @note If not exist, then allocation.
- */
- template <typename T>
- inline T* mutable_data(DDim dims, platform::Place place);
-
- /*! Resize the dimensions of the memory block. */
- inline Tensor& Resize(const DDim& dims);
-
- /*! Return the dimensions of the memory block. */
- inline const DDim& dims() const;
-
- private:
- /*! holds the memory block if allocated. */
- std::shared_ptr<Placeholder> holder_;
-
- /*! points to dimensions of memory block. */
- DDim dim_;
-};
-```
-
-`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, use `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
-
-```cpp
-paddle::framework::Tensor t;
-paddle::platform::CPUPlace place;
-// set size first
-t.Resize({2, 3});
-// allocate memory on CPU later
-t.mutable_data<float>(place);
-```
-
-
-
-### Math Functor and OpKernel
-
-Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators; this common part is put in the operators/math directory as basic functors.
-
-Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/math/maxouting.h#L27) as an example:
-
-The interface is defined in the header file.
-
-```
-template <typename DeviceContext, typename T>
-class MaxOutFunctor {
- public:
- void operator()(const DeviceContext& context, const framework::Tensor& input,
- framework::Tensor* output, int groups);
-};
-```
-
-The CPU implementation is in the .cc file:
-
-```
-template <typename T>
-class MaxOutFunctor<platform::CPUDeviceContext, T> {
- public:
- void operator()(const platform::CPUDeviceContext& context,
- const framework::Tensor& input, framework::Tensor* output,
- int groups) {
- ...
- }
-};
-```
-
-The CUDA implementation is in the .cu file:
-
-```
-template <typename T>
-class MaxOutFunctor<platform::CUDADeviceContext, T> {
- public:
- void operator()(const platform::CUDADeviceContext& context,
- const framework::Tensor& input, framework::Tensor* output,
- int groups) {
- ...
- }
-};
-```
-
-
-We first obtain the computing handle from a concrete DeviceContext and then compute on tensors.
-
-The implementation of `OpKernel` is similar to that of the math functors; the extra thing we need to do is register the OpKernel in a global map.
-
-Fluid provides different registration interfaces in op_registry.h.
-
-
-Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/crop_op.cc#L134) operator as an example:
-
-In .cc file:
-
-```
-REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
-REGISTER_OP_CPU_KERNEL(
-    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
-```
-
-In .cu file:
-
-```
-REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
-REGISTER_OP_CUDA_KERNEL(
-    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
-```
-
-
-## Advanced topics: How to switch between different Device/Library
-
-Generally, we implement an OpKernel for every Device/Library of an Operator, so we can easily train a convolutional neural network on a GPU. However, some OpKernels are not suitable for a specific device. For example, the crf operator can only run on the CPU, whereas most other operators can run on the GPU. To achieve high performance in such circumstances, we have to switch between different Devices/Libraries during execution.
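-
-A toy sketch of such a switch is shown below: kernels are looked up by (op_type, place), and when the requested place has no kernel the lookup falls back to the CPU entry. This only illustrates the idea; it is not Paddle's actual kernel-selection logic:
-
-```cpp
-#include <functional>
-#include <iostream>
-#include <map>
-#include <string>
-#include <utility>
-
-using KernelKey = std::pair<std::string, std::string>;  // (op_type, place)
-using KernelFn = std::function<void()>;
-
-std::map<KernelKey, KernelFn> registry = {
-    {{"conv2d", "CUDAPlace"}, [] { std::cout << "conv2d on GPU\n"; }},
-    {{"conv2d", "CPUPlace"}, [] { std::cout << "conv2d on CPU\n"; }},
-    {{"crf", "CPUPlace"}, [] { std::cout << "crf on CPU\n"; }},  // CPU-only kernel
-};
-
-// Prefer the requested place; fall back to CPU when no kernel is registered.
-// In a real framework the fallback also implies copying inputs to CPU memory.
-void Run(const std::string& op, const std::string& place) {
-  auto it = registry.find({op, place});
-  if (it == registry.end()) it = registry.find({op, "CPUPlace"});
-  it->second();
-}
-
-int main() {
-  Run("conv2d", "CUDAPlace");  // GPU kernel exists, runs on GPU
-  Run("crf", "CUDAPlace");     // no GPU kernel, falls back to the CPU kernel
-}
-```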
-
-
-For more details, please refer to the following docs:
-
-- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md)
-- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md)
diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md
deleted file mode 100644
index 75922e7d85a13e53ce94619a48d8da8b960e6c9a..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/use_eigen_cn.md
+++ /dev/null
@@ -1,146 +0,0 @@
-# How to Use Eigen in Paddle
-
-Essentially, a neural network is a computation graph. The data needed for the computation is stored in `Tensor`s, and the computation process is described by `Operator`s. At execution time, an `Operator` calls the `Compute` interface of its corresponding `OpKernel` to operate on the `Tensor`s.
-
-
-## The Eigen Tensor Module
-
-The Eigen Tensor module provides powerful support for element-wise computation, and a single piece of code can run on both CPU and GPU. However, Eigen Tensor is a module still under development, so its testing may be incomplete and its documentation is limited.
-
-For a detailed introduction to the Eigen Tensor module, please refer to [document 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [document 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
-
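-As a small standalone taste of element-wise computation with Eigen Tensor (this snippet only depends on Eigen's `unsupported` headers and is not Paddle code):
-
-```cpp
-#include <unsupported/Eigen/CXX11/Tensor>
-#include <iostream>
-
-int main() {
-  // Two 2x3 float tensors; the expression below is evaluated element-wise.
-  Eigen::Tensor<float, 2> a(2, 3);
-  Eigen::Tensor<float, 2> b(2, 3);
-  a.setConstant(1.0f);
-  b.setConstant(2.0f);
-
-  Eigen::Tensor<float, 2> c = a + b * 0.5f;  // c(i, j) = a(i, j) + 0.5f * b(i, j)
-  std::cout << c << std::endl;
-  return 0;
-}
-```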
-
-## paddle::framework::Tensor
-
-The Paddle Tensor is defined in the framework directory. Its main interfaces are as follows:
-
-```cpp
-class Tensor {
- public:
- /*! Return a pointer to mutable memory block. */
- template <typename T>
- inline T* data();
-
- /**
- * @brief Return a pointer to mutable memory block.
- * @note If not exist, then allocation.
- */
- template <typename T>
- inline T* mutable_data(platform::Place place);
-
- /**
- * @brief Return a pointer to mutable memory block.
- *
- * @param[in] dims The dimensions of the memory block.
- * @param[in] place The place of the memory block.
- *
- * @note If not exist, then allocation.
- */
- template <typename T>
- inline T* mutable_data(DDim dims, platform::Place place);
-
- /*! Resize the dimensions of the memory block. */
- inline Tensor& Resize(const DDim& dims);
-
- /*! Return the dimensions of the memory block. */
- inline const DDim& dims() const;
-
- private:
- /*! holds the memory block if allocated. */
- std::shared_ptr<Placeholder> holder_;
-
- /*! points to dimensions of memory block. */
- DDim dim_;
-};
-```
-
-The purpose of `Placeholder` is to delay memory allocation; that is, we can first define a Tensor, then use the Resize interface to set its size, and finally call the mutable_data interface to allocate the actual memory.
-
-```cpp
-paddle::framework::Tensor t;
-paddle::platform::CPUPlace place;
-// set size first
-t.Resize({2, 3});
-// allocate memory on CPU later
-t.mutable_data<float>(place);
-```
-
-### Example of Using paddle::framework::Tensor
-The following uses AddOp as an example to illustrate how Tensor is used:
-
-- InferShape
-
-When running the computation graph of a neural network, we first call the `InferShape` interface of each `Operator` to set the size of the output Tensor based on the size of the input Tensors; the `Resize` interface is called at this point.
-
-```cpp
-void InferShape(const framework::InferShapeContext &ctx) const override {
- PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
-                   ctx.Input<Tensor>("Y")->dims(),
-                   "Two input of Add Op's dimension must be same.");
- ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("X")->dims());
-}
-```
-
-
-- Run
-
-The `Run` interface of `Operator` eventually calls the `Compute` interface of the corresponding `OpKernel`; at that point memory is actually allocated, and the `mutable_data` interface is called.
-
-```cpp
-void Compute(const framework::ExecutionContext& context) const override {
- auto* input0 = context.Input