From bd3642aa6e2b3fdb0306e8bf0e469f43381abe80 Mon Sep 17 00:00:00 2001
From: zq19 <50872563+zq19@users.noreply.github.com>
Date: Fri, 6 Sep 2019 15:37:00 +0800
Subject: [PATCH] fix flags (#1139)

Add 'FLAGS_'
---
 doc/fluid/flags/cudnn_cn.rst       |  8 +++----
 doc/fluid/flags/cudnn_en.rst       |  8 +++----
 doc/fluid/flags/data_cn.rst        |  6 ++---
 doc/fluid/flags/data_en.rst        |  6 ++---
 doc/fluid/flags/debug_cn.rst       | 10 ++++-----
 doc/fluid/flags/debug_en.rst       | 10 ++++-----
 doc/fluid/flags/device_cn.rst      |  4 ++--
 doc/fluid/flags/device_en.rst      |  4 ++--
 doc/fluid/flags/distributed_cn.rst | 34 ++++++++++++++--------------
 doc/fluid/flags/distributed_en.rst | 34 ++++++++++++++--------------
 doc/fluid/flags/executor_cn.rst    |  8 +++----
 doc/fluid/flags/executor_en.rst    |  8 +++----
 doc/fluid/flags/memory_cn.rst      | 34 ++++++++++++++--------------
 doc/fluid/flags/memory_en.rst      | 36 +++++++++++++++---------------
 doc/fluid/flags/others_cn.rst      | 10 ++++-----
 doc/fluid/flags/others_en.rst      | 10 ++++-----
 16 files changed, 115 insertions(+), 115 deletions(-)

diff --git a/doc/fluid/flags/cudnn_cn.rst b/doc/fluid/flags/cudnn_cn.rst
index e09f29f49..dda86afd4 100755
--- a/doc/fluid/flags/cudnn_cn.rst
+++ b/doc/fluid/flags/cudnn_cn.rst
@@ -3,7 +3,7 @@
 cudnn
 ==================
-conv_workspace_size_limit
+FLAGS_conv_workspace_size_limit
 *******************************************
 (始于0.13.0)
@@ -18,7 +18,7 @@
 Uint64型,缺省值为4096。即4G内存工作区。
 FLAGS_conv_workspace_size_limit=1024 - 将用于选择cuDNN卷积算法的工作区限制大小设置为1024MB。
-cudnn_batchnorm_spatial_persistent
+FLAGS_cudnn_batchnorm_spatial_persistent
 *******************************************
 (始于1.4.0)
@@ -37,7 +37,7 @@
 FLAGS_cudnn_batchnorm_spatial_persistent=True - 开启CUDNN_BATCHNORM_SPATIAL_PERSISTENT模式。
 此模式在某些任务中可以更快,因为将为CUDNN_DATA_FLOAT和CUDNN_DATA_HALF数据类型选择优化路径。我们默认将其设置为False的原因是此模式可能使用原子整数缩减(scaled atomic integer reduction)而导致某些输入数据范围的数字溢出。
-cudnn_deterministic
+FLAGS_cudnn_deterministic
 *******************************************
 (始于0.13.0)
@@ -56,7 +56,7 @@
 FLAGS_cudnn_deterministic=True - 选择cuDNN中的确定性函数。
 现在,在cuDNN卷积和池化Operator中启用此flag。确定性算法速度可能较慢,因此该flag通常用于调试。
-cudnn_exhaustive_search
+FLAGS_cudnn_exhaustive_search
 *******************************************
 (始于1.2.0)
diff --git a/doc/fluid/flags/cudnn_en.rst b/doc/fluid/flags/cudnn_en.rst
index 97acf43f9..1c29e3de0 100755
--- a/doc/fluid/flags/cudnn_en.rst
+++ b/doc/fluid/flags/cudnn_en.rst
@@ -3,7 +3,7 @@
 cudnn
 ==================
-conv_workspace_size_limit
+FLAGS_conv_workspace_size_limit
 *******************************************
 (since 0.13.0)
@@ -18,7 +18,7 @@
 Example
 FLAGS_conv_workspace_size_limit=1024 sets the workspace limit size for choosing cuDNN convolution algorithms to 1024MB.
-cudnn_batchnorm_spatial_persistent
+FLAGS_cudnn_batchnorm_spatial_persistent
 *******************************************
 (since 1.4.0)
@@ -37,7 +37,7 @@
 Note
 This mode can be faster in some tasks because an optimized path will be selected for CUDNN_DATA_FLOAT and CUDNN_DATA_HALF data types. The reason we set it to False by default is that this mode may use scaled atomic integer reduction, which may cause numerical overflow for some input data ranges.
-cudnn_deterministic
+FLAGS_cudnn_deterministic
 *******************************************
 (since 0.13.0)
@@ -56,7 +56,7 @@
 Note
 Now this flag is enabled in the cuDNN convolution and pooling operators. The deterministic algorithms may be slower, so this flag is generally used for debugging.
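
The cuDNN flags above are plain environment flags. A minimal way to try them from Python — a sketch that assumes, as these docs imply, that Paddle 1.x picks up FLAGS_* variables from the environment when paddle.fluid is first imported; the values are illustrative::

    import os

    # Flags must be in the environment before the first paddle.fluid import.
    os.environ["FLAGS_conv_workspace_size_limit"] = "1024"  # 1024MB cuDNN workspace
    os.environ["FLAGS_cudnn_deterministic"] = "true"        # reproducible, possibly slower

    import paddle.fluid as fluid
    print(fluid.__version__)
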
-cudnn_exhaustive_search
+FLAGS_cudnn_exhaustive_search
 *******************************************
 (since 1.2.0)
diff --git a/doc/fluid/flags/data_cn.rst b/doc/fluid/flags/data_cn.rst
index 061207dcb..db4bd5e3c 100755
--- a/doc/fluid/flags/data_cn.rst
+++ b/doc/fluid/flags/data_cn.rst
@@ -3,7 +3,7 @@
 ==================
-enable_cublas_tensor_op_math
+FLAGS_enable_cublas_tensor_op_math
 *******************************************
 (始于1.2.0)
@@ -15,10 +15,10 @@
 Bool型,缺省值为False。
 示例
 -------
-enable_cublas_tensor_op_math=True - 使用Tensor Core。
+FLAGS_enable_cublas_tensor_op_math=True - 使用Tensor Core。
-use_mkldnn
+FLAGS_use_mkldnn
 *******************************************
 (始于0.13.0)
diff --git a/doc/fluid/flags/data_en.rst b/doc/fluid/flags/data_en.rst
index 96c29933c..c156a37dd 100755
--- a/doc/fluid/flags/data_en.rst
+++ b/doc/fluid/flags/data_en.rst
@@ -2,7 +2,7 @@
 data processing
 ==================
-enable_cublas_tensor_op_math
+FLAGS_enable_cublas_tensor_op_math
 *******************************************
 (since 1.2.0)
@@ -14,10 +14,10 @@
 Bool. The default value is False.
 Example
 -------
-enable_cublas_tensor_op_math=True will use Tensor Core.
+FLAGS_enable_cublas_tensor_op_math=True will use Tensor Core.
-use_mkldnn
+FLAGS_use_mkldnn
 *******************************************
 (since 0.13.0)
diff --git a/doc/fluid/flags/debug_cn.rst b/doc/fluid/flags/debug_cn.rst
index 2927ae483..f414de88c 100755
--- a/doc/fluid/flags/debug_cn.rst
+++ b/doc/fluid/flags/debug_cn.rst
@@ -3,7 +3,7 @@
 ==================
-check_nan_inf
+FLAGS_check_nan_inf
 ********************
 (始于0.13.0)
@@ -18,7 +18,7 @@
 Bool型,缺省值为False。
 FLAGS_check_nan_inf=True - 检查Operator的结果是否含有Nan或Inf。
-cpu_deterministic
+FLAGS_cpu_deterministic
 *******************************************
 (始于0.15.0)
@@ -33,7 +33,7 @@
 Bool型,缺省值为False。
 FLAGS_cpu_deterministic=True - 在CPU侧确定计算结果。
-enable_rpc_profiler
+FLAGS_enable_rpc_profiler
 *******************************************
 (始于1.0.0)
@@ -48,7 +48,7 @@
 Bool型,缺省值为False。
 FLAGS_enable_rpc_profiler=True - 启用RPC分析器并在分析器文件中记录时间线。
-multiple_of_cupti_buffer_size
+FLAGS_multiple_of_cupti_buffer_size
 *******************************************
 (始于1.4.0)
@@ -63,7 +63,7 @@
 Int32型,缺省值为1。
 FLAGS_multiple_of_cupti_buffer_size=1 - 将CUPTI设备缓冲区大小的倍数设为1。
-reader_queue_speed_test_mode
+FLAGS_reader_queue_speed_test_mode
 *******************************************
 (始于1.1.0)
diff --git a/doc/fluid/flags/debug_en.rst b/doc/fluid/flags/debug_en.rst
index cc62d76fb..39b93240d 100755
--- a/doc/fluid/flags/debug_en.rst
+++ b/doc/fluid/flags/debug_en.rst
@@ -2,7 +2,7 @@
 debug
 ==================
-check_nan_inf
+FLAGS_check_nan_inf
 **************************************
 (since 0.13.0)
@@ -17,7 +17,7 @@
 Example
 FLAGS_check_nan_inf=True will check whether the result of an Operator contains Nan or Inf.
-cpu_deterministic
+FLAGS_cpu_deterministic
 *******************************************
 (since 0.15.0)
@@ -32,7 +32,7 @@
 Example
 FLAGS_cpu_deterministic=True will make the result of computation deterministic on the CPU side.
-enable_rpc_profiler
+FLAGS_enable_rpc_profiler
 *******************************************
 (Since 1.0.0)
@@ -47,7 +47,7 @@
 Example
 FLAGS_enable_rpc_profiler=True will enable the RPC profiler and record the timeline to the profiler file.
-multiple_of_cupti_buffer_size
+FLAGS_multiple_of_cupti_buffer_size
 *******************************************
 (since 1.4.0)
@@ -62,7 +62,7 @@
 Example
 FLAGS_multiple_of_cupti_buffer_size=1 sets the multiple of the CUPTI device buffer size to 1.
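
FLAGS_check_nan_inf is the quickest of the debug flags above to exercise. A sketch under the same assumption that FLAGS_* are read from the environment at import time; fluid.layers.log stands in for any op that could emit Inf (feeding 0.0 instead of 1.0 would trip the check)::

    import os
    os.environ["FLAGS_check_nan_inf"] = "true"  # fail fast if an op produces Nan/Inf

    import numpy as np
    import paddle.fluid as fluid

    x = fluid.layers.data(name="x", shape=[1], dtype="float32")
    y = fluid.layers.log(x)  # log(0) would yield -inf and trigger the check

    exe = fluid.Executor(fluid.CPUPlace())
    out, = exe.run(feed={"x": np.array([[1.0]], dtype="float32")}, fetch_list=[y])
    print(out)
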
-reader_queue_speed_test_mode
+FLAGS_reader_queue_speed_test_mode
 *******************************************
 (since 1.1.0)
diff --git a/doc/fluid/flags/device_cn.rst b/doc/fluid/flags/device_cn.rst
index 7fda8d692..0bed575e9 100755
--- a/doc/fluid/flags/device_cn.rst
+++ b/doc/fluid/flags/device_cn.rst
@@ -3,7 +3,7 @@
 ==================
-paddle_num_threads
+FLAGS_paddle_num_threads
 *******************************************
 (始于0.15.0)
@@ -18,7 +18,7 @@
 Int32型,缺省值为1。
 FLAGS_paddle_num_threads=2 - 将每个实例的最大线程数设为2。
-selected_gpus
+FLAGS_selected_gpus
 *******************************************
 (始于1.3)
diff --git a/doc/fluid/flags/device_en.rst b/doc/fluid/flags/device_en.rst
index eeae4d68d..5397ee9fc 100755
--- a/doc/fluid/flags/device_en.rst
+++ b/doc/fluid/flags/device_en.rst
@@ -3,7 +3,7 @@
 device management
 ==================
-paddle_num_threads
+FLAGS_paddle_num_threads
 *******************************************
 (since 0.15.0)
@@ -18,7 +18,7 @@
 Example
 FLAGS_paddle_num_threads=2 will use 2 threads as the max number of threads for each instance.
-selected_gpus
+FLAGS_selected_gpus
 *******************************************
 (since 1.3)
diff --git a/doc/fluid/flags/distributed_cn.rst b/doc/fluid/flags/distributed_cn.rst
index 5bf67884c..8c869ab46 100755
--- a/doc/fluid/flags/distributed_cn.rst
+++ b/doc/fluid/flags/distributed_cn.rst
@@ -3,7 +3,7 @@
 ==================
-communicator_fake_rpc
+FLAGS_communicator_fake_rpc
 **********************
 (始于1.5.0)
@@ -22,7 +22,7 @@
 FLAGS_communicator_fake_rpc=True - 启用通信器fake模式。
 该flag仅用于paddlepaddle的开发者,普通用户不应对其设置。
-communicator_independent_recv_thread
+FLAGS_communicator_independent_recv_thread
 **************************************
 (始于1.5.0)
@@ -41,7 +41,7 @@
 FLAGS_communicator_independent_recv_thread=True - 使用独立线程以从参数服务器接收参数。
 开发者使用该flag进行框架的调试与优化,普通用户不应对其设置。
-communicator_max_merge_var_num
+FLAGS_communicator_max_merge_var_num
 **************************************
 (始于1.5.0)
@@ -60,7 +60,7 @@
 FLAGS_communicator_max_merge_var_num=16 - 将要通过通信器合并为一个梯度并发送的梯度数设为16。
 该flag和训练器线程数有着密切关联,缺省值应和线程数一致。
-communicator_merge_sparse_grad
+FLAGS_communicator_merge_sparse_grad
 *******************************************
 (始于1.5.0)
@@ -79,11 +79,11 @@
 FLAGS_communicator_merge_sparse_grad=true - 设置合并稀疏梯度。
 合并稀疏梯度会耗费时间。如果重复ID较多,内存占用会变少,通信会变快;如果重复ID较少,则并不会节约内存。
-communicator_min_send_grad_num_before_recv
+FLAGS_communicator_min_send_grad_num_before_recv
 *******************************************
 (始于1.5.0)
-在通信器中,有一个发送线程向参数服务器发送梯度,一个接收线程从参数服务器接收参数,且它们之间彼此独立。该flag用于控制接收线程的频率。仅当发送线程至少发送communicator_min_send_grad_num_before_recv数量的梯度时,接收线程才会从参数服务器接收参数。
+在通信器中,有一个发送线程向参数服务器发送梯度,一个接收线程从参数服务器接收参数,且它们之间彼此独立。该flag用于控制接收线程的频率。仅当发送线程至少发送FLAGS_communicator_min_send_grad_num_before_recv数量的梯度时,接收线程才会从参数服务器接收参数。
 取值范围
 ---------------
@@ -98,7 +98,7 @@
 FLAGS_communicator_min_send_grad_num_before_recv=10 - 在接收线程从参数服务器接收参数之前,发送线程至少发送10个梯度。
 由于该flag和训练器的训练线程数强相关,而每个训练线程都会发送其梯度,所以缺省值应和线程数一致。
-communicator_send_queue_size
+FLAGS_communicator_send_queue_size
 *******************************************
 (始于1.5.0)
@@ -117,7 +117,7 @@
 FLAGS_communicator_send_queue_size=10 - 设置每个梯度的队列大小为10。
 该flag会影响训练速度,若队列大小过大,速度会变快但结果可能会变差。
-communicator_send_wait_times
+FLAGS_communicator_send_wait_times
 *******************************************
 (始于1.5.0)
@@ -132,7 +132,7 @@
 Int32型,缺省值为5。
 示例
 -------
 FLAGS_communicator_send_wait_times=5 - 将合并数没有达到max_merge_var_num的情况下发送线程等待的次数设为5。
-communicator_thread_pool_size
+FLAGS_communicator_thread_pool_size
 *******************************************
 (始于1.5.0)
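
For the device flags just renamed, a sketch that pins the visible GPU cards and the per-instance CPU threads before the first import (illustrative values; FLAGS_selected_gpus takes a comma-separated list of card ids)::

    import os

    os.environ["FLAGS_selected_gpus"] = "0,1"     # use cards 0 and 1 only
    os.environ["FLAGS_paddle_num_threads"] = "2"  # max 2 CPU threads per instance

    import paddle.fluid as fluid
    places = [fluid.CUDAPlace(0), fluid.CUDAPlace(1)]  # matches the list above
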
@@ -151,7 +151,7 @@
 FLAGS_communicator_thread_pool_size=10 - 设置线程池大小为10。
 大部分情况下,用户不需要设置该flag。
-dist_threadpool_size
+FLAGS_dist_threadpool_size
 *******************************************
 (始于1.0.0)
@@ -166,7 +166,7 @@
 Int32型,缺省值为0。
 示例
 -------
 FLAGS_dist_threadpool_size=10 - 将用于分布式模块的最大线程数设为10。
-rpc_deadline
+FLAGS_rpc_deadline
 *******************************************
 (始于1.0.0)
@@ -181,11 +181,11 @@
 Int32型,缺省值为180000,单位为ms。
 示例
 -------
 FLAGS_rpc_deadline=180000 - 将deadline超时设为3分钟。
-rpc_disable_reuse_port
+FLAGS_rpc_disable_reuse_port
 *******************************************
 (始于1.2.0)
-rpc_disable_reuse_port为True时,grpc的GRPC_ARG_ALLOW_REUSEPORT会被设置为False以禁用SO_REUSEPORT。
+FLAGS_rpc_disable_reuse_port为True时,grpc的GRPC_ARG_ALLOW_REUSEPORT会被设置为False以禁用SO_REUSEPORT。
 取值范围
 ---------------
 Bool型,缺省值为False。
 示例
 -------
 FLAGS_rpc_disable_reuse_port=True - 禁用SO_REUSEPORT。
-rpc_get_thread_num
+FLAGS_rpc_get_thread_num
 *******************************************
 (始于1.0.0)
@@ -211,7 +211,7 @@
 Int32型,缺省值为12。
 示例
 -------
 FLAGS_rpc_get_thread_num=6 - 将从参数服务器获取参数的线程数设为6。
-rpc_send_thread_num
+FLAGS_rpc_send_thread_num
 *******************************************
 (始于1.0.0)
@@ -226,11 +226,11 @@
 Int32型,缺省值为12。
 示例
 -------
 FLAGS_rpc_send_thread_num=6 - 将用于发送的线程数设为6。
-rpc_server_profile_path
+FLAGS_rpc_server_profile_path
 *******************************************
 (始于0.15.0)
-设置分析器输出日志文件路径前缀。完整路径为rpc_server_profile_path_listener_id,其中listener_id为随机数。
+设置分析器输出日志文件路径前缀。完整路径为FLAGS_rpc_server_profile_path_listener_id,其中listener_id为随机数。
 取值范围
 ---------------
diff --git a/doc/fluid/flags/distributed_en.rst b/doc/fluid/flags/distributed_en.rst
index adc7fa181..d71803cc6 100755
--- a/doc/fluid/flags/distributed_en.rst
+++ b/doc/fluid/flags/distributed_en.rst
@@ -2,7 +2,7 @@
 distributed
 ==================
-communicator_fake_rpc
+FLAGS_communicator_fake_rpc
 **************************************
 (since 1.5.0)
@@ -21,7 +21,7 @@
 Note
 This flag is only for developers of PaddlePaddle; users should not set it.
-communicator_independent_recv_thread
+FLAGS_communicator_independent_recv_thread
 **************************************
 (since 1.5.0)
@@ -40,7 +40,7 @@
 Note
 This flag is for developers to debug and optimize the framework; users should not set it.
-communicator_max_merge_var_num
+FLAGS_communicator_max_merge_var_num
 **************************************
 (since 1.5.0)
@@ -59,7 +59,7 @@
 Note
 This flag has a strong relationship with the trainer thread num. The default value should be the same as the thread num.
-communicator_merge_sparse_grad
+FLAGS_communicator_merge_sparse_grad
 *******************************
 (since 1.5.0)
@@ -78,11 +78,11 @@
 Note
 Merging sparse gradients would be time-consuming. If the sparse gradient has many duplicated ids, it will save memory and communication could be much faster; otherwise it will not save memory.
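
Several of the communicator flags above share one rule of thumb: keep them in step with the trainer thread number. A sketch that derives them from a single constant (the thread count is hypothetical; the flag names are the ones documented above, set before the first import)::

    import os

    trainer_threads = 8  # illustrative trainer thread count

    os.environ.update({
        "FLAGS_communicator_max_merge_var_num": str(trainer_threads),
        "FLAGS_communicator_min_send_grad_num_before_recv": str(trainer_threads),
        "FLAGS_communicator_send_queue_size": str(trainer_threads),
    })

    import paddle.fluid as fluid
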
-communicator_min_send_grad_num_before_recv
+FLAGS_communicator_min_send_grad_num_before_recv
 *******************************************
 (since 1.5.0)
-In communicator, there is one send thread that send gradient to parameter server and one receive thread that receive parameter from parameter server. They work independently. This flag is used to control the frequency of receive thread. Only when the send thread send at least communicator_min_send_grad_num_before_recv gradients will the receive thread receive parameter from parameter server.
+In the communicator there is one send thread that sends gradients to the parameter server and one receive thread that receives parameters from the parameter server; they work independently. This flag is used to control the frequency of the receive thread. Only when the send thread sends at least FLAGS_communicator_min_send_grad_num_before_recv gradients will the receive thread receive parameters from the parameter server.
 Values accepted
 ---------------
@@ -97,7 +97,7 @@
 Note
 This flag has a strong relation with the number of training threads of the trainer, because each training thread will send its gradients. So the default value should be the training thread num.
-communicator_send_queue_size
+FLAGS_communicator_send_queue_size
 *******************************************
 (since 1.5.0)
@@ -116,7 +116,7 @@
 Note
 This flag will affect the training speed: if the queue size is larger, the speed may be faster, but the result may be worse.
-communicator_send_wait_times
+FLAGS_communicator_send_wait_times
 *******************************************
 (since 1.5.0)
@@ -131,7 +131,7 @@
 Example
 FLAGS_communicator_send_wait_times=5 sets to 5 the number of times the send thread will wait if the merge number does not reach max_merge_var_num.
-communicator_thread_pool_size
+FLAGS_communicator_thread_pool_size
 *******************************************
 (since 1.5.0)
@@ -150,7 +150,7 @@
 Note
 Most of the time users do not need to set this flag.
-dist_threadpool_size
+FLAGS_dist_threadpool_size
 *******************************************
 (Since 1.0.0)
@@ -165,7 +165,7 @@
 Example
 FLAGS_dist_threadpool_size=10 will use 10 threads as the max number of threads for the distributed module.
-rpc_deadline
+FLAGS_rpc_deadline
 *******************************************
 (Since 1.0.0)
@@ -180,11 +180,11 @@
 Example
 FLAGS_rpc_deadline=180000 will set the deadline timeout to 3 minutes.
-rpc_disable_reuse_port
+FLAGS_rpc_disable_reuse_port
 *******************************************
 (since 1.2.0)
-When rpc_disable_reuse_port is true, the flag of grpc GRPC_ARG_ALLOW_REUSEPORT will be set to false to
+When FLAGS_rpc_disable_reuse_port is true, the flag of grpc GRPC_ARG_ALLOW_REUSEPORT will be set to false to
 disable the use of SO_REUSEPORT if it's available.
 Values accepted
 ---------------
@@ -196,7 +196,7 @@
 Example
 FLAGS_rpc_disable_reuse_port=True will disable the use of SO_REUSEPORT.
-rpc_get_thread_num
+FLAGS_rpc_get_thread_num
 *******************************************
 (Since 1.0.0)
@@ -211,7 +211,7 @@
 Example
 FLAGS_rpc_get_thread_num=6 will use 6 threads to get parameters from the parameter server.
-rpc_send_thread_num
+FLAGS_rpc_send_thread_num
 *******************************************
 (Since 1.0.0)
@@ -226,11 +226,11 @@
 Example
 FLAGS_rpc_send_thread_num=6 will set the number of threads used for sending to 6.
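
The RPC flags combine naturally on the parameter-server side. A sketch setting the send/get thread pools and the deadline before the first import (values illustrative; same environment-flag assumption as above)::

    import os

    os.environ["FLAGS_rpc_send_thread_num"] = "6"
    os.environ["FLAGS_rpc_get_thread_num"] = "6"
    os.environ["FLAGS_rpc_deadline"] = str(3 * 60 * 1000)  # 3 minutes, in ms

    import paddle.fluid as fluid
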
-rpc_server_profile_path
+FLAGS_rpc_server_profile_path
 *******************************************
 (since 0.15.0)
-Set the profiler output log file path prefix. The complete path will be rpc_server_profile_path_listener_id, listener_id is a random number.
+Set the profiler output log file path prefix. The complete path will be FLAGS_rpc_server_profile_path_listener_id, where listener_id is a random number.
 Values accepted
 ---------------
diff --git a/doc/fluid/flags/executor_cn.rst b/doc/fluid/flags/executor_cn.rst
index e94b4671c..56c3c7b04 100755
--- a/doc/fluid/flags/executor_cn.rst
+++ b/doc/fluid/flags/executor_cn.rst
@@ -3,7 +3,7 @@
 ==================
-enable_parallel_graph
+FLAGS_enable_parallel_graph
 *******************************************
 (始于1.2.0)
@@ -18,7 +18,7 @@
 Bool型,缺省值为False。
 示例
 -------
 FLAGS_enable_parallel_graph=False - 通过ParallelExecutor强制禁用并行图执行模式。
-pe_profile_fname
+FLAGS_pe_profile_fname
 *******************************************
 (始于1.3.0)
@@ -33,7 +33,7 @@
 String型,缺省值为empty ("")。
 示例
 -------
 FLAGS_pe_profile_fname="./parallel_executor.perf" - 将性能分析结果存储在parallel_executor.perf中。
-print_sub_graph_dir
+FLAGS_print_sub_graph_dir
 *******************************************
 (始于1.2.0)
@@ -48,7 +48,7 @@
 String型,缺省值为empty ("")。
 示例
 -------
 FLAGS_print_sub_graph_dir="./sub_graphs.txt" - 将断开连接的子图打印到"./sub_graphs.txt"。
-use_ngraph
+FLAGS_use_ngraph
 *******************************************
 (始于1.4.0)
diff --git a/doc/fluid/flags/executor_en.rst b/doc/fluid/flags/executor_en.rst
index ccc5c5f92..7a262c001 100755
--- a/doc/fluid/flags/executor_en.rst
+++ b/doc/fluid/flags/executor_en.rst
@@ -3,7 +3,7 @@
 executor
 ==================
-enable_parallel_graph
+FLAGS_enable_parallel_graph
 *******************************************
 (since 1.2.0)
@@ -18,7 +18,7 @@
 Example
 FLAGS_enable_parallel_graph=False will force-disable the parallel graph execution mode of ParallelExecutor.
-pe_profile_fname
+FLAGS_pe_profile_fname
 *******************************************
 (since 1.3.0)
@@ -33,7 +33,7 @@
 Example
 FLAGS_pe_profile_fname="./parallel_executor.perf" will store the profile result to parallel_executor.perf.
-print_sub_graph_dir
+FLAGS_print_sub_graph_dir
 *******************************************
 (since 1.2.0)
@@ -48,7 +48,7 @@
 Example
 FLAGS_print_sub_graph_dir="./sub_graphs.txt" will print the disconnected subgraphs to "./sub_graphs.txt".
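
For the executor flags, a minimal sketch that enables ParallelExecutor profiling and subgraph dumping; both file names are arbitrary, and the environment-flag assumption is the same as above::

    import os

    os.environ["FLAGS_pe_profile_fname"] = "./parallel_executor.perf"
    os.environ["FLAGS_print_sub_graph_dir"] = "./sub_graphs.txt"

    import paddle.fluid as fluid
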
-use_ngraph
+FLAGS_use_ngraph
 *******************************************
 (since 1.4.0)
diff --git a/doc/fluid/flags/memory_cn.rst b/doc/fluid/flags/memory_cn.rst
index 6c09e750a..8198b57b4 100755
--- a/doc/fluid/flags/memory_cn.rst
+++ b/doc/fluid/flags/memory_cn.rst
@@ -3,7 +3,7 @@
 ==================
-allocator_strategy
+FLAGS_allocator_strategy
 ********************
 (始于1.2)
@@ -20,7 +20,7 @@
 FLAGS_allocator_strategy=legacy - 使用legacy分配器。
 FLAGS_allocator_strategy=naive_best_fit - 使用新设计的分配器。
-eager_delete_scope
+FLAGS_eager_delete_scope
 *******************************************
 (始于0.12.0)
@@ -35,7 +35,7 @@
 Bool型,缺省值为True。
 示例
 -------
 FLAGS_eager_delete_scope=True - 同步局域删除。
-eager_delete_tensor_gb
+FLAGS_eager_delete_tensor_gb
 *******************************************
 (始于1.0.0)
@@ -58,7 +58,7 @@
 FLAGS_eager_delete_tensor_gb=-1.0 - 禁用垃圾回收策略。
 建议用户在训练大型网络时设置FLAGS_eager_delete_tensor_gb=0.0以启用垃圾回收策略。
-enable_inplace_whitelist
+FLAGS_enable_inplace_whitelist
 *******************************************
 (始于1.4)
@@ -73,7 +73,7 @@
 Bool型,缺省值为False。
 示例
 -------
 FLAGS_enable_inplace_whitelist=True - 在特定op上禁止内存原位复用优化。
-fast_eager_deletion_mode
+FLAGS_fast_eager_deletion_mode
 *******************************************
 (始于1.3)
@@ -90,7 +90,7 @@
 FLAGS_fast_eager_deletion_mode=True - 启用快速垃圾回收策略。
 FLAGS_fast_eager_deletion_mode=False - 禁用快速垃圾回收策略。
-fraction_of_gpu_memory_to_use
+FLAGS_fraction_of_gpu_memory_to_use
 *******************************************
 (始于1.2.0)
@@ -109,7 +109,7 @@
 FLAGS_fraction_of_gpu_memory_to_use=0.1 - 分配总GPU内存大小的10%作为初始GPU内存块。
 Windows系列平台会将FLAGS_fraction_of_gpu_memory_to_use默认设为0.5,Linux则会默认设为0.92。
-free_idle_memory
+FLAGS_free_idle_memory
 *******************************************
 (始于0.15.0)
@@ -126,7 +126,7 @@
 FLAGS_free_idle_memory=True - 空闲内存太多时释放。
 FLAGS_free_idle_memory=False - 不释放空闲内存。
-fuse_parameter_groups_size
+FLAGS_fuse_parameter_groups_size
 *******************************************
 (始于1.4.0)
@@ -141,7 +141,7 @@
 Int32型,缺省值为3。
 示例
 -------
 FLAGS_fuse_parameter_groups_size=3 - 将单组参数的梯度大小设为3。
-fuse_parameter_memory_size
+FLAGS_fuse_parameter_memory_size
 *******************************************
 (始于1.5.0)
@@ -156,7 +156,7 @@
 Double型,缺省值为-1.0。
 示例
 -------
 FLAGS_fuse_parameter_memory_size=16 - 将单组参数梯度的上限大小设为16MB。
-init_allocated_mem
+FLAGS_init_allocated_mem
 *******************************************
 (始于0.15.0)
@@ -173,7 +173,7 @@
 FLAGS_init_allocated_mem=True - 对分配的内存进行非零初始化。
 FLAGS_init_allocated_mem=False - 不会对分配的内存进行非零初始化。
-initial_cpu_memory_in_mb
+FLAGS_initial_cpu_memory_in_mb
 *******************************************
 (始于0.14.0)
@@ -188,7 +188,7 @@
 Uint64型,缺省值为500,单位为MB。
 示例
 -------
 FLAGS_initial_cpu_memory_in_mb=100 - 在FLAGS_fraction_of_cpu_memory_to_use*(总物理内存)大于100MB的情况下,首次提出分配请求时,分配器预先分配100MB内存,并在预分配的内存耗尽时再次分配100MB。
-initial_gpu_memory_in_mb
+FLAGS_initial_gpu_memory_in_mb
 *******************************************
 (始于1.4.0)
@@ -207,7 +207,7 @@
 FLAGS_initial_gpu_memory_in_mb=4096 - 分配4GB作为初始GPU内存块大小。
 如果设置该flag,则FLAGS_fraction_of_gpu_memory_to_use设置的内存大小将被该flag覆盖。如果未设置该flag,PaddlePaddle将使用FLAGS_fraction_of_gpu_memory_to_use分配GPU内存。
-limit_of_tmp_allocation
+FLAGS_limit_of_tmp_allocation
 *******************************************
 (始于1.3)
@@ -222,7 +222,7 @@
 Int64型,缺省值为-1。
 示例
 -------
 FLAGS_limit_of_tmp_allocation=1024 - 将temporary_allocation大小的上限设为1024字节。
-memory_fraction_of_eager_deletion
+FLAGS_memory_fraction_of_eager_deletion
 *******************************************
 (始于1.4)
@@ -242,7 +242,7 @@
 FLAGS_memory_fraction_of_eager_deletion=1 - 释放所有临时变量。
 FLAGS_memory_fraction_of_eager_deletion=0.5 - 仅释放50%比例的占用内存最多的变量。
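
The garbage-collection flags above are the ones these docs recommend for large networks. A sketch enabling eager deletion before the first import (values follow the recommendations above; same environment-flag assumption)::

    import os

    os.environ["FLAGS_eager_delete_tensor_gb"] = "0.0"             # enable GC eagerly
    os.environ["FLAGS_fast_eager_deletion_mode"] = "true"          # fast GC strategy
    os.environ["FLAGS_memory_fraction_of_eager_deletion"] = "1.0"  # release all temporaries

    import paddle.fluid as fluid
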
-reallocate_gpu_memory_in_mb
+FLAGS_reallocate_gpu_memory_in_mb
 *******************************************
 (始于1.4.0)
@@ -261,7 +261,7 @@
 FLAGS_reallocate_gpu_memory_in_mb=1024 - 如果耗尽了分配的GPU内存块,则重新分配1024MB。
 如果设置了该flag,PaddlePaddle将重新分配该flag指定大小的gpu内存。否则分配FLAGS_fraction_of_gpu_memory_to_use指定比例的gpu内存。
-times_excess_than_required_tmp_allocation
+FLAGS_times_excess_than_required_tmp_allocation
 *******************************************
 (始于1.3)
@@ -276,7 +276,7 @@
 Int64型,缺省值为2。
 示例
 -------
 FLAGS_times_excess_than_required_tmp_allocation=1024 - 设置TemporaryAllocator可以返回的最大大小为1024*N。
-use_pinned_memory
+FLAGS_use_pinned_memory
 *******************************************
 (始于0.12.0)
diff --git a/doc/fluid/flags/memory_en.rst b/doc/fluid/flags/memory_en.rst
index 1e407ac4a..0411801e6 100755
--- a/doc/fluid/flags/memory_en.rst
+++ b/doc/fluid/flags/memory_en.rst
@@ -3,7 +3,7 @@
 memory management
 ==================
-allocator_strategy
+FLAGS_allocator_strategy
 **************************************
 (since 1.2)
@@ -21,7 +21,7 @@
 FLAGS_allocator_strategy=naive_best_fit would use the newly designed allocator.
-eager_delete_scope
+FLAGS_eager_delete_scope
 *******************************************
 (since 0.12.0)
@@ -36,7 +36,7 @@
 Example
 FLAGS_eager_delete_scope=True will make scope deletion synchronous.
-eager_delete_tensor_gb
+FLAGS_eager_delete_tensor_gb
 *******************************************
 (since 1.0.0)
@@ -60,7 +60,7 @@
 It is recommended that users enable the garbage collection strategy by setting FLAGS_eager_delete_tensor_gb=0.0 when training large networks.
-enable_inplace_whitelist
+FLAGS_enable_inplace_whitelist
 *******************************************
 (since 1.4)
@@ -76,7 +76,7 @@
 FLAGS_enable_inplace_whitelist=True would disable memory in-place optimization on certain ops.
-fast_eager_deletion_mode
+FLAGS_fast_eager_deletion_mode
 *******************************************
 (since 1.3)
@@ -93,7 +93,7 @@
 FLAGS_fast_eager_deletion_mode=True would turn on the fast garbage collection strategy.
 FLAGS_fast_eager_deletion_mode=False would turn off the fast garbage collection strategy.
-fraction_of_gpu_memory_to_use
+FLAGS_fraction_of_gpu_memory_to_use
 *******************************************
 (since 1.2.0)
@@ -113,7 +113,7 @@
 Windows series platforms will set FLAGS_fraction_of_gpu_memory_to_use to 0.5 by default.
 Linux will set FLAGS_fraction_of_gpu_memory_to_use to 0.92 by default.
-free_idle_memory
+FLAGS_free_idle_memory
 *******************************************
 (since 0.15.0)
@@ -130,7 +130,7 @@
 FLAGS_free_idle_memory=True will free idle memory when there is too much of it.
 FLAGS_free_idle_memory=False will not free idle memory.
-fuse_parameter_groups_size
+FLAGS_fuse_parameter_groups_size
 *******************************************
 (since 1.4.0)
@@ -146,7 +146,7 @@
 FLAGS_fuse_parameter_groups_size=3 will set the size of one group of parameters' gradients to 3.
-fuse_parameter_memory_size
+FLAGS_fuse_parameter_memory_size
 *******************************************
 (since 1.5.0)
@@ -161,7 +161,7 @@
 Example
 FLAGS_fuse_parameter_memory_size=16 sets the upper memory limit of one group of parameters' gradients to 16 megabytes.
-init_allocated_mem
+FLAGS_init_allocated_mem
 *******************************************
 (since 0.15.0)
@@ -178,7 +178,7 @@
 FLAGS_init_allocated_mem=True will make the allocated memory initialize as a non-zero value.
 FLAGS_init_allocated_mem=False will not initialize the allocated memory.
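
The precedence between FLAGS_fraction_of_gpu_memory_to_use and FLAGS_initial_gpu_memory_in_mb reduces to a simple rule: the explicit MB flag, when set, wins. A pure-Python illustration of the chunk-size computation described above (a sketch, not Paddle's actual allocator code)::

    def initial_chunk_mb(total_gpu_mb, fraction, initial_mb=None):
        # FLAGS_initial_gpu_memory_in_mb overrides the fraction when set.
        if initial_mb is not None:
            return initial_mb
        return total_gpu_mb * fraction

    print(initial_chunk_mb(16000, 0.92))        # Linux default: 14720.0 MB
    print(initial_chunk_mb(16000, 0.92, 4096))  # flag set: 4096 MB wins
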
-initial_cpu_memory_in_mb
+FLAGS_initial_cpu_memory_in_mb
 *******************************************
 (since 0.14.0)
@@ -193,7 +193,7 @@
 Example
 FLAGS_initial_cpu_memory_in_mb=100, if FLAGS_fraction_of_cpu_memory_to_use*(total physical memory) > 100MB, then the allocator will pre-allocate 100MB when the first allocation request arises, and re-allocate 100MB again when the pre-allocated memory is exhausted.
-initial_gpu_memory_in_mb
+FLAGS_initial_gpu_memory_in_mb
 *******************************************
 (since 1.4.0)
@@ -213,7 +213,7 @@
 If you set this flag, the memory size set by FLAGS_fraction_of_gpu_memory_to_use will be overridden by this flag.
 If you don't set this flag, PaddlePaddle will use FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory.
-limit_of_tmp_allocation
+FLAGS_limit_of_tmp_allocation
 *******************************************
 (since 1.3)
@@ -228,7 +228,7 @@
 Example
 FLAGS_limit_of_tmp_allocation=1024 will set the upper limit of the temporary_allocation size to 1024 bytes.
-memory_fraction_of_eager_deletion
+FLAGS_memory_fraction_of_eager_deletion
 *******************************************
 (since 1.4)
@@ -248,7 +248,7 @@
 FLAGS_memory_fraction_of_eager_deletion=1 would release all temporary variables.
 FLAGS_memory_fraction_of_eager_deletion=0.5 would only release 50% of the variables with the largest memory size.
-reallocate_gpu_memory_in_mb
+FLAGS_reallocate_gpu_memory_in_mb
 *******************************************
 (since 1.4.0)
@@ -268,12 +268,12 @@
 If this flag is set, PaddlePaddle will reallocate the gpu memory with the size specified by this flag.
 Otherwise PaddlePaddle will reallocate with the size set by FLAGS_fraction_of_gpu_memory_to_use.
-times_excess_than_required_tmp_allocation
+FLAGS_times_excess_than_required_tmp_allocation
 *******************************************
 (since 1.3)
 The FLAGS_times_excess_than_required_tmp_allocation indicates the max size the TemporaryAllocator can return. For example
-, if the required memory size is N, and times_excess_than_required_tmp_allocation is 2.0, the TemporaryAllocator will return the available allocation that the range of size is N ~ 2*N.
+, if the required memory size is N, and FLAGS_times_excess_than_required_tmp_allocation is 2.0, the TemporaryAllocator will return an available allocation whose size is in the range N ~ 2*N.
 Values accepted
 ---------------
 Int64. The default value is 2.
 Example
 -------
 FLAGS_times_excess_than_required_tmp_allocation=1024 will set the max size the TemporaryAllocator can return to 1024*N.
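
The N ~ times_excess*N reuse rule for the TemporaryAllocator can be stated as a one-line predicate. An illustrative sketch (a hypothetical helper, not Paddle code)::

    def can_reuse(required_n, candidate_size, times_excess=2.0):
        # A cached allocation is returnable when its size lies in [N, times_excess*N].
        return required_n <= candidate_size <= times_excess * required_n

    assert can_reuse(1024, 1500)      # within N ~ 2*N
    assert not can_reuse(1024, 4096)  # above 2*N, so a fresh allocation is made
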
-use_pinned_memory
+FLAGS_use_pinned_memory
 *******************************************
 (since 0.12.0)
diff --git a/doc/fluid/flags/others_cn.rst b/doc/fluid/flags/others_cn.rst
index 3e8f2ed79..3cd6f6a6a 100755
--- a/doc/fluid/flags/others_cn.rst
+++ b/doc/fluid/flags/others_cn.rst
@@ -4,7 +4,7 @@
-benchmark
+FLAGS_benchmark
 ********************
 (始于0.12.0)
@@ -19,7 +19,7 @@
 Bool型,缺省值为False。
 示例
 -------
 FLAGS_benchmark=True - 同步以测试基准。
-inner_op_parallelism
+FLAGS_inner_op_parallelism
 *******************************************
 (始于1.3.0)
@@ -38,7 +38,7 @@
 FLAGS_inner_op_parallelism=5 - 将operator内的线程数设为5。
 目前只有稀疏的adam op支持inner_op_parallelism。
-max_body_size
+FLAGS_max_body_size
 *******************************************
 (始于1.0.0)
@@ -53,7 +53,7 @@
 Int32型,缺省值为2147483647。
 示例
 -------
 FLAGS_max_body_size=2147483647 - 将BRPC消息大小设为2147483647。
-sync_nccl_allreduce
+FLAGS_sync_nccl_allreduce
 *******************************************
 (始于1.3)
@@ -68,7 +68,7 @@
 Bool型,缺省值为True。
 示例
 -------
 FLAGS_sync_nccl_allreduce=True - 在allreduce_op_handle中调用 `cudaStreamSynchronize(nccl_stream)` 。
-tracer_profile_fname
+FLAGS_tracer_profile_fname
 *******************************************
 (始于1.4.0)
diff --git a/doc/fluid/flags/others_en.rst b/doc/fluid/flags/others_en.rst
index 9b03141e1..91fdfdf02 100755
--- a/doc/fluid/flags/others_en.rst
+++ b/doc/fluid/flags/others_en.rst
@@ -4,7 +4,7 @@
 others
-benchmark
+FLAGS_benchmark
 **************************************
 (since 0.12.0)
@@ -19,7 +19,7 @@
 Example
 FLAGS_benchmark=True will do some synchronizations to test benchmark.
-inner_op_parallelism
+FLAGS_inner_op_parallelism
 *******************************************
 (since 1.3.0)
@@ -38,7 +38,7 @@
 Note
 Currently only the sparse adam op supports inner_op_parallelism.
-max_body_size
+FLAGS_max_body_size
 *******************************************
 (Since 1.0.0)
@@ -53,7 +53,7 @@
 Example
 FLAGS_max_body_size=2147483647 will set the BRPC message size to 2147483647.
-sync_nccl_allreduce
+FLAGS_sync_nccl_allreduce
 *******************************************
 (since 1.3)
@@ -68,7 +68,7 @@
 Example
 FLAGS_sync_nccl_allreduce=True will call `cudaStreamSynchronize(nccl_stream)` in allreduce_op_handle.
-tracer_profile_fname
+FLAGS_tracer_profile_fname
 *******************************************
 (since 1.4.0)
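
The "others" flags follow the same pattern as everything above. A closing sketch that sets a few of them together before the first import (values illustrative; assumes, as throughout, that FLAGS_* environment variables are read when paddle.fluid is imported)::

    import os

    os.environ["FLAGS_sync_nccl_allreduce"] = "true"  # sync after nccl allreduce
    os.environ["FLAGS_inner_op_parallelism"] = "5"    # threads inside an operator
    os.environ["FLAGS_max_body_size"] = "2147483647"  # BRPC message size limit

    import paddle.fluid as fluid
    print(fluid.default_main_program())  # flags are already in effect here
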