未验证 提交 27235cf2 编写于 作者: X XiaoguangHu 提交者: GitHub

Add document annotations for FLAGS that need to be open to external developers...

Add document annotations for FLAGS that need to be open to external developers test=develop (#19692)

Add document annotations for FLAGS that need to be open to external developers
上级 1c25c88a
...@@ -21,17 +21,40 @@ ...@@ -21,17 +21,40 @@
* NOTE(paddle-dev): This file is designed to define all public FLAGS. * NOTE(paddle-dev): This file is designed to define all public FLAGS.
*/ */
/* Paddle initialization related */ /**
* Paddle initialization related FLAG
* Name: FLAGS_paddle_num_threads
* Since Version: 0.15.0
* Value Range: int32, default=1
* Example: FLAGS_paddle_num_threads=2, set the maximum thread number per
* instance to 2
* Note:
*/
DEFINE_int32(paddle_num_threads, 1, DEFINE_int32(paddle_num_threads, 1,
"Number of threads for each paddle instance."); "Number of threads for each paddle instance.");
/* Operator related */ /**
* Operator related FLAG
* Name: FLAGS_check_nan_inf
* Since Version: 0.13.0
* Value Range: bool, default=false
* Example:
 * Note: Used for debugging. Checks whether operators produce NaN/Inf values.
*/
DEFINE_bool(check_nan_inf, false, DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be " "Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely."); "extremely slow so please use this flag wisely.");
/* CUDA related */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
/**
 * CUDA related FLAG
* Name: FLAGS_enable_cublas_tensor_op_math
* Since Version: 1.2.0
* Value Range: bool, default=false
* Example:
 * Note: Whether to use Tensor Core; faster, but it may lose precision.
*/
DEFINE_bool( DEFINE_bool(
enable_cublas_tensor_op_math, false, enable_cublas_tensor_op_math, false,
"The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
...@@ -42,6 +65,15 @@ DEFINE_bool( ...@@ -42,6 +65,15 @@ DEFINE_bool(
"input and output must be half precision) and recurrent neural networks " "input and output must be half precision) and recurrent neural networks "
"(RNNs)."); "(RNNs).");
/**
* CUDA related FLAG
* Name: FLAGS_selected_gpus
* Since Version: 1.3.0
* Value Range: integer list separated by comma, default empty list
* Example: FLAGS_selected_gpus=0,1,2,3,4,5,6,7 to train or predict with 0~7 gpu
* cards
* Note: A list of device ids separated by comma, like: 0,1,2,3
*/
DEFINE_string(selected_gpus, "", DEFINE_string(selected_gpus, "",
"A list of device ids separated by comma, like: 0,1,2,3. " "A list of device ids separated by comma, like: 0,1,2,3. "
"This option is useful when doing multi process training and " "This option is useful when doing multi process training and "
...@@ -52,59 +84,167 @@ DEFINE_string(selected_gpus, "", ...@@ -52,59 +84,167 @@ DEFINE_string(selected_gpus, "",
"share-memory only."); "share-memory only.");
#endif #endif
/* CUDNN related */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
/**
* CUDNN related FLAG
* Name: FLAGS_cudnn_deterministic
* Since Version: 0.13.0
* Value Range: bool, default=false
* Example:
* Note: whether to use deterministic algorithm in cudnn.
* If true, it will slow down some operators such as conv and pooling.
*/
DEFINE_bool(cudnn_deterministic, false, DEFINE_bool(cudnn_deterministic, false,
"Whether allow using an autotuning algorithm for convolution " "Whether allow using an autotuning algorithm for convolution "
"operator. The autotuning algorithm may be non-deterministic. If " "operator. The autotuning algorithm may be non-deterministic. If "
"true, the algorithm is deterministic."); "true, the algorithm is deterministic.");
/**
* CUDNN related FLAG
* Name: FLAGS_conv_workspace_size_limit
* Since Version: 0.13.0
* Value Range: uint64, default=4096 (MB)
* Example:
* Note: The internal function of cuDNN obtains the fastest matching algorithm
* within this memory limit. Usually, faster algorithms can be chosen in
* larger workspaces, but memory space can also be significantly
* increased.
* Users need to balance memory and speed.
*/
DEFINE_uint64(conv_workspace_size_limit, DEFINE_uint64(conv_workspace_size_limit,
paddle::platform::kDefaultConvWorkspaceSizeLimitMB, paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
"cuDNN convolution workspace limit in MB unit."); "cuDNN convolution workspace limit in MB unit.");
/**
* CUDNN related FLAG
* Name: FLAGS_cudnn_exhaustive_search
* Since Version: 1.2.0
* Value Range: bool, default=false
* Example:
* Note: Represents whether an exhaustive search method is used to
* select a convolution algorithm. There are two search methods in cuDNN,
* heuristic search and exhaustive search. Exhaustive search attempts
* all cuDNN algorithms to select the fastest. This method is very
* time-consuming, and the selected algorithm will be cached for a given
* layer specification. Once you change the layer specifications
* (such as batch size, feature map size), it will search again.
*/
DEFINE_bool(cudnn_exhaustive_search, false, DEFINE_bool(cudnn_exhaustive_search, false,
"Whether enable exhaustive search for cuDNN convolution or " "Whether enable exhaustive search for cuDNN convolution or "
"not, default is False."); "not, default is False.");
/**
* CUDNN related FLAG
* Name: FLAGS_cudnn_exhaustive_search_times
* Since Version:
* Value Range:
* Example:
 * Note: Only used in prediction, and intended for advanced developers.
*/
DEFINE_int64(cudnn_exhaustive_search_times, -1, DEFINE_int64(cudnn_exhaustive_search_times, -1,
"Exhaustive search times for cuDNN convolution, " "Exhaustive search times for cuDNN convolution, "
"default is -1, not exhaustive search"); "default is -1, not exhaustive search");
// CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in /**
// some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT * CUDNN related FLAG
// and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The * Name: FLAGS_cudnn_batchnorm_spatial_persistent
// reason we set it to false by default is that this mode may use scaled * Since Version: 1.4.0
// atomic integer reduction that may cause a numerical overflow for certain * Value Range: bool, default=false
// input data range. * Example:
 * Note: Enables CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode in batchnorm. This
 *       mode can be faster in some tasks because an optimized path may be
 *       selected for the CUDNN_DATA_FLOAT and CUDNN_DATA_HALF data types on
 *       compute capability 6.0 or higher. It is set to false by default
 *       because this mode may use scaled atomic integer reduction, which
 *       may cause a numerical overflow for certain input data ranges.
*/
DEFINE_bool(cudnn_batchnorm_spatial_persistent, false, DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
"Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn " "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
"batch_norm, default is False."); "batch_norm, default is False.");
#endif #endif
/* NCCL related */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
// asynchronous nccl allreduce or synchronous issue:
// https://github.com/PaddlePaddle/Paddle/issues/15049 /**
// If you want to change this default value, why?(gongwb) * NCCL related FLAG
 * Name: FLAGS_sync_nccl_allreduce
* Since Version:
* Value Range:
* Example:
* Note: asynchronous nccl allreduce or synchronous issue:
* https://github.com/PaddlePaddle/Paddle/issues/15049
* If you want to change this default value, why?(gongwb)
*/
DEFINE_bool( DEFINE_bool(
sync_nccl_allreduce, true, sync_nccl_allreduce, true,
"If set true, will call `cudaStreamSynchronize(nccl_stream)`" "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
"after allreduce, this mode can get better performance in some scenarios."); "after allreduce, this mode can get better performance in some scenarios.");
#endif #endif
/* Distributed related */
#ifdef PADDLE_WITH_DISTRIBUTE #ifdef PADDLE_WITH_DISTRIBUTE
/**
* Distributed related FLAG
* Name: FLAGS_communicator_max_merge_var_num
* Since Version: 1.5.0
* Value Range: int32, default=20
* Example:
* Note: The maximum number of gradients to be merged into a gradient and
* sent through the communicator. The trainer puts all the gradients
* into the queue, and then the communicator takes the gradients out
* of the queue and sends them after merging.
*/
DEFINE_int32(communicator_max_merge_var_num, 20, DEFINE_int32(communicator_max_merge_var_num, 20,
"max var num to merge and send"); "max var num to merge and send");
/**
* Distributed related FLAG
* Name: FLAGS_communicator_send_queue_size
* Since Version: 1.5.0
* Value Range: int32, default=20
* Example:
* Note: Size for each gradient queue. The trainer puts the gradient into
* the queue, and then the communicator takes it out of the queue and
* sends it out. When the communicator is slow, the queue may be full,
* and the trainer will be continuously blocked before the queue has
* space. It is used to avoid training much faster than communication,
* so that too many gradients are not sent out in time.
*/
DEFINE_int32(communicator_send_queue_size, 20, DEFINE_int32(communicator_send_queue_size, 20,
"queue size to recv gradient before send"); "queue size to recv gradient before send");
#endif #endif
/**
* Distributed related FLAG
* Name: FLAGS_dist_threadpool_size
* Since Version: 1.0.0
* Value Range: int32, default=0
* Example:
 * Note: Controls the number of threads used for distributed modules.
 *       If it is not set, it defaults to the number of hardware threads.
*/
DEFINE_int32(dist_threadpool_size, 0, DEFINE_int32(dist_threadpool_size, 0,
"number of threads used for distributed executed."); "number of threads used for distributed executed.");
/* Garbage collector related */ /**
* Garbage collector related FLAG
* Name: FLAGS_eager_delete_tensor_gb
* Since Version: 1.0.0
* Value Range: double, default=kDefaultEagerDeleteTensorGB
* Example: FLAGS_eager_delete_tensor_gb=0.0, Release memory garbage once it is
* no longer used.
* FLAGS_eager_delete_tensor_gb=1.0, Release memory garbage when
* garbage occupies 1.0GB of memory.
* FLAGS_eager_delete_tensor_gb=-1.0, Disable garbage collection
* policy.
* Note: Represents whether a garbage collection strategy is used to optimize
* network memory usage.
* It is recommended that users set FLAGS_eager_delete_tensor_gb=0.0 to
* enable garbage collection strategy when training large networks.
*/
// Disable gc by default when inference library is built // Disable gc by default when inference library is built
#ifdef PADDLE_ON_INFERENCE #ifdef PADDLE_ON_INFERENCE
static const double kDefaultEagerDeleteTensorGB = -1; static const double kDefaultEagerDeleteTensorGB = -1;
...@@ -117,35 +257,121 @@ DEFINE_double( ...@@ -117,35 +257,121 @@ DEFINE_double(
"Memory size threshold (GB) when the garbage collector clear tensors." "Memory size threshold (GB) when the garbage collector clear tensors."
"Disabled when this value is less than 0"); "Disabled when this value is less than 0");
/**
* Memory related FLAG
* Name: FLAGS_fast_eager_deletion_mode
* Since Version: 1.3.0
* Value Range: bool, default=true
* Example:
* Note: Whether to use fast garbage collection strategy.
* If not set, the GPU memory is released at the end of the CUDA kernel.
* Otherwise, the GPU memory will be released before the CUDA kernel
* has finished, which will make the garbage collection strategy faster.
* Only works when garbage collection strategy is enabled.
*/
DEFINE_bool(fast_eager_deletion_mode, true, DEFINE_bool(fast_eager_deletion_mode, true,
"Fast eager deletion mode. If enabled, memory would release " "Fast eager deletion mode. If enabled, memory would release "
"immediately without waiting GPU kernel ends."); "immediately without waiting GPU kernel ends.");
/**
* Memory related FLAG
* Name: FLAGS_memory_fraction_of_eager_deletion
* Since Version: 1.4
* Value Range: double [0.0, 1.0], default=1.0
* Example:
 * Note: The fraction of temporary variables, measured by memory size, that
 *       the garbage collection policy releases.
* If FLAGS_memory_fraction_of_eager_deletion = 1.0,
* all temporary variables in the network will be released.
* If FLAGS_memory_fraction_of_eager_deletion = 0.0,
* no temporary variables in the network are released.
* If 0.0 < FLAGS_memory_fraction_of_eager_deletion < 1.0,
* all temporary variables will be sorted in descending order
* according to their memory size, and only variables with the
* largest FLAGS_memory_fraction_of_eager_deletion ratio will be released.
* The flag is only valid when running parallel data compilers.
*/
DEFINE_double(memory_fraction_of_eager_deletion, 1.0, DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
"Fraction of eager deletion. If less than 1.0, all variables in " "Fraction of eager deletion. If less than 1.0, all variables in "
"the program would be sorted according to its memory size, and " "the program would be sorted according to its memory size, and "
"only the FLAGS_memory_fraction_of_eager_deletion of the largest " "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
"variables would be deleted."); "variables would be deleted.");
/* Allocator related */ /**
* Allocator related FLAG
* Name: FLAGS_allocator_strategy
* Since Version: 1.2
 * Value Range: string, {naive_best_fit, auto_growth}, default=naive_best_fit
* Example:
 * Note: Selects the allocator strategy used by PaddlePaddle.
* The allocator strategy is under development and the non-legacy
* allocator is not yet stable.
*/
DEFINE_string(allocator_strategy, "naive_best_fit", DEFINE_string(allocator_strategy, "naive_best_fit",
"The allocation strategy. naive_best_fit means the original best " "The allocation strategy. naive_best_fit means the original best "
"fit allocator of Fluid. " "fit allocator of Fluid. "
"auto_growth means the experimental auto-growth allocator. " "auto_growth means the experimental auto-growth allocator. "
"Enum in [naive_best_fit, auto_growth]."); "Enum in [naive_best_fit, auto_growth].");
/**
* Memory related FLAG
* Name: FLAGS_fraction_of_cpu_memory_to_use
* Since Version:
* Value Range:
* Example:
* Note:
*/
DEFINE_double(fraction_of_cpu_memory_to_use, 1, DEFINE_double(fraction_of_cpu_memory_to_use, 1,
"Default use 100% of CPU memory for PaddlePaddle," "Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc"); "reserve the rest for page tables, etc");
/**
* Memory related FLAG
* Name: FLAGS_initial_cpu_memory_in_mb
* Since Version: 0.14.0
* Value Range: uint64, default=500 (MB)
* Example:
* Note: The CPU memory block size of the initial allocator in MB.
* The allocator takes the minimum values of
* FLAGS_initial_cpu_memory_in_mb and
* FLAGS_fraction_of_cpu_memory_to_use*(total physical memory)
* as memory block sizes.
*/
DEFINE_uint64(initial_cpu_memory_in_mb, 500ul, DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
"Initial CPU memory for PaddlePaddle, in MD unit."); "Initial CPU memory for PaddlePaddle, in MD unit.");
/**
* Memory related FLAG
* Name: FLAGS_fraction_of_cuda_pinned_memory_to_use
* Since Version:
* Value Range:
* Example:
* Note:
*/
DEFINE_double( DEFINE_double(
fraction_of_cuda_pinned_memory_to_use, 0.5, fraction_of_cuda_pinned_memory_to_use, 0.5,
"Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
"reserve the rest for page tables, etc"); "reserve the rest for page tables, etc");
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
/**
* Memory related FLAG
* Name: FLAGS_fraction_of_gpu_memory_to_use
* Since Version: 1.2.0
* Value Range: double, default=0.5 if win32, 0.92 else
* Example:
 * Note: The fraction of the total GPU memory size that is allocated as one
 *       memory block. Future memory usage will be allocated from this
 *       block. If the block does not have enough free memory, additional
 *       blocks of the same size will be requested from the GPU until the
 *       GPU does not have enough memory left for another block.
*/
#ifndef _WIN32 #ifndef _WIN32
constexpr static float fraction_of_gpu_memory_to_use = 0.92f; constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
#else #else
...@@ -154,7 +380,6 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f; ...@@ -154,7 +380,6 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
// which may lead to insufficient memory left for paddle // which may lead to insufficient memory left for paddle
constexpr static float fraction_of_gpu_memory_to_use = 0.5f; constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
#endif #endif
DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
"Allocate a trunk of gpu memory that is this fraction of the " "Allocate a trunk of gpu memory that is this fraction of the "
"total gpu memory size. Future memory usage will be allocated " "total gpu memory size. Future memory usage will be allocated "
...@@ -162,6 +387,18 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, ...@@ -162,6 +387,18 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
"additional trunks of the same size will be requested from gpu " "additional trunks of the same size will be requested from gpu "
"until the gpu has no memory left for another trunk."); "until the gpu has no memory left for another trunk.");
/**
* Memory related FLAG
* Name: FLAGS_initial_gpu_memory_in_mb
* Since Version: 1.4.0
* Value Range: uint64, default=0 (MB)
* Example:
* Note: Allocate a specified size of GPU memory block. Later memory usage
* will be allocated from that memory block. If the memory block does not
* have enough GPU memory, the memory block with the size
* FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until
* the GPU has no remaining memory.
*/
DEFINE_uint64( DEFINE_uint64(
initial_gpu_memory_in_mb, 0ul, initial_gpu_memory_in_mb, 0ul,
"Allocate a trunk of gpu memory whose byte size is specified by " "Allocate a trunk of gpu memory whose byte size is specified by "
...@@ -175,8 +412,18 @@ DEFINE_uint64( ...@@ -175,8 +412,18 @@ DEFINE_uint64(
"flag. If you don't set this flag, PaddlePaddle will use " "flag. If you don't set this flag, PaddlePaddle will use "
"FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory"); "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");
/**
* Memory related FLAG
* Name: FLAGS_reallocate_gpu_memory_in_mb
* Since Version: 1.4.0
* Value Range: uint64, default=0 (MB)
* Example:
* Note: If the allocated GPU memory blocks are exhausted,
* additional GPU memory blocks are reallocated
*/
DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul, DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
"If this flag is set, Paddle will reallocate the gpu memory with " "If this flag is set, Paddle will reallocate the gpu memory with "
"size specified by this flag. Else Paddle will reallocate by " "size specified by this flag. Else Paddle will reallocate by "
"FLAGS_fraction_of_gpu_memory_to_use"); "FLAGS_fraction_of_gpu_memory_to_use");
#endif #endif
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册