From 27235cf2227287b1da497841374297ff9133878f Mon Sep 17 00:00:00 2001 From: XiaoguangHu <46782768+XiaoguangHu01@users.noreply.github.com> Date: Tue, 10 Sep 2019 14:18:43 +0800 Subject: [PATCH] Add document annotations for FLAGS that need to be open to external developers test=develop (#19692) Add document annotations for FLAGS that need to be open to external developers --- paddle/fluid/platform/flags.cc | 283 ++++++++++++++++++++++++++++++--- 1 file changed, 265 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index b2224b05bef..0a3608311e8 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -21,17 +21,40 @@ * NOTE(paddle-dev): This file is designed to define all public FLAGS. */ -/* Paddle initialization related */ +/** + * Paddle initialization related FLAG + * Name: FLAGS_paddle_num_threads + * Since Version: 0.15.0 + * Value Range: int32, default=1 + * Example: FLAGS_paddle_num_threads=2, set the maximum thread number per + * instance to 2 + * Note: + */ DEFINE_int32(paddle_num_threads, 1, "Number of threads for each paddle instance."); -/* Operator related */ +/** + * Operator related FLAG + * Name: FLAGS_check_nan_inf + * Since Version: 0.13.0 + * Value Range: bool, default=false + * Example: + * Note: Used to debug. Checking whether operator produce NAN/INF or not. + */ DEFINE_bool(check_nan_inf, false, "Checking whether operator produce NAN/INF or not. It will be " "extremely slow so please use this flag wisely."); -/* CUDA related */ #ifdef PADDLE_WITH_CUDA + +/** + * CUDA related related FLAG + * Name: FLAGS_enable_cublas_tensor_op_math + * Since Version: 1.2.0 + * Value Range: bool, default=false + * Example: + * Note: whether to use Tensor Core, faster but it may loss precision. + */ DEFINE_bool( enable_cublas_tensor_op_math, false, "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " @@ -42,6 +65,15 @@ DEFINE_bool( "input and output must be half precision) and recurrent neural networks " "(RNNs)."); +/** + * CUDA related FLAG + * Name: FLAGS_selected_gpus + * Since Version: 1.3.0 + * Value Range: integer list separated by comma, default empty list + * Example: FLAGS_selected_gpus=0,1,2,3,4,5,6,7 to train or predict with 0~7 gpu + * cards + * Note: A list of device ids separated by comma, like: 0,1,2,3 + */ DEFINE_string(selected_gpus, "", "A list of device ids separated by comma, like: 0,1,2,3. " "This option is useful when doing multi process training and " @@ -52,59 +84,167 @@ DEFINE_string(selected_gpus, "", "share-memory only."); #endif -/* CUDNN related */ #ifdef PADDLE_WITH_CUDA + +/** + * CUDNN related FLAG + * Name: FLAGS_cudnn_deterministic + * Since Version: 0.13.0 + * Value Range: bool, default=false + * Example: + * Note: whether to use deterministic algorithm in cudnn. + * If true, it will slow down some operators such as conv and pooling. + */ DEFINE_bool(cudnn_deterministic, false, "Whether allow using an autotuning algorithm for convolution " "operator. The autotuning algorithm may be non-deterministic. If " "true, the algorithm is deterministic."); +/** + * CUDNN related FLAG + * Name: FLAGS_conv_workspace_size_limit + * Since Version: 0.13.0 + * Value Range: uint64, default=4096 (MB) + * Example: + * Note: The internal function of cuDNN obtains the fastest matching algorithm + * within this memory limit. Usually, faster algorithms can be chosen in + * larger workspaces, but memory space can also be significantly + * increased. + * Users need to balance memory and speed. + */ DEFINE_uint64(conv_workspace_size_limit, paddle::platform::kDefaultConvWorkspaceSizeLimitMB, "cuDNN convolution workspace limit in MB unit."); +/** + * CUDNN related FLAG + * Name: FLAGS_cudnn_exhaustive_search + * Since Version: 1.2.0 + * Value Range: bool, default=false + * Example: + * Note: Represents whether an exhaustive search method is used to + * select a convolution algorithm. There are two search methods in cuDNN, + * heuristic search and exhaustive search. Exhaustive search attempts + * all cuDNN algorithms to select the fastest. This method is very + * time-consuming, and the selected algorithm will be cached for a given + * layer specification. Once you change the layer specifications + * (such as batch size, feature map size), it will search again. + */ DEFINE_bool(cudnn_exhaustive_search, false, "Whether enable exhaustive search for cuDNN convolution or " "not, default is False."); +/** + * CUDNN related FLAG + * Name: FLAGS_cudnn_exhaustive_search_times + * Since Version: + * Value Range: + * Example: + * Note: only used to predict for advanced developer + */ DEFINE_int64(cudnn_exhaustive_search_times, -1, "Exhaustive search times for cuDNN convolution, " "default is -1, not exhaustive search"); -// CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in -// some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT -// and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The -// reason we set it to false by default is that this mode may use scaled -// atomic integer reduction that may cause a numerical overflow for certain -// input data range. +/** + * CUDNN related FLAG + * Name: FLAGS_cudnn_batchnorm_spatial_persistent + * Since Version: 1.4.0 + * Value Range: bool, default=false + * Example: + * Note: CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be + * faster in + * some tasks because an optimized path may be selected for + * CUDNN_DATA_FLOAT + * and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The + * reason we set it to false by default is that this mode may use scaled + * atomic integer reduction that may cause a numerical overflow for + * certain + * input data range. + */ DEFINE_bool(cudnn_batchnorm_spatial_persistent, false, "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn " "batch_norm, default is False."); #endif -/* NCCL related */ #ifdef PADDLE_WITH_CUDA -// asynchronous nccl allreduce or synchronous issue: -// https://github.com/PaddlePaddle/Paddle/issues/15049 -// If you want to change this default value, why?(gongwb) + +/** + * NCCL related FLAG + * Name: FLAGS_enable_cublas_tensor_op_math + * Since Version: + * Value Range: + * Example: + * Note: asynchronous nccl allreduce or synchronous issue: + * https://github.com/PaddlePaddle/Paddle/issues/15049 + * If you want to change this default value, why?(gongwb) + */ DEFINE_bool( sync_nccl_allreduce, true, "If set true, will call `cudaStreamSynchronize(nccl_stream)`" "after allreduce, this mode can get better performance in some scenarios."); #endif -/* Distributed related */ #ifdef PADDLE_WITH_DISTRIBUTE +/** + * Distributed related FLAG + * Name: FLAGS_communicator_max_merge_var_num + * Since Version: 1.5.0 + * Value Range: int32, default=20 + * Example: + * Note: The maximum number of gradients to be merged into a gradient and + * sent through the communicator. The trainer puts all the gradients + * into the queue, and then the communicator takes the gradients out + * of the queue and sends them after merging. + */ DEFINE_int32(communicator_max_merge_var_num, 20, "max var num to merge and send"); + +/** + * Distributed related FLAG + * Name: FLAGS_communicator_send_queue_size + * Since Version: 1.5.0 + * Value Range: int32, default=20 + * Example: + * Note: Size for each gradient queue. The trainer puts the gradient into + * the queue, and then the communicator takes it out of the queue and + * sends it out. When the communicator is slow, the queue may be full, + * and the trainer will be continuously blocked before the queue has + * space. It is used to avoid training much faster than communication, + * so that too many gradients are not sent out in time. + */ DEFINE_int32(communicator_send_queue_size, 20, "queue size to recv gradient before send"); #endif +/** + * Distributed related FLAG + * Name: FLAGS_dist_threadpool_size + * Since Version: 1.0.0 + * Value Range: int32, default=0 + * Example: + * Note: Control the number of threads used for distributed modules. + * If it is not set, it is set to a hard thread. + */ DEFINE_int32(dist_threadpool_size, 0, "number of threads used for distributed executed."); -/* Garbage collector related */ +/** + * Garbage collector related FLAG + * Name: FLAGS_eager_delete_tensor_gb + * Since Version: 1.0.0 + * Value Range: double, default=kDefaultEagerDeleteTensorGB + * Example: FLAGS_eager_delete_tensor_gb=0.0, Release memory garbage once it is + * no longer used. + * FLAGS_eager_delete_tensor_gb=1.0, Release memory garbage when + * garbage occupies 1.0GB of memory. + * FLAGS_eager_delete_tensor_gb=-1.0, Disable garbage collection + * policy. + * Note: Represents whether a garbage collection strategy is used to optimize + * network memory usage. + * It is recommended that users set FLAGS_eager_delete_tensor_gb=0.0 to + * enable garbage collection strategy when training large networks. + */ // Disable gc by default when inference library is built #ifdef PADDLE_ON_INFERENCE static const double kDefaultEagerDeleteTensorGB = -1; @@ -117,35 +257,121 @@ DEFINE_double( "Memory size threshold (GB) when the garbage collector clear tensors." "Disabled when this value is less than 0"); +/** + * Memory related FLAG + * Name: FLAGS_fast_eager_deletion_mode + * Since Version: 1.3.0 + * Value Range: bool, default=true + * Example: + * Note: Whether to use fast garbage collection strategy. + * If not set, the GPU memory is released at the end of the CUDA kernel. + * Otherwise, the GPU memory will be released before the CUDA kernel + * has finished, which will make the garbage collection strategy faster. + * Only works when garbage collection strategy is enabled. + */ DEFINE_bool(fast_eager_deletion_mode, true, "Fast eager deletion mode. If enabled, memory would release " "immediately without waiting GPU kernel ends."); +/** + * Memory related FLAG + * Name: FLAGS_memory_fraction_of_eager_deletion + * Since Version: 1.4 + * Value Range: double [0.0, 1.0], default=1.0 + * Example: + * Note: The percentage of memory size of garbage collection policy + * to release variables. + * If FLAGS_memory_fraction_of_eager_deletion = 1.0, + * all temporary variables in the network will be released. + * If FLAGS_memory_fraction_of_eager_deletion = 0.0, + * no temporary variables in the network are released. + * If 0.0 < FLAGS_memory_fraction_of_eager_deletion < 1.0, + * all temporary variables will be sorted in descending order + * according to their memory size, and only variables with the + * largest FLAGS_memory_fraction_of_eager_deletion ratio will be released. + * The flag is only valid when running parallel data compilers. + */ DEFINE_double(memory_fraction_of_eager_deletion, 1.0, "Fraction of eager deletion. If less than 1.0, all variables in " "the program would be sorted according to its memory size, and " "only the FLAGS_memory_fraction_of_eager_deletion of the largest " "variables would be deleted."); -/* Allocator related */ +/** + * Allocator related FLAG + * Name: FLAGS_allocator_strategy + * Since Version: 1.2 + * Value Range: string, {naive_best_fit, auto_groth}, default=naive_best_fit + * Example: + * Note: Allocator policy for selecting Paddle Paddle. + * The allocator strategy is under development and the non-legacy + * allocator is not yet stable. + */ DEFINE_string(allocator_strategy, "naive_best_fit", "The allocation strategy. naive_best_fit means the original best " "fit allocator of Fluid. " "auto_growth means the experimental auto-growth allocator. " "Enum in [naive_best_fit, auto_growth]."); +/** + * Memory related FLAG + * Name: FLAGS_fraction_of_cpu_memory_to_use + * Since Version: + * Value Range: + * Example: + * Note: + */ DEFINE_double(fraction_of_cpu_memory_to_use, 1, "Default use 100% of CPU memory for PaddlePaddle," "reserve the rest for page tables, etc"); + +/** + * Memory related FLAG + * Name: FLAGS_initial_cpu_memory_in_mb + * Since Version: 0.14.0 + * Value Range: uint64, default=500 (MB) + * Example: + * Note: The CPU memory block size of the initial allocator in MB. + * The allocator takes the minimum values of + * FLAGS_initial_cpu_memory_in_mb and + * FLAGS_fraction_of_cpu_memory_to_use*(total physical memory) + * as memory block sizes. + */ DEFINE_uint64(initial_cpu_memory_in_mb, 500ul, "Initial CPU memory for PaddlePaddle, in MD unit."); +/** + * Memory related FLAG + * Name: FLAGS_fraction_of_cuda_pinned_memory_to_use + * Since Version: + * Value Range: + * Example: + * Note: + */ DEFINE_double( fraction_of_cuda_pinned_memory_to_use, 0.5, "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," "reserve the rest for page tables, etc"); #ifdef PADDLE_WITH_CUDA + +/** + * Memory related FLAG + * Name: FLAGS_fraction_of_gpu_memory_to_use + * Since Version: 1.2.0 + * Value Range: double, default=0.5 if win32, 0.92 else + * Example: + * Note: Represents the proportion of allocated memory blocks to the total + * memory size + * of the GPU. Future memory usage will be allocated from this memory + * block. + * If the memory block does not have enough GPU memory, new memory blocks + * of + * the same size as the memory block will be allocated from the GPU + * request + * until the GPU does not have enough memory. + */ + #ifndef _WIN32 constexpr static float fraction_of_gpu_memory_to_use = 0.92f; #else @@ -154,7 +380,6 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f; // which may lead to insufficient memory left for paddle constexpr static float fraction_of_gpu_memory_to_use = 0.5f; #endif - DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, "Allocate a trunk of gpu memory that is this fraction of the " "total gpu memory size. Future memory usage will be allocated " @@ -162,6 +387,18 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, "additional trunks of the same size will be requested from gpu " "until the gpu has no memory left for another trunk."); +/** + * Memory related FLAG + * Name: FLAGS_initial_gpu_memory_in_mb + * Since Version: 1.4.0 + * Value Range: uint64, default=0 (MB) + * Example: + * Note: Allocate a specified size of GPU memory block. Later memory usage + * will be allocated from that memory block. If the memory block does not + * have enough GPU memory, the memory block with the size + * FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until + * the GPU has no remaining memory. + */ DEFINE_uint64( initial_gpu_memory_in_mb, 0ul, "Allocate a trunk of gpu memory whose byte size is specified by " @@ -175,8 +412,18 @@ DEFINE_uint64( "flag. If you don't set this flag, PaddlePaddle will use " "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory"); +/** + * Memory related FLAG + * Name: FLAGS_reallocate_gpu_memory_in_mb + * Since Version: 1.4.0 + * Value Range: uint64, default=0 (MB) + * Example: + * Note: If the allocated GPU memory blocks are exhausted, + * additional GPU memory blocks are reallocated + */ DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul, "If this flag is set, Paddle will reallocate the gpu memory with " "size specified by this flag. Else Paddle will reallocate by " "FLAGS_fraction_of_gpu_memory_to_use"); + #endif -- GitLab