// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

16
#include "paddle/phi/core/flags.h"
17
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
18
#include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h"
19 20
#endif

21
namespace phi {

// Read-only view of the exported-flag registry. Forwards to the mutable
// accessor so both functions refer to the same underlying map.
const ExportedFlagInfoMap &GetExportedFlagInfoMap() {
  return *GetMutableExportedFlagInfoMap();
}

// Returns a pointer to the exported-flag registry. The map is a
// function-local static, so it is constructed on the first call.
ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() {
  static ExportedFlagInfoMap g_exported_flag_info_map;
  return &g_exported_flag_info_map;
}

}  // namespace phi
Z
Zeng Jinle 已提交
33

34 35
// Number of threads used inside a single op (per the help text); default 0.
PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism,
                             0,
                             "number of threads for inner op");

38 39 40 41
/**
 * NOTE(paddle-dev): This file is designed to define all public FLAGS.
 */

42 43 44 45 46 47 48 49 50
/**
 * Paddle initialization related FLAG
 * Name: FLAGS_paddle_num_threads
 * Since Version: 0.15.0
 * Value Range: int32, default=1
 * Example: FLAGS_paddle_num_threads=2, set the maximum thread number per
 * instance to 2
 * Note:
 */
51 52
// Per-instance thread count; defaults to 1.
PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads,
                             1,
                             "Number of threads for each paddle instance.");
54

55 56 57 58 59 60 61 62
/**
 * Operator related FLAG
 * Name: FLAGS_check_nan_inf
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: Used to debug. Checking whether operator produce NAN/INF or not.
 */
Z
Zeng Jinle 已提交
63
// Debug-only check, disabled by default; the help text warns that it is
// extremely slow.
PADDLE_DEFINE_EXPORTED_bool(
    check_nan_inf,
    false,
    "Checking whether operator produce NAN/INF or not. It will be "
    "extremely slow so please use this flag wisely.");
68

69 70
/**
 * Operator related FLAG
71
 * Name: FLAGS_check_nan_inf_level
72
 * Since Version: 2.5.0
73
 * Value Range: int32, default=0
74
 * Example:
75 76 77 78 79 80 81 82 83
 * Note: Used to debug. Setting the check and print level when
 * FLAGS_check_nan_inf is set.
 * - 0, abort the process when any operator produce NAN/INF and only print the
 * information of tensor which holds NAN/INF.
 * - 1, continue the training or inference process and print the information of
 * all tensors which holds NAN/INF.
 * - 2, print the information of float tensors when the max or min value
 * overflowing float16's limit.
 * - 3, print the information of all tensors.
84
 */
85 86 87 88
// Check/print level used together with FLAGS_check_nan_inf; defaults to 0.
PADDLE_DEFINE_EXPORTED_int32(check_nan_inf_level,
                             0,
                             "Setting the check and print level when "
                             "FLAGS_check_nan_inf is set.");
89

D
danleifeng 已提交
90 91 92 93 94 95 96 97 98 99 100 101 102 103
/**
 * FLAG
 * Name: FLAGS_enable_opt_get_features
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: NOTE(review): the previous comment block here and the help string
 * below were copied verbatim from FLAGS_check_nan_inf and do not describe
 * this flag. Presumably it toggles an optimized "get features" code path --
 * TODO confirm with the flag owner and replace the help string.
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_opt_get_features,
    false,
    "Checking whether operator produce NAN/INF or not. It will be "
    "extremely slow so please use this flag wisely.");

104 105 106 107
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_ASCEND_CL)
108 109 110 111 112 113 114 115 116

/**
 * CUDA related FLAG
 * Name: FLAGS_enable_cublas_tensor_op_math
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use Tensor Core, faster but it may loss precision.
 */
Z
Zeng Jinle 已提交
117
// Tensor Core usage is opt-in (default false) because, per the help text, it
// may lose precision.
PADDLE_DEFINE_EXPORTED_bool(
    enable_cublas_tensor_op_math,
    false,
    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
    "but it may loss precision. Currently, There are two CUDA libraries that"
    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
    " GEMM computations(the matrices must be either half precision or single "
    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
    "input and output must be half precision) and recurrent neural networks "
    "(RNNs).");

128 129 130 131 132 133 134 135 136 137
/**
 * CUDA related FLAG
 * Name: FLAGS_gemm_use_half_precision_compute_type
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example:
 * Note: whether to use fp16 compute type when the input and output is fp16,
 * faster but it may loss precision.
 */
// When true (the default), GEMMs whose input and output are fp16 use an fp16
// compute type, trading precision for speed.
PADDLE_DEFINE_EXPORTED_bool(
    gemm_use_half_precision_compute_type,
    true,
    "Whether to use fp16 compute type when the input and output is fp16, "
    "faster but it may loss precision in most case. If true, the compute "
    "type will be set to fp16. Default is true.");

144 145 146 147 148 149 150 151 152
/**
 * CUDA related FLAG
 * Name: FLAGS_selected_gpus
 * Since Version: 1.3.0
 * Value Range: integer list separated by comma, default empty list
 * Example: FLAGS_selected_gpus=0,1,2,3,4,5,6,7 to train or predict with 0~7 gpu
 * cards
 * Note: A list of device ids separated by comma, like: 0,1,2,3
 */
Z
Zeng Jinle 已提交
153
// Empty (the default) means "use all visible devices".
// Adjacent string literals are concatenated verbatim, so every fragment must
// carry its own trailing space.
PADDLE_DEFINE_EXPORTED_string(
    selected_gpus,
    "",
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (GPU). If you want to use "
    "all visible devices, set this to empty string. NOTE: the "
    "reason of doing this is that we want to use P2P communication "
    "between GPU devices, use CUDA_VISIBLE_DEVICES can only use "
    "share-memory only.");
163 164
#endif

165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
#if defined(PADDLE_WITH_CUDA)
/**
 * CUDA related FLAG
 * Name: FLAGS_cublaslt_exhaustive_search_times
 * Since Version: 2.3.0
 * Value Range: int64_t, default=0
 * Example:
 * Note: Represents times of exhaustive search to evaluate performance of
 *       cuBlasLt matmul algorithm (with/without epilogue). Set this flag
 *       with value > 0 to enable exhaustive search. Default is 0, means
 *       getting algorithms via heuristic search. There are two search methods
 *       in cuBlasLt, heuristic search and exhaustive search. Exhaustive search
 *       attempts all cuBlasLt algorithms to select the fastest, which is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification Once you change the layer specifications
 *       (such as M, N and K), it will re-search again.
 */
// 0 (the default) disables exhaustive search.
PADDLE_DEFINE_EXPORTED_int64(
    cublaslt_exhaustive_search_times,
    0,
    "The times of exhaustive search for cuBlasLt matmul with/without "
    "epilogue algorithms, default is 0, means disabling exhaustive search.");
#endif

189
#if defined(PADDLE_WITH_ASCEND_CL)
Z
Zeng Jinle 已提交
190
// Empty (the default) means "use all visible devices".
PADDLE_DEFINE_EXPORTED_string(
    selected_npus,
    "",
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (NPU). If you want to use "
    "all visible devices, set this to empty string.");
// Enabled by default.
PADDLE_DEFINE_EXPORTED_bool(
    hccl_check_nan,
    true,
    "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
    "core when meets Nan value");
// Empty by default; per the help text a non-empty path is passed to aclInit().
PADDLE_DEFINE_EXPORTED_string(
    npu_config_path,
    "",
    "The absolute path of configuration json file, like: /tmp/config.json. "
    "If provided, it will be passed to aclInit().");
207 208
// Minimum loss-scaling value; defaults to 1.
PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling,
                             1,
                             "set minimum loss scaling value!");
A
Aganlengzi 已提交
210
// Empty string (the default) selects the default mode, allow_fp32_to_fp16.
PADDLE_DEFINE_EXPORTED_string(
    npu_precision_mode,
    "",
    "NPU operator precision mode, options are 'force_fp32', 'force_fp16', "
    "'allow_fp32_to_fp16', 'must_keep_origin_dtype' and "
    "'allow_mix_precision'. If you want to use the default mode ("
    "allow_fp32_to_fp16), set this to empty string. For more details, "
    "please refer to the documents");
218
#endif
219

220 221 222 223 224 225 226 227 228 229 230 231 232
/*
 * Kernel related FLAG
 * Name: FLAGS_enable_api_kernel_fallback
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example: FLAGS_enable_api_kernel_fallback=true would allow kernel of current
 * backend fallback to CPU one when not found
 */
// Fallback to the CPU kernel is allowed by default.
PADDLE_DEFINE_EXPORTED_bool(enable_api_kernel_fallback,
                            true,
                            "Whether enable api kernel fallback to CPU one "
                            "when not found");

233
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
234 235 236 237 238 239 240 241 242
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_deterministic
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use deterministic algorithm in cudnn.
 *       If true, it will slow down some operators such as conv and pooling.
 */
Z
Zeng Jinle 已提交
243
// Disabled by default.
// NOTE(review): the help text is phrased in terms of autotuning while the doc
// comment for this flag talks about deterministic cuDNN algorithms -- confirm
// the intended wording.
PADDLE_DEFINE_EXPORTED_bool(
    cudnn_deterministic,
    false,
    "Whether allow using an autotuning algorithm for convolution "
    "operator. The autotuning algorithm may be non-deterministic. If "
    "true, the algorithm is deterministic.");
249

250 251 252 253
/**
 * CUDNN related FLAG
 * Name: FLAGS_conv_workspace_size_limit
 * Since Version: 0.13.0
254
 * Value Range: int64, default=512 (MB)
255 256 257 258 259 260 261
 * Example:
 * Note: The internal function of cuDNN obtains the fastest matching algorithm
 *       within this memory limit. Usually, faster algorithms can be chosen in
 *       larger workspaces, but memory space can also be significantly
 * increased.
 *       Users need to balance memory and speed.
 */
262 263 264 265
// Defined with the int64 macro, so the flag is a signed 64-bit value. The
// default is the kDefaultConvWorkspaceSizeLimitMB constant from
// phi::backends::gpu, and the unit is MB.
PADDLE_DEFINE_EXPORTED_int64(
    conv_workspace_size_limit,
    phi::backends::gpu::kDefaultConvWorkspaceSizeLimitMB,
    "cuDNN convolution workspace limit in MB unit.");
266

267 268 269 270 271 272 273 274 275 276 277 278 279 280
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Represents whether an exhaustive search method is used to
 *       select a convolution algorithm. There are two search methods in cuDNN,
 *       heuristic search and exhaustive search. Exhaustive search attempts
 *       all cuDNN algorithms to select the fastest. This method is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification. Once you change the layer specifications
 *       (such as batch size, feature map size), it will search again.
 */
Z
Zeng Jinle 已提交
281
// Disabled by default.
PADDLE_DEFINE_EXPORTED_bool(
    cudnn_exhaustive_search,
    false,
    "Whether enable exhaustive search for cuDNN convolution or "
    "not, default is False.");
286

287 288 289 290 291 292 293 294
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search_times
 * Since Version:
 * Value Range:
 * Example:
 * Note: only used to predict for advanced developer
 */
295 296
// -1 (the default) means no exhaustive search.
PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times,
                             -1,
                             "Exhaustive search times for cuDNN convolution, "
                             "default is -1, not exhaustive search");
299

300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_batchnorm_spatial_persistent
 * Since Version: 1.4.0
 * Value Range: bool, default=false
 * Example:
 * Note: CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be
 * faster in
 *       some tasks because an optimized path may be selected for
 * CUDNN_DATA_FLOAT
 *       and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The
 *       reason we set it to false by default is that this mode may use scaled
 *       atomic integer reduction that may cause a numerical overflow for
 * certain
 *       input data range.
 */
Z
Zeng Jinle 已提交
316
// Disabled by default.
PADDLE_DEFINE_EXPORTED_bool(
    cudnn_batchnorm_spatial_persistent,
    false,
    "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
    "batch_norm, default is False.");
321 322
#endif

323
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
324 325 326

/**
 * NCCL related FLAG
327 328 329
 * Name: FLAGS_sync_nccl_allreduce
 * Since Version: 1.3
 * Value Range: bool, default=true
330 331 332 333 334
 * Example:
 * Note: asynchronous nccl allreduce or synchronous issue:
 *       https://github.com/PaddlePaddle/Paddle/issues/15049
 *       If you want to change this default value, why?(gongwb)
 */
Z
Zeng Jinle 已提交
335
// Enabled by default: per the help text, the NCCL stream is synchronized
// after allreduce.
PADDLE_DEFINE_EXPORTED_bool(
    sync_nccl_allreduce,
    true,
    "If set true, will call `cudaStreamSynchronize(nccl_stream)` "
    "after allreduce, this mode can get better performance in some scenarios.");
#endif

#ifdef PADDLE_WITH_DISTRIBUTE
343 344 345 346 347 348 349 350 351 352 353
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_max_merge_var_num
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: The maximum number of gradients to be merged into a gradient and
 *       sent through the communicator. The trainer puts all the gradients
 *       into the queue, and then the communicator takes the gradients out
 *       of the queue and sends them after merging.
 */
354 355
// Defaults to 20.
PADDLE_DEFINE_EXPORTED_int32(communicator_max_merge_var_num,
                             20,
                             "max var num to merge and send");
// Defaults to true.
PADDLE_DEFINE_EXPORTED_bool(
    communicator_is_sgd_optimizer,
    true,
    "gradient sent to the server is the sum of the gradients "
    "calculated by each thread if optimizer is sgd");
362 363 364 365 366 367 368 369 370 371 372 373 374
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_send_queue_size
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: Size for each gradient queue. The trainer puts the gradient into
 *       the queue, and then the communicator takes it out of the queue and
 *       sends it out. When the communicator is slow, the queue may be full,
 *       and the trainer will be continuously blocked before the queue has
 *       space. It is used to avoid training much faster than communication,
 *       so that too many gradients are not sent out in time.
 */
375 376
// Defaults to 20.
PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size,
                             20,
                             "queue size to recv gradient before send");
378 379
#endif

380 381 382 383 384 385 386 387 388
/**
 * Distributed related FLAG
 * Name: FLAGS_dist_threadpool_size
 * Since Version: 1.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: Control the number of threads used for distributed modules.
 *       If it is not set, it is set to a hard thread.
 */
Z
Zeng Jinle 已提交
389
// Defaults to 0.
PADDLE_DEFINE_EXPORTED_int32(
    dist_threadpool_size,
    0,
    "number of threads used for distributed executed.");
393

394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409
/**
 * Garbage collector related FLAG
 * Name: FLAGS_eager_delete_tensor_gb
 * Since Version: 1.0.0
 * Value Range: double, default=kDefaultEagerDeleteTensorGB
 * Example: FLAGS_eager_delete_tensor_gb=0.0, Release memory garbage once it is
 * no longer used.
 *          FLAGS_eager_delete_tensor_gb=1.0, Release memory garbage when
 * garbage occupies 1.0GB of memory.
 *          FLAGS_eager_delete_tensor_gb=-1.0, Disable garbage collection
 * policy.
 * Note: Represents whether a garbage collection strategy is used to optimize
 * network memory usage.
 *       It is recommended that users set FLAGS_eager_delete_tensor_gb=0.0 to
 *       enable garbage collection strategy when training large networks.
 */
410 411 412
// Disable gc by default when inference library is built
static const double kDefaultEagerDeleteTensorGB = 0;

Z
Zeng Jinle 已提交
413
PADDLE_DEFINE_EXPORTED_double(
414 415
    eager_delete_tensor_gb,
    kDefaultEagerDeleteTensorGB,
416 417 418
    "Memory size threshold (GB) when the garbage collector clear tensors."
    "Disabled when this value is less than 0");

419 420 421 422 423 424 425 426 427 428 429 430
/**
 * Memory related FLAG
 * Name: FLAGS_fast_eager_deletion_mode
 * Since Version: 1.3.0
 * Value Range: bool, default=true
 * Example:
 * Note: Whether to use fast garbage collection strategy.
 *       If not set, the GPU memory is released at the end of the CUDA kernel.
 *       Otherwise, the GPU memory will be released before the CUDA kernel
 *       has finished, which will make the garbage collection strategy faster.
 *       Only works when garbage collection strategy is enabled.
 */
Z
Zeng Jinle 已提交
431
// Enabled by default.
PADDLE_DEFINE_EXPORTED_bool(
    fast_eager_deletion_mode,
    true,
    "Fast eager deletion mode. If enabled, memory would release "
    "immediately without waiting GPU kernel ends.");
436

437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454
/**
 * Memory related FLAG
 * Name: FLAGS_memory_fraction_of_eager_deletion
 * Since Version: 1.4
 * Value Range: double [0.0, 1.0], default=1.0
 * Example:
 * Note: The percentage of memory size of garbage collection policy
 *       to release variables.
 *       If FLAGS_memory_fraction_of_eager_deletion = 1.0,
 *       all temporary variables in the network will be released.
 *       If FLAGS_memory_fraction_of_eager_deletion = 0.0,
 *       no temporary variables in the network are released.
 *       If 0.0 < FLAGS_memory_fraction_of_eager_deletion < 1.0,
 *       all temporary variables will be sorted in descending order
 *       according to their memory size, and only variables with the
 *       largest FLAGS_memory_fraction_of_eager_deletion ratio will be released.
 *       The flag is only valid when running parallel data compilers.
 */
Z
Zeng Jinle 已提交
455
// Defaults to 1.0.
PADDLE_DEFINE_EXPORTED_double(
    memory_fraction_of_eager_deletion,
    1.0,
    "Fraction of eager deletion. If less than 1.0, all variables in "
    "the program would be sorted according to its memory size, and "
    "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
    "variables would be deleted.");
462

463 464 465 466
/**
 * Allocator related FLAG
 * Name: FLAGS_allocator_strategy
 * Since Version: 1.2
467 468
 * Value Range: string, {naive_best_fit, auto_growth, thread_local},
 * default=auto_growth
469
 * Example:
470
 * Note: For selecting allocator policy of PaddlePaddle.
471
 */
472
static constexpr char kDefaultAllocatorStrategy[] = "auto_growth";
Z
Zeng Jinle 已提交
473
PADDLE_DEFINE_EXPORTED_string(
474 475
    allocator_strategy,
    kDefaultAllocatorStrategy,
476 477 478 479 480 481 482 483 484 485 486
    "The allocation strategy, enum in [naive_best_fit, auto_growth]. "
    "naive_best_fit means the original pre-allocated allocator of Paddle. "
    "auto_growth means the auto-growth allocator. "
    "These two strategies differ in GPU memory allocation. "
    "naive_best_fit strategy would occupy almost all GPU memory by default, "
    "which prevents users from starting several Paddle jobs on the same GPU "
    "card but leads to less memory fragmentation (i.e., maximum batch "
    "size of models may be larger). auto_growth strategy would allocate "
    "GPU memory on demand, which allows users to start several Paddle jobs "
    "on the same GPU card but may lead to more memory fragmentation "
    "(i.e., maximum batch size of models may be smaller).");
487

488 489 490
/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cpu_memory_to_use
491 492
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=1
493
 * Example:
494 495 496 497 498 499
 * Note: Represents the proportion of allocated CPU memory blocks
 *       to the total memory size of the CPU. Future CPU memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough CPU memory, new memory blocks of the same
 *       size as the memory block will be requested from the CPU
 *       until the CPU does not have enough memory.
500
 */
501 502
// Defaults to 1 (i.e. 100% per the help text).
PADDLE_DEFINE_EXPORTED_double(
    fraction_of_cpu_memory_to_use,
    1,
    "Default use 100% of CPU memory for PaddlePaddle, "
    "reserve the rest for page tables, etc");
505 506 507 508 509 510 511 512 513 514 515 516 517

/**
 * Memory related FLAG
 * Name: FLAGS_initial_cpu_memory_in_mb
 * Since Version: 0.14.0
 * Value Range: uint64, default=500 (MB)
 * Example:
 * Note: The CPU memory block size of the initial allocator in MB.
 *       The allocator takes the minimum values of
 *       FLAGS_initial_cpu_memory_in_mb and
 *       FLAGS_fraction_of_cpu_memory_to_use*(total physical memory)
 *       as memory block sizes.
 */
Z
Zeng Jinle 已提交
518
// Defaults to 500; the flag name gives the unit as MB.
PADDLE_DEFINE_EXPORTED_uint64(
    initial_cpu_memory_in_mb,
    500ul,
    "Initial CPU memory for PaddlePaddle, in MB unit.");
522

523 524 525
/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cuda_pinned_memory_to_use
526 527
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=0.5
528
 * Example:
529 530 531 532 533 534
 * Note: Represents the proportion of allocated CUDA pinned memory blocks
 *       to the total memory size of the CPU. Future CUDA pinned memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough CUDA pinned memory, new memory blocks of the same
 *       size as the memory block will be requested as CUDA pinned memory
 *       until the CPU does not have enough memory.
535
 */
Z
Zeng Jinle 已提交
536
// Defaults to 0.5 (i.e. 50% per the help text).
PADDLE_DEFINE_EXPORTED_double(
    fraction_of_cuda_pinned_memory_to_use,
    0.5,
    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle, "
    "reserve the rest for page tables, etc");

542 543
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
544 545 546
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ||      \
    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \
    defined(PADDLE_WITH_CUSTOM_DEVICE)
547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564

/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_gpu_memory_to_use
 * Since Version: 1.2.0
 * Value Range: double, default=0.5 if win32, 0.92 else
 * Example:
 * Note: Represents the proportion of allocated memory blocks to the total
 * memory size
 *       of the GPU. Future memory usage will be allocated from this memory
 * block.
 *       If the memory block does not have enough GPU memory, new memory blocks
 * of
 *       the same size as the memory block will be allocated from the GPU
 * request
 *       until the GPU does not have enough memory.
 */

565 566 567 568 569 570 571 572
#ifndef _WIN32
constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
#else
// fraction_of_gpu_memory_to_use cannot be too high on windows,
// since the win32 graphic sub-system can occupy some GPU memory
// which may lead to insufficient memory left for paddle
constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
#endif
Z
Zeng Jinle 已提交
573
PADDLE_DEFINE_EXPORTED_double(
574 575
    fraction_of_gpu_memory_to_use,
    fraction_of_gpu_memory_to_use,
Z
Zeng Jinle 已提交
576 577 578 579 580
    "Allocate a trunk of gpu memory that is this fraction of the "
    "total gpu memory size. Future memory usage will be allocated "
    "from the trunk. If the trunk doesn't have enough gpu memory, "
    "additional trunks of the same size will be requested from gpu "
    "until the gpu has no memory left for another trunk.");
581

582 583 584 585 586 587 588 589 590 591 592 593
/**
 * Memory related FLAG
 * Name: FLAGS_initial_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: Allocate a specified size of GPU memory block. Later memory usage
 *       will be allocated from that memory block. If the memory block does not
 *       have enough GPU memory, the memory block with the size
 *       FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until
 *       the GPU has no remaining memory.
 */
Z
Zeng Jinle 已提交
594
// Defaults to 0; the flag name gives the unit as MB. Per the help text,
// FLAGS_fraction_of_gpu_memory_to_use is used when this flag is not set.
PADDLE_DEFINE_EXPORTED_uint64(
    initial_gpu_memory_in_mb,
    0ul,
    "Allocate a trunk of gpu memory whose size in MB is specified by "
    "the flag. Future memory usage will be allocated from the "
    "trunk. If the trunk doesn't have enough gpu memory, additional "
    "trunks of the gpu memory will be requested from gpu with size "
    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
    "no memory left for the additional trunk. Note: if you set this "
    "flag, the memory size set by "
    "FLAGS_fraction_of_gpu_memory_to_use will be overridden by this "
    "flag. If you don't set this flag, PaddlePaddle will use "
    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");

608 609 610 611 612 613 614 615 616
/**
 * Memory related FLAG
 * Name: FLAGS_reallocate_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: If the allocated GPU memory blocks are exhausted,
 *       additional GPU memory blocks are reallocated
 */
Z
Zeng Jinle 已提交
617
// Defaults to 0. Per the help text, reallocation falls back to
// FLAGS_fraction_of_gpu_memory_to_use when this flag is not set.
PADDLE_DEFINE_EXPORTED_uint64(
    reallocate_gpu_memory_in_mb,
    0ul,
    "If this flag is set, Paddle will reallocate the gpu memory with "
    "size specified by this flag. Else Paddle will reallocate by "
    "FLAGS_fraction_of_gpu_memory_to_use");

// 0 (the default) means no limit; the unit is MB.
PADDLE_DEFINE_EXPORTED_uint64(
    gpu_memory_limit_mb,
    0UL,
    "The maximum gpu memory limit that the process can allocate. "
    "If it is equal to 0, there would be no limit and all gpu memory "
    "would be available to the process. If it is larger than 0, "
    "the process would raise out of memory error if the allocated "
    "memory exceeds the limit even though there is available "
    "memory on the gpu card. The unit is MB and default value is 0.");
633

634
#endif
635 636 637 638 639 640 641 642 643

/**
 * Scope related FLAG
 * Name: local_exe_sub_scope_limit
 * Since Version: 1.6.0
 * Value Range: double, default=256 (MB)
 * Example:
 * Note:
 */
Z
Zeng Jinle 已提交
644
// Defaults to 256 MBytes; per the help text, -1 removes the limit.
PADDLE_DEFINE_EXPORTED_double(
    local_exe_sub_scope_limit,
    256.0,  // MBytes
    "The memory up limit of sub-scopes of local execution scope for "
    "each CUDAPlace. If you don't need to limit the memory, "
    "you should set FLAGS_local_exe_sub_scope_limit=-1. "
    "The default value is 256 MBytes.");
651

652
// Speed-testing aid, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(
    reader_queue_speed_test_mode,
    false,
    "If set true, the queue.pop will only get data from queue but not "
    "remove the data from queue for speed testing");

658 659 660 661 662 663 664 665
/**
 * MKLDNN related FLAG
 * Name: use_mkldnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note:
 */
Z
Zeng Jinle 已提交
666
// MKLDNN execution is off by default.
PADDLE_DEFINE_EXPORTED_bool(use_mkldnn,
                            false,
                            "Use MKLDNN to run");
667 668 669 670 671 672 673 674 675 676 677 678 679 680 681

/**
 * Debug related FLAG
 * Name: FLAGS_call_stack_level
 * Since Version: 2.0.0
 * Value Range: int, default=1 (2 when PADDLE_NO_PYTHON is defined)
 * Example:
 * Note: Used to debug. Determine the call stack to print when error or
 * exception happens.
 * If FLAGS_call_stack_level == 0, only the error message summary will be shown.
 * If FLAGS_call_stack_level == 1, the python stack and  error message summary
 * will be shown.
 * If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
 * message summary will be shown.
 */
682 683 684
#ifdef PADDLE_NO_PYTHON
static const int32_t kDefaultCallStackLevel = 2;
#else
685
static const int32_t kDefaultCallStackLevel = 1;
686
#endif
687

Z
Zeng Jinle 已提交
688
PADDLE_DEFINE_EXPORTED_int32(
689 690
    call_stack_level,
    kDefaultCallStackLevel,
691 692 693 694 695 696 697 698
    "Determine the call stack to print when error or exeception happens."
    // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
    // "If FLAGS_call_stack_level == 0, only the error message summary will be "
    // "shown. "
    "If FLAGS_call_stack_level == 1, the python stack and error message "
    "summary will be shown."
    "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
    "error message summary will be shown.");
699 700 701 702 703 704 705 706 707 708

/**
 * Debug related FLAG
 * Name: sort_sum_gradient
 * Since Version: 2.0.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, gradients are summed by the reverse order of
 * the forward execution sequence.
 */
709 710
// Disabled by default.
PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient,
                            false,
                            "Sum gradients by the reverse order of "
                            "the forward execution sequence.");
713 714 715 716 717 718 719 720 721

/**
 * Performance related FLAG
 * Name: max_inplace_grad_add
 * Since Version: 2.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: The maximum number of inplace grad_add.
 */
Z
Zeng Jinle 已提交
722
// Defaults to 0.
PADDLE_DEFINE_EXPORTED_int32(
    max_inplace_grad_add,
    0,
    "The maximum number of inplace grad_add. When doing "
    "gradient accumulation, if the number of gradients that need to be "
    "added is less than FLAGS_max_inplace_grad_add, then several grad_add "
    "will be used instead of sum. Default is 0.");
729 730 731 732 733 734 735 736 737

/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_on
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be enabled.
 */
738 739
// Empty by default.
PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on,
                              "",
                              "List of OneDNN operation types to be turned on");
741 742 743 744 745 746 747 748 749

/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_off
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be disabled.
 */
Z
Zeng Jinle 已提交
750
// Empty by default.
PADDLE_DEFINE_EXPORTED_string(
    tracer_mkldnn_ops_off,
    "",
    "List of OneDNN operation types to be turned off");
754

755 756 757 758 759 760 761 762 763
/**
 * Debug related FLAG
 * Name: check_kernel_launch
 * Since Version: 2.1.0
 * Value Range: bool, default=false
 * Example:
 * Note: Check kernel launch status after every kernel compute.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Z
Zeng Jinle 已提交
764
// Exported bool flag FLAGS_check_kernel_launch; off by default.
// Only defined when PADDLE_WITH_CUDA or PADDLE_WITH_HIP is set (see the
// enclosing #if).
PADDLE_DEFINE_EXPORTED_bool(
    check_kernel_launch,
    /*default=*/false,
    "Check kernel launch status after every kernel compute");
768 769
#endif

770 771 772 773 774 775 776 777 778
/**
 * CUDNN related FLAG
 * Name: conv2d_disable_cudnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: Disable cudnn in conv2d.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
779 780
// Exported bool flag FLAGS_conv2d_disable_cudnn; off by default.
PADDLE_DEFINE_EXPORTED_bool(conv2d_disable_cudnn,
                            /*default=*/false,
                            "Disable cudnn in conv2d");
782

783 784
// Exported bool flag FLAGS_use_fast_math; off by default.
PADDLE_DEFINE_EXPORTED_bool(use_fast_math,
                            /*default=*/false,
                            "Whether to use fast math GPU functions.");
786
#endif
B
Baibaifan 已提交
787 788 789 790 791 792 793 794 795

/**
 * Distributed related FLAG
 * Name: FLAGS_get_host_by_name_time
 * Since Version: 2.2.0
 * Value Range: int32, default=120
 * Example:
 * Note: Get host by name time.
 */
F
fwenguang 已提交
796 797 798
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) ||      \
    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_MLU)
799 800
// Exported int32 flag FLAGS_get_host_by_name_time; default 120.
// NOTE(review): the unit of the value is not stated here; confirm at the
// use site.
PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time,
                             /*default=*/120,
                             "The maximum time for get host by name time");
B
Baibaifan 已提交
802
#endif
803 804 805 806 807 808 809 810 811 812

/**
 * Distributed related FLAG
 * Name: FLAGS_apply_pass_to_program
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_apply_pass_to_program=true would apply IR Pass to
 *          program when using Fleet APIs.
 * Note: Apply IR pass to program. Be only useful when using Fleet APIs.
 */
Z
Zeng Jinle 已提交
813
// Exported bool flag FLAGS_apply_pass_to_program; off by default.
PADDLE_DEFINE_EXPORTED_bool(
    apply_pass_to_program,
    /*default=*/false,
    "It controls whether to apply IR pass to program when using Fleet APIs");
Y
yaoxuefeng 已提交
817

D
danleifeng 已提交
818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845
/**
 * Distributed related FLAG
 * Name: FLAGS_graph_load_in_parallel
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Controls whether graph nodes and edges are loaded in parallel with
 *       multiple threads. If it is not set, graph data is loaded with one
 *       thread.
 */
// Exported bool flag FLAGS_graph_load_in_parallel; off by default.
PADDLE_DEFINE_EXPORTED_bool(graph_load_in_parallel,
                            /*default=*/false,
                            "It controls whether to load graph node and edge "
                            "with multiple threads in parallel.");

/**
 * Distributed related FLAG
 * Name: FLAGS_graph_get_neighbor_id
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Control get all neighbor id when running sub part graph
 *       If it is not set, do not need get neighbor id when run all part graph
 */
// Exported bool flag FLAGS_graph_get_neighbor_id; off by default.
PADDLE_DEFINE_EXPORTED_bool(graph_get_neighbor_id,
                            /*default=*/false,
                            "It controls get all neighbor id when running "
                            "sub part graph.");

F
Feng Xing 已提交
846
/**
L
Liu-xiandong 已提交
847
 * KP kernel related FLAG
F
Feng Xing 已提交
848 849 850
 * Name: FLAGS_run_kp_kernel
 * Since Version: 2.3.0
 * Value Range: bool, default=false
L
Liu-xiandong 已提交
851 852
 * Example: FLAGS_run_kp_kernel=true would use the kp kernel to compute in the
 * Op.
F
Feng Xing 已提交
853 854
 * Note:
 */
855 856
// Exported bool flag FLAGS_run_kp_kernel; off by default.
PADDLE_DEFINE_EXPORTED_bool(
    run_kp_kernel,
    /*default=*/false,
    "It controls whether to run PaddlePaddle using KP");
F
Feng Xing 已提交
858

859
/**
860 861 862 863 864 865 866 867 868 869
 * Distributed related FLAG
 * Name: FLAGS_allreduce_record_one_event
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_allreduce_record_one_event=true makes the allreduce
 *          operations wait for only one event instead of multiple events.
 * Note: Makes the allreduce operations wait for only one event instead of
 *       multiple events. Currently, only fuse allreduce supports this.
 *       Otherwise, the precision may be wrong.
 */
870 871
// Exported bool flag FLAGS_allreduce_record_one_event; off by default.
PADDLE_DEFINE_EXPORTED_bool(
    allreduce_record_one_event,
    /*default=*/false,
    "It controls whether the allreduce operations "
    "would only wait one event instead of multiple "
    "events. Currently, only fuse allreduce supports "
    "this. Otherwise, the precision may be wrong.");

877
#ifdef PADDLE_WITH_CINN
878
/*
879 880 881 882 883 884 885 886
 * CINN related FLAG
 * Name: FLAGS_use_cinn
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_use_cinn=true would run PaddlePaddle using CINN
 */
// Exported bool flag FLAGS_use_cinn; off by default. Only defined when
// PADDLE_WITH_CINN is set (see the enclosing #ifdef).
PADDLE_DEFINE_EXPORTED_bool(use_cinn,
                            /*default=*/false,
                            "It controls whether to run PaddlePaddle using "
                            "CINN");
887 888 889 890 891 892 893 894 895

/*
 * CINN related FLAG
 * Name: FLAGS_allow_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_allow_cinn_ops="mul;relu" would only cover `mul` and `relu`
 * when using CINN
 */
896 897
// Exported string flag FLAGS_allow_cinn_ops; empty by default.
PADDLE_DEFINE_EXPORTED_string(
    allow_cinn_ops,
    /*default=*/"",
    "It controls the cinn op subset to be used, "
    "which has the highest priority.");

/*
 * CINN related FLAG
 * Name: FLAGS_deny_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_deny_cinn_ops="mul;relu" would block `mul` and `relu` two ops
 * when using CINN
 */
909 910
// Exported string flag FLAGS_deny_cinn_ops; empty by default.
PADDLE_DEFINE_EXPORTED_string(
    deny_cinn_ops,
    /*default=*/"",
    "It controls the cinn op subset to be not used.");
912 913 914 915 916 917 918 919 920 921

/*
 * CINN related FLAG
 * Name: FLAGS_enable_pe_launch_cinn
 * Since Version: 2.3
 * Value Range: bool, default=true
 * Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled
 * instructions of a paddle graph with ParallelExecutor, otherwise with the
 * CINN compiled runtime program in sequential order.
 */
922 923
// Exported bool flag FLAGS_enable_pe_launch_cinn; on by default.
PADDLE_DEFINE_EXPORTED_bool(
    enable_pe_launch_cinn,
    /*default=*/true,
    "It controls whether to execute cinn compiled "
    "program with ParallelExecutor");

/*
 * CINN related FLAG
 * Name: FLAGS_enable_cinn_auto_tune
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its
 * auto-tune feature enabled
 */
935 936
// Exported bool flag FLAGS_enable_cinn_auto_tune; off by default.
PADDLE_DEFINE_EXPORTED_bool(
    enable_cinn_auto_tune,
    /*default=*/false,
    "It controls whether to use cinn with "
    "its auto-tune feature enabled");

940
#endif
941

942 943
// Slot-record related flags. These use the plain DEFINE_* macros rather than
// the PADDLE_DEFINE_EXPORTED_* variants used elsewhere in this file.

// Slot record pool max size; default 2000000.
DEFINE_int32(record_pool_max_size,
             /*default=*/2000000,
             "SlotRecordDataset slot record pool max size");

// Slot pool thread num; default 1.
DEFINE_int32(slotpool_thread_num,
             /*default=*/1,
             "SlotRecordDataset slot pool thread num");

// Off by default.
DEFINE_bool(enable_slotpool_wait_release,
            /*default=*/false,
            "enable slotrecord object wait release, default false");

// Off by default.
DEFINE_bool(enable_slotrecord_reset_shrink,
            /*default=*/false,
            "enable slotrecord object reset shrink memory, default false");

// Off by default.
DEFINE_bool(enable_ins_parser_file,
            /*default=*/false,
            "enable parser ins file, default false");
// Exported gpugraph_* flags. Each help string repeats the flag's default.

// Hash collision statistics for the hbm table; off by default.
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_hbm_table_collision_stat,
    /*default=*/false,
    "enable hash collisions stat for hbm table, default false");

// Load factor of the hbm table; default 0.75.
PADDLE_DEFINE_EXPORTED_double(gpugraph_hbm_table_load_factor,
                              /*default=*/0.75,
                              "the load factor of hbm table, default 0.75");

// Direct access between multiple GPU cards; off by default.
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_gpu_direct_access,
    /*default=*/false,
    "enable direct access between multi gpu cards, default false");

// Segment merge of gradients while pushing sparse; off by default.
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_segment_merge_grads,
    /*default=*/false,
    "enable segment merge gradients while push sparse, default false");

// Segment size used by segment gradient merge; default 128.
PADDLE_DEFINE_EXPORTED_uint64(
    gpugraph_merge_grads_segment_size,
    /*default=*/128,
    "segment size with segment gradient merge, default 128");

// Key dedup mode for pull/push sparse; default 0.
PADDLE_DEFINE_EXPORTED_int32(
    gpugraph_dedup_pull_push_mode,
    /*default=*/0,
    "enable dedup keys while pull push sparse, default 0");

// On by default.
PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm,
                            /*default=*/true,
                            "enable load_node_list_into_hbm, default true");
981 982 983 984 985 986 987 988 989 990 991 992

/**
 * ProcessGroupNCCL related FLAG
 * Name: nccl_blocking_wait
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: nccl blocking wait.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Exported bool flag FLAGS_nccl_blocking_wait; off by default.
PADDLE_DEFINE_EXPORTED_bool(
    nccl_blocking_wait,
    /*default=*/false,
    "nccl blocking wait");
#endif
993 994 995 996 997 998 999 1000 1001

/**
 * Autotune related FLAG
 * Name: FLAGS_use_autotune
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
 */
// Exported bool flag FLAGS_use_autotune; off by default.
PADDLE_DEFINE_EXPORTED_bool(
    use_autotune,
    /*default=*/false,
    "Whether enable autotune.");
1002

H
hong 已提交
1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013
/**
 * Conv Search cache max number related FLAG
 * Name: FLAGS_search_cache_max_number
 * Since Version: 2.3.0
 * Value Range: int32, default=1000000
 * Example:
 */
// Exported int32 flag FLAGS_search_cache_max_number; default 1000000.
PADDLE_DEFINE_EXPORTED_int32(
    search_cache_max_number,
    /*default=*/1000000,
    "search_cache_max_number.");

1014 1015 1016 1017 1018 1019 1020 1021 1022 1023
/**
 * Performance related FLAG
 * Name: einsum_opt
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, EinsumOp will be optimized by innercache reuse, which
 * uses more gpu memory.
 */
// Exported bool flag FLAGS_einsum_opt; off by default.
PADDLE_DEFINE_EXPORTED_bool(
    einsum_opt,
    /*default=*/false,
    "EinsumOp backward will be speedup at the expense of more gpu memory.");
1027 1028 1029 1030 1031 1032

/**
 * JitLayer related FLAG
 * Name: FLAGS_jit_engine_type
 * Since Version: 2.3.0
 * Value Range: string, {New, Predictor},
1033
 * default=Predictor
1034 1035
 * Example:
 * Note:
1036
 * FLAGS_jit_engine_type == New, using InterpreterEngine by default
1037
 * FLAGS_jit_engine_type == Predictor, using inference Predictor by default
1038 1039
 */
// Exported string flag FLAGS_jit_engine_type; defaults to "Predictor".
PADDLE_DEFINE_EXPORTED_string(jit_engine_type,
                              /*default=*/"Predictor",
                              "Choose default function type in JitLayer.");
1042

1043 1044 1045 1046 1047 1048 1049 1050 1051 1052
/**
 * Custom Device NPU related FLAG
 * Name: FLAGS_npu_storage_format
 * Since Version: 2.5.0
 * Value Range: bool, default=false
 * Example:
 * Note: Enable NPU Storage Format for Ascend910 performance improvement.
 */
// Exported bool flag FLAGS_npu_storage_format; off by default.
// The help text was previously empty; it now carries the documented note.
PADDLE_DEFINE_EXPORTED_bool(
    npu_storage_format,
    /*default=*/false,
    "Enable NPU Storage Format for Ascend910 performance improvement.");

1053 1054 1055 1056 1057 1058 1059 1060 1061 1062
#ifdef PADDLE_WITH_CUDNN_FRONTEND
/**
 * CUDNNv8 related FLAG
 * Name: enable_cudnn_frontend
 * Since Version: 2.5.0
 * Value Range: bool, default=false
 * Example:
 * Note: Enable CUDNNv8 Frontend API for CUDNN kernels.
 */
// Exported bool flag FLAGS_enable_cudnn_frontend; off by default.
// The help text was previously empty; it now carries the documented note.
PADDLE_DEFINE_EXPORTED_bool(
    enable_cudnn_frontend,
    /*default=*/false,
    "Enable CUDNNv8 Frontend API for CUDNN kernels.");
1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075

/**
 * CUDNNv8 related FLAG
 * Name: cudnn_cache_saturation_count
 * Since Version: 2.5.0
 * Value Range: int32, default=1
 * Example:
 * Note: Set saturation count for CUDNNv8 cache. A candidate execution
 * plan need to be considered as the fastest plan by exhaustive search
 * N times before it is actually added in the cache. It is useful when
 * the result of exhaustive search is unstable.
 */
// Exported int32 flag FLAGS_cudnn_cache_saturation_count; default 1.
// The help text was previously empty; it now carries the documented note.
PADDLE_DEFINE_EXPORTED_int32(
    cudnn_cache_saturation_count,
    /*default=*/1,
    "Saturation count for the CUDNNv8 cache: a candidate execution plan "
    "must be chosen as the fastest plan by exhaustive search this many "
    "times before it is added to the cache.");
1076
#endif  // PADDLE_WITH_CUDNN_FRONTEND