// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/core/flags.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h"
#endif

namespace phi {

const ExportedFlagInfoMap &GetExportedFlagInfoMap() {
  return *GetMutableExportedFlagInfoMap();
}

ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() {
  static ExportedFlagInfoMap g_exported_flag_info_map;
  return &g_exported_flag_info_map;
}
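
// Illustrative sketch, kept as a comment rather than compiled code: a consumer
// could walk the exported-flag registry defined above, assuming only that the
// map key is the flag name:
//
//   for (const auto &kv : phi::GetExportedFlagInfoMap()) {
//     VLOG(3) << "exported flag: " << kv.first;
//   }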

}  // namespace phi

PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism,
                             0,
                             "number of threads for inner op");

/**
 * NOTE(paddle-dev): This file is designed to define all public FLAGS.
 */

/**
 * Paddle initialization related FLAG
 * Name: FLAGS_paddle_num_threads
 * Since Version: 0.15.0
 * Value Range: int32, default=1
 * Example: FLAGS_paddle_num_threads=2, set the maximum thread number per
 * instance to 2
 * Note:
 */
PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads,
                             1,
                             "Number of threads for each paddle instance.");

/**
 * Low Precision Op related FLAG
 * Name: FLAGS_low_precision_op_list
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: Used to debug. Get the low precision op list of current module.
 */
PADDLE_DEFINE_EXPORTED_bool(low_precision_op_list,
                            false,
                            "Whether to collect the low precision op list of "
                            "the current module. The low precision op list "
                            "will be re-collected after the module runs.");

/**
 * Operator related FLAG
 * Name: FLAGS_check_nan_inf
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: Used to debug. Checking whether operators produce NAN/INF or not.
 */
PADDLE_DEFINE_EXPORTED_bool(
    check_nan_inf,
    false,
    "Checking whether operators produce NAN/INF or not. It will be "
    "extremely slow, so please use this flag wisely.");

/**
 * Operator related FLAG
 * Name: FLAGS_check_nan_inf_level
 * Since Version: 2.5.0
 * Value Range: int32, default=0
 * Example:
 * Note: Used to debug. Setting the check and print level when
 * FLAGS_check_nan_inf is set.
 * - 0, abort the process when any operator produces NAN/INF and only print
 * the information of the tensor which holds NAN/INF.
 * - 1, continue the training or inference process and print the information
 * of all tensors which hold NAN/INF.
 * - 2, print the information of float tensors when the max or min value
 * overflows float16's limits.
 * - 3, print the information of all tensors.
 */
PADDLE_DEFINE_EXPORTED_int32(
    check_nan_inf_level,
    0,
    "Setting the check and print level when FLAGS_check_nan_inf is set.");

/**
 * Operator related FLAG
 * Name: FLAGS_enable_opt_get_features
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: Whether to enable the optimized path for getting features.
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_opt_get_features,
    false,
    "Whether to enable the optimized path for getting features.");

// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_ASCEND_CL)

/**
 * CUDA related FLAG
 * Name: FLAGS_enable_cublas_tensor_op_math
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use Tensor Core, faster but it may lose precision.
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_cublas_tensor_op_math,
    false,
    "The enable_cublas_tensor_op_math indicates whether to use Tensor Core, "
    "but it may lose precision. Currently, there are two CUDA libraries that "
    "use Tensor Cores: cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up "
    "GEMM computations (the matrices must be either half precision or single "
    "precision); cuDNN uses Tensor Cores to speed up both convolutions (the "
    "input and output must be half precision) and recurrent neural networks "
    "(RNNs).");

/**
 * CUDA related FLAG
 * Name: FLAGS_gemm_use_half_precision_compute_type
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example:
 * Note: whether to use fp16 compute type when the input and output are fp16,
 * faster but it may lose precision.
 */
PADDLE_DEFINE_EXPORTED_bool(
    gemm_use_half_precision_compute_type,
    true,
    "Whether to use fp16 compute type when the input and output are fp16, "
    "faster but it may lose precision in most cases. If false, the compute "
    "type will be set to fp32. Default is true.");

/**
 * CUDA related FLAG
 * Name: FLAGS_selected_gpus
 * Since Version: 1.3.0
 * Value Range: integer list separated by comma, default empty list
 * Example: FLAGS_selected_gpus=0,1,2,3,4,5,6,7 to train or predict with 0~7 gpu
 * cards
 * Note: A list of device ids separated by comma, like: 0,1,2,3
 */
PADDLE_DEFINE_EXPORTED_string(
    selected_gpus,
    "",
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process has only one device (GPU). If you want to use "
    "all visible devices, set this to empty string. NOTE: the "
    "reason for doing this is that we want to use P2P communication "
    "between GPU devices; using CUDA_VISIBLE_DEVICES can only provide "
    "shared-memory communication.");
#endif

#if defined(PADDLE_WITH_CUDA)
/**
 * CUDA related FLAG
 * Name: FLAGS_cublaslt_exhaustive_search_times
 * Since Version: 2.3.0
 * Value Range: int64_t, default=0
 * Example:
 * Note: Represents times of exhaustive search to evaluate performance of
 *       cuBlasLt matmul algorithm (with/without epilogue). Set this flag
 *       with value > 0 to enable exhaustive search. Default is 0, means
 *       getting algorithms via heuristic search. There are two search methods
 *       in cuBlasLt, heuristic search and exhaustive search. Exhaustive search
 *       attempts all cuBlasLt algorithms to select the fastest, which is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification. Once you change the layer specifications
 *       (such as M, N and K), it will search again.
 */
PADDLE_DEFINE_EXPORTED_int64(
    cublaslt_exhaustive_search_times,
    0,
    "The times of exhaustive search for cuBlasLt matmul with/without "
    "epilogue algorithms; default is 0, which means disabling exhaustive "
    "search.");
#endif

#if defined(PADDLE_WITH_ASCEND_CL)
PADDLE_DEFINE_EXPORTED_string(
    selected_npus,
    "",
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process has only one device (NPU). If you want to use "
    "all visible devices, set this to empty string.");
PADDLE_DEFINE_EXPORTED_bool(
    hccl_check_nan,
    true,
    "Check NaN in tensors before hccl_allreduce_sum; otherwise it will "
    "core dump when it meets a NaN value.");
PADDLE_DEFINE_EXPORTED_string(
    npu_config_path,
    "",
    "The absolute path of the configuration json file, like: /tmp/config.json. "
    "If provided, it will be passed to aclInit().");
PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling,
                             1,
                             "Set the minimum loss scaling value.");
PADDLE_DEFINE_EXPORTED_string(
    npu_precision_mode,
    "",
    "NPU operator precision mode, options are 'force_fp32', 'force_fp16', "
    "'allow_fp32_to_fp16', 'must_keep_origin_dtype' and "
    "'allow_mix_precision'. If you want to use the default mode ("
    "allow_fp32_to_fp16), set this to empty string. For more details, "
    "please refer to the documentation.");
#endif

/*
 * Kernel related FLAG
 * Name: FLAGS_enable_api_kernel_fallback
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example: FLAGS_enable_api_kernel_fallback=true would allow the kernel of the
 * current backend to fall back to the CPU one when not found
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_api_kernel_fallback,
    true,
    "Whether to enable API kernel fallback to the CPU one when not found");

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_deterministic
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use deterministic algorithm in cudnn.
 *       If true, it will slow down some operators such as conv and pooling.
 */
PADDLE_DEFINE_EXPORTED_bool(
    cudnn_deterministic,
    false,
    "Whether to allow using an autotuning algorithm for the convolution "
    "operator. The autotuning algorithm may be non-deterministic. If "
    "true, the algorithm is deterministic.");

/**
 * CUDNN related FLAG
 * Name: FLAGS_conv_workspace_size_limit
 * Since Version: 0.13.0
 * Value Range: uint64, default=512 (MB)
 * Example:
 * Note: The internal function of cuDNN obtains the fastest matching algorithm
 *       within this memory limit. Usually, faster algorithms can be chosen in
 *       larger workspaces, but memory usage can also increase significantly.
 *       Users need to balance memory and speed.
 */
PADDLE_DEFINE_EXPORTED_int64(
    conv_workspace_size_limit,
    phi::backends::gpu::kDefaultConvWorkspaceSizeLimitMB,
    "cuDNN convolution workspace limit in MB unit.");

/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Represents whether an exhaustive search method is used to
 *       select a convolution algorithm. There are two search methods in cuDNN,
 *       heuristic search and exhaustive search. Exhaustive search attempts
 *       all cuDNN algorithms to select the fastest. This method is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification. Once you change the layer specifications
 *       (such as batch size, feature map size), it will search again.
 */
PADDLE_DEFINE_EXPORTED_bool(
    cudnn_exhaustive_search,
    false,
    "Whether to enable exhaustive search for cuDNN convolution or "
    "not; default is False.");

/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search_times
 * Since Version:
 * Value Range:
 * Example:
 * Note: only used in inference, for advanced developers
 */
PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times,
                             -1,
                             "Exhaustive search times for cuDNN convolution; "
                             "default is -1, meaning no exhaustive search.");

/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_batchnorm_spatial_persistent
 * Since Version: 1.4.0
 * Value Range: bool, default=false
 * Example:
 * Note: CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be
 *       faster in some tasks because an optimized path may be selected for
 *       CUDNN_DATA_FLOAT and CUDNN_DATA_HALF data types, compute capability
 *       6.0 or higher. The reason we set it to false by default is that this
 *       mode may use scaled atomic integer reduction that may cause a
 *       numerical overflow for certain input data ranges.
 */
PADDLE_DEFINE_EXPORTED_bool(
    cudnn_batchnorm_spatial_persistent,
    false,
    "Whether to enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
    "batch_norm; default is False.");
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

/**
 * NCCL related FLAG
 * Name: FLAGS_sync_nccl_allreduce
 * Since Version: 1.3
 * Value Range: bool, default=true
 * Example:
 * Note: see the asynchronous vs. synchronous nccl allreduce issue:
 *       https://github.com/PaddlePaddle/Paddle/issues/15049
 *       If you want to change this default value, explain why. (gongwb)
 */
PADDLE_DEFINE_EXPORTED_bool(
    sync_nccl_allreduce,
    true,
    "If set true, will call `cudaStreamSynchronize(nccl_stream)` "
    "after allreduce; this mode can get better performance in some scenarios.");
#endif

#ifdef PADDLE_WITH_DISTRIBUTE
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_max_merge_var_num
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: The maximum number of gradients to be merged into a gradient and
 *       sent through the communicator. The trainer puts all the gradients
 *       into the queue, and then the communicator takes the gradients out
 *       of the queue and sends them after merging.
 */
PADDLE_DEFINE_EXPORTED_int32(communicator_max_merge_var_num,
                             20,
                             "max var num to merge and send");
PADDLE_DEFINE_EXPORTED_bool(
    communicator_is_sgd_optimizer,
    true,
    "The gradient sent to the server is the sum of the gradients "
    "calculated by each thread if the optimizer is SGD.");
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_send_queue_size
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: Size for each gradient queue. The trainer puts the gradient into
 *       the queue, and then the communicator takes it out of the queue and
 *       sends it out. When the communicator is slow, the queue may be full,
 *       and the trainer will be continuously blocked before the queue has
 *       space. It is used to avoid training running much faster than
 *       communication, which would leave too many gradients not sent out
 *       in time.
 */
PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size,
                             20,
                             "queue size to receive gradients before sending");
#endif

/**
 * Distributed related FLAG
 * Name: FLAGS_dist_threadpool_size
 * Since Version: 1.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: Control the number of threads used for distributed modules.
 *       If it is not set, it defaults to the number of hardware threads.
 */
PADDLE_DEFINE_EXPORTED_int32(
    dist_threadpool_size,
    0,
    "Number of threads used for distributed execution.");

/**
 * Garbage collector related FLAG
 * Name: FLAGS_eager_delete_tensor_gb
 * Since Version: 1.0.0
 * Value Range: double, default=kDefaultEagerDeleteTensorGB
 * Example: FLAGS_eager_delete_tensor_gb=0.0, Release memory garbage once it is
 * no longer used.
 *          FLAGS_eager_delete_tensor_gb=1.0, Release memory garbage when
 * garbage occupies 1.0GB of memory.
 *          FLAGS_eager_delete_tensor_gb=-1.0, Disable garbage collection
 * policy.
 * Note: Represents whether a garbage collection strategy is used to optimize
 * network memory usage.
 *       It is recommended that users set FLAGS_eager_delete_tensor_gb=0.0 to
 *       enable garbage collection strategy when training large networks.
 */
// Disable gc by default when inference library is built
static const double kDefaultEagerDeleteTensorGB = 0;

PADDLE_DEFINE_EXPORTED_double(
    eager_delete_tensor_gb,
    kDefaultEagerDeleteTensorGB,
    "Memory size threshold (GB) when the garbage collector clears tensors. "
    "Disabled when this value is less than 0.");

/**
 * Memory related FLAG
 * Name: FLAGS_fast_eager_deletion_mode
 * Since Version: 1.3.0
 * Value Range: bool, default=true
 * Example:
 * Note: Whether to use fast garbage collection strategy.
 *       If not set, the GPU memory is released at the end of the CUDA kernel.
 *       Otherwise, the GPU memory will be released before the CUDA kernel
 *       has finished, which will make the garbage collection strategy faster.
 *       Only works when garbage collection strategy is enabled.
 */
PADDLE_DEFINE_EXPORTED_bool(
    fast_eager_deletion_mode,
    true,
    "Fast eager deletion mode. If enabled, memory would be released "
    "immediately without waiting for the GPU kernel to end.");

/**
 * Memory related FLAG
 * Name: FLAGS_memory_fraction_of_eager_deletion
 * Since Version: 1.4
 * Value Range: double [0.0, 1.0], default=1.0
 * Example:
 * Note: The percentage of memory size of garbage collection policy
 *       to release variables.
 *       If FLAGS_memory_fraction_of_eager_deletion = 1.0,
 *       all temporary variables in the network will be released.
 *       If FLAGS_memory_fraction_of_eager_deletion = 0.0,
 *       no temporary variables in the network are released.
 *       If 0.0 < FLAGS_memory_fraction_of_eager_deletion < 1.0,
 *       all temporary variables will be sorted in descending order
 *       according to their memory size, and only variables with the
 *       largest FLAGS_memory_fraction_of_eager_deletion ratio will be released.
 *       The flag is only valid when running parallel data compilers.
 */
PADDLE_DEFINE_EXPORTED_double(
    memory_fraction_of_eager_deletion,
    1.0,
    "Fraction of eager deletion. If less than 1.0, all variables in "
    "the program would be sorted according to their memory sizes, and "
    "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
    "variables would be deleted.");

/**
 * Allocator related FLAG
 * Name: FLAGS_allocator_strategy
 * Since Version: 1.2
 * Value Range: string, {naive_best_fit, auto_growth, thread_local},
 * default=auto_growth
 * Example:
 * Note: For selecting the allocator policy of PaddlePaddle.
 */
static constexpr char kDefaultAllocatorStrategy[] = "auto_growth";
PADDLE_DEFINE_EXPORTED_string(
    allocator_strategy,
    kDefaultAllocatorStrategy,
    "The allocation strategy, enum in [naive_best_fit, auto_growth]. "
    "naive_best_fit means the original pre-allocated allocator of Paddle. "
    "auto_growth means the auto-growth allocator. "
    "These two strategies differ in GPU memory allocation. "
    "naive_best_fit strategy would occupy almost all GPU memory by default, "
    "which prevents users from starting several Paddle jobs on the same GPU "
    "card but leads to less memory fragmentation (i.e., maximum batch "
    "size of models may be larger). auto_growth strategy would allocate "
    "GPU memory on demand, which allows users to start several Paddle jobs "
    "on the same GPU card but may lead to more memory fragmentation "
    "(i.e., maximum batch size of models may be smaller).");

/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cpu_memory_to_use
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=1
 * Example:
 * Note: Represents the proportion of allocated CPU memory blocks
 *       to the total memory size of the CPU. Future CPU memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough CPU memory, new memory blocks of the same
 *       size as the memory block will be allocated upon request until
 *       the CPU does not have enough memory.
 */
PADDLE_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use,
                              1,
                              "Default use 100% of CPU memory for PaddlePaddle, "
                              "reserving the rest for page tables, etc.");

/**
 * Memory related FLAG
 * Name: FLAGS_initial_cpu_memory_in_mb
 * Since Version: 0.14.0
 * Value Range: uint64, default=500 (MB)
 * Example:
 * Note: The CPU memory block size of the initial allocator in MB.
 *       The allocator takes the minimum values of
 *       FLAGS_initial_cpu_memory_in_mb and
 *       FLAGS_fraction_of_cpu_memory_to_use*(total physical memory)
 *       as memory block sizes.
 */
PADDLE_DEFINE_EXPORTED_uint64(
    initial_cpu_memory_in_mb,
    500ul,
    "Initial CPU memory for PaddlePaddle, in MB unit.");

/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cuda_pinned_memory_to_use
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=0.5
 * Example:
 * Note: Represents the proportion of allocated CUDA pinned memory blocks
 *       to the total memory size of the CPU. Future CUDA pinned memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough CUDA pinned memory, new memory blocks of the same
 *       size as the memory block will be allocated from the CPU upon
 *       request until the CPU does not have enough memory.
 */
PADDLE_DEFINE_EXPORTED_double(
    fraction_of_cuda_pinned_memory_to_use,
    0.5,
    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle, "
    "reserving the rest for page tables, etc.");

// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ||      \
    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \
    defined(PADDLE_WITH_CUSTOM_DEVICE)

/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_gpu_memory_to_use
 * Since Version: 1.2.0
 * Value Range: double, default=0.5 if win32, 0.92 else
 * Example:
 * Note: Represents the proportion of allocated memory blocks to the total
 *       memory size of the GPU. Future memory usage will be allocated from
 *       this memory block. If the memory block does not have enough GPU
 *       memory, new memory blocks of the same size as the memory block will
 *       be allocated from the GPU request until the GPU does not have enough
 *       memory.
 */

#ifndef _WIN32
constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
#else
// fraction_of_gpu_memory_to_use cannot be too high on windows,
// since the win32 graphic sub-system can occupy some GPU memory
// which may lead to insufficient memory left for paddle
constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
#endif
PADDLE_DEFINE_EXPORTED_double(
    fraction_of_gpu_memory_to_use,
    fraction_of_gpu_memory_to_use,
    "Allocate a chunk of gpu memory that is this fraction of the "
    "total gpu memory size. Future memory usage will be allocated "
    "from the chunk. If the chunk doesn't have enough gpu memory, "
    "additional chunks of the same size will be requested from gpu "
    "until the gpu has no memory left for another chunk.");

/**
 * Memory related FLAG
 * Name: FLAGS_initial_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: Allocate a specified size of GPU memory block. Later memory usage
 *       will be allocated from that memory block. If the memory block does not
 *       have enough GPU memory, the memory block with the size
 *       FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until
 *       the GPU has no remaining memory.
 */
PADDLE_DEFINE_EXPORTED_uint64(
    initial_gpu_memory_in_mb,
    0ul,
    "Allocate a chunk of gpu memory whose byte size is specified by "
    "the flag. Future memory usage will be allocated from the "
    "chunk. If the chunk doesn't have enough gpu memory, additional "
    "chunks of gpu memory will be requested from gpu with size "
    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
    "no memory left for the additional chunk. Note: if you set this "
    "flag, the memory size set by "
    "FLAGS_fraction_of_gpu_memory_to_use will be overridden by this "
    "flag. If you don't set this flag, PaddlePaddle will use "
    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory.");

/**
 * Memory related FLAG
 * Name: FLAGS_reallocate_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: If the allocated GPU memory blocks are exhausted,
 *       additional GPU memory blocks are reallocated
 */
PADDLE_DEFINE_EXPORTED_uint64(
    reallocate_gpu_memory_in_mb,
    0ul,
    "If this flag is set, Paddle will reallocate the gpu memory with "
    "size specified by this flag. Otherwise, Paddle will reallocate "
    "according to FLAGS_fraction_of_gpu_memory_to_use.");

PADDLE_DEFINE_EXPORTED_uint64(
    gpu_memory_limit_mb,
    0UL,
    "The maximum gpu memory limit that the process can allocate. "
    "If it is equal to 0, there would be no limit and all gpu memory "
    "would be available to the process. If it is larger than 0, "
    "the process would raise an out of memory error if the allocated "
    "memory exceeds the limit even though there is available "
    "memory on the gpu card. The unit is MB and the default value is 0.");

#endif

/**
 * Scope related FLAG
 * Name: local_exe_sub_scope_limit
 * Since Version: 1.6.0
 * Value Range: double, default=256 (MB)
 * Example:
 * Note:
 */
PADDLE_DEFINE_EXPORTED_double(
    local_exe_sub_scope_limit,
    256.0,  // MBytes
    "The upper memory limit of the sub-scopes of the local execution scope "
    "for each CUDAPlace. If you don't need to limit the memory, "
    "you should set FLAGS_local_exe_sub_scope_limit=-1. "
    "The default value is 256 MBytes.");

PADDLE_DEFINE_EXPORTED_bool(
    reader_queue_speed_test_mode,
    false,
    "If set true, the queue.pop will only get data from the queue but not "
    "remove the data from the queue, for speed testing.");

/**
 * MKLDNN related FLAG
 * Name: use_mkldnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note:
 */
PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");

/**
 * Debug related FLAG
 * Name: FLAGS_call_stack_level
 * Since Version: 2.0.0
 * Value Range: int, default=2
 * Example:
 * Note: Used to debug. Determine the call stack to print when an error or
 * exception happens.
 * If FLAGS_call_stack_level == 0, only the error message summary will be shown.
 * If FLAGS_call_stack_level == 1, the python stack and error message summary
 * will be shown.
 * If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
 * message summary will be shown.
 */
#ifdef PADDLE_NO_PYTHON
static const int32_t kDefaultCallStackLevel = 2;
#else
static const int32_t kDefaultCallStackLevel = 1;
#endif

PADDLE_DEFINE_EXPORTED_int32(
    call_stack_level,
    kDefaultCallStackLevel,
    "Determine the call stack to print when an error or exception happens. "
    // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
    // "If FLAGS_call_stack_level == 0, only the error message summary will be "
    // "shown. "
    "If FLAGS_call_stack_level == 1, the python stack and error message "
    "summary will be shown. "
    "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
    "error message summary will be shown.");

/**
 * Debug related FLAG
 * Name: sort_sum_gradient
 * Since Version: 2.0.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, gradients are summed by the reverse order of
 * the forward execution sequence.
 */
PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient,
                            false,
                            "Sum gradients by the reverse order of "
                            "the forward execution sequence.");

/**
 * Performance related FLAG
 * Name: max_inplace_grad_add
 * Since Version: 2.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: The maximum number of inplace grad_add.
 */
PADDLE_DEFINE_EXPORTED_int32(
    max_inplace_grad_add,
    0,
    "The maximum number of inplace grad_add. When doing "
    "gradient accumulation, if the number of gradients that need to be "
    "accumulated is less than FLAGS_max_inplace_grad_add, several grad_add "
    "operations will be used instead of one sum. Default is 0.");

/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_on
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be enabled.
 */
PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on,
                              "",
                              "List of OneDNN operation types to be turned on");

/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_off
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be disabled.
 */
PADDLE_DEFINE_EXPORTED_string(
    tracer_mkldnn_ops_off,
    "",
    "List of OneDNN operation types to be turned off");

/**
 * Debug related FLAG
 * Name: check_kernel_launch
 * Since Version: 2.1.0
 * Value Range: bool, default=false
 * Example:
 * Note: Check kernel launch status after every kernel compute.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_DEFINE_EXPORTED_bool(
    check_kernel_launch,
    false,
    "Check kernel launch status after every kernel compute");
#endif

/**
 * CUDNN related FLAG
 * Name: conv2d_disable_cudnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: Disable cudnn in conv2d.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_DEFINE_EXPORTED_bool(conv2d_disable_cudnn,
                            false,
                            "Disable cudnn in conv2d");

PADDLE_DEFINE_EXPORTED_bool(use_fast_math,
                            false,
                            "Whether to use fast math GPU functions.");
#endif

/**
 * Distributed related FLAG
 * Name: FLAGS_get_host_by_name_time
 * Since Version: 2.2.0
 * Value Range: int32, default=120
 * Example:
 * Note: Get host by name time.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) ||      \
    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_MLU)
PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time,
                             120,
                             "The maximum time for getting host by name");
#endif

/**
 * Distributed related FLAG
 * Name: FLAGS_apply_pass_to_program
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_apply_pass_to_program=true would apply IR Pass to
 *          program when using Fleet APIs.
 * Note: Apply IR pass to program. Only useful when using Fleet APIs.
 */
PADDLE_DEFINE_EXPORTED_bool(
    apply_pass_to_program,
    false,
    "It controls whether to apply IR pass to program when using Fleet APIs");

/**
 * Distributed related FLAG
 * Name: FLAGS_graph_load_in_parallel
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Control whether to load graph nodes and edges with multiple threads
 *       in parallel. If it is not set, graph data is loaded with one thread.
 */
PADDLE_DEFINE_EXPORTED_bool(graph_load_in_parallel,
                            false,
                            "It controls whether to load graph nodes and "
                            "edges with multiple threads in parallel.");

/**
 * Distributed related FLAG
 * Name: FLAGS_graph_get_neighbor_id
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Control whether to get all neighbor ids when running a sub-part
 *       graph. If it is not set, there is no need to get neighbor ids when
 *       running the whole graph.
 */
PADDLE_DEFINE_EXPORTED_bool(
    graph_get_neighbor_id,
    false,
    "It controls whether to get all neighbor ids when running a sub-part "
    "graph.");

/**
 * KP kernel related FLAG
 * Name: FLAGS_run_kp_kernel
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example: FLAGS_run_kp_kernel=true would use the kp kernel to compute in the
 * Op.
 * Note:
 */
PADDLE_DEFINE_EXPORTED_bool(run_kp_kernel,
                            false,
                            "It controls whether to run PaddlePaddle using KP");

/**
 * Distributed related FLAG
 * Name: FLAGS_allreduce_record_one_event
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_allreduce_record_one_event=true makes the allreduce
 *          operations wait on only one event instead of multiple events.
 * Note: Make the allreduce operations wait on only one event instead of
 *       multiple events. Currently, only fuse allreduce supports this.
 *       Otherwise, the precision may be wrong.
 */
PADDLE_DEFINE_EXPORTED_bool(allreduce_record_one_event,
                            false,
                            "It controls whether the allreduce operations "
                            "would wait on only one event instead of multiple "
                            "events. Currently, only fuse allreduce supports "
                            "this. Otherwise, the precision may be wrong.");

#ifdef PADDLE_WITH_CINN
/*
 * CINN related FLAG
 * Name: FLAGS_use_cinn
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_use_cinn=true would run PaddlePaddle using CINN
 */
PADDLE_DEFINE_EXPORTED_bool(
    use_cinn, false, "It controls whether to run PaddlePaddle using CINN");

/*
 * CINN related FLAG
 * Name: FLAGS_allow_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_allow_cinn_ops="mul;relu" would only cover `mul` and `relu`
 * when using CINN
 */
PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops,
                              "",
                              "It controls the cinn op subset to be used, "
                              "which has the highest priority.");

/*
 * CINN related FLAG
 * Name: FLAGS_deny_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_deny_cinn_ops="mul;relu" would block `mul` and `relu` two ops
 * when using CINN
 */
PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops,
                              "",
                              "It controls the cinn op subset that is not to "
                              "be used.");

/*
 * CINN related FLAG
 * Name: FLAGS_enable_pe_launch_cinn
 * Since Version: 2.3
 * Value Range: bool, default=true
 * Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled
 * instructions of a paddle graph with ParallelExecutor, otherwise with the
 * CINN compiled runtime program in sequential order.
 */
PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn,
                            true,
                            "It controls whether to execute cinn compiled "
                            "program with ParallelExecutor");

/*
 * CINN related FLAG
 * Name: FLAGS_enable_cinn_auto_tune
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its
 * auto-tune feature enabled
 */
PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune,
                            false,
                            "It controls whether to use cinn with "
                            "its auto-tune feature enabled");

#endif

DEFINE_int32(record_pool_max_size,
             2000000,
             "SlotRecordDataset slot record pool max size");
DEFINE_int32(slotpool_thread_num, 1, "SlotRecordDataset slot pool thread num");
DEFINE_bool(enable_slotpool_wait_release,
            false,
            "enable slotrecord object wait release, default false");
DEFINE_bool(enable_slotrecord_reset_shrink,
            false,
            "enable slotrecord object reset shrink memory, default false");
DEFINE_bool(enable_ins_parser_file,
            false,
            "enable parser ins file, default false");
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_hbm_table_collision_stat,
    false,
    "enable hash collisions stat for hbm table, default false");
PADDLE_DEFINE_EXPORTED_double(gpugraph_hbm_table_load_factor,
                              0.75,
                              "the load factor of hbm table, default 0.75");
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_gpu_direct_access,
    false,
    "enable direct access bwtween multi gpu cards, default false");
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_segment_merge_grads,
    false,
    "enable segment merge gradients while push sparse, default false");
PADDLE_DEFINE_EXPORTED_uint64(
    gpugraph_merge_grads_segment_size,
    128,
    "segment size with segment gradient merge, default 128");
PADDLE_DEFINE_EXPORTED_int32(
    gpugraph_dedup_pull_push_mode,
    0,
    "enable dedup keys while pull push sparse, default 0");
PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm,
                            true,
                            "enable load_node_list_into_hbm, default true");

/**
 * ProcessGroupNCCL related FLAG
 * Name: nccl_blocking_wait
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: nccl blocking wait.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait");
#endif

/**
 * Autotune related FLAG
 * Name: FLAGS_use_autotune
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
 */
PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune.");

/**
 * Conv Search cache max number related FLAG
 * Name: FLAGS_search_cache_max_number
 * Since Version: 2.3.0
 * Value Range: int32, default=1000000
 * Example:
 */
PADDLE_DEFINE_EXPORTED_int32(search_cache_max_number,
                             1000000,
                             "search_cache_max_number.");

/**
 * Performance related FLAG
 * Name: einsum_opt
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, EinsumOp will be optimized by inner-cache reuse, which
 * uses more gpu memory.
 */
PADDLE_DEFINE_EXPORTED_bool(
    einsum_opt,
    false,
    "EinsumOp backward will be sped up at the expense of more gpu memory.");

/**
 * JitLayer related FLAG
 * Name: FLAGS_jit_engine_type
 * Since Version: 2.3.0
 * Value Range: string, {New, Predictor},
 * default=Predictor
 * Example:
 * Note:
 * FLAGS_jit_engine_type == New, using InterpreterEngine by default
 * FLAGS_jit_engine_type == Predictor, using inference Predictor by default
 */
PADDLE_DEFINE_EXPORTED_string(jit_engine_type,
                              "Predictor",
                              "Choose default function type in JitLayer.");

/**
 * Custom Device NPU related FLAG
 * Name: FLAGS_npu_storage_format
 * Since Version: 2.5.0
 * Value Range: bool, default=false
 * Example:
 * Note: Enable NPU Storage Format for Ascend910 performance improvement.
 */
PADDLE_DEFINE_EXPORTED_bool(npu_storage_format, false, "");

#ifdef PADDLE_WITH_CUDNN_FRONTEND
/**
 * CUDNNv8 related FLAG
 * Name: enable_cudnn_frontend
 * Since Version: 2.5.0
 * Value Range: bool, default=false
 * Example:
 * Note: Enable CUDNNv8 Frontend API for CUDNN kernels.
 */
PADDLE_DEFINE_EXPORTED_bool(enable_cudnn_frontend, false, "");

/**
 * CUDNNv8 related FLAG
 * Name: cudnn_cache_saturation_count
 * Since Version: 2.5.0
 * Value Range: int32, default=1
 * Example:
 * Note: Set saturation count for CUDNNv8 cache. A candidate execution
 * plan need to be considered as the fastest plan by exhaustive search
 * N times before it is actually added in the cache. It is useful when
 * the result of exhaustive search is unstable.
 */
PADDLE_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, "");
#endif  // PADDLE_WITH_CUDNN_FRONTEND