flags.cc 45.8 KB
Newer Older
1
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2
// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
3 4 5 6 7 8 9 10 11 12 13 14 15
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

16
#include "paddle/phi/core/flags.h"
17
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
18
#include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h"
19 20
#endif

21
namespace phi {
Z
Zeng Jinle 已提交
22 23 24 25 26 27 28 29 30 31

// Read-only view of the exported-flag registry: dereferences the pointer
// handed back by GetMutableExportedFlagInfoMap().
const ExportedFlagInfoMap &GetExportedFlagInfoMap() {
  const ExportedFlagInfoMap *registry = GetMutableExportedFlagInfoMap();
  return *registry;
}

// Hands out the address of a function-local static ExportedFlagInfoMap, so
// every call returns a pointer to the same object.
ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() {
  static ExportedFlagInfoMap exported_flags;
  ExportedFlagInfoMap *const registry = &exported_flags;
  return registry;
}

32
}  // namespace phi
Z
Zeng Jinle 已提交
33

34 35 36
// FLAGS_inner_op_parallelism: int32, default 0.
PHI_DEFINE_EXPORTED_int32(
    inner_op_parallelism,
    0,
    "number of threads for inner op");
37

38 39 40 41
/**
 * NOTE(paddle-dev): This file is designed to define all public FLAGS.
 */

42 43 44 45 46 47 48 49 50
/**
 * Paddle initialization related FLAG
 * Name: FLAGS_paddle_num_threads
 * Since Version: 0.15.0
 * Value Range: int32, default=1
 * Example: FLAGS_paddle_num_threads=2, set the maximum thread number per
 * instance to 2
 * Note:
 */
51 52 53
// FLAGS_paddle_num_threads: int32, default 1.
PHI_DEFINE_EXPORTED_int32(
    paddle_num_threads,
    1,
    "Number of threads for each paddle instance.");
54

55 56 57
/**
 * Low Precision Op related FLAG
 * Name: FLAGS_low_precision_op_list
58 59
 * Since Version: 2.5.0
 * Value Range: int32, default=0
60 61
 * Example:
 * Note: Used to debug. Get the low precision op list of current module.
62 63 64
 * Supported levels:
 * - 1, return the low precision op list of current module.
 * - 2, return the op list of current module.
65
 */
66 67 68 69 70
// FLAGS_low_precision_op_list: int32, default 0.
// The adjacent literals are concatenated by the compiler, so each fragment
// that ends mid-sentence must carry its own trailing space.
PHI_DEFINE_EXPORTED_int32(low_precision_op_list,
                          0,
                          "Setting the level of low precision op "
                          "list printing. It will be return the "
                          "low precision op list of current module.");
71

72 73 74 75 76 77 78 79
/**
 * Operator related FLAG
 * Name: FLAGS_check_nan_inf
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: Used to debug. Checking whether operator produce NAN/INF or not.
 */
80
// FLAGS_check_nan_inf: bool, default false.
PHI_DEFINE_EXPORTED_bool(
    check_nan_inf,
    false,
    "Checking whether operator produce NAN/INF or not. It will be "
    "extremely slow so please use this flag wisely.");
85

86 87
/**
 * Operator related FLAG
88
 * Name: FLAGS_check_nan_inf_level
89
 * Since Version: 2.5.0
90
 * Value Range: int32, default=0
91
 * Example:
92 93 94 95 96 97 98 99 100
 * Note: Used to debug. Setting the check and print level when
 * FLAGS_check_nan_inf is set.
 * - 0, abort the process when any operator produce NAN/INF and only print the
 * information of tensor which holds NAN/INF.
 * - 1, continue the training or inference process and print the information of
 * all tensors which holds NAN/INF.
 * - 2, print the information of float tensors when the max or min value
 * overflowing float16's limit.
 * - 3, print the information of all tensors.
101
 */
102
// FLAGS_check_nan_inf_level: int32, default 0.
PHI_DEFINE_EXPORTED_int32(
    check_nan_inf_level,
    0,
    "Setting the check and print level when FLAGS_check_nan_inf is set.");
106

D
danleifeng 已提交
107 108 109 110 111 112 113 114
/**
 * Operator related FLAG
 * Name: FLAGS_check_nan_inf
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: Used to debug. Checking whether operator produce NAN/INF or not.
 */
115
PHI_DEFINE_EXPORTED_bool(
D
danleifeng 已提交
116 117 118 119 120
    enable_opt_get_features,
    false,
    "Checking whether operator produce NAN/INF or not. It will be "
    "extremely slow so please use this flag wisely.");

121 122
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
123
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
124 125 126 127 128 129 130 131 132

/**
 * CUDA related FLAG
 * Name: FLAGS_enable_cublas_tensor_op_math
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use Tensor Core, faster but it may loss precision.
 */
133
// FLAGS_enable_cublas_tensor_op_math: bool, default false.
PHI_DEFINE_EXPORTED_bool(
    enable_cublas_tensor_op_math,
    false,
    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
    "but it may loss precision. Currently, There are two CUDA libraries that"
    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
    " GEMM computations(the matrices must be either half precision or single "
    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
    "input and output must be half precision) and recurrent neural networks "
    "(RNNs).");

144 145 146 147
/**
 * CUDA related FLAG
 * Name: FLAGS_gemm_use_half_precision_compute_type
 * Since Version: 2.4
148
 * Value Range: bool, default=false
149 150 151 152
 * Example:
 * Note: whether to use fp16 compute type when the input and output is fp16,
 * faster but it may loss precision.
 */
153
// FLAGS_gemm_use_half_precision_compute_type: bool, default false.
PHI_DEFINE_EXPORTED_bool(
    gemm_use_half_precision_compute_type,
    false,
    "Whether to use fp16 compute type when the input and output is fp16, "
    "faster but it may loss precision in most case. If true, the compute "
    "type will be set to fp16. Default is false.");
159

160 161 162 163 164 165 166 167 168
/**
 * CUDA related FLAG
 * Name: FLAGS_selected_gpus
 * Since Version: 1.3.0
 * Value Range: integer list separated by comma, default empty list
 * Example: FLAGS_selected_gpus=0,1,2,3,4,5,6,7 to train or predict with 0~7 gpu
 * cards
 * Note: A list of device ids separated by comma, like: 0,1,2,3
 */
169
// FLAGS_selected_gpus: string, default empty.
// Each literal fragment that ends mid-sentence keeps a trailing space so the
// concatenated help text does not run words together.
PHI_DEFINE_EXPORTED_string(
    selected_gpus,
    "",
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (GPU). If you want to use "
    "all visible devices, set this to empty string. NOTE: the "
    "reason of doing this is that we want to use P2P communication "
    "between GPU devices, use CUDA_VISIBLE_DEVICES can only use "
    "share-memory only.");
179 180
#endif

181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
#if defined(PADDLE_WITH_CUDA)
/**
 * CUDA related FLAG
 * Name: FLAGS_cublaslt_exhaustive_search_times
 * Since Version: 2.3.0
 * Value Range: int64_t, default=0
 * Example:
 * Note: Represents times of exhaustive search to evaluate performance of
 *       cuBlasLt matmul algorithm (with/without epilogue). Set this flag
 *       with value > 0 to enable exhaustive search. Default is 0, means
 *       getting algorithms via heuristic search. There are two search methods
 *       in cuBlasLt, heuristic search and exhaustive search. Exhaustive search
 *       attempts all cuBlasLt algorithms to select the fastest, which is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification Once you change the layer specifications
 *       (such as M, N and K), it will re-search again.
 */
198
// FLAGS_cublaslt_exhaustive_search_times: int64, default 0.
PHI_DEFINE_EXPORTED_int64(
    cublaslt_exhaustive_search_times,
    0,
    "The times of exhaustive search for cuBlasLt matmul with/without "
    " epilogue algorithms, default is 0, means disabling exhaustive search.");
#endif

205 206 207 208 209 210 211 212
/*
 * Kernel related FLAG
 * Name: FLAGS_enable_api_kernel_fallback
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example: FLAGS_enable_api_kernel_fallback=true would allow kernel of current
 * backend fallback to CPU one when not found
 */
213
// FLAGS_enable_api_kernel_fallback: bool, default true.
PHI_DEFINE_EXPORTED_bool(
    enable_api_kernel_fallback,
    true,
    "Whether enable api kernel fallback to CPU one when not found");

218
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
219 220 221 222 223 224 225 226 227
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_deterministic
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use deterministic algorithm in cudnn.
 *       If true, it will slow down some operators such as conv and pooling.
 */
228
// FLAGS_cudnn_deterministic: bool, default false.
PHI_DEFINE_EXPORTED_bool(
    cudnn_deterministic,
    false,
    "Whether allow using an autotuning algorithm for convolution "
    "operator. The autotuning algorithm may be non-deterministic. If "
    "true, the algorithm is deterministic.");
234 235 236 237 238

/**
 * CUDA related FLAG
 * Name: FLAGS_embedding_deterministic
 * Since Version: 2.5
239
 * Value Range: int64, default=0
240 241
 * Example:
 * Note: whether to use deterministic algorithm in embedding op.
242 243 244
 *       If it is 1, it will use the optimized deterministic CUDA kernel in
 *       embedding op. If it is 2, it will use the legacy deterministic
 *       CUDA kernel in embedding op.
245
 */
246
// FLAGS_embedding_deterministic: int64, default 0.
PHI_DEFINE_EXPORTED_int64(
    embedding_deterministic,
    0,
    "Whether allow using an deterministic algorithm for embedding "
    "operator. The deterministic algorithm may be slower. If "
    "it is larger than 0, the algorithm is deterministic.");
252

253 254 255 256
/**
 * CUDNN related FLAG
 * Name: FLAGS_conv_workspace_size_limit
 * Since Version: 0.13.0
257
 * Value Range: int64, default=512 (MB)
258 259 260 261 262 263 264
 * Example:
 * Note: The internal function of cuDNN obtains the fastest matching algorithm
 *       within this memory limit. Usually, faster algorithms can be chosen in
 *       larger workspaces, but memory space can also be significantly
 * increased.
 *       Users need to balance memory and speed.
 */
265 266 267
// FLAGS_conv_workspace_size_limit: int64; the default comes from
// phi::backends::gpu::kDefaultConvWorkspaceSizeLimitMB.
PHI_DEFINE_EXPORTED_int64(
    conv_workspace_size_limit,
    phi::backends::gpu::kDefaultConvWorkspaceSizeLimitMB,
    "cuDNN convolution workspace limit in MB unit.");
268

269 270 271 272 273 274 275 276 277 278 279 280 281 282
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Represents whether an exhaustive search method is used to
 *       select a convolution algorithm. There are two search methods in cuDNN,
 *       heuristic search and exhaustive search. Exhaustive search attempts
 *       all cuDNN algorithms to select the fastest. This method is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification. Once you change the layer specifications
 *       (such as batch size, feature map size), it will search again.
 */
283
// FLAGS_cudnn_exhaustive_search: bool, default false.
PHI_DEFINE_EXPORTED_bool(
    cudnn_exhaustive_search,
    false,
    "Whether enable exhaustive search for cuDNN convolution or "
    "not, default is False.");
288

289 290 291 292 293 294 295 296
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search_times
 * Since Version:
 * Value Range:
 * Example:
 * Note: only used to predict for advanced developer
 */
297 298 299 300
// FLAGS_cudnn_exhaustive_search_times: int64, default -1.
PHI_DEFINE_EXPORTED_int64(
    cudnn_exhaustive_search_times,
    -1,
    "Exhaustive search times for cuDNN convolution, "
    "default is -1, not exhaustive search");
301

302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_batchnorm_spatial_persistent
 * Since Version: 1.4.0
 * Value Range: bool, default=false
 * Example:
 * Note: CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be
 * faster in
 *       some tasks because an optimized path may be selected for
 * CUDNN_DATA_FLOAT
 *       and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The
 *       reason we set it to false by default is that this mode may use scaled
 *       atomic integer reduction that may cause a numerical overflow for
 * certain
 *       input data range.
 */
318
// FLAGS_cudnn_batchnorm_spatial_persistent: bool, default false.
PHI_DEFINE_EXPORTED_bool(
    cudnn_batchnorm_spatial_persistent,
    false,
    "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
    "batch_norm, default is False.");
323 324
#endif

325
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
326 327 328

/**
 * NCCL related FLAG
329 330 331
 * Name: FLAGS_sync_nccl_allreduce
 * Since Version: 1.3
 * Value Range: bool, default=true
332 333 334 335 336
 * Example:
 * Note: asynchronous nccl allreduce or synchronous issue:
 *       https://github.com/PaddlePaddle/Paddle/issues/15049
 *       If you want to change this default value, why?(gongwb)
 */
337
// FLAGS_sync_nccl_allreduce: bool, default true.
// The first literal ends with a space so that the concatenated help text
// reads "...(nccl_stream)` after allreduce".
PHI_DEFINE_EXPORTED_bool(
    sync_nccl_allreduce,
    true,
    "If set true, will call `cudaStreamSynchronize(nccl_stream)` "
    "after allreduce, this mode can get better performance in some scenarios.");
#endif

#ifdef PADDLE_WITH_DISTRIBUTE
345 346 347 348 349 350 351 352 353 354 355
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_max_merge_var_num
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: The maximum number of gradients to be merged into a gradient and
 *       sent through the communicator. The trainer puts all the gradients
 *       into the queue, and then the communicator takes the gradients out
 *       of the queue and sends them after merging.
 */
356 357 358 359
// FLAGS_communicator_max_merge_var_num: int32, default 20.
PHI_DEFINE_EXPORTED_int32(
    communicator_max_merge_var_num,
    20,
    "max var num to merge and send");
// FLAGS_communicator_is_sgd_optimizer: bool, default true.
PHI_DEFINE_EXPORTED_bool(
    communicator_is_sgd_optimizer,
    true,
    "gradient sent to the server is the sum of the gradients "
    "calculated by each thread if optimizer is sgd");
364 365 366 367 368 369 370 371 372 373 374 375 376
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_send_queue_size
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: Size for each gradient queue. The trainer puts the gradient into
 *       the queue, and then the communicator takes it out of the queue and
 *       sends it out. When the communicator is slow, the queue may be full,
 *       and the trainer will be continuously blocked before the queue has
 *       space. It is used to avoid training much faster than communication,
 *       so that too many gradients are not sent out in time.
 */
377 378 379
// FLAGS_communicator_send_queue_size: int32, default 20.
PHI_DEFINE_EXPORTED_int32(
    communicator_send_queue_size,
    20,
    "queue size to recv gradient before send");
380 381
#endif

382 383 384 385 386 387 388 389 390
/**
 * Distributed related FLAG
 * Name: FLAGS_dist_threadpool_size
 * Since Version: 1.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: Control the number of threads used for distributed modules.
 *       If it is not set, it is set to a hard thread.
 */
391 392 393
// FLAGS_dist_threadpool_size: int32, default 0.
PHI_DEFINE_EXPORTED_int32(
    dist_threadpool_size,
    0,
    "number of threads used for distributed executed.");
394

395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410
/**
 * Garbage collector related FLAG
 * Name: FLAGS_eager_delete_tensor_gb
 * Since Version: 1.0.0
 * Value Range: double, default=kDefaultEagerDeleteTensorGB
 * Example: FLAGS_eager_delete_tensor_gb=0.0, Release memory garbage once it is
 * no longer used.
 *          FLAGS_eager_delete_tensor_gb=1.0, Release memory garbage when
 * garbage occupies 1.0GB of memory.
 *          FLAGS_eager_delete_tensor_gb=-1.0, Disable garbage collection
 * policy.
 * Note: Represents whether a garbage collection strategy is used to optimize
 * network memory usage.
 *       It is recommended that users set FLAGS_eager_delete_tensor_gb=0.0 to
 *       enable garbage collection strategy when training large networks.
 */
411 412 413
// Disable gc by default when inference library is built
static const double kDefaultEagerDeleteTensorGB = 0;

414
PHI_DEFINE_EXPORTED_double(
415 416
    eager_delete_tensor_gb,
    kDefaultEagerDeleteTensorGB,
417 418 419
    "Memory size threshold (GB) when the garbage collector clear tensors."
    "Disabled when this value is less than 0");

420 421 422 423 424 425 426 427 428 429 430 431
/**
 * Memory related FLAG
 * Name: FLAGS_fast_eager_deletion_mode
 * Since Version: 1.3.0
 * Value Range: bool, default=true
 * Example:
 * Note: Whether to use fast garbage collection strategy.
 *       If not set, the GPU memory is released at the end of the CUDA kernel.
 *       Otherwise, the GPU memory will be released before the CUDA kernel
 *       has finished, which will make the garbage collection strategy faster.
 *       Only works when garbage collection strategy is enabled.
 */
432
// FLAGS_fast_eager_deletion_mode: bool, default true.
PHI_DEFINE_EXPORTED_bool(
    fast_eager_deletion_mode,
    true,
    "Fast eager deletion mode. If enabled, memory would release "
    "immediately without waiting GPU kernel ends.");
437

438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455
/**
 * Memory related FLAG
 * Name: FLAGS_memory_fraction_of_eager_deletion
 * Since Version: 1.4
 * Value Range: double [0.0, 1.0], default=1.0
 * Example:
 * Note: The percentage of memory size of garbage collection policy
 *       to release variables.
 *       If FLAGS_memory_fraction_of_eager_deletion = 1.0,
 *       all temporary variables in the network will be released.
 *       If FLAGS_memory_fraction_of_eager_deletion = 0.0,
 *       no temporary variables in the network are released.
 *       If 0.0 < FLAGS_memory_fraction_of_eager_deletion < 1.0,
 *       all temporary variables will be sorted in descending order
 *       according to their memory size, and only variables with the
 *       largest FLAGS_memory_fraction_of_eager_deletion ratio will be released.
 *       The flag is only valid when running parallel data compilers.
 */
456
// FLAGS_memory_fraction_of_eager_deletion: double, default 1.0.
PHI_DEFINE_EXPORTED_double(
    memory_fraction_of_eager_deletion,
    1.0,
    "Fraction of eager deletion. If less than 1.0, all variables in "
    "the program would be sorted according to its memory size, and "
    "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
    "variables would be deleted.");
463

464 465 466 467
/**
 * Allocator related FLAG
 * Name: FLAGS_allocator_strategy
 * Since Version: 1.2
468 469
 * Value Range: string, {naive_best_fit, auto_growth, thread_local},
 * default=auto_growth
470
 * Example:
471
 * Note: For selecting allocator policy of PaddlePaddle.
472
 */
473
static constexpr char kDefaultAllocatorStrategy[] = "auto_growth";  // NOLINT
474
PHI_DEFINE_EXPORTED_string(
475 476
    allocator_strategy,
    kDefaultAllocatorStrategy,
477 478 479 480 481 482 483 484 485 486 487
    "The allocation strategy, enum in [naive_best_fit, auto_growth]. "
    "naive_best_fit means the original pre-allocated allocator of Paddle. "
    "auto_growth means the auto-growth allocator. "
    "These two strategies differ in GPU memory allocation. "
    "naive_best_fit strategy would occupy almost all GPU memory by default, "
    "which prevents users from starting several Paddle jobs on the same GPU "
    "card but leads to less memory fragmentation (i.e., maximum batch "
    "size of models may be larger). auto_growth strategy would allocate "
    "GPU memory on demand, which allows users to start several Paddle jobs "
    "on the same GPU card but may lead to more memory fragmentation "
    "(i.e., maximum batch size of models may be smaller).");
488

489 490 491
/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cpu_memory_to_use
492 493
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=1
494
 * Example:
495 496 497 498 499 500
 * Note: Represents the proportion of allocated CPU memory blocks
 *       to the total memory size of the CPU. Future CPU memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough CPU memory, new memory blocks of the same
 *       size as the memory block will be allocated from the CPU
 *       request until the CPU does not have enough memory.
501
 */
502 503 504 505
// FLAGS_fraction_of_cpu_memory_to_use: double, default 1.
// The first literal ends with ", " so the concatenated help text does not
// read "PaddlePaddle,reserve".
PHI_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use,
                           1,
                           "Default use 100% of CPU memory for PaddlePaddle, "
                           "reserve the rest for page tables, etc");
506 507 508 509 510 511 512 513 514 515 516 517 518

/**
 * Memory related FLAG
 * Name: FLAGS_initial_cpu_memory_in_mb
 * Since Version: 0.14.0
 * Value Range: uint64, default=500 (MB)
 * Example:
 * Note: The CPU memory block size of the initial allocator in MB.
 *       The allocator takes the minimum values of
 *       FLAGS_initial_cpu_memory_in_mb and
 *       FLAGS_fraction_of_cpu_memory_to_use*(total physical memory)
 *       as memory block sizes.
 */
519 520 521
// FLAGS_initial_cpu_memory_in_mb: uint64, default 500. The flag name gives
// the unit as MB, so the help text says "MB" as well.
PHI_DEFINE_EXPORTED_uint64(initial_cpu_memory_in_mb,
                           500ul,
                           "Initial CPU memory for PaddlePaddle, in MB unit.");
522

523 524 525
/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cuda_pinned_memory_to_use
526 527
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=0.5
528
 * Example:
529 530 531 532 533 534
 * Note: Represents the proportion of allocated CUDA pinned memory blocks
 *       to the total memory size of the CPU. Future CUDA pinned memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough CUDA pinned memory, new memory blocks of the same
 *       size as the memory block will be allocated from the CUDA pinned
 *       request until the CPU does not have enough memory.
535
 */
536
// FLAGS_fraction_of_cuda_pinned_memory_to_use: double, default 0.5.
// The first literal ends with ", " so the concatenated help text does not
// read "PaddlePaddle,reserve".
PHI_DEFINE_EXPORTED_double(
    fraction_of_cuda_pinned_memory_to_use,
    0.5,
    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle, "
    "reserve the rest for page tables, etc");

542 543
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
544
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
545
    defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU)
546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563

/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_gpu_memory_to_use
 * Since Version: 1.2.0
 * Value Range: double, default=0.5 if win32, 0.92 else
 * Example:
 * Note: Represents the proportion of allocated memory blocks to the total
 * memory size
 *       of the GPU. Future memory usage will be allocated from this memory
 * block.
 *       If the memory block does not have enough GPU memory, new memory blocks
 * of
 *       the same size as the memory block will be allocated from the GPU
 * request
 *       until the GPU does not have enough memory.
 */

564 565 566 567 568 569 570 571
#ifndef _WIN32
constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
#else
// fraction_of_gpu_memory_to_use cannot be too high on windows,
// since the win32 graphic sub-system can occupy some GPU memory
// which may lead to insufficient memory left for paddle
constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
#endif
572
PHI_DEFINE_EXPORTED_double(
573 574
    fraction_of_gpu_memory_to_use,
    fraction_of_gpu_memory_to_use,
Z
Zeng Jinle 已提交
575 576 577 578 579
    "Allocate a trunk of gpu memory that is this fraction of the "
    "total gpu memory size. Future memory usage will be allocated "
    "from the trunk. If the trunk doesn't have enough gpu memory, "
    "additional trunks of the same size will be requested from gpu "
    "until the gpu has no memory left for another trunk.");
580

581 582 583 584 585 586 587 588 589 590 591 592
/**
 * Memory related FLAG
 * Name: FLAGS_initial_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: Allocate a specified size of GPU memory block. Later memory usage
 *       will be allocated from that memory block. If the memory block does not
 *       have enough GPU memory, the memory block with the size
 *       FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until
 *       the GPU has no remaining memory.
 */
593
// FLAGS_initial_gpu_memory_in_mb: uint64, default 0.
PHI_DEFINE_EXPORTED_uint64(
    initial_gpu_memory_in_mb,
    0ul,
    "Allocate a trunk of gpu memory whose byte size is specified by "
    "the flag. Future memory usage will be allocated from the "
    "trunk. If the trunk doesn't have enough gpu memory, additional "
    "trunks of the gpu memory will be requested from gpu with size "
    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
    "no memory left for the additional trunk. Note: if you set this "
    "flag, the memory size set by "
    "FLAGS_fraction_of_gpu_memory_to_use will be overridden by this "
    "flag. If you don't set this flag, PaddlePaddle will use "
    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");

607 608 609 610 611 612 613 614 615
/**
 * Memory related FLAG
 * Name: FLAGS_reallocate_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: If the allocated GPU memory blocks are exhausted,
 *       additional GPU memory blocks are reallocated
 */
616
// FLAGS_reallocate_gpu_memory_in_mb: uint64, default 0.
PHI_DEFINE_EXPORTED_uint64(
    reallocate_gpu_memory_in_mb,
    0ul,
    "If this flag is set, Paddle will reallocate the gpu memory with "
    "size specified by this flag. Else Paddle will reallocate by "
    "FLAGS_fraction_of_gpu_memory_to_use");

623
// FLAGS_gpu_memory_limit_mb: uint64, default 0.
PHI_DEFINE_EXPORTED_uint64(
    gpu_memory_limit_mb,
    0UL,
    "The maximum gpu memory limit that the process can allocate. "
    "If it is equal to 0, there would be no limit and all gpu memory "
    "would be available to the process. If it is larger than 0, "
    "the process would raise out of memory error if the allocated "
    "memory exceeds the limit even though there is available "
    "memory on the gpu card. The unit is MB and default value is 0.");
632

633 634 635 636 637 638 639 640 641 642
/**
 * Memory related FLAG
 * Name: FLAGS_auto_growth_chunk_size_in_mb
 * Since Version: 2.5.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: The minimal chunk size of GPU memory block in auto_growth allocator.
 *       The real chunk size is max(request_size,
 *       FLAGS_auto_growth_chunk_size_in_mb).
 */
643
// FLAGS_auto_growth_chunk_size_in_mb: uint64, default 0.
PHI_DEFINE_EXPORTED_uint64(
    auto_growth_chunk_size_in_mb,
    0ul,
    "The minimal chunk size of GPU memory block in auto_growth allocator.  "
    "The real chunk size is max(request_size, "
    "FLAGS_auto_growth_chunk_size_in_mb).");

650
#endif
651 652 653 654 655 656 657 658 659

/**
 * Scope related FLAG
 * Name: local_exe_sub_scope_limit
 * Since Version: 1.6.0
 * Value Range: double, default=256 (MB)
 * Example:
 * Note:
 */
660
// FLAGS_local_exe_sub_scope_limit: double, default 256.0.
PHI_DEFINE_EXPORTED_double(
    local_exe_sub_scope_limit,
    256.0,  // MBytes
    "The memory up limit of sub-scopes of local execution scope for "
    "each CUDAPlace. If you don't need to limit the memory, "
    "you should set FLAGS_local_exe_sub_scope_limit=-1. "
    "The default value is 256 MBytes.");
667

668
// FLAGS_reader_queue_speed_test_mode: bool, default false.
PHI_DEFINE_EXPORTED_bool(
    reader_queue_speed_test_mode,
    false,
    "If set true, the queue.pop will only get data from queue but not "
    "remove the data from queue for speed testing");

674 675 676 677 678 679 680 681
/**
 * MKLDNN related FLAG
 * Name: use_mkldnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note:
 */
682
// FLAGS_use_mkldnn: bool, default false.
PHI_DEFINE_EXPORTED_bool(
    use_mkldnn,
    false,
    "Use MKLDNN to run");
683 684 685 686 687 688 689 690 691 692 693 694 695 696 697

/**
 * Debug related FLAG
 * Name: FLAGS_call_stack_level
 * Since Version: 2.0.0
 * Value Range: int, default=1 (2 when built with PADDLE_NO_PYTHON)
 * Example:
 * Note: Used to debug. Determine the call stack to print when error or
 * exception happens.
 * If FLAGS_call_stack_level == 0, only the error message summary will be shown.
 * If FLAGS_call_stack_level == 1, the python stack and  error message summary
 * will be shown.
 * If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
 * message summary will be shown.
 */
698 699 700
#ifdef PADDLE_NO_PYTHON
static const int32_t kDefaultCallStackLevel = 2;
#else
701
static const int32_t kDefaultCallStackLevel = 1;
702
#endif
703

704
PHI_DEFINE_EXPORTED_int32(
705 706
    call_stack_level,
    kDefaultCallStackLevel,
707 708 709 710 711 712 713 714
    "Determine the call stack to print when error or exeception happens."
    // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
    // "If FLAGS_call_stack_level == 0, only the error message summary will be "
    // "shown. "
    "If FLAGS_call_stack_level == 1, the python stack and error message "
    "summary will be shown."
    "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
    "error message summary will be shown.");
715 716 717 718 719 720 721 722 723 724

/**
 * Debug related FLAG
 * Name: sort_sum_gradient
 * Since Version: 2.0.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, gradients are summed by the reverse order of
 * the forward execution sequence.
 */
725 726 727 728
// FLAGS_sort_sum_gradient: bool, default false.
PHI_DEFINE_EXPORTED_bool(
    sort_sum_gradient,
    false,
    "Sum gradients by the reverse order of "
    "the forward execution sequence.");
729 730 731 732 733 734 735 736 737

/**
 * Performance related FLAG
 * Name: max_inplace_grad_add
 * Since Version: 2.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: The maximum number of inplace grad_add.
 */
738
// FLAGS_max_inplace_grad_add: int32, default 0.
// The third literal ends with a space so the concatenated help text reads
// "grad_add instead of sum".
PHI_DEFINE_EXPORTED_int32(
    max_inplace_grad_add,
    0,
    "The maximum number of inplace grad_add. When doing "
    "gradient accumulation, if the number of gradients need to that "
    "less FLAGS_max_inplace_grad_add, than it will be use several grad_add "
    "instead of sum. Default is 0.");
745

746 747 748 749 750 751 752 753
/**
 * Tensor.numpy() has a hack, and this flag can close this hack
 * [true]: set 0D Tensor to 1D Numpy
 * [false]: not set 0D Tensor to 1D Numpy, close the hack
 *
 * Now, just set true by default in 2.5 transition time
 * which will be removed in future (2.6 or 2.7) .
 */
754
// FLAGS_set_to_1d: bool, default true.
PHI_DEFINE_EXPORTED_bool(
    set_to_1d,
    true,
    "set 0D Tensor to 1D numpy");
755

756 757 758 759 760 761 762 763
/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_on
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be enabled.
 */
764 765 766
// FLAGS_tracer_mkldnn_ops_on: string, default empty.
PHI_DEFINE_EXPORTED_string(
    tracer_mkldnn_ops_on,
    "",
    "List of OneDNN operation types to be turned on");
767 768 769 770 771 772 773 774 775

/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_off
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be disabled.
 */
776 777 778
// FLAGS_tracer_mkldnn_ops_off: string, default empty.
PHI_DEFINE_EXPORTED_string(
    tracer_mkldnn_ops_off,
    "",
    "List of OneDNN operation types to be turned off");
779

780 781 782 783 784 785 786 787 788
/**
 * Debug related FLAG
 * Name: check_kernel_launch
 * Since Version: 2.1.0
 * Value Range: bool, default=false
 * Example:
 * Note: Check kernel launch status after every kernel compute.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
789
PHI_DEFINE_EXPORTED_bool(
790 791
    check_kernel_launch,
    false,
Z
Zeng Jinle 已提交
792
    "Check kernel launch status after every kernel compute");
793 794
#endif

795 796 797 798 799 800 801 802 803
/**
 * CUDNN related FLAG
 * Name: conv2d_disable_cudnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: Disable cudnn in conv2d.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
804 805 806
PHI_DEFINE_EXPORTED_bool(conv2d_disable_cudnn,
                         false,
                         "Disable cudnn in conv2d");
807

808 809 810
PHI_DEFINE_EXPORTED_bool(use_fast_math,
                         false,
                         "Whether to use fast math GPU functions.");
811
#endif
B
Baibaifan 已提交
812 813 814 815 816 817 818 819 820

/**
 * Distributed related FLAG
 * Name: FLAGS_get_host_by_name_time
 * Since Version: 2.2.0
 * Value Range: int32, default=120
 * Example:
 * Note: Get host by name time.
 */
821
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \
822
    defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUSTOM_DEVICE)
823 824 825
PHI_DEFINE_EXPORTED_int32(get_host_by_name_time,
                          120,
                          "The maximum time for get host by name time");
B
Baibaifan 已提交
826
#endif
827 828 829 830 831 832 833 834 835 836

/**
 * Distributed related FLAG
 * Name: FLAGS_apply_pass_to_program
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_apply_pass_to_program=true would apply IR Pass to
 *          program when using Fleet APIs.
 * Note: Apply IR pass to program. Be only useful when using Fleet APIs.
 */
837
PHI_DEFINE_EXPORTED_bool(
838 839
    apply_pass_to_program,
    false,
840
    "It controls whether to apply IR pass to program when using Fleet APIs");
Y
yaoxuefeng 已提交
841

D
danleifeng 已提交
842 843 844 845 846 847 848 849 850
/**
 * Distributed related FLAG
 * Name: FLAGS_graph_load_in_parallel
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Control whether to load graph nodes and edges with multiple threads in
 *       parallel. If it is not set, graph data is loaded with one thread.
 */
851 852 853 854
// Exported bool flag (default false) selecting multi-threaded loading of graph
// nodes and edges.
PHI_DEFINE_EXPORTED_bool(graph_load_in_parallel,
                         false,
                         "It controls whether to load graph nodes and edges "
                         "with multiple threads in parallel.");
D
danleifeng 已提交
855

L
lxsbupt 已提交
856 857 858 859 860 861 862 863 864
/**
 * Distributed related FLAG
 * Name: FLAGS_graph_metapath_split_opt
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
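 * Note: Control whether to load graph nodes and edges with multiple threads in
 *       parallel. If it is not set, graph data is loaded with one thread.
 *       NOTE(review): this text is identical to FLAGS_graph_load_in_parallel's
 *       and may not describe the metapath split optimization - confirm.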
 */
865 866 867 868
// Exported bool flag, default false.
// NOTE(review): the help text below matches graph_load_in_parallel's and may
// not describe what this flag actually controls - confirm against its readers.
PHI_DEFINE_EXPORTED_bool(graph_metapath_split_opt,
                         false,
                         "It controls whether load graph node and edge with "
                         "multi threads parallely.");
L
lxsbupt 已提交
869

D
danleifeng 已提交
870 871 872 873 874 875 876 877 878
/**
 * Distributed related FLAG
 * Name: FLAGS_graph_get_neighbor_id
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Control get all neighbor id when running sub part graph
 *       If it is not set, do not need get neighbor id when run all part graph
 */
879
PHI_DEFINE_EXPORTED_bool(
D
danleifeng 已提交
880 881 882 883
    graph_get_neighbor_id,
    false,
    "It controls get all neighbor id when running sub part graph.");

L
lxsbupt 已提交
884 885 886 887 888 889 890 891 892
/**
 * Distributed related FLAG
 * Name: enable_exit_when_partial_worker
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Control whether the trainer exits when a worker has no ins.
 *       If it is not set, the trainer exits only after all workers finish
 *       training.
 */
893
PHI_DEFINE_EXPORTED_bool(
L
lxsbupt 已提交
894 895 896 897 898 899 900 901 902 903 904 905
    enable_exit_when_partial_worker,
    false,
    "It controls whether exit trainer when an worker has no ins.");

/**
 * Distributed related FLAG
 * Name: gpugraph_storage_mode
 * Since Version: 2.2.0
 * Value Range: int32, default=1
 * Example:
 * Note: represent gpugraph storage mode, 1 for full hbm, 2 for hbm + mem + ssd.
 */
906 907 908
PHI_DEFINE_EXPORTED_int32(gpugraph_storage_mode,
                          1,
                          "gpugraph storage mode, default 1");
L
lxsbupt 已提交
909

F
Feng Xing 已提交
910
/**
L
Liu-xiandong 已提交
911
 * KP kernel related FLAG
F
Feng Xing 已提交
912 913 914
 * Name: FLAGS_run_kp_kernel
 * Since Version: 2.3.0
 * Value Range: bool, default=false
L
Liu-xiandong 已提交
915 916
 * Example: FLAGS_run_kp_kernel=true would use the kp kernel to compute in the
 * Op.
F
Feng Xing 已提交
917 918
 * Note:
 */
919 920 921
PHI_DEFINE_EXPORTED_bool(run_kp_kernel,
                         false,
                         "It controls whether to run PaddlePaddle using KP");
F
Feng Xing 已提交
922

923
/**
924 925 926 927 928 929 930 931 932 933
 * Distributed related FLAG
 * Name: FLAGS_allreduce_record_one_event
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_allreduce_record_one_event=true makes the allreduce
 *          operations wait for only one event instead of multiple events.
 * Note: Make the allreduce operations wait for only one event instead of
 *       multiple events. Currently, only fuse allreduce supports this.
 *       Otherwise, the precision may be wrong.
 */
934 935 936 937 938 939
PHI_DEFINE_EXPORTED_bool(allreduce_record_one_event,
                         false,
                         "It controls whether the allreduce operations "
                         "would only wait one event instead of multiple "
                         "events. Currently, only fuse allreduce supports "
                         "this. Otherwise, the precision may be wrong.");
940

941
#ifdef PADDLE_WITH_CINN
942
/*
943 944 945 946 947 948
 * CINN related FLAG
 * Name: FLAGS_use_cinn
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_use_cinn=true would run PaddlePaddle using CINN
 */
949 950 951
PHI_DEFINE_EXPORTED_bool(use_cinn,
                         false,
                         "It controls whether to run PaddlePaddle using CINN");
952 953 954 955 956 957 958 959 960

/*
 * CINN related FLAG
 * Name: FLAGS_allow_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_allow_cinn_ops="mul;relu" would only cover `mul` and `relu`
 * when using CINN
 */
961 962 963 964
PHI_DEFINE_EXPORTED_string(allow_cinn_ops,
                           "",
                           "It controls the cinn op subset to be used, "
                           "which has the highest priority.");
965 966 967 968 969 970 971 972 973

/*
 * CINN related FLAG
 * Name: FLAGS_deny_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_deny_cinn_ops="mul;relu" would block `mul` and `relu` two ops
 * when using CINN
 */
974 975 976
PHI_DEFINE_EXPORTED_string(deny_cinn_ops,
                           "",
                           "It controls the cinn op subset to be not used.");
977 978 979 980 981 982 983 984 985 986

/*
 * CINN related FLAG
 * Name: FLAGS_enable_pe_launch_cinn
 * Since Version: 2.3
 * Value Range: bool, default=true
 * Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled
 * instructions of a paddle graph with ParallelExecutor, otherwise with the
 * CINN compiled runtime program in sequential order.
 */
987 988 989 990
PHI_DEFINE_EXPORTED_bool(enable_pe_launch_cinn,
                         true,
                         "It controls whether to execute cinn compiled "
                         "program with ParallelExecutor");
991

992 993 994 995 996 997 998 999 1000
/*
 * CINN related FLAG
 * Name: FLAGS_enable_interpretercore_launch_cinn
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example: FLAGS_enable_interpretercore_launch_cinn=true would execute the CINN
 * compiled instructions of a paddle graph with InterpreterCore, otherwise with
 * the CINN compiled runtime program in sequential order.
 */
1001 1002 1003 1004
PHI_DEFINE_EXPORTED_bool(enable_interpretercore_launch_cinn,
                         true,
                         "It controls whether to execute cinn compiled "
                         "program with InterpreterCore");
1005

1006 1007 1008 1009 1010 1011 1012 1013
/*
 * CINN related FLAG
 * Name: FLAGS_enable_cinn_auto_tune
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its
 * auto-tune feature enabled
 */
1014 1015 1016 1017
PHI_DEFINE_EXPORTED_bool(enable_cinn_auto_tune,
                         false,
                         "It controls whether to use cinn with "
                         "its auto-tune feature enabled");
1018

1019 1020 1021 1022 1023 1024 1025 1026 1027
/*
 * CINN related FLAG
 * Name: FLAGS_cinn_subgraph_graphviz_dir
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_cinn_subgraph_graphviz_dir="./cinn_graph/" will save the
 * CINN sub-graph into "./cinn_graph/", and each sub-graph will save into
 * "fusion_groups_*" directory
 */
1028 1029 1030 1031
PHI_DEFINE_EXPORTED_string(cinn_subgraph_graphviz_dir,
                           "",
                           "Specify the directory path of dot file of "
                           "graph, which is used for debug.");
1032

1033
#endif
1034

1035 1036 1037 1038 1039 1040 1041 1042
/*
 * CUDA Graph related FLAG
 * Name: FLAGS_new_executor_use_cuda_graph
 * Since Version: 2.4
 * Value Range: bool, default=false
 * Example: FLAGS_new_executor_use_cuda_graph=true would allow
 * new executor to use CUDA Graph.
 */
1043 1044 1045
PHI_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph,
                         false,
                         "Use CUDA Graph in new executor");
1046

1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058
/*
 * Executor related FLAG
 * Name: FLAGS_executor_log_deps_every_microseconds
 * Since Version: 2.5
 * Value Range: uint64, default=0
 * Example: FLAGS_executor_log_deps_every_microseconds=n (n>0) would
 * allow new executor log deps every n microseconds.
 */
PHI_DEFINE_EXPORTED_uint64(executor_log_deps_every_microseconds,
                           0,
                           "Enable new executor log deps every n microseconds");

1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073
// Flags for the SlotRecordDataset slot-record pool and the ins-file parser.
// NOTE(review): these use PD_DEFINE_* rather than PHI_DEFINE_EXPORTED_*, so
// they are presumably not registered in the exported flag map - confirm.

// Maximum size of the slot record pool (default 2000000).
PD_DEFINE_int32(record_pool_max_size,
                2000000,
                "SlotRecordDataset slot record pool max size");
// Number of threads used by the slot pool (default 1).
PD_DEFINE_int32(slotpool_thread_num,
                1,
                "SlotRecordDataset slot pool thread num");
// Off by default; per the help text, makes slotrecord objects wait on release.
PD_DEFINE_bool(enable_slotpool_wait_release,  // NOLINT
               false,
               "enable slotrecord object wait release, default false");
// Off by default; per the help text, shrinks memory when a slotrecord resets.
PD_DEFINE_bool(enable_slotrecord_reset_shrink,  // NOLINT
               false,
               "enable slotrecord object reset shrink memory, default false");
// Off by default; per the help text, enables parsing of ins files.
PD_DEFINE_bool(enable_ins_parser_file,  // NOLINT
               false,
               "enable parser ins file, default false");
1074
PHI_DEFINE_EXPORTED_bool(
D
danleifeng 已提交
1075 1076 1077
    gpugraph_enable_hbm_table_collision_stat,
    false,
    "enable hash collisions stat for hbm table, default false");
1078 1079 1080 1081 1082
PHI_DEFINE_EXPORTED_bool(
    cache_inference_while_scope,
    false,
    "Cache the scope of the while op to avoid repeated creation of the scope "
    "for each iteration and improve inference performance.");
1083 1084 1085 1086
PHI_DEFINE_EXPORTED_double(gpugraph_hbm_table_load_factor,
                           0.75,
                           "the load factor of hbm table, default 0.75");
PHI_DEFINE_EXPORTED_bool(
D
danleifeng 已提交
1087 1088
    gpugraph_enable_gpu_direct_access,
    false,
C
co63oc 已提交
1089
    "enable direct access between multi gpu cards, default false");
1090
PHI_DEFINE_EXPORTED_bool(
D
danleifeng 已提交
1091 1092 1093
    gpugraph_enable_segment_merge_grads,
    false,
    "enable segment merge gradients while push sparse, default false");
1094
PHI_DEFINE_EXPORTED_uint64(
D
danleifeng 已提交
1095 1096 1097
    gpugraph_merge_grads_segment_size,
    128,
    "segment size with segment gradient merge, default 128");
1098 1099 1100 1101
PHI_DEFINE_EXPORTED_uint64(gpugraph_slot_feasign_max_num,
                           5,
                           "max feasign number in one slot, default 5");
PHI_DEFINE_EXPORTED_int32(
D
danleifeng 已提交
1102 1103 1104
    gpugraph_dedup_pull_push_mode,
    0,
    "enable dedup keys while pull push sparse, default 0");
1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128
PHI_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm,
                         true,
                         "enable load_node_list_into_hbm, default true");
// Exported int32 flag (default 0) selecting the gpugraph sparse table storage
// mode. The help text previously read "parse_table_storage_mode".
PHI_DEFINE_EXPORTED_int32(gpugraph_sparse_table_storage_mode,
                          0,
                          "sparse_table_storage_mode, default 0");
PHI_DEFINE_EXPORTED_bool(enable_auto_detect_gpu_topo,
                         true,
                         "enable auto detect gpu topo, default true");
PHI_DEFINE_EXPORTED_bool(enable_auto_rdma_trans,
                         true,
                         "enable auto gpu rdma trans, default true");
PHI_DEFINE_EXPORTED_bool(enable_tracker_all2all,
                         false,
                         "enable tracker all2all log, default false");
PHI_DEFINE_EXPORTED_bool(enable_all2all_use_fp16,
                         false,
                         "enable all2all use fp16, default false");
PHI_DEFINE_EXPORTED_bool(enable_sparse_inner_gather,
                         false,
                         "enable sparse inner gather, default false");
PHI_DEFINE_EXPORTED_bool(gpugraph_debug_gpu_memory,
                         false,
                         "enable debug gpu memory, default false");
1129 1130 1131 1132 1133 1134 1135 1136 1137
/**
 * ProcessGroupNCCL related FLAG
 * Name: nccl_blocking_wait
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: nccl blocking wait.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
1138
PHI_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait");
1139
#endif
1140 1141 1142 1143 1144 1145 1146 1147

/**
 * Autotune related FLAG
 * Name: FLAGS_use_autotune
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
 */
1148
PHI_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune.");
1149

H
hong 已提交
1150 1151 1152 1153 1154 1155 1156
/**
 * Conv Search cache max number related FLAG
 * Name: FLAGS_search_cache_max_number
 * Since Version: 2.3.0
 * Value Range: int32, default=1000000
 * Example:
 */
1157 1158 1159
PHI_DEFINE_EXPORTED_int32(search_cache_max_number,
                          1000000,
                          "search_cache_max_number.");
H
hong 已提交
1160

1161
/**
C
co63oc 已提交
1162
 * Performance related FLAG
1163 1164 1165 1166
 * Name: einsum_opt
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
C
co63oc 已提交
1167
 * Note: If True, EinsumOp will be optimized by innercache reuse, which
1168 1169
 * uses more gpu memory.
 */
1170
PHI_DEFINE_EXPORTED_bool(
1171 1172
    einsum_opt,
    false,
1173
    "EinsumOp backward will be speedup at the expense of more gpu memory.");
1174 1175 1176 1177 1178 1179

/**
 * JitLayer related FLAG
 * Name: FLAGS_jit_engine_type
 * Since Version: 2.3.0
 * Value Range: string, {New, Predictor},
 * default=Predictor
 * Example:
 * Note:
 * FLAGS_jit_engine_type == New, using InterpreterEngine by default
 * FLAGS_jit_engine_type == Predictor, using inference Predictor by default
 */
1186 1187
PHI_DEFINE_EXPORTED_string(jit_engine_type,
                           "Predictor",
C
co63oc 已提交
1188
                           "Choose default function type in JitLayer.");
1189

1190 1191 1192 1193 1194 1195 1196 1197
/**
 * Custom Device NPU related FLAG
 * Name: FLAGS_npu_storage_format
 * Since Version: 2.5.0
 * Value Range: bool, default=false
 * Example:
 * Note: Enable NPU Storage Format for Ascend910 performance improvement.
 */
1198
// Exported bool flag, default false. The help text was empty; it now carries
// the description from the flag's doc comment so it shows up in flag listings.
PHI_DEFINE_EXPORTED_bool(
    npu_storage_format,
    false,
    "Enable NPU Storage Format for Ascend910 performance improvement.");
1199

1200 1201 1202 1203 1204 1205 1206 1207 1208
#ifdef PADDLE_WITH_CUDNN_FRONTEND
/**
 * CUDNNv8 related FLAG
 * Name: enable_cudnn_frontend
 * Since Version: 2.5.0
 * Value Range: bool, default=false
 * Example:
 * Note: Enable CUDNNv8 Frontend API for CUDNN kernels.
 */
1209
// Exported bool flag, default false. The help text was empty; it now carries
// the description from the flag's doc comment.
PHI_DEFINE_EXPORTED_bool(enable_cudnn_frontend,
                         false,
                         "Enable CUDNNv8 Frontend API for CUDNN kernels.");
1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221

/**
 * CUDNNv8 related FLAG
 * Name: cudnn_cache_saturation_count
 * Since Version: 2.5.0
 * Value Range: int32, default=1
 * Example:
 * Note: Set saturation count for CUDNNv8 cache. A candidate execution
 * plan need to be considered as the fastest plan by exhaustive search
 * N times before it is actually added in the cache. It is useful when
 * the result of exhaustive search is unstable.
 */
1222
// Exported int32 flag, default 1. The help text was empty; it now carries the
// description from the flag's doc comment.
PHI_DEFINE_EXPORTED_int32(
    cudnn_cache_saturation_count,
    1,
    "Saturation count for the CUDNNv8 cache. A candidate execution plan "
    "needs to be chosen as the fastest plan by exhaustive search this many "
    "times before it is added to the cache.");
1223
#endif  // PADDLE_WITH_CUDNN_FRONTEND
1224 1225 1226 1227 1228 1229 1230 1231 1232 1233

/**
 * CI related FLAG
 * Name: trt_ibuilder_cache
 * Since Version: 2.5.0
 * Value Range: bool, default=false
 * Example:
 * Note: This FLAG is only enabled when CI is running. If True, a persistent
 * IBuilder is added to avoid TensorRT unload/reload kernels.
 */
1234 1235 1236
PHI_DEFINE_EXPORTED_bool(trt_ibuilder_cache,
                         false,
                         "Add a persistent ibuilder.");
1237 1238 1239 1240 1241

/**
 * mmap_allocator related FLAG
 * Name: use_shm_cache
 * Since Version: 2.5.0
Z
zhangbo9674 已提交
1242
 * Value Range: bool, default=false
1243 1244 1245 1246
 * Example:
 * Note: If True, mmap_allocator will cache shm file to decrease munmap
 * operation.
 */
1247 1248 1249
PHI_DEFINE_EXPORTED_bool(use_shm_cache,
                         false,
                         "Use shm cache in mmap_allocator.");
1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262

/**
 * Tensor operants related FLAG
 * Name: tensor_operants_mode
 * Since Version: 2.5.0
 * Value Range: string, {eager, phi, static}
 * default=eager
 * Example:
 * Note: For switching tensor operants mode of PaddlePaddle.
 *       - eager mode: tensor operants with dygraph autograd;
 *       - phi mode: tensor operants with only phi forward API;
 *       - static mode: tensor operants within static graph.
 */
1263 1264 1265
PHI_DEFINE_EXPORTED_string(tensor_operants_mode,
                           "eager",
                           "Tensor operants mode");
1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277

/**
 * Using new IR in executor  FLAG
 * Name: enable_new_ir_in_executor
 * Since Version: 2.6.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, executor will use new IR
 */
PHI_DEFINE_EXPORTED_bool(enable_new_ir_in_executor,
                         false,
                         "Enable new IR in executor");
1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289

/**
 * Using new IR API in Python
 * Name: enable_new_ir_api
 * Since Version: 2.6.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, New IR API will be used in Python
 */
PHI_DEFINE_EXPORTED_bool(enable_new_ir_api,
                         false,
                         "Enable new IR API in Python");
1290 1291 1292

/**
 * Using new IR in executor (trace run) FLAG
 * Name: enable_new_ir_in_executor_trace_run
 * Since Version: 2.6.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, executor will use new IR and run in the beta trace
 * version.
 */
1300
PHI_DEFINE_EXPORTED_bool(enable_new_ir_in_executor_trace_run,
1301 1302
                         false,
                         "Enable new IR in executor");
1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317

PHI_DEFINE_EXPORTED_bool(enable_record_memory, false, "Enable memory recorder");

PHI_DEFINE_EXPORTED_bool(
    eager_delete_scope,
    true,
    "Delete local scope eagerly. It will reduce GPU memory usage but "
    "slow down the destruction of variables.(around 1% performance harm)");

// Used to filter events, works like glog VLOG(level).
// RecordEvent will work if host_trace_level >= level.
PHI_DEFINE_EXPORTED_int64(host_trace_level,
                          1,
                          "RecordEvent will works "
                          "if host_trace_level >= level.");