flags.cc 37.1 KB
Newer Older
1
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2
// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
3 4 5 6 7 8 9 10 11 12 13 14 15
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

Z
Zeng Jinle 已提交
16
#include "paddle/fluid/platform/flags.h"
17
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
18 19 20
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#endif

Z
Zeng Jinle 已提交
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
namespace paddle {
namespace platform {

// Returns a mutable pointer to the single ExportedFlagInfoMap instance.
// The instance is a function-local static, so it is created the first time
// this function is called and the same object is returned on every call.
ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() {
  static ExportedFlagInfoMap exported_flag_infos;
  return &exported_flag_infos;
}

// Read-only accessor for the same ExportedFlagInfoMap instance that
// GetMutableExportedFlagInfoMap() hands out.
const ExportedFlagInfoMap &GetExportedFlagInfoMap() {
  const ExportedFlagInfoMap *flag_infos = GetMutableExportedFlagInfoMap();
  return *flag_infos;
}

}  // namespace platform
}  // namespace paddle

36 37
PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism,
                             0,
38 39
                             "number of threads for inner op");

40 41 42 43
/**
 * NOTE(paddle-dev): This file is designed to define all public FLAGS.
 */

44 45 46 47 48 49 50 51 52
/**
 * Paddle initialization related FLAG
 * Name: FLAGS_paddle_num_threads
 * Since Version: 0.15.0
 * Value Range: int32, default=1
 * Example: FLAGS_paddle_num_threads=2, set the maximum thread number per
 * instance to 2
 * Note:
 */
53 54
PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads,
                             1,
Z
Zeng Jinle 已提交
55
                             "Number of threads for each paddle instance.");
56

57 58 59 60 61 62 63 64
/**
 * Operator related FLAG
 * Name: FLAGS_check_nan_inf
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: Used to debug. Checking whether operator produce NAN/INF or not.
 */
Z
Zeng Jinle 已提交
65
PADDLE_DEFINE_EXPORTED_bool(
66 67
    check_nan_inf,
    false,
Z
Zeng Jinle 已提交
68 69
    "Checking whether operator produce NAN/INF or not. It will be "
    "extremely slow so please use this flag wisely.");
70

71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
/**
 * Operator related FLAG
 * Name: FLAGS_abort_on_nan_inf
 * Since Version: 2.5.0
 * Value Range: bool, default=true
 * Example:
 * Note: Used to debug. Whether abort the process when any operator produce
 * NAN/INF. It only works when FLAGS_check_nan_inf is set.
 */
PADDLE_DEFINE_EXPORTED_bool(
    abort_on_nan_inf,
    true,
    "Whether abort the process when any operator produce NAN/INF or not.");

/**
 * Operator related FLAG
 * Name: FLAGS_check_tensor_max_min
 * Since Version: 2.5.0
 * Value Range: bool, default=false
 * Example:
 * Note: Used to debug. Enable to calculate and print the max and min value of
 * each operator's output tensor. It only works when FLAGS_check_nan_inf is set.
 */
PADDLE_DEFINE_EXPORTED_bool(
    check_tensor_max_min,
    false,
    "Whether to check all the output tensors's min and max value.");

D
danleifeng 已提交
99 100 101 102 103 104 105 106 107 108 109 110 111 112
/**
 * Operator related FLAG
 * Name: FLAGS_enable_opt_get_features
 * Since Version: NOTE(review): unknown -- the previous value (0.13.0) was
 *                copied from FLAGS_check_nan_inf; confirm before relying on it
 * Value Range: bool, default=false
 * Example:
 * Note: NOTE(review): the previous header and the help string below were
 *       copied verbatim from FLAGS_check_nan_inf and do not describe this
 *       flag. Its actual purpose is not visible in this file; the owner
 *       should replace the help string with an accurate description.
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_opt_get_features,
    false,
    "Checking whether operator produce NAN/INF or not. It will be "
    "extremely slow so please use this flag wisely.");

113 114 115 116
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_ASCEND_CL)
117 118 119 120 121 122 123 124 125

/**
 * CUDA related FLAG
 * Name: FLAGS_enable_cublas_tensor_op_math
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use Tensor Core, faster but it may loss precision.
 */
Z
Zeng Jinle 已提交
126
PADDLE_DEFINE_EXPORTED_bool(
127 128
    enable_cublas_tensor_op_math,
    false,
129 130 131 132 133 134 135 136
    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
    "but it may loss precision. Currently, There are two CUDA libraries that"
    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
    " GEMM computations(the matrices must be either half precision or single "
    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
    "input and output must be half precision) and recurrent neural networks "
    "(RNNs).");

137 138 139 140 141 142 143 144 145 146
/**
 * CUDA related FLAG
 * Name: FLAGS_gemm_use_half_precision_compute_type
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example:
 * Note: whether to use fp16 compute type when the input and output is fp16,
 * faster but it may lose precision.
 */
// The help text previously claimed that enabling the flag selects fp32, which
// contradicted both the flag name and the first sentence of the description.
PADDLE_DEFINE_EXPORTED_bool(
    gemm_use_half_precision_compute_type,
    true,
    "Whether to use fp16 compute type when the input and output is fp16, "
    "faster but it may loss precision in most case. If true, the compute "
    "type will be set to fp16. Default is true.");

153 154 155 156 157 158 159 160 161
/**
 * CUDA related FLAG
 * Name: FLAGS_selected_gpus
 * Since Version: 1.3.0
 * Value Range: integer list separated by comma, default empty list
 * Example: FLAGS_selected_gpus=0,1,2,3,4,5,6,7 to train or predict with 0~7 gpu
 * cards
 * Note: A list of device ids separated by comma, like: 0,1,2,3
 */
Z
Zeng Jinle 已提交
162
// Defines FLAGS_selected_gpus (string, default empty).
// Adjacent string literals are concatenated by the compiler without any
// separator, so every fragment that ends mid-sentence must carry its own
// trailing space.
PADDLE_DEFINE_EXPORTED_string(
    selected_gpus,
    "",
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (GPU). If you want to use "
    "all visible devices, set this to empty string. NOTE: the "
    "reason of doing this is that we want to use P2P communication "
    "between GPU devices, use CUDA_VISIBLE_DEVICES can only use "
    "share-memory only.");
172 173
#endif

174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
#if defined(PADDLE_WITH_CUDA)
/**
 * CUDA related FLAG
 * Name: FLAGS_cublaslt_exhaustive_search_times
 * Since Version: 2.3.0
 * Value Range: int64_t, default=0
 * Example:
 * Note: Represents times of exhaustive search to evaluate performance of
 *       cuBlasLt matmul algorithm (with/without epilogue). Set this flag
 *       with value > 0 to enable exhaustive search. Default is 0, means
 *       getting algorithms via heuristic search. There are two search methods
 *       in cuBlasLt, heuristic search and exhaustive search. Exhaustive search
 *       attempts all cuBlasLt algorithms to select the fastest, which is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification. Once you change the layer specifications
 *       (such as M, N and K), it will search again.
 */
// Defines FLAGS_cublaslt_exhaustive_search_times (int64, default 0).
// The two literals are concatenated as-is; only the first one carries the
// separating space so the help text does not contain a doubled space.
PADDLE_DEFINE_EXPORTED_int64(
    cublaslt_exhaustive_search_times,
    0,
    "The times of exhaustive search for cuBlasLt matmul with/without "
    "epilogue algorithms, default is 0, means disabling exhaustive search.");
#endif

198
#if defined(PADDLE_WITH_ASCEND_CL)
Z
Zeng Jinle 已提交
199
PADDLE_DEFINE_EXPORTED_string(
200 201
    selected_npus,
    "",
Z
Zeng Jinle 已提交
202 203 204 205 206
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (NPU). If you want to use "
    "all visible devices, set this to empty string.");
PADDLE_DEFINE_EXPORTED_bool(
207 208
    hccl_check_nan,
    true,
Z
Zeng Jinle 已提交
209 210 211
    "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
    "core when meets Nan value");
// Defines FLAGS_npu_config_path (string, default empty): path of a JSON
// configuration file which, per the help text, is forwarded to aclInit().
PADDLE_DEFINE_EXPORTED_string(
    npu_config_path,
    "",
    "The absolute path of configuration json file, like: /tmp/config.json. "
    "If provided, it will be passed to aclInit().");
216 217
// Defines FLAGS_min_loss_scaling (int32, default 1): lower bound for the
// loss scaling value.
PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling,
                             1,
                             "set minimum loss scaling value!");
A
Aganlengzi 已提交
219
PADDLE_DEFINE_EXPORTED_string(
220 221
    npu_precision_mode,
    "",
A
Aganlengzi 已提交
222 223 224 225 226
    "NPU operator precision mode, options are 'force_fp32', 'force_fp16', "
    "'allow_fp32_to_fp16', 'must_keep_origin_dtype' and "
    "'allow_mix_precision'. If you want to use the default mode ("
    "allow_fp32_to_fp16), set this to empty string. For more details, "
    "please refer to the documents");
227
#endif
228

229 230 231 232 233 234 235 236 237 238 239 240 241
/*
 * Kernel related FLAG
 * Name: FLAGS_enable_api_kernel_fallback
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example: FLAGS_enable_api_kernel_fallback=true would allow kernel of current
 * backend fallback to CPU one when not found
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_api_kernel_fallback,
    true,
    "Whether enable api kernel fallback to CPU one when not found");

242
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
243 244 245 246 247 248 249 250 251
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_deterministic
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use deterministic algorithm in cudnn.
 *       If true, it will slow down some operators such as conv and pooling.
 */
Z
Zeng Jinle 已提交
252
PADDLE_DEFINE_EXPORTED_bool(
253 254
    cudnn_deterministic,
    false,
Z
Zeng Jinle 已提交
255 256 257
    "Whether allow using an autotuning algorithm for convolution "
    "operator. The autotuning algorithm may be non-deterministic. If "
    "true, the algorithm is deterministic.");
258

259 260 261 262
/**
 * CUDNN related FLAG
 * Name: FLAGS_conv_workspace_size_limit
 * Since Version: 0.13.0
263
 * Value Range: int64, default=512 (MB)
264 265 266 267 268 269 270
 * Example:
 * Note: The internal function of cuDNN obtains the fastest matching algorithm
 *       within this memory limit. Usually, faster algorithms can be chosen in
 *       larger workspaces, but memory space can also be significantly
 * increased.
 *       Users need to balance memory and speed.
 */
271 272 273
PADDLE_DEFINE_EXPORTED_int64(conv_workspace_size_limit,
                             paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
                             "cuDNN convolution workspace limit in MB unit.");
274

275 276 277 278 279 280 281 282 283 284 285 286 287 288
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Represents whether an exhaustive search method is used to
 *       select a convolution algorithm. There are two search methods in cuDNN,
 *       heuristic search and exhaustive search. Exhaustive search attempts
 *       all cuDNN algorithms to select the fastest. This method is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification. Once you change the layer specifications
 *       (such as batch size, feature map size), it will search again.
 */
Z
Zeng Jinle 已提交
289
PADDLE_DEFINE_EXPORTED_bool(
290 291
    cudnn_exhaustive_search,
    false,
Z
Zeng Jinle 已提交
292 293
    "Whether enable exhaustive search for cuDNN convolution or "
    "not, default is False.");
294

295 296 297 298 299 300 301 302
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search_times
 * Since Version:
 * Value Range:
 * Example:
 * Note: only used to predict for advanced developer
 */
303 304
PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times,
                             -1,
Z
Zeng Jinle 已提交
305 306
                             "Exhaustive search times for cuDNN convolution, "
                             "default is -1, not exhaustive search");
307

308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_batchnorm_spatial_persistent
 * Since Version: 1.4.0
 * Value Range: bool, default=false
 * Example:
 * Note: CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be
 *       faster in some tasks because an optimized path may be selected for
 *       CUDNN_DATA_FLOAT and CUDNN_DATA_HALF data types, compute capability
 *       6.0 or higher. The reason we set it to false by default is that this
 *       mode may use scaled atomic integer reduction that may cause a
 *       numerical overflow for certain input data range.
 */
Z
Zeng Jinle 已提交
324
PADDLE_DEFINE_EXPORTED_bool(
325 326
    cudnn_batchnorm_spatial_persistent,
    false,
Z
Zeng Jinle 已提交
327 328
    "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
    "batch_norm, default is False.");
329 330
#endif

331
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
332 333 334

/**
 * NCCL related FLAG
335 336 337
 * Name: FLAGS_sync_nccl_allreduce
 * Since Version: 1.3
 * Value Range: bool, default=true
338 339 340 341 342
 * Example:
 * Note: asynchronous nccl allreduce or synchronous issue:
 *       https://github.com/PaddlePaddle/Paddle/issues/15049
 *       If you want to change this default value, why?(gongwb)
 */
Z
Zeng Jinle 已提交
343
// Defines FLAGS_sync_nccl_allreduce (bool, default true).
// The first literal needs a trailing space: adjacent literals are joined
// verbatim, and without it the closing backtick runs into "after".
PADDLE_DEFINE_EXPORTED_bool(
    sync_nccl_allreduce,
    true,
    "If set true, will call `cudaStreamSynchronize(nccl_stream)` "
    "after allreduce, this mode can get better performance in some scenarios.");
#endif

#ifdef PADDLE_WITH_DISTRIBUTE
351 352 353 354 355 356 357 358 359 360 361
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_max_merge_var_num
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: The maximum number of gradients to be merged into a gradient and
 *       sent through the communicator. The trainer puts all the gradients
 *       into the queue, and then the communicator takes the gradients out
 *       of the queue and sends them after merging.
 */
362 363
PADDLE_DEFINE_EXPORTED_int32(communicator_max_merge_var_num,
                             20,
Z
Zeng Jinle 已提交
364 365
                             "max var num to merge and send");
PADDLE_DEFINE_EXPORTED_bool(
366 367
    communicator_is_sgd_optimizer,
    true,
Z
Zeng Jinle 已提交
368 369
    "gradient sent to the server is the sum of the gradients "
    "calculated by each thread if optimizer is sgd");
370 371 372 373 374 375 376 377 378 379 380 381 382
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_send_queue_size
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: Size for each gradient queue. The trainer puts the gradient into
 *       the queue, and then the communicator takes it out of the queue and
 *       sends it out. When the communicator is slow, the queue may be full,
 *       and the trainer will be continuously blocked before the queue has
 *       space. It is used to avoid training much faster than communication,
 *       so that too many gradients are not sent out in time.
 */
383 384
PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size,
                             20,
Z
Zeng Jinle 已提交
385
                             "queue size to recv gradient before send");
386 387
#endif

388 389 390 391 392 393 394 395 396
/**
 * Distributed related FLAG
 * Name: FLAGS_dist_threadpool_size
 * Since Version: 1.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: Control the number of threads used for distributed modules.
 *       If it is not set, it is set to a hard thread.
 */
Z
Zeng Jinle 已提交
397
PADDLE_DEFINE_EXPORTED_int32(
398 399
    dist_threadpool_size,
    0,
Z
Zeng Jinle 已提交
400
    "number of threads used for distributed executed.");
401

402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417
/**
 * Garbage collector related FLAG
 * Name: FLAGS_eager_delete_tensor_gb
 * Since Version: 1.0.0
 * Value Range: double, default=kDefaultEagerDeleteTensorGB
 * Example: FLAGS_eager_delete_tensor_gb=0.0, Release memory garbage once it is
 * no longer used.
 *          FLAGS_eager_delete_tensor_gb=1.0, Release memory garbage when
 * garbage occupies 1.0GB of memory.
 *          FLAGS_eager_delete_tensor_gb=-1.0, Disable garbage collection
 * policy.
 * Note: Represents whether a garbage collection strategy is used to optimize
 * network memory usage.
 *       It is recommended that users set FLAGS_eager_delete_tensor_gb=0.0 to
 *       enable garbage collection strategy when training large networks.
 */
418 419 420
// Default threshold (in GB) for FLAGS_eager_delete_tensor_gb.
// NOTE(review): an earlier comment here said gc is disabled by default when
// the inference library is built, but no such conditional exists in this
// file. The value is unconditionally 0, and the flag's own help text says
// collection is disabled only when the value is less than 0.
static const double kDefaultEagerDeleteTensorGB = 0;

Z
Zeng Jinle 已提交
421
// Defines FLAGS_eager_delete_tensor_gb (double, default
// kDefaultEagerDeleteTensorGB). The first literal ends a sentence and is
// followed directly by the next one, so it carries a trailing space.
PADDLE_DEFINE_EXPORTED_double(
    eager_delete_tensor_gb,
    kDefaultEagerDeleteTensorGB,
    "Memory size threshold (GB) when the garbage collector clear tensors. "
    "Disabled when this value is less than 0");

427 428 429 430 431 432 433 434 435 436 437 438
/**
 * Memory related FLAG
 * Name: FLAGS_fast_eager_deletion_mode
 * Since Version: 1.3.0
 * Value Range: bool, default=true
 * Example:
 * Note: Whether to use fast garbage collection strategy.
 *       If not set, the GPU memory is released at the end of the CUDA kernel.
 *       Otherwise, the GPU memory will be released before the CUDA kernel
 *       has finished, which will make the garbage collection strategy faster.
 *       Only works when garbage collection strategy is enabled.
 */
Z
Zeng Jinle 已提交
439
PADDLE_DEFINE_EXPORTED_bool(
440 441
    fast_eager_deletion_mode,
    true,
Z
Zeng Jinle 已提交
442 443
    "Fast eager deletion mode. If enabled, memory would release "
    "immediately without waiting GPU kernel ends.");
444

445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462
/**
 * Memory related FLAG
 * Name: FLAGS_memory_fraction_of_eager_deletion
 * Since Version: 1.4
 * Value Range: double [0.0, 1.0], default=1.0
 * Example:
 * Note: The percentage of memory size of garbage collection policy
 *       to release variables.
 *       If FLAGS_memory_fraction_of_eager_deletion = 1.0,
 *       all temporary variables in the network will be released.
 *       If FLAGS_memory_fraction_of_eager_deletion = 0.0,
 *       no temporary variables in the network are released.
 *       If 0.0 < FLAGS_memory_fraction_of_eager_deletion < 1.0,
 *       all temporary variables will be sorted in descending order
 *       according to their memory size, and only variables with the
 *       largest FLAGS_memory_fraction_of_eager_deletion ratio will be released.
 *       The flag is only valid when running parallel data compilers.
 */
Z
Zeng Jinle 已提交
463
PADDLE_DEFINE_EXPORTED_double(
464 465
    memory_fraction_of_eager_deletion,
    1.0,
Z
Zeng Jinle 已提交
466 467 468 469
    "Fraction of eager deletion. If less than 1.0, all variables in "
    "the program would be sorted according to its memory size, and "
    "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
    "variables would be deleted.");
470

471 472 473 474
/**
 * Allocator related FLAG
 * Name: FLAGS_allocator_strategy
 * Since Version: 1.2
475 476
 * Value Range: string, {naive_best_fit, auto_growth, thread_local},
 * default=auto_growth
477
 * Example:
478
 * Note: For selecting allocator policy of PaddlePaddle.
479
 */
480
static constexpr char kDefaultAllocatorStrategy[] = "auto_growth";
Z
Zeng Jinle 已提交
481
PADDLE_DEFINE_EXPORTED_string(
482 483
    allocator_strategy,
    kDefaultAllocatorStrategy,
484 485 486 487 488 489 490 491 492 493 494
    "The allocation strategy, enum in [naive_best_fit, auto_growth]. "
    "naive_best_fit means the original pre-allocated allocator of Paddle. "
    "auto_growth means the auto-growth allocator. "
    "These two strategies differ in GPU memory allocation. "
    "naive_best_fit strategy would occupy almost all GPU memory by default, "
    "which prevents users from starting several Paddle jobs on the same GPU "
    "card but leads to less memory fragmentation (i.e., maximum batch "
    "size of models may be larger). auto_growth strategy would allocate "
    "GPU memory on demand, which allows users to start several Paddle jobs "
    "on the same GPU card but may lead to more memory fragmentation "
    "(i.e., maximum batch size of models may be smaller).");
495

496 497 498
/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cpu_memory_to_use
499 500
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=1
501
 * Example:
502 503 504 505 506 507
 * Note: Represents the proportion of allocated CPU memory blocks
 *       to the total memory size of the CPU. Future CPU memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough CPU memory, new memory blocks of the same
 *       size as the memory block will be requested from the CPU
 *       until the CPU does not have enough memory.
508
 */
509 510
// Defines FLAGS_fraction_of_cpu_memory_to_use (double, default 1).
// A space is required after the comma because the two literals are
// concatenated without any separator.
PADDLE_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use,
                              1,
                              "Default use 100% of CPU memory for PaddlePaddle, "
                              "reserve the rest for page tables, etc");
513 514 515 516 517 518 519 520 521 522 523 524 525

/**
 * Memory related FLAG
 * Name: FLAGS_initial_cpu_memory_in_mb
 * Since Version: 0.14.0
 * Value Range: uint64, default=500 (MB)
 * Example:
 * Note: The CPU memory block size of the initial allocator in MB.
 *       The allocator takes the minimum values of
 *       FLAGS_initial_cpu_memory_in_mb and
 *       FLAGS_fraction_of_cpu_memory_to_use*(total physical memory)
 *       as memory block sizes.
 */
Z
Zeng Jinle 已提交
526
// Defines FLAGS_initial_cpu_memory_in_mb (uint64, default 500). The unit is
// megabytes, as the flag name states.
PADDLE_DEFINE_EXPORTED_uint64(
    initial_cpu_memory_in_mb,
    500ul,
    "Initial CPU memory for PaddlePaddle, in MB unit.");
530

531 532 533
/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cuda_pinned_memory_to_use
534 535
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=0.5
536
 * Example:
537 538 539 540 541 542
 * Note: Represents the proportion of allocated CUDA pinned memory blocks
 *       to the total memory size of the CPU. Future CUDA pinned memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough CUDA pinned memory, new memory blocks of the same
 *       size as the memory block will be allocated from the CUDA pinned
 *       request until the CPU does not have enough memory.
543
 */
Z
Zeng Jinle 已提交
544
// Defines FLAGS_fraction_of_cuda_pinned_memory_to_use (double, default 0.5).
// A space is required after the comma because the two literals are
// concatenated without any separator.
PADDLE_DEFINE_EXPORTED_double(
    fraction_of_cuda_pinned_memory_to_use,
    0.5,
    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle, "
    "reserve the rest for page tables, etc");

550 551
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
552 553 554
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ||      \
    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \
    defined(PADDLE_WITH_CUSTOM_DEVICE)
555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572

/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_gpu_memory_to_use
 * Since Version: 1.2.0
 * Value Range: double, default=0.5 if win32, 0.92 else
 * Example:
 * Note: Represents the proportion of allocated memory blocks to the total
 *       memory size of the GPU. Future memory usage will be allocated from
 *       this memory block. If the memory block does not have enough GPU
 *       memory, new memory blocks of the same size as the memory block will
 *       be allocated from the GPU request until the GPU does not have enough
 *       memory.
 */

573 574 575 576 577 578 579 580
// Platform-dependent default for FLAGS_fraction_of_gpu_memory_to_use.
// The constant is a double because it initializes a double flag: a float
// literal such as 0.92f widens to 0.92000001668930054, so the effective
// default would not be the documented 0.92.
#ifndef _WIN32
constexpr static double fraction_of_gpu_memory_to_use = 0.92;
#else
// fraction_of_gpu_memory_to_use cannot be too high on windows,
// since the win32 graphic sub-system can occupy some GPU memory
// which may lead to insufficient memory left for paddle
constexpr static double fraction_of_gpu_memory_to_use = 0.5;
#endif
Z
Zeng Jinle 已提交
581
PADDLE_DEFINE_EXPORTED_double(
582 583
    fraction_of_gpu_memory_to_use,
    fraction_of_gpu_memory_to_use,
Z
Zeng Jinle 已提交
584 585 586 587 588
    "Allocate a trunk of gpu memory that is this fraction of the "
    "total gpu memory size. Future memory usage will be allocated "
    "from the trunk. If the trunk doesn't have enough gpu memory, "
    "additional trunks of the same size will be requested from gpu "
    "until the gpu has no memory left for another trunk.");
589

590 591 592 593 594 595 596 597 598 599 600 601
/**
 * Memory related FLAG
 * Name: FLAGS_initial_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: Allocate a specified size of GPU memory block. Later memory usage
 *       will be allocated from that memory block. If the memory block does not
 *       have enough GPU memory, the memory block with the size
 *       FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until
 *       the GPU has no remaining memory.
 */
Z
Zeng Jinle 已提交
602
// Defines FLAGS_initial_gpu_memory_in_mb (uint64, default 0). The value is
// expressed in MB, as the flag name states; the help text previously said
// "byte size".
PADDLE_DEFINE_EXPORTED_uint64(
    initial_gpu_memory_in_mb,
    0ul,
    "Allocate a trunk of gpu memory whose size (in MB) is specified by "
    "the flag. Future memory usage will be allocated from the "
    "trunk. If the trunk doesn't have enough gpu memory, additional "
    "trunks of the gpu memory will be requested from gpu with size "
    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
    "no memory left for the additional trunk. Note: if you set this "
    "flag, the memory size set by "
    "FLAGS_fraction_of_gpu_memory_to_use will be overridden by this "
    "flag. If you don't set this flag, PaddlePaddle will use "
    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");

616 617 618 619 620 621 622 623 624
/**
 * Memory related FLAG
 * Name: FLAGS_reallocate_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: If the allocated GPU memory blocks are exhausted,
 *       additional GPU memory blocks are reallocated
 */
Z
Zeng Jinle 已提交
625
PADDLE_DEFINE_EXPORTED_uint64(
626 627
    reallocate_gpu_memory_in_mb,
    0ul,
Z
Zeng Jinle 已提交
628 629 630 631 632
    "If this flag is set, Paddle will reallocate the gpu memory with "
    "size specified by this flag. Else Paddle will reallocate by "
    "FLAGS_fraction_of_gpu_memory_to_use");

PADDLE_DEFINE_EXPORTED_uint64(
633 634
    gpu_memory_limit_mb,
    0UL,
Z
Zeng Jinle 已提交
635 636 637 638 639 640
    "The maximum gpu memory limit that the process can allocate. "
    "If it is equal to 0, there would be no limit and all gpu memory "
    "would be available to the process. If it is larger than 0, "
    "the process would raise out of memory error if the allocated "
    "memory exceeds the limit even though there is available "
    "memory on the gpu card. The unit is MB and default value is 0.");
641

642
#endif
643 644 645 646 647 648 649 650 651

/**
 * Scope related FLAG
 * Name: local_exe_sub_scope_limit
 * Since Version: 1.6.0
 * Value Range: double, default=256 (MB)
 * Example:
 * Note:
 */
Z
Zeng Jinle 已提交
652
PADDLE_DEFINE_EXPORTED_double(
653 654
    local_exe_sub_scope_limit,
    256.0,  // MBytes
Z
Zeng Jinle 已提交
655 656 657 658
    "The memory up limit of sub-scopes of local execution scope for "
    "each CUDAPlace. If you don't need to limit the memory, "
    "you should set FLAGS_local_exe_sub_scope_limit=-1. "
    "The default value is 256 MBytes.");
659

660
PADDLE_DEFINE_EXPORTED_bool(
661 662
    reader_queue_speed_test_mode,
    false,
663 664 665
    "If set true, the queue.pop will only get data from queue but not "
    "remove the data from queue for speed testing");

666 667 668 669 670 671 672 673
/**
 * MKLDNN related FLAG
 * Name: use_mkldnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note:
 */
Z
Zeng Jinle 已提交
674
PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");
675 676 677 678 679 680 681 682 683 684 685 686 687 688 689

/**
 * Debug related FLAG
 * Name: FLAGS_call_stack_level
 * Since Version: 2.0.0
 * Value Range: int32, default=2 if PADDLE_NO_PYTHON is defined, 1 otherwise
 * Example:
 * Note: Used to debug. Determine the call stack to print when error or
 * exception happens.
 * If FLAGS_call_stack_level == 0, only the error message summary will be shown.
 * If FLAGS_call_stack_level == 1, the python stack and error message summary
 * will be shown.
 * If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
 * message summary will be shown.
 */
690 691 692
// Default for FLAGS_call_stack_level: 1 in regular builds, 2 when
// PADDLE_NO_PYTHON is defined.
#ifndef PADDLE_NO_PYTHON
static const int32_t kDefaultCallStackLevel = 1;
#else
static const int32_t kDefaultCallStackLevel = 2;
#endif
695

Z
Zeng Jinle 已提交
696
// Defines FLAGS_call_stack_level (int32, default kDefaultCallStackLevel).
// Each sentence fragment ends with a space because adjacent string literals
// are concatenated verbatim; without it the sentences run together.
PADDLE_DEFINE_EXPORTED_int32(
    call_stack_level,
    kDefaultCallStackLevel,
    "Determine the call stack to print when error or exception happens. "
    // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
    // "If FLAGS_call_stack_level == 0, only the error message summary will be "
    // "shown. "
    "If FLAGS_call_stack_level == 1, the python stack and error message "
    "summary will be shown. "
    "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
    "error message summary will be shown.");
707 708 709 710 711 712 713 714 715 716

/**
 * Debug related FLAG
 * Name: sort_sum_gradient
 * Since Version: 2.0.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, gradients are summed by the reverse order of
 * the forward execution sequence.
 */
717 718
PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient,
                            false,
Z
Zeng Jinle 已提交
719 720
                            "Sum gradients by the reverse order of "
                            "the forward execution sequence.");
721 722 723 724 725 726 727 728 729

/**
 * Performance related FLAG
 * Name: max_inplace_grad_add
 * Since Version: 2.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: The maximum number of inplace grad_add.
 */
Z
Zeng Jinle 已提交
730
// Defines FLAGS_max_inplace_grad_add (int32, default 0).
// The help text is reworded so it reads as a sentence and so the literal
// boundary no longer fuses "grad_add" with "instead".
PADDLE_DEFINE_EXPORTED_int32(
    max_inplace_grad_add,
    0,
    "The maximum number of inplace grad_add. When doing "
    "gradient accumulation, if the number of gradients that need to be "
    "accumulated is less than FLAGS_max_inplace_grad_add, then it will "
    "use several grad_add instead of sum. Default is 0.");
737 738 739 740 741 742 743 744 745

/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_on
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be enabled.
 */
746 747
PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on,
                              "",
Z
Zeng Jinle 已提交
748
                              "List of OneDNN operation types to be turned on");
749 750 751 752 753 754 755 756 757

/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_off
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be disabled.
 */
Z
Zeng Jinle 已提交
758
PADDLE_DEFINE_EXPORTED_string(
759 760
    tracer_mkldnn_ops_off,
    "",
Z
Zeng Jinle 已提交
761
    "List of OneDNN operation types to be turned off");
762

763 764 765 766 767 768 769 770 771
/**
 * Debug related FLAG
 * Name: check_kernel_launch
 * Since Version: 2.1.0
 * Value Range: bool, default=false
 * Example:
 * Note: Check kernel launch status after every kernel compute.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Compiled only when PADDLE_WITH_CUDA or PADDLE_WITH_HIP is defined.
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(check_kernel_launch,
                            false,
                            "Check kernel launch status after every "
                            "kernel compute");
#endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP

778 779 780 781 782 783 784 785 786
/**
 * CUDNN related FLAG
 * Name: conv2d_disable_cudnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: Disable cudnn in conv2d.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Both flags below are compiled only when PADDLE_WITH_CUDA or
// PADDLE_WITH_HIP is defined, and both default to false.
PADDLE_DEFINE_EXPORTED_bool(
    conv2d_disable_cudnn, false, "Disable cudnn in conv2d");

PADDLE_DEFINE_EXPORTED_bool(
    use_fast_math, false, "Whether to use fast math GPU functions.");
#endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
B
Baibaifan 已提交
795 796 797 798 799 800 801 802 803

/**
 * Distributed related FLAG
 * Name: FLAGS_get_host_by_name_time
 * Since Version: 2.2.0
 * Value Range: int32, default=120
 * Example:
 * Note: Get host by name time.
 */
F
fwenguang 已提交
804 805 806
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) ||      \
    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_MLU)
// Int32 flag, default 120; compiled only for the backends tested above.
PADDLE_DEFINE_EXPORTED_int32(
    get_host_by_name_time, 120, "The maximum time for get host by name time");
#endif
811 812 813 814 815 816 817 818 819 820

/**
 * Distributed related FLAG
 * Name: FLAGS_apply_pass_to_program
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_apply_pass_to_program=true would apply IR Pass to
 *          program when using Fleet APIs.
 * Note: Apply IR pass to program. Be only useful when using Fleet APIs.
 */
Z
Zeng Jinle 已提交
821
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(apply_pass_to_program,
                            false,
                            "It controls whether to apply IR pass to program "
                            "when using Fleet APIs");
Y
yaoxuefeng 已提交
825

D
danleifeng 已提交
826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853
/**
 * Distributed related FLAG
 * Name: FLAGS_graph_load_in_parallel
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Controls whether graph nodes and edges are loaded in parallel with
 *       multiple threads. If it is not set, graph data is loaded with one
 *       thread.
 */
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(
    graph_load_in_parallel,
    false,
    "It controls whether to load graph nodes and edges with multiple "
    "threads in parallel.");

/**
 * Distributed related FLAG
 * Name: FLAGS_graph_get_neighbor_id
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Controls whether to get all neighbor ids when running a sub-part
 *       graph. If it is not set, neighbor ids do not need to be fetched, as
 *       when running the whole graph.
 */
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(graph_get_neighbor_id,
                            false,
                            "It controls get all neighbor id when running "
                            "sub part graph.");

F
Feng Xing 已提交
854
/**
L
Liu-xiandong 已提交
855
 * KP kernel related FLAG
F
Feng Xing 已提交
856 857 858
 * Name: FLAGS_run_kp_kernel
 * Since Version: 2.3.0
 * Value Range: bool, default=false
L
Liu-xiandong 已提交
859 860
 * Example: FLAGS_run_kp_kernel=true would use the kp kernel to compute in the
 * Op.
F
Feng Xing 已提交
861 862
 * Note:
 */
863 864
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(
    run_kp_kernel, false, "It controls whether to run PaddlePaddle using KP");
F
Feng Xing 已提交
866

867
/**
868 869 870 871 872 873 874 875 876 877
 * Distributed related FLAG
 * Name: FLAGS_allreduce_record_one_event
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_allreduce_record_one_event=true makes the allreduce
 *          operations wait for only one event instead of multiple events.
 * Note: Makes the allreduce operations wait for only one event instead of
 *       multiple events. Currently, only fuse allreduce supports this.
 *       Otherwise, the precision may be wrong.
 */
878 879
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(
    allreduce_record_one_event,
    false,
    "It controls whether the allreduce operations would only wait one "
    "event instead of multiple events. Currently, only fuse allreduce "
    "supports this. Otherwise, the precision may be wrong.");

885
#ifdef PADDLE_WITH_CINN
886
/*
887 888 889 890 891 892 893 894
 * CINN related FLAG
 * Name: FLAGS_use_cinn
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_use_cinn=true would run PaddlePaddle using CINN
 */
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(use_cinn,
                            false,
                            "It controls whether to run PaddlePaddle using "
                            "CINN");
895 896 897 898 899 900 901 902 903

/*
 * CINN related FLAG
 * Name: FLAGS_allow_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_allow_cinn_ops="mul;relu" would only cover `mul` and `relu`
 * when using CINN
 */
904 905
// String flag, empty by default.
PADDLE_DEFINE_EXPORTED_string(
    allow_cinn_ops,
    "",
    "It controls the cinn op subset to be used, which has the highest "
    "priority.");

/*
 * CINN related FLAG
 * Name: FLAGS_deny_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_deny_cinn_ops="mul;relu" would block `mul` and `relu` two ops
 * when using CINN
 */
917 918
// String flag, empty by default.
PADDLE_DEFINE_EXPORTED_string(
    deny_cinn_ops, "", "It controls the cinn op subset to be not used.");
920 921 922 923 924 925 926 927 928 929

/*
 * CINN related FLAG
 * Name: FLAGS_enable_pe_launch_cinn
 * Since Version: 2.3
 * Value Range: bool, default=true
 * Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled
 * instructions of a paddle graph with ParallelExecutor, otherwise with the
 * CINN compiled runtime program in sequential order.
 */
930 931
// Boolean flag, enabled by default.
PADDLE_DEFINE_EXPORTED_bool(
    enable_pe_launch_cinn,
    true,
    "It controls whether to execute cinn compiled program with "
    "ParallelExecutor");

/*
 * CINN related FLAG
 * Name: FLAGS_enable_cinn_auto_tune
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its
 * auto-tune feature enabled
 */
943 944
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(
    enable_cinn_auto_tune,
    false,
    "It controls whether to use cinn with its auto-tune feature enabled");

948
#endif
949

950 951
// Int32 flag, default 2000000.
DEFINE_int32(
    record_pool_max_size,
    2000000,
    "SlotRecordDataset slot record pool max size");
// Int32 flag, default 1.
DEFINE_int32(slotpool_thread_num,
             1,
             "SlotRecordDataset slot pool thread num");
954 955
// Boolean flag, disabled by default.
DEFINE_bool(enable_slotpool_wait_release,
            false,
            "enable slotrecord object wait release, default false");
957 958
// Boolean flag, disabled by default.
DEFINE_bool(enable_slotrecord_reset_shrink,
            false,
            "enable slotrecord object reset shrink memory, default false");
960 961
// Boolean flag, disabled by default.
DEFINE_bool(
    enable_ins_parser_file, false, "enable parser ins file, default false");
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(gpugraph_enable_hbm_table_collision_stat,
                            false,
                            "enable hash collisions stat for hbm table, "
                            "default false");
// Double flag, default 0.75.
PADDLE_DEFINE_EXPORTED_double(
    gpugraph_hbm_table_load_factor,
    0.75,
    "the load factor of hbm table, default 0.75");
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_gpu_direct_access,
    false,
    "enable direct access between multi gpu cards, default false");
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(gpugraph_enable_segment_merge_grads,
                            false,
                            "enable segment merge gradients while push "
                            "sparse, default false");
// Uint64 flag, default 128.
PADDLE_DEFINE_EXPORTED_uint64(gpugraph_merge_grads_segment_size,
                              128,
                              "segment size with segment gradient merge, "
                              "default 128");
// Int32 flag, default 0.
PADDLE_DEFINE_EXPORTED_int32(gpugraph_dedup_pull_push_mode,
                             0,
                             "enable dedup keys while pull push sparse, "
                             "default 0");
// Boolean flag, enabled by default.
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_load_node_list_into_hbm,
    true,
    "enable load_node_list_into_hbm, default true");
989 990 991 992 993 994 995 996 997 998 999 1000

/**
 * ProcessGroupNCCL related FLAG
 * Name: nccl_blocking_wait
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: nccl blocking wait.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Compiled only when PADDLE_WITH_CUDA or PADDLE_WITH_HIP is defined.
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait,
                            false,
                            "nccl blocking wait");
#endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
1001 1002 1003 1004 1005 1006 1007 1008 1009

/**
 * Autotune related FLAG
 * Name: FLAGS_use_autotune
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
 */
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(use_autotune,
                            false,
                            "Whether enable autotune.");
1010

H
hong 已提交
1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021
/**
 * Conv Search cache max number related FLAG
 * Name: FLAGS_search_cache_max_number
 * Since Version: 2.3.0
 * Value Range: int32, default=1000000
 * Example:
 */
// Int32 flag, default 1000000.
PADDLE_DEFINE_EXPORTED_int32(
    search_cache_max_number, 1000000, "search_cache_max_number.");

1022 1023 1024 1025 1026 1027 1028 1029 1030 1031
/**
 * Performance related FLAG
 * Name: einsum_opt
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, EinsumOp will be optimized by innercache reuse, which
 * uses more gpu memory.
 */
// Boolean flag, disabled by default.
PADDLE_DEFINE_EXPORTED_bool(einsum_opt,
                            false,
                            "EinsumOp backward will be speedup at the "
                            "expense of more gpu memory.");
1035 1036 1037 1038 1039 1040

/**
 * JitLayer related FLAG
 * Name: FLAGS_jit_engine_type
 * Since Version: 2.3.0
 * Value Range: string, {Executor, PE, New, Predictor},
 * default=Predictor
1042 1043
 * Example:
 * Note:
1044 1045
 * FLAGS_jit_engine_type == Executor, using ExecutorEngine by default
 * FLAGS_jit_engine_type == PE, using PEEngine by default
1046
 * FLAGS_jit_engine_type == New, using InterpreterEngine by default
1047
 * FLAGS_jit_engine_type == Predictor, using inference Predictor by default
1048 1049
 */
// String flag, default "Predictor".
PADDLE_DEFINE_EXPORTED_string(
    jit_engine_type,
    "Predictor",
    "Choose default function type in JitLayer.");
1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063

#ifdef PADDLE_WITH_CUDNN_FRONTEND
/**
 * CUDNNv8 related FLAG
 * Name: enable_cudnn_frontend
 * Since Version: 2.5.0
 * Value Range: bool, default=false
 * Example:
 * Note: Enable CUDNNv8 Frontend API for CUDNN kernels.
 */
// The help text mirrors the Note above so the flag is not registered with an
// empty description.
PADDLE_DEFINE_EXPORTED_bool(
    enable_cudnn_frontend,
    false,
    "Enable CUDNNv8 Frontend API for CUDNN kernels.");
#endif  // PADDLE_WITH_CUDNN_FRONTEND