// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/platform/flags.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#endif

namespace paddle {
namespace platform {

const ExportedFlagInfoMap &GetExportedFlagInfoMap() {
  return *GetMutableExportedFlagInfoMap();
}

ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() {
  static ExportedFlagInfoMap g_exported_flag_info_map;
  return &g_exported_flag_info_map;
}

}  // namespace platform
}  // namespace paddle
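
// A minimal usage sketch of the registry accessors above (illustrative, not
// part of this file). It assumes only that ExportedFlagInfoMap is an
// associative container keyed by flag name, as the accessors suggest:
//
//   #include <iostream>
//   #include "paddle/fluid/platform/flags.h"
//
//   void DumpExportedFlagNames() {
//     for (const auto &kv : paddle::platform::GetExportedFlagInfoMap()) {
//       std::cout << kv.first << "\n";  // flag name, e.g. "check_nan_inf"
//     }
//   }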

PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism,
                             0,
                             "number of threads for inner op");

/**
 * NOTE(paddle-dev): This file is designed to define all public FLAGS.
 */
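
// Defining a new public flag follows the macro pattern used throughout this
// file; a sketch with a hypothetical flag name:
//
//   PADDLE_DEFINE_EXPORTED_bool(
//       my_new_feature,  // hypothetical name
//       false,
//       "One-line description stored in the exported flag info.");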

/**
 * Paddle initialization related FLAG
 * Name: FLAGS_paddle_num_threads
 * Since Version: 0.15.0
 * Value Range: int32, default=1
 * Example: FLAGS_paddle_num_threads=2, set the maximum thread number per
 * instance to 2
 * Note:
 */
PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads,
                             1,
                             "Number of threads for each paddle instance.");

/**
 * Operator related FLAG
 * Name: FLAGS_check_nan_inf
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: Used to debug. Checking whether operators produce NAN/INF or not.
 */
PADDLE_DEFINE_EXPORTED_bool(
    check_nan_inf,
    false,
    "Checking whether operator produce NAN/INF or not. It will be "
    "extremely slow so please use this flag wisely.");

/**
 * Operator related FLAG
 * Name: FLAGS_check_nan_inf_level
 * Since Version: 2.5.0
 * Value Range: int32, default=0
 * Example:
 * Note: Used to debug. Setting the check and print level when
 * FLAGS_check_nan_inf is set.
 * - 0, abort the process when any operator produces NAN/INF and only print
 *   the information of the tensors which hold NAN/INF.
 * - 1, continue the training or inference process and print the information
 *   of all tensors which hold NAN/INF.
 * - 2, print the information of float tensors when the max or min value
 *   overflows float16's limits.
 * - 3, print the information of all tensors.
 */
PADDLE_DEFINE_EXPORTED_int32(
    check_nan_inf_level,
    0,
    "Setting the check and print level when FLAGS_check_nan_inf is set.");

/**
 * Operator related FLAG
 * Name: FLAGS_enable_opt_get_features
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: Whether to enable the optimized path for getting features.
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_opt_get_features,
    false,
    "Whether to enable the optimized path for getting features.");

// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_ASCEND_CL)

/**
 * CUDA related FLAG
 * Name: FLAGS_enable_cublas_tensor_op_math
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use Tensor Cores; faster, but it may lose precision.
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_cublas_tensor_op_math,
    false,
    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
    "but it may loss precision. Currently, There are two CUDA libraries that"
    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
    " GEMM computations(the matrices must be either half precision or single "
    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
    "input and output must be half precision) and recurrent neural networks "
    "(RNNs).");

/**
 * CUDA related FLAG
 * Name: FLAGS_gemm_use_half_precision_compute_type
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example:
 * Note: whether to use fp16 as the compute type when the input and output
 * are fp16; faster, but it may lose precision.
 */
PADDLE_DEFINE_EXPORTED_bool(
    gemm_use_half_precision_compute_type,
    true,
    "Whether to use fp16 compute type when the input and output is fp16, "
    "faster but it may loss precision in most case. If true, the compute "
    "type will be set to fp32. Default is true.");

/**
 * CUDA related FLAG
 * Name: FLAGS_selected_gpus
 * Since Version: 1.3.0
 * Value Range: integer list separated by comma, default empty list
 * Example: FLAGS_selected_gpus=0,1,2,3,4,5,6,7 to train or predict with 0~7 gpu
 * cards
 * Note: A list of device ids separated by comma, like: 0,1,2,3
 */
PADDLE_DEFINE_EXPORTED_string(
    selected_gpus,
    "",
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (GPU). If you want to use "
    "all visible devices, set this to empty string. NOTE: the "
    "reason of doing this is that we want to use P2P communication"
    "between GPU devices, use CUDA_VISIBLE_DEVICES can only use"
    "share-memory only.");
#endif

#if defined(PADDLE_WITH_CUDA)
/**
 * CUDA related FLAG
 * Name: FLAGS_cublaslt_exhaustive_search_times
 * Since Version: 2.3.0
 * Value Range: int64_t, default=0
 * Example:
 * Note: Represents times of exhaustive search to evaluate performance of
 *       cuBlasLt matmul algorithm (with/without epilogue). Set this flag
 *       with value > 0 to enable exhaustive search. Default is 0, means
 *       getting algorithms via heuristic search. There are two search methods
 *       in cuBlasLt, heuristic search and exhaustive search. Exhaustive search
 *       attempts all cuBlasLt algorithms to select the fastest, which is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification. Once you change the layer specifications
 *       (such as M, N and K), it will search again.
 */
PADDLE_DEFINE_EXPORTED_int64(
    cublaslt_exhaustive_search_times,
    0,
    "The times of exhaustive search for cuBlasLt matmul with/without "
    " epilogue algorithms, default is 0, means disabling exhaustive search.");
#endif

#if defined(PADDLE_WITH_ASCEND_CL)
PADDLE_DEFINE_EXPORTED_string(
    selected_npus,
    "",
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (NPU). If you want to use "
    "all visible devices, set this to empty string.");
PADDLE_DEFINE_EXPORTED_bool(
    hccl_check_nan,
    true,
    "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
    "core when meets Nan value");
PADDLE_DEFINE_EXPORTED_string(
    npu_config_path,
    "",
    "The absolute path of configuration json file, like: /tmp/config.json. "
    "If proveided, it will be passed to aclInit().");
PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling,
                             1,
                             "set minmum loss scaling value!");
PADDLE_DEFINE_EXPORTED_string(
    npu_precision_mode,
    "",
    "NPU operator precision mode, options are 'force_fp32', 'force_fp16', "
    "'allow_fp32_to_fp16', 'must_keep_origin_dtype' and "
    "'allow_mix_precision'. If you want to use the default mode ("
    "allow_fp32_to_fp16), set this to empty string. For more details, "
    "please refer to the documents");
#endif

/*
 * Kernel related FLAG
 * Name: FLAGS_enable_api_kernel_fallback
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example: FLAGS_enable_api_kernel_fallback=true would allow the kernel of
 * the current backend to fall back to the CPU one when not found
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_api_kernel_fallback,
    true,
    "Whether enable api kernel fallback to CPU one when not found");

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_deterministic
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use deterministic algorithm in cudnn.
 *       If true, it will slow down some operators such as conv and pooling.
 */
PADDLE_DEFINE_EXPORTED_bool(
    cudnn_deterministic,
    false,
    "Whether allow using an autotuning algorithm for convolution "
    "operator. The autotuning algorithm may be non-deterministic. If "
    "true, the algorithm is deterministic.");

/**
 * CUDNN related FLAG
 * Name: FLAGS_conv_workspace_size_limit
 * Since Version: 0.13.0
 * Value Range: uint64, default=512 (MB)
 * Example:
 * Note: The internal function of cuDNN obtains the fastest matching algorithm
 *       within this memory limit. Usually, faster algorithms can be chosen
 *       in larger workspaces, but memory consumption can also increase
 *       significantly. Users need to balance memory and speed.
 */
PADDLE_DEFINE_EXPORTED_int64(conv_workspace_size_limit,
                             paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
                             "cuDNN convolution workspace limit in MB unit.");

/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Represents whether an exhaustive search method is used to
 *       select a convolution algorithm. There are two search methods in cuDNN,
 *       heuristic search and exhaustive search. Exhaustive search attempts
 *       all cuDNN algorithms to select the fastest. This method is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification. Once you change the layer specifications
 *       (such as batch size, feature map size), it will search again.
 */
PADDLE_DEFINE_EXPORTED_bool(
    cudnn_exhaustive_search,
    false,
    "Whether enable exhaustive search for cuDNN convolution or "
    "not, default is False.");

/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search_times
 * Since Version:
 * Value Range:
 * Example:
 * Note: only used in prediction, for advanced developers
 */
PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times,
                             -1,
                             "Exhaustive search times for cuDNN convolution, "
                             "default is -1, not exhaustive search");

/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_batchnorm_spatial_persistent
 * Since Version: 1.4.0
 * Value Range: bool, default=false
 * Example:
 * Note: CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be
 *       faster in some tasks because an optimized path may be selected for
 *       the CUDNN_DATA_FLOAT and CUDNN_DATA_HALF data types on GPUs with
 *       compute capability 6.0 or higher. The reason we set it to false by
 *       default is that this mode may use scaled atomic integer reduction,
 *       which may cause a numerical overflow for certain input data ranges.
 */
PADDLE_DEFINE_EXPORTED_bool(
    cudnn_batchnorm_spatial_persistent,
    false,
    "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
    "batch_norm, default is False.");
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

/**
 * NCCL related FLAG
 * Name: FLAGS_sync_nccl_allreduce
 * Since Version: 1.3
 * Value Range: bool, default=true
 * Example:
 * Note: asynchronous nccl allreduce or synchronous issue:
 *       https://github.com/PaddlePaddle/Paddle/issues/15049
 *       If you want to change this default value, explain why. (gongwb)
 */
PADDLE_DEFINE_EXPORTED_bool(
    sync_nccl_allreduce,
    true,
    "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
    "after allreduce, this mode can get better performance in some scenarios.");
#endif

#ifdef PADDLE_WITH_DISTRIBUTE
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_max_merge_var_num
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: The maximum number of gradients to be merged into a gradient and
 *       sent through the communicator. The trainer puts all the gradients
 *       into the queue, and then the communicator takes the gradients out
 *       of the queue and sends them after merging.
 */
PADDLE_DEFINE_EXPORTED_int32(communicator_max_merge_var_num,
                             20,
                             "max var num to merge and send");
PADDLE_DEFINE_EXPORTED_bool(
    communicator_is_sgd_optimizer,
    true,
    "gradient sent to the server is the sum of the gradients "
    "calculated by each thread if optimizer is sgd");
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_send_queue_size
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: Size for each gradient queue. The trainer puts the gradient into
 *       the queue, and then the communicator takes it out of the queue and
 *       sends it out. When the communicator is slow, the queue may be full,
 *       and the trainer will be continuously blocked before the queue has
 *       space. It is used to avoid training running much faster than
 *       communication, so that gradients do not pile up faster than they
 *       can be sent.
 */
PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size,
                             20,
                             "queue size to recv gradient before send");
#endif

/**
 * Distributed related FLAG
 * Name: FLAGS_dist_threadpool_size
 * Since Version: 1.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: Control the number of threads used for distributed modules.
 *       If it is not set, a built-in default is used.
 */
PADDLE_DEFINE_EXPORTED_int32(
    dist_threadpool_size,
    0,
    "number of threads used for distributed executed.");

/**
 * Garbage collector related FLAG
 * Name: FLAGS_eager_delete_tensor_gb
 * Since Version: 1.0.0
 * Value Range: double, default=kDefaultEagerDeleteTensorGB
 * Example: FLAGS_eager_delete_tensor_gb=0.0, Release memory garbage once it is
 * no longer used.
 *          FLAGS_eager_delete_tensor_gb=1.0, Release memory garbage when
 * garbage occupies 1.0GB of memory.
 *          FLAGS_eager_delete_tensor_gb=-1.0, Disable garbage collection
 * policy.
 * Note: Represents whether a garbage collection strategy is used to optimize
 * network memory usage.
 *       It is recommended that users set FLAGS_eager_delete_tensor_gb=0.0 to
 *       enable garbage collection strategy when training large networks.
 */
// Disable gc by default when inference library is built
static const double kDefaultEagerDeleteTensorGB = 0;

PADDLE_DEFINE_EXPORTED_double(
    eager_delete_tensor_gb,
    kDefaultEagerDeleteTensorGB,
    "Memory size threshold (GB) when the garbage collector clear tensors."
    "Disabled when this value is less than 0");

/**
 * Memory related FLAG
 * Name: FLAGS_fast_eager_deletion_mode
 * Since Version: 1.3.0
 * Value Range: bool, default=true
 * Example:
 * Note: Whether to use fast garbage collection strategy.
 *       If not set, the GPU memory is released at the end of the CUDA kernel.
 *       Otherwise, the GPU memory will be released before the CUDA kernel
 *       has finished, which will make the garbage collection strategy faster.
 *       Only works when garbage collection strategy is enabled.
 */
PADDLE_DEFINE_EXPORTED_bool(
    fast_eager_deletion_mode,
    true,
    "Fast eager deletion mode. If enabled, memory would release "
    "immediately without waiting GPU kernel ends.");

/**
 * Memory related FLAG
 * Name: FLAGS_memory_fraction_of_eager_deletion
 * Since Version: 1.4
 * Value Range: double [0.0, 1.0], default=1.0
 * Example:
 * Note: The percentage of memory size of garbage collection policy
 *       to release variables.
 *       If FLAGS_memory_fraction_of_eager_deletion = 1.0,
 *       all temporary variables in the network will be released.
 *       If FLAGS_memory_fraction_of_eager_deletion = 0.0,
 *       no temporary variables in the network are released.
 *       If 0.0 < FLAGS_memory_fraction_of_eager_deletion < 1.0,
 *       all temporary variables will be sorted in descending order
 *       according to their memory size, and only variables with the
 *       largest FLAGS_memory_fraction_of_eager_deletion ratio will be released.
 *       The flag is only valid when running parallel data compilers.
 */
PADDLE_DEFINE_EXPORTED_double(
    memory_fraction_of_eager_deletion,
    1.0,
    "Fraction of eager deletion. If less than 1.0, all variables in "
    "the program would be sorted according to its memory size, and "
    "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
    "variables would be deleted.");

/**
 * Allocator related FLAG
 * Name: FLAGS_allocator_strategy
 * Since Version: 1.2
 * Value Range: string, {naive_best_fit, auto_growth, thread_local},
 * default=auto_growth
 * Example:
 * Note: For selecting allocator policy of PaddlePaddle.
 */
static constexpr char kDefaultAllocatorStrategy[] = "auto_growth";
PADDLE_DEFINE_EXPORTED_string(
    allocator_strategy,
    kDefaultAllocatorStrategy,
    "The allocation strategy, enum in [naive_best_fit, auto_growth]. "
    "naive_best_fit means the original pre-allocated allocator of Paddle. "
    "auto_growth means the auto-growth allocator. "
    "These two strategies differ in GPU memory allocation. "
    "naive_best_fit strategy would occupy almost all GPU memory by default, "
    "which prevents users from starting several Paddle jobs on the same GPU "
    "card but leads to less memory fragmentation (i.e., maximum batch "
    "size of models may be larger). auto_growth strategy would allocate "
    "GPU memory on demand, which allows users to start several Paddle jobs "
    "on the same GPU card but may lead to more memory fragmentation "
    "(i.e., maximum batch size of models may be smaller).");

/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cpu_memory_to_use
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=1
 * Example:
 * Note: Represents the proportion of allocated CPU memory blocks
 *       to the total memory size of the CPU. Future CPU memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough CPU memory, new memory blocks of the same size
 *       will be allocated until the CPU does not have enough memory.
 */
PADDLE_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use,
                              1,
                              "Default use 100% of CPU memory for PaddlePaddle,"
                              "reserve the rest for page tables, etc");

/**
 * Memory related FLAG
 * Name: FLAGS_initial_cpu_memory_in_mb
 * Since Version: 0.14.0
 * Value Range: uint64, default=500 (MB)
 * Example:
 * Note: The CPU memory block size of the initial allocator in MB.
 *       The allocator takes the minimum values of
 *       FLAGS_initial_cpu_memory_in_mb and
 *       FLAGS_fraction_of_cpu_memory_to_use*(total physical memory)
 *       as memory block sizes.
 */
PADDLE_DEFINE_EXPORTED_uint64(
    initial_cpu_memory_in_mb,
    500ul,
    "Initial CPU memory for PaddlePaddle, in MD unit.");

/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cuda_pinned_memory_to_use
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=0.5
 * Example:
 * Note: Represents the proportion of allocated CUDA pinned memory blocks
 *       to the total memory size of the CPU. Future CUDA pinned memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough memory, new memory blocks of the same size
 *       will be allocated until the CPU does not have enough memory.
 */
PADDLE_DEFINE_EXPORTED_double(
    fraction_of_cuda_pinned_memory_to_use,
    0.5,
    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
    "reserve the rest for page tables, etc");

// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ||      \
    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \
    defined(PADDLE_WITH_CUSTOM_DEVICE)

/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_gpu_memory_to_use
 * Since Version: 1.2.0
 * Value Range: double, default=0.5 if win32, 0.92 else
 * Example:
 * Note: Represents the proportion of allocated memory blocks to the total
 *       memory size of the GPU. Future memory usage will be allocated from
 *       this memory block. If the memory block does not have enough GPU
 *       memory, new memory blocks of the same size will be allocated from
 *       the GPU until the GPU does not have enough memory.
 */

#ifndef _WIN32
constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
#else
// fraction_of_gpu_memory_to_use cannot be too high on windows,
// since the win32 graphic sub-system can occupy some GPU memory
// which may lead to insufficient memory left for paddle
constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
#endif
PADDLE_DEFINE_EXPORTED_double(
    fraction_of_gpu_memory_to_use,
    fraction_of_gpu_memory_to_use,
    "Allocate a trunk of gpu memory that is this fraction of the "
    "total gpu memory size. Future memory usage will be allocated "
    "from the trunk. If the trunk doesn't have enough gpu memory, "
    "additional trunks of the same size will be requested from gpu "
    "until the gpu has no memory left for another trunk.");

/**
 * Memory related FLAG
 * Name: FLAGS_initial_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: Allocate a specified size of GPU memory block. Later memory usage
 *       will be allocated from that memory block. If the memory block does not
 *       have enough GPU memory, the memory block with the size
 *       FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until
 *       the GPU has no remaining memory.
 */
PADDLE_DEFINE_EXPORTED_uint64(
    initial_gpu_memory_in_mb,
    0ul,
    "Allocate a trunk of gpu memory whose byte size is specified by "
    "the flag. Future memory usage will be allocated from the "
    "trunk. If the trunk doesn't have enough gpu memory, additional "
    "trunks of the gpu memory will be requested from gpu with size "
    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
    "no memory left for the additional trunk. Note: if you set this "
    "flag, the memory size set by "
    "FLAGS_fraction_of_gpu_memory_to_use will be overrided by this "
    "flag. If you don't set this flag, PaddlePaddle will use "
    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");

/**
 * Memory related FLAG
 * Name: FLAGS_reallocate_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: If the allocated GPU memory blocks are exhausted,
 *       additional GPU memory blocks are reallocated
 */
PADDLE_DEFINE_EXPORTED_uint64(
    reallocate_gpu_memory_in_mb,
    0ul,
    "If this flag is set, Paddle will reallocate the gpu memory with "
    "size specified by this flag. Else Paddle will reallocate by "
    "FLAGS_fraction_of_gpu_memory_to_use");

PADDLE_DEFINE_EXPORTED_uint64(
    gpu_memory_limit_mb,
    0UL,
    "The maximum gpu memory limit that the process can allocate. "
    "If it is equal to 0, there would be no limit and all gpu memory "
    "would be available to the process. If it is larger than 0, "
    "the process would raise out of memory error if the allocated "
    "memory exceeds the limit even though there is available "
    "memory on the gpu card. The unit is MB and default value is 0.");

#endif

/**
 * Scope related FLAG
 * Name: local_exe_sub_scope_limit
 * Since Version: 1.6.0
 * Value Range: double, default=256 (MB)
 * Example:
 * Note:
 */
PADDLE_DEFINE_EXPORTED_double(
    local_exe_sub_scope_limit,
    256.0,  // MBytes
    "The memory up limit of sub-scopes of local execution scope for "
    "each CUDAPlace. If you don't need to limit the memory, "
    "you should set FLAGS_local_exe_sub_scope_limit=-1. "
    "The default value is 256 MBytes.");

PADDLE_DEFINE_EXPORTED_bool(
    reader_queue_speed_test_mode,
    false,
    "If set true, the queue.pop will only get data from queue but not "
    "remove the data from queue for speed testing");

/**
 * MKLDNN related FLAG
 * Name: use_mkldnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note:
 */
PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");

/**
 * Debug related FLAG
 * Name: FLAGS_call_stack_level
 * Since Version: 2.0.0
 * Value Range: int, default=2
 * Example:
 * Note: Used to debug. Determine the call stack to print when an error or
 * exception happens.
 * If FLAGS_call_stack_level == 0, only the error message summary will be shown.
 * If FLAGS_call_stack_level == 1, the python stack and error message summary
 * will be shown.
 * If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
 * message summary will be shown.
 */
#ifdef PADDLE_NO_PYTHON
static const int32_t kDefaultCallStackLevel = 2;
#else
static const int32_t kDefaultCallStackLevel = 1;
#endif

PADDLE_DEFINE_EXPORTED_int32(
    call_stack_level,
    kDefaultCallStackLevel,
    "Determine the call stack to print when error or exeception happens."
    // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
    // "If FLAGS_call_stack_level == 0, only the error message summary will be "
    // "shown. "
    "If FLAGS_call_stack_level == 1, the python stack and error message "
    "summary will be shown."
    "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
    "error message summary will be shown.");

/**
 * Debug related FLAG
 * Name: sort_sum_gradient
 * Since Version: 2.0.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, gradients are summed by the reverse order of
 * the forward execution sequence.
 */
PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient,
                            false,
                            "Sum gradients by the reverse order of "
                            "the forward execution sequence.");

/**
 * Performance related FLAG
 * Name: max_inplace_grad_add
 * Since Version: 2.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: The maximum number of inplace grad_add.
 */
PADDLE_DEFINE_EXPORTED_int32(
    max_inplace_grad_add,
    0,
    "The maximum number of inplace grad_add. When doing "
    "gradient accumulation, if the number of gradients need to that "
    "less FLAGS_max_inplace_grad_add, than it will be use several grad_add"
    "instead of sum. Default is 0.");

/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_on
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be enabled.
 */
PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on,
                              "",
                              "List of OneDNN operation types to be turned on");

/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_off
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be disabled.
 */
PADDLE_DEFINE_EXPORTED_string(
    tracer_mkldnn_ops_off,
    "",
    "List of OneDNN operation types to be turned off");

/**
 * Debug related FLAG
 * Name: check_kernel_launch
 * Since Version: 2.1.0
 * Value Range: bool, default=false
 * Example:
 * Note: Check kernel launch status after every kernel compute.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_DEFINE_EXPORTED_bool(
    check_kernel_launch,
    false,
    "Check kernel launch status after every kernel compute");
#endif

/**
 * CUDNN related FLAG
 * Name: conv2d_disable_cudnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: Disable cudnn in conv2d.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_DEFINE_EXPORTED_bool(conv2d_disable_cudnn,
                            false,
                            "Disable cudnn in conv2d");

PADDLE_DEFINE_EXPORTED_bool(use_fast_math,
                            false,
                            "Whether to use fast math GPU functions.");
#endif

/**
 * Distributed related FLAG
 * Name: FLAGS_get_host_by_name_time
 * Since Version: 2.2.0
 * Value Range: int32, default=120
 * Example:
 * Note: The maximum time to wait when getting a host by name.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) ||      \
    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_MLU)
PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time,
                             120,
                             "The maximum time for get host by name time");
#endif

/**
 * Distributed related FLAG
 * Name: FLAGS_apply_pass_to_program
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_apply_pass_to_program=true would apply IR Pass to
 *          program when using Fleet APIs.
 * Note: Apply IR pass to program. Only useful when using Fleet APIs.
 */
PADDLE_DEFINE_EXPORTED_bool(
    apply_pass_to_program,
    false,
    "It controls whether to apply IR pass to program when using Fleet APIs");

/**
 * Distributed related FLAG
 * Name: FLAGS_graph_load_in_parallel
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Control whether graph nodes and edges are loaded with multiple
 *       threads in parallel.
 *       If it is not set, graph data is loaded with a single thread.
 */
PADDLE_DEFINE_EXPORTED_bool(graph_load_in_parallel,
                            false,
                            "It controls whether load graph node and edge with "
                            "mutli threads parallely.");

/**
 * Distributed related FLAG
 * Name: FLAGS_graph_get_neighbor_id
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Control whether to get all neighbor ids when running a sub-part
 *       graph. If it is not set, neighbor ids are not fetched when running
 *       the whole graph.
 */
PADDLE_DEFINE_EXPORTED_bool(
    graph_get_neighbor_id,
    false,
    "It controls get all neighbor id when running sub part graph.");

/**
 * KP kernel related FLAG
 * Name: FLAGS_run_kp_kernel
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example: FLAGS_run_kp_kernel=true would use the kp kernel to compute in the
 * Op.
 * Note:
 */
PADDLE_DEFINE_EXPORTED_bool(run_kp_kernel,
                            false,
                            "It controls whether to run PaddlePaddle using KP");

/**
 * Distributed related FLAG
 * Name: FLAGS_allreduce_record_one_event
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_allreduce_record_one_event=true makes the allreduce
 *          operations wait for only one event instead of multiple events.
 * Note: Make the allreduce operations wait for only one event instead of
 *       multiple events. Currently, only fused allreduce supports this.
 *       Otherwise, the precision may be wrong.
 */
PADDLE_DEFINE_EXPORTED_bool(allreduce_record_one_event,
                            false,
                            "It controls whether the allreduce operations "
                            "would only wait one event instead of multiple "
                            "events. Currently, only fuse allreduce supports "
                            "this. Otherwise, the precision may be wrong.");

#ifdef PADDLE_WITH_CINN
/*
 * CINN related FLAG
 * Name: FLAGS_use_cinn
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_use_cinn=true would run PaddlePaddle using CINN
 */
PADDLE_DEFINE_EXPORTED_bool(
    use_cinn, false, "It controls whether to run PaddlePaddle using CINN");

/*
 * CINN related FLAG
 * Name: FLAGS_allow_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_allow_cinn_ops="mul;relu" would only cover `mul` and `relu`
 * when using CINN
 */
PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops,
                              "",
                              "It controls the cinn op subset to be used, "
                              "which has the highest priority.");

/*
 * CINN related FLAG
 * Name: FLAGS_deny_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_deny_cinn_ops="mul;relu" would block the two ops `mul` and `relu`
 * when using CINN
 */
PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops,
                              "",
                              "It controls the cinn op subset to be not used.");

/*
 * CINN related FLAG
 * Name: FLAGS_enable_pe_launch_cinn
 * Since Version: 2.3
 * Value Range: bool, default=true
 * Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled
 * instructions of a paddle graph with ParallelExecutor, otherwise with the
 * CINN compiled runtime program in sequential order.
 */
PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn,
                            true,
                            "It controls whether to execute cinn compiled "
                            "program with ParallelExecutor");

/*
 * CINN related FLAG
 * Name: FLAGS_enable_cinn_auto_tune
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its
 * auto-tune feature enabled
 */
PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune,
                            false,
                            "It controls whether to use cinn with "
                            "its auto-tune feature enabled");

#endif

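// NOTE: the DEFINE_* flags below use the raw gflags macros directly, so
// (unlike the PADDLE_DEFINE_EXPORTED_* flags above) they are presumably not
// registered in the exported flag info map.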
DEFINE_int32(record_pool_max_size,
             2000000,
             "SlotRecordDataset slot record pool max size");
DEFINE_int32(slotpool_thread_num, 1, "SlotRecordDataset slot pool thread num");
DEFINE_bool(enable_slotpool_wait_release,
            false,
            "enable slotrecord obejct wait release, default false");
DEFINE_bool(enable_slotrecord_reset_shrink,
            false,
            "enable slotrecord obejct reset shrink memory, default false");
DEFINE_bool(enable_ins_parser_file,
            false,
            "enable parser ins file, default false");
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_hbm_table_collision_stat,
    false,
    "enable hash collisions stat for hbm table, default false");
PADDLE_DEFINE_EXPORTED_double(gpugraph_hbm_table_load_factor,
                              0.75,
                              "the load factor of hbm table, default 0.75");
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_gpu_direct_access,
    false,
    "enable direct access bwtween multi gpu cards, default false");
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_segment_merge_grads,
    false,
    "enable segment merge gradients while push sparse, default false");
PADDLE_DEFINE_EXPORTED_uint64(
    gpugraph_merge_grads_segment_size,
    128,
    "segment size with segment gradient merge, default 128");
PADDLE_DEFINE_EXPORTED_int32(
    gpugraph_dedup_pull_push_mode,
    0,
    "enable dedup keys while pull push sparse, default 0");
PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm,
                            true,
                            "enable load_node_list_into_hbm, default true");

/**
 * ProcessGroupNCCL related FLAG
 * Name: nccl_blocking_wait
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: nccl blocking wait.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait");
#endif

/**
 * Autotune related FLAG
 * Name: FLAGS_use_autotune
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
 */
PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether to enable autotune.");

/**
 * Conv Search cache max number related FLAG
 * Name: FLAGS_search_cache_max_number
 * Since Version: 2.3.0
 * Value Range: int32, default=1000000
 * Example:
 */
PADDLE_DEFINE_EXPORTED_int32(search_cache_max_number,
                             1000000,
                             "search_cache_max_number.");

/**
 * Performance related FLAG
 * Name: einsum_opt
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, EinsumOp will be optimized by inner-cache reuse, which
 * uses more gpu memory.
 */
PADDLE_DEFINE_EXPORTED_bool(
    einsum_opt,
    false,
    "EinsumOp backward will be speedup at the expense of more gpu memory.");

/**
 * JitLayer related FLAG
 * Name: FLAGS_jit_engine_type
 * Since Version: 2.3.0
 * Value Range: string, {New, Predictor},
 * default=Predictor
 * Example:
 * Note:
 * FLAGS_jit_engine_type == New, using InterpreterEngine by default
 * FLAGS_jit_engine_type == Predictor, using inference Predictor by default
 */
PADDLE_DEFINE_EXPORTED_string(jit_engine_type,
                              "Predictor",
                              "Choose default function type in JitLayer.");

#ifdef PADDLE_WITH_CUDNN_FRONTEND
/**
 * CUDNNv8 related FLAG
 * Name: enable_cudnn_frontend
 * Since Version: 2.5.0
 * Value Range: bool, default=false
 * Example:
 * Note: Enable CUDNNv8 Frontend API for CUDNN kernels.
 */
PADDLE_DEFINE_EXPORTED_bool(enable_cudnn_frontend, false, "");

/**
 * CUDNNv8 related FLAG
 * Name: cudnn_cache_saturation_count
 * Since Version: 2.5.0
 * Value Range: int64_t, default=1
 * Example:
 * Note: Set saturation count for CUDNNv8 cache. A candidate execution
 * plan needs to be considered as the fastest plan by exhaustive search
 * N times before it is actually added in the cache. It is useful when
 * the result of exhaustive search is unstable.
 */
PADDLE_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, "");
#endif  // PADDLE_WITH_CUDNN_FRONTEND