// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/platform/flags.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#endif

namespace paddle {
namespace platform {

const ExportedFlagInfoMap &GetExportedFlagInfoMap() {
  return *GetMutableExportedFlagInfoMap();
}

ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() {
  static ExportedFlagInfoMap g_exported_flag_info_map;
  return &g_exported_flag_info_map;
}

}  // namespace platform
}  // namespace paddle
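
// Illustrative sketch (an assumption, not part of the original file): tools
// can enumerate the exported-flag registry returned above, e.g.
//
//   for (const auto &kv : paddle::platform::GetExportedFlagInfoMap()) {
//     // kv.first is the flag name; kv.second holds its exported metadata.
//   }
//
// The function-local static in GetMutableExportedFlagInfoMap() is a Meyers
// singleton: it is lazily constructed and thread-safe under C++11 and later.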

PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism,
                             0,
                             "number of threads for inner op");

/**
 * NOTE(paddle-dev): This file is designed to define all public FLAGS.
 */

/**
 * Paddle initialization related FLAG
 * Name: FLAGS_paddle_num_threads
 * Since Version: 0.15.0
 * Value Range: int32, default=1
 * Example: FLAGS_paddle_num_threads=2, set the maximum thread number per
 * instance to 2
 * Note:
 */
PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads,
                             1,
                             "Number of threads for each paddle instance.");

/**
 * Operator related FLAG
 * Name: FLAGS_check_nan_inf
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: Used to debug. Checks whether operators produce NaN/Inf values.
 */
PADDLE_DEFINE_EXPORTED_bool(
    check_nan_inf,
    false,
    "Checking whether operator produce NAN/INF or not. It will be "
    "extremely slow so please use this flag wisely.");

/**
 * Operator related FLAG
 * Name: FLAGS_enable_opt_get_features
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note:
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_opt_get_features,
    false,
    "Whether to enable the optimized path for getting features.");

// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_ASCEND_CL)

/**
 * CUDA related FLAG
 * Name: FLAGS_enable_cublas_tensor_op_math
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use Tensor Cores; faster, but it may lose precision.
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_cublas_tensor_op_math,
    false,
    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
    "but it may loss precision. Currently, There are two CUDA libraries that"
    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
    " GEMM computations(the matrices must be either half precision or single "
    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
    "input and output must be half precision) and recurrent neural networks "
    "(RNNs).");

/**
 * CUDA related FLAG
 * Name: FLAGS_gemm_use_half_precision_compute_type
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example:
 * Note: whether to use fp16 compute type when the input and output are fp16;
 * faster, but it may lose precision.
 */
PADDLE_DEFINE_EXPORTED_bool(
    gemm_use_half_precision_compute_type,
    true,
    "Whether to use fp16 compute type when the input and output is fp16, "
    "faster but it may loss precision in most case. If true, the compute "
    "type will be set to fp32. Default is true.");

/**
 * CUDA related FLAG
 * Name: FLAGS_selected_gpus
 * Since Version: 1.3.0
 * Value Range: integer list separated by comma, default empty list
 * Example: FLAGS_selected_gpus=0,1,2,3,4,5,6,7 to train or predict with 0~7 gpu
 * cards
 * Note: A list of device ids separated by comma, like: 0,1,2,3
 */
PADDLE_DEFINE_EXPORTED_string(
    selected_gpus,
    "",
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (GPU). If you want to use "
    "all visible devices, set this to empty string. NOTE: the "
    "reason of doing this is that we want to use P2P communication"
    "between GPU devices, use CUDA_VISIBLE_DEVICES can only use"
    "share-memory only.");
#endif

#if defined(PADDLE_WITH_CUDA)
/**
 * CUDA related FLAG
 * Name: FLAGS_cublaslt_exhaustive_search_times
 * Since Version: 2.3.0
 * Value Range: int64_t, default=0
 * Example:
 * Note: Represents times of exhaustive search to evaluate performance of
 *       cuBlasLt matmul algorithm (with/without epilogue). Set this flag
 *       with value > 0 to enable exhaustive search. Default is 0, means
 *       getting algorithms via heuristic search. There are two search methods
 *       in cuBlasLt, heuristic search and exhaustive search. Exhaustive search
 *       attempts all cuBlasLt algorithms to select the fastest, which is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification. Once you change the layer specification
 *       (such as M, N and K), it will search again.
 */
PADDLE_DEFINE_EXPORTED_int64(
    cublaslt_exhaustive_search_times,
    0,
    "The times of exhaustive search for cuBlasLt matmul with/without "
    " epilogue algorithms, default is 0, means disabling exhaustive search.");
#endif

#if defined(PADDLE_WITH_ASCEND_CL)
PADDLE_DEFINE_EXPORTED_string(
    selected_npus,
    "",
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (NPU). If you want to use "
    "all visible devices, set this to empty string.");
PADDLE_DEFINE_EXPORTED_bool(
    hccl_check_nan,
    true,
    "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
    "core when meets Nan value");
PADDLE_DEFINE_EXPORTED_string(
    npu_config_path,
    "",
    "The absolute path of configuration json file, like: /tmp/config.json. "
    "If proveided, it will be passed to aclInit().");
PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling,
                             1,
                             "set minmum loss scaling value!");
PADDLE_DEFINE_EXPORTED_string(
    npu_precision_mode,
    "",
    "NPU operator precision mode, options are 'force_fp32', 'force_fp16', "
    "'allow_fp32_to_fp16', 'must_keep_origin_dtype' and "
    "'allow_mix_precision'. If you want to use the default mode ("
    "allow_fp32_to_fp16), set this to empty string. For more details, "
    "please refer to the documents");
#endif

/*
 * Kernel related FLAG
 * Name: FLAGS_enable_api_kernel_fallback
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example: FLAGS_enable_api_kernel_fallback=true would allow the kernel of
 * the current backend to fall back to the CPU kernel when not found
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_api_kernel_fallback,
    true,
    "Whether enable api kernel fallback to CPU one when not found");

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_deterministic
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use deterministic algorithm in cudnn.
 *       If true, it will slow down some operators such as conv and pooling.
 */
PADDLE_DEFINE_EXPORTED_bool(
    cudnn_deterministic,
    false,
    "Whether allow using an autotuning algorithm for convolution "
    "operator. The autotuning algorithm may be non-deterministic. If "
    "true, the algorithm is deterministic.");

/**
 * CUDNN related FLAG
 * Name: FLAGS_conv_workspace_size_limit
 * Since Version: 0.13.0
 * Value Range: uint64, default=512 (MB)
 * Example:
 * Note: The internal function of cuDNN obtains the fastest matching algorithm
 *       within this memory limit. Usually, faster algorithms can be chosen in
 *       larger workspaces, but memory space can also be significantly
 *       increased. Users need to balance memory and speed.
 */
PADDLE_DEFINE_EXPORTED_int64(conv_workspace_size_limit,
                             paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
                             "cuDNN convolution workspace limit in MB unit.");

/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Represents whether an exhaustive search method is used to
 *       select a convolution algorithm. There are two search methods in cuDNN,
 *       heuristic search and exhaustive search. Exhaustive search attempts
 *       all cuDNN algorithms to select the fastest. This method is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification. Once you change the layer specifications
 *       (such as batch size, feature map size), it will search again.
 */
PADDLE_DEFINE_EXPORTED_bool(
    cudnn_exhaustive_search,
    false,
    "Whether enable exhaustive search for cuDNN convolution or "
    "not, default is False.");

/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search_times
 * Since Version:
 * Value Range:
 * Example:
 * Note: only used in inference, for advanced developers
 */
PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times,
                             -1,
                             "Exhaustive search times for cuDNN convolution, "
                             "default is -1, not exhaustive search");

/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_batchnorm_spatial_persistent
 * Since Version: 1.4.0
 * Value Range: bool, default=false
 * Example:
 * Note: CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode in batchnorm. This mode can
 *       be faster in some tasks because an optimized path may be selected
 *       for CUDNN_DATA_FLOAT and CUDNN_DATA_HALF data types on compute
 *       capability 6.0 or higher. The reason we set it to false by default
 *       is that this mode may use a scaled atomic integer reduction that may
 *       cause numerical overflow for certain input data ranges.
 */
PADDLE_DEFINE_EXPORTED_bool(
    cudnn_batchnorm_spatial_persistent,
    false,
    "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
    "batch_norm, default is False.");
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

/**
 * NCCL related FLAG
 * Name: FLAGS_sync_nccl_allreduce
 * Since Version: 1.3
 * Value Range: bool, default=true
 * Example:
 * Note: asynchronous nccl allreduce or synchronous issue:
 *       https://github.com/PaddlePaddle/Paddle/issues/15049
 *       If you want to change this default value, why? (gongwb)
 */
PADDLE_DEFINE_EXPORTED_bool(
    sync_nccl_allreduce,
    true,
    "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
    "after allreduce, this mode can get better performance in some scenarios.");
#endif

#ifdef PADDLE_WITH_DISTRIBUTE
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_max_merge_var_num
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: The maximum number of gradients to be merged into a gradient and
 *       sent through the communicator. The trainer puts all the gradients
 *       into the queue, and then the communicator takes the gradients out
 *       of the queue and sends them after merging.
 */
PADDLE_DEFINE_EXPORTED_int32(communicator_max_merge_var_num,
                             20,
                             "max var num to merge and send");
PADDLE_DEFINE_EXPORTED_bool(
    communicator_is_sgd_optimizer,
    true,
    "gradient sent to the server is the sum of the gradients "
    "calculated by each thread if optimizer is sgd");
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_send_queue_size
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: Size for each gradient queue. The trainer puts the gradient into
 *       the queue, and then the communicator takes it out of the queue and
 *       sends it out. When the communicator is slow, the queue may be full,
 *       and the trainer will be continuously blocked before the queue has
 *       space. It is used to avoid training running much faster than
 *       communication, so that too many gradients do not pile up unsent.
 */
PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size,
                             20,
                             "queue size to recv gradient before send");
#endif

/**
 * Distributed related FLAG
 * Name: FLAGS_dist_threadpool_size
 * Since Version: 1.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: Control the number of threads used for distributed modules.
 *       If it is not set, the number of hardware threads is used.
 */
PADDLE_DEFINE_EXPORTED_int32(
    dist_threadpool_size,
    0,
    "number of threads used for distributed executed.");

/**
 * Garbage collector related FLAG
 * Name: FLAGS_eager_delete_tensor_gb
 * Since Version: 1.0.0
 * Value Range: double, default=kDefaultEagerDeleteTensorGB
 * Example: FLAGS_eager_delete_tensor_gb=0.0 releases memory garbage once it
 *          is no longer used;
 *          FLAGS_eager_delete_tensor_gb=1.0 releases memory garbage once it
 *          occupies 1.0 GB of memory;
 *          FLAGS_eager_delete_tensor_gb=-1.0 disables the garbage collection
 *          policy.
 * Note: Represents whether a garbage collection strategy is used to optimize
 *       network memory usage. It is recommended that users set
 *       FLAGS_eager_delete_tensor_gb=0.0 to enable the garbage collection
 *       strategy when training large networks.
 */
// Disable gc by default when inference library is built
#ifdef PADDLE_ON_INFERENCE
static const double kDefaultEagerDeleteTensorGB = -1;
#else
static const double kDefaultEagerDeleteTensorGB = 0;
#endif

PADDLE_DEFINE_EXPORTED_double(
    eager_delete_tensor_gb,
    kDefaultEagerDeleteTensorGB,
    "Memory size threshold (GB) when the garbage collector clear tensors."
    "Disabled when this value is less than 0");

/**
 * Memory related FLAG
 * Name: FLAGS_fast_eager_deletion_mode
 * Since Version: 1.3.0
 * Value Range: bool, default=true
 * Example:
 * Note: Whether to use fast garbage collection strategy.
 *       If not set, the GPU memory is released at the end of the CUDA kernel.
 *       Otherwise, the GPU memory will be released before the CUDA kernel
 *       has finished, which will make the garbage collection strategy faster.
 *       Only works when garbage collection strategy is enabled.
 */
PADDLE_DEFINE_EXPORTED_bool(
    fast_eager_deletion_mode,
    true,
    "Fast eager deletion mode. If enabled, memory would release "
    "immediately without waiting GPU kernel ends.");

/**
 * Memory related FLAG
 * Name: FLAGS_memory_fraction_of_eager_deletion
 * Since Version: 1.4
 * Value Range: double [0.0, 1.0], default=1.0
 * Example:
 * Note: The percentage of memory size of garbage collection policy
 *       to release variables.
 *       If FLAGS_memory_fraction_of_eager_deletion = 1.0,
 *       all temporary variables in the network will be released.
 *       If FLAGS_memory_fraction_of_eager_deletion = 0.0,
 *       no temporary variables in the network are released.
 *       If 0.0 < FLAGS_memory_fraction_of_eager_deletion < 1.0,
 *       all temporary variables will be sorted in descending order
 *       according to their memory size, and only variables with the
 *       largest FLAGS_memory_fraction_of_eager_deletion ratio will be released.
 *       The flag is only valid when running parallel data compilers.
 */
PADDLE_DEFINE_EXPORTED_double(
    memory_fraction_of_eager_deletion,
    1.0,
    "Fraction of eager deletion. If less than 1.0, all variables in "
    "the program would be sorted according to its memory size, and "
    "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
    "variables would be deleted.");

/**
 * Allocator related FLAG
 * Name: FLAGS_allocator_strategy
 * Since Version: 1.2
 * Value Range: string, {naive_best_fit, auto_growth, thread_local},
 * default=auto_growth
 * Example:
 * Note: For selecting allocator policy of PaddlePaddle.
 */
static constexpr char kDefaultAllocatorStrategy[] = "auto_growth";
PADDLE_DEFINE_EXPORTED_string(
    allocator_strategy,
    kDefaultAllocatorStrategy,
    "The allocation strategy, enum in [naive_best_fit, auto_growth]. "
    "naive_best_fit means the original pre-allocated allocator of Paddle. "
    "auto_growth means the auto-growth allocator. "
    "These two strategies differ in GPU memory allocation. "
    "naive_best_fit strategy would occupy almost all GPU memory by default, "
    "which prevents users from starting several Paddle jobs on the same GPU "
    "card but leads to less memory fragmentation (i.e., maximum batch "
    "size of models may be larger). auto_growth strategy would allocate "
    "GPU memory on demand, which allows users to start several Paddle jobs "
    "on the same GPU card but may lead to more memory fragmentation "
    "(i.e., maximum batch size of models may be smaller).");

/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cpu_memory_to_use
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=1
 * Example:
 * Note: Represents the proportion of allocated CPU memory blocks
 *       to the total memory size of the CPU. Future CPU memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough CPU memory, new memory blocks of the same
 *       size as the memory block will be allocated from the CPU
 *       until the CPU does not have enough memory.
 */
PADDLE_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use,
                              1,
                              "Default use 100% of CPU memory for PaddlePaddle,"
                              "reserve the rest for page tables, etc");

/**
 * Memory related FLAG
 * Name: FLAGS_initial_cpu_memory_in_mb
 * Since Version: 0.14.0
 * Value Range: uint64, default=500 (MB)
 * Example:
 * Note: The CPU memory block size of the initial allocator in MB.
 *       The allocator takes the minimum values of
 *       FLAGS_initial_cpu_memory_in_mb and
 *       FLAGS_fraction_of_cpu_memory_to_use*(total physical memory)
 *       as memory block sizes.
 */
PADDLE_DEFINE_EXPORTED_uint64(
    initial_cpu_memory_in_mb,
    500ul,
    "Initial CPU memory for PaddlePaddle, in MD unit.");

/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cuda_pinned_memory_to_use
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=0.5
 * Example:
 * Note: Represents the proportion of allocated CUDA pinned memory blocks
 *       to the total memory size of the CPU. Future CUDA pinned memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough CPU memory, new memory blocks of the same
 *       size as the memory block will be allocated from the CPU
 *       until the CPU does not have enough memory.
 */
PADDLE_DEFINE_EXPORTED_double(
    fraction_of_cuda_pinned_memory_to_use,
    0.5,
    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
    "reserve the rest for page tables, etc");

// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ||      \
    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \
    defined(PADDLE_WITH_CUSTOM_DEVICE)

/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_gpu_memory_to_use
 * Since Version: 1.2.0
 * Value Range: double, default=0.5 if win32, 0.92 else
 * Example:
 * Note: Represents the proportion of allocated memory blocks to the total
 *       memory size of the GPU. Future memory usage will be allocated from
 *       this memory block. If the memory block does not have enough GPU
 *       memory, new memory blocks of the same size as the memory block will
 *       be allocated from the GPU until the GPU does not have enough memory.
 */

#ifndef _WIN32
constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
#else
// fraction_of_gpu_memory_to_use cannot be too high on windows,
// since the win32 graphic sub-system can occupy some GPU memory
// which may lead to insufficient memory left for paddle
constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
#endif
PADDLE_DEFINE_EXPORTED_double(
    fraction_of_gpu_memory_to_use,
    fraction_of_gpu_memory_to_use,
    "Allocate a trunk of gpu memory that is this fraction of the "
    "total gpu memory size. Future memory usage will be allocated "
    "from the trunk. If the trunk doesn't have enough gpu memory, "
    "additional trunks of the same size will be requested from gpu "
    "until the gpu has no memory left for another trunk.");

/**
 * Memory related FLAG
 * Name: FLAGS_initial_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: Allocate a specified size of GPU memory block. Later memory usage
 *       will be allocated from that memory block. If the memory block does not
 *       have enough GPU memory, the memory block with the size
 *       FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until
 *       the GPU has no remaining memory.
 */
PADDLE_DEFINE_EXPORTED_uint64(
    initial_gpu_memory_in_mb,
    0ul,
    "Allocate a trunk of gpu memory whose byte size is specified by "
    "the flag. Future memory usage will be allocated from the "
    "trunk. If the trunk doesn't have enough gpu memory, additional "
    "trunks of the gpu memory will be requested from gpu with size "
    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
    "no memory left for the additional trunk. Note: if you set this "
    "flag, the memory size set by "
    "FLAGS_fraction_of_gpu_memory_to_use will be overrided by this "
    "flag. If you don't set this flag, PaddlePaddle will use "
    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");

/**
 * Memory related FLAG
 * Name: FLAGS_reallocate_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: If the allocated GPU memory blocks are exhausted,
 *       additional GPU memory blocks are reallocated
 */
PADDLE_DEFINE_EXPORTED_uint64(
    reallocate_gpu_memory_in_mb,
    0ul,
    "If this flag is set, Paddle will reallocate the gpu memory with "
    "size specified by this flag. Else Paddle will reallocate by "
    "FLAGS_fraction_of_gpu_memory_to_use");

PADDLE_DEFINE_EXPORTED_uint64(
    gpu_memory_limit_mb,
    0UL,
    "The maximum gpu memory limit that the process can allocate. "
    "If it is equal to 0, there would be no limit and all gpu memory "
    "would be available to the process. If it is larger than 0, "
    "the process would raise out of memory error if the allocated "
    "memory exceeds the limit even though there is available "
    "memory on the gpu card. The unit is MB and default value is 0.");

#endif

/**
 * Scope related FLAG
 * Name: local_exe_sub_scope_limit
 * Since Version: 1.6.0
 * Value Range: double, default=256 (MB)
 * Example:
 * Note:
 */
PADDLE_DEFINE_EXPORTED_double(
    local_exe_sub_scope_limit,
    256.0,  // MBytes
    "The memory up limit of sub-scopes of local execution scope for "
    "each CUDAPlace. If you don't need to limit the memory, "
    "you should set FLAGS_local_exe_sub_scope_limit=-1. "
    "The default value is 256 MBytes.");

PADDLE_DEFINE_EXPORTED_bool(
    reader_queue_speed_test_mode,
    false,
    "If set true, the queue.pop will only get data from queue but not "
    "remove the data from queue for speed testing");

/**
 * MKLDNN related FLAG
 * Name: use_mkldnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note:
 */
PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");

/**
 * Debug related FLAG
 * Name: FLAGS_call_stack_level
 * Since Version: 2.0.0
 * Value Range: int, default=2
 * Example:
 * Note: Used to debug. Determine the call stack to print when an error or
 * exception happens.
 * If FLAGS_call_stack_level == 0, only the error message summary will be shown.
 * If FLAGS_call_stack_level == 1, the python stack and error message summary
 * will be shown.
 * If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
 * message summary will be shown.
 */
#ifdef PADDLE_ON_INFERENCE
static const int32_t kDefaultCallStackLevel = 2;
#else
static const int32_t kDefaultCallStackLevel = 1;
#endif

PADDLE_DEFINE_EXPORTED_int32(
    call_stack_level,
    kDefaultCallStackLevel,
    "Determine the call stack to print when error or exeception happens."
    // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
    // "If FLAGS_call_stack_level == 0, only the error message summary will be "
    // "shown. "
    "If FLAGS_call_stack_level == 1, the python stack and error message "
    "summary will be shown."
    "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
    "error message summary will be shown.");

/**
 * Debug related FLAG
 * Name: sort_sum_gradient
 * Since Version: 2.0.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, gradients are summed by the reverse order of
 * the forward execution sequence.
 */
PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient,
                            false,
                            "Sum gradients by the reverse order of "
                            "the forward execution sequence.");

/**
 * Performance related FLAG
 * Name: max_inplace_grad_add
 * Since Version: 2.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: The maximum number of inplace grad_add.
 */
PADDLE_DEFINE_EXPORTED_int32(
    max_inplace_grad_add,
    0,
    "The maximum number of inplace grad_add. When doing "
    "gradient accumulation, if the number of gradients need to that "
    "less FLAGS_max_inplace_grad_add, than it will be use several grad_add"
    "instead of sum. Default is 0.");

/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_on
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be enabled.
 */
PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on,
                              "",
                              "List of OneDNN operation types to be turned on");

/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_off
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be disabled.
 */
PADDLE_DEFINE_EXPORTED_string(
    tracer_mkldnn_ops_off,
    "",
    "List of OneDNN operation types to be turned off");

/**
 * Debug related FLAG
 * Name: check_kernel_launch
 * Since Version: 2.1.0
 * Value Range: bool, default=false
 * Example:
 * Note: Check kernel launch status after every kernel compute.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_DEFINE_EXPORTED_bool(
    check_kernel_launch,
    false,
    "Check kernel launch status after every kernel compute");
#endif

/**
 * CUDNN related FLAG
 * Name: conv2d_disable_cudnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: Disable cudnn in conv2d.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_DEFINE_EXPORTED_bool(conv2d_disable_cudnn,
                            false,
                            "Disable cudnn in conv2d");

PADDLE_DEFINE_EXPORTED_bool(use_fast_math,
                            false,
                            "Whether to use fast math GPU functions.");
#endif

/**
 * Distributed related FLAG
 * Name: FLAGS_get_host_by_name_time
 * Since Version: 2.2.0
 * Value Range: int32, default=120
 * Example:
 * Note: Get host by name time.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) ||      \
    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_MLU)
PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time,
                             120,
                             "The maximum time for get host by name time");
#endif

/**
 * Distributed related FLAG
 * Name: FLAGS_apply_pass_to_program
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_apply_pass_to_program=true would apply IR Pass to
 *          program when using Fleet APIs.
 * Note: Apply IR pass to program. Only useful when using Fleet APIs.
 */
PADDLE_DEFINE_EXPORTED_bool(
    apply_pass_to_program,
    false,
    "It controls whether to apply IR pass to program when using Fleet APIs");

/**
 * Distributed related FLAG
 * Name: FLAGS_graph_load_in_parallel
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Control whether to load graph nodes and edges with multiple threads
 *       in parallel. If it is not set, graph data is loaded with one thread.
 */
PADDLE_DEFINE_EXPORTED_bool(graph_load_in_parallel,
                            false,
                            "It controls whether load graph node and edge with "
                            "mutli threads parallely.");

/**
 * Distributed related FLAG
 * Name: FLAGS_graph_get_neighbor_id
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Control whether to get all neighbor ids when running a sub-part
 *       graph. If it is not set, neighbor ids are not fetched when running
 *       the full graph.
 */
PADDLE_DEFINE_EXPORTED_bool(
    graph_get_neighbor_id,
    false,
    "It controls get all neighbor id when running sub part graph.");

/**
 * KP kernel related FLAG
 * Name: FLAGS_run_kp_kernel
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example: FLAGS_run_kp_kernel=true would use the kp kernel to compute in the
 * Op.
 * Note:
 */
PADDLE_DEFINE_EXPORTED_bool(run_kp_kernel,
                            false,
                            "It controls whether to run PaddlePaddle using KP");

/**
 * Distributed related FLAG
 * Name: FLAGS_allreduce_record_one_event
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_allreduce_record_one_event=true makes the allreduce
 *          operations wait for only one event instead of multiple events.
 * Note: Make the allreduce operations wait for only one event instead of
 *       multiple events. Currently, only fused allreduce supports this.
 *       Otherwise, the precision may be wrong.
 */
PADDLE_DEFINE_EXPORTED_bool(allreduce_record_one_event,
                            false,
                            "It controls whether the allreduce operations "
                            "would only wait one event instead of multiple "
                            "events. Currently, only fuse allreduce supports "
                            "this. Otherwise, the precision may be wrong.");

#ifdef PADDLE_WITH_CINN
/*
 * CINN related FLAG
 * Name: FLAGS_use_cinn
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_use_cinn=true would run PaddlePaddle using CINN
 */
PADDLE_DEFINE_EXPORTED_bool(
    use_cinn, false, "It controls whether to run PaddlePaddle using CINN");

/*
 * CINN related FLAG
 * Name: FLAGS_allow_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_allow_cinn_ops="mul;relu" would only cover `mul` and `relu`
 * when using CINN
 */
PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops,
                              "",
                              "It controls the cinn op subset to be used, "
                              "which has the highest priority.");

/*
 * CINN related FLAG
 * Name: FLAGS_deny_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_deny_cinn_ops="mul;relu" would block `mul` and `relu` two ops
 * when using CINN
 */
PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops,
                              "",
                              "It controls the cinn op subset to be not used.");

/*
 * CINN related FLAG
 * Name: FLAGS_enable_pe_launch_cinn
 * Since Version: 2.3
 * Value Range: bool, default=true
 * Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled
 * instructions of a paddle graph with ParallelExecutor, otherwise with the
 * CINN compiled runtime program in sequential order.
 */
PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn,
                            true,
                            "It controls whether to execute cinn compiled "
                            "program with ParallelExecutor");

/*
 * CINN related FLAG
 * Name: FLAGS_enable_cinn_auto_tune
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its
 * auto-tune feature enabled
 */
PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune,
                            false,
                            "It controls whether to use cinn with "
                            "its auto-tune feature enabled");

#endif

DEFINE_int32(record_pool_max_size,
             2000000,
             "SlotRecordDataset slot record pool max size");
DEFINE_int32(slotpool_thread_num, 1, "SlotRecordDataset slot pool thread num");
DEFINE_bool(enable_slotpool_wait_release,
            false,
            "enable slotrecord obejct wait release, default false");
DEFINE_bool(enable_slotrecord_reset_shrink,
            false,
            "enable slotrecord obejct reset shrink memory, default false");
DEFINE_bool(enable_ins_parser_file,
            false,
            "enable parser ins file, default false");
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_hbm_table_collision_stat,
    false,
    "enable hash collisions stat for hbm table, default false");
PADDLE_DEFINE_EXPORTED_double(gpugraph_hbm_table_load_factor,
                              0.75,
                              "the load factor of hbm table, default 0.75");
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_gpu_direct_access,
    false,
    "enable direct access bwtween multi gpu cards, default false");
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_segment_merge_grads,
    false,
    "enable segment merge gradients while push sparse, default false");
PADDLE_DEFINE_EXPORTED_uint64(
    gpugraph_merge_grads_segment_size,
    128,
    "segment size with segment gradient merge, default 128");
PADDLE_DEFINE_EXPORTED_int32(
    gpugraph_dedup_pull_push_mode,
    0,
    "enable dedup keys while pull push sparse, default 0");
PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm,
                            true,
                            "enable load_node_list_into_hbm, default true");

/**
 * ProcessGroupNCCL related FLAG
 * Name: nccl_blocking_wait
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: nccl blocking wait.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait");
#endif

/**
 * Autotune related FLAG
 * Name: FLAGS_use_autotune
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
 */
PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether to enable autotune.");

/**
 * Performance related FLAG
 * Name: einsum_opt
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
 * Note: If true, EinsumOp will be optimized by inner-cache reuse, which
 * uses more gpu memory.
 */
PADDLE_DEFINE_EXPORTED_bool(
    einsum_opt,
    false,
    "EinsumOp backward will be speedup at the expense of more gpu memory.");

/**
 * JitLayer related FLAG
 * Name: FLAGS_jit_engine_type
 * Since Version: 2.3.0
 * Value Range: string, {Executor, PE},
 * default=PE
 * Example:
 * Note:
 * FLAGS_jit_engine_type == Executor, using ExecutorFunction by default
 * FLAGS_jit_engine_type == PE, using PEFunction by default
 */
PADDLE_DEFINE_EXPORTED_string(jit_engine_type,
                              "PE",
                              "Choose default funciton type in JitLayer.");