flags.cc 35.9 KB
Newer Older
1
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2
// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
3 4 5 6 7 8 9 10 11 12 13 14 15
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

Z
Zeng Jinle 已提交
16
#include "paddle/fluid/platform/flags.h"
17
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
18 19 20
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#endif

Z
Zeng Jinle 已提交
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
namespace paddle {
namespace platform {

const ExportedFlagInfoMap &GetExportedFlagInfoMap() {
  return *GetMutableExportedFlagInfoMap();
}

ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() {
  static ExportedFlagInfoMap g_exported_flag_info_map;
  return &g_exported_flag_info_map;
}

}  // namespace platform
}  // namespace paddle

36 37
PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism,
                             0,
38 39
                             "number of threads for inner op");

40 41 42 43
/**
 * NOTE(paddle-dev): This file is designed to define all public FLAGS.
 */

44 45 46 47 48 49 50 51 52
/**
 * Paddle initialization related FLAG
 * Name: FLAGS_paddle_num_threads
 * Since Version: 0.15.0
 * Value Range: int32, default=1
 * Example: FLAGS_paddle_num_threads=2, set the maximum thread number per
 * instance to 2
 * Note:
 */
53 54
PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads,
                             1,
Z
Zeng Jinle 已提交
55
                             "Number of threads for each paddle instance.");
56

57 58 59 60 61 62 63 64
/**
 * Operator related FLAG
 * Name: FLAGS_check_nan_inf
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: Used to debug. Checking whether operator produce NAN/INF or not.
 */
Z
Zeng Jinle 已提交
65
PADDLE_DEFINE_EXPORTED_bool(
66 67
    check_nan_inf,
    false,
Z
Zeng Jinle 已提交
68 69
    "Checking whether operator produce NAN/INF or not. It will be "
    "extremely slow so please use this flag wisely.");
70

D
danleifeng 已提交
71 72 73 74 75 76 77 78 79 80 81 82 83 84
/**
 * Operator related FLAG
 * Name: FLAGS_check_nan_inf
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: Used to debug. Checking whether operator produce NAN/INF or not.
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_opt_get_features,
    false,
    "Checking whether operator produce NAN/INF or not. It will be "
    "extremely slow so please use this flag wisely.");

85 86 87 88
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_ASCEND_CL)
89 90 91 92 93 94 95 96 97

/**
 * CUDA related related FLAG
 * Name: FLAGS_enable_cublas_tensor_op_math
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use Tensor Core, faster but it may loss precision.
 */
Z
Zeng Jinle 已提交
98
PADDLE_DEFINE_EXPORTED_bool(
99 100
    enable_cublas_tensor_op_math,
    false,
101 102 103 104 105 106 107 108
    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
    "but it may loss precision. Currently, There are two CUDA libraries that"
    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
    " GEMM computations(the matrices must be either half precision or single "
    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
    "input and output must be half precision) and recurrent neural networks "
    "(RNNs).");

109 110 111 112 113 114 115 116 117 118
/**
 * CUDA related related FLAG
 * Name: FLAGS_gemm_use_half_precision_compute_type
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example:
 * Note: whether to use fp16 compute type when the input and output is fp16,
 * faster but it may loss precision.
 */
PADDLE_DEFINE_EXPORTED_bool(
119 120
    gemm_use_half_precision_compute_type,
    true,
121 122 123 124
    "Whether to use fp16 compute type when the input and output is fp16, "
    "faster but it may loss precision in most case. If true, the compute "
    "type will be set to fp32. Default is true.");

125 126 127 128 129 130 131 132 133
/**
 * CUDA related FLAG
 * Name: FLAGS_selected_gpus
 * Since Version: 1.3.0
 * Value Range: integer list separated by comma, default empty list
 * Example: FLAGS_selected_gpus=0,1,2,3,4,5,6,7 to train or predict with 0~7 gpu
 * cards
 * Note: A list of device ids separated by comma, like: 0,1,2,3
 */
Z
Zeng Jinle 已提交
134
PADDLE_DEFINE_EXPORTED_string(
135 136
    selected_gpus,
    "",
Z
Zeng Jinle 已提交
137 138 139 140 141 142 143
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (GPU). If you want to use "
    "all visible devices, set this to empty string. NOTE: the "
    "reason of doing this is that we want to use P2P communication"
    "between GPU devices, use CUDA_VISIBLE_DEVICES can only use"
    "share-memory only.");
144 145
#endif

146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
#if defined(PADDLE_WITH_CUDA)
/**
 * CUDA related FLAG
 * Name: FLAGS_cublaslt_exhaustive_search_times
 * Since Version: 2.3.0
 * Value Range: int64_t, default=0
 * Example:
 * Note: Represents times of exhaustive search to evaluate performance of
 *       cuBlasLt matmul algorithm (with/without epilogue). Set this flag
 *       with value > 0 to enable exhaustive search. Default is 0, means
 *       getting algorithms via heuristic search. There are two search methods
 *       in cuBlasLt, heuristic search and exhaustive search. Exhaustive search
 *       attempts all cuBlasLt algorithms to select the fastest, which is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification Once you change the layer specifications
 *       (such as M, N and K), it will re-search again.
 */
PADDLE_DEFINE_EXPORTED_int64(
164 165
    cublaslt_exhaustive_search_times,
    0,
166 167 168 169
    "The times of exhaustive search for cuBlasLt matmul with/without "
    " epilogue algorithms, default is 0, means disabling exhaustive search.");
#endif

170
#if defined(PADDLE_WITH_ASCEND_CL)
Z
Zeng Jinle 已提交
171
PADDLE_DEFINE_EXPORTED_string(
172 173
    selected_npus,
    "",
Z
Zeng Jinle 已提交
174 175 176 177 178
    "A list of device ids separated by comma, like: 0,1,2,3. "
    "This option is useful when doing multi process training and "
    "each process have only one device (NPU). If you want to use "
    "all visible devices, set this to empty string.");
PADDLE_DEFINE_EXPORTED_bool(
179 180
    hccl_check_nan,
    true,
Z
Zeng Jinle 已提交
181 182 183
    "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
    "core when meets Nan value");
PADDLE_DEFINE_EXPORTED_string(
184 185
    npu_config_path,
    "",
186 187
    "The absolute path of configuration json file, like: /tmp/config.json. "
    "If proveided, it will be passed to aclInit().");
188 189
PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling,
                             1,
Z
Zeng Jinle 已提交
190
                             "set minmum loss scaling value!");
A
Aganlengzi 已提交
191
PADDLE_DEFINE_EXPORTED_string(
192 193
    npu_precision_mode,
    "",
A
Aganlengzi 已提交
194 195 196 197 198
    "NPU operator precision mode, options are 'force_fp32', 'force_fp16', "
    "'allow_fp32_to_fp16', 'must_keep_origin_dtype' and "
    "'allow_mix_precision'. If you want to use the default mode ("
    "allow_fp32_to_fp16), set this to empty string. For more details, "
    "please refer to the documents");
199
#endif
200

201 202 203 204 205 206 207 208 209 210 211 212 213
/*
 * Kernel related FLAG
 * Name: FLAGS_enable_api_kernel_fallback
 * Since Version: 2.4
 * Value Range: bool, default=true
 * Example: FLAGS_enable_api_kernel_fallback=true would allow kernel of current
 * backend fallback to CPU one when not found
 */
PADDLE_DEFINE_EXPORTED_bool(
    enable_api_kernel_fallback,
    true,
    "Whether enable api kernel fallback to CPU one when not found");

214
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
215 216 217 218 219 220 221 222 223
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_deterministic
 * Since Version: 0.13.0
 * Value Range: bool, default=false
 * Example:
 * Note: whether to use deterministic algorithm in cudnn.
 *       If true, it will slow down some operators such as conv and pooling.
 */
Z
Zeng Jinle 已提交
224
PADDLE_DEFINE_EXPORTED_bool(
225 226
    cudnn_deterministic,
    false,
Z
Zeng Jinle 已提交
227 228 229
    "Whether allow using an autotuning algorithm for convolution "
    "operator. The autotuning algorithm may be non-deterministic. If "
    "true, the algorithm is deterministic.");
230

231 232 233 234
/**
 * CUDNN related FLAG
 * Name: FLAGS_conv_workspace_size_limit
 * Since Version: 0.13.0
235
 * Value Range: uint64, default=512 (MB)
236 237 238 239 240 241 242
 * Example:
 * Note: The internal function of cuDNN obtains the fastest matching algorithm
 *       within this memory limit. Usually, faster algorithms can be chosen in
 *       larger workspaces, but memory space can also be significantly
 * increased.
 *       Users need to balance memory and speed.
 */
243 244 245
PADDLE_DEFINE_EXPORTED_int64(conv_workspace_size_limit,
                             paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
                             "cuDNN convolution workspace limit in MB unit.");
246

247 248 249 250 251 252 253 254 255 256 257 258 259 260
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search
 * Since Version: 1.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Represents whether an exhaustive search method is used to
 *       select a convolution algorithm. There are two search methods in cuDNN,
 *       heuristic search and exhaustive search. Exhaustive search attempts
 *       all cuDNN algorithms to select the fastest. This method is very
 *       time-consuming, and the selected algorithm will be cached for a given
 *       layer specification. Once you change the layer specifications
 *       (such as batch size, feature map size), it will search again.
 */
Z
Zeng Jinle 已提交
261
PADDLE_DEFINE_EXPORTED_bool(
262 263
    cudnn_exhaustive_search,
    false,
Z
Zeng Jinle 已提交
264 265
    "Whether enable exhaustive search for cuDNN convolution or "
    "not, default is False.");
266

267 268 269 270 271 272 273 274
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_exhaustive_search_times
 * Since Version:
 * Value Range:
 * Example:
 * Note: only used to predict for advanced developer
 */
275 276
PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times,
                             -1,
Z
Zeng Jinle 已提交
277 278
                             "Exhaustive search times for cuDNN convolution, "
                             "default is -1, not exhaustive search");
279

280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295
/**
 * CUDNN related FLAG
 * Name: FLAGS_cudnn_batchnorm_spatial_persistent
 * Since Version: 1.4.0
 * Value Range: bool, default=false
 * Example:
 * Note: CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be
 * faster in
 *       some tasks because an optimized path may be selected for
 * CUDNN_DATA_FLOAT
 *       and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The
 *       reason we set it to false by default is that this mode may use scaled
 *       atomic integer reduction that may cause a numerical overflow for
 * certain
 *       input data range.
 */
Z
Zeng Jinle 已提交
296
PADDLE_DEFINE_EXPORTED_bool(
297 298
    cudnn_batchnorm_spatial_persistent,
    false,
Z
Zeng Jinle 已提交
299 300
    "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
    "batch_norm, default is False.");
301 302
#endif

303
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
304 305 306

/**
 * NCCL related FLAG
307 308 309
 * Name: FLAGS_sync_nccl_allreduce
 * Since Version: 1.3
 * Value Range: bool, default=true
310 311 312 313 314
 * Example:
 * Note: asynchronous nccl allreduce or synchronous issue:
 *       https://github.com/PaddlePaddle/Paddle/issues/15049
 *       If you want to change this default value, why?(gongwb)
 */
Z
Zeng Jinle 已提交
315
PADDLE_DEFINE_EXPORTED_bool(
316 317
    sync_nccl_allreduce,
    true,
318 319 320 321 322
    "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
    "after allreduce, this mode can get better performance in some scenarios.");
#endif

#ifdef PADDLE_WITH_DISTRIBUTE
323 324 325 326 327 328 329 330 331 332 333
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_max_merge_var_num
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: The maximum number of gradients to be merged into a gradient and
 *       sent through the communicator. The trainer puts all the gradients
 *       into the queue, and then the communicator takes the gradients out
 *       of the queue and sends them after merging.
 */
334 335
PADDLE_DEFINE_EXPORTED_int32(communicator_max_merge_var_num,
                             20,
Z
Zeng Jinle 已提交
336 337
                             "max var num to merge and send");
PADDLE_DEFINE_EXPORTED_bool(
338 339
    communicator_is_sgd_optimizer,
    true,
Z
Zeng Jinle 已提交
340 341
    "gradient sent to the server is the sum of the gradients "
    "calculated by each thread if optimizer is sgd");
342 343 344 345 346 347 348 349 350 351 352 353 354
/**
 * Distributed related FLAG
 * Name: FLAGS_communicator_send_queue_size
 * Since Version: 1.5.0
 * Value Range: int32, default=20
 * Example:
 * Note: Size for each gradient queue. The trainer puts the gradient into
 *       the queue, and then the communicator takes it out of the queue and
 *       sends it out. When the communicator is slow, the queue may be full,
 *       and the trainer will be continuously blocked before the queue has
 *       space. It is used to avoid training much faster than communication,
 *       so that too many gradients are not sent out in time.
 */
355 356
PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size,
                             20,
Z
Zeng Jinle 已提交
357
                             "queue size to recv gradient before send");
358 359
#endif

360 361 362 363 364 365 366 367 368
/**
 * Distributed related FLAG
 * Name: FLAGS_dist_threadpool_size
 * Since Version: 1.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: Control the number of threads used for distributed modules.
 *       If it is not set, it is set to a hard thread.
 */
Z
Zeng Jinle 已提交
369
PADDLE_DEFINE_EXPORTED_int32(
370 371
    dist_threadpool_size,
    0,
Z
Zeng Jinle 已提交
372
    "number of threads used for distributed executed.");
373

374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389
/**
 * Garbage collector related FLAG
 * Name: FLAGS_eager_delete_tensor_gb
 * Since Version: 1.0.0
 * Value Range: double, default=kDefaultEagerDeleteTensorGB
 * Example: FLAGS_eager_delete_tensor_gb=0.0, Release memory garbage once it is
 * no longer used.
 *          FLAGS_eager_delete_tensor_gb=1.0, Release memory garbage when
 * garbage occupies 1.0GB of memory.
 *          FLAGS_eager_delete_tensor_gb=-1.0, Disable garbage collection
 * policy.
 * Note: Represents whether a garbage collection strategy is used to optimize
 * network memory usage.
 *       It is recommended that users set FLAGS_eager_delete_tensor_gb=0.0 to
 *       enable garbage collection strategy when training large networks.
 */
390 391 392
// Disable gc by default when inference library is built
static const double kDefaultEagerDeleteTensorGB = 0;

Z
Zeng Jinle 已提交
393
PADDLE_DEFINE_EXPORTED_double(
394 395
    eager_delete_tensor_gb,
    kDefaultEagerDeleteTensorGB,
396 397 398
    "Memory size threshold (GB) when the garbage collector clear tensors."
    "Disabled when this value is less than 0");

399 400 401 402 403 404 405 406 407 408 409 410
/**
 * Memory related FLAG
 * Name: FLAGS_fast_eager_deletion_mode
 * Since Version: 1.3.0
 * Value Range: bool, default=true
 * Example:
 * Note: Whether to use fast garbage collection strategy.
 *       If not set, the GPU memory is released at the end of the CUDA kernel.
 *       Otherwise, the GPU memory will be released before the CUDA kernel
 *       has finished, which will make the garbage collection strategy faster.
 *       Only works when garbage collection strategy is enabled.
 */
Z
Zeng Jinle 已提交
411
PADDLE_DEFINE_EXPORTED_bool(
412 413
    fast_eager_deletion_mode,
    true,
Z
Zeng Jinle 已提交
414 415
    "Fast eager deletion mode. If enabled, memory would release "
    "immediately without waiting GPU kernel ends.");
416

417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434
/**
 * Memory related FLAG
 * Name: FLAGS_memory_fraction_of_eager_deletion
 * Since Version: 1.4
 * Value Range: double [0.0, 1.0], default=1.0
 * Example:
 * Note: The percentage of memory size of garbage collection policy
 *       to release variables.
 *       If FLAGS_memory_fraction_of_eager_deletion = 1.0,
 *       all temporary variables in the network will be released.
 *       If FLAGS_memory_fraction_of_eager_deletion = 0.0,
 *       no temporary variables in the network are released.
 *       If 0.0 < FLAGS_memory_fraction_of_eager_deletion < 1.0,
 *       all temporary variables will be sorted in descending order
 *       according to their memory size, and only variables with the
 *       largest FLAGS_memory_fraction_of_eager_deletion ratio will be released.
 *       The flag is only valid when running parallel data compilers.
 */
Z
Zeng Jinle 已提交
435
PADDLE_DEFINE_EXPORTED_double(
436 437
    memory_fraction_of_eager_deletion,
    1.0,
Z
Zeng Jinle 已提交
438 439 440 441
    "Fraction of eager deletion. If less than 1.0, all variables in "
    "the program would be sorted according to its memory size, and "
    "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
    "variables would be deleted.");
442

443 444 445 446
/**
 * Allocator related FLAG
 * Name: FLAGS_allocator_strategy
 * Since Version: 1.2
447 448
 * Value Range: string, {naive_best_fit, auto_growth, thread_local},
 * default=auto_growth
449
 * Example:
450
 * Note: For selecting allocator policy of PaddlePaddle.
451
 */
452
static constexpr char kDefaultAllocatorStrategy[] = "auto_growth";
Z
Zeng Jinle 已提交
453
PADDLE_DEFINE_EXPORTED_string(
454 455
    allocator_strategy,
    kDefaultAllocatorStrategy,
456 457 458 459 460 461 462 463 464 465 466
    "The allocation strategy, enum in [naive_best_fit, auto_growth]. "
    "naive_best_fit means the original pre-allocated allocator of Paddle. "
    "auto_growth means the auto-growth allocator. "
    "These two strategies differ in GPU memory allocation. "
    "naive_best_fit strategy would occupy almost all GPU memory by default, "
    "which prevents users from starting several Paddle jobs on the same GPU "
    "card but leads to less memory fragmentation (i.e., maximum batch "
    "size of models may be larger). auto_growth strategy would allocate "
    "GPU memory on demand, which allows users to start several Paddle jobs "
    "on the same GPU card but may lead to more memory fragmentation "
    "(i.e., maximum batch size of models may be smaller).");
467

468 469 470
/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cpu_memory_to_use
471 472
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=1
473
 * Example:
474 475 476 477 478 479
 * Note: Represents the proportion of allocated CPU memory blocks
 *       to the total memory size of the CPU. Future CPU memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough CUDA pinned memory, new memory blocks of the same
 *       size as the memory block will be allocated from the CUDA pinned
 *       request util the CPU does not have enough memory.
480
 */
481 482
PADDLE_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use,
                              1,
Z
Zeng Jinle 已提交
483 484
                              "Default use 100% of CPU memory for PaddlePaddle,"
                              "reserve the rest for page tables, etc");
485 486 487 488 489 490 491 492 493 494 495 496 497

/**
 * Memory related FLAG
 * Name: FLAGS_initial_cpu_memory_in_mb
 * Since Version: 0.14.0
 * Value Range: uint64, default=500 (MB)
 * Example:
 * Note: The CPU memory block size of the initial allocator in MB.
 *       The allocator takes the minimum values of
 *       FLAGS_initial_cpu_memory_in_mb and
 *       FLAGS_fraction_of_cpu_memory_to_use*(total physical memory)
 *       as memory block sizes.
 */
Z
Zeng Jinle 已提交
498
PADDLE_DEFINE_EXPORTED_uint64(
499 500
    initial_cpu_memory_in_mb,
    500ul,
Z
Zeng Jinle 已提交
501
    "Initial CPU memory for PaddlePaddle, in MD unit.");
502

503 504 505
/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_cuda_pinned_memory_to_use
506 507
 * Since Version: 0.12.0
 * Value Range: double, [0.0, 1.0], default=0.5
508
 * Example:
509 510 511 512 513 514
 * Note: Represents the proportion of allocated CUDA pinned memory blocks
 *       to the total memory size of the CPU. Future CUDA pinned memory usage
 *       will be allocated from this memory block. If the memory block does
 *       not have enough CPU memory, new memory blocks of the same
 *       size as the memory block will be allocated from the CPU
 *       request util the CPU does not have enough memory.
515
 */
Z
Zeng Jinle 已提交
516
PADDLE_DEFINE_EXPORTED_double(
517 518
    fraction_of_cuda_pinned_memory_to_use,
    0.5,
519 520 521
    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
    "reserve the rest for page tables, etc");

522 523
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many
// flags.
524 525 526
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ||      \
    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \
    defined(PADDLE_WITH_CUSTOM_DEVICE)
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544

/**
 * Memory related FLAG
 * Name: FLAGS_fraction_of_gpu_memory_to_use
 * Since Version: 1.2.0
 * Value Range: double, default=0.5 if win32, 0.92 else
 * Example:
 * Note: Represents the proportion of allocated memory blocks to the total
 * memory size
 *       of the GPU. Future memory usage will be allocated from this memory
 * block.
 *       If the memory block does not have enough GPU memory, new memory blocks
 * of
 *       the same size as the memory block will be allocated from the GPU
 * request
 *       until the GPU does not have enough memory.
 */

545 546 547 548 549 550 551 552
#ifndef _WIN32
constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
#else
// fraction_of_gpu_memory_to_use cannot be too high on windows,
// since the win32 graphic sub-system can occupy some GPU memory
// which may lead to insufficient memory left for paddle
constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
#endif
Z
Zeng Jinle 已提交
553
PADDLE_DEFINE_EXPORTED_double(
554 555
    fraction_of_gpu_memory_to_use,
    fraction_of_gpu_memory_to_use,
Z
Zeng Jinle 已提交
556 557 558 559 560
    "Allocate a trunk of gpu memory that is this fraction of the "
    "total gpu memory size. Future memory usage will be allocated "
    "from the trunk. If the trunk doesn't have enough gpu memory, "
    "additional trunks of the same size will be requested from gpu "
    "until the gpu has no memory left for another trunk.");
561

562 563 564 565 566 567 568 569 570 571 572 573
/**
 * Memory related FLAG
 * Name: FLAGS_initial_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: Allocate a specified size of GPU memory block. Later memory usage
 *       will be allocated from that memory block. If the memory block does not
 *       have enough GPU memory, the memory block with the size
 *       FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until
 *       the GPU has no remaining memory.
 */
Z
Zeng Jinle 已提交
574
PADDLE_DEFINE_EXPORTED_uint64(
575 576
    initial_gpu_memory_in_mb,
    0ul,
577 578 579 580 581 582 583 584 585 586 587
    "Allocate a trunk of gpu memory whose byte size is specified by "
    "the flag. Future memory usage will be allocated from the "
    "trunk. If the trunk doesn't have enough gpu memory, additional "
    "trunks of the gpu memory will be requested from gpu with size "
    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
    "no memory left for the additional trunk. Note: if you set this "
    "flag, the memory size set by "
    "FLAGS_fraction_of_gpu_memory_to_use will be overrided by this "
    "flag. If you don't set this flag, PaddlePaddle will use "
    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");

588 589 590 591 592 593 594 595 596
/**
 * Memory related FLAG
 * Name: FLAGS_reallocate_gpu_memory_in_mb
 * Since Version: 1.4.0
 * Value Range: uint64, default=0 (MB)
 * Example:
 * Note: If the allocated GPU memory blocks are exhausted,
 *       additional GPU memory blocks are reallocated
 */
Z
Zeng Jinle 已提交
597
PADDLE_DEFINE_EXPORTED_uint64(
598 599
    reallocate_gpu_memory_in_mb,
    0ul,
Z
Zeng Jinle 已提交
600 601 602 603 604
    "If this flag is set, Paddle will reallocate the gpu memory with "
    "size specified by this flag. Else Paddle will reallocate by "
    "FLAGS_fraction_of_gpu_memory_to_use");

PADDLE_DEFINE_EXPORTED_uint64(
605 606
    gpu_memory_limit_mb,
    0UL,
Z
Zeng Jinle 已提交
607 608 609 610 611 612
    "The maximum gpu memory limit that the process can allocate. "
    "If it is equal to 0, there would be no limit and all gpu memory "
    "would be available to the process. If it is larger than 0, "
    "the process would raise out of memory error if the allocated "
    "memory exceeds the limit even though there is available "
    "memory on the gpu card. The unit is MB and default value is 0.");
613

614
#endif
615 616 617 618 619 620 621 622 623

/**
 * Scope related FLAG
 * Name: local_exe_sub_scope_limit
 * Since Version: 1.6.0
 * Value Range: double, default=256 (MB)
 * Example:
 * Note:
 */
Z
Zeng Jinle 已提交
624
PADDLE_DEFINE_EXPORTED_double(
625 626
    local_exe_sub_scope_limit,
    256.0,  // MBytes
Z
Zeng Jinle 已提交
627 628 629 630
    "The memory up limit of sub-scopes of local execution scope for "
    "each CUDAPlace. If you don't need to limit the memory, "
    "you should set FLAGS_local_exe_sub_scope_limit=-1. "
    "The default value is 256 MBytes.");
631

632
PADDLE_DEFINE_EXPORTED_bool(
633 634
    reader_queue_speed_test_mode,
    false,
635 636 637
    "If set true, the queue.pop will only get data from queue but not "
    "remove the data from queue for speed testing");

638 639 640 641 642 643 644 645
/**
 * MKLDNN related FLAG
 * Name: use_mkldnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note:
 */
Z
Zeng Jinle 已提交
646
PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");
647 648 649 650 651 652 653 654 655 656 657 658 659 660 661

/**
 * Debug related FLAG
 * Name: FLAGS_call_stack_level
 * Since Version: 2.0.0
 * Value Range: int, default=2
 * Example:
 * Note: Used to debug. Determine the call stack to print when error or
 * exeception happens.
 * If FLAGS_call_stack_level == 0, only the error message summary will be shown.
 * If FLAGS_call_stack_level == 1, the python stack and  error message summary
 * will be shown.
 * If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
 * message summary will be shown.
 */
662 663 664
#ifdef PADDLE_NO_PYTHON
static const int32_t kDefaultCallStackLevel = 2;
#else
665
static const int32_t kDefaultCallStackLevel = 1;
666
#endif
667

Z
Zeng Jinle 已提交
668
PADDLE_DEFINE_EXPORTED_int32(
669 670
    call_stack_level,
    kDefaultCallStackLevel,
671 672 673 674 675 676 677 678
    "Determine the call stack to print when error or exeception happens."
    // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
    // "If FLAGS_call_stack_level == 0, only the error message summary will be "
    // "shown. "
    "If FLAGS_call_stack_level == 1, the python stack and error message "
    "summary will be shown."
    "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
    "error message summary will be shown.");
679 680 681 682 683 684 685 686 687 688

/**
 * Debug related FLAG
 * Name: sort_sum_gradient
 * Since Version: 2.0.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, gradients are summed by the reverse order of
 * the forward execution sequence.
 */
689 690
PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient,
                            false,
Z
Zeng Jinle 已提交
691 692
                            "Sum gradients by the reverse order of "
                            "the forward execution sequence.");
693 694 695 696 697 698 699 700 701

/**
 * Performance related FLAG
 * Name: max_inplace_grad_add
 * Since Version: 2.0.0
 * Value Range: int32, default=0
 * Example:
 * Note: The maximum number of inplace grad_add.
 */
Z
Zeng Jinle 已提交
702
PADDLE_DEFINE_EXPORTED_int32(
703 704
    max_inplace_grad_add,
    0,
705 706 707 708
    "The maximum number of inplace grad_add. When doing "
    "gradient accumulation, if the number of gradients need to that "
    "less FLAGS_max_inplace_grad_add, than it will be use several grad_add"
    "instead of sum. Default is 0.");
709 710 711 712 713 714 715 716 717

/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_on
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be enabled.
 */
718 719
PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on,
                              "",
Z
Zeng Jinle 已提交
720
                              "List of OneDNN operation types to be turned on");
721 722 723 724 725 726 727 728 729

/**
 * Debug related FLAG
 * Name: tracer_mkldnn_ops_off
 * Since Version: 2.0.0
 * Value Range: string, default=empty
 * Example:
 * Note: Holds list of operation types with OneDNN kernels to be disabled.
 */
Z
Zeng Jinle 已提交
730
PADDLE_DEFINE_EXPORTED_string(
731 732
    tracer_mkldnn_ops_off,
    "",
Z
Zeng Jinle 已提交
733
    "List of OneDNN operation types to be turned off");
734

735 736 737 738 739 740 741 742 743
/**
 * Debug related FLAG
 * Name: check_kernel_launch
 * Since Version: 2.1.0
 * Value Range: bool, default=false
 * Example:
 * Note: Check kernel launch status after every kernel compute.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Z
Zeng Jinle 已提交
744
PADDLE_DEFINE_EXPORTED_bool(
745 746
    check_kernel_launch,
    false,
Z
Zeng Jinle 已提交
747
    "Check kernel launch status after every kernel compute");
748 749
#endif

750 751 752 753 754 755 756 757 758
/**
 * CUDNN related FLAG
 * Name: conv2d_disable_cudnn
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: Disable cudnn in conv2d.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
759 760
PADDLE_DEFINE_EXPORTED_bool(conv2d_disable_cudnn,
                            false,
Z
Zeng Jinle 已提交
761
                            "Disable cudnn in conv2d");
762

763 764
PADDLE_DEFINE_EXPORTED_bool(use_fast_math,
                            false,
765
                            "Whether to use fast math GPU functions.");
766
#endif
B
Baibaifan 已提交
767 768 769 770 771 772 773 774 775

/**
 * Distributed related FLAG
 * Name: FLAGS_get_host_by_name_time
 * Since Version: 2.2.0
 * Value Range: int32, default=120
 * Example:
 * Note: Get host by name time.
 */
F
fwenguang 已提交
776 777 778
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) ||      \
    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_MLU)
779 780
PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time,
                             120,
Z
Zeng Jinle 已提交
781
                             "The maximum time for get host by name time");
B
Baibaifan 已提交
782
#endif
783 784 785 786 787 788 789 790 791 792

/**
 * Distributed related FLAG
 * Name: FLAGS_apply_pass_to_program
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_apply_pass_to_program=true would apply IR Pass to
 *          program when using Fleet APIs.
 * Note: Apply IR pass to program. Be only useful when using Fleet APIs.
 */
Z
Zeng Jinle 已提交
793
PADDLE_DEFINE_EXPORTED_bool(
794 795
    apply_pass_to_program,
    false,
796
    "It controls whether to apply IR pass to program when using Fleet APIs");
Y
yaoxuefeng 已提交
797

D
danleifeng 已提交
798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825
/**
 * Distributed related FLAG
 * Name: FLAGS_graph_load_in_parallel
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Control whether load graph node and edge with multi threads parallely
 *       If it is not set, load graph data with one thread
 */
PADDLE_DEFINE_EXPORTED_bool(graph_load_in_parallel,
                            false,
                            "It controls whether load graph node and edge with "
                            "mutli threads parallely.");

/**
 * Distributed related FLAG
 * Name: FLAGS_graph_get_neighbor_id
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example:
 * Note: Control get all neighbor id when running sub part graph
 *       If it is not set, do not need get neighbor id when run all part graph
 */
PADDLE_DEFINE_EXPORTED_bool(
    graph_get_neighbor_id,
    false,
    "It controls get all neighbor id when running sub part graph.");

F
Feng Xing 已提交
826
/**
L
Liu-xiandong 已提交
827
 * KP kernel related FLAG
F
Feng Xing 已提交
828 829 830
 * Name: FLAGS_run_kp_kernel
 * Since Version: 2.3.0
 * Value Range: bool, default=false
L
Liu-xiandong 已提交
831 832
 * Example: FLAGS_run_kp_kernel=true would use the kp kernel to compute in the
 * Op.
F
Feng Xing 已提交
833 834
 * Note:
 */
835 836
PADDLE_DEFINE_EXPORTED_bool(run_kp_kernel,
                            false,
L
Liu-xiandong 已提交
837
                            "It controls whether to run PaddlePaddle using KP");
F
Feng Xing 已提交
838

839
/**
840 841 842 843 844 845 846 847 848 849
 * Distributed related FLAG
 * Name: FLAGS_allreduce_record_one_event
 * Since Version: 2.2.0
 * Value Range: bool, default=false
 * Example: FLAGS_allreduce_record_one_event=true makes the allreduce
 *          operations would only wait one event instead of multiple events.
 * Note: Make the allreduce operations would only wait one event instead of
 *       multiple events. Currently, only fuse allreduce supports this.
 *       Otherwise, the precision may be wrong.
 */
850 851
PADDLE_DEFINE_EXPORTED_bool(allreduce_record_one_event,
                            false,
852 853 854 855 856
                            "It controls whether the allreduce operations "
                            "would only wait one event instead of multiple "
                            "events. Currently, only fuse allreduce supports "
                            "this. Otherwise, the precision may be wrong.");

857
#ifdef PADDLE_WITH_CINN
858
/*
859 860 861 862 863 864 865 866
 * CINN related FLAG
 * Name: FLAGS_use_cinn
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_use_cinn=true would run PaddlePaddle using CINN
 */
PADDLE_DEFINE_EXPORTED_bool(
    use_cinn, false, "It controls whether to run PaddlePaddle using CINN");
867 868 869 870 871 872 873 874 875

/*
 * CINN related FLAG
 * Name: FLAGS_allow_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_allow_cinn_ops="mul;relu" would only cover `mul` and `relu`
 * when using CINN
 */
876 877
PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops,
                              "",
878 879 880 881 882 883 884 885 886 887 888
                              "It controls the cinn op subset to be used, "
                              "which has the highest priority.");

/*
 * CINN related FLAG
 * Name: FLAGS_deny_cinn_ops
 * Since Version: 2.3
 * Value Range: string, default=""
 * Example: FLAGS_deny_cinn_ops="mul;relu" would block `mul` and `relu` two ops
 * when using CINN
 */
889 890
PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops,
                              "",
891
                              "It controls the cinn op subset to be not used.");
892 893 894 895 896 897 898 899 900 901

/*
 * CINN related FLAG
 * Name: FLAGS_enable_pe_launch_cinn
 * Since Version: 2.3
 * Value Range: bool, default=true
 * Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled
 * instructions of a paddle graph with ParallelExecutor, otherwise with the
 * CINN compiled runtime program in sequential order.
 */
902 903
PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn,
                            true,
904 905 906 907 908 909 910 911 912 913 914
                            "It controls whether to execute cinn compiled "
                            "program with ParallelExecutor");

/*
 * CINN related FLAG
 * Name: FLAGS_enable_cinn_auto_tune
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its
 * auto-tune feature enabled
 */
915 916
PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune,
                            false,
917 918 919
                            "It controls whether to use cinn with "
                            "its auto-tune feature enabled");

920
#endif
921

922 923
DEFINE_int32(record_pool_max_size,
             2000000,
Y
yaoxuefeng 已提交
924 925
             "SlotRecordDataset slot record pool max size");
DEFINE_int32(slotpool_thread_num, 1, "SlotRecordDataset slot pool thread num");
926 927
DEFINE_bool(enable_slotpool_wait_release,
            false,
Y
yaoxuefeng 已提交
928
            "enable slotrecord obejct wait release, default false");
929 930
DEFINE_bool(enable_slotrecord_reset_shrink,
            false,
Y
yaoxuefeng 已提交
931
            "enable slotrecord obejct reset shrink memory, default false");
932 933
DEFINE_bool(enable_ins_parser_file,
            false,
D
danleifeng 已提交
934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960
            "enable parser ins file, default false");
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_hbm_table_collision_stat,
    false,
    "enable hash collisions stat for hbm table, default false");
PADDLE_DEFINE_EXPORTED_double(gpugraph_hbm_table_load_factor,
                              0.75,
                              "the load factor of hbm table, default 0.75");
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_gpu_direct_access,
    false,
    "enable direct access bwtween multi gpu cards, default false");
PADDLE_DEFINE_EXPORTED_bool(
    gpugraph_enable_segment_merge_grads,
    false,
    "enable segment merge gradients while push sparse, default false");
PADDLE_DEFINE_EXPORTED_uint64(
    gpugraph_merge_grads_segment_size,
    128,
    "segment size with segment gradient merge, default 128");
PADDLE_DEFINE_EXPORTED_int32(
    gpugraph_dedup_pull_push_mode,
    0,
    "enable dedup keys while pull push sparse, default 0");
PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm,
                            true,
                            "enable load_node_list_into_hbm, default true");
961 962 963 964 965 966 967 968 969 970 971 972

/**
 * ProcessGroupNCCL related FLAG
 * Name: nccl_blocking_wait
 * Since Version:
 * Value Range: bool, default=false
 * Example:
 * Note: nccl blocking wait.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait");
#endif
973 974 975 976 977 978 979 980 981

/**
 * Autotune related FLAG
 * Name: FLAGS_use_autotune
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
 */
PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune.");
982

H
hong 已提交
983 984 985 986 987 988 989 990 991 992 993
/**
 * Conv Search cache max number related FLAG
 * Name: FLAGS_search_cache_max_number
 * Since Version: 2.3.0
 * Value Range: int32, default=1000000
 * Example:
 */
PADDLE_DEFINE_EXPORTED_int32(search_cache_max_number,
                             1000000,
                             "search_cache_max_number.");

994 995 996 997 998 999 1000 1001 1002 1003
/**
 * Preformance related FLAG
 * Name: einsum_opt
 * Since Version: 2.3.0
 * Value Range: bool, default=false
 * Example:
 * Note: If True, EinsumOp will be optimimzed by innercache reuse, which
 * uses more gpu memory.
 */
PADDLE_DEFINE_EXPORTED_bool(
1004 1005
    einsum_opt,
    false,
1006
    "EinsumOp backward will be speedup at the expense of more gpu memory.");
1007 1008 1009 1010 1011 1012 1013 1014 1015

/**
 * JitLayer related FLAG
 * Name: FLAGS_jit_engine_type
 * Since Version: 2.3.0
 * Value Range: string, {Executor, PE},
 * default=PE
 * Example:
 * Note:
1016 1017
 * FLAGS_jit_engine_type == Executor, using ExecutorEngine by default
 * FLAGS_jit_engine_type == PE, using PEEngine by default
1018
 * FLAGS_jit_engine_type == New, using InterpreterEngine by default
1019 1020 1021 1022
 */
PADDLE_DEFINE_EXPORTED_string(jit_engine_type,
                              "PE",
                              "Choose default funciton type in JitLayer.");