diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index 595aba887303d0b8f50a1f95750311bb4ca27959..5953f2443044f355352c9febc02eda05d957d8ec 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -11,7 +11,7 @@ cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
 cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal)
 cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
 cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
-cc_library(imperative_profiler SRCS profiler.cc)
+cc_library(imperative_profiler SRCS profiler.cc DEPS flags)
 if(NOT WIN32)
 if(WITH_NCCL OR WITH_RCCL)
 cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows tensor)
diff --git a/paddle/fluid/imperative/flags.cc b/paddle/fluid/imperative/flags.cc
index 57656d64ab78868a5c1c5eb73520523cd0f5d0b5..c2d668eccdaf9377d1ec2a62e50648bf48c67c00 100644
--- a/paddle/fluid/imperative/flags.cc
+++ b/paddle/fluid/imperative/flags.cc
@@ -13,11 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/imperative/flags.h"
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
 
-DEFINE_uint64(dygraph_debug, 0,
-              "Debug level of dygraph. This flag is not "
-              "open to users");
+PADDLE_DEFINE_EXPORTED_uint64(dygraph_debug, 0,
+                              "Debug level of dygraph. This flag is not "
+                              "open to users");
 
 namespace paddle {
 namespace imperative {
diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc
index 6d0f6a12f522977564c9dc36029670d4f3c9d752..48af63056c5e361736046aa47d83735190d85c20 100644
--- a/paddle/fluid/imperative/profiler.cc
+++ b/paddle/fluid/imperative/profiler.cc
@@ -19,9 +19,9 @@
 #endif
 #include
 #include  // NOLINT
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
 
-DEFINE_string(
+PADDLE_DEFINE_EXPORTED_string(
     tracer_profile_fname, "xxgperf",
     "Profiler filename for imperative tracer, which generated by gperftools."
     "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 9a0637453f03f08a50bb1af958b1ba5e584869b4..6b4afae9f8c7527060004ae0b342d6a508a1b4d5 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -99,7 +99,7 @@ cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc
 cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
 
-cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator)
+cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator flags)
 cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
 cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator)
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index bfc4a1d598200ed296bdb17e29c48bed2bca1e16..78bce53b6f4ffb2f2a77b5ec2a9c645f32651de2 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -37,14 +37,15 @@
 #endif
 #include "paddle/fluid/platform/npu_info.h"
 
-DEFINE_int64(
+PADDLE_DEFINE_EXPORTED_int64(
     gpu_allocator_retry_time, 10000,
     "The retry time (milliseconds) when allocator fails "
     "to allocate memory. No retry if this value is not greater than 0");
 
-DEFINE_bool(use_system_allocator, false,
-            "Whether to use system allocator to allocate CPU and GPU memory. "
-            "Only used for unittests.");
+PADDLE_DEFINE_EXPORTED_bool(
+    use_system_allocator, false,
+    "Whether to use system allocator to allocate CPU and GPU memory. "
+    "Only used for unittests.");
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index cca29797bb68c27514e27e3ced291202915acc64..a35d8a73f7edae601ac5ab4c01f38e8772e724b3 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -17,18 +17,21 @@
 #include
 #include  // NOLINT
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
-
-DEFINE_bool(free_idle_chunk, false,
-            "Whether to free idle chunk when each allocation is freed. "
-            "If false, all freed allocation would be cached to speed up next "
-            "allocation request. If true, no allocation would be cached. This "
-            "flag only works when FLAGS_allocator_strategy=auto_growth.");
-
-DEFINE_bool(free_when_no_cache_hit, false,
-            "Whether to free idle chunks when no cache hit. If true, idle "
-            "chunk would be freed when no cache hit; if false, idle "
-            "chunk would be freed when out of memory occurs. This flag "
-            "only works when FLAGS_allocator_strategy=auto_growth.");
+#include "paddle/fluid/platform/flags.h"
+
+PADDLE_DEFINE_EXPORTED_READONLY_bool(
+    free_idle_chunk, false,
+    "Whether to free idle chunk when each allocation is freed. "
+    "If false, all freed allocation would be cached to speed up next "
+    "allocation request. If true, no allocation would be cached. This "
+    "flag only works when FLAGS_allocator_strategy=auto_growth.");
+
+PADDLE_DEFINE_EXPORTED_READONLY_bool(
+    free_when_no_cache_hit, false,
+    "Whether to free idle chunks when no cache hit. If true, idle "
+    "chunk would be freed when no cache hit; if false, idle "
+    "chunk would be freed when out of memory occurs. This flag "
+    "only works when FLAGS_allocator_strategy=auto_growth.");
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index 9cd35ad8ad9da959606d895063fe1981c5ade18f..2c00b34dd1353b583a680fd12416fa4d545566ad 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -34,12 +34,13 @@
 #include "paddle/fluid/platform/xpu/xpu_header.h"
 #endif
 
-DEFINE_bool(init_allocated_mem, false,
-            "It is a mistake that the values of the memory allocated by "
-            "BuddyAllocator are always zeroed in some op's implementation. "
-            "To find this error in time, we use init_allocated_mem to indicate "
-            "that initializing the allocated memory with a small value "
-            "during unit testing.");
+PADDLE_DEFINE_EXPORTED_bool(
+    init_allocated_mem, false,
+    "It is a mistake that the values of the memory allocated by "
+    "BuddyAllocator are always zeroed in some op's implementation. "
+    "To find this error in time, we use init_allocated_mem to indicate "
+    "that initializing the allocated memory with a small value "
+    "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
index 8e249e72db514a790c977558135ed5ec3a3ab35a..cd1bdc4d60c7496878d2d2a36021fc6efd6f4443 100644
--- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
+++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
@@ -15,7 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 
-DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
+PADDLE_DEFINE_EXPORTED_int32(rpc_send_thread_num, 12,
+                             "number of threads for rpc send");
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index d99f991911e9ca9ebeca7a6d7fbf31ada33bf811..d0e701b0235574f78a612f351fdd082cae4e8a23 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -37,13 +37,13 @@ if (WITH_PYTHON)
 endif(NOT WIN32)
 endif()
 
-cc_library(flags SRCS flags.cc DEPS gflags)
+cc_library(flags SRCS flags.cc DEPS gflags boost)
 cc_library(denormal SRCS denormal.cc DEPS)
 
 cc_library(errors SRCS errors.cc DEPS error_codes_proto)
 cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
 
-set(enforce_deps flags errors boost)
+set(enforce_deps flags errors boost flags)
 if(WITH_GPU)
 set(enforce_deps ${enforce_deps} external_error_proto)
 endif()
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index 6405b556217660bc0efb52eef33c83a3aceafc80..e4860444865719ede3eb7fb388fbf734ecf284e6 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -31,7 +31,7 @@ limitations under the License. */
 #endif  // _WIN32
 
 #include
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
 
 DECLARE_double(fraction_of_cpu_memory_to_use);
 DECLARE_uint64(initial_cpu_memory_in_mb);
@@ -42,7 +42,8 @@ DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
 // between host and device. Allocates too much would reduce the amount
 // of memory available to the system for paging. So, by default, we
 // should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+PADDLE_DEFINE_EXPORTED_bool(use_pinned_memory, true,
+                            "If set, allocate cpu pinned memory.");
 
 namespace paddle {
 namespace platform {
@@ -54,7 +55,9 @@ size_t CpuTotalPhysicalMemory() {
   mib[1] = HW_MEMSIZE;
   int64_t size = 0;
   size_t len = sizeof(size);
-  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
+  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) {
+    return static_cast<size_t>(size);
+  }
   return 0L;
 #elif defined(_WIN32)
   MEMORYSTATUSEX sMeminfo;
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 52be0c805bbd2a68481a6cdbfef0de42b1a946f5..c420a5a64be068fdb6cc58531bacd0cbb92928af 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -101,6 +101,7 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/type_defs.h"
 #endif
+#include "paddle/fluid/platform/flags.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index ed465c9ea2eb8ad78543d9a42733f9228fe2ba31..b97c3106439bede55a6faafee980e7226702733f 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -12,11 +12,26 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
 #endif
 
+namespace paddle {
+namespace platform {
+
+const ExportedFlagInfoMap &GetExportedFlagInfoMap() {
+  return *GetMutableExportedFlagInfoMap();
+}
+
+ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() {
+  static ExportedFlagInfoMap g_exported_flag_info_map;
+  return &g_exported_flag_info_map;
+}
+
+}  // namespace platform
+}  // namespace paddle
+
 /**
  * NOTE(paddle-dev): This file is designed to define all public FLAGS.
  */
@@ -30,8 +45,8 @@
  * instance to 2
  * Note:
  */
-DEFINE_int32(paddle_num_threads, 1,
-             "Number of threads for each paddle instance.");
+PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads, 1,
+                             "Number of threads for each paddle instance.");
 
 /**
  * Operator related FLAG
 * Name: FLAGS_check_nan_inf
 * Since Version:
 * Value Range:
 * Example:
 * Note: Used to debug. Checking whether operator produce NAN/INF or not.
 */
-DEFINE_bool(check_nan_inf, false,
-            "Checking whether operator produce NAN/INF or not. It will be "
-            "extremely slow so please use this flag wisely.");
+PADDLE_DEFINE_EXPORTED_bool(
+    check_nan_inf, false,
+    "Checking whether operator produce NAN/INF or not. It will be "
+    "extremely slow so please use this flag wisely.");
 
 // NOTE(zhiqiu): better to share the flags, otherwise we will have too many
 // flags.
@@ -58,7 +74,7 @@ DEFINE_bool(check_nan_inf, false,
 * Example:
 * Note: whether to use Tensor Core, faster but it may loss precision.
*/ -DEFINE_bool( +PADDLE_DEFINE_EXPORTED_bool( enable_cublas_tensor_op_math, false, "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " "but it may loss precision. Currently, There are two CUDA libraries that" @@ -77,30 +93,34 @@ DEFINE_bool( * cards * Note: A list of device ids separated by comma, like: 0,1,2,3 */ -DEFINE_string(selected_gpus, "", - "A list of device ids separated by comma, like: 0,1,2,3. " - "This option is useful when doing multi process training and " - "each process have only one device (GPU). If you want to use " - "all visible devices, set this to empty string. NOTE: the " - "reason of doing this is that we want to use P2P communication" - "between GPU devices, use CUDA_VISIBLE_DEVICES can only use" - "share-memory only."); +PADDLE_DEFINE_EXPORTED_string( + selected_gpus, "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (GPU). If you want to use " + "all visible devices, set this to empty string. NOTE: the " + "reason of doing this is that we want to use P2P communication" + "between GPU devices, use CUDA_VISIBLE_DEVICES can only use" + "share-memory only."); #endif #if defined(PADDLE_WITH_ASCEND_CL) -DEFINE_string(selected_npus, "", - "A list of device ids separated by comma, like: 0,1,2,3. " - "This option is useful when doing multi process training and " - "each process have only one device (NPU). If you want to use " - "all visible devices, set this to empty string."); -DEFINE_bool(hccl_check_nan, true, - "Check Nan in tensor before hccl_allreduce_sum otherwise it'll " - "core when meets Nan value"); -DEFINE_string( +PADDLE_DEFINE_EXPORTED_string( + selected_npus, "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (NPU). If you want to use " + "all visible devices, set this to empty string."); +PADDLE_DEFINE_EXPORTED_bool( + hccl_check_nan, true, + "Check Nan in tensor before hccl_allreduce_sum otherwise it'll " + "core when meets Nan value"); +PADDLE_DEFINE_EXPORTED_string( npu_config_path, "", "The absolute path of configuration json file, like: /tmp/config.json. " "If proveided, it will be passed to aclInit()."); -DEFINE_int32(min_loss_scaling, 1, "set minmum loss scaling value!"); +PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling, 1, + "set minmum loss scaling value!"); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -113,10 +133,11 @@ DEFINE_int32(min_loss_scaling, 1, "set minmum loss scaling value!"); * Note: whether to use deterministic algorithm in cudnn. * If true, it will slow down some operators such as conv and pooling. */ -DEFINE_bool(cudnn_deterministic, false, - "Whether allow using an autotuning algorithm for convolution " - "operator. The autotuning algorithm may be non-deterministic. If " - "true, the algorithm is deterministic."); +PADDLE_DEFINE_EXPORTED_bool( + cudnn_deterministic, false, + "Whether allow using an autotuning algorithm for convolution " + "operator. The autotuning algorithm may be non-deterministic. If " + "true, the algorithm is deterministic."); /** * CUDNN related FLAG @@ -130,9 +151,10 @@ DEFINE_bool(cudnn_deterministic, false, * increased. * Users need to balance memory and speed. 
*/ -DEFINE_uint64(conv_workspace_size_limit, - paddle::platform::kDefaultConvWorkspaceSizeLimitMB, - "cuDNN convolution workspace limit in MB unit."); +PADDLE_DEFINE_EXPORTED_uint64( + conv_workspace_size_limit, + paddle::platform::kDefaultConvWorkspaceSizeLimitMB, + "cuDNN convolution workspace limit in MB unit."); /** * CUDNN related FLAG @@ -148,9 +170,10 @@ DEFINE_uint64(conv_workspace_size_limit, * layer specification. Once you change the layer specifications * (such as batch size, feature map size), it will search again. */ -DEFINE_bool(cudnn_exhaustive_search, false, - "Whether enable exhaustive search for cuDNN convolution or " - "not, default is False."); +PADDLE_DEFINE_EXPORTED_bool( + cudnn_exhaustive_search, false, + "Whether enable exhaustive search for cuDNN convolution or " + "not, default is False."); /** * CUDNN related FLAG @@ -160,9 +183,9 @@ DEFINE_bool(cudnn_exhaustive_search, false, * Example: * Note: only used to predict for advanced developer */ -DEFINE_int64(cudnn_exhaustive_search_times, -1, - "Exhaustive search times for cuDNN convolution, " - "default is -1, not exhaustive search"); +PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times, -1, + "Exhaustive search times for cuDNN convolution, " + "default is -1, not exhaustive search"); /** * CUDNN related FLAG @@ -180,9 +203,10 @@ DEFINE_int64(cudnn_exhaustive_search_times, -1, * certain * input data range. */ -DEFINE_bool(cudnn_batchnorm_spatial_persistent, false, - "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn " - "batch_norm, default is False."); +PADDLE_DEFINE_EXPORTED_bool( + cudnn_batchnorm_spatial_persistent, false, + "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn " + "batch_norm, default is False."); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -197,7 +221,7 @@ DEFINE_bool(cudnn_batchnorm_spatial_persistent, false, * https://github.com/PaddlePaddle/Paddle/issues/15049 * If you want to change this default value, why?(gongwb) */ -DEFINE_bool( +PADDLE_DEFINE_EXPORTED_bool( sync_nccl_allreduce, true, "If set true, will call `cudaStreamSynchronize(nccl_stream)`" "after allreduce, this mode can get better performance in some scenarios."); @@ -215,11 +239,12 @@ DEFINE_bool( * into the queue, and then the communicator takes the gradients out * of the queue and sends them after merging. */ -DEFINE_int32(communicator_max_merge_var_num, 20, - "max var num to merge and send"); -DEFINE_bool(communicator_is_sgd_optimizer, true, - "gradient sent to the server is the sum of the gradients " - "calculated by each thread if optimizer is sgd"); +PADDLE_DEFINE_EXPORTED_int32(communicator_max_merge_var_num, 20, + "max var num to merge and send"); +PADDLE_DEFINE_EXPORTED_bool( + communicator_is_sgd_optimizer, true, + "gradient sent to the server is the sum of the gradients " + "calculated by each thread if optimizer is sgd"); /** * Distributed related FLAG * Name: FLAGS_communicator_send_queue_size @@ -233,8 +258,8 @@ DEFINE_bool(communicator_is_sgd_optimizer, true, * space. It is used to avoid training much faster than communication, * so that too many gradients are not sent out in time. */ -DEFINE_int32(communicator_send_queue_size, 20, - "queue size to recv gradient before send"); +PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size, 20, + "queue size to recv gradient before send"); #endif /** @@ -246,8 +271,9 @@ DEFINE_int32(communicator_send_queue_size, 20, * Note: Control the number of threads used for distributed modules. 
* If it is not set, it is set to a hard thread. */ -DEFINE_int32(dist_threadpool_size, 0, - "number of threads used for distributed executed."); +PADDLE_DEFINE_EXPORTED_int32( + dist_threadpool_size, 0, + "number of threads used for distributed executed."); /** * Garbage collector related FLAG @@ -272,7 +298,7 @@ static const double kDefaultEagerDeleteTensorGB = -1; static const double kDefaultEagerDeleteTensorGB = 0; #endif -DEFINE_double( +PADDLE_DEFINE_EXPORTED_double( eager_delete_tensor_gb, kDefaultEagerDeleteTensorGB, "Memory size threshold (GB) when the garbage collector clear tensors." "Disabled when this value is less than 0"); @@ -289,9 +315,10 @@ DEFINE_double( * has finished, which will make the garbage collection strategy faster. * Only works when garbage collection strategy is enabled. */ -DEFINE_bool(fast_eager_deletion_mode, true, - "Fast eager deletion mode. If enabled, memory would release " - "immediately without waiting GPU kernel ends."); +PADDLE_DEFINE_EXPORTED_bool( + fast_eager_deletion_mode, true, + "Fast eager deletion mode. If enabled, memory would release " + "immediately without waiting GPU kernel ends."); /** * Memory related FLAG @@ -311,11 +338,12 @@ DEFINE_bool(fast_eager_deletion_mode, true, * largest FLAGS_memory_fraction_of_eager_deletion ratio will be released. * The flag is only valid when running parallel data compilers. */ -DEFINE_double(memory_fraction_of_eager_deletion, 1.0, - "Fraction of eager deletion. If less than 1.0, all variables in " - "the program would be sorted according to its memory size, and " - "only the FLAGS_memory_fraction_of_eager_deletion of the largest " - "variables would be deleted."); +PADDLE_DEFINE_EXPORTED_double( + memory_fraction_of_eager_deletion, 1.0, + "Fraction of eager deletion. If less than 1.0, all variables in " + "the program would be sorted according to its memory size, and " + "only the FLAGS_memory_fraction_of_eager_deletion of the largest " + "variables would be deleted."); /** * Allocator related FLAG @@ -331,7 +359,7 @@ static constexpr char kDefaultAllocatorStrategy[] = "naive_best_fit"; #else static constexpr char kDefaultAllocatorStrategy[] = "auto_growth"; #endif -DEFINE_string( +PADDLE_DEFINE_EXPORTED_string( allocator_strategy, kDefaultAllocatorStrategy, "The allocation strategy, enum in [naive_best_fit, auto_growth]. " "naive_best_fit means the original pre-allocated allocator of Paddle. " @@ -358,9 +386,9 @@ DEFINE_string( * size as the memory block will be allocated from the CUDA pinned * request util the CPU does not have enough memory. */ -DEFINE_double(fraction_of_cpu_memory_to_use, 1, - "Default use 100% of CPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); +PADDLE_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use, 1, + "Default use 100% of CPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); /** * Memory related FLAG @@ -374,8 +402,9 @@ DEFINE_double(fraction_of_cpu_memory_to_use, 1, * FLAGS_fraction_of_cpu_memory_to_use*(total physical memory) * as memory block sizes. */ -DEFINE_uint64(initial_cpu_memory_in_mb, 500ul, - "Initial CPU memory for PaddlePaddle, in MD unit."); +PADDLE_DEFINE_EXPORTED_uint64( + initial_cpu_memory_in_mb, 500ul, + "Initial CPU memory for PaddlePaddle, in MD unit."); /** * Memory related FLAG @@ -390,7 +419,7 @@ DEFINE_uint64(initial_cpu_memory_in_mb, 500ul, * size as the memory block will be allocated from the CPU * request util the CPU does not have enough memory. 
*/ -DEFINE_double( +PADDLE_DEFINE_EXPORTED_double( fraction_of_cuda_pinned_memory_to_use, 0.5, "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," "reserve the rest for page tables, etc"); @@ -425,12 +454,13 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f; // which may lead to insufficient memory left for paddle constexpr static float fraction_of_gpu_memory_to_use = 0.5f; #endif -DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, - "Allocate a trunk of gpu memory that is this fraction of the " - "total gpu memory size. Future memory usage will be allocated " - "from the trunk. If the trunk doesn't have enough gpu memory, " - "additional trunks of the same size will be requested from gpu " - "until the gpu has no memory left for another trunk."); +PADDLE_DEFINE_EXPORTED_double( + fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, + "Allocate a trunk of gpu memory that is this fraction of the " + "total gpu memory size. Future memory usage will be allocated " + "from the trunk. If the trunk doesn't have enough gpu memory, " + "additional trunks of the same size will be requested from gpu " + "until the gpu has no memory left for another trunk."); /** * Memory related FLAG @@ -444,7 +474,7 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, * FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until * the GPU has no remaining memory. */ -DEFINE_uint64( +PADDLE_DEFINE_EXPORTED_uint64( initial_gpu_memory_in_mb, 0ul, "Allocate a trunk of gpu memory whose byte size is specified by " "the flag. Future memory usage will be allocated from the " @@ -466,18 +496,20 @@ DEFINE_uint64( * Note: If the allocated GPU memory blocks are exhausted, * additional GPU memory blocks are reallocated */ -DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul, - "If this flag is set, Paddle will reallocate the gpu memory with " - "size specified by this flag. Else Paddle will reallocate by " - "FLAGS_fraction_of_gpu_memory_to_use"); - -DEFINE_uint64(gpu_memory_limit_mb, 0UL, - "The maximum gpu memory limit that the process can allocate. " - "If it is equal to 0, there would be no limit and all gpu memory " - "would be available to the process. If it is larger than 0, " - "the process would raise out of memory error if the allocated " - "memory exceeds the limit even though there is available " - "memory on the gpu card. The unit is MB and default value is 0."); +PADDLE_DEFINE_EXPORTED_uint64( + reallocate_gpu_memory_in_mb, 0ul, + "If this flag is set, Paddle will reallocate the gpu memory with " + "size specified by this flag. Else Paddle will reallocate by " + "FLAGS_fraction_of_gpu_memory_to_use"); + +PADDLE_DEFINE_EXPORTED_uint64( + gpu_memory_limit_mb, 0UL, + "The maximum gpu memory limit that the process can allocate. " + "If it is equal to 0, there would be no limit and all gpu memory " + "would be available to the process. If it is larger than 0, " + "the process would raise out of memory error if the allocated " + "memory exceeds the limit even though there is available " + "memory on the gpu card. The unit is MB and default value is 0."); #endif @@ -489,11 +521,12 @@ DEFINE_uint64(gpu_memory_limit_mb, 0UL, * Example: * Note: */ -DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes - "The memory up limit of sub-scopes of local execution scope for " - "each CUDAPlace. If you don't need to limit the memory, " - "you should set FLAGS_local_exe_sub_scope_limit=-1. 
" - "The default value is 256 MBytes."); +PADDLE_DEFINE_EXPORTED_double( + local_exe_sub_scope_limit, 256.0, // MBytes + "The memory up limit of sub-scopes of local execution scope for " + "each CUDAPlace. If you don't need to limit the memory, " + "you should set FLAGS_local_exe_sub_scope_limit=-1. " + "The default value is 256 MBytes."); /** * MKLDNN related FLAG @@ -503,7 +536,7 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes * Example: * Note: */ -DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); +PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run"); /** * Debug related FLAG @@ -525,7 +558,7 @@ static const int32_t kDefaultCallStackLevel = 2; static const int32_t kDefaultCallStackLevel = 1; #endif -DEFINE_int32( +PADDLE_DEFINE_EXPORTED_int32( call_stack_level, kDefaultCallStackLevel, "Determine the call stack to print when error or exeception happens." // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0 @@ -545,9 +578,9 @@ DEFINE_int32( * Note: If True, gradients are summed by the reverse order of * the forward execution sequence. */ -DEFINE_bool(sort_sum_gradient, false, - "Sum gradients by the reverse order of " - "the forward execution sequence."); +PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient, false, + "Sum gradients by the reverse order of " + "the forward execution sequence."); /** * Performance related FLAG @@ -557,7 +590,7 @@ DEFINE_bool(sort_sum_gradient, false, * Example: * Note: The maximum number of inplace grad_add. */ -DEFINE_int32( +PADDLE_DEFINE_EXPORTED_int32( max_inplace_grad_add, 0, "The maximum number of inplace grad_add. When doing " "gradient accumulation, if the number of gradients need to that " @@ -572,8 +605,8 @@ DEFINE_int32( * Example: * Note: Holds list of operation types with OneDNN kernels to be enabled. */ -DEFINE_string(tracer_mkldnn_ops_on, "", - "List of OneDNN operation types to be turned on"); +PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on, "", + "List of OneDNN operation types to be turned on"); /** * Debug related FLAG @@ -583,8 +616,9 @@ DEFINE_string(tracer_mkldnn_ops_on, "", * Example: * Note: Holds list of operation types with OneDNN kernels to be disabled. */ -DEFINE_string(tracer_mkldnn_ops_off, "", - "List of OneDNN operation types to be turned off"); +PADDLE_DEFINE_EXPORTED_string( + tracer_mkldnn_ops_off, "", + "List of OneDNN operation types to be turned off"); /** * Debug related FLAG @@ -595,8 +629,9 @@ DEFINE_string(tracer_mkldnn_ops_off, "", * Note: Check kernel launch status after every kernel compute. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DEFINE_bool(check_kernel_launch, false, - "Check kernel launch status after every kernel compute"); +PADDLE_DEFINE_EXPORTED_bool( + check_kernel_launch, false, + "Check kernel launch status after every kernel compute"); #endif /** @@ -608,7 +643,8 @@ DEFINE_bool(check_kernel_launch, false, * Note: Disable cudnn in conv2d. 
*/ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); +PADDLE_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false, + "Disable cudnn in conv2d"); #endif /** @@ -621,8 +657,8 @@ DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \ defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP) -DEFINE_int32(get_host_by_name_time, 120, - "The maximum time for get host by name time"); +PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time, 120, + "The maximum time for get host by name time"); #endif /** @@ -634,6 +670,6 @@ DEFINE_int32(get_host_by_name_time, 120, * program when using Fleet APIs. * Note: Apply IR pass to program. Be only useful when using Fleet APIs. */ -DEFINE_bool( +PADDLE_DEFINE_EXPORTED_bool( apply_pass_to_program, false, "It controls whether to apply IR pass to program when using Fleet APIs"); diff --git a/paddle/fluid/platform/flags.h b/paddle/fluid/platform/flags.h new file mode 100644 index 0000000000000000000000000000000000000000..b9d78c2e9dc3951d3d79bf60496aa04c15afbf5d --- /dev/null +++ b/paddle/fluid/platform/flags.h @@ -0,0 +1,98 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <string>
+#include <type_traits>
+#include "gflags/gflags.h"
+#include "paddle/fluid/platform/macros.h"
+#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace platform {
+
+struct FlagInfo {
+  using ValueType =
+      boost::variant<bool, int32_t, int64_t, uint64_t, double, std::string>;
+  std::string name;
+  mutable void *value_ptr;
+  ValueType default_value;
+  std::string doc;
+  bool is_writable;
+};
+
+using ExportedFlagInfoMap = std::map<std::string, FlagInfo>;
+const ExportedFlagInfoMap &GetExportedFlagInfoMap();
+ExportedFlagInfoMap *GetMutableExportedFlagInfoMap();
+
+#define __PADDLE_DEFINE_EXPORTED_FLAG(__name, __is_writable, __cpp_type,      \
+                                      __gflag_type, __default_value, __doc)   \
+  DEFINE_##__gflag_type(__name, __default_value, __doc);                      \
+  struct __PaddleRegisterFlag_##__name {                                      \
+    __PaddleRegisterFlag_##__name() {                                         \
+      using FlagDeclaredType =                                                \
+          typename std::remove_reference<decltype(FLAGS_##__name)>::type;     \
+      static_assert(std::is_same<FlagDeclaredType, ::std::string>::value ||   \
+                        std::is_arithmetic<FlagDeclaredType>::value,          \
+                    "FLAGS should be std::string or arithmetic type");        \
+      auto *instance = ::paddle::platform::GetMutableExportedFlagInfoMap();   \
+      auto &info = (*instance)[#__name];                                      \
+      info.name = #__name;                                                    \
+      info.value_ptr = &(FLAGS_##__name);                                     \
+      info.default_value = static_cast<__cpp_type>(__default_value);          \
+      info.doc = __doc;                                                       \
+      info.is_writable = __is_writable;                                       \
+    }                                                                         \
+    int Touch() const { return 0; }                                           \
+  };                                                                          \
+  static __PaddleRegisterFlag_##__name __PaddleRegisterFlag_instance##__name; \
+  int TouchPaddleFlagRegister_##__name() {                                    \
+    return __PaddleRegisterFlag_instance##__name.Touch();                     \
+  }                                                                           \
+  static_assert(std::is_same<__PaddleRegisterFlag_##__name,                   \
+                             ::__PaddleRegisterFlag_##__name>::value,         \
+                "FLAGS should define in global namespace")
+
+#define PADDLE_FORCE_LINK_FLAG(__name)           \
+  extern int TouchPaddleFlagRegister_##__name(); \
+  UNUSED static int __paddle_use_flag_##__name = \
+      TouchPaddleFlagRegister_##__name()
+
+#define PADDLE_DEFINE_EXPORTED_bool(name, default_value, doc) \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, bool, bool, default_value, doc)
+#define PADDLE_DEFINE_EXPORTED_READONLY_bool(name, default_value, doc) \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, false, bool, bool, default_value, doc)
+
+#define PADDLE_DEFINE_EXPORTED_int32(name, default_value, doc) \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, int32_t, int32, default_value, doc)
+
+#define PADDLE_DEFINE_EXPORTED_int64(name, default_value, doc) \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, int64_t, int64, default_value, doc)
+
+#define PADDLE_DEFINE_EXPORTED_uint64(name, default_value, doc)              \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, uint64_t, uint64, default_value, \
+                                doc)
+
+#define PADDLE_DEFINE_EXPORTED_double(name, default_value, doc) \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, double, double, default_value, doc)
+
+#define PADDLE_DEFINE_EXPORTED_string(name, default_value, doc)    \
+  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, ::std::string, string, \
+                                default_value, doc)
+
+}  // namespace platform
+}  // namespace paddle
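The heart of the patch is the `__PADDLE_DEFINE_EXPORTED_FLAG` macro in this new header: it still delegates to the stock gflags `DEFINE_*` macro, but it also plants a registrar struct whose constructor runs during static initialization and records the flag in the process-wide `ExportedFlagInfoMap`. Because `GetMutableExportedFlagInfoMap()` hands out a function-local static (see the flags.cc hunk above), the map is constructed on first use and is safe to touch from initializers in any translation unit. Roughly, with the `static_assert`s elided, `PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run")` expands to:

```cpp
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");  // plain gflags definition

struct __PaddleRegisterFlag_use_mkldnn {
  __PaddleRegisterFlag_use_mkldnn() {
    auto *map = ::paddle::platform::GetMutableExportedFlagInfoMap();
    auto &info = (*map)["use_mkldnn"];
    info.name = "use_mkldnn";
    info.value_ptr = &FLAGS_use_mkldnn;              // pybind reads/writes through this
    info.default_value = static_cast<bool>(false);   // typed via the boost::variant
    info.doc = "Use MKLDNN to run";
    info.is_writable = true;
  }
  int Touch() const { return 0; }
};
static __PaddleRegisterFlag_use_mkldnn __PaddleRegisterFlag_instanceuse_mkldnn;
int TouchPaddleFlagRegister_use_mkldnn() {
  return __PaddleRegisterFlag_instanceuse_mkldnn.Touch();
}
```

The companion `PADDLE_FORCE_LINK_FLAG(use_mkldnn)` emits a reference to `TouchPaddleFlagRegister_use_mkldnn()` from whatever file invokes it, which keeps the linker from discarding the defining object file, and with it the registrar, when nothing else pulls it in.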
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index afae046531143305679f73c2892c55cb89cfc699..290b3353ae54ccbad7ff549a318edb83b75fe7b8 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -43,9 +43,10 @@ limitations under the License. */
 #endif
 
 DECLARE_int32(paddle_num_threads);
-DEFINE_int32(multiple_of_cupti_buffer_size, 1,
-             "Multiple of the CUPTI device buffer size. If the timestamps have "
-             "been dropped when you are profiling, try increasing this value.");
+PADDLE_DEFINE_EXPORTED_int32(
+    multiple_of_cupti_buffer_size, 1,
+    "Multiple of the CUPTI device buffer size. If the timestamps have "
+    "been dropped when you are profiling, try increasing this value.");
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index 14c772d88897f4fa28e7c37a9452b78b637419a2..415babc9cb85e6c93c17dd9fdbf7ef61fc424d4c 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -14,11 +14,12 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/place.h"
 
-DEFINE_bool(benchmark, false,
-            "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs."
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
+PADDLE_DEFINE_EXPORTED_bool(
+    benchmark, false,
+    "Doing memory benchmark. It will make deleting scope synchronized, "
+    "and add some memory usage logs. "
+    "Default cuda is asynchronous device, set to True will "
+    "force op run in synchronous mode.");
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 9c33233e1f79ac799d5acc2a711119d279a9613d..2c8f918414de4687b0be5354fc634587ec22f169 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -24,7 +24,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/nvtx.h"
 #endif
 
-DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
+PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false,
+                            "Enable rpc profiler or not.");
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/xpu/xpu_info.cc b/paddle/fluid/platform/xpu/xpu_info.cc
index 6b8ab16b47d68c3d1cd8fb961aaf3bc6caa5b9b8..3f45286d8f20209a28a12e98b0643a11d72afca6 100644
--- a/paddle/fluid/platform/xpu/xpu_info.cc
+++ b/paddle/fluid/platform/xpu/xpu_info.cc
@@ -18,14 +18,15 @@ limitations under the License. */
 #include "paddle/fluid/platform/xpu/xpu_header.h"
 #include "paddle/fluid/string/split.h"
 
-DEFINE_string(selected_xpus, "",
-              "A list of device ids separated by comma, like: 0,1,2,3. "
-              "This option is useful when doing multi process training and "
-              "each process have only one device (XPU). If you want to use "
-              "all visible devices, set this to empty string. NOTE: the "
-              "reason of doing this is that we want to use P2P communication"
-              "between XPU devices, use XPU_VISIBLE_DEVICES can only use"
-              "share-memory only.");
+PADDLE_DEFINE_EXPORTED_string(
+    selected_xpus, "",
+    "A list of device ids separated by comma, like: 0,1,2,3. "
+    "This option is useful when doing multi process training and "
+    "each process have only one device (XPU). If you want to use "
+    "all visible devices, set this to empty string. NOTE: the "
+    "reason of doing this is that we want to use P2P communication "
+    "between XPU devices, use XPU_VISIBLE_DEVICES can only use "
+    "share-memory only.");
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index 59c7628447479da5a49ae22dd2daf647896a6544..b01e40750f3358eae9e8c4c38332d5f0d7f0dce2 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -29,82 +29,38 @@
 #include "paddle/fluid/platform/macros.h"
 #include "pybind11/stl.h"
 
-// data processing
-DECLARE_bool(use_mkldnn);
-DECLARE_string(tracer_mkldnn_ops_on);
-DECLARE_string(tracer_mkldnn_ops_off);
+// FIXME(zengjinle): these 2 flags may be removed by the linker when compiling
+// CPU-only Paddle. It is because they are only used in
+// AutoGrowthBestFitAllocator, but AutoGrowthBestFitAllocator is not used
+// (in the translation unit level) when compiling CPU-only Paddle. I do not
+// want to add PADDLE_FORCE_LINK_FLAG, but I have not found any other methods
+// to solve this problem.
+PADDLE_FORCE_LINK_FLAG(free_idle_chunk);
+PADDLE_FORCE_LINK_FLAG(free_when_no_cache_hit);
+
 // debug
-DECLARE_bool(check_nan_inf);
 DECLARE_bool(cpu_deterministic);
-DECLARE_bool(enable_rpc_profiler);
-DECLARE_int32(multiple_of_cupti_buffer_size);
-DECLARE_bool(reader_queue_speed_test_mode);
-DECLARE_int32(call_stack_level);
-DECLARE_bool(sort_sum_gradient);
-DECLARE_bool(check_kernel_launch);
-// device management
-DECLARE_int32(paddle_num_threads);
+
+// IR
+DECLARE_bool(convert_all_blocks);
+
 // executor
 DECLARE_bool(enable_parallel_graph);
 DECLARE_string(pe_profile_fname);
 DECLARE_string(print_sub_graph_dir);
-DECLARE_bool(use_ngraph);
+DECLARE_bool(new_executor_use_inplace);
+
 // memory management
-DECLARE_string(allocator_strategy);
-DECLARE_double(eager_delete_tensor_gb);
-DECLARE_double(fraction_of_cpu_memory_to_use);
-DECLARE_bool(free_idle_chunk);
-DECLARE_bool(free_when_no_cache_hit);
+DECLARE_bool(eager_delete_scope);
 DECLARE_int32(fuse_parameter_groups_size);
 DECLARE_double(fuse_parameter_memory_size);
-DECLARE_bool(init_allocated_mem);
-DECLARE_uint64(initial_cpu_memory_in_mb);
-DECLARE_double(memory_fraction_of_eager_deletion);
-DECLARE_bool(use_pinned_memory);
-DECLARE_bool(use_system_allocator);
+
 // others
-DECLARE_bool(benchmark);
 DECLARE_int32(inner_op_parallelism);
-DECLARE_int32(max_inplace_grad_add);
-DECLARE_string(tracer_profile_fname);
-DECLARE_bool(apply_pass_to_program);
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-// cudnn
-DECLARE_uint64(conv_workspace_size_limit);
-DECLARE_bool(cudnn_batchnorm_spatial_persistent);
-DECLARE_bool(cudnn_deterministic);
-DECLARE_bool(cudnn_exhaustive_search);
-DECLARE_bool(conv2d_disable_cudnn);
-// data processing
-DECLARE_bool(enable_cublas_tensor_op_math);
-// device management
-DECLARE_string(selected_gpus);
-// memory management
-DECLARE_bool(eager_delete_scope);
-DECLARE_bool(fast_eager_deletion_mode);
-DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_uint64(gpu_memory_limit_mb);
-DECLARE_uint64(initial_gpu_memory_in_mb);
-DECLARE_uint64(reallocate_gpu_memory_in_mb);
-// others
-DECLARE_bool(sync_nccl_allreduce);
-#endif
-
-#ifdef PADDLE_WITH_XPU
-// device management
-DECLARE_string(selected_xpus);
-#endif
-
-#ifdef PADDLE_WITH_ASCEND_CL
-// device management
-DECLARE_string(selected_npus);
-// set minmum loss scaling value
-DECLARE_int32(min_loss_scaling);
-#endif
+DECLARE_bool(enable_unused_var_check);
+// NOTE: where are these 2 flags from?
 #ifdef PADDLE_WITH_DISTRIBUTE
-DECLARE_int32(rpc_send_thread_num);
 DECLARE_int32(rpc_get_thread_num);
 DECLARE_int32(rpc_prefetch_thread_num);
 #endif
@@ -181,7 +137,6 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry {
     PADDLE_ENFORCE_NOT_NULL(setter,
                             platform::errors::InvalidArgument(
                                 "Setter of %s should not be null", name));
-
     var_infos_.insert({name, VarInfo(is_public, getter, setter)});
   }
@@ -243,81 +198,6 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry {
 
 GlobalVarGetterSetterRegistry GlobalVarGetterSetterRegistry::instance_;
 
-class GlobalVarGetterSetterRegistryHelper {
- public:
-  GlobalVarGetterSetterRegistryHelper(bool is_public, bool is_writable,
-                                      const std::string &var_names)
-      : is_public_(is_public),
-        is_writable_(is_writable),
-        var_names_(SplitVarNames(var_names)) {}
-
-  template <typename... Args>
-  void Register(Args &&... args) const {
-    Impl<0, sizeof...(args) == 1, Args...>::Register(
-        is_public_, is_writable_, var_names_, std::forward<Args>(args)...);
-  }
-
- private:
-  static std::vector<std::string> SplitVarNames(const std::string &names) {
-    auto valid_char = [](char ch) { return !std::isspace(ch) && ch != ','; };
-
-    std::vector<std::string> ret;
-    size_t i = 0, j = 0, n = names.size();
-    while (i < n) {
-      for (; i < n && !valid_char(names[i]); ++i) {
-      }
-      for (j = i + 1; j < n && valid_char(names[j]); ++j) {
-      }
-
-      if (i < n && j <= n) {
-        auto substring = names.substr(i, j - i);
-        VLOG(10) << "Get substring: \"" << substring << "\"";
-        ret.emplace_back(substring);
-      }
-      i = j + 1;
-    }
-    return ret;
-  }
-
- private:
-  template <size_t kIdx, bool kIsStop, typename T, typename... Args>
-  struct Impl {
-    static void Register(bool is_public, bool is_writable,
-                         const std::vector<std::string> &var_names, T &&var,
-                         Args &&... args) {
-      PADDLE_ENFORCE_EQ(kIdx + 1 + sizeof...(args), var_names.size(),
-                        platform::errors::InvalidArgument(
-                            "Argument number not match name number"));
-      Impl<kIdx, true, T>::Register(is_public, is_writable, var_names, var);
-      Impl<kIdx + 1, sizeof...(Args) == 1, Args...>::Register(
-          is_public, is_writable, var_names, std::forward<Args>(args)...);
-    }
-  };
-
-  template <size_t kIdx, typename T>
-  struct Impl<kIdx, true, T> {
-    static void Register(bool is_public, bool is_writable,
-                         const std::vector<std::string> &var_names, T &&var) {
-      auto *instance = GlobalVarGetterSetterRegistry::MutableInstance();
-      if (is_writable) {
-        instance->Register(
-            var_names[kIdx], is_public,
-            GlobalVarGetterSetterRegistry::CreateGetter(std::forward<T>(var)),
-            GlobalVarGetterSetterRegistry::CreateSetter(&var));
-      } else {
-        instance->Register(
-            var_names[kIdx], is_public,
-            GlobalVarGetterSetterRegistry::CreateGetter(std::forward<T>(var)));
-      }
-    }
-  };
-
- private:
-  const bool is_public_;
-  const bool is_writable_;
-  const std::vector<std::string> var_names_;
-};
-
 static void RegisterGlobalVarGetterSetter();
 
 void BindGlobalValueGetterSetter(pybind11::module *module) {
@@ -338,65 +218,69 @@ void BindGlobalValueGetterSetter(pybind11::module *module) {
 }
 
 /* Public vars are designed to be writable. */
-#define REGISTER_PUBLIC_GLOBAL_VAR(...)                                        \
-  do {                                                                         \
-    GlobalVarGetterSetterRegistryHelper(/*is_public=*/true,                    \
-                                        /*is_writable=*/true, "" #__VA_ARGS__) \
-        .Register(__VA_ARGS__);                                                \
+#define REGISTER_PUBLIC_GLOBAL_VAR(var)                                    \
+  do {                                                                     \
+    auto *instance = GlobalVarGetterSetterRegistry::MutableInstance();     \
+    instance->Register(#var, /*is_public=*/true,                           \
+                       GlobalVarGetterSetterRegistry::CreateGetter(var),   \
+                       GlobalVarGetterSetterRegistry::CreateSetter(&var)); \
   } while (0)
 
-#define REGISTER_PRIVATE_GLOBAL_VAR(is_writable, ...)                      \
-  do {                                                                     \
-    GlobalVarGetterSetterRegistryHelper(/*is_public=*/false, is_writable,  \
-                                        "" #__VA_ARGS__)                   \
-        .Register(__VA_ARGS__);                                            \
-  } while (0)
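The deleted `GlobalVarGetterSetterRegistryHelper` had to recover flag names by splitting the stringified `__VA_ARGS__` and then walk the argument pack with a recursive `Impl` template. The replacement below needs none of that: each exported flag arrives from the `ExportedFlagInfoMap` carrying its name, a `void *` to its storage, and a `boost::variant` default whose currently held type tells the visitor which concrete type to cast the pointer back to. A minimal, self-contained model of that variant dispatch (illustrative only, not Paddle code):

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include "boost/variant.hpp"

// boost::apply_visitor calls the operator() instantiation matching the type
// currently held by the variant; this is how the registry recovers the
// concrete C++ type hidden behind each flag's void* storage.
struct PrintVisitor : public boost::static_visitor<void> {
  template <typename T>
  void operator()(const T &v) const {
    std::cout << v << "\n";
  }
};

int main() {
  boost::variant<bool, int64_t, std::string> v = std::string("auto_growth");
  PrintVisitor visitor;
  boost::apply_visitor(visitor, v);  // prints "auto_growth"
  v = static_cast<int64_t>(42);
  boost::apply_visitor(visitor, v);  // prints "42"
}
```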
+struct RegisterGetterSetterVisitor : public boost::static_visitor<void> {
+  RegisterGetterSetterVisitor(const std::string &name, bool is_writable,
+                              void *value_ptr)
+      : name_(name), is_writable_(is_writable), value_ptr_(value_ptr) {}
 
-static void RegisterGlobalVarGetterSetter() {
-  REGISTER_PRIVATE_GLOBAL_VAR(/*is_writable=*/false, FLAGS_free_idle_chunk,
-                              FLAGS_free_when_no_cache_hit);
-
-  REGISTER_PUBLIC_GLOBAL_VAR(
-      FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
-      FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
-      FLAGS_call_stack_level, FLAGS_sort_sum_gradient, FLAGS_cpu_deterministic,
-      FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
-      FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
-      FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
-      FLAGS_fuse_parameter_groups_size, FLAGS_fuse_parameter_memory_size,
-      FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
-      FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
-      FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
-      FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add,
-      FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off,
-      FLAGS_apply_pass_to_program);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  REGISTER_PUBLIC_GLOBAL_VAR(
-      FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic,
-      FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent,
-      FLAGS_cudnn_exhaustive_search, FLAGS_eager_delete_scope,
-      FLAGS_fast_eager_deletion_mode,
-      FLAGS_fraction_of_cuda_pinned_memory_to_use,
-      FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb,
-      FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math,
-      FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce,
-      FLAGS_conv2d_disable_cudnn, FLAGS_check_kernel_launch);
-#endif
-#ifdef PADDLE_WITH_XPU
-  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus);
-#endif
+  template <typename T>
+  void operator()(const T &) const {
+    auto &value = *static_cast<T *>(value_ptr_);
+    auto *instance = GlobalVarGetterSetterRegistry::MutableInstance();
+    bool is_public = is_writable_;  // currently, all writable vars are public
+    if (is_writable_) {
+      instance->Register(name_, is_public,
+                         GlobalVarGetterSetterRegistry::CreateGetter(value),
+                         GlobalVarGetterSetterRegistry::CreateSetter(&value));
+    } else {
+      instance->Register(name_, is_public,
+                         GlobalVarGetterSetterRegistry::CreateGetter(value));
+    }
+  }
 
-#ifdef PADDLE_WITH_ASCEND_CL
-  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus);
-  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_min_loss_scaling);
-#endif
+ private:
+  std::string name_;
+  bool is_writable_;
+  void *value_ptr_;
+};
+
+static void RegisterGlobalVarGetterSetter() {
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_cpu_deterministic);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_convert_all_blocks);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_enable_parallel_graph);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_pe_profile_fname);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_print_sub_graph_dir);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_new_executor_use_inplace);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_eager_delete_scope);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_fuse_parameter_groups_size);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_fuse_parameter_memory_size);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_inner_op_parallelism);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_enable_unused_var_check);
 #ifdef PADDLE_WITH_DITRIBUTE
-  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num,
-                             FLAGS_rpc_get_thread_num,
-                             FLAGS_rpc_prefetch_thread_num);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_get_thread_num);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_prefetch_thread_num);
 #endif
+
+  const auto &flag_map = platform::GetExportedFlagInfoMap();
+  for (const auto &pair : flag_map) {
+    const std::string &name = pair.second.name;
+    bool is_writable = pair.second.is_writable;
+    void *value_ptr = pair.second.value_ptr;
+    const auto &default_value = pair.second.default_value;
+    RegisterGetterSetterVisitor visitor("FLAGS_" + name, is_writable,
+                                        value_ptr);
+    boost::apply_visitor(visitor, default_value);
+  }
 }
+
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc
index 9ed1ed30324b28c740039d6e48a28209f1b90dfa..60b99a964a57fe90454ce4618ee2799aedd697ec 100644
--- a/paddle/fluid/pybind/reader_py.cc
+++ b/paddle/fluid/pybind/reader_py.cc
@@ -32,9 +32,10 @@
 #include "paddle/fluid/platform/place.h"
 #include "pybind11/stl.h"
 
-DEFINE_bool(reader_queue_speed_test_mode, false,
-            "If set true, the queue.pop will only get data from queue but not "
-            "remove the data from queue for speed testing");
+PADDLE_DEFINE_EXPORTED_bool(
+    reader_queue_speed_test_mode, false,
+    "If set true, the queue.pop will only get data from queue but not "
+    "remove the data from queue for speed testing");
 
 namespace paddle {
 namespace pybind {
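The net effect: a flag defined once with `PADDLE_DEFINE_EXPORTED_*` is now visible to the pybind getter/setter layer automatically, which is what allowed most of the hand-maintained `DECLARE_*` lists to be deleted; flags defined through the `READONLY` variant are registered with a getter but no setter. A hypothetical diagnostic, not part of this patch, sketches how any C++ caller could enumerate what was registered:

```cpp
#include <iostream>
#include "paddle/fluid/platform/flags.h"

// Hypothetical helper: list every flag registered through the
// PADDLE_DEFINE_EXPORTED_* macros in this process, with its doc string.
void DumpExportedFlags() {
  for (const auto &pair : paddle::platform::GetExportedFlagInfoMap()) {
    const auto &info = pair.second;
    std::cout << "FLAGS_" << info.name
              << (info.is_writable ? "" : "  [read-only]") << " : " << info.doc
              << "\n";
  }
}
```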