Unverified commit 2c781455, authored by Zeng Jinle, committed by GitHub

Make flag adding easier (#35823)

* make flag setter easier

* update

* rename macro name

* fix bug of public/writable

* update to pass CI

* polish

* fix CPU link error
Parent commit: 71e01d3f
@@ -11,7 +11,7 @@ cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
 cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal)
 cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
 cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
-cc_library(imperative_profiler SRCS profiler.cc)
+cc_library(imperative_profiler SRCS profiler.cc DEPS flags)
 if(NOT WIN32)
 if(WITH_NCCL OR WITH_RCCL)
 cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows tensor)
......
@@ -13,9 +13,9 @@
 // limitations under the License.
 #include "paddle/fluid/imperative/flags.h"
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
-DEFINE_uint64(dygraph_debug, 0,
+PADDLE_DEFINE_EXPORTED_uint64(dygraph_debug, 0,
 "Debug level of dygraph. This flag is not "
 "open to users");
......
@@ -19,9 +19,9 @@
 #endif
 #include <glog/logging.h>
 #include <mutex> // NOLINT
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
-DEFINE_string(
+PADDLE_DEFINE_EXPORTED_string(
 tracer_profile_fname, "xxgperf",
 "Profiler filename for imperative tracer, which generated by gperftools."
 "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
......
@@ -99,7 +99,7 @@ cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc
 cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
-cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator)
+cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator flags)
 cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
 cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator)
......
@@ -37,12 +37,13 @@
 #endif
 #include "paddle/fluid/platform/npu_info.h"
-DEFINE_int64(
+PADDLE_DEFINE_EXPORTED_int64(
 gpu_allocator_retry_time, 10000,
 "The retry time (milliseconds) when allocator fails "
 "to allocate memory. No retry if this value is not greater than 0");
-DEFINE_bool(use_system_allocator, false,
+PADDLE_DEFINE_EXPORTED_bool(
+    use_system_allocator, false,
 "Whether to use system allocator to allocate CPU and GPU memory. "
 "Only used for unittests.");
......
@@ -17,14 +17,17 @@
 #include <algorithm>
 #include <mutex> // NOLINT
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
+#include "paddle/fluid/platform/flags.h"
-DEFINE_bool(free_idle_chunk, false,
+PADDLE_DEFINE_EXPORTED_READONLY_bool(
+    free_idle_chunk, false,
 "Whether to free idle chunk when each allocation is freed. "
 "If false, all freed allocation would be cached to speed up next "
 "allocation request. If true, no allocation would be cached. This "
 "flag only works when FLAGS_allocator_strategy=auto_growth.");
-DEFINE_bool(free_when_no_cache_hit, false,
+PADDLE_DEFINE_EXPORTED_READONLY_bool(
+    free_when_no_cache_hit, false,
 "Whether to free idle chunks when no cache hit. If true, idle "
 "chunk would be freed when no cache hit; if false, idle "
 "chunk would be freed when out of memory occurs. This flag "
......
@@ -34,7 +34,8 @@
 #include "paddle/fluid/platform/xpu/xpu_header.h"
 #endif
-DEFINE_bool(init_allocated_mem, false,
+PADDLE_DEFINE_EXPORTED_bool(
+    init_allocated_mem, false,
 "It is a mistake that the values of the memory allocated by "
 "BuddyAllocator are always zeroed in some op's implementation. "
 "To find this error in time, we use init_allocated_mem to indicate "
......
@@ -15,7 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h"
 #include "paddle/fluid/framework/op_registry.h"
-DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
+PADDLE_DEFINE_EXPORTED_int32(rpc_send_thread_num, 12,
+    "number of threads for rpc send");
 namespace paddle {
 namespace operators {
......
@@ -37,13 +37,13 @@ if (WITH_PYTHON)
 endif(NOT WIN32)
 endif()
-cc_library(flags SRCS flags.cc DEPS gflags)
+cc_library(flags SRCS flags.cc DEPS gflags boost)
 cc_library(denormal SRCS denormal.cc DEPS)
 cc_library(errors SRCS errors.cc DEPS error_codes_proto)
 cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
-set(enforce_deps flags errors boost)
+set(enforce_deps flags errors boost flags)
 if(WITH_GPU)
 set(enforce_deps ${enforce_deps} external_error_proto)
 endif()
......
@@ -31,7 +31,7 @@ limitations under the License. */
 #endif // _WIN32
 #include <algorithm>
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
 DECLARE_double(fraction_of_cpu_memory_to_use);
 DECLARE_uint64(initial_cpu_memory_in_mb);
@@ -42,7 +42,8 @@ DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
 // between host and device. Allocates too much would reduce the amount
 // of memory available to the system for paging. So, by default, we
 // should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+PADDLE_DEFINE_EXPORTED_bool(use_pinned_memory, true,
+    "If set, allocate cpu pinned memory.");
 namespace paddle {
 namespace platform {
@@ -54,7 +55,9 @@ size_t CpuTotalPhysicalMemory() {
 mib[1] = HW_MEMSIZE;
 int64_t size = 0;
 size_t len = sizeof(size);
-if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
+if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) {
+  return static_cast<size_t>(size);
+}
 return 0L;
 #elif defined(_WIN32)
 MEMORYSTATUSEX sMeminfo;
......
@@ -101,6 +101,7 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/type_defs.h"
 #endif
+#include "paddle/fluid/platform/flags.h"
 namespace paddle {
 namespace platform {
......
@@ -12,11 +12,26 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
 #endif
+namespace paddle {
+namespace platform {
+
+const ExportedFlagInfoMap &GetExportedFlagInfoMap() {
+  return *GetMutableExportedFlagInfoMap();
+}
+
+ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() {
+  static ExportedFlagInfoMap g_exported_flag_info_map;
+  return &g_exported_flag_info_map;
+}
+
+} // namespace platform
+} // namespace paddle
 /**
  * NOTE(paddle-dev): This file is designed to define all public FLAGS.
  */
@@ -30,7 +45,7 @@
  * instance to 2
  * Note:
  */
-DEFINE_int32(paddle_num_threads, 1,
+PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads, 1,
 "Number of threads for each paddle instance.");
 /**
@@ -41,7 +56,8 @@ DEFINE_int32(paddle_num_threads, 1,
  * Example:
  * Note: Used to debug. Checking whether operator produce NAN/INF or not.
  */
-DEFINE_bool(check_nan_inf, false,
+PADDLE_DEFINE_EXPORTED_bool(
+    check_nan_inf, false,
 "Checking whether operator produce NAN/INF or not. It will be "
 "extremely slow so please use this flag wisely.");
@@ -58,7 +74,7 @@ DEFINE_bool(check_nan_inf, false,
  * Example:
  * Note: whether to use Tensor Core, faster but it may loss precision.
  */
-DEFINE_bool(
+PADDLE_DEFINE_EXPORTED_bool(
 enable_cublas_tensor_op_math, false,
 "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
 "but it may loss precision. Currently, There are two CUDA libraries that"
@@ -77,7 +93,8 @@ DEFINE_bool(
  * cards
  * Note: A list of device ids separated by comma, like: 0,1,2,3
  */
-DEFINE_string(selected_gpus, "",
+PADDLE_DEFINE_EXPORTED_string(
+    selected_gpus, "",
 "A list of device ids separated by comma, like: 0,1,2,3. "
 "This option is useful when doing multi process training and "
 "each process have only one device (GPU). If you want to use "
@@ -88,19 +105,22 @@ DEFINE_string(selected_gpus, "",
 #endif
 #if defined(PADDLE_WITH_ASCEND_CL)
-DEFINE_string(selected_npus, "",
+PADDLE_DEFINE_EXPORTED_string(
+    selected_npus, "",
 "A list of device ids separated by comma, like: 0,1,2,3. "
 "This option is useful when doing multi process training and "
 "each process have only one device (NPU). If you want to use "
 "all visible devices, set this to empty string.");
-DEFINE_bool(hccl_check_nan, true,
+PADDLE_DEFINE_EXPORTED_bool(
+    hccl_check_nan, true,
 "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
 "core when meets Nan value");
-DEFINE_string(
+PADDLE_DEFINE_EXPORTED_string(
 npu_config_path, "",
 "The absolute path of configuration json file, like: /tmp/config.json. "
 "If proveided, it will be passed to aclInit().");
-DEFINE_int32(min_loss_scaling, 1, "set minmum loss scaling value!");
+PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling, 1,
+    "set minmum loss scaling value!");
 #endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -113,7 +133,8 @@ DEFINE_int32(min_loss_scaling, 1, "set minmum loss scaling value!");
  * Note: whether to use deterministic algorithm in cudnn.
  * If true, it will slow down some operators such as conv and pooling.
  */
-DEFINE_bool(cudnn_deterministic, false,
+PADDLE_DEFINE_EXPORTED_bool(
+    cudnn_deterministic, false,
 "Whether allow using an autotuning algorithm for convolution "
 "operator. The autotuning algorithm may be non-deterministic. If "
 "true, the algorithm is deterministic.");
@@ -130,7 +151,8 @@ DEFINE_bool(cudnn_deterministic, false,
  * increased.
  * Users need to balance memory and speed.
  */
-DEFINE_uint64(conv_workspace_size_limit,
+PADDLE_DEFINE_EXPORTED_uint64(
+    conv_workspace_size_limit,
 paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
 "cuDNN convolution workspace limit in MB unit.");
@@ -148,7 +170,8 @@ DEFINE_uint64(conv_workspace_size_limit,
  * layer specification. Once you change the layer specifications
  * (such as batch size, feature map size), it will search again.
  */
-DEFINE_bool(cudnn_exhaustive_search, false,
+PADDLE_DEFINE_EXPORTED_bool(
+    cudnn_exhaustive_search, false,
 "Whether enable exhaustive search for cuDNN convolution or "
 "not, default is False.");
@@ -160,7 +183,7 @@ DEFINE_bool(cudnn_exhaustive_search, false,
  * Example:
  * Note: only used to predict for advanced developer
  */
-DEFINE_int64(cudnn_exhaustive_search_times, -1,
+PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times, -1,
 "Exhaustive search times for cuDNN convolution, "
 "default is -1, not exhaustive search");
@@ -180,7 +203,8 @@ DEFINE_int64(cudnn_exhaustive_search_times, -1,
  * certain
  * input data range.
  */
-DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
+PADDLE_DEFINE_EXPORTED_bool(
+    cudnn_batchnorm_spatial_persistent, false,
 "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
 "batch_norm, default is False.");
 #endif
@@ -197,7 +221,7 @@ DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
  * https://github.com/PaddlePaddle/Paddle/issues/15049
  * If you want to change this default value, why?(gongwb)
  */
-DEFINE_bool(
+PADDLE_DEFINE_EXPORTED_bool(
 sync_nccl_allreduce, true,
 "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
 "after allreduce, this mode can get better performance in some scenarios.");
@@ -215,9 +239,10 @@ DEFINE_bool(
  * into the queue, and then the communicator takes the gradients out
  * of the queue and sends them after merging.
  */
-DEFINE_int32(communicator_max_merge_var_num, 20,
+PADDLE_DEFINE_EXPORTED_int32(communicator_max_merge_var_num, 20,
 "max var num to merge and send");
-DEFINE_bool(communicator_is_sgd_optimizer, true,
+PADDLE_DEFINE_EXPORTED_bool(
+    communicator_is_sgd_optimizer, true,
 "gradient sent to the server is the sum of the gradients "
 "calculated by each thread if optimizer is sgd");
 /**
@@ -233,7 +258,7 @@ DEFINE_bool(communicator_is_sgd_optimizer, true,
  * space. It is used to avoid training much faster than communication,
  * so that too many gradients are not sent out in time.
  */
-DEFINE_int32(communicator_send_queue_size, 20,
+PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size, 20,
 "queue size to recv gradient before send");
 #endif
@@ -246,7 +271,8 @@ DEFINE_int32(communicator_send_queue_size, 20,
  * Note: Control the number of threads used for distributed modules.
  * If it is not set, it is set to a hard thread.
  */
-DEFINE_int32(dist_threadpool_size, 0,
+PADDLE_DEFINE_EXPORTED_int32(
+    dist_threadpool_size, 0,
 "number of threads used for distributed executed.");
 /**
@@ -272,7 +298,7 @@ static const double kDefaultEagerDeleteTensorGB = -1;
 static const double kDefaultEagerDeleteTensorGB = 0;
 #endif
-DEFINE_double(
+PADDLE_DEFINE_EXPORTED_double(
 eager_delete_tensor_gb, kDefaultEagerDeleteTensorGB,
 "Memory size threshold (GB) when the garbage collector clear tensors."
 "Disabled when this value is less than 0");
@@ -289,7 +315,8 @@ DEFINE_double(
  * has finished, which will make the garbage collection strategy faster.
  * Only works when garbage collection strategy is enabled.
  */
-DEFINE_bool(fast_eager_deletion_mode, true,
+PADDLE_DEFINE_EXPORTED_bool(
+    fast_eager_deletion_mode, true,
 "Fast eager deletion mode. If enabled, memory would release "
 "immediately without waiting GPU kernel ends.");
@@ -311,7 +338,8 @@ DEFINE_bool(fast_eager_deletion_mode, true,
  * largest FLAGS_memory_fraction_of_eager_deletion ratio will be released.
  * The flag is only valid when running parallel data compilers.
  */
-DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
+PADDLE_DEFINE_EXPORTED_double(
+    memory_fraction_of_eager_deletion, 1.0,
 "Fraction of eager deletion. If less than 1.0, all variables in "
 "the program would be sorted according to its memory size, and "
 "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
@@ -331,7 +359,7 @@ static constexpr char kDefaultAllocatorStrategy[] = "naive_best_fit";
 #else
 static constexpr char kDefaultAllocatorStrategy[] = "auto_growth";
 #endif
-DEFINE_string(
+PADDLE_DEFINE_EXPORTED_string(
 allocator_strategy, kDefaultAllocatorStrategy,
 "The allocation strategy, enum in [naive_best_fit, auto_growth]. "
 "naive_best_fit means the original pre-allocated allocator of Paddle. "
@@ -358,7 +386,7 @@ DEFINE_string(
  * size as the memory block will be allocated from the CUDA pinned
  * request util the CPU does not have enough memory.
  */
-DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+PADDLE_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use, 1,
 "Default use 100% of CPU memory for PaddlePaddle,"
 "reserve the rest for page tables, etc");
@@ -374,7 +402,8 @@ DEFINE_double(fraction_of_cpu_memory_to_use, 1,
  * FLAGS_fraction_of_cpu_memory_to_use*(total physical memory)
  * as memory block sizes.
  */
-DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
+PADDLE_DEFINE_EXPORTED_uint64(
+    initial_cpu_memory_in_mb, 500ul,
 "Initial CPU memory for PaddlePaddle, in MD unit.");
 /**
@@ -390,7 +419,7 @@ DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
  * size as the memory block will be allocated from the CPU
  * request util the CPU does not have enough memory.
  */
-DEFINE_double(
+PADDLE_DEFINE_EXPORTED_double(
 fraction_of_cuda_pinned_memory_to_use, 0.5,
 "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
 "reserve the rest for page tables, etc");
@@ -425,7 +454,8 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
 // which may lead to insufficient memory left for paddle
 constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
 #endif
-DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
+PADDLE_DEFINE_EXPORTED_double(
+    fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
 "Allocate a trunk of gpu memory that is this fraction of the "
 "total gpu memory size. Future memory usage will be allocated "
 "from the trunk. If the trunk doesn't have enough gpu memory, "
@@ -444,7 +474,7 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
  * FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until
  * the GPU has no remaining memory.
  */
-DEFINE_uint64(
+PADDLE_DEFINE_EXPORTED_uint64(
 initial_gpu_memory_in_mb, 0ul,
 "Allocate a trunk of gpu memory whose byte size is specified by "
 "the flag. Future memory usage will be allocated from the "
@@ -466,12 +496,14 @@ DEFINE_uint64(
  * Note: If the allocated GPU memory blocks are exhausted,
  * additional GPU memory blocks are reallocated
  */
-DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
+PADDLE_DEFINE_EXPORTED_uint64(
+    reallocate_gpu_memory_in_mb, 0ul,
 "If this flag is set, Paddle will reallocate the gpu memory with "
 "size specified by this flag. Else Paddle will reallocate by "
 "FLAGS_fraction_of_gpu_memory_to_use");
-DEFINE_uint64(gpu_memory_limit_mb, 0UL,
+PADDLE_DEFINE_EXPORTED_uint64(
+    gpu_memory_limit_mb, 0UL,
 "The maximum gpu memory limit that the process can allocate. "
 "If it is equal to 0, there would be no limit and all gpu memory "
 "would be available to the process. If it is larger than 0, "
@@ -489,7 +521,8 @@ DEFINE_uint64(gpu_memory_limit_mb, 0UL,
  * Example:
  * Note:
 */
-DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes
+PADDLE_DEFINE_EXPORTED_double(
+    local_exe_sub_scope_limit, 256.0, // MBytes
 "The memory up limit of sub-scopes of local execution scope for "
 "each CUDAPlace. If you don't need to limit the memory, "
 "you should set FLAGS_local_exe_sub_scope_limit=-1. "
@@ -503,7 +536,7 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes
  * Example:
  * Note:
 */
-DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
+PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");
 /**
  * Debug related FLAG
@@ -525,7 +558,7 @@ static const int32_t kDefaultCallStackLevel = 2;
 static const int32_t kDefaultCallStackLevel = 1;
 #endif
-DEFINE_int32(
+PADDLE_DEFINE_EXPORTED_int32(
 call_stack_level, kDefaultCallStackLevel,
 "Determine the call stack to print when error or exeception happens."
 // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
@@ -545,7 +578,7 @@ DEFINE_int32(
  * Note: If True, gradients are summed by the reverse order of
  * the forward execution sequence.
  */
-DEFINE_bool(sort_sum_gradient, false,
+PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient, false,
 "Sum gradients by the reverse order of "
 "the forward execution sequence.");
@@ -557,7 +590,7 @@ DEFINE_bool(sort_sum_gradient, false,
  * Example:
  * Note: The maximum number of inplace grad_add.
  */
-DEFINE_int32(
+PADDLE_DEFINE_EXPORTED_int32(
 max_inplace_grad_add, 0,
 "The maximum number of inplace grad_add. When doing "
 "gradient accumulation, if the number of gradients need to that "
@@ -572,7 +605,7 @@ DEFINE_int32(
  * Example:
  * Note: Holds list of operation types with OneDNN kernels to be enabled.
  */
-DEFINE_string(tracer_mkldnn_ops_on, "",
+PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on, "",
 "List of OneDNN operation types to be turned on");
 /**
@@ -583,7 +616,8 @@ DEFINE_string(tracer_mkldnn_ops_on, "",
  * Example:
  * Note: Holds list of operation types with OneDNN kernels to be disabled.
  */
-DEFINE_string(tracer_mkldnn_ops_off, "",
+PADDLE_DEFINE_EXPORTED_string(
+    tracer_mkldnn_ops_off, "",
 "List of OneDNN operation types to be turned off");
 /**
@@ -595,7 +629,8 @@ DEFINE_string(tracer_mkldnn_ops_off, "",
  * Note: Check kernel launch status after every kernel compute.
  */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-DEFINE_bool(check_kernel_launch, false,
+PADDLE_DEFINE_EXPORTED_bool(
+    check_kernel_launch, false,
 "Check kernel launch status after every kernel compute");
 #endif
@@ -608,7 +643,8 @@ DEFINE_bool(check_kernel_launch, false,
  * Note: Disable cudnn in conv2d.
  */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d");
+PADDLE_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false,
+    "Disable cudnn in conv2d");
 #endif
 /**
@@ -621,7 +657,7 @@ DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d");
 */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \
 defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP)
-DEFINE_int32(get_host_by_name_time, 120,
+PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time, 120,
 "The maximum time for get host by name time");
 #endif
@@ -634,6 +670,6 @@ DEFINE_int32(get_host_by_name_time, 120,
  * program when using Fleet APIs.
  * Note: Apply IR pass to program. Be only useful when using Fleet APIs.
 */
-DEFINE_bool(
+PADDLE_DEFINE_EXPORTED_bool(
 apply_pass_to_program, false,
 "It controls whether to apply IR pass to program when using Fleet APIs");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <map>
#include <string>
#include <type_traits>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace platform {
struct FlagInfo {
using ValueType =
boost::variant<bool, int32_t, int64_t, uint64_t, double, std::string>;
std::string name;
mutable void *value_ptr;
ValueType default_value;
std::string doc;
bool is_writable;
};
using ExportedFlagInfoMap = std::map<std::string, FlagInfo>;
const ExportedFlagInfoMap &GetExportedFlagInfoMap();
ExportedFlagInfoMap *GetMutableExportedFlagInfoMap();
#define __PADDLE_DEFINE_EXPORTED_FLAG(__name, __is_writable, __cpp_type, \
__gflag_type, __default_value, __doc) \
DEFINE_##__gflag_type(__name, __default_value, __doc); \
struct __PaddleRegisterFlag_##__name { \
__PaddleRegisterFlag_##__name() { \
using FlagDeclaredType = \
typename std::remove_reference<decltype(FLAGS_##__name)>::type; \
static_assert(std::is_same<FlagDeclaredType, ::std::string>::value || \
std::is_arithmetic<FlagDeclaredType>::value, \
"FLAGS should be std::string or arithmetic type"); \
auto *instance = ::paddle::platform::GetMutableExportedFlagInfoMap(); \
auto &info = (*instance)[#__name]; \
info.name = #__name; \
info.value_ptr = &(FLAGS_##__name); \
info.default_value = static_cast<__cpp_type>(__default_value); \
info.doc = __doc; \
info.is_writable = __is_writable; \
} \
int Touch() const { return 0; } \
}; \
static __PaddleRegisterFlag_##__name __PaddleRegisterFlag_instance##__name; \
int TouchPaddleFlagRegister_##__name() { \
return __PaddleRegisterFlag_instance##__name.Touch(); \
} \
static_assert(std::is_same<__PaddleRegisterFlag_##__name, \
::__PaddleRegisterFlag_##__name>::value, \
"FLAGS should define in global namespace")
#define PADDLE_FORCE_LINK_FLAG(__name) \
extern int TouchPaddleFlagRegister_##__name(); \
UNUSED static int __paddle_use_flag_##__name = \
TouchPaddleFlagRegister_##__name()
#define PADDLE_DEFINE_EXPORTED_bool(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, true, bool, bool, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_READONLY_bool(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, false, bool, bool, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_int32(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, true, int32_t, int32, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_int64(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, true, int64_t, int64, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_uint64(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, true, uint64_t, uint64, default_value, \
doc)
#define PADDLE_DEFINE_EXPORTED_double(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, true, double, double, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_string(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, true, ::std::string, string, \
default_value, doc)
} // namespace platform
} // namespace paddle
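
The header above is the core of this change: __PADDLE_DEFINE_EXPORTED_FLAG expands to the usual DEFINE_<type> gflags definition plus a file-scope static struct whose constructor records the flag's name, value pointer, default value, doc string and writability in the process-wide ExportedFlagInfoMap before main() runs, and PADDLE_FORCE_LINK_FLAG simply references the generated TouchPaddleFlagRegister_* function so the linker cannot drop the defining translation unit. A rough standalone sketch of the same self-registration idiom follows; all DEMO_* names are hypothetical and there is no gflags or Paddle dependency, so this is an illustration of the pattern, not the actual implementation.

// Standalone sketch of the self-registration idiom behind
// __PADDLE_DEFINE_EXPORTED_FLAG; all DEMO_* names are hypothetical.
#include <iostream>
#include <map>
#include <string>

struct DemoFlagInfo {
  std::string name;
  void *value_ptr;
  std::string doc;
};

// Mirrors GetMutableExportedFlagInfoMap(): a function-local static map,
// safe to use from static initializers in other translation units.
static std::map<std::string, DemoFlagInfo> &DemoFlagMap() {
  static std::map<std::string, DemoFlagInfo> m;
  return m;
}

// Mirrors the generated __PaddleRegisterFlag_##__name struct: a file-scope
// static object whose constructor registers the flag before main() runs.
#define DEMO_DEFINE_EXPORTED_INT(name, default_value, doc)     \
  int FLAGS_##name = (default_value);                          \
  static struct DemoRegister_##name {                          \
    DemoRegister_##name() {                                    \
      DemoFlagMap()[#name] = {#name, &FLAGS_##name, doc};      \
    }                                                          \
  } demo_register_##name##_instance

DEMO_DEFINE_EXPORTED_INT(paddle_num_threads_demo, 1,
                         "Number of threads (demo flag only).");

int main() {
  // Any code that links this object file can now discover the flag by name.
  for (const auto &pair : DemoFlagMap()) {
    std::cout << pair.first << " = "
              << *static_cast<int *>(pair.second.value_ptr) << "  // "
              << pair.second.doc << "\n";
  }
  return 0;
}

This is also why the pybind layer further down no longer needs a hand-written list of FLAGS_* variables: it can walk the exported map instead.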
@@ -43,7 +43,8 @@ limitations under the License. */
 #endif
 DECLARE_int32(paddle_num_threads);
-DEFINE_int32(multiple_of_cupti_buffer_size, 1,
+PADDLE_DEFINE_EXPORTED_int32(
+    multiple_of_cupti_buffer_size, 1,
 "Multiple of the CUPTI device buffer size. If the timestamps have "
 "been dropped when you are profiling, try increasing this value.");
......
@@ -14,7 +14,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
-DEFINE_bool(benchmark, false,
+PADDLE_DEFINE_EXPORTED_bool(
+    benchmark, false,
 "Doing memory benchmark. It will make deleting scope synchronized, "
 "and add some memory usage logs."
 "Default cuda is asynchronous device, set to True will"
......
@@ -24,7 +24,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/nvtx.h"
 #endif
-DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
+PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false,
+    "Enable rpc profiler or not.");
 namespace paddle {
 namespace platform {
......
@@ -18,7 +18,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/xpu/xpu_header.h"
 #include "paddle/fluid/string/split.h"
-DEFINE_string(selected_xpus, "",
+PADDLE_DEFINE_EXPORTED_string(
+    selected_xpus, "",
 "A list of device ids separated by comma, like: 0,1,2,3. "
 "This option is useful when doing multi process training and "
 "each process have only one device (XPU). If you want to use "
......
@@ -29,82 +29,38 @@
 #include "paddle/fluid/platform/macros.h"
 #include "pybind11/stl.h"
-// data processing
-DECLARE_bool(use_mkldnn);
-DECLARE_string(tracer_mkldnn_ops_on);
-DECLARE_string(tracer_mkldnn_ops_off);
+// FIXME(zengjinle): these 2 flags may be removed by the linker when compiling
+// CPU-only Paddle. It is because they are only used in
+// AutoGrowthBestFitAllocator, but AutoGrowthBestFitAllocator is not used
+// (in the translation unit level) when compiling CPU-only Paddle. I do not
+// want to add PADDLE_FORCE_LINK_FLAG, but I have not found any other methods
+// to solve this problem.
+PADDLE_FORCE_LINK_FLAG(free_idle_chunk);
+PADDLE_FORCE_LINK_FLAG(free_when_no_cache_hit);
 // debug
-DECLARE_bool(check_nan_inf);
 DECLARE_bool(cpu_deterministic);
-DECLARE_bool(enable_rpc_profiler);
-DECLARE_int32(multiple_of_cupti_buffer_size);
-DECLARE_bool(reader_queue_speed_test_mode);
-DECLARE_int32(call_stack_level);
-DECLARE_bool(sort_sum_gradient);
-DECLARE_bool(check_kernel_launch);
-// device management
-DECLARE_int32(paddle_num_threads);
+// IR
+DECLARE_bool(convert_all_blocks);
 // executor
 DECLARE_bool(enable_parallel_graph);
 DECLARE_string(pe_profile_fname);
 DECLARE_string(print_sub_graph_dir);
-DECLARE_bool(use_ngraph);
+DECLARE_bool(new_executor_use_inplace);
 // memory management
-DECLARE_string(allocator_strategy);
-DECLARE_double(eager_delete_tensor_gb);
-DECLARE_double(fraction_of_cpu_memory_to_use);
-DECLARE_bool(free_idle_chunk);
-DECLARE_bool(free_when_no_cache_hit);
+DECLARE_bool(eager_delete_scope);
 DECLARE_int32(fuse_parameter_groups_size);
 DECLARE_double(fuse_parameter_memory_size);
-DECLARE_bool(init_allocated_mem);
-DECLARE_uint64(initial_cpu_memory_in_mb);
-DECLARE_double(memory_fraction_of_eager_deletion);
-DECLARE_bool(use_pinned_memory);
-DECLARE_bool(use_system_allocator);
 // others
-DECLARE_bool(benchmark);
 DECLARE_int32(inner_op_parallelism);
-DECLARE_int32(max_inplace_grad_add);
-DECLARE_string(tracer_profile_fname);
-DECLARE_bool(apply_pass_to_program);
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-// cudnn
-DECLARE_uint64(conv_workspace_size_limit);
-DECLARE_bool(cudnn_batchnorm_spatial_persistent);
-DECLARE_bool(cudnn_deterministic);
-DECLARE_bool(cudnn_exhaustive_search);
-DECLARE_bool(conv2d_disable_cudnn);
-// data processing
-DECLARE_bool(enable_cublas_tensor_op_math);
-// device management
-DECLARE_string(selected_gpus);
-// memory management
-DECLARE_bool(eager_delete_scope);
-DECLARE_bool(fast_eager_deletion_mode);
-DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_uint64(gpu_memory_limit_mb);
-DECLARE_uint64(initial_gpu_memory_in_mb);
-DECLARE_uint64(reallocate_gpu_memory_in_mb);
-// others
-DECLARE_bool(sync_nccl_allreduce);
-#endif
-#ifdef PADDLE_WITH_XPU
-// device management
-DECLARE_string(selected_xpus);
-#endif
-#ifdef PADDLE_WITH_ASCEND_CL
-// device management
-DECLARE_string(selected_npus);
-// set minmum loss scaling value
-DECLARE_int32(min_loss_scaling);
-#endif
-// NOTE: where are these 2 flags from?
+DECLARE_bool(enable_unused_var_check);
 #ifdef PADDLE_WITH_DISTRIBUTE
-DECLARE_int32(rpc_send_thread_num);
 DECLARE_int32(rpc_get_thread_num);
 DECLARE_int32(rpc_prefetch_thread_num);
 #endif
@@ -181,7 +137,6 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry {
 PADDLE_ENFORCE_NOT_NULL(setter,
 platform::errors::InvalidArgument(
 "Setter of %s should not be null", name));
 var_infos_.insert({name, VarInfo(is_public, getter, setter)});
 }
@@ -243,81 +198,6 @@
 GlobalVarGetterSetterRegistry GlobalVarGetterSetterRegistry::instance_;
-class GlobalVarGetterSetterRegistryHelper {
- public:
-  GlobalVarGetterSetterRegistryHelper(bool is_public, bool is_writable,
-                                      const std::string &var_names)
-      : is_public_(is_public),
-        is_writable_(is_writable),
-        var_names_(SplitVarNames(var_names)) {}
-
-  template <typename... Args>
-  void Register(Args &&... args) const {
-    Impl<0, sizeof...(args) == 1, Args...>::Register(
-        is_public_, is_writable_, var_names_, std::forward<Args>(args)...);
-  }
-
- private:
-  static std::vector<std::string> SplitVarNames(const std::string &names) {
-    auto valid_char = [](char ch) { return !std::isspace(ch) && ch != ','; };
-    std::vector<std::string> ret;
-    size_t i = 0, j = 0, n = names.size();
-    while (i < n) {
-      for (; i < n && !valid_char(names[i]); ++i) {
-      }
-      for (j = i + 1; j < n && valid_char(names[j]); ++j) {
-      }
-      if (i < n && j <= n) {
-        auto substring = names.substr(i, j - i);
-        VLOG(10) << "Get substring: \"" << substring << "\"";
-        ret.emplace_back(substring);
-      }
-      i = j + 1;
-    }
-    return ret;
-  }
-
- private:
-  template <size_t kIdx, bool kIsStop, typename T, typename... Args>
-  struct Impl {
-    static void Register(bool is_public, bool is_writable,
-                         const std::vector<std::string> &var_names, T &&var,
-                         Args &&... args) {
-      PADDLE_ENFORCE_EQ(kIdx + 1 + sizeof...(args), var_names.size(),
-                        platform::errors::InvalidArgument(
-                            "Argument number not match name number"));
-      Impl<kIdx, true, T>::Register(is_public, is_writable, var_names, var);
-      Impl<kIdx + 1, sizeof...(Args) == 1, Args...>::Register(
-          is_public, is_writable, var_names, std::forward<Args>(args)...);
-    }
-  };
-
-  template <size_t kIdx, typename T>
-  struct Impl<kIdx, true, T> {
-    static void Register(bool is_public, bool is_writable,
-                         const std::vector<std::string> &var_names, T &&var) {
-      auto *instance = GlobalVarGetterSetterRegistry::MutableInstance();
-      if (is_writable) {
-        instance->Register(
-            var_names[kIdx], is_public,
-            GlobalVarGetterSetterRegistry::CreateGetter(std::forward<T>(var)),
-            GlobalVarGetterSetterRegistry::CreateSetter(&var));
-      } else {
-        instance->Register(
-            var_names[kIdx], is_public,
-            GlobalVarGetterSetterRegistry::CreateGetter(std::forward<T>(var)));
-      }
-    }
-  };
-
- private:
-  const bool is_public_;
-  const bool is_writable_;
-  const std::vector<std::string> var_names_;
-};
 static void RegisterGlobalVarGetterSetter();
 void BindGlobalValueGetterSetter(pybind11::module *module) {
@@ -338,65 +218,69 @@ void BindGlobalValueGetterSetter(pybind11::module *module) {
 }
 /* Public vars are designed to be writable. */
-#define REGISTER_PUBLIC_GLOBAL_VAR(...) \
+#define REGISTER_PUBLIC_GLOBAL_VAR(var) \
 do { \
-GlobalVarGetterSetterRegistryHelper(/*is_public=*/true, \
-/*is_writable=*/true, "" #__VA_ARGS__) \
-.Register(__VA_ARGS__); \
+auto *instance = GlobalVarGetterSetterRegistry::MutableInstance(); \
+instance->Register(#var, /*is_public=*/true, \
+GlobalVarGetterSetterRegistry::CreateGetter(var), \
+GlobalVarGetterSetterRegistry::CreateSetter(&var)); \
 } while (0)
-#define REGISTER_PRIVATE_GLOBAL_VAR(is_writable, ...) \
-do { \
-GlobalVarGetterSetterRegistryHelper(/*is_public=*/false, is_writable, \
-"" #__VA_ARGS__) \
-.Register(__VA_ARGS__); \
-} while (0)
-static void RegisterGlobalVarGetterSetter() {
-REGISTER_PRIVATE_GLOBAL_VAR(/*is_writable=*/false, FLAGS_free_idle_chunk,
-FLAGS_free_when_no_cache_hit);
-REGISTER_PUBLIC_GLOBAL_VAR(
-FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
-FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
-FLAGS_call_stack_level, FLAGS_sort_sum_gradient, FLAGS_cpu_deterministic,
-FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
-FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
-FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
-FLAGS_fuse_parameter_groups_size, FLAGS_fuse_parameter_memory_size,
-FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
-FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
-FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
-FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add,
-FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off,
-FLAGS_apply_pass_to_program);
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-REGISTER_PUBLIC_GLOBAL_VAR(
-FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic,
-FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent,
-FLAGS_cudnn_exhaustive_search, FLAGS_eager_delete_scope,
-FLAGS_fast_eager_deletion_mode,
-FLAGS_fraction_of_cuda_pinned_memory_to_use,
-FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb,
-FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math,
-FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce,
-FLAGS_conv2d_disable_cudnn, FLAGS_check_kernel_launch);
-#endif
-#ifdef PADDLE_WITH_XPU
-REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus);
-#endif
-#ifdef PADDLE_WITH_ASCEND_CL
-REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus);
-REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_min_loss_scaling);
-#endif
+struct RegisterGetterSetterVisitor : public boost::static_visitor<void> {
+RegisterGetterSetterVisitor(const std::string &name, bool is_writable,
+void *value_ptr)
+: name_(name), is_writable_(is_writable), value_ptr_(value_ptr) {}
+template <typename T>
+void operator()(const T &) const {
+auto &value = *static_cast<T *>(value_ptr_);
+auto *instance = GlobalVarGetterSetterRegistry::MutableInstance();
+bool is_public = is_writable_; // currently, all writable vars are public
+if (is_writable_) {
+instance->Register(name_, is_public,
+GlobalVarGetterSetterRegistry::CreateGetter(value),
+GlobalVarGetterSetterRegistry::CreateSetter(&value));
+} else {
+instance->Register(name_, is_public,
+GlobalVarGetterSetterRegistry::CreateGetter(value));
+}
+}
+private:
+std::string name_;
+bool is_writable_;
+void *value_ptr_;
+};
+static void RegisterGlobalVarGetterSetter() {
+REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_cpu_deterministic);
+REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_convert_all_blocks);
+REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_enable_parallel_graph);
+REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_pe_profile_fname);
+REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_print_sub_graph_dir);
+REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_new_executor_use_inplace);
+REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_eager_delete_scope);
+REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_fuse_parameter_groups_size);
+REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_fuse_parameter_memory_size);
+REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_inner_op_parallelism);
+REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_enable_unused_var_check);
 #ifdef PADDLE_WITH_DITRIBUTE
-REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num,
-FLAGS_rpc_get_thread_num,
-FLAGS_rpc_prefetch_thread_num);
+REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_get_thread_num);
+REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_prefetch_thread_num);
 #endif
+const auto &flag_map = platform::GetExportedFlagInfoMap();
+for (const auto &pair : flag_map) {
+  const std::string &name = pair.second.name;
+  bool is_writable = pair.second.is_writable;
+  void *value_ptr = pair.second.value_ptr;
+  const auto &default_value = pair.second.default_value;
+  RegisterGetterSetterVisitor visitor("FLAGS_" + name, is_writable,
+                                      value_ptr);
+  boost::apply_visitor(visitor, default_value);
+}
 }
 } // namespace pybind
 } // namespace paddle
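
With the exported-flag map in place, RegisterGlobalVarGetterSetter above no longer enumerates flags by hand: it walks platform::GetExportedFlagInfoMap() and applies RegisterGetterSetterVisitor (a boost::static_visitor) to each entry's default_value, so the visitor's template parameter recovers the concrete type hidden behind the type-erased value_ptr. The following minimal standalone sketch shows the same dispatch pattern using std::variant/std::visit (C++17) instead of boost::variant, with two hypothetical flags, purely for illustration.

// Sketch of variant-driven type recovery, analogous to RegisterGetterSetterVisitor.
// Flag names and types below are hypothetical.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <type_traits>
#include <variant>

using DefaultValue =
    std::variant<bool, int32_t, int64_t, uint64_t, double, std::string>;

struct DemoExportedFlag {
  void *value_ptr;             // address of the real flag variable
  DefaultValue default_value;  // carries the flag's static type
};

int main() {
  static bool demo_bool_flag = true;
  static double demo_double_flag = 0.5;
  std::map<std::string, DemoExportedFlag> flag_map = {
      {"demo_bool_flag", {&demo_bool_flag, false}},
      {"demo_double_flag", {&demo_double_flag, 0.0}},
  };
  for (const auto &pair : flag_map) {
    // The visitor's parameter type T tells us how to cast value_ptr,
    // which is the trick the pybind registration code relies on.
    std::visit(
        [&](const auto &default_val) {
          using T = std::decay_t<decltype(default_val)>;
          const T &value = *static_cast<const T *>(pair.second.value_ptr);
          std::cout << pair.first << " = " << value << "\n";
        },
        pair.second.default_value);
  }
  return 0;
}

The point is that default_value carries the static type, so one generic visitor can hand every exported flag to the getter/setter registry with the correct cast.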
@@ -32,7 +32,8 @@
 #include "paddle/fluid/platform/place.h"
 #include "pybind11/stl.h"
-DEFINE_bool(reader_queue_speed_test_mode, false,
+PADDLE_DEFINE_EXPORTED_bool(
+    reader_queue_speed_test_mode, false,
 "If set true, the queue.pop will only get data from queue but not "
 "remove the data from queue for speed testing");
......