Unverified commit 2c781455, authored by Zeng Jinle, committed by GitHub

Make flag adding easier (#35823)

* make flag setter easier

* update

* rename macro name

* fix bug of public/writable

* update to pass CI

* polish

* fix CPU link error
Parent 71e01d3f
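
The core of the change: flags that previously had to be defined with a bare gflags macro and then wired into pybind by hand are now defined through a PADDLE_DEFINE_EXPORTED_* wrapper that also registers them in an exported-flag map. A minimal before/after, taken from the use_mkldnn change in this diff:

// Before: plain gflags definition; exposing it to Python required a hand-written
// REGISTER_PUBLIC_GLOBAL_VAR entry in the pybind getter/setter registry.
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");

// After: one macro both defines the flag and records it in the exported flag map,
// so the Python-side getter/setter is generated automatically from that map.
PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");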
......@@ -11,7 +11,7 @@ cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal)
cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
cc_library(imperative_profiler SRCS profiler.cc)
cc_library(imperative_profiler SRCS profiler.cc DEPS flags)
if(NOT WIN32)
if(WITH_NCCL OR WITH_RCCL)
cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows tensor)
......
......@@ -13,9 +13,9 @@
// limitations under the License.
#include "paddle/fluid/imperative/flags.h"
#include "gflags/gflags.h"
#include "paddle/fluid/platform/flags.h"
DEFINE_uint64(dygraph_debug, 0,
PADDLE_DEFINE_EXPORTED_uint64(dygraph_debug, 0,
"Debug level of dygraph. This flag is not "
"open to users");
......
......@@ -19,9 +19,9 @@
#endif
#include <glog/logging.h>
#include <mutex> // NOLINT
#include "gflags/gflags.h"
#include "paddle/fluid/platform/flags.h"
DEFINE_string(
PADDLE_DEFINE_EXPORTED_string(
tracer_profile_fname, "xxgperf",
"Profiler filename for imperative tracer, which generated by gperftools."
"Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
......
......@@ -99,7 +99,7 @@ cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc
cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator)
cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator flags)
cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator)
......
......@@ -37,12 +37,13 @@
#endif
#include "paddle/fluid/platform/npu_info.h"
DEFINE_int64(
PADDLE_DEFINE_EXPORTED_int64(
gpu_allocator_retry_time, 10000,
"The retry time (milliseconds) when allocator fails "
"to allocate memory. No retry if this value is not greater than 0");
DEFINE_bool(use_system_allocator, false,
PADDLE_DEFINE_EXPORTED_bool(
use_system_allocator, false,
"Whether to use system allocator to allocate CPU and GPU memory. "
"Only used for unittests.");
......
......@@ -17,14 +17,17 @@
#include <algorithm>
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/platform/flags.h"
DEFINE_bool(free_idle_chunk, false,
PADDLE_DEFINE_EXPORTED_READONLY_bool(
free_idle_chunk, false,
"Whether to free idle chunk when each allocation is freed. "
"If false, all freed allocation would be cached to speed up next "
"allocation request. If true, no allocation would be cached. This "
"flag only works when FLAGS_allocator_strategy=auto_growth.");
DEFINE_bool(free_when_no_cache_hit, false,
PADDLE_DEFINE_EXPORTED_READONLY_bool(
free_when_no_cache_hit, false,
"Whether to free idle chunks when no cache hit. If true, idle "
"chunk would be freed when no cache hit; if false, idle "
"chunk would be freed when out of memory occurs. This flag "
......
......@@ -34,7 +34,8 @@
#include "paddle/fluid/platform/xpu/xpu_header.h"
#endif
DEFINE_bool(init_allocated_mem, false,
PADDLE_DEFINE_EXPORTED_bool(
init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by "
"BuddyAllocator are always zeroed in some op's implementation. "
"To find this error in time, we use init_allocated_mem to indicate "
......
......@@ -15,7 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h"
#include "paddle/fluid/framework/op_registry.h"
DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
PADDLE_DEFINE_EXPORTED_int32(rpc_send_thread_num, 12,
"number of threads for rpc send");
namespace paddle {
namespace operators {
......
......@@ -37,13 +37,13 @@ if (WITH_PYTHON)
endif(NOT WIN32)
endif()
cc_library(flags SRCS flags.cc DEPS gflags)
cc_library(flags SRCS flags.cc DEPS gflags boost)
cc_library(denormal SRCS denormal.cc DEPS)
cc_library(errors SRCS errors.cc DEPS error_codes_proto)
cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
set(enforce_deps flags errors boost)
set(enforce_deps flags errors boost flags)
if(WITH_GPU)
set(enforce_deps ${enforce_deps} external_error_proto)
endif()
......
......@@ -31,7 +31,7 @@ limitations under the License. */
#endif // _WIN32
#include <algorithm>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/flags.h"
DECLARE_double(fraction_of_cpu_memory_to_use);
DECLARE_uint64(initial_cpu_memory_in_mb);
......@@ -42,7 +42,8 @@ DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
// between host and device. Allocates too much would reduce the amount
// of memory available to the system for paging. So, by default, we
// should set false to use_pinned_memory.
DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
PADDLE_DEFINE_EXPORTED_bool(use_pinned_memory, true,
"If set, allocate cpu pinned memory.");
namespace paddle {
namespace platform {
......@@ -54,7 +55,9 @@ size_t CpuTotalPhysicalMemory() {
mib[1] = HW_MEMSIZE;
int64_t size = 0;
size_t len = sizeof(size);
if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) {
return static_cast<size_t>(size);
}
return 0L;
#elif defined(_WIN32)
MEMORYSTATUSEX sMeminfo;
......
......@@ -101,6 +101,7 @@ limitations under the License. */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/type_defs.h"
#endif
#include "paddle/fluid/platform/flags.h"
namespace paddle {
namespace platform {
......
......@@ -12,11 +12,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gflags/gflags.h"
#include "paddle/fluid/platform/flags.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#endif
namespace paddle {
namespace platform {
const ExportedFlagInfoMap &GetExportedFlagInfoMap() {
return *GetMutableExportedFlagInfoMap();
}
ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() {
static ExportedFlagInfoMap g_exported_flag_info_map;
return &g_exported_flag_info_map;
}
} // namespace platform
} // namespace paddle
/**
* NOTE(paddle-dev): This file is designed to define all public FLAGS.
*/
......@@ -30,7 +45,7 @@
* instance to 2
* Note:
*/
DEFINE_int32(paddle_num_threads, 1,
PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads, 1,
"Number of threads for each paddle instance.");
/**
......@@ -41,7 +56,8 @@ DEFINE_int32(paddle_num_threads, 1,
* Example:
* Note: Used to debug. Checking whether operator produce NAN/INF or not.
*/
DEFINE_bool(check_nan_inf, false,
PADDLE_DEFINE_EXPORTED_bool(
check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely.");
......@@ -58,7 +74,7 @@ DEFINE_bool(check_nan_inf, false,
* Example:
* Note: whether to use Tensor Core, faster but it may loss precision.
*/
DEFINE_bool(
PADDLE_DEFINE_EXPORTED_bool(
enable_cublas_tensor_op_math, false,
"The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
"but it may loss precision. Currently, There are two CUDA libraries that"
......@@ -77,7 +93,8 @@ DEFINE_bool(
* cards
* Note: A list of device ids separated by comma, like: 0,1,2,3
*/
DEFINE_string(selected_gpus, "",
PADDLE_DEFINE_EXPORTED_string(
selected_gpus, "",
"A list of device ids separated by comma, like: 0,1,2,3. "
"This option is useful when doing multi process training and "
"each process have only one device (GPU). If you want to use "
......@@ -88,19 +105,22 @@ DEFINE_string(selected_gpus, "",
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
DEFINE_string(selected_npus, "",
PADDLE_DEFINE_EXPORTED_string(
selected_npus, "",
"A list of device ids separated by comma, like: 0,1,2,3. "
"This option is useful when doing multi process training and "
"each process have only one device (NPU). If you want to use "
"all visible devices, set this to empty string.");
DEFINE_bool(hccl_check_nan, true,
PADDLE_DEFINE_EXPORTED_bool(
hccl_check_nan, true,
"Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
"core when meets Nan value");
DEFINE_string(
PADDLE_DEFINE_EXPORTED_string(
npu_config_path, "",
"The absolute path of configuration json file, like: /tmp/config.json. "
"If proveided, it will be passed to aclInit().");
DEFINE_int32(min_loss_scaling, 1, "set minmum loss scaling value!");
PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling, 1,
"set minmum loss scaling value!");
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......@@ -113,7 +133,8 @@ DEFINE_int32(min_loss_scaling, 1, "set minmum loss scaling value!");
* Note: whether to use deterministic algorithm in cudnn.
* If true, it will slow down some operators such as conv and pooling.
*/
DEFINE_bool(cudnn_deterministic, false,
PADDLE_DEFINE_EXPORTED_bool(
cudnn_deterministic, false,
"Whether allow using an autotuning algorithm for convolution "
"operator. The autotuning algorithm may be non-deterministic. If "
"true, the algorithm is deterministic.");
......@@ -130,7 +151,8 @@ DEFINE_bool(cudnn_deterministic, false,
* increased.
* Users need to balance memory and speed.
*/
DEFINE_uint64(conv_workspace_size_limit,
PADDLE_DEFINE_EXPORTED_uint64(
conv_workspace_size_limit,
paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
"cuDNN convolution workspace limit in MB unit.");
......@@ -148,7 +170,8 @@ DEFINE_uint64(conv_workspace_size_limit,
* layer specification. Once you change the layer specifications
* (such as batch size, feature map size), it will search again.
*/
DEFINE_bool(cudnn_exhaustive_search, false,
PADDLE_DEFINE_EXPORTED_bool(
cudnn_exhaustive_search, false,
"Whether enable exhaustive search for cuDNN convolution or "
"not, default is False.");
......@@ -160,7 +183,7 @@ DEFINE_bool(cudnn_exhaustive_search, false,
* Example:
* Note: only used to predict for advanced developer
*/
DEFINE_int64(cudnn_exhaustive_search_times, -1,
PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times, -1,
"Exhaustive search times for cuDNN convolution, "
"default is -1, not exhaustive search");
......@@ -180,7 +203,8 @@ DEFINE_int64(cudnn_exhaustive_search_times, -1,
* certain
* input data range.
*/
DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
PADDLE_DEFINE_EXPORTED_bool(
cudnn_batchnorm_spatial_persistent, false,
"Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
"batch_norm, default is False.");
#endif
......@@ -197,7 +221,7 @@ DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
* https://github.com/PaddlePaddle/Paddle/issues/15049
* If you want to change this default value, why?(gongwb)
*/
DEFINE_bool(
PADDLE_DEFINE_EXPORTED_bool(
sync_nccl_allreduce, true,
"If set true, will call `cudaStreamSynchronize(nccl_stream)`"
"after allreduce, this mode can get better performance in some scenarios.");
......@@ -215,9 +239,10 @@ DEFINE_bool(
* into the queue, and then the communicator takes the gradients out
* of the queue and sends them after merging.
*/
DEFINE_int32(communicator_max_merge_var_num, 20,
PADDLE_DEFINE_EXPORTED_int32(communicator_max_merge_var_num, 20,
"max var num to merge and send");
DEFINE_bool(communicator_is_sgd_optimizer, true,
PADDLE_DEFINE_EXPORTED_bool(
communicator_is_sgd_optimizer, true,
"gradient sent to the server is the sum of the gradients "
"calculated by each thread if optimizer is sgd");
/**
......@@ -233,7 +258,7 @@ DEFINE_bool(communicator_is_sgd_optimizer, true,
* space. It is used to avoid training much faster than communication,
* so that too many gradients are not sent out in time.
*/
DEFINE_int32(communicator_send_queue_size, 20,
PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size, 20,
"queue size to recv gradient before send");
#endif
......@@ -246,7 +271,8 @@ DEFINE_int32(communicator_send_queue_size, 20,
* Note: Control the number of threads used for distributed modules.
* If it is not set, it is set to a hard thread.
*/
DEFINE_int32(dist_threadpool_size, 0,
PADDLE_DEFINE_EXPORTED_int32(
dist_threadpool_size, 0,
"number of threads used for distributed executed.");
/**
......@@ -272,7 +298,7 @@ static const double kDefaultEagerDeleteTensorGB = -1;
static const double kDefaultEagerDeleteTensorGB = 0;
#endif
DEFINE_double(
PADDLE_DEFINE_EXPORTED_double(
eager_delete_tensor_gb, kDefaultEagerDeleteTensorGB,
"Memory size threshold (GB) when the garbage collector clear tensors."
"Disabled when this value is less than 0");
......@@ -289,7 +315,8 @@ DEFINE_double(
* has finished, which will make the garbage collection strategy faster.
* Only works when garbage collection strategy is enabled.
*/
DEFINE_bool(fast_eager_deletion_mode, true,
PADDLE_DEFINE_EXPORTED_bool(
fast_eager_deletion_mode, true,
"Fast eager deletion mode. If enabled, memory would release "
"immediately without waiting GPU kernel ends.");
......@@ -311,7 +338,8 @@ DEFINE_bool(fast_eager_deletion_mode, true,
* largest FLAGS_memory_fraction_of_eager_deletion ratio will be released.
* The flag is only valid when running parallel data compilers.
*/
DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
PADDLE_DEFINE_EXPORTED_double(
memory_fraction_of_eager_deletion, 1.0,
"Fraction of eager deletion. If less than 1.0, all variables in "
"the program would be sorted according to its memory size, and "
"only the FLAGS_memory_fraction_of_eager_deletion of the largest "
......@@ -331,7 +359,7 @@ static constexpr char kDefaultAllocatorStrategy[] = "naive_best_fit";
#else
static constexpr char kDefaultAllocatorStrategy[] = "auto_growth";
#endif
DEFINE_string(
PADDLE_DEFINE_EXPORTED_string(
allocator_strategy, kDefaultAllocatorStrategy,
"The allocation strategy, enum in [naive_best_fit, auto_growth]. "
"naive_best_fit means the original pre-allocated allocator of Paddle. "
......@@ -358,7 +386,7 @@ DEFINE_string(
* size as the memory block will be allocated from the CUDA pinned
* request util the CPU does not have enough memory.
*/
DEFINE_double(fraction_of_cpu_memory_to_use, 1,
PADDLE_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use, 1,
"Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
......@@ -374,7 +402,8 @@ DEFINE_double(fraction_of_cpu_memory_to_use, 1,
* FLAGS_fraction_of_cpu_memory_to_use*(total physical memory)
* as memory block sizes.
*/
DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
PADDLE_DEFINE_EXPORTED_uint64(
initial_cpu_memory_in_mb, 500ul,
"Initial CPU memory for PaddlePaddle, in MD unit.");
/**
......@@ -390,7 +419,7 @@ DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
* size as the memory block will be allocated from the CPU
* request util the CPU does not have enough memory.
*/
DEFINE_double(
PADDLE_DEFINE_EXPORTED_double(
fraction_of_cuda_pinned_memory_to_use, 0.5,
"Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
......@@ -425,7 +454,8 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
// which may lead to insufficient memory left for paddle
constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
#endif
DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
PADDLE_DEFINE_EXPORTED_double(
fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
"Allocate a trunk of gpu memory that is this fraction of the "
"total gpu memory size. Future memory usage will be allocated "
"from the trunk. If the trunk doesn't have enough gpu memory, "
......@@ -444,7 +474,7 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
* FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until
* the GPU has no remaining memory.
*/
DEFINE_uint64(
PADDLE_DEFINE_EXPORTED_uint64(
initial_gpu_memory_in_mb, 0ul,
"Allocate a trunk of gpu memory whose byte size is specified by "
"the flag. Future memory usage will be allocated from the "
......@@ -466,12 +496,14 @@ DEFINE_uint64(
* Note: If the allocated GPU memory blocks are exhausted,
* additional GPU memory blocks are reallocated
*/
DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
PADDLE_DEFINE_EXPORTED_uint64(
reallocate_gpu_memory_in_mb, 0ul,
"If this flag is set, Paddle will reallocate the gpu memory with "
"size specified by this flag. Else Paddle will reallocate by "
"FLAGS_fraction_of_gpu_memory_to_use");
DEFINE_uint64(gpu_memory_limit_mb, 0UL,
PADDLE_DEFINE_EXPORTED_uint64(
gpu_memory_limit_mb, 0UL,
"The maximum gpu memory limit that the process can allocate. "
"If it is equal to 0, there would be no limit and all gpu memory "
"would be available to the process. If it is larger than 0, "
......@@ -489,7 +521,8 @@ DEFINE_uint64(gpu_memory_limit_mb, 0UL,
* Example:
* Note:
*/
DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes
PADDLE_DEFINE_EXPORTED_double(
local_exe_sub_scope_limit, 256.0, // MBytes
"The memory up limit of sub-scopes of local execution scope for "
"each CUDAPlace. If you don't need to limit the memory, "
"you should set FLAGS_local_exe_sub_scope_limit=-1. "
......@@ -503,7 +536,7 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes
* Example:
* Note:
*/
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");
/**
* Debug related FLAG
......@@ -525,7 +558,7 @@ static const int32_t kDefaultCallStackLevel = 2;
static const int32_t kDefaultCallStackLevel = 1;
#endif
DEFINE_int32(
PADDLE_DEFINE_EXPORTED_int32(
call_stack_level, kDefaultCallStackLevel,
"Determine the call stack to print when error or exeception happens."
// TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
......@@ -545,7 +578,7 @@ DEFINE_int32(
* Note: If True, gradients are summed by the reverse order of
* the forward execution sequence.
*/
DEFINE_bool(sort_sum_gradient, false,
PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient, false,
"Sum gradients by the reverse order of "
"the forward execution sequence.");
......@@ -557,7 +590,7 @@ DEFINE_bool(sort_sum_gradient, false,
* Example:
* Note: The maximum number of inplace grad_add.
*/
DEFINE_int32(
PADDLE_DEFINE_EXPORTED_int32(
max_inplace_grad_add, 0,
"The maximum number of inplace grad_add. When doing "
"gradient accumulation, if the number of gradients need to that "
......@@ -572,7 +605,7 @@ DEFINE_int32(
* Example:
* Note: Holds list of operation types with OneDNN kernels to be enabled.
*/
DEFINE_string(tracer_mkldnn_ops_on, "",
PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on, "",
"List of OneDNN operation types to be turned on");
/**
......@@ -583,7 +616,8 @@ DEFINE_string(tracer_mkldnn_ops_on, "",
* Example:
* Note: Holds list of operation types with OneDNN kernels to be disabled.
*/
DEFINE_string(tracer_mkldnn_ops_off, "",
PADDLE_DEFINE_EXPORTED_string(
tracer_mkldnn_ops_off, "",
"List of OneDNN operation types to be turned off");
/**
......@@ -595,7 +629,8 @@ DEFINE_string(tracer_mkldnn_ops_off, "",
* Note: Check kernel launch status after every kernel compute.
*/
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DEFINE_bool(check_kernel_launch, false,
PADDLE_DEFINE_EXPORTED_bool(
check_kernel_launch, false,
"Check kernel launch status after every kernel compute");
#endif
......@@ -608,7 +643,8 @@ DEFINE_bool(check_kernel_launch, false,
* Note: Disable cudnn in conv2d.
*/
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d");
PADDLE_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false,
"Disable cudnn in conv2d");
#endif
/**
......@@ -621,7 +657,7 @@ DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d");
*/
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP)
DEFINE_int32(get_host_by_name_time, 120,
PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time, 120,
"The maximum time for get host by name time");
#endif
......@@ -634,6 +670,6 @@ DEFINE_int32(get_host_by_name_time, 120,
* program when using Fleet APIs.
* Note: Apply IR pass to program. Be only useful when using Fleet APIs.
*/
DEFINE_bool(
PADDLE_DEFINE_EXPORTED_bool(
apply_pass_to_program, false,
"It controls whether to apply IR pass to program when using Fleet APIs");
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <map>
#include <string>
#include <type_traits>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace platform {
struct FlagInfo {
using ValueType =
boost::variant<bool, int32_t, int64_t, uint64_t, double, std::string>;
std::string name;
mutable void *value_ptr;
ValueType default_value;
std::string doc;
bool is_writable;
};
using ExportedFlagInfoMap = std::map<std::string, FlagInfo>;
const ExportedFlagInfoMap &GetExportedFlagInfoMap();
ExportedFlagInfoMap *GetMutableExportedFlagInfoMap();
#define __PADDLE_DEFINE_EXPORTED_FLAG(__name, __is_writable, __cpp_type, \
__gflag_type, __default_value, __doc) \
DEFINE_##__gflag_type(__name, __default_value, __doc); \
struct __PaddleRegisterFlag_##__name { \
__PaddleRegisterFlag_##__name() { \
using FlagDeclaredType = \
typename std::remove_reference<decltype(FLAGS_##__name)>::type; \
static_assert(std::is_same<FlagDeclaredType, ::std::string>::value || \
std::is_arithmetic<FlagDeclaredType>::value, \
"FLAGS should be std::string or arithmetic type"); \
auto *instance = ::paddle::platform::GetMutableExportedFlagInfoMap(); \
auto &info = (*instance)[#__name]; \
info.name = #__name; \
info.value_ptr = &(FLAGS_##__name); \
info.default_value = static_cast<__cpp_type>(__default_value); \
info.doc = __doc; \
info.is_writable = __is_writable; \
} \
int Touch() const { return 0; } \
}; \
static __PaddleRegisterFlag_##__name __PaddleRegisterFlag_instance##__name; \
int TouchPaddleFlagRegister_##__name() { \
return __PaddleRegisterFlag_instance##__name.Touch(); \
} \
static_assert(std::is_same<__PaddleRegisterFlag_##__name, \
::__PaddleRegisterFlag_##__name>::value, \
"FLAGS should define in global namespace")
#define PADDLE_FORCE_LINK_FLAG(__name) \
extern int TouchPaddleFlagRegister_##__name(); \
UNUSED static int __paddle_use_flag_##__name = \
TouchPaddleFlagRegister_##__name()
#define PADDLE_DEFINE_EXPORTED_bool(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, true, bool, bool, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_READONLY_bool(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, false, bool, bool, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_int32(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, true, int32_t, int32, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_int64(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, true, int64_t, int64, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_uint64(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, true, uint64_t, uint64, default_value, \
doc)
#define PADDLE_DEFINE_EXPORTED_double(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, true, double, double, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_string(name, default_value, doc) \
__PADDLE_DEFINE_EXPORTED_FLAG(name, true, ::std::string, string, \
default_value, doc)
} // namespace platform
} // namespace paddle
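
As a rough sketch of what a single invocation does (simplified from the __PADDLE_DEFINE_EXPORTED_FLAG macro above; the static_assert and the TouchPaddleFlagRegister_* force-link hook are omitted), PADDLE_DEFINE_EXPORTED_bool(check_nan_inf, false, "...") amounts to the ordinary gflags definition plus a static registrar that fills in the exported map at program start:

// Illustration only -- approximate expansion of the macro for one bool flag.
DEFINE_bool(check_nan_inf, false, "...");  // the usual gflags flag, FLAGS_check_nan_inf

struct __PaddleRegisterFlag_check_nan_inf {
  __PaddleRegisterFlag_check_nan_inf() {
    auto *instance = ::paddle::platform::GetMutableExportedFlagInfoMap();
    auto &info = (*instance)["check_nan_inf"];
    info.name = "check_nan_inf";
    info.value_ptr = &FLAGS_check_nan_inf;   // pybind reads/writes through this pointer
    info.default_value = static_cast<bool>(false);
    info.doc = "...";
    info.is_writable = true;                 // the *_READONLY_* variants pass false here
  }
};
static __PaddleRegisterFlag_check_nan_inf flag_registrar;  // simplified name; ctor runs at startup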
......@@ -43,7 +43,8 @@ limitations under the License. */
#endif
DECLARE_int32(paddle_num_threads);
DEFINE_int32(multiple_of_cupti_buffer_size, 1,
PADDLE_DEFINE_EXPORTED_int32(
multiple_of_cupti_buffer_size, 1,
"Multiple of the CUPTI device buffer size. If the timestamps have "
"been dropped when you are profiling, try increasing this value.");
......
......@@ -14,7 +14,8 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
DEFINE_bool(benchmark, false,
PADDLE_DEFINE_EXPORTED_bool(
benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
......
......@@ -24,7 +24,8 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/nvtx.h"
#endif
DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false,
"Enable rpc profiler or not.");
namespace paddle {
namespace platform {
......
......@@ -18,7 +18,8 @@ limitations under the License. */
#include "paddle/fluid/platform/xpu/xpu_header.h"
#include "paddle/fluid/string/split.h"
DEFINE_string(selected_xpus, "",
PADDLE_DEFINE_EXPORTED_string(
selected_xpus, "",
"A list of device ids separated by comma, like: 0,1,2,3. "
"This option is useful when doing multi process training and "
"each process have only one device (XPU). If you want to use "
......
......@@ -29,82 +29,38 @@
#include "paddle/fluid/platform/macros.h"
#include "pybind11/stl.h"
// data processing
DECLARE_bool(use_mkldnn);
DECLARE_string(tracer_mkldnn_ops_on);
DECLARE_string(tracer_mkldnn_ops_off);
// FIXME(zengjinle): these 2 flags may be removed by the linker when compiling
// CPU-only Paddle. It is because they are only used in
// AutoGrowthBestFitAllocator, but AutoGrowthBestFitAllocator is not used
// (in the translation unit level) when compiling CPU-only Paddle. I do not
// want to add PADDLE_FORCE_LINK_FLAG, but I have not found any other methods
// to solve this problem.
PADDLE_FORCE_LINK_FLAG(free_idle_chunk);
PADDLE_FORCE_LINK_FLAG(free_when_no_cache_hit);
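
Why this works (a sketch based on the PADDLE_FORCE_LINK_FLAG definition in flags.h above): the macro expands to an extern declaration of the flag's Touch function plus an unused static int initialized by calling it, which creates a link-time dependency on the translation unit that defines the flag:

// Approximate expansion of PADDLE_FORCE_LINK_FLAG(free_idle_chunk):
extern int TouchPaddleFlagRegister_free_idle_chunk();
UNUSED static int __paddle_use_flag_free_idle_chunk =
    TouchPaddleFlagRegister_free_idle_chunk();
// The call forces the object file that defines FLAGS_free_idle_chunk to be kept
// by the linker, so the flag survives CPU-only builds in which
// AutoGrowthBestFitAllocator is never referenced.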
// debug
DECLARE_bool(check_nan_inf);
DECLARE_bool(cpu_deterministic);
DECLARE_bool(enable_rpc_profiler);
DECLARE_int32(multiple_of_cupti_buffer_size);
DECLARE_bool(reader_queue_speed_test_mode);
DECLARE_int32(call_stack_level);
DECLARE_bool(sort_sum_gradient);
DECLARE_bool(check_kernel_launch);
// device management
DECLARE_int32(paddle_num_threads);
// IR
DECLARE_bool(convert_all_blocks);
// executor
DECLARE_bool(enable_parallel_graph);
DECLARE_string(pe_profile_fname);
DECLARE_string(print_sub_graph_dir);
DECLARE_bool(use_ngraph);
DECLARE_bool(new_executor_use_inplace);
// memory management
DECLARE_string(allocator_strategy);
DECLARE_double(eager_delete_tensor_gb);
DECLARE_double(fraction_of_cpu_memory_to_use);
DECLARE_bool(free_idle_chunk);
DECLARE_bool(free_when_no_cache_hit);
DECLARE_bool(eager_delete_scope);
DECLARE_int32(fuse_parameter_groups_size);
DECLARE_double(fuse_parameter_memory_size);
DECLARE_bool(init_allocated_mem);
DECLARE_uint64(initial_cpu_memory_in_mb);
DECLARE_double(memory_fraction_of_eager_deletion);
DECLARE_bool(use_pinned_memory);
DECLARE_bool(use_system_allocator);
// others
DECLARE_bool(benchmark);
DECLARE_int32(inner_op_parallelism);
DECLARE_int32(max_inplace_grad_add);
DECLARE_string(tracer_profile_fname);
DECLARE_bool(apply_pass_to_program);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// cudnn
DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_batchnorm_spatial_persistent);
DECLARE_bool(cudnn_deterministic);
DECLARE_bool(cudnn_exhaustive_search);
DECLARE_bool(conv2d_disable_cudnn);
// data processing
DECLARE_bool(enable_cublas_tensor_op_math);
// device management
DECLARE_string(selected_gpus);
// memory management
DECLARE_bool(eager_delete_scope);
DECLARE_bool(fast_eager_deletion_mode);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(gpu_memory_limit_mb);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
// others
DECLARE_bool(sync_nccl_allreduce);
#endif
#ifdef PADDLE_WITH_XPU
// device management
DECLARE_string(selected_xpus);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// device management
DECLARE_string(selected_npus);
// set minmum loss scaling value
DECLARE_int32(min_loss_scaling);
#endif
DECLARE_bool(enable_unused_var_check);
// NOTE: where are these 2 flags from?
#ifdef PADDLE_WITH_DISTRIBUTE
DECLARE_int32(rpc_send_thread_num);
DECLARE_int32(rpc_get_thread_num);
DECLARE_int32(rpc_prefetch_thread_num);
#endif
......@@ -181,7 +137,6 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry {
PADDLE_ENFORCE_NOT_NULL(setter,
platform::errors::InvalidArgument(
"Setter of %s should not be null", name));
var_infos_.insert({name, VarInfo(is_public, getter, setter)});
}
......@@ -243,81 +198,6 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry {
GlobalVarGetterSetterRegistry GlobalVarGetterSetterRegistry::instance_;
class GlobalVarGetterSetterRegistryHelper {
public:
GlobalVarGetterSetterRegistryHelper(bool is_public, bool is_writable,
const std::string &var_names)
: is_public_(is_public),
is_writable_(is_writable),
var_names_(SplitVarNames(var_names)) {}
template <typename... Args>
void Register(Args &&... args) const {
Impl<0, sizeof...(args) == 1, Args...>::Register(
is_public_, is_writable_, var_names_, std::forward<Args>(args)...);
}
private:
static std::vector<std::string> SplitVarNames(const std::string &names) {
auto valid_char = [](char ch) { return !std::isspace(ch) && ch != ','; };
std::vector<std::string> ret;
size_t i = 0, j = 0, n = names.size();
while (i < n) {
for (; i < n && !valid_char(names[i]); ++i) {
}
for (j = i + 1; j < n && valid_char(names[j]); ++j) {
}
if (i < n && j <= n) {
auto substring = names.substr(i, j - i);
VLOG(10) << "Get substring: \"" << substring << "\"";
ret.emplace_back(substring);
}
i = j + 1;
}
return ret;
}
private:
template <size_t kIdx, bool kIsStop, typename T, typename... Args>
struct Impl {
static void Register(bool is_public, bool is_writable,
const std::vector<std::string> &var_names, T &&var,
Args &&... args) {
PADDLE_ENFORCE_EQ(kIdx + 1 + sizeof...(args), var_names.size(),
platform::errors::InvalidArgument(
"Argument number not match name number"));
Impl<kIdx, true, T>::Register(is_public, is_writable, var_names, var);
Impl<kIdx + 1, sizeof...(Args) == 1, Args...>::Register(
is_public, is_writable, var_names, std::forward<Args>(args)...);
}
};
template <size_t kIdx, typename T>
struct Impl<kIdx, true, T> {
static void Register(bool is_public, bool is_writable,
const std::vector<std::string> &var_names, T &&var) {
auto *instance = GlobalVarGetterSetterRegistry::MutableInstance();
if (is_writable) {
instance->Register(
var_names[kIdx], is_public,
GlobalVarGetterSetterRegistry::CreateGetter(std::forward<T>(var)),
GlobalVarGetterSetterRegistry::CreateSetter(&var));
} else {
instance->Register(
var_names[kIdx], is_public,
GlobalVarGetterSetterRegistry::CreateGetter(std::forward<T>(var)));
}
}
};
private:
const bool is_public_;
const bool is_writable_;
const std::vector<std::string> var_names_;
};
static void RegisterGlobalVarGetterSetter();
void BindGlobalValueGetterSetter(pybind11::module *module) {
......@@ -338,65 +218,69 @@ void BindGlobalValueGetterSetter(pybind11::module *module) {
}
/* Public vars are designed to be writable. */
#define REGISTER_PUBLIC_GLOBAL_VAR(...) \
#define REGISTER_PUBLIC_GLOBAL_VAR(var) \
do { \
GlobalVarGetterSetterRegistryHelper(/*is_public=*/true, \
/*is_writable=*/true, "" #__VA_ARGS__) \
.Register(__VA_ARGS__); \
auto *instance = GlobalVarGetterSetterRegistry::MutableInstance(); \
instance->Register(#var, /*is_public=*/true, \
GlobalVarGetterSetterRegistry::CreateGetter(var), \
GlobalVarGetterSetterRegistry::CreateSetter(&var)); \
} while (0)
#define REGISTER_PRIVATE_GLOBAL_VAR(is_writable, ...) \
do { \
GlobalVarGetterSetterRegistryHelper(/*is_public=*/false, is_writable, \
"" #__VA_ARGS__) \
.Register(__VA_ARGS__); \
} while (0)
struct RegisterGetterSetterVisitor : public boost::static_visitor<void> {
RegisterGetterSetterVisitor(const std::string &name, bool is_writable,
void *value_ptr)
: name_(name), is_writable_(is_writable), value_ptr_(value_ptr) {}
static void RegisterGlobalVarGetterSetter() {
REGISTER_PRIVATE_GLOBAL_VAR(/*is_writable=*/false, FLAGS_free_idle_chunk,
FLAGS_free_when_no_cache_hit);
REGISTER_PUBLIC_GLOBAL_VAR(
FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
FLAGS_call_stack_level, FLAGS_sort_sum_gradient, FLAGS_cpu_deterministic,
FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
FLAGS_fuse_parameter_groups_size, FLAGS_fuse_parameter_memory_size,
FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add,
FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off,
FLAGS_apply_pass_to_program);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_PUBLIC_GLOBAL_VAR(
FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic,
FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent,
FLAGS_cudnn_exhaustive_search, FLAGS_eager_delete_scope,
FLAGS_fast_eager_deletion_mode,
FLAGS_fraction_of_cuda_pinned_memory_to_use,
FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb,
FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math,
FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce,
FLAGS_conv2d_disable_cudnn, FLAGS_check_kernel_launch);
#endif
#ifdef PADDLE_WITH_XPU
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus);
#endif
template <typename T>
void operator()(const T &) const {
auto &value = *static_cast<T *>(value_ptr_);
auto *instance = GlobalVarGetterSetterRegistry::MutableInstance();
bool is_public = is_writable_; // currently, all writable vars are public
if (is_writable_) {
instance->Register(name_, is_public,
GlobalVarGetterSetterRegistry::CreateGetter(value),
GlobalVarGetterSetterRegistry::CreateSetter(&value));
} else {
instance->Register(name_, is_public,
GlobalVarGetterSetterRegistry::CreateGetter(value));
}
}
#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus);
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_min_loss_scaling);
#endif
private:
std::string name_;
bool is_writable_;
void *value_ptr_;
};
static void RegisterGlobalVarGetterSetter() {
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_cpu_deterministic);
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_convert_all_blocks);
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_enable_parallel_graph);
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_pe_profile_fname);
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_print_sub_graph_dir);
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_new_executor_use_inplace);
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_eager_delete_scope);
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_fuse_parameter_groups_size);
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_fuse_parameter_memory_size);
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_inner_op_parallelism);
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_enable_unused_var_check);
#ifdef PADDLE_WITH_DITRIBUTE
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num,
FLAGS_rpc_get_thread_num,
FLAGS_rpc_prefetch_thread_num);
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_get_thread_num);
REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_prefetch_thread_num);
#endif
const auto &flag_map = platform::GetExportedFlagInfoMap();
for (const auto &pair : flag_map) {
const std::string &name = pair.second.name;
bool is_writable = pair.second.is_writable;
void *value_ptr = pair.second.value_ptr;
const auto &default_value = pair.second.default_value;
RegisterGetterSetterVisitor visitor("FLAGS_" + name, is_writable,
value_ptr);
boost::apply_visitor(visitor, default_value);
}
}
} // namespace pybind
} // namespace paddle
......@@ -32,7 +32,8 @@
#include "paddle/fluid/platform/place.h"
#include "pybind11/stl.h"
DEFINE_bool(reader_queue_speed_test_mode, false,
PADDLE_DEFINE_EXPORTED_bool(
reader_queue_speed_test_mode, false,
"If set true, the queue.pop will only get data from queue but not "
"remove the data from queue for speed testing");
......