PaddlePaddle / Paddle · Commit ca0136a6
Commit ca0136a6

Authored Sep 16, 2021 by sneaxiy

make flag setter easier

Parent: e93c18a3

Showing 28 changed files with 375 additions and 462 deletions (+375 / -462)
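Nearly every hunk below follows the same mechanical pattern: a raw gflags DEFINE_<type>(...) becomes the new PADDLE_DEFINE_EXPORTED_<type>(...) macro from paddle/fluid/platform/flags.h, which still defines the gflags flag but also records it in paddle::platform::GetExportedFlagInfoMap() so the Python bindings can discover it without a hand-maintained list. A minimal sketch of the before/after shape, using a hypothetical flag name that is not part of this commit:

// Hypothetical flag, used only to illustrate the rewrite pattern in this commit.
#include "paddle/fluid/platform/flags.h"

// Before: a plain gflags definition, invisible to the exported-flag registry.
// DEFINE_bool(example_feature, false, "Enable the example feature.");

// After: same flag, still backed by gflags, but additionally registered in
// paddle::platform::GetExportedFlagInfoMap() by a static registrar object.
PADDLE_DEFINE_EXPORTED_bool(example_feature, false, "Enable the example feature.");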
paddle/fluid/framework/details/reduce_op_handle.cc                 +1    -1
paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc             +8    -7
paddle/fluid/framework/ir/graph.cc                                 +2    -2
paddle/fluid/framework/ir/graph_helper.cc                          +3    -3
paddle/fluid/framework/operator.cc                                 +2    -1
paddle/fluid/framework/parallel_executor.cc                        +7    -5
paddle/fluid/framework/scope.cc                                    +1    -1
paddle/fluid/framework/unused_var_check.cc                         +5    -4
paddle/fluid/imperative/CMakeLists.txt                             +1    -1
paddle/fluid/imperative/flags.cc                                   +4    -4
paddle/fluid/imperative/profiler.cc                                +2    -2
paddle/fluid/memory/allocation/CMakeLists.txt                      +1    -1
paddle/fluid/memory/allocation/allocator_facade.cc                 +5    -4
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc   +15   -12
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc         +7    -6
paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc          +2    -1
paddle/fluid/platform/CMakeLists.txt                               +2    -2
paddle/fluid/platform/cpu_info.cc                                  +6    -3
paddle/fluid/platform/enforce.h                                    +1    -0
paddle/fluid/platform/flags.cc                                     +139  -107
paddle/fluid/platform/flags.h                                      +85   -0
paddle/fluid/platform/init.cc                                      +4    -3
paddle/fluid/platform/place.cc                                     +6    -5
paddle/fluid/platform/profiler.cc                                  +2    -1
paddle/fluid/platform/xpu/xpu_info.cc                              +9    -8
paddle/fluid/pybind/global_value_getter_setter.cc                  +39   -203
paddle/fluid/pybind/reader_py.cc                                   +4    -3
python/paddle/fluid/__init__.py                                    +12   -72
paddle/fluid/framework/details/reduce_op_handle.cc
@@ -19,7 +19,7 @@
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/platform/profiler.h"
-DEFINE_bool(cpu_deterministic, false,
+PADDLE_DEFINE_EXPORTED_bool(cpu_deterministic, false,
     "Whether to make the result of computation deterministic in CPU side.");
paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc
@@ -25,13 +25,14 @@ class VarDesc;
 }  // namespace framework
 }  // namespace paddle
-DEFINE_double(fuse_parameter_memory_size, -1.0,  // MBytes
+PADDLE_DEFINE_EXPORTED_double(fuse_parameter_memory_size, -1.0,  // MBytes
     "fuse_parameter_memory_size is up limited memory size(MB)"
     "of one group parameters' gradient which is the input "
     "of communication calling(e.g NCCLAllReduce). "
     "The default value is 0, it means that "
     "not set group according to memory_size.");
-DEFINE_int32(
+PADDLE_DEFINE_EXPORTED_int32(
     fuse_parameter_groups_size, 1,
     "fuse_parameter_groups_size is the up limited size of one group "
     "parameters' gradient. " ...
paddle/fluid/framework/ir/graph.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/operator.h"
-DEFINE_bool(convert_all_blocks, true,
+PADDLE_DEFINE_EXPORTED_bool(convert_all_blocks, true,
     "Convert all blocks in program into SSAgraphs");

 namespace paddle {
paddle/fluid/framework/ir/graph_helper.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_proto_maker.h"
 DECLARE_bool(convert_all_blocks);
-DEFINE_string(print_sub_graph_dir, "",
+PADDLE_DEFINE_EXPORTED_string(print_sub_graph_dir, "",
     "FLAGS_print_sub_graph_dir is used "
     "to print the nodes of sub_graphs.");
paddle/fluid/framework/operator.cc
@@ -47,7 +47,8 @@ class LoDTensor;
 DECLARE_bool(benchmark);
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(enable_unused_var_check);
-DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
+PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0,
+                             "number of threads for inner op");

 namespace paddle {
 namespace framework {
paddle/fluid/framework/parallel_executor.cc
@@ -46,10 +46,12 @@ DECLARE_double(eager_delete_tensor_gb);
 #ifdef WITH_GPERFTOOLS
 #include "gperftools/profiler.h"
 #endif
-DEFINE_string(pe_profile_fname, "",
+PADDLE_DEFINE_EXPORTED_string(pe_profile_fname, "",
     "Profiler filename for PE, which generated by gperftools."
     "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable.");
-DEFINE_bool(enable_parallel_graph, false,
+PADDLE_DEFINE_EXPORTED_bool(enable_parallel_graph, false,
     "Force disable parallel graph execution mode if set false.");

 namespace paddle {
paddle/fluid/framework/scope.cc
@@ -19,7 +19,7 @@ limitations under the License. */
 DECLARE_bool(benchmark);
-DEFINE_bool(eager_delete_scope, true,
+PADDLE_DEFINE_EXPORTED_bool(eager_delete_scope, true,
     "Delete local scope eagerly. It will reduce GPU memory usage but "
     "slow down the destruction of variables.(around 1% performance harm)");
paddle/fluid/framework/unused_var_check.cc
@@ -17,13 +17,14 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <string>
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/flags.h"
-DEFINE_bool(enable_unused_var_check, false,
+PADDLE_DEFINE_EXPORTED_bool(enable_unused_var_check, false,
     "Checking whether operator contains unused inputs, "
     "especially for grad operator. It should be in unittest.");
paddle/fluid/imperative/CMakeLists.txt
@@ -11,7 +11,7 @@ cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
 cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal)
 cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
 cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
-cc_library(imperative_profiler SRCS profiler.cc)
+cc_library(imperative_profiler SRCS profiler.cc DEPS flags)
 if(NOT WIN32)
     if(WITH_NCCL OR WITH_RCCL)
         cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows tensor)
paddle/fluid/imperative/flags.cc
@@ -13,9 +13,9 @@
 // limitations under the License.
 #include "paddle/fluid/imperative/flags.h"
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
-DEFINE_uint64(dygraph_debug, 0,
+PADDLE_DEFINE_EXPORTED_uint64(dygraph_debug, 0,
     "Debug level of dygraph. This flag is not "
     "open to users");
paddle/fluid/imperative/profiler.cc
@@ -19,9 +19,9 @@
 #endif
 #include <glog/logging.h>
 #include <mutex>  // NOLINT
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
-DEFINE_string(tracer_profile_fname, "xxgperf",
+PADDLE_DEFINE_EXPORTED_string(tracer_profile_fname, "xxgperf",
     "Profiler filename for imperative tracer, which generated by gperftools."
     "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
paddle/fluid/memory/allocation/CMakeLists.txt
@@ -99,7 +99,7 @@ cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc
 cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
-cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator)
+cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator flags)
 cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
 cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator)
paddle/fluid/memory/allocation/allocator_facade.cc
@@ -37,12 +37,13 @@
 #endif
 #include "paddle/fluid/platform/npu_info.h"
-DEFINE_int64(gpu_allocator_retry_time, 10000,
+PADDLE_DEFINE_EXPORTED_int64(gpu_allocator_retry_time, 10000,
     "The retry time (milliseconds) when allocator fails "
     "to allocate memory. No retry if this value is not greater than 0");
-DEFINE_bool(use_system_allocator, false,
+PADDLE_DEFINE_EXPORTED_bool(use_system_allocator, false,
     "Whether to use system allocator to allocate CPU and GPU memory. "
     "Only used for unittests.");
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -17,14 +17,17 @@
 #include <algorithm>
 #include <mutex>  // NOLINT
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
+#include "paddle/fluid/platform/flags.h"
-DEFINE_bool(free_idle_chunk, false,
+PADDLE_DEFINE_READONLY_EXPORTED_bool(free_idle_chunk, false,
     "Whether to free idle chunk when each allocation is freed. "
     "If false, all freed allocation would be cached to speed up next "
     "allocation request. If true, no allocation would be cached. This "
     "flag only works when FLAGS_allocator_strategy=auto_growth.");
-DEFINE_bool(free_when_no_cache_hit, false,
+PADDLE_DEFINE_READONLY_EXPORTED_bool(free_when_no_cache_hit, false,
     "Whether to free idle chunks when no cache hit. If true, idle "
     "chunk would be freed when no cache hit; if false, idle "
     "chunk would be freed when out of memory occurs. This flag " ...
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -34,7 +34,8 @@
 #include "paddle/fluid/platform/xpu/xpu_header.h"
 #endif
-DEFINE_bool(init_allocated_mem, false,
+PADDLE_DEFINE_EXPORTED_bool(init_allocated_mem, false,
     "It is a mistake that the values of the memory allocated by "
     "BuddyAllocator are always zeroed in some op's implementation. "
     "To find this error in time, we use init_allocated_mem to indicate " ...
paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
@@ -15,7 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h"
 #include "paddle/fluid/framework/op_registry.h"
-DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
+PADDLE_DEFINE_EXPORTED_int32(rpc_send_thread_num, 12,
+                             "number of threads for rpc send");

 namespace paddle {
 namespace operators {
paddle/fluid/platform/CMakeLists.txt
@@ -37,13 +37,13 @@ if (WITH_PYTHON)
     endif(NOT WIN32)
 endif()
-cc_library(flags SRCS flags.cc DEPS gflags)
+cc_library(flags SRCS flags.cc DEPS gflags boost)
 cc_library(denormal SRCS denormal.cc DEPS)
 cc_library(errors SRCS errors.cc DEPS error_codes_proto)
 cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
-set(enforce_deps flags errors boost)
+set(enforce_deps flags errors boost flags)
 if(WITH_GPU)
     set(enforce_deps ${enforce_deps} external_error_proto)
 endif()
paddle/fluid/platform/cpu_info.cc
@@ -31,7 +31,7 @@ limitations under the License. */
 #endif  // _WIN32
 #include <algorithm>
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
 DECLARE_double(fraction_of_cpu_memory_to_use);
 DECLARE_uint64(initial_cpu_memory_in_mb);
@@ -42,7 +42,8 @@ DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
 // between host and device. Allocates too much would reduce the amount
 // of memory available to the system for paging. So, by default, we
 // should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+PADDLE_DEFINE_EXPORTED_bool(use_pinned_memory, true,
+                            "If set, allocate cpu pinned memory.");

 namespace paddle {
 namespace platform {
@@ -54,7 +55,9 @@ size_t CpuTotalPhysicalMemory() {
   mib[1] = HW_MEMSIZE;
   int64_t size = 0;
   size_t len = sizeof(size);
-  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
+  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) {
+    return static_cast<size_t>(size);
+  }
   return 0L;
 #elif defined(_WIN32)
   MEMORYSTATUSEX sMeminfo;
paddle/fluid/platform/enforce.h
@@ -101,6 +101,7 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/type_defs.h"
 #endif
+#include "paddle/fluid/platform/flags.h"

 namespace paddle {
 namespace platform {
paddle/fluid/platform/flags.cc
@@ -12,11 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "gflags/gflags.h"
+#include "paddle/fluid/platform/flags.h"
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
 #endif
+namespace paddle {
+namespace platform {
+
+const ExportedFlagInfoMap &GetExportedFlagInfoMap() {
+  static ExportedFlagInfoMap g_exported_flag_info_map;
+  return g_exported_flag_info_map;
+}
+
+}  // namespace platform
+}  // namespace paddle
 /**
  * NOTE(paddle-dev): This file is designed to define all public FLAGS.
  */
@@ -30,7 +41,7 @@
  * instance to 2
  * Note:
  */
-DEFINE_int32(paddle_num_threads, 1,
+PADDLE_DEFINE_EXPORTED_int32(paddle_num_threads, 1,
     "Number of threads for each paddle instance.");
@@ -41,7 +52,8 @@ DEFINE_int32(paddle_num_threads, 1,
  * Example:
  * Note: Used to debug. Checking whether operator produce NAN/INF or not.
  */
-DEFINE_bool(check_nan_inf, false,
+PADDLE_DEFINE_EXPORTED_bool(check_nan_inf, false,
     "Checking whether operator produce NAN/INF or not. It will be "
     "extremely slow so please use this flag wisely.");
@@ -58,7 +70,7 @@ DEFINE_bool(check_nan_inf, false,
  * Example:
  * Note: whether to use Tensor Core, faster but it may loss precision.
  */
-DEFINE_bool(
+PADDLE_DEFINE_EXPORTED_bool(
     enable_cublas_tensor_op_math, false,
     "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
     "but it may loss precision. Currently, There are two CUDA libraries that" ...
@@ -77,7 +89,8 @@ DEFINE_bool(
  * cards
  * Note: A list of device ids separated by comma, like: 0,1,2,3
  */
-DEFINE_string(selected_gpus, "",
+PADDLE_DEFINE_EXPORTED_string(selected_gpus, "",
     "A list of device ids separated by comma, like: 0,1,2,3. "
     "This option is useful when doing multi process training and "
    "each process have only one device (GPU). If you want to use " ...
@@ -88,19 +101,22 @@ DEFINE_string(selected_gpus, "",
 #endif
 #if defined(PADDLE_WITH_ASCEND_CL)
-DEFINE_string(selected_npus, "",
+PADDLE_DEFINE_EXPORTED_string(selected_npus, "",
     "A list of device ids separated by comma, like: 0,1,2,3. "
     "This option is useful when doing multi process training and "
     "each process have only one device (NPU). If you want to use "
     "all visible devices, set this to empty string.");
-DEFINE_bool(hccl_check_nan, true,
+PADDLE_DEFINE_EXPORTED_bool(hccl_check_nan, true,
     "Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
     "core when meets Nan value");
-DEFINE_string(npu_config_path, "",
+PADDLE_DEFINE_EXPORTED_string(npu_config_path, "",
     "The absolute path of configuration json file, like: /tmp/config.json. "
     "If proveided, it will be passed to aclInit().");
-DEFINE_int32(min_loss_scaling, 1, "set minmum loss scaling value!");
+PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling, 1, "set minmum loss scaling value!");
 #endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -113,7 +129,8 @@ DEFINE_int32(min_loss_scaling, 1, "set minmum loss scaling value!");
  * Note: whether to use deterministic algorithm in cudnn.
  * If true, it will slow down some operators such as conv and pooling.
  */
-DEFINE_bool(cudnn_deterministic, false,
+PADDLE_DEFINE_EXPORTED_bool(cudnn_deterministic, false,
     "Whether allow using an autotuning algorithm for convolution "
     "operator. The autotuning algorithm may be non-deterministic. If "
     "true, the algorithm is deterministic.");
@@ -130,7 +147,8 @@ DEFINE_bool(cudnn_deterministic, false,
  * increased.
  * Users need to balance memory and speed.
  */
-DEFINE_uint64(conv_workspace_size_limit,
+PADDLE_DEFINE_EXPORTED_uint64(conv_workspace_size_limit,
     paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
     "cuDNN convolution workspace limit in MB unit.");
@@ -148,7 +166,8 @@ DEFINE_uint64(conv_workspace_size_limit,
  * layer specification. Once you change the layer specifications
  * (such as batch size, feature map size), it will search again.
  */
-DEFINE_bool(cudnn_exhaustive_search, false,
+PADDLE_DEFINE_EXPORTED_bool(cudnn_exhaustive_search, false,
     "Whether enable exhaustive search for cuDNN convolution or "
     "not, default is False.");
@@ -160,7 +179,7 @@ DEFINE_bool(cudnn_exhaustive_search, false,
  * Example:
  * Note: only used to predict for advanced developer
  */
-DEFINE_int64(cudnn_exhaustive_search_times, -1,
+PADDLE_DEFINE_EXPORTED_int64(cudnn_exhaustive_search_times, -1,
     "Exhaustive search times for cuDNN convolution, "
     "default is -1, not exhaustive search");
@@ -180,7 +199,8 @@ DEFINE_int64(cudnn_exhaustive_search_times, -1,
  * certain
  * input data range.
  */
-DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
+PADDLE_DEFINE_EXPORTED_bool(cudnn_batchnorm_spatial_persistent, false,
     "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
     "batch_norm, default is False.");
 #endif
@@ -197,7 +217,7 @@ DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
  * https://github.com/PaddlePaddle/Paddle/issues/15049
  * If you want to change this default value, why?(gongwb)
  */
-DEFINE_bool(
+PADDLE_DEFINE_EXPORTED_bool(
     sync_nccl_allreduce, true,
     "If set true, will call `cudaStreamSynchronize(nccl_stream)`"
     "after allreduce, this mode can get better performance in some scenarios.");
@@ -215,9 +235,10 @@ DEFINE_bool(
  * into the queue, and then the communicator takes the gradients out
  * of the queue and sends them after merging.
  */
-DEFINE_int32(communicator_max_merge_var_num, 20,
+PADDLE_DEFINE_EXPORTED_int32(communicator_max_merge_var_num, 20,
     "max var num to merge and send");
-DEFINE_bool(communicator_is_sgd_optimizer, true,
+PADDLE_DEFINE_EXPORTED_bool(communicator_is_sgd_optimizer, true,
     "gradient sent to the server is the sum of the gradients "
     "calculated by each thread if optimizer is sgd");
 /**
@@ -233,7 +254,7 @@ DEFINE_bool(communicator_is_sgd_optimizer, true,
  * space. It is used to avoid training much faster than communication,
  * so that too many gradients are not sent out in time.
  */
-DEFINE_int32(communicator_send_queue_size, 20,
+PADDLE_DEFINE_EXPORTED_int32(communicator_send_queue_size, 20,
     "queue size to recv gradient before send");
 #endif
@@ -246,7 +267,8 @@ DEFINE_int32(communicator_send_queue_size, 20,
  * Note: Control the number of threads used for distributed modules.
  * If it is not set, it is set to a hard thread.
  */
-DEFINE_int32(dist_threadpool_size, 0,
+PADDLE_DEFINE_EXPORTED_int32(dist_threadpool_size, 0,
     "number of threads used for distributed executed.");
 /**
@@ -272,7 +294,7 @@ static const double kDefaultEagerDeleteTensorGB = -1;
 static const double kDefaultEagerDeleteTensorGB = 0;
 #endif
-DEFINE_double(
+PADDLE_DEFINE_EXPORTED_double(
     eager_delete_tensor_gb, kDefaultEagerDeleteTensorGB,
     "Memory size threshold (GB) when the garbage collector clear tensors."
     "Disabled when this value is less than 0");
@@ -289,7 +311,8 @@ DEFINE_double(
  * has finished, which will make the garbage collection strategy faster.
  * Only works when garbage collection strategy is enabled.
  */
-DEFINE_bool(fast_eager_deletion_mode, true,
+PADDLE_DEFINE_EXPORTED_bool(fast_eager_deletion_mode, true,
     "Fast eager deletion mode. If enabled, memory would release "
     "immediately without waiting GPU kernel ends.");
@@ -311,7 +334,8 @@ DEFINE_bool(fast_eager_deletion_mode, true,
  * largest FLAGS_memory_fraction_of_eager_deletion ratio will be released.
  * The flag is only valid when running parallel data compilers.
  */
-DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
+PADDLE_DEFINE_EXPORTED_double(memory_fraction_of_eager_deletion, 1.0,
     "Fraction of eager deletion. If less than 1.0, all variables in "
     "the program would be sorted according to its memory size, and "
     "only the FLAGS_memory_fraction_of_eager_deletion of the largest " ...
@@ -331,7 +355,7 @@ static constexpr char kDefaultAllocatorStrategy[] = "naive_best_fit";
 #else
 static constexpr char kDefaultAllocatorStrategy[] = "auto_growth";
 #endif
-DEFINE_string(
+PADDLE_DEFINE_EXPORTED_string(
     allocator_strategy, kDefaultAllocatorStrategy,
     "The allocation strategy, enum in [naive_best_fit, auto_growth]. "
     "naive_best_fit means the original pre-allocated allocator of Paddle. " ...
@@ -358,7 +382,7 @@ DEFINE_string(
  * size as the memory block will be allocated from the CUDA pinned
  * request util the CPU does not have enough memory.
  */
-DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+PADDLE_DEFINE_EXPORTED_double(fraction_of_cpu_memory_to_use, 1,
     "Default use 100% of CPU memory for PaddlePaddle,"
     "reserve the rest for page tables, etc");
@@ -374,7 +398,8 @@ DEFINE_double(fraction_of_cpu_memory_to_use, 1,
  * FLAGS_fraction_of_cpu_memory_to_use*(total physical memory)
  * as memory block sizes.
  */
-DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
+PADDLE_DEFINE_EXPORTED_uint64(initial_cpu_memory_in_mb, 500ul,
     "Initial CPU memory for PaddlePaddle, in MD unit.");
 /**
@@ -390,7 +415,7 @@ DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
  * size as the memory block will be allocated from the CPU
  * request util the CPU does not have enough memory.
  */
-DEFINE_double(
+PADDLE_DEFINE_EXPORTED_double(
     fraction_of_cuda_pinned_memory_to_use, 0.5,
     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
     "reserve the rest for page tables, etc");
@@ -425,7 +450,8 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
 // which may lead to insufficient memory left for paddle
 constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
 #endif
-DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
+PADDLE_DEFINE_EXPORTED_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
     "Allocate a trunk of gpu memory that is this fraction of the "
     "total gpu memory size. Future memory usage will be allocated "
     "from the trunk. If the trunk doesn't have enough gpu memory, " ...
@@ -444,7 +470,7 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
  * FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU until
  * the GPU has no remaining memory.
  */
-DEFINE_uint64(
+PADDLE_DEFINE_EXPORTED_uint64(
     initial_gpu_memory_in_mb, 0ul,
     "Allocate a trunk of gpu memory whose byte size is specified by "
     "the flag. Future memory usage will be allocated from the " ...
@@ -466,12 +492,14 @@ DEFINE_uint64(
  * Note: If the allocated GPU memory blocks are exhausted,
  * additional GPU memory blocks are reallocated
  */
-DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
+PADDLE_DEFINE_EXPORTED_uint64(reallocate_gpu_memory_in_mb, 0ul,
     "If this flag is set, Paddle will reallocate the gpu memory with "
     "size specified by this flag. Else Paddle will reallocate by "
     "FLAGS_fraction_of_gpu_memory_to_use");
-DEFINE_uint64(gpu_memory_limit_mb, 0UL,
+PADDLE_DEFINE_EXPORTED_uint64(gpu_memory_limit_mb, 0UL,
     "The maximum gpu memory limit that the process can allocate. "
    "If it is equal to 0, there would be no limit and all gpu memory "
    "would be available to the process. If it is larger than 0, " ...
@@ -489,7 +517,8 @@ DEFINE_uint64(gpu_memory_limit_mb, 0UL,
  * Example:
  * Note:
  */
-DEFINE_double(local_exe_sub_scope_limit, 256.0,  // MBytes
+PADDLE_DEFINE_EXPORTED_double(local_exe_sub_scope_limit, 256.0,  // MBytes
     "The memory up limit of sub-scopes of local execution scope for "
     "each CUDAPlace. If you don't need to limit the memory, "
     "you should set FLAGS_local_exe_sub_scope_limit=-1. " ...
@@ -503,7 +532,7 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes
  * Example:
  * Note:
  */
-DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
+PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run");
 /**
  * Debug related FLAG
@@ -525,7 +554,7 @@ static const int32_t kDefaultCallStackLevel = 2;
 static const int32_t kDefaultCallStackLevel = 1;
 #endif
-DEFINE_int32(
+PADDLE_DEFINE_EXPORTED_int32(
     call_stack_level, kDefaultCallStackLevel,
     "Determine the call stack to print when error or exeception happens."
     // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
     ...
@@ -545,7 +574,7 @@ DEFINE_int32(
  * Note: If True, gradients are summed by the reverse order of
  * the forward execution sequence.
  */
-DEFINE_bool(sort_sum_gradient, false,
+PADDLE_DEFINE_EXPORTED_bool(sort_sum_gradient, false,
     "Sum gradients by the reverse order of "
     "the forward execution sequence.");
@@ -557,7 +586,7 @@ DEFINE_bool(sort_sum_gradient, false,
  * Example:
  * Note: The maximum number of inplace grad_add.
  */
-DEFINE_int32(
+PADDLE_DEFINE_EXPORTED_int32(
     max_inplace_grad_add, 0,
     "The maximum number of inplace grad_add. When doing "
     "gradient accumulation, if the number of gradients need to that " ...
@@ -572,7 +601,7 @@ DEFINE_int32(
  * Example:
  * Note: Holds list of operation types with OneDNN kernels to be enabled.
  */
-DEFINE_string(tracer_mkldnn_ops_on, "",
+PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_on, "",
     "List of OneDNN operation types to be turned on");
 /**
@@ -583,7 +612,8 @@ DEFINE_string(tracer_mkldnn_ops_on, "",
  * Example:
  * Note: Holds list of operation types with OneDNN kernels to be disabled.
  */
-DEFINE_string(tracer_mkldnn_ops_off, "",
+PADDLE_DEFINE_EXPORTED_string(tracer_mkldnn_ops_off, "",
     "List of OneDNN operation types to be turned off");
 /**
@@ -595,7 +625,8 @@ DEFINE_string(tracer_mkldnn_ops_off, "",
  * Note: Check kernel launch status after every kernel compute.
  */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-DEFINE_bool(check_kernel_launch, false,
+PADDLE_DEFINE_EXPORTED_bool(check_kernel_launch, false,
     "Check kernel launch status after every kernel compute");
 #endif
@@ -608,7 +639,8 @@ DEFINE_bool(check_kernel_launch, false,
  * Note: Disable cudnn in conv2d.
  */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d");
+PADDLE_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d");
 #endif
@@ -621,7 +653,7 @@ DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d");
  */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \
     defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP)
-DEFINE_int32(get_host_by_name_time, 120,
+PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time, 120,
     "The maximum time for get host by name time");
 #endif
@@ -634,6 +666,6 @@ DEFINE_int32(get_host_by_name_time, 120,
  * program when using Fleet APIs.
  * Note: Apply IR pass to program. Be only useful when using Fleet APIs.
  */
-DEFINE_bool(
+PADDLE_DEFINE_EXPORTED_bool(
     apply_pass_to_program, false,
     "It controls whether to apply IR pass to program when using Fleet APIs");
paddle/fluid/platform/flags.h (new file, 0 → 100644)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <cstdint>
#include <map>
#include <string>
#include <type_traits>
#include <typeindex>
#include "boost/variant.hpp"
#include "gflags/gflags.h"
#include "paddle/fluid/platform/macros.h"

namespace paddle {
namespace platform {

struct FlagInfo {
  using ValueType =
      boost::variant<bool, int32_t, int64_t, uint64_t, double, std::string>;
  std::string name;
  void *value_ptr;
  ValueType default_value;
  std::string doc;
  bool is_writable;
};

using ExportedFlagInfoMap = std::map<std::string, FlagInfo>;
const ExportedFlagInfoMap &GetExportedFlagInfoMap();

#define __PADDLE_DEFINE_EXPORTED_FLAG(__name, __is_writable, __cpp_type,     \
                                      __gflag_type, __default_value, __doc)  \
  DEFINE_##__gflag_type(__name, __default_value, __doc);                     \
  struct __PaddleRegisterFlag_##__name {                                     \
    __PaddleRegisterFlag_##__name() {                                        \
      const auto &instance = ::paddle::platform::GetExportedFlagInfoMap();   \
      using Type = ::paddle::platform::ExportedFlagInfoMap;                  \
      auto &info = const_cast<Type &>(instance)[#__name];                    \
      info.name = #__name;                                                   \
      info.value_ptr = &(FLAGS_##__name);                                    \
      info.default_value = static_cast<__cpp_type>(__default_value);         \
      info.doc = __doc;                                                      \
      info.is_writable = __is_writable;                                      \
    }                                                                        \
  };                                                                         \
  static_assert(std::is_same<__PaddleRegisterFlag_##__name,                  \
                             ::__PaddleRegisterFlag_##__name>::value,        \
                "FLAGS should define in global namespace");                  \
  static __PaddleRegisterFlag_##__name __PaddleRegisterFlag_instance##__name

#define PADDLE_DEFINE_EXPORTED_bool(name, default_value, doc) \
  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, bool, bool, default_value, doc)
#define PADDLE_DEFINE_READONLY_EXPORTED_bool(name, default_value, doc) \
  __PADDLE_DEFINE_EXPORTED_FLAG(name, false, bool, bool, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_int32(name, default_value, doc) \
  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, int32_t, int32, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_int64(name, default_value, doc) \
  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, int64_t, int64, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_uint64(name, default_value, doc)              \
  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, uint64_t, uint64, default_value, \
                                doc)
#define PADDLE_DEFINE_EXPORTED_double(name, default_value, doc) \
  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, double, double, default_value, doc)
#define PADDLE_DEFINE_EXPORTED_string(name, default_value, doc)      \
  __PADDLE_DEFINE_EXPORTED_FLAG(name, true, ::std::string, string,   \
                                default_value, doc)

}  // namespace platform
}  // namespace paddle
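The header above is the core of the commit: __PADDLE_DEFINE_EXPORTED_FLAG expands to the usual gflags DEFINE_<type> plus a static registrar struct whose constructor stores the flag's name, address, default value, doc string, and writability into the ExportedFlagInfoMap singleton. A minimal sketch of how a translation unit could define one flag and then enumerate everything that registered itself — the flag name is illustrative, not from this commit, and the snippet assumes flags.h and the flags library are available:

// Sketch only: define one hypothetical exported flag and walk the registry.
#include <iostream>
#include "paddle/fluid/platform/flags.h"

PADDLE_DEFINE_EXPORTED_int32(example_worker_threads, 4,
                             "Illustrative flag: number of worker threads.");

int main() {
  const auto &flag_map = paddle::platform::GetExportedFlagInfoMap();
  for (const auto &pair : flag_map) {
    const auto &info = pair.second;  // FlagInfo filled in by the macro's registrar
    std::cout << "FLAGS_" << info.name << " (writable=" << info.is_writable
              << "): " << info.doc << std::endl;
  }
  return 0;
}

PADDLE_DEFINE_READONLY_EXPORTED_bool is the same registration with is_writable set to false, which is why free_idle_chunk and free_when_no_cache_hit use it above.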
paddle/fluid/platform/init.cc
@@ -43,7 +43,8 @@ limitations under the License. */
 #endif
 DECLARE_int32(paddle_num_threads);
-DEFINE_int32(multiple_of_cupti_buffer_size, 1,
+PADDLE_DEFINE_EXPORTED_int32(multiple_of_cupti_buffer_size, 1,
     "Multiple of the CUPTI device buffer size. If the timestamps have "
     "been dropped when you are profiling, try increasing this value.");
paddle/fluid/platform/place.cc
@@ -14,7 +14,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
-DEFINE_bool(benchmark, false,
+PADDLE_DEFINE_EXPORTED_bool(benchmark, false,
     "Doing memory benchmark. It will make deleting scope synchronized, "
     "and add some memory usage logs."
     "Default cuda is asynchronous device, set to True will" ...
paddle/fluid/platform/profiler.cc
@@ -24,7 +24,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/nvtx.h"
 #endif
-DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
+PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false,
+                            "Enable rpc profiler or not.");

 namespace paddle {
 namespace platform {
paddle/fluid/platform/xpu/xpu_info.cc
@@ -18,7 +18,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/xpu/xpu_header.h"
 #include "paddle/fluid/string/split.h"
-DEFINE_string(selected_xpus, "",
+PADDLE_DEFINE_EXPORTED_string(selected_xpus, "",
     "A list of device ids separated by comma, like: 0,1,2,3. "
     "This option is useful when doing multi process training and "
     "each process have only one device (XPU). If you want to use " ...
paddle/fluid/pybind/global_value_getter_setter.cc
@@ -29,82 +29,8 @@
 #include "paddle/fluid/platform/macros.h"
 #include "pybind11/stl.h"
-// data processing
-DECLARE_bool(use_mkldnn);
-DECLARE_string(tracer_mkldnn_ops_on);
-DECLARE_string(tracer_mkldnn_ops_off);
-// debug
-DECLARE_bool(check_nan_inf);
-DECLARE_bool(cpu_deterministic);
-DECLARE_bool(enable_rpc_profiler);
-DECLARE_int32(multiple_of_cupti_buffer_size);
-DECLARE_bool(reader_queue_speed_test_mode);
-DECLARE_int32(call_stack_level);
-DECLARE_bool(sort_sum_gradient);
-DECLARE_bool(check_kernel_launch);
-// device management
-DECLARE_int32(paddle_num_threads);
-// executor
-DECLARE_bool(enable_parallel_graph);
-DECLARE_string(pe_profile_fname);
-DECLARE_string(print_sub_graph_dir);
-DECLARE_bool(use_ngraph);
-// memory management
-DECLARE_string(allocator_strategy);
-DECLARE_double(eager_delete_tensor_gb);
-DECLARE_double(fraction_of_cpu_memory_to_use);
-DECLARE_bool(free_idle_chunk);
-DECLARE_bool(free_when_no_cache_hit);
-DECLARE_int32(fuse_parameter_groups_size);
-DECLARE_double(fuse_parameter_memory_size);
-DECLARE_bool(init_allocated_mem);
-DECLARE_uint64(initial_cpu_memory_in_mb);
-DECLARE_double(memory_fraction_of_eager_deletion);
-DECLARE_bool(use_pinned_memory);
-DECLARE_bool(use_system_allocator);
-// others
-DECLARE_bool(benchmark);
-DECLARE_int32(inner_op_parallelism);
-DECLARE_int32(max_inplace_grad_add);
-DECLARE_string(tracer_profile_fname);
-DECLARE_bool(apply_pass_to_program);
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-// cudnn
-DECLARE_uint64(conv_workspace_size_limit);
-DECLARE_bool(cudnn_batchnorm_spatial_persistent);
-DECLARE_bool(cudnn_deterministic);
-DECLARE_bool(cudnn_exhaustive_search);
-DECLARE_bool(conv2d_disable_cudnn);
-// data processing
-DECLARE_bool(enable_cublas_tensor_op_math);
-// device management
-DECLARE_string(selected_gpus);
-// memory management
-DECLARE_bool(eager_delete_scope);
-DECLARE_bool(fast_eager_deletion_mode);
-DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
-DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_uint64(gpu_memory_limit_mb);
-DECLARE_uint64(initial_gpu_memory_in_mb);
-DECLARE_uint64(reallocate_gpu_memory_in_mb);
-// others
-DECLARE_bool(sync_nccl_allreduce);
-#endif
-#ifdef PADDLE_WITH_XPU
-// device management
-DECLARE_string(selected_xpus);
-#endif
-#ifdef PADDLE_WITH_ASCEND_CL
-// device management
-DECLARE_string(selected_npus);
-// set minmum loss scaling value
-DECLARE_int32(min_loss_scaling);
-#endif
 // NOTE: where is these 2 flags from?
 #ifdef PADDLE_WITH_DISTRIBUTE
 DECLARE_int32(rpc_send_thread_num);
 DECLARE_int32(rpc_get_thread_num);
 DECLARE_int32(rpc_prefetch_thread_num);
 #endif
@@ -181,7 +107,6 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry {
     PADDLE_ENFORCE_NOT_NULL(setter,
                             platform::errors::InvalidArgument(
                                 "Setter of %s should not be null", name));
     var_infos_.insert({name, VarInfo(is_public, getter, setter)});
   }
@@ -243,81 +168,6 @@
 GlobalVarGetterSetterRegistry GlobalVarGetterSetterRegistry::instance_;
-class GlobalVarGetterSetterRegistryHelper {
- public:
-  GlobalVarGetterSetterRegistryHelper(bool is_public, bool is_writable,
-                                      const std::string &var_names)
-      : is_public_(is_public),
-        is_writable_(is_writable),
-        var_names_(SplitVarNames(var_names)) {}
-
-  template <typename... Args>
-  void Register(Args &&... args) const {
-    Impl<0, sizeof...(args) == 1, Args...>::Register(
-        is_public_, is_writable_, var_names_, std::forward<Args>(args)...);
-  }
-
- private:
-  static std::vector<std::string> SplitVarNames(const std::string &names) {
-    auto valid_char = [](char ch) { return !std::isspace(ch) && ch != ','; };
-    std::vector<std::string> ret;
-    size_t i = 0, j = 0, n = names.size();
-    while (i < n) {
-      for (; i < n && !valid_char(names[i]); ++i) {
-      }
-      for (j = i + 1; j < n && valid_char(names[j]); ++j) {
-      }
-      if (i < n && j <= n) {
-        auto substring = names.substr(i, j - i);
-        VLOG(10) << "Get substring: \"" << substring << "\"";
-        ret.emplace_back(substring);
-      }
-      i = j + 1;
-    }
-    return ret;
-  }
-
- private:
-  template <size_t kIdx, bool kIsStop, typename T, typename... Args>
-  struct Impl {
-    static void Register(bool is_public, bool is_writable,
-                         const std::vector<std::string> &var_names, T &&var,
-                         Args &&... args) {
-      PADDLE_ENFORCE_EQ(kIdx + 1 + sizeof...(args), var_names.size(),
-                        platform::errors::InvalidArgument(
-                            "Argument number not match name number"));
-      Impl<kIdx, true, T>::Register(is_public, is_writable, var_names, var);
-      Impl<kIdx + 1, sizeof...(Args) == 1, Args...>::Register(
-          is_public, is_writable, var_names, std::forward<Args>(args)...);
-    }
-  };
-
-  template <size_t kIdx, typename T>
-  struct Impl<kIdx, true, T> {
-    static void Register(bool is_public, bool is_writable,
-                         const std::vector<std::string> &var_names, T &&var) {
-      auto *instance = GlobalVarGetterSetterRegistry::MutableInstance();
-      if (is_writable) {
-        instance->Register(
-            var_names[kIdx], is_public,
-            GlobalVarGetterSetterRegistry::CreateGetter(std::forward<T>(var)),
-            GlobalVarGetterSetterRegistry::CreateSetter(&var));
-      } else {
-        instance->Register(
-            var_names[kIdx], is_public,
-            GlobalVarGetterSetterRegistry::CreateGetter(std::forward<T>(var)));
-      }
-    }
-  };
-
- private:
-  const bool is_public_;
-  const bool is_writable_;
-  const std::vector<std::string> var_names_;
-};
 static void RegisterGlobalVarGetterSetter();
 void BindGlobalValueGetterSetter(pybind11::module *module) {
@@ -338,65 +188,51 @@ void BindGlobalValueGetterSetter(pybind11::module *module) {
 }
 /* Public vars are designed to be writable. */
-#define REGISTER_PUBLIC_GLOBAL_VAR(...)                                       \
-  do {                                                                        \
-    GlobalVarGetterSetterRegistryHelper(/*is_public=*/true,                   \
-                                        /*is_writable=*/true, "" #__VA_ARGS__) \
-        .Register(__VA_ARGS__);                                               \
-  } while (0)
-
-#define REGISTER_PRIVATE_GLOBAL_VAR(is_writable, ...)                         \
-  do {                                                                        \
-    GlobalVarGetterSetterRegistryHelper(/*is_public=*/false, is_writable,     \
-                                        "" #__VA_ARGS__)                      \
-        .Register(__VA_ARGS__);                                               \
-  } while (0)
+#define REGISTER_PUBLIC_GLOBAL_VAR(var)                                      \
+  do {                                                                       \
+    auto *instance = GlobalVarGetterSetterRegistry::MutableInstance();       \
+    instance->Register(#var, /*is_public=*/true,                             \
+                       GlobalVarGetterSetterRegistry::CreateGetter(var),     \
+                       GlobalVarGetterSetterRegistry::CreateSetter(&var));   \
+  } while (0)
-static void RegisterGlobalVarGetterSetter() {
-  REGISTER_PRIVATE_GLOBAL_VAR(/*is_writable=*/false, FLAGS_free_idle_chunk,
-                              FLAGS_free_when_no_cache_hit);
-
-  REGISTER_PUBLIC_GLOBAL_VAR(
-      FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
-      FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
-      FLAGS_call_stack_level, FLAGS_sort_sum_gradient, FLAGS_cpu_deterministic,
-      FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
-      FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
-      FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
-      FLAGS_fuse_parameter_groups_size, FLAGS_fuse_parameter_memory_size,
-      FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
-      FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
-      FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
-      FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add,
-      FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off,
-      FLAGS_apply_pass_to_program);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  REGISTER_PUBLIC_GLOBAL_VAR(
-      FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic,
-      FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent,
-      FLAGS_cudnn_exhaustive_search, FLAGS_eager_delete_scope,
-      FLAGS_fast_eager_deletion_mode, FLAGS_fraction_of_cuda_pinned_memory_to_use,
-      FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb,
-      FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math,
-      FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce, FLAGS_conv2d_disable_cudnn,
-      FLAGS_check_kernel_launch);
-#endif
-
-#ifdef PADDLE_WITH_XPU
-  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus);
-#endif
-
-#ifdef PADDLE_WITH_ASCEND_CL
-  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus);
-  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_min_loss_scaling);
-#endif
+struct RegisterGetterSetterVisitor : public boost::static_visitor<void> {
+  RegisterGetterSetterVisitor(const std::string &name, bool is_public,
+                              void *value_ptr)
+      : name_(name), is_public_(is_public), value_ptr_(value_ptr) {}
+
+  template <typename T>
+  void operator()(const T &) const {
+    auto &value = *static_cast<T *>(value_ptr_);
+    auto *instance = GlobalVarGetterSetterRegistry::MutableInstance();
+    instance->Register(name_, is_public_,
+                       GlobalVarGetterSetterRegistry::CreateGetter(value),
+                       GlobalVarGetterSetterRegistry::CreateSetter(&value));
+  }
+
+ private:
+  std::string name_;
+  bool is_public_;
+  void *value_ptr_;
+};
 static void RegisterGlobalVarGetterSetter() {
 #ifdef PADDLE_WITH_DITRIBUTE
-  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num, FLAGS_rpc_get_thread_num,
-                             FLAGS_rpc_prefetch_thread_num);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_get_thread_num);
+  REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_prefetch_thread_num);
 #endif
+
+  const auto &flag_map = platform::GetExportedFlagInfoMap();
+  for (const auto &pair : flag_map) {
+    const std::string &name = pair.second.name;
+    bool is_writable = pair.second.is_writable;
+    void *value_ptr = const_cast<void *>(pair.second.value_ptr);
+    const auto &default_value = pair.second.default_value;
+    RegisterGetterSetterVisitor visitor("FLAGS_" + name, is_writable,
+                                        value_ptr);
+    boost::apply_visitor(visitor, default_value);
+  }
+}

 }  // namespace pybind
 }  // namespace paddle
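The rewritten registration above no longer lists flags by hand: it walks GetExportedFlagInfoMap() and uses a boost::static_visitor to recover each flag's concrete C++ type from the boost::variant default value before creating the getter and setter. A small standalone sketch of that variant/visitor dispatch pattern (illustrative types and values only, not Paddle code):

// Standalone illustration of boost::variant + static_visitor dispatch,
// the mechanism the new RegisterGlobalVarGetterSetter loop relies on.
#include <cstdint>
#include <iostream>
#include <string>
#include "boost/variant.hpp"

using ValueType = boost::variant<bool, int32_t, double, std::string>;

struct PrintVisitor : public boost::static_visitor<void> {
  // One instantiation per alternative; the variant's runtime content decides
  // which one runs, mirroring how FlagInfo::default_value selects the flag type.
  template <typename T>
  void operator()(const T &value) const {
    std::cout << "holds: " << value << std::endl;
  }
};

int main() {
  PrintVisitor visitor;
  ValueType v = 3.14;              // currently holds a double
  boost::apply_visitor(visitor, v);
  v = std::string("auto_growth");  // now holds a std::string
  boost::apply_visitor(visitor, v);
  return 0;
}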
paddle/fluid/pybind/reader_py.cc
@@ -32,7 +32,8 @@
 #include "paddle/fluid/platform/place.h"
 #include "pybind11/stl.h"
-DEFINE_bool(reader_queue_speed_test_mode, false,
+PADDLE_DEFINE_EXPORTED_bool(reader_queue_speed_test_mode, false,
     "If set true, the queue.pop will only get data from queue but not "
     "remove the data from queue for speed testing");
python/paddle/fluid/__init__.py
@@ -176,83 +176,23 @@ def __bootstrap__():
         print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr)

     os.environ['OMP_NUM_THREADS'] = str(num_threads)
-    sysstr = platform.system()
-    read_env_flags = [
-        'check_nan_inf', 'convert_all_blocks', 'benchmark', 'eager_delete_scope',
-        'fraction_of_cpu_memory_to_use', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem', 'paddle_num_threads', 'dist_threadpool_size',
-        'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
-        'memory_fraction_of_eager_deletion', 'allocator_strategy',
-        'reader_queue_speed_test_mode', 'print_sub_graph_dir',
-        'pe_profile_fname', 'inner_op_parallelism', 'enable_parallel_graph',
-        'fuse_parameter_groups_size', 'multiple_of_cupti_buffer_size',
-        'fuse_parameter_memory_size', 'tracer_profile_fname', 'dygraph_debug',
-        'use_system_allocator', 'enable_unused_var_check', 'free_idle_chunk',
-        'free_when_no_cache_hit', 'call_stack_level', 'sort_sum_gradient',
-        'max_inplace_grad_add', 'apply_pass_to_program',
-        'new_executor_use_inplace',
-    ]
-    if 'Darwin' not in sysstr:
-        read_env_flags.append('use_pinned_memory')
-
-    if os.name != 'nt':
-        read_env_flags.append('cpu_deterministic')
-
-    if core.is_compiled_with_mkldnn():
-        read_env_flags.append('use_mkldnn')
-        read_env_flags.append('tracer_mkldnn_ops_on')
-        read_env_flags.append('tracer_mkldnn_ops_off')
-
-    if core.is_compiled_with_cuda():
-        read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb',
-            'reallocate_gpu_memory_in_mb', 'cudnn_deterministic',
-            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
-            'cudnn_exhaustive_search', 'selected_gpus', 'sync_nccl_allreduce',
-            'cudnn_batchnorm_spatial_persistent', 'gpu_allocator_retry_time',
-            'local_exe_sub_scope_limit', 'gpu_memory_limit_mb',
-            'conv2d_disable_cudnn', 'get_host_by_name_time',
-        ]
-
-    if core.is_compiled_with_npu():
-        read_env_flags += [
-            'selected_npus', 'fraction_of_gpu_memory_to_use',
-            'initial_gpu_memory_in_mb', 'reallocate_gpu_memory_in_mb',
-            'gpu_memory_limit_mb', 'npu_config_path', 'get_host_by_name_time',
-            'hccl_check_nan', 'min_loss_scaling',
-        ]
+    flag_prefix = 'FLAGS_'
+    read_env_flags = [
+        key[len(flag_prefix):] for key in core.globals().keys()
+        if key.startswith(flag_prefix)
+    ]
+
+    def remove_flag_if_exists(name):
+        if name in read_env_flags:
+            read_env_flags.remove(name)
+
+    sysstr = platform.system()
+    if 'Darwin' in sysstr:
+        remove_flags_if_exists('use_pinned_memory')
+
+    if os.name == 'nt':
+        remove_flags_if_exists('cpu_deterministic')

     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
     # Note(zhouwei25): sys may not have argv in some cases,