PaddlePaddle / Paddle

Unverified commit 4d95c8c7
Authored on Apr 30, 2021 by Feiyu Chan
Committed by GitHub on Apr 30, 2021
avoid polluting logging's root logger (#32673)
Parent: 109fdf14
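Background for the commit message: `logging.basicConfig(...)` executed at import time installs a handler on the process-wide root logger, so importing any of the four modules below silently changed log formatting and handling for the whole application. The commit replaces that with per-module loggers. Below is a standalone sketch of the two idioms, condensed from the diffs that follow; it is plain Python illustrating the logging pattern, not actual Paddle code.

    import logging

    # Old pattern (removed below): configures the ROOT logger as an import
    # side effect, overriding whatever the host application set up.
    logging.basicConfig(
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')

    # New pattern (added below): a logger scoped to the module, with its own
    # handler and formatter; the root logger is left untouched.
    logger = logging.getLogger(__name__)
    formatter = logging.Formatter(
        fmt='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)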
Showing 4 changed files with 64 additions and 52 deletions.
python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py  +46 -43
python/paddle/distributed/fleet/utils/recompute.py                     +7 -4
python/paddle/fluid/incubate/fleet/utils/utils.py                      +5 -2
python/paddle/utils/cpp_extension/extension_utils.py                   +6 -3
python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py

@@ -29,9 +29,12 @@ from paddle.fluid.framework import Program, Variable, name_scope, default_main_p
 from paddle.fluid import layers
 import logging
-logging.basicConfig(
-    format='%(asctime)s %(levelname)-8s %(message)s',
-    datefmt='%Y-%m-%d %H:%M:%S')
+logger = logging.getLogger(__name__)
+formatter = logging.Formatter(
+    fmt='%(asctime)s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
+ch = logging.StreamHandler()
+ch.setFormatter(formatter)
+logger.addHandler(ch)
 from functools import reduce

 __all__ = ["ShardingOptimizer"]

@@ -136,7 +139,7 @@ class ShardingOptimizer(MetaOptimizerBase):
         # FIXME (JZ-LIANG) deprecated hybrid_dp
         if self.user_defined_strategy.sharding_configs["hybrid_dp"]:
-            logging.warning(
+            logger.warning(
                 "[hybrid_dp] API setting is deprecated. Now when dp_degree >= 2, its will be in hybrid dp mode automatically"
             )
             assert self.dp_degree >= 1

@@ -174,7 +177,7 @@ class ShardingOptimizer(MetaOptimizerBase):
             self._gradient_merge_acc_step = self.user_defined_strategy.pipeline_configs[
                 'accumulate_steps']
         if self._gradient_merge_acc_step > 1:
-            logging.info("Gradient merge in [{}], acc step = [{}]".format(
+            logger.info("Gradient merge in [{}], acc step = [{}]".format(
                 self.gradient_merge_mode, self._gradient_merge_acc_step))

         # optimize offload

@@ -338,7 +341,7 @@ class ShardingOptimizer(MetaOptimizerBase):
         # opt offload should be enable while gradient merge is enable && acc_step is quite large (e.g. >> 100)
         # sync its memcpy could not be overlap with calc, otherwise it will slower down training severely.
         if self.optimize_offload:
-            logging.info("Sharding with optimize offload !")
+            logger.info("Sharding with optimize offload !")
             offload_helper = OffloadHelper()
             offload_helper.offload(main_block, startup_block)
             offload_helper.offload_fp32param(main_block, startup_block)

@@ -641,15 +644,15 @@ class ShardingOptimizer(MetaOptimizerBase):
         for varname in sorted(
                 var2broadcast_time, key=var2broadcast_time.get, reverse=True):
-            logging.info("Sharding broadcast: [{}] times [{}]".format(
+            logger.info("Sharding broadcast: [{}] times [{}]".format(
                 var2broadcast_time[varname], varname))
         for idx_ in range(len(self._segments)):
-            logging.info("segment [{}] :".format(idx_))
-            logging.info("start op: [{}] [{}]".format(block.ops[
+            logger.info("segment [{}] :".format(idx_))
+            logger.info("start op: [{}] [{}]".format(block.ops[
                 self._segments[idx_]._start_idx].desc.type(), block.ops[
                     self._segments[idx_]._start_idx].desc.input_arg_names()))
-            logging.info("end op: [{}] [{}]".format(block.ops[
+            logger.info("end op: [{}] [{}]".format(block.ops[
                 self._segments[idx_]._end_idx].desc.type(), block.ops[
                     self._segments[idx_]._end_idx].desc.input_arg_names()))
         return

@@ -1108,7 +1111,7 @@ class ShardingOptimizer(MetaOptimizerBase):
                 self.dp_group_endpoints.append(self.global_endpoints[
                     dp_first_rank_idx + dp_offset * i])
             assert self.current_endpoint in self.dp_group_endpoints
-            logging.info("Hybrid DP mode turn on !")
+            logger.info("Hybrid DP mode turn on !")
         else:
             self.dp_ring_id = -1
             self.dp_rank = -1

@@ -1119,40 +1122,40 @@ class ShardingOptimizer(MetaOptimizerBase):
         # NOTE (JZ-LIANG) when use global ring for calc global norm and dp_degree > 1, the allreduce result should be devided by dp_degree
         self.global_ring_id = 3

-        logging.info("global word size: {}".format(self.global_word_size))
-        logging.info("global rank: {}".format(self.global_rank))
-        logging.info("global endpoints: {}".format(self.global_endpoints))
-        logging.info("global ring id: {}".format(self.global_ring_id))
-        logging.info("#####" * 6)
-        logging.info("mp group size: {}".format(self.mp_degree))
-        logging.info("mp rank: {}".format(self.mp_rank))
-        logging.info("mp group id: {}".format(self.mp_group_id))
-        logging.info("mp group endpoints: {}".format(self.mp_group_endpoints))
-        logging.info("mp ring id: {}".format(self.mp_ring_id))
-        logging.info("#####" * 6)
-        logging.info("sharding group size: {}".format(self.sharding_degree))
-        logging.info("sharding rank: {}".format(self.sharding_rank))
-        logging.info("sharding group id: {}".format(self.sharding_group_id))
-        logging.info("sharding group endpoints: {}".format(
-            self.sharding_group_endpoints))
-        logging.info("sharding ring id: {}".format(self.sharding_ring_id))
-        logging.info("#####" * 6)
-        logging.info("pp group size: {}".format(self.pp_degree))
-        logging.info("pp rank: {}".format(self.pp_rank))
-        logging.info("pp group id: {}".format(self.pp_group_id))
-        logging.info("pp group endpoints: {}".format(self.pp_group_endpoints))
-        logging.info("pp ring id: {}".format(self.pp_ring_id))
-        logging.info("#####" * 6)
-        logging.info("pure dp group size: {}".format(self.dp_degree))
-        logging.info("pure dp rank: {}".format(self.dp_rank))
-        logging.info("pure dp group endpoints: {}".format(
-            self.dp_group_endpoints))
-        logging.info("pure dp ring id: {}".format(self.dp_ring_id))
-        logging.info("#####" * 6)
+        logger.info("global word size: {}".format(self.global_word_size))
+        logger.info("global rank: {}".format(self.global_rank))
+        logger.info("global endpoints: {}".format(self.global_endpoints))
+        logger.info("global ring id: {}".format(self.global_ring_id))
+        logger.info("#####" * 6)
+        logger.info("mp group size: {}".format(self.mp_degree))
+        logger.info("mp rank: {}".format(self.mp_rank))
+        logger.info("mp group id: {}".format(self.mp_group_id))
+        logger.info("mp group endpoints: {}".format(self.mp_group_endpoints))
+        logger.info("mp ring id: {}".format(self.mp_ring_id))
+        logger.info("#####" * 6)
+        logger.info("sharding group size: {}".format(self.sharding_degree))
+        logger.info("sharding rank: {}".format(self.sharding_rank))
+        logger.info("sharding group id: {}".format(self.sharding_group_id))
+        logger.info("sharding group endpoints: {}".format(
+            self.sharding_group_endpoints))
+        logger.info("sharding ring id: {}".format(self.sharding_ring_id))
+        logger.info("#####" * 6)
+        logger.info("pp group size: {}".format(self.pp_degree))
+        logger.info("pp rank: {}".format(self.pp_rank))
+        logger.info("pp group id: {}".format(self.pp_group_id))
+        logger.info("pp group endpoints: {}".format(self.pp_group_endpoints))
+        logger.info("pp ring id: {}".format(self.pp_ring_id))
+        logger.info("#####" * 6)
+        logger.info("pure dp group size: {}".format(self.dp_degree))
+        logger.info("pure dp rank: {}".format(self.dp_rank))
+        logger.info("pure dp group endpoints: {}".format(
+            self.dp_group_endpoints))
+        logger.info("pure dp ring id: {}".format(self.dp_ring_id))
+        logger.info("#####" * 6)

         return
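One practical effect of the switch to `logging.getLogger(__name__)` in this file: the messages can now be tuned by logger name from application code instead of by reconfiguring the root logger. A hedged usage sketch follows; the dotted name is inferred from the file path above and should be verified against the installed package.

    import logging

    # Name inferred from python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py;
    # with __name__-based loggers the module can be addressed directly.
    sharding_logger = logging.getLogger(
        "paddle.distributed.fleet.meta_optimizers.sharding_optimizer")
    sharding_logger.setLevel(logging.INFO)    # e.g. opt in to the topology dump above
    # sharding_logger.setLevel(logging.ERROR) # or silence it without touching other loggers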
python/paddle/distributed/fleet/utils/recompute.py

@@ -19,9 +19,12 @@ from paddle.fluid import framework
 import contextlib

 import logging
-logging.basicConfig(
-    format='%(asctime)s %(levelname)-8s %(message)s',
-    datefmt='%Y-%m-%d %H:%M:%S')
+logger = logging.getLogger(__name__)
+formatter = logging.Formatter(
+    fmt='%(asctime)s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
+ch = logging.StreamHandler()
+ch.setFormatter(formatter)
+logger.addHandler(ch)


 def detach_variable(inputs):

@@ -40,7 +43,7 @@ def detach_variable(inputs):
 def check_recompute_necessary(inputs):
     if not any(input_.stop_gradient == False for input_ in inputs
                if isinstance(input_, paddle.Tensor)):
-        logging.warn(
+        logger.warn(
             "[Recompute]: None of the inputs to current recompute block need grad, "
             "therefore there is NO need to recompute this block in backward !")
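Two side notes on this hunk, neither of which is part of the diff itself: `logger.warn` is a legacy alias of `logger.warning` in the standard library, kept with its original spelling here; and because the new module logger keeps the default `propagate = True` while also owning a `StreamHandler`, an application that configures the root logger may see this warning printed twice. A minimal sketch of one way to keep a single copy, under that assumption:

    import logging

    # Name inferred from python/paddle/distributed/fleet/utils/recompute.py.
    recompute_logger = logging.getLogger("paddle.distributed.fleet.utils.recompute")

    # The record is emitted once by the module's own StreamHandler; stopping
    # propagation prevents a second emission by any root-logger handlers.
    recompute_logger.propagate = False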
python/paddle/fluid/incubate/fleet/utils/utils.py

@@ -34,9 +34,12 @@ __all__ = [
     "graphviz"
 ]

-logging.basicConfig(
-    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+formatter = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(message)s')
+ch = logging.StreamHandler()
+ch.setFormatter(formatter)
+logger.addHandler(ch)

 persistable_vars_out_fn = "vars_persistable.log"
 all_vars_out_fn = "vars_all.log"
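A behavioral nuance of this hunk compared with the first two files (my reading, not stated in the diff): here the module logger gets an explicit `setLevel(logging.INFO)`, so its INFO records pass the level check on their own, whereas a logger left at NOTSET inherits the root logger's default WARNING threshold. A minimal, self-contained sketch of how the effective level is resolved (logger names are illustrative):

    import logging

    a = logging.getLogger("demo.without_level")   # NOTSET: inherits the root level
    b = logging.getLogger("demo.with_level")
    b.setLevel(logging.INFO)                      # explicit level, independent of root

    print(a.getEffectiveLevel())          # 30 (WARNING) under the default root setup
    print(b.getEffectiveLevel())          # 20 (INFO)
    print(a.isEnabledFor(logging.INFO))   # False: a.info(...) records are dropped
    print(b.isEnabledFor(logging.INFO))   # True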
python/paddle/utils/cpp_extension/extension_utils.py

@@ -32,9 +32,12 @@ from ...fluid import core
 from ...fluid.framework import OpProtoHolder
 from ...sysconfig import get_include, get_lib

-logging.basicConfig(
-    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger("utils.cpp_extension")
+logger.setLevel(logging.INFO)
+formatter = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(message)s')
+ch = logging.StreamHandler()
+ch.setFormatter(formatter)
+logger.addHandler(ch)

 OS_NAME = sys.platform
 IS_WINDOWS = OS_NAME.startswith('win')

@@ -1125,4 +1128,4 @@ def log_v(info, verbose=True):
     Print log information on stdout.
     """
     if verbose:
-        logging.info(info)
+        logger.info(info)
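Unlike the other three files, this one registers its logger under the literal name "utils.cpp_extension" rather than `__name__`, so it sits outside the "paddle" logger hierarchy and has to be fetched by that exact string. A hedged usage sketch, assuming the name shown in the diff:

    import logging

    # The logger name comes from the diff above, not from the module path, so a
    # "paddle"-level configuration will not reach it; address it explicitly.
    ext_logger = logging.getLogger("utils.cpp_extension")
    ext_logger.setLevel(logging.WARNING)  # e.g. mute the INFO output routed through log_v(...)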