Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
f8e1f452
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
f8e1f452
编写于
3月 18, 2021
作者:
A
An Improved PeleeNet Algorithm with Feature Pyramid Networks for Image Detection
提交者:
GitHub
3月 18, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
ascend_communicate (#31708)
上级
faf40da5
变更
6
隐藏空白更改
内联
并排
Showing
6 changed files
with
176 additions
and
109 deletions
+176
-109
python/paddle/distributed/fleet/meta_optimizers/common.py
python/paddle/distributed/fleet/meta_optimizers/common.py
+44
-27
python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
...e/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+44
-27
python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
...e/distributed/fleet/meta_optimizers/sharding_optimizer.py
+1
-1
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+1
-1
python/paddle/fluid/transpiler/collective.py
python/paddle/fluid/transpiler/collective.py
+44
-27
python/paddle/hapi/model.py
python/paddle/hapi/model.py
+42
-26
未找到文件。
python/paddle/distributed/fleet/meta_optimizers/common.py
浏览文件 @
f8e1f452
...
...
@@ -13,6 +13,7 @@
# limitations under the License.
from
__future__
import
print_function
import
os
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
,
unique_name
...
...
@@ -70,34 +71,50 @@ class CollectiveHelper(object):
nranks
=
len
(
endpoints
)
other_endpoints
=
endpoints
[:]
other_endpoints
.
remove
(
current_endpoint
)
if
rank
==
0
and
wait_port
:
wait_server_ready
(
other_endpoints
)
block
=
program
.
global_block
()
nccl_id_var
=
block
.
create_var
(
name
=
unique_name
.
generate
(
'nccl_id'
),
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
block
.
append_op
(
type
=
'c_gen_nccl_id'
,
inputs
=
{},
outputs
=
{
'Out'
:
nccl_id_var
},
attrs
=
{
'rank'
:
rank
,
'endpoint'
:
current_endpoint
,
'other_endpoints'
:
other_endpoints
,
OP_ROLE_KEY
:
OpRole
.
Forward
})
block
.
append_op
(
type
=
'c_comm_init'
,
inputs
=
{
'X'
:
nccl_id_var
},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
rank
,
'ring_id'
:
ring_id
,
OP_ROLE_KEY
:
OpRole
.
Forward
})
if
core
.
is_compiled_with_cuda
():
if
rank
==
0
and
wait_port
:
wait_server_ready
(
other_endpoints
)
nccl_id_var
=
block
.
create_var
(
name
=
unique_name
.
generate
(
'nccl_id'
),
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
block
.
append_op
(
type
=
'c_gen_nccl_id'
,
inputs
=
{},
outputs
=
{
'Out'
:
nccl_id_var
},
attrs
=
{
'rank'
:
rank
,
'endpoint'
:
current_endpoint
,
'other_endpoints'
:
other_endpoints
,
OP_ROLE_KEY
:
OpRole
.
Forward
})
block
.
append_op
(
type
=
'c_comm_init'
,
inputs
=
{
'X'
:
nccl_id_var
},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
rank
,
'ring_id'
:
ring_id
,
OP_ROLE_KEY
:
OpRole
.
Forward
})
elif
core
.
is_compiled_with_npu
():
endpoint_to_index_map
=
{
e
:
idx
for
idx
,
e
in
enumerate
(
endpoints
)
}
block
.
append_op
(
type
=
'c_comm_init_hcom'
,
inputs
=
{},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
rank
,
'ring_id'
:
ring_id
,
'device_id'
:
int
(
os
.
getenv
(
"FLAGS_selected_npus"
)),
'rank_ids'
:
[
endpoint_to_index_map
[
e
]
for
e
in
endpoints
],
OP_ROLE_KEY
:
OpRole
.
Forward
})
def
_wait
(
self
,
current_endpoint
,
endpoints
):
assert
(
self
.
wait_port
)
...
...
python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
浏览文件 @
f8e1f452
...
...
@@ -13,6 +13,7 @@
from
__future__
import
print_function
from
__future__
import
division
import
os
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
,
unique_name
...
...
@@ -78,34 +79,50 @@ class PipelineHelper(object):
nranks
=
len
(
endpoints
)
other_endpoints
=
endpoints
[:]
other_endpoints
.
remove
(
current_endpoint
)
if
rank
==
0
and
wait_port
:
wait_server_ready
(
other_endpoints
)
block
=
program
.
global_block
()
nccl_id_var
=
block
.
create_var
(
name
=
unique_name
.
generate
(
'nccl_id'
),
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
block
.
append_op
(
type
=
'c_gen_nccl_id'
,
inputs
=
{},
outputs
=
{
'Out'
:
nccl_id_var
},
attrs
=
{
'rank'
:
rank
,
'endpoint'
:
current_endpoint
,
'other_endpoints'
:
other_endpoints
,
OP_ROLE_KEY
:
OpRole
.
Forward
,
})
block
.
append_op
(
type
=
'c_comm_init'
,
inputs
=
{
'X'
:
nccl_id_var
},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
rank
,
'ring_id'
:
ring_id
,
OP_ROLE_KEY
:
OpRole
.
Forward
,
})
if
core
.
is_compiled_with_cuda
():
if
rank
==
0
and
wait_port
:
wait_server_ready
(
other_endpoints
)
nccl_id_var
=
block
.
create_var
(
name
=
unique_name
.
generate
(
'nccl_id'
),
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
block
.
append_op
(
type
=
'c_gen_nccl_id'
,
inputs
=
{},
outputs
=
{
'Out'
:
nccl_id_var
},
attrs
=
{
'rank'
:
rank
,
'endpoint'
:
current_endpoint
,
'other_endpoints'
:
other_endpoints
,
OP_ROLE_KEY
:
OpRole
.
Forward
,
})
block
.
append_op
(
type
=
'c_comm_init'
,
inputs
=
{
'X'
:
nccl_id_var
},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
rank
,
'ring_id'
:
ring_id
,
OP_ROLE_KEY
:
OpRole
.
Forward
,
})
elif
core
.
is_compiled_with_npu
():
endpoint_to_index_map
=
{
e
:
idx
for
idx
,
e
in
enumerate
(
endpoints
)
}
block
.
append_op
(
type
=
'c_comm_init_hcom'
,
inputs
=
{},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
rank
,
'ring_id'
:
ring_id
,
'device_id'
:
int
(
os
.
getenv
(
"FLAGS_selected_npus"
)),
'rank_ids'
:
[
endpoint_to_index_map
[
e
]
for
e
in
endpoints
],
OP_ROLE_KEY
:
OpRole
.
Forward
})
def
_broadcast_params
(
self
,
ring_id
):
block
=
self
.
startup_program
.
global_block
()
...
...
python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
浏览文件 @
f8e1f452
...
...
@@ -265,7 +265,7 @@ class ShardingOptimizer(MetaOptimizerBase):
for
idx
,
op
in
reversed
(
list
(
enumerate
(
block
.
ops
))):
if
op
.
type
in
[
"c_allreduce_sum"
,
"c_sync_comm_stream"
,
"c_calc_comm_stream"
,
"c_gen_nccl_id"
,
"c_comm_init"
"c_calc_comm_stream"
,
"c_gen_nccl_id"
,
"c_comm_init, c_comm_init_hcom"
]:
pass
elif
op
.
type
==
"conditional_block"
:
...
...
python/paddle/fluid/framework.py
浏览文件 @
f8e1f452
...
...
@@ -2053,7 +2053,7 @@ class Operator(object):
'feed'
,
'fetch'
,
'recurrent'
,
'go'
,
'rnn_memory_helper_grad'
,
'conditional_block'
,
'while'
,
'send'
,
'recv'
,
'listen_and_serv'
,
'fl_listen_and_serv'
,
'ncclInit'
,
'select'
,
'checkpoint_notify'
,
'gen_nccl_id'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_sync_calc_stream'
,
'gen_nccl_id'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_comm_init_hcom'
,
'c_sync_calc_stream'
,
'c_sync_comm_stream'
,
'queue_generator'
,
'dequeue'
,
'enqueue'
,
'heter_listen_and_serv'
}
...
...
python/paddle/fluid/transpiler/collective.py
浏览文件 @
f8e1f452
...
...
@@ -17,6 +17,7 @@ from __future__ import print_function
import
sys
import
math
from
functools
import
reduce
import
os
import
collections
import
six
...
...
@@ -101,34 +102,50 @@ class Collective(object):
nranks
=
len
(
endpoints
)
other_endpoints
=
endpoints
[:]
other_endpoints
.
remove
(
current_endpoint
)
if
rank
==
0
and
wait_port
:
wait_server_ready
(
other_endpoints
)
block
=
program
.
global_block
()
nccl_id_var
=
block
.
create_var
(
name
=
unique_name
.
generate
(
'nccl_id'
),
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
block
.
append_op
(
type
=
'c_gen_nccl_id'
,
inputs
=
{},
outputs
=
{
'Out'
:
nccl_id_var
},
attrs
=
{
'rank'
:
rank
,
'endpoint'
:
current_endpoint
,
'other_endpoints'
:
other_endpoints
,
self
.
op_role_key
:
OpRole
.
Forward
})
block
.
append_op
(
type
=
'c_comm_init'
,
inputs
=
{
'X'
:
nccl_id_var
},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
rank
,
'ring_id'
:
ring_id
,
self
.
op_role_key
:
OpRole
.
Forward
})
if
core
.
is_compiled_with_cuda
():
if
rank
==
0
and
wait_port
:
wait_server_ready
(
other_endpoints
)
nccl_id_var
=
block
.
create_var
(
name
=
unique_name
.
generate
(
'nccl_id'
),
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
block
.
append_op
(
type
=
'c_gen_nccl_id'
,
inputs
=
{},
outputs
=
{
'Out'
:
nccl_id_var
},
attrs
=
{
'rank'
:
rank
,
'endpoint'
:
current_endpoint
,
'other_endpoints'
:
other_endpoints
,
self
.
op_role_key
:
OpRole
.
Forward
})
block
.
append_op
(
type
=
'c_comm_init'
,
inputs
=
{
'X'
:
nccl_id_var
},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
rank
,
'ring_id'
:
ring_id
,
self
.
op_role_key
:
OpRole
.
Forward
})
elif
core
.
is_compiled_with_npu
():
endpoint_to_index_map
=
{
e
:
idx
for
idx
,
e
in
enumerate
(
endpoints
)
}
block
.
append_op
(
type
=
'c_comm_init_hcom'
,
inputs
=
{},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
rank
,
'ring_id'
:
ring_id
,
'device_id'
:
int
(
os
.
getenv
(
"FLAGS_selected_npus"
)),
'rank_ids'
:
[
endpoint_to_index_map
[
e
]
for
e
in
endpoints
],
self
.
op_role_key
:
OpRole
.
Forward
})
def
_broadcast_params
(
self
):
block
=
self
.
startup_program
.
global_block
()
...
...
python/paddle/hapi/model.py
浏览文件 @
f8e1f452
...
...
@@ -133,33 +133,49 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint,
return
other_endpoints
=
endpoints
[:]
other_endpoints
.
remove
(
current_endpoint
)
if
rank
==
0
and
wait_port
:
wait_server_ready
(
other_endpoints
)
block
=
program
.
global_block
()
nccl_id_var
=
block
.
create_var
(
name
=
fluid
.
unique_name
.
generate
(
'nccl_id'
),
persistable
=
True
,
type
=
fluid
.
core
.
VarDesc
.
VarType
.
RAW
)
block
.
append_op
(
type
=
'c_gen_nccl_id'
,
inputs
=
{},
outputs
=
{
'Out'
:
nccl_id_var
},
attrs
=
{
'rank'
:
rank
,
'endpoint'
:
current_endpoint
,
'other_endpoints'
:
other_endpoints
})
block
.
append_op
(
type
=
'c_comm_init'
,
inputs
=
{
'X'
:
nccl_id_var
},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
rank
,
'ring_id'
:
0
,
})
if
core
.
is_compiled_with_cuda
():
if
rank
==
0
and
wait_port
:
wait_server_ready
(
other_endpoints
)
nccl_id_var
=
block
.
create_var
(
name
=
fluid
.
unique_name
.
generate
(
'nccl_id'
),
persistable
=
True
,
type
=
fluid
.
core
.
VarDesc
.
VarType
.
RAW
)
block
.
append_op
(
type
=
'c_gen_nccl_id'
,
inputs
=
{},
outputs
=
{
'Out'
:
nccl_id_var
},
attrs
=
{
'rank'
:
rank
,
'endpoint'
:
current_endpoint
,
'other_endpoints'
:
other_endpoints
})
block
.
append_op
(
type
=
'c_comm_init'
,
inputs
=
{
'X'
:
nccl_id_var
},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
rank
,
'ring_id'
:
0
,
})
elif
core
.
is_compiled_with_npu
():
endpoint_to_index_map
=
{
e
:
idx
for
idx
,
e
in
enumerate
(
endpoints
)
}
block
.
append_op
(
type
=
'c_comm_init_hcom'
,
inputs
=
{},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
rank
,
'ring_id'
:
0
,
'device_id'
:
int
(
os
.
getenv
(
"FLAGS_selected_npus"
)),
'rank_ids'
:
[
endpoint_to_index_map
[
e
]
for
e
in
endpoints
],
})
def
prepare_distributed_context
(
place
=
None
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录