Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
7bb4a4e8
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7bb4a4e8
编写于
8月 31, 2020
作者:
S
seiriosPlus
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
rectification init_worker and exe.run startup program
上级
fef6f6f9
变更
11
隐藏空白更改
内联
并排
Showing
11 changed file
with
39 addition
and
39 deletion
+39
-39
paddle/fluid/operators/distributed_ops/recv_op.cc
paddle/fluid/operators/distributed_ops/recv_op.cc
+2
-9
python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
...dle/distributed/fleet/runtime/parameter_server_runtime.py
+3
-3
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
.../fleet/parameter_server/distribute_transpiler/__init__.py
+3
-1
python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
.../fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
+16
-16
python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
+1
-1
python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
+4
-2
python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
+3
-2
python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
+3
-2
python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
.../fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
+2
-1
python/paddle/fluid/tests/unittests/test_communicator_geo.py
python/paddle/fluid/tests/unittests/test_communicator_geo.py
+1
-1
python/paddle/fluid/tests/unittests/test_communicator_half_async.py
...dle/fluid/tests/unittests/test_communicator_half_async.py
+1
-1
未找到文件。
paddle/fluid/operators/distributed_ops/recv_op.cc
浏览文件 @
7bb4a4e8
...
...
@@ -37,12 +37,6 @@ class RecvOp : public framework::OperatorBase {
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
override
{
int
do_not_run
=
Attr
<
int
>
(
"do_not_run"
);
if
(
do_not_run
)
{
VLOG
(
3
)
<<
"recv do not run!"
;
return
;
}
std
::
vector
<
std
::
string
>
epmap
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"epmap"
);
std
::
vector
<
std
::
string
>
varnames
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"varnames"
);
...
...
@@ -63,11 +57,10 @@ class RecvOp : public framework::OperatorBase {
if
(
recv_varnames
.
size
()
>
0
)
{
auto
*
communicator
=
distributed
::
Communicator
::
GetInstance
();
if
(
communicator
=
=
nullptr
)
{
if
(
communicator
!
=
nullptr
)
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"
need run fleet.init_worker first
"
));
"
execute startup program must before fleet.init_worker
"
));
}
communicator
->
RecvNoBarrier
();
}
else
{
std
::
vector
<
distributed
::
VarHandlePtr
>
rets
;
if
(
with_barrier
)
{
...
...
python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
浏览文件 @
7bb4a4e8
...
...
@@ -216,12 +216,12 @@ class ParameterServerRuntime(RuntimeBase):
else
:
model_dirname
=
None
if
self
.
role_maker
.
_is_heter_worker
():
self
.
_init_worker
()
executor
=
self
.
_get_executor
()
executor
.
run
(
fluid
.
default_startup_program
())
if
self
.
role_maker
.
_is_heter_worker
():
self
.
_init_worker
()
if
self
.
role_maker
.
_is_heter_worker
():
return
...
...
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py
浏览文件 @
7bb4a4e8
...
...
@@ -191,12 +191,14 @@ class FleetTranspiler(Fleet):
self
.
_communicator
=
Communicator
(
trainer_config
.
mode
,
kwargs
,
trainer_config
.
get_communicator_flags
())
self
.
_communicator
.
init_with_ctx
(
send_ctx
,
recv_ctx
)
if
not
self
.
_communicator
.
is_running
():
self
.
_communicator
.
start
()
else
:
warnings
.
warn
(
"communicator has been initialized, skip"
)
raise
ValueError
(
"Communicator can only be inited once, please check"
)
def
init_worker
(
self
):
"""
...
...
python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
浏览文件 @
7bb4a4e8
...
...
@@ -222,22 +222,22 @@ def append_send_ops_pass(program, config):
def
init_from_server_pass
(
program
,
config
):
fetch_barrier_out
=
program
.
global_block
().
create_var
(
name
=
framework
.
generate_control_dev_var_name
())
#
#
recv_ctx = config.get_communicator_recv_context(recv_type=1)
#
recv_varnames = []
#
#
for name, ctxs in recv_ctx.items():
#
recv_varnames.extend(ctxs.origin_varnames())
#
#
program.global_block().append_op(
#
type="recv",
#
inputs={"X": []},
#
outputs={"Out": []},
#
attrs={
#
"recv_varnames": recv_varnames,
#
"trainer_id": config.get_role_id(),
#
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
#
})
recv_ctx
=
config
.
get_communicator_recv_context
(
recv_type
=
1
)
recv_varnames
=
[]
for
name
,
ctxs
in
recv_ctx
.
items
():
recv_varnames
.
extend
(
ctxs
.
origin_varnames
())
program
.
global_block
().
append_op
(
type
=
"recv"
,
inputs
=
{
"X"
:
[]},
outputs
=
{
"Out"
:
[]},
attrs
=
{
"recv_varnames"
:
recv_varnames
,
"trainer_id"
:
config
.
get_role_id
(),
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
})
program
.
global_block
().
append_op
(
type
=
"fetch_barrier"
,
...
...
python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py
浏览文件 @
7bb4a4e8
...
...
@@ -164,8 +164,8 @@ def train(args):
elif
fleet
.
is_worker
():
logger
.
info
(
"run trainer"
)
fleet
.
init_worker
()
exe
.
run
(
fleet
.
startup_program
)
fleet
.
init_worker
()
thread_num
=
2
filelist
=
[]
...
...
python/paddle/fluid/tests/unittests/dist_fleet_ctr.py
浏览文件 @
7bb4a4e8
...
...
@@ -161,8 +161,10 @@ class TestDistCTR2x2(FleetDistRunnerBase):
"""
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
fleet
.
init_worker
()
exe
.
run
(
fluid
.
default_startup_program
())
fleet
.
init_worker
()
batch_size
=
4
train_reader
=
paddle
.
batch
(
fake_ctr_reader
(),
batch_size
=
batch_size
)
self
.
reader
.
decorate_sample_list_generator
(
train_reader
)
...
...
@@ -201,8 +203,8 @@ class TestDistCTR2x2(FleetDistRunnerBase):
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
fleet
.
init_worker
()
exe
.
run
(
fluid
.
default_startup_program
())
fleet
.
init_worker
()
thread_num
=
2
batch_size
=
128
...
...
python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py
浏览文件 @
7bb4a4e8
...
...
@@ -60,8 +60,9 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2):
device_id
=
int
(
os
.
getenv
(
"FLAGS_selected_gpus"
,
"0"
))
place
=
fluid
.
CUDAPlace
(
device_id
)
exe
=
fluid
.
Executor
(
place
)
fleet
.
init_worker
()
exe
.
run
(
fleet
.
startup_program
)
fleet
.
init_worker
()
batch_size
=
4
train_reader
=
paddle
.
batch
(
fake_ctr_reader
(),
batch_size
=
batch_size
)
...
...
@@ -104,8 +105,8 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2):
place
=
fluid
.
CUDAPlace
(
device_id
)
exe
=
fluid
.
Executor
(
place
)
fleet
.
init_worker
()
exe
.
run
(
fleet
.
startup_program
)
fleet
.
init_worker
()
thread_num
=
2
batch_size
=
128
...
...
python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py
浏览文件 @
7bb4a4e8
...
...
@@ -150,8 +150,9 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
"""
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
fleet
.
init_worker
()
exe
.
run
(
fluid
.
default_startup_program
())
fleet
.
init_worker
()
batch_size
=
4
train_reader
=
paddle
.
batch
(
fake_ctr_reader
(),
batch_size
=
batch_size
)
self
.
reader
.
decorate_sample_list_generator
(
train_reader
)
...
...
@@ -174,8 +175,8 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
fleet
.
init_worker
()
exe
.
run
(
fluid
.
default_startup_program
())
fleet
.
init_worker
()
thread_num
=
1
batch_size
=
128
...
...
python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py
浏览文件 @
7bb4a4e8
...
...
@@ -151,8 +151,9 @@ class TestDistCTR2x2(FleetDistRunnerBase):
"""
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
fleet
.
init_worker
()
exe
.
run
(
fluid
.
default_startup_program
())
fleet
.
init_worker
()
batch_size
=
4
...
...
python/paddle/fluid/tests/unittests/test_communicator_geo.py
浏览文件 @
7bb4a4e8
...
...
@@ -81,8 +81,8 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase):
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
)
optimizer
.
minimize
(
avg_cost
)
fleet
.
init_worker
()
exe
.
run
(
fluid
.
default_startup_program
())
fleet
.
init_worker
()
train_reader
=
paddle
.
batch
(
self
.
fake_reader
(),
batch_size
=
24
)
feeder
=
fluid
.
DataFeeder
(
place
=
place
,
feed_list
=
[
x
,
z
,
y
])
...
...
python/paddle/fluid/tests/unittests/test_communicator_half_async.py
浏览文件 @
7bb4a4e8
...
...
@@ -69,8 +69,8 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase):
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
)
optimizer
.
minimize
(
avg_cost
)
fleet
.
init_worker
()
exe
.
run
(
fleet
.
startup_program
)
fleet
.
init_worker
()
train_reader
=
paddle
.
batch
(
self
.
fake_reader
(),
batch_size
=
24
)
feeder
=
fluid
.
DataFeeder
(
place
=
place
,
feed_list
=
[
x
,
y
])
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录