Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle, in sync with upstream)
Commit 291e1594
Authored Sep 25, 2020 by chengmo
revert performance code
Parent: d3cda7f7
Showing 7 changed files with 33 additions and 78 deletions (+33 -78)
paddle/fluid/framework/distributed_strategy.proto (+0 -1)
paddle/fluid/operators/distributed/parameter_recv.cc (+7 -7)
python/paddle/distributed/fleet/base/distributed_strategy.py (+13 -13)
python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py (+0 -4)
python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py (+0 -13)
python/paddle/distributed/fleet/runtime/parameter_server_runtime.py (+11 -16)
python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py (+2 -24)
paddle/fluid/framework/distributed_strategy.proto

@@ -97,7 +97,6 @@ message AsyncConfig {
   optional int32 thread_pool_size = 6 [ default = 1 ];
   optional int32 send_wait_times = 7 [ default = 1 ];
   optional bool runtime_split_send_recv = 8 [ default = false ];
-  optional string heter_worker_device = 9 [ default = 'cpu' ];
 }

 message PipelineConfig {
   optional int32 micro_batch = 1 [ default = 1 ];
 }
paddle/fluid/operators/distributed/parameter_recv.cc

@@ -112,6 +112,10 @@ void RecvSelectedRows(const CommContext &rpc_ctx,

 template <typename T>
 void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto cpu_place = platform::CPUPlace();
+  auto &cpu_ctx = *pool.Get(cpu_place);
+
   distributed::RPCClient *rpc_client =
       distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);

@@ -121,14 +125,10 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
   if (rpc_ctx.origin_varnames.size() == 1 &&
       rpc_ctx.splited_varnames.size() == 1) {
     auto varname = rpc_ctx.origin_varnames[0];
-    const auto place =
-        scope.FindVar(varname)->Get<framework::LoDTensor>().place();
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &ctx = *pool.Get(place);
-    VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0]
-            << " in gpu? " << platform::is_gpu_place(place);
-    rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx,
+    VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0];
+    rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], cpu_ctx,
                                                     scope, varname, varname));

     for (size_t i = 0; i < rets.size(); i++) {
       PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U,
python/paddle/distributed/fleet/base/distributed_strategy.py

@@ -259,7 +259,7 @@ class DistributedStrategy(object):
     def a_sync(self, flag):
         if isinstance(flag, bool):
             self.strategy.a_sync = flag
-            self.a_sync_configs = {"k_steps": 0, "worker_device": 'cpu'}
+            self.a_sync_configs = {"k_steps": 0}
         else:
             raise ValueError(
                 "The type of `flag` is invalid, expected type is bool, but received %s".
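After this change the a_sync property setter initializes a_sync_configs to {"k_steps": 0} only, without the "worker_device" entry added by the performance code. A minimal sketch of configuring async training under the reverted behavior, assuming the paddle.distributed.fleet API on this branch; the surrounding training setup is omitted:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    # Turning a_sync on resets a_sync_configs to {"k_steps": 0}; the
    # "worker_device" key introduced by the performance change no longer exists.
    strategy.a_sync = True
    strategy.a_sync_configs = {"k_steps": 0}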
python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py

@@ -31,10 +31,6 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer):
         if k_steps < 0:
             return False

-        device = self.user_defined_strategy.a_sync_configs["worker_device"]
-        if device.upper() != 'CPU':
-            return False
-
         if self.role_maker._is_server():
             return False
python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py

@@ -13,7 +13,6 @@
 from paddle import fluid
 from .meta_optimizer_base import MetaOptimizerBase
 from ..base.private_helper_function import wait_server_ready
 from paddle.fluid import core
 import subprocess
 import re

@@ -75,8 +74,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):
             _startup = worker.delet_extra_optimizes_pass(_startup,
                                                          compiled_config)
-            compiled_config.set_origin_ps_main_program(_main)
-            compiled_config.set_origin_ps_startup_program(_startup)

             # for heter program
             if self.role_maker._is_heter_parameter_server_mode:
                 from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker

@@ -94,16 +91,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):
         else:
             _main = worker.append_send_ops_pass(_main, compiled_config)
             _startup = _startup

-        compiled_config.set_origin_ps_main_program(_main)
-        compiled_config.set_origin_ps_startup_program(_startup)
-
-        # for trainer wait server ready
-        wait_server_ready(self.role_maker._get_pserver_endpoints())
-
-        # for ps-heter mode, wait heter worker ready
-        if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
-        ):
-            wait_server_ready(self.role_maker._get_heter_worker_endpoints())

         return _main, _startup
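With the wait_server_ready calls reverted out of the trainer-program build path, the optimizer no longer blocks until parameter servers or heter workers are reachable. A minimal sketch of performing that wait explicitly, assuming the private_helper_function helper imported at the top of this file; role_maker here is illustrative and stands for the fleet role maker used elsewhere in the module:

    from paddle.distributed.fleet.base.private_helper_function import wait_server_ready

    # Block until every parameter-server endpoint accepts connections.
    wait_server_ready(role_maker._get_pserver_endpoints())

    # In ps-heter mode, workers can additionally wait for the heter workers.
    if role_maker._is_heter_parameter_server_mode and role_maker._is_worker():
        wait_server_ready(role_maker._get_heter_worker_endpoints())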
python/paddle/distributed/fleet/runtime/parameter_server_runtime.py

@@ -198,21 +198,16 @@ class ParameterServerRuntime(RuntimeBase):
             warnings.warn("communicator has been initialized, skip")

     def _get_executor(self):
         if self.role_maker._is_heter_worker():
-            heter_worker_device = self.context["valid_strategy"].a_sync_configs[
-                "heter_worker_device"].upper()
-            if heter_worker_device == "GPU":
+            if self.role_maker._get_heter_worker_device() == "GPU":
                 gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
                 executor = Executor(fluid.CUDAPlace(gpu_id))
-            elif heter_worker_device == "XPU":
+            elif self.role_maker._get_heter_worker_device() == "XPU":
                 xpu_id = int(os.getenv("FLAGS_selected_xpus", "0"))
                 executor = Executor(fluid.XPUPlace(xpu_id))
-            elif heter_worker_device == "CPU":
-                fluid.Executor(fluid.CPUPlace())
             else:
-                raise ValueError("Heter Worker Not Support Device {}".format(
-                    heter_worker_device))
+                raise ValueError("Not Support Device {}".format(
+                    self.role_maker._get_heter_worker_device()))
         else:
             executor = fluid.Executor(fluid.CPUPlace())
         return executor

@@ -463,13 +458,13 @@ class ParameterServerRuntime(RuntimeBase):
     def _save_distributed_persistables(self, executor, dirname, main_program):
         dense_ctx = self.compiled_strategy.get_communicator_recv_context(
-            recv_type=1, use_origin_program=True)
+            recv_type=1)

         sparse_ctx = self.compiled_strategy.get_communicator_recv_context(
-            recv_type=2, use_origin_program=True)
+            recv_type=2)

         distributed_ctx = self.compiled_strategy.get_communicator_recv_context(
-            recv_type=3, use_origin_program=True)
+            recv_type=3)

         recv_dense_varnames = self._save_dense_params(executor, dirname,
                                                       dense_ctx, main_program)

@@ -521,7 +516,7 @@ class ParameterServerRuntime(RuntimeBase):
             )

         if main_program is None:
-            main_program = self.compiled_strategy.get_origin_ps_main_program()
+            main_program = fluid.default_main_program()

         if isinstance(main_program, CompiledProgram):
             raise TypeError(
python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py

@@ -133,8 +133,6 @@ class CompileTimeStrategy(object):
         self.origin_main_program = main_program
         self.origin_startup_program = startup_program
-        self.origin_ps_main_program = main_program
-        self.origin_ps_startup_program = startup_program

         self.strategy = strategy
         self.role_maker = role_maker

@@ -155,11 +153,6 @@ class CompileTimeStrategy(object):
         self._build_var_distributed()

-        # for heter-ps save variables
-        self.origin_merged_variables_pairs = list(self.merged_variables_pairs)
-        self.origin_merged_dense_pairs = list(self.merged_dense_pairs)
-        self.origin_merged_sparse_pairs = list(self.merged_sparse_pairs)
-
     def get_distributed_mode(self):
         trainer = self.strategy.get_trainer_runtime_config()
         return trainer.mode

@@ -221,18 +214,6 @@ class CompileTimeStrategy(object):
     def get_origin_startup_program(self):
         return self.origin_startup_program

-    def set_origin_ps_main_program(self, program):
-        self.origin_ps_main_program = program
-
-    def set_origin_ps_startup_program(self, program):
-        self.origin_ps_startup_program = program
-
-    def get_origin_ps_main_program(self):
-        return self.origin_ps_main_program
-
-    def get_origin_ps_startup_program(self):
-        return self.origin_ps_startup_program
-
     def get_sparse_varname_on_ps(self, is_distributed, endpoint=None):
         if not endpoint:
             endpoint = self.get_ps_endpoint()

@@ -397,9 +378,7 @@ class CompileTimeStrategy(object):
             send_ctx[name] = ctx
         return send_ctx

-    def get_communicator_recv_context(self, recv_type=1,
-                                      use_origin_program=False):
+    def get_communicator_recv_context(self, recv_type=1):
         # recv_type
         # 1 : DENSE 2. SPARSE 3. DISTRIBUTED 4. ALL
         distibuted_varnames = get_sparse_tablenames(self.origin_main_program,

@@ -413,8 +392,7 @@ class CompileTimeStrategy(object):
         sparse_recv_ctx = {}
         distributed_recv_ctx = {}

-        variables_pairs = self.merged_variables_pairs if not use_origin_program else self.origin_merged_variables_pairs
-        for merged in variables_pairs:
+        for merged in self.merged_variables_pairs:
             params = merged[0]
             if params.merged_var.name in sparse_varnames:
                 continue
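With the use_origin_program switch removed, callers always build recv contexts from the merged variable pairs; the recv_type codes follow the comment in the method (1: DENSE, 2: SPARSE, 3: DISTRIBUTED, 4: ALL). A minimal sketch of the simplified call sites, mirroring parameter_server_runtime.py above; compiled_strategy stands for a CompileTimeStrategy instance:

    # compiled_strategy: a CompileTimeStrategy built for the current program.
    dense_ctx = compiled_strategy.get_communicator_recv_context(recv_type=1)        # DENSE
    sparse_ctx = compiled_strategy.get_communicator_recv_context(recv_type=2)       # SPARSE
    distributed_ctx = compiled_strategy.get_communicator_recv_context(recv_type=3)  # DISTRIBUTED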