Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
821acdb3
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
821acdb3
编写于
5月 18, 2018
作者:
T
tangwei12
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
update op to trainer and pserver
上级
f688652f
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
72 addition
and
27 deletion
+72
-27
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+72
-27
未找到文件。
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
821acdb3
...
...
@@ -14,6 +14,7 @@
from
__future__
import
print_function
import
os
import
math
import
distributed_splitter
as
splitter
...
...
@@ -26,6 +27,10 @@ LOOKUP_TABLE_TYPE = "lookup_table"
LOOKUP_TABLE_GRAD_TYPE
=
"lookup_table_grad"
RPC_CLIENT_VAR_NAME
=
"RPC_CLIENT_VAR"
# for checkpoint
SUCCESS
=
"_SUCCESS"
SERIAL_VAR_NAME
=
"SERIAL_NUMBER"
class
VarBlock
:
def
__init__
(
self
,
varname
,
offset
,
size
):
...
...
@@ -153,7 +158,8 @@ class DistributeTranspiler:
pservers
=
"127.0.0.1:6174"
,
trainers
=
1
,
split_method
=
splitter
.
round_robin
,
sync_mode
=
True
):
sync_mode
=
True
,
checkpoint_dir
=
None
):
"""
Transpile the program to distributed data-parallelism programs.
The main_program will be transformed to use a remote parameter server
...
...
@@ -315,22 +321,22 @@ class DistributeTranspiler:
"sync_mode"
:
self
.
sync_mode
})
serial_var
=
program
.
global_block
().
create_var
(
name
=
"SERIAL_NUMBER"
,
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
if
checkpoint_dir
and
self
.
is_chief
:
program
.
global_block
().
create_var
(
name
=
SERIAL_VAR_NAME
,
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
save_vars
=
[]
for
var
in
self
.
origin_program
.
list_vars
():
if
self
.
is_persistable
(
var
):
save_vars
.
append
(
var
.
name
)
save_vars
=
[]
for
var
in
self
.
origin_program
.
list_vars
():
if
self
.
_
is_persistable
(
var
):
save_vars
.
append
(
var
.
name
)
program
.
global_block
().
append_op
(
type
=
"checkpoint_save"
,
inputs
=
{
"X"
:
save_vars
},
outputs
=
{
"Serial"
:
serial_var
},
attrs
=
{
"overwrite"
:
False
,
"dir"
:
"/workspace/ckpt/"
})
program
.
global_block
().
append_op
(
type
=
"checkpoint_save"
,
inputs
=
{
"X"
:
save_vars
},
attrs
=
{
"overwrite"
:
True
,
"dir"
:
checkpoint_dir
})
# step4: Concat the parameters splits together after recv.
for
varname
,
splited_var
in
param_var_mapping
.
iteritems
():
...
...
@@ -512,13 +518,6 @@ class DistributeTranspiler:
pserver_program
.
sync_with_cpp
()
return
pserver_program
def is_persistable(self, var):
    """Tell whether `var` is a persistable variable worth checkpointing.

    Variables whose descriptor type is FEED_MINIBATCH, FETCH_LIST or RAW
    are always rejected; for every other type the variable's own
    `persistable` flag is returned.
    """
    var_type = core.VarDesc.VarType
    skipped_types = (var_type.FEED_MINIBATCH, var_type.FETCH_LIST,
                     var_type.RAW)
    return False if var.desc.type() in skipped_types else var.persistable
def
get_train_startup_program
(
self
,
checkpoint_load_dir
=
None
):
"""
Get train startup program.
...
...
@@ -532,13 +531,16 @@ class DistributeTranspiler:
load_vars
=
[]
for
var
in
startup_prog
.
list_vars
():
if
self
.
is_persistable
(
var
):
if
self
.
_
is_persistable
(
var
):
load_vars
.
append
(
var
.
name
)
serial_number
=
self
.
_get_lastest_checkpoint_dir
(
checkpoint_load_dir
)
startup_prog
.
global_block
().
append_op
(
type
=
"checkpoint_load"
,
outputs
=
{
"Out"
:
load_vars
},
attrs
=
{
"dir"
:
checkpoint_load_dir
})
inputs
=
{
"X"
:
load_vars
},
attrs
=
{
"dir"
:
checkpoint_load_dir
,
"Serial"
:
serial_number
})
return
startup_prog
def
get_startup_program
(
self
,
...
...
@@ -599,16 +601,59 @@ class DistributeTranspiler:
attrs
=
op
.
attrs
)
for
var
in
new_outputs
.
values
():
load_vars
.
append
(
var
.
name
)
# add checkpoint op
# add checkpoint op
if
not
checkpoint_load_dir
:
return
s_prog
serial_number
=
self
.
_get_lastest_checkpoint_dir
(
checkpoint_load_dir
)
s_prog
.
global_block
().
append_op
(
type
=
"checkpoint_load"
,
inputs
=
{
"X"
:
load_vars
},
attrs
=
{
"dir"
:
checkpoint_load_dir
})
attrs
=
{
"dir"
:
checkpoint_load_dir
,
"Serial"
:
serial_number
})
return
s_prog
def _is_persistable(self, var):
    """Decide whether `var` takes part in checkpoint save/load.

    Feed, fetch and raw variables (FEED_MINIBATCH, FETCH_LIST, RAW) are
    never included. Any other variable is included exactly when its
    `persistable` attribute is set.
    """
    kinds = core.VarDesc.VarType
    excluded_kinds = (kinds.FEED_MINIBATCH, kinds.FETCH_LIST, kinds.RAW)

    # Guard clause: these types are skipped regardless of `persistable`.
    if var.desc.type() in excluded_kinds:
        return False

    return var.persistable
def
_get_lastest_checkpoint_dir
(
self
,
checkpoint_dir
):
"""
get the biggest number in checkpoint_dir, which has _SUCCESS
"""
if
not
checkpoint_dir
.
strip
():
return
""
def
has_success
(
checkpoint_dir
,
cur_dir
):
"""
is _SUCCESS in this dir
"""
if
not
os
.
path
.
isdir
(
cur_dir
):
return
-
1
try
:
int
(
cur_dir
)
except
ValueError
:
return
-
1
success_path
=
os
.
path
.
join
(
checkpoint_dir
,
cur_dir
,
SUCCESS
)
if
os
.
path
.
isfile
(
success_path
):
return
int
(
cur_dir
)
current_dir
=
0
dirs
=
os
.
listdir
(
checkpoint_dir
)
for
cur_dir
in
dirs
:
success_num
=
has_success
(
checkpoint_dir
,
cur_dir
)
if
success_num
>
current_dir
:
current_dir
=
success_num
return
str
(
current_dir
)
# transpiler function for dis lookup_table
def
_replace_lookup_table_op_with_prefetch
(
self
,
program
,
rpc_client_var
,
eplist
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录