Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
2f4c039e
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2f4c039e
编写于
5月 14, 2018
作者:
T
tangwei12
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
rename, modify ckpt structure
上级
461d2fc0
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
24 addition
and
24 deletion
+24
-24
paddle/fluid/operators/checkpoint_save_op.cc
paddle/fluid/operators/checkpoint_save_op.cc
+11
-23
paddle/fluid/operators/checkpoint_save_op_test.cc
paddle/fluid/operators/checkpoint_save_op_test.cc
+1
-1
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+12
-0
未找到文件。
paddle/fluid/operators/checkpoint_save_op.cc
浏览文件 @
2f4c039e
...
...
@@ -68,19 +68,16 @@ class CheckpointSaveOp : public framework::OperatorBase {
private:
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
override
{
auto
filename
=
Attr
<
std
::
string
>
(
"file_path
"
);
auto
dir
=
Attr
<
std
::
string
>
(
"dir
"
);
auto
overwrite
=
Attr
<
bool
>
(
"overwrite"
);
bool
is_present
=
FileExists
(
filename
);
bool
is_present
=
FileExists
(
dir
);
if
(
is_present
&&
!
overwrite
)
{
PADDLE_THROW
(
"%s exists!, cannot save_combine to it when overwrite=false"
,
filename
,
overwrite
);
dir
,
overwrite
);
}
MkDirRecursively
(
DirName
(
filename
).
c_str
());
std
::
ofstream
fout
(
filename
);
PADDLE_ENFORCE
(
static_cast
<
bool
>
(
fout
),
"Cannot open %s to write"
,
filename
);
MkDirRecursively
(
dir
.
c_str
());
auto
inp_var_names
=
Inputs
(
"X"
);
PADDLE_ENFORCE_GT
(
static_cast
<
int
>
(
inp_var_names
.
size
()),
0
,
...
...
@@ -92,6 +89,10 @@ class CheckpointSaveOp : public framework::OperatorBase {
for
(
size_t
i
=
0
;
i
<
inp_var_names
.
size
();
i
++
)
{
auto
*
var
=
scope
.
FindVar
(
inp_var_names
[
i
]);
std
::
string
var_file
;
var_file
.
append
(
dir
);
var_file
.
append
(
"/"
);
var_file
.
append
(
inp_var_names
[
i
]);
PADDLE_ENFORCE
(
var
!=
nullptr
,
"Cannot find variable %s for save_combine_op"
,
...
...
@@ -103,23 +104,10 @@ class CheckpointSaveOp : public framework::OperatorBase {
auto
&
tensor
=
var
->
Get
<
framework
::
LoDTensor
>
();
// Serialize tensors one by one
// Check types to see if a fp16 transformation is required
auto
in_dtype
=
framework
::
ToDataType
(
tensor
.
type
());
auto
out_dtype
=
in_dtype
;
if
(
in_dtype
!=
out_dtype
)
{
auto
in_kernel_type
=
framework
::
OpKernelType
(
in_dtype
,
place
);
auto
out_kernel_type
=
framework
::
OpKernelType
(
out_dtype
,
place
);
framework
::
LoDTensor
out
;
// copy LoD info to the new tensor
out
.
set_lod
(
tensor
.
lod
());
framework
::
TransDataType
(
in_kernel_type
,
out_kernel_type
,
tensor
,
&
out
);
framework
::
SerializeToStream
(
fout
,
out
,
dev_ctx
);
}
else
{
framework
::
SerializeToStream
(
fout
,
tensor
,
dev_ctx
);
}
std
::
ofstream
fout
(
var_file
);
framework
::
SerializeToStream
(
fout
,
tensor
,
dev_ctx
);
fout
.
close
();
}
fout
.
close
();
}
};
...
...
paddle/fluid/operators/che
'ck'po'in'
t_save_op_test.cc
→
paddle/fluid/operators/che
ckpoin
t_save_op_test.cc
浏览文件 @
2f4c039e
...
...
@@ -38,7 +38,7 @@ TEST(CheckpointSaveOp, CPU) {
}
paddle
::
framework
::
AttributeMap
attrs
;
attrs
.
insert
({
"
file_path"
,
std
::
string
(
"tensor.save
"
)});
attrs
.
insert
({
"
dir"
,
std
::
string
(
"tensor/ckpt
"
)});
auto
save_op
=
paddle
::
framework
::
OpRegistry
::
CreateOp
(
"checkpoint_save"
,
{{
"X"
,
{
"test_var"
}}},
{},
attrs
);
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
2f4c039e
...
...
@@ -207,6 +207,11 @@ class DistributeTranspiler:
self
.
pserver_endpoints
=
pserver_endpoints
self
.
optimize_ops
,
params_grads
=
self
.
_get_optimize_pass
()
# is_chief (the no.0 trainer) for checkpoint
# the no.0 trainer will save all variables and its own reader offset to checkpoint
# other trainers will save only their own reader offsets to checkpoint
self
.
is_chief
=
trainer_id
==
0
# process lookup_table_op
# 1. check all lookup_table_op is distributed
# 2. check all lookup_table_op share the same table.
...
...
@@ -309,6 +314,13 @@ class DistributeTranspiler:
"epmap"
:
eplist
,
"sync_mode"
:
self
.
sync_mode
})
program
.
global_block
().
append_op
(
type
=
"checkpoint_save"
,
inputs
=
{
"X"
:
send_outputs
},
attrs
=
{
"overwrite"
:
True
,
"file_path"
:
"/workspace/ckpt/"
})
# step4: Concat the parameters splits together after recv.
for
varname
,
splited_var
in
param_var_mapping
.
iteritems
():
if
len
(
splited_var
)
<=
1
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录