Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
9d74cfd3
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
9d74cfd3
编写于
6月 17, 2020
作者:
W
wilfChen
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add load&save ckpt path for distribute training
上级
6e7a38ac
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
11 addition
and
9 deletion
+11
-9
model_zoo/bert/run_pretrain.py
model_zoo/bert/run_pretrain.py
+6
-5
model_zoo/bert/scripts/run_distribute_pretrain.sh
model_zoo/bert/scripts/run_distribute_pretrain.sh
+1
-1
model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh
model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh
+1
-1
model_zoo/bert/scripts/run_standalone_pretrain.sh
model_zoo/bert/scripts/run_standalone_pretrain.sh
+1
-1
model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh
model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh
+2
-1
未找到文件。
model_zoo/bert/run_pretrain.py
浏览文件 @
9d74cfd3
...
...
@@ -68,7 +68,8 @@ def run_pretrain():
parser
.
add_argument
(
"--do_shuffle"
,
type
=
str
,
default
=
"true"
,
help
=
"Enable shuffle for dataset, default is true."
)
parser
.
add_argument
(
"--enable_data_sink"
,
type
=
str
,
default
=
"true"
,
help
=
"Enable data sink, default is true."
)
parser
.
add_argument
(
"--data_sink_steps"
,
type
=
int
,
default
=
"1"
,
help
=
"Sink steps for each epoch, default is 1."
)
parser
.
add_argument
(
"--checkpoint_path"
,
type
=
str
,
default
=
""
,
help
=
"Checkpoint file path"
)
parser
.
add_argument
(
"--save_checkpoint_path"
,
type
=
str
,
default
=
""
,
help
=
"Save checkpoint path"
)
parser
.
add_argument
(
"--load_checkpoint_path"
,
type
=
str
,
default
=
""
,
help
=
"Load checkpoint file path"
)
parser
.
add_argument
(
"--save_checkpoint_steps"
,
type
=
int
,
default
=
1000
,
help
=
"Save checkpoint steps, "
"default is 1000."
)
parser
.
add_argument
(
"--train_steps"
,
type
=
int
,
default
=-
1
,
help
=
"Training Steps, default is -1, "
...
...
@@ -81,7 +82,7 @@ def run_pretrain():
context
.
set_context
(
mode
=
context
.
GRAPH_MODE
,
device_target
=
args_opt
.
device_target
,
device_id
=
args_opt
.
device_id
)
context
.
set_context
(
reserve_class_name_in_scope
=
False
)
ckpt_save_dir
=
args_opt
.
checkpoint_path
ckpt_save_dir
=
args_opt
.
save_
checkpoint_path
if
args_opt
.
distribute
==
"true"
:
if
args_opt
.
device_target
==
'Ascend'
:
D
.
init
(
'hccl'
)
...
...
@@ -91,7 +92,7 @@ def run_pretrain():
D
.
init
(
'nccl'
)
device_num
=
D
.
get_group_size
()
rank
=
D
.
get_rank
()
ckpt_save_dir
=
args_opt
.
checkpoint_path
+
'ckpt_'
+
str
(
rank
)
+
'/'
ckpt_save_dir
=
args_opt
.
save_
checkpoint_path
+
'ckpt_'
+
str
(
rank
)
+
'/'
context
.
reset_auto_parallel_context
()
context
.
set_auto_parallel_context
(
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
,
mirror_mean
=
True
,
...
...
@@ -150,8 +151,8 @@ def run_pretrain():
ckpoint_cb
=
ModelCheckpoint
(
prefix
=
'checkpoint_bert'
,
directory
=
ckpt_save_dir
,
config
=
config_ck
)
callback
.
append
(
ckpoint_cb
)
if
args_opt
.
checkpoint_path
:
param_dict
=
load_checkpoint
(
args_opt
.
checkpoint_path
)
if
args_opt
.
load_
checkpoint_path
:
param_dict
=
load_checkpoint
(
args_opt
.
load_
checkpoint_path
)
load_param_into_net
(
netwithloss
,
param_dict
)
if
args_opt
.
enable_lossscale
==
"true"
:
...
...
model_zoo/bert/scripts/run_distribute_pretrain.sh
浏览文件 @
9d74cfd3
...
...
@@ -64,7 +64,7 @@ do
--do_shuffle
=
"true"
\
--enable_data_sink
=
"true"
\
--data_sink_steps
=
100
\
--checkpoint_path
=
""
\
--
load_
checkpoint_path
=
""
\
--save_checkpoint_steps
=
10000
\
--save_checkpoint_num
=
1
\
--data_dir
=
$DATA_DIR
\
...
...
model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh
浏览文件 @
9d74cfd3
...
...
@@ -36,7 +36,7 @@ mpirun --allow-run-as-root -n $RANK_SIZE \
--do_shuffle
=
"true"
\
--enable_data_sink
=
"true"
\
--data_sink_steps
=
1
\
--checkpoint_path
=
""
\
--
load_
checkpoint_path
=
""
\
--save_checkpoint_steps
=
10000
\
--save_checkpoint_num
=
1
\
--data_dir
=
$DATA_DIR
\
...
...
model_zoo/bert/scripts/run_standalone_pretrain.sh
浏览文件 @
9d74cfd3
...
...
@@ -38,7 +38,7 @@ python run_pretrain.py \
--do_shuffle
=
"true"
\
--enable_data_sink
=
"true"
\
--data_sink_steps
=
1
\
--checkpoint_path
=
""
\
--
load_
checkpoint_path
=
""
\
--save_checkpoint_steps
=
10000
\
--save_checkpoint_num
=
1
\
--data_dir
=
$DATA_DIR
\
...
...
model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh
浏览文件 @
9d74cfd3
...
...
@@ -40,7 +40,8 @@ python run_pretrain.py \
--do_shuffle
=
"true"
\
--enable_data_sink
=
"true"
\
--data_sink_steps
=
1
\
--checkpoint_path
=
""
\
--load_checkpoint_path
=
""
\
--save_checkpoint_path
=
""
\
--save_checkpoint_steps
=
10000
\
--save_checkpoint_num
=
1
\
--data_dir
=
$DATA_DIR
\
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录