Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
正统之独孤求败
mindspore
提交
9ce9c215
M
mindspore
项目概览
正统之独孤求败
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
9ce9c215
编写于
6月 16, 2020
作者:
W
wilfChen
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
gpu bert script
上级
5aeba82a
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
114 addition
and
6 deletion
+114
-6
model_zoo/bert/run_pretrain.py
model_zoo/bert/run_pretrain.py
+22
-5
model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh
model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh
+44
-0
model_zoo/bert/scripts/run_standalone_pretrain.sh
model_zoo/bert/scripts/run_standalone_pretrain.sh
+1
-1
model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh
model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh
+47
-0
未找到文件。
model_zoo/bert/run_pretrain.py
浏览文件 @
9ce9c215
...
...
@@ -21,6 +21,7 @@ import os
import
argparse
import
numpy
import
mindspore.communication.management
as
D
import
mindspore.common.dtype
as
mstype
from
mindspore
import
context
from
mindspore.train.model
import
Model
from
mindspore.train.parallel_utils
import
ParallelMode
...
...
@@ -28,6 +29,7 @@ from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
from
mindspore.train.callback
import
Callback
,
ModelCheckpoint
,
CheckpointConfig
,
TimeMonitor
from
mindspore.train.serialization
import
load_checkpoint
,
load_param_into_net
from
mindspore.nn.optim
import
Lamb
,
Momentum
,
AdamWeightDecayDynamicLR
from
mindspore
import
log
as
logger
from
src
import
BertNetworkWithLoss
,
BertTrainOneStepCell
,
BertTrainOneStepWithLossScaleCell
from
src.dataset
import
create_bert_dataset
from
src.config
import
cfg
,
bert_net_cfg
...
...
@@ -55,6 +57,8 @@ class LossCallBack(Callback):
def
run_pretrain
():
"""pre-train bert_clue"""
parser
=
argparse
.
ArgumentParser
(
description
=
'bert pre_training'
)
parser
.
add_argument
(
'--device_target'
,
type
=
str
,
default
=
'Ascend'
,
choices
=
[
'Ascend'
,
'GPU'
],
help
=
'device where the code will be implemented. (Default: Ascend)'
)
parser
.
add_argument
(
"--distribute"
,
type
=
str
,
default
=
"false"
,
help
=
"Run distribute, default is false."
)
parser
.
add_argument
(
"--epoch_size"
,
type
=
int
,
default
=
"1"
,
help
=
"Epoch size, default is 1."
)
parser
.
add_argument
(
"--device_id"
,
type
=
int
,
default
=
0
,
help
=
"Device id, default is 0."
)
...
...
@@ -74,11 +78,21 @@ def run_pretrain():
parser
.
add_argument
(
"--schema_dir"
,
type
=
str
,
default
=
""
,
help
=
"Schema path, it is better to use absolute path"
)
args_opt
=
parser
.
parse_args
()
context
.
set_context
(
mode
=
context
.
GRAPH_MODE
,
device_target
=
"Ascend"
,
device_id
=
args_opt
.
device_id
)
context
.
set_context
(
mode
=
context
.
GRAPH_MODE
,
device_target
=
args_opt
.
device_target
,
device_id
=
args_opt
.
device_id
)
context
.
set_context
(
reserve_class_name_in_scope
=
False
)
ckpt_save_dir
=
args_opt
.
checkpoint_path
if
args_opt
.
distribute
==
"true"
:
device_num
=
args_opt
.
device_num
if
args_opt
.
device_target
==
'Ascend'
:
D
.
init
(
'hccl'
)
device_num
=
args_opt
.
device_num
rank
=
args_opt
.
device_id
%
device_num
else
:
D
.
init
(
'nccl'
)
device_num
=
D
.
get_group_size
()
rank
=
D
.
get_rank
()
ckpt_save_dir
=
args_opt
.
checkpoint_path
+
'ckpt_'
+
str
(
rank
)
+
'/'
context
.
reset_auto_parallel_context
()
context
.
set_auto_parallel_context
(
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
,
mirror_mean
=
True
,
device_num
=
device_num
)
...
...
@@ -93,12 +107,15 @@ def run_pretrain():
auto_parallel_context
().
set_all_reduce_fusion_split_indices
([
30
,
90
,
150
,
210
,
270
,
330
,
390
,
421
])
else
:
auto_parallel_context
().
set_all_reduce_fusion_split_indices
([
38
,
93
,
148
,
203
,
258
,
313
,
368
,
397
])
D
.
init
()
rank
=
args_opt
.
device_id
%
device_num
else
:
rank
=
0
device_num
=
1
if
args_opt
.
device_target
==
'GPU'
and
bert_net_cfg
.
compute_type
!=
mstype
.
float32
:
logger
.
warning
(
'Gpu only support fp32 temporarily, run with fp32.'
)
bert_net_cfg
.
compute_type
=
mstype
.
float32
ds
,
new_repeat_count
=
create_bert_dataset
(
args_opt
.
epoch_size
,
device_num
,
rank
,
args_opt
.
do_shuffle
,
args_opt
.
enable_data_sink
,
args_opt
.
data_sink_steps
,
args_opt
.
data_dir
,
args_opt
.
schema_dir
)
...
...
@@ -130,7 +147,7 @@ def run_pretrain():
if
args_opt
.
enable_save_ckpt
==
"true"
:
config_ck
=
CheckpointConfig
(
save_checkpoint_steps
=
args_opt
.
save_checkpoint_steps
,
keep_checkpoint_max
=
args_opt
.
save_checkpoint_num
)
ckpoint_cb
=
ModelCheckpoint
(
prefix
=
'checkpoint_bert'
,
config
=
config_ck
)
ckpoint_cb
=
ModelCheckpoint
(
prefix
=
'checkpoint_bert'
,
directory
=
ckpt_save_dir
,
config
=
config_ck
)
callback
.
append
(
ckpoint_cb
)
if
args_opt
.
checkpoint_path
:
...
...
model_zoo/bert/scripts/run_distribute_pretrain_for_gpu.sh
0 → 100644
浏览文件 @
9ce9c215
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

# Launch multi-GPU BERT pre-training via mpirun (NCCL data-parallel).
# Usage: bash run_distribute_pretrain_for_gpu.sh RANK_SIZE EPOCH_SIZE DATA_DIR SCHEMA_DIR
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run_distribute_pretrain_for_gpu.sh RANK_SIZE EPOCH_SIZE DATA_DIR SCHEMA_DIR"
echo "for example: bash run_distribute_pretrain_for_gpu.sh 8 40 /path/zh-wiki/ /path/Schema.json"
echo "It is better to use absolute path."
echo "=============================================================================================================="

RANK_SIZE=$1
EPOCH_SIZE=$2
DATA_DIR=$3
SCHEMA_DIR=$4

# Quote all user-supplied values so paths containing spaces survive word splitting.
mpirun --allow-run-as-root -n "$RANK_SIZE" \
    python run_pretrain.py              \
    --device_target="GPU"               \
    --distribute="true"                 \
    --epoch_size="$EPOCH_SIZE"          \
    --enable_save_ckpt="true"           \
    --enable_lossscale="false"          \
    --do_shuffle="true"                 \
    --enable_data_sink="true"           \
    --data_sink_steps=1                 \
    --checkpoint_path=""                \
    --save_checkpoint_steps=10000       \
    --save_checkpoint_num=1             \
    --data_dir="$DATA_DIR"              \
    --schema_dir="$SCHEMA_DIR" > log.txt 2>&1 &
model_zoo/bert/scripts/run_standalone_pretrain.sh
浏览文件 @
9ce9c215
...
...
@@ -37,7 +37,7 @@ python run_pretrain.py \
--enable_lossscale
=
"true"
\
--do_shuffle
=
"true"
\
--enable_data_sink
=
"true"
\
--data_sink_steps
=
1
00
\
--data_sink_steps
=
1
\
--checkpoint_path
=
""
\
--save_checkpoint_steps
=
10000
\
--save_checkpoint_num
=
1
\
...
...
model_zoo/bert/scripts/run_standalone_pretrain_for_gpu.sh
0 → 100644
浏览文件 @
9ce9c215
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

# Launch single-GPU BERT pre-training on the device selected by DEVICE_ID.
# Usage: bash run_standalone_pretrain_for_gpu.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run_standalone_pretrain_for_gpu.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR"
echo "for example: bash run_standalone_pretrain_for_gpu.sh 0 40 /path/zh-wiki/ /path/Schema.json"
echo "=============================================================================================================="

DEVICE_ID=$1
EPOCH_SIZE=$2
DATA_DIR=$3
SCHEMA_DIR=$4

# Pin the run to the requested GPU.
export CUDA_VISIBLE_DEVICES="$DEVICE_ID"

# Redirect MindSpore GLOG output into ./ms_log instead of stderr.
mkdir -p ms_log
CUR_DIR=$(pwd)
export GLOG_log_dir=${CUR_DIR}/ms_log
export GLOG_logtostderr=0

# Quote all user-supplied values so paths containing spaces survive word splitting.
python run_pretrain.py              \
    --device_target="GPU"           \
    --distribute="false"            \
    --epoch_size="$EPOCH_SIZE"      \
    --enable_save_ckpt="true"       \
    --enable_lossscale="false"      \
    --do_shuffle="true"             \
    --enable_data_sink="true"       \
    --data_sink_steps=1             \
    --checkpoint_path=""            \
    --save_checkpoint_steps=10000   \
    --save_checkpoint_num=1         \
    --data_dir="$DATA_DIR"          \
    --schema_dir="$SCHEMA_DIR" > log.txt 2>&1 &
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录