magicwindyyd/mindspore (fork of MindSpore/mindspore)

Commit 46c175a5
Authored on May 09, 2020 by wsc
Parent: 88215d00

Modify example scripts of bert model

Showing 4 changed files with 24 additions and 44 deletions.
example/bert_clue/run_distribute_pretrain.sh          +6  -3
example/bert_clue/run_pretrain.py                     +1  -3
example/bert_clue/run_standalone_pretrain.sh          +1  -1
tests/st/networks/models/bert/bert_tdt_lossscale.py   +16 -37
example/bert_clue/run_distribute_pretrain.sh

@@ -26,12 +26,16 @@ DATA_DIR=$3
 SCHEMA_DIR=$4
 export MINDSPORE_HCCL_CONFIG_PATH=$5
+export RANK_TABLE_FILE=$5
 export RANK_SIZE=$1

 for((i=0;i<RANK_SIZE;i++))
 do
-    export DEVICE_ID=$i
     start=`expr $i \* 12`
+    export DEVICE_ID=$i
+    export RANK_ID=$i
+    export DEPLOY_MODE=0
+    export GE_USE_STATIC_MEMORY=1
     end=`expr $start \+ 11`
     cmdopt=$start"-"$end

@@ -39,7 +43,6 @@ do
     mkdir ./LOG$i
     cp *.py ./LOG$i
     cd ./LOG$i || exit
-    export RANK_ID=$i
     echo "start training for rank $i, device $DEVICE_ID"
     env > env.log
     taskset -c $cmdopt python ../run_pretrain.py \

@@ -56,7 +59,7 @@ do
         --enable_data_sink="true" \
         --data_sink_steps=1 \
         --checkpoint_path="" \
-        --save_checkpoint_steps=1000 \
+        --save_checkpoint_steps=10000 \
         --save_checkpoint_num=1 \
         --data_dir=$DATA_DIR \
         --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
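For orientation, the updated script keeps its positional interface and now exports the rank-table path under both the old and the new variable names. Below is a hypothetical invocation with argument roles inferred from these hunks ($1 → RANK_SIZE, $3 → DATA_DIR, $4 → SCHEMA_DIR, $5 → rank-table JSON); $2 is not visible in this diff, so it is left as a placeholder rather than guessed, and the paths are illustrative only.

# Hypothetical invocation of the updated script (paths illustrative, $2 unknown):
#   $1=RANK_SIZE  $2=<not shown in this diff>  $3=DATA_DIR  $4=SCHEMA_DIR
#   $5=rank-table JSON (exported as MINDSPORE_HCCL_CONFIG_PATH and RANK_TABLE_FILE)
bash run_distribute_pretrain.sh 8 <ARG2> /path/to/data /path/to/schema /path/to/rank_table_8p.json

# Inside the loop, rank i is bound to device i and pinned to a 12-core slice:
#   start=i*12, end=start+11, cmdopt="$start-$end"
# e.g. i=2 gives: taskset -c 24-35 python ../run_pretrain.py ...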
example/bert_clue/run_pretrain.py

@@ -84,13 +84,11 @@ def run_pretrain():
     if args_opt.distribute == "true":
         device_num = args_opt.device_num
         context.reset_auto_parallel_context()
-        context.set_context(enable_hccl=True)
         context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
                                           device_num=device_num)
         D.init()
         rank = args_opt.device_id % device_num
     else:
-        context.set_context(enable_hccl=False)
         rank = 0
         device_num = 1

@@ -103,7 +101,7 @@ def run_pretrain():
         optimizer = Lamb(netwithloss.trainable_params(), decay_steps=ds.get_dataset_size() * ds.get_repeat_count(),
                          start_learning_rate=cfg.Lamb.start_learning_rate, end_learning_rate=cfg.Lamb.end_learning_rate,
                          power=cfg.Lamb.power, warmup_steps=cfg.Lamb.warmup_steps, weight_decay=cfg.Lamb.weight_decay,
-                         eps=cfg.Lamb.eps, decay_filter=cfg.Lamb.decay_filter)
+                         eps=cfg.Lamb.eps)
     elif cfg.optimizer == 'Momentum':
         optimizer = Momentum(netwithloss.trainable_params(), learning_rate=cfg.Momentum.learning_rate,
                              momentum=cfg.Momentum.momentum)
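Taken together, the two deletions in the first hunk leave the distributed branch of run_pretrain.py as sketched below. This is a reassembled view of the post-commit flow with explanatory comments added, not code from the commit; context, ParallelMode, D, and args_opt are defined earlier in the file, outside this diff.

# Post-commit shape of the distributed setup (excerpt; imports and argument
# parsing live at the top of run_pretrain.py and are untouched by this diff).
if args_opt.distribute == "true":
    device_num = args_opt.device_num
    context.reset_auto_parallel_context()
    # HCCL is no longer toggled here via context.set_context(enable_hccl=...)
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                      mirror_mean=True, device_num=device_num)
    D.init()                                # initialize collective communication
    rank = args_opt.device_id % device_num  # this process's rank within the group
else:
    rank = 0
    device_num = 1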
example/bert_clue/run_standalone_pretrain.sh

@@ -38,7 +38,7 @@ python run_pretrain.py \
    --enable_data_sink="true" \
    --data_sink_steps=1 \
    --checkpoint_path="" \
-   --save_checkpoint_steps=1000 \
+   --save_checkpoint_steps=10000 \
    --save_checkpoint_num=1 \
    --data_dir=$DATA_DIR \
    --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
tests/st/networks/models/bert/bert_tdt_lossscale.py

@@ -76,26 +76,6 @@ def get_config(version='base', batch_size=1):
             token_type_ids_from_dataset=True,
             dtype=mstype.float32,
             compute_type=mstype.float16)
-    elif version == 'large_mixed':
-        bert_config = BertConfig(
-            batch_size=batch_size,
-            seq_length=128,
-            vocab_size=21136,
-            hidden_size=1024,
-            num_hidden_layers=24,
-            num_attention_heads=16,
-            intermediate_size=4096,
-            hidden_act="gelu",
-            hidden_dropout_prob=0.0,
-            attention_probs_dropout_prob=0.0,
-            max_position_embeddings=512,
-            type_vocab_size=2,
-            initializer_range=0.02,
-            use_relative_positions=True,
-            input_mask_from_dataset=True,
-            token_type_ids_from_dataset=True,
-            dtype=mstype.float32,
-            compute_type=mstype.float32)
     else:
         bert_config = BertConfig(batch_size=batch_size)
     return bert_config

@@ -136,8 +116,8 @@ class ModelCallback(Callback):
     def step_end(self, run_context):
         cb_params = run_context.original_args()
         self.loss_list.append(cb_params.net_outputs[0].asnumpy()[0])
-        self.overflow_list.append(cb_params.net_outputs[1])
-        self.lossscale_list.append(cb_params.net_outputs[2])
+        self.overflow_list.append(cb_params.net_outputs[1].asnumpy())
+        self.lossscale_list.append(cb_params.net_outputs[2].asnumpy())
         print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs)))

 @pytest.mark.level0

@@ -157,7 +137,7 @@ def test_bert_tdt():
     netwithloss = BertNetworkWithLoss(config, True)
     optimizer = Momentum(netwithloss.trainable_params(), learning_rate=2e-5, momentum=0.9)
     scale_window = 3
-    scale_manager = DynamicLossScaleManager(2 ** 32, 2, scale_window)
+    scale_manager = DynamicLossScaleManager(2 ** 16, 2, scale_window)
     netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                      scale_update_cell=scale_manager.get_update_cell())
     netwithgrads.set_train(True)
     model = Model(netwithgrads)

@@ -182,22 +162,21 @@ def test_bert_tdt():
             param.default_input = weight_variable(value.asnumpy().shape)
     model.train(ds.get_repeat_count(), ds, callbacks=callback, dataset_sink_mode=False)

-    # assertion occurs while the loss_scale value is wrong
-    count = 0
-    for i in range(len(callback.overflow_list)):
-        if callback.overflow_list[i] == Tensor(True, mstype.bool_) and i > 0:
-            count = 0
-            assert callback.lossscale_list[i] == callback.lossscale_list[i - 1] * Tensor(0.5, mstype.float32)
-        if callback.overflow_list[i] == Tensor(False, mstype.bool_):
-            count = count + 1
-            if count == scale_window:
-                count = 0
-                assert callback.lossscale_list[i] == callback.lossscale_list[i - 1] * Tensor(2.0, mstype.float32)
-
-    # assertion occurs while the loss value is wrong
+    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
     loss_value = np.array(callback.loss_list)
-    expect_value = [12.1918125, 11.966035, 11.972114, 11.982671, 11.976399, 12.616986, 12.180658, 12.850562, 12.415608, 12.640145]
+    expect_loss_value = [12.1918125, 11.966035, 11.972114, 11.982188, 11.974092, 12.610916, 12.17565, 12.840416, 12.40291, 12.621661]
     print("loss value: {}".format(loss_value))
-    assert np.allclose(loss_value, expect_value, 0.00001, 0.00001)
+    assert np.allclose(loss_value, expect_loss_value, 0.00001, 0.00001)
+
+    overflow = np.array(callback.overflow_list)
+    expect_overflow = [True, True, False, False, False, True, False, False, False, True]
+    print("overflow: {}".format(overflow))
+    assert (overflow == expect_overflow).all()
+
+    loss_scale = np.array(callback.lossscale_list)
+    expect_loss_scale = [32768.0, 16384.0, 16384.0, 16384.0, 32768.0, 16384.0, 16384.0, 16384.0, 32768.0, 16384.0]
+    print("loss scale: {}".format(loss_scale))
+    assert np.allclose(loss_scale, expect_loss_scale, 0.00001, 0.00001)

 if __name__ == '__main__':
     test_bert_tdt()
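The rewritten test asserts the recorded loss values, overflow flags, and loss-scale values against fixed expectations instead of re-deriving the scaling rule in a loop. As a sanity check, expect_loss_scale follows mechanically from expect_overflow under the policy the test configures (initial scale 2**16, factor 2, scale_window 3): halve on overflow, double after three consecutive clean steps. The plain-Python re-derivation below is illustrative only and not part of the commit.

# Re-derive expect_loss_scale from expect_overflow using the dynamic
# loss-scale rule exercised by the test: start at 2**16, halve on each
# overflow, double after scale_window consecutive steps without overflow.
expect_overflow = [True, True, False, False, False, True, False, False, False, True]
expect_loss_scale = [32768.0, 16384.0, 16384.0, 16384.0, 32768.0,
                     16384.0, 16384.0, 16384.0, 32768.0, 16384.0]

scale, scale_window, good_steps, derived = float(2 ** 16), 3, 0, []
for overflow in expect_overflow:
    if overflow:
        scale /= 2           # overflow: halve the scale and reset the window
        good_steps = 0
    else:
        good_steps += 1
        if good_steps == scale_window:
            scale *= 2       # three clean steps in a row: double the scale
            good_steps = 0
    derived.append(scale)    # the callback records the scale after the update

assert derived == expect_loss_scale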