magicwindyyd / mindspore (forked from MindSpore / mindspore)
Commit 96db04b4
Authored June 17, 2020 by yuchaojie

fix decoder loop for Transformer model

Parent: 6e7a38ac
Showing 2 changed files with 25 additions and 102 deletions:

model_zoo/Transformer/src/transformer_model.py  +19 -99
model_zoo/Transformer/train.py                  +6  -3
model_zoo/Transformer/src/transformer_model.py
@@ -781,95 +781,22 @@ class TransformerDecoder(nn.Cell):
         super(TransformerDecoder, self).__init__()
         self.num_hidden_layers = num_hidden_layers
-        # wait to be supported
-        # layers = []
-        # for _ in range(num_hidden_layers):
-        #     layer = DecoderCell(batch_size=batch_size,
-        #                         hidden_size=hidden_size,
-        #                         seq_length=seq_length,
-        #                         enc_seq_length=enc_seq_length,
-        #                         num_attention_heads=num_attention_heads,
-        #                         intermediate_size=intermediate_size,
-        #                         attention_probs_dropout_prob=attention_probs_dropout_prob,
-        #                         use_one_hot_embeddings=use_one_hot_embeddings,
-        #                         initializer_range=initializer_range,
-        #                         hidden_dropout_prob=hidden_dropout_prob,
-        #                         hidden_act=hidden_act,
-        #                         compute_type=compute_type)
-        #     layers.append(layer)
-        # self.layers = nn.CellList(layers)
-        self.layer0 = DecoderCell(batch_size=batch_size,
-                                  hidden_size=hidden_size,
-                                  seq_length=seq_length,
-                                  enc_seq_length=enc_seq_length,
-                                  num_attention_heads=num_attention_heads,
-                                  intermediate_size=intermediate_size,
-                                  attention_probs_dropout_prob=attention_probs_dropout_prob,
-                                  use_one_hot_embeddings=use_one_hot_embeddings,
-                                  initializer_range=initializer_range,
-                                  hidden_dropout_prob=hidden_dropout_prob,
-                                  hidden_act=hidden_act,
-                                  compute_type=compute_type)
-        self.layer1 = DecoderCell(batch_size=batch_size,
-                                  hidden_size=hidden_size,
-                                  seq_length=seq_length,
-                                  enc_seq_length=enc_seq_length,
-                                  num_attention_heads=num_attention_heads,
-                                  intermediate_size=intermediate_size,
-                                  attention_probs_dropout_prob=attention_probs_dropout_prob,
-                                  use_one_hot_embeddings=use_one_hot_embeddings,
-                                  initializer_range=initializer_range,
-                                  hidden_dropout_prob=hidden_dropout_prob,
-                                  hidden_act=hidden_act,
-                                  compute_type=compute_type)
-        self.layer2 = DecoderCell(batch_size=batch_size,
-                                  hidden_size=hidden_size,
-                                  seq_length=seq_length,
-                                  enc_seq_length=enc_seq_length,
-                                  num_attention_heads=num_attention_heads,
-                                  intermediate_size=intermediate_size,
-                                  attention_probs_dropout_prob=attention_probs_dropout_prob,
-                                  use_one_hot_embeddings=use_one_hot_embeddings,
-                                  initializer_range=initializer_range,
-                                  hidden_dropout_prob=hidden_dropout_prob,
-                                  hidden_act=hidden_act,
-                                  compute_type=compute_type)
-        self.layer3 = DecoderCell(batch_size=batch_size,
-                                  hidden_size=hidden_size,
-                                  seq_length=seq_length,
-                                  enc_seq_length=enc_seq_length,
-                                  num_attention_heads=num_attention_heads,
-                                  intermediate_size=intermediate_size,
-                                  attention_probs_dropout_prob=attention_probs_dropout_prob,
-                                  use_one_hot_embeddings=use_one_hot_embeddings,
-                                  initializer_range=initializer_range,
-                                  hidden_dropout_prob=hidden_dropout_prob,
-                                  hidden_act=hidden_act,
-                                  compute_type=compute_type)
-        self.layer4 = DecoderCell(batch_size=batch_size,
-                                  hidden_size=hidden_size,
-                                  seq_length=seq_length,
-                                  enc_seq_length=enc_seq_length,
-                                  num_attention_heads=num_attention_heads,
-                                  intermediate_size=intermediate_size,
-                                  attention_probs_dropout_prob=attention_probs_dropout_prob,
-                                  use_one_hot_embeddings=use_one_hot_embeddings,
-                                  initializer_range=initializer_range,
-                                  hidden_dropout_prob=hidden_dropout_prob,
-                                  hidden_act=hidden_act,
-                                  compute_type=compute_type)
-        self.layer5 = DecoderCell(batch_size=batch_size,
-                                  hidden_size=hidden_size,
-                                  seq_length=seq_length,
-                                  enc_seq_length=enc_seq_length,
-                                  num_attention_heads=num_attention_heads,
-                                  intermediate_size=intermediate_size,
-                                  attention_probs_dropout_prob=attention_probs_dropout_prob,
-                                  use_one_hot_embeddings=use_one_hot_embeddings,
-                                  initializer_range=initializer_range,
-                                  hidden_dropout_prob=hidden_dropout_prob,
-                                  hidden_act=hidden_act,
-                                  compute_type=compute_type)
+        layers = []
+        for _ in range(num_hidden_layers):
+            layer = DecoderCell(batch_size=batch_size,
+                                hidden_size=hidden_size,
+                                seq_length=seq_length,
+                                enc_seq_length=enc_seq_length,
+                                num_attention_heads=num_attention_heads,
+                                intermediate_size=intermediate_size,
+                                attention_probs_dropout_prob=attention_probs_dropout_prob,
+                                use_one_hot_embeddings=use_one_hot_embeddings,
+                                initializer_range=initializer_range,
+                                hidden_dropout_prob=hidden_dropout_prob,
+                                hidden_act=hidden_act,
+                                compute_type=compute_type)
+            layers.append(layer)
+        self.layers = nn.CellList(layers)
         self.layer_preprocess = LayerPreprocess(in_channels=hidden_size)
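In short, the hunk above replaces the six hand-written attributes layer0 through layer5 (and the commented-out loop that had been marked "wait to be supported") with a Python loop that collects DecoderCell instances into an nn.CellList. A minimal sketch of that construction pattern, using nn.Dense as an illustrative stand-in for DecoderCell so the snippet stays self-contained (not code from this repository):

import mindspore.nn as nn

# Build identical sub-cells in a loop, as the new __init__ does with DecoderCell.
layers = []
for _ in range(6):
    layers.append(nn.Dense(16, 16))  # nn.Dense is only a placeholder here
# nn.CellList holds the sub-cells; once assigned to a Cell attribute, their
# parameters are registered with the enclosing network.
blocks = nn.CellList(layers)
print(len(blocks))  # 6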
@@ -880,16 +807,9 @@ class TransformerDecoder(nn.Cell):
     def construct(self, input_tensor, attention_mask, enc_states, enc_attention_mask):
         prev_output = self.reshape(input_tensor, self.shape)
-        # wait to be supported
-        # for layer_module in self.layers:
-        #     layer_output = layer_module(prev_output, attention_mask, enc_states, enc_attention_mask)
-        #     prev_output = layer_output
-        prev_output = self.layer0(prev_output, attention_mask, enc_states, enc_attention_mask)
-        prev_output = self.layer1(prev_output, attention_mask, enc_states, enc_attention_mask)
-        prev_output = self.layer2(prev_output, attention_mask, enc_states, enc_attention_mask)
-        prev_output = self.layer3(prev_output, attention_mask, enc_states, enc_attention_mask)
-        prev_output = self.layer4(prev_output, attention_mask, enc_states, enc_attention_mask)
-        prev_output = self.layer5(prev_output, attention_mask, enc_states, enc_attention_mask)
+        for layer_module in self.layers:
+            layer_output = layer_module(prev_output, attention_mask, enc_states, enc_attention_mask)
+            prev_output = layer_output
         prev_output = self.layer_preprocess(prev_output)
         output = self.reshape(prev_output, self.out_shape)
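The construct side mirrors the change in __init__: the unrolled layer0 through layer5 calls become a single loop over self.layers. A small self-contained toy Cell showing the same build-and-iterate shape end to end, assuming made-up names and sizes rather than the real model configuration:

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor

class StackedBlocks(nn.Cell):
    """Toy analogue of TransformerDecoder: a CellList built in a loop and iterated in construct."""
    def __init__(self, hidden_size, num_layers):
        super(StackedBlocks, self).__init__()
        layers = []
        for _ in range(num_layers):
            layers.append(nn.Dense(hidden_size, hidden_size))  # stand-in for DecoderCell
        self.layers = nn.CellList(layers)

    def construct(self, x):
        # Same loop shape as the new TransformerDecoder.construct above.
        for layer_module in self.layers:
            x = layer_module(x)
        return x

net = StackedBlocks(hidden_size=8, num_layers=6)
out = net(Tensor(np.ones((2, 8), np.float32)))
print(out.asnumpy().shape)  # (2, 8)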
model_zoo/Transformer/train.py
@@ -16,6 +16,7 @@
 import time
 import argparse
+import random
 import numpy as np
 import mindspore.common.dtype as mstype
@@ -26,6 +27,7 @@ from mindspore.train.loss_scale_manager import DynamicLossScaleManager
 from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
 from mindspore.train.callback import Callback, TimeMonitor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
+import mindspore.dataset.engine as de
 import mindspore.communication.management as D
 from mindspore.train.parallel_utils import ParallelMode
 from mindspore import context
@@ -36,6 +38,10 @@ from src.config import cfg, transformer_net_cfg
 from src.dataset import create_transformer_dataset
 from src.lr_schedule import create_dynamic_lr
 
+random_seed = 1
+random.seed(random_seed)
+np.random.seed(random_seed)
+de.config.set_seed(random_seed)
 
 def get_ms_timestamp():
     t = time.time()
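For context on the placement: these seed calls now run at module import, before create_transformer_dataset builds the (shuffled) data pipeline inside run_transformer_train, so Python's random module, NumPy, and the MindSpore dataset engine are all seeded up front. A minimal sketch of the same idea in isolation; the seed value 1 is the one used in the diff, and the NumPy check at the end is only an illustrative sanity test:

import random
import numpy as np
import mindspore.dataset.engine as de

random_seed = 1
random.seed(random_seed)         # Python-level randomness
np.random.seed(random_seed)      # NumPy randomness
de.config.set_seed(random_seed)  # seed used by the dataset pipeline (e.g. shuffling)

# Re-seeding NumPy reproduces the same draws, which is the effect relied on here.
a = np.random.rand(3)
np.random.seed(random_seed)
assert np.allclose(a, np.random.rand(3))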
@@ -161,7 +167,4 @@ def run_transformer_train():
     model.train(repeat_count, dataset, callbacks=callbacks, dataset_sink_mode=(args.enable_data_sink == "true"))
 
 if __name__ == '__main__':
-    random_seed = 1
-    np.random.seed(random_seed)
     run_transformer_train()
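This last hunk is the counterpart of the seeding change above: with random, NumPy, and the dataset engine now seeded once at module import, the per-run np.random.seed call in the __main__ block is presumably redundant and is dropped.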