Commit 15d80f3f
Authored Sep 18, 2019 by ShawnXuan
Parent: 5f3653db

support bert
Showing 3 changed files with 688 additions and 0 deletions (+688, -0):

  bert_benchmark/bert.py              +324  -0
  bert_benchmark/pretrain.py          +145  -0
  bert_benchmark/run_pretraining.py   +219  -0
bert_benchmark/bert.py (new file, mode 100644)
import oneflow as flow
import oneflow.core.common.data_type_pb2 as data_type_util
import oneflow.core.operator.op_conf_pb2 as op_conf_util
import math


class BertBackbone(object):
    # BERT encoder: embedding lookup and postprocessing followed by a
    # stack of transformer layers.

    def __init__(self,
                 input_ids_blob,
                 input_mask_blob,
                 token_type_ids_blob,
                 vocab_size,
                 seq_length=512,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=16,
                 initializer_range=0.02):

        with flow.deprecated.variable_scope("bert"):
            with flow.deprecated.variable_scope("embeddings"):
                (self.embedding_output_, self.embedding_table_) = _EmbeddingLookup(
                    input_ids_blob=input_ids_blob,
                    vocab_size=vocab_size,
                    embedding_size=hidden_size,
                    initializer_range=initializer_range,
                    word_embedding_name="word_embeddings")
                self.embedding_output_ = _EmbeddingPostprocessor(
                    input_blob=self.embedding_output_,
                    seq_length=seq_length,
                    embedding_size=hidden_size,
                    use_token_type=True,
                    token_type_ids_blob=token_type_ids_blob,
                    token_type_vocab_size=type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=initializer_range,
                    max_position_embeddings=max_position_embeddings,
                    dropout_prob=hidden_dropout_prob)
            with flow.deprecated.variable_scope("encoder"):
                attention_mask_blob = _CreateAttentionMaskFromInputMask(
                    input_mask_blob,
                    from_seq_length=seq_length,
                    to_seq_length=seq_length)
                self.all_encoder_layers_ = _TransformerModel(
                    input_blob=self.embedding_output_,
                    attention_mask_blob=attention_mask_blob,
                    seq_length=seq_length,
                    hidden_size=hidden_size,
                    num_hidden_layers=num_hidden_layers,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    intermediate_act_fn=GetActivation(hidden_act),
                    hidden_dropout_prob=hidden_dropout_prob,
                    attention_probs_dropout_prob=attention_probs_dropout_prob,
                    initializer_range=initializer_range,
                    do_return_all_layers=False)
            self.sequence_output_ = self.all_encoder_layers_[-1]

    def embedding_output(self):
        return self.embedding_output_

    def all_encoder_layers(self):
        return self.all_encoder_layers_

    def sequence_output(self):
        return self.sequence_output_

    def embedding_table(self):
        return self.embedding_table_


def CreateInitializer(std):
    return flow.truncated_normal(std)


def _Gelu(in_blob):
    return flow.keras.activations.gelu(in_blob)


def _TransformerModel(input_blob,
                      attention_mask_blob,
                      seq_length,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=_Gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):

    assert hidden_size % num_attention_heads == 0
    attention_head_size = int(hidden_size / num_attention_heads)
    input_width = hidden_size
    # keep activations 2D ([batch * seq, hidden]) between layers to avoid reshapes
    prev_output_blob = flow.reshape(input_blob, (-1, input_width))
    all_layer_output_blobs = []
    for layer_idx in range(num_hidden_layers):
        with flow.deprecated.variable_scope("layer_%d" % layer_idx):
            layer_input_blob = prev_output_blob
            with flow.deprecated.variable_scope("attention"):
                with flow.deprecated.variable_scope("self"):
                    attention_output_blob = _AttentionLayer(
                        from_blob=layer_input_blob,
                        to_blob=layer_input_blob,
                        attention_mask_blob=attention_mask_blob,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                with flow.deprecated.variable_scope("output"):
                    attention_output_blob = _FullyConnected(
                        attention_output_blob,
                        input_size=num_attention_heads * attention_head_size,
                        units=hidden_size,
                        weight_initializer=CreateInitializer(initializer_range),
                        name='dense')
                    attention_output_blob = _Dropout(attention_output_blob,
                                                     hidden_dropout_prob)
                    # residual connection + layer norm
                    attention_output_blob = attention_output_blob + layer_input_blob
                    attention_output_blob = _LayerNorm(attention_output_blob, hidden_size)
            with flow.deprecated.variable_scope("intermediate"):
                # when the activation is a Python callable, run the dense layer
                # without a fused activation and apply the callable afterwards
                if callable(intermediate_act_fn):
                    act_fn = op_conf_util.kNone
                else:
                    act_fn = intermediate_act_fn
                intermediate_output_blob = _FullyConnected(
                    attention_output_blob,
                    input_size=num_attention_heads * attention_head_size,
                    units=intermediate_size,
                    activation=act_fn,
                    weight_initializer=CreateInitializer(initializer_range),
                    name='dense')
                if callable(intermediate_act_fn):
                    intermediate_output_blob = intermediate_act_fn(intermediate_output_blob)
            with flow.deprecated.variable_scope("output"):
                layer_output_blob = _FullyConnected(
                    intermediate_output_blob,
                    input_size=intermediate_size,
                    units=hidden_size,
                    weight_initializer=CreateInitializer(initializer_range),
                    name='dense')
                layer_output_blob = _Dropout(layer_output_blob, hidden_dropout_prob)
                layer_output_blob = layer_output_blob + attention_output_blob
                layer_output_blob = _LayerNorm(layer_output_blob, hidden_size)
                prev_output_blob = layer_output_blob
                all_layer_output_blobs.append(layer_output_blob)

    input_shape = (-1, seq_length, hidden_size)
    if do_return_all_layers:
        final_output_blobs = []
        for layer_output_blob in all_layer_output_blobs:
            final_output_blob = flow.reshape(layer_output_blob, input_shape)
            final_output_blobs.append(final_output_blob)
        return final_output_blobs
    else:
        final_output_blob = flow.reshape(prev_output_blob, input_shape)
        return [final_output_blob]


def _AttentionLayer(from_blob,
                    to_blob,
                    attention_mask_blob,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=op_conf_util.kNone,
                    key_act=op_conf_util.kNone,
                    value_act=op_conf_util.kNone,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):

    def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
        # [batch * seq, heads * width] -> [batch, heads, seq, width]
        output_blob = flow.reshape(
            input_blob, [-1, seq_length, num_attention_heads, width])
        output_blob = flow.transpose(output_blob, perm=[0, 2, 1, 3])
        return output_blob

    from_blob_2d = flow.reshape(from_blob, [-1, num_attention_heads * size_per_head])
    to_blob_2d = flow.reshape(to_blob, [-1, num_attention_heads * size_per_head])

    query_blob = _FullyConnected(
        from_blob_2d,
        input_size=num_attention_heads * size_per_head,
        units=num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        weight_initializer=CreateInitializer(initializer_range))

    key_blob = _FullyConnected(
        to_blob_2d,
        input_size=num_attention_heads * size_per_head,
        units=num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        weight_initializer=CreateInitializer(initializer_range))

    value_blob = _FullyConnected(
        to_blob_2d,
        input_size=num_attention_heads * size_per_head,
        units=num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        weight_initializer=CreateInitializer(initializer_range))

    query_blob = TransposeForScores(query_blob, num_attention_heads,
                                    from_seq_length, size_per_head)
    key_blob = TransposeForScores(key_blob, num_attention_heads,
                                  to_seq_length, size_per_head)

    # scaled dot-product attention scores: [batch, heads, from_seq, to_seq]
    attention_scores_blob = flow.matmul(query_blob, key_blob, transpose_b=True)
    attention_scores_blob = attention_scores_blob * (
        1.0 / math.sqrt(float(size_per_head)))

    attention_mask_blob = flow.reshape(
        attention_mask_blob, [-1, 1, from_seq_length, to_seq_length])
    attention_mask_blob = flow.cast(attention_mask_blob, dtype=flow.float)
    # additive mask: 0 for visible positions, -10000 for padded ones
    addr_blob = (attention_mask_blob - 1.0) * 10000.0

    attention_scores_blob = attention_scores_blob + addr_blob
    attention_probs_blob = flow.nn.softmax(attention_scores_blob)
    attention_probs_blob = _Dropout(attention_probs_blob,
                                    attention_probs_dropout_prob)

    value_blob = flow.reshape(
        value_blob, [-1, to_seq_length, num_attention_heads, size_per_head])
    value_blob = flow.transpose(value_blob, perm=[0, 2, 1, 3])
    context_blob = flow.matmul(attention_probs_blob, value_blob)
    context_blob = flow.transpose(context_blob, perm=[0, 2, 1, 3])

    if do_return_2d_tensor:
        context_blob = flow.reshape(
            context_blob, [-1, num_attention_heads * size_per_head])
    else:
        context_blob = flow.reshape(
            context_blob, [-1, from_seq_length, num_attention_heads * size_per_head])
    return context_blob


def _FullyConnected(input_blob, input_size, units, activation=None, name=None,
                    weight_initializer=None):
    weight_blob = flow.get_variable(
        name=name + '-weight',
        shape=[input_size, units],
        dtype=input_blob.dtype,
        initializer=weight_initializer)
    bias_blob = flow.get_variable(
        name=name + '-bias',
        shape=[units],
        dtype=input_blob.dtype,
        initializer=flow.constant_initializer(0.0))
    output_blob = flow.matmul(input_blob, weight_blob)
    output_blob = flow.nn.bias_add(output_blob, bias_blob)
    return output_blob


def _Dropout(input_blob, dropout_prob):
    if dropout_prob == 0.0:
        return input_blob
    return flow.nn.dropout(input_blob, rate=dropout_prob)


def _LayerNorm(input_blob, hidden_size):
    return flow.layers.layer_norm(input_blob, name='LayerNorm',
                                  begin_norm_axis=-1, begin_params_axis=-1)


def _CreateAttentionMaskFromInputMask(to_mask_blob, from_seq_length, to_seq_length):
    # broadcast the [batch, 1, to_seq] mask against a [from_seq, to_seq] zero
    # blob to obtain a [batch, from_seq, to_seq] attention mask
    output = flow.cast(to_mask_blob, dtype=flow.float)
    output = flow.reshape(output, [-1, 1, to_seq_length])
    zeros = flow.constant(0.0, dtype=flow.float,
                          shape=[from_seq_length, to_seq_length])
    output = zeros + output
    return output


def _EmbeddingPostprocessor(input_blob,
                            seq_length,
                            embedding_size,
                            use_token_type=False,
                            token_type_ids_blob=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    output = input_blob

    if use_token_type:
        assert token_type_ids_blob is not None
        token_type_table = flow.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, embedding_size],
            dtype=input_blob.dtype,
            initializer=CreateInitializer(initializer_range))
        token_type_embeddings = flow.gather(
            params=token_type_table, indices=token_type_ids_blob, axis=0)
        output = output + token_type_embeddings

    if use_position_embeddings:
        position_table = flow.get_variable(
            name=position_embedding_name,
            shape=[1, max_position_embeddings, embedding_size],
            dtype=input_blob.dtype,
            initializer=CreateInitializer(initializer_range))
        assert seq_length <= max_position_embeddings
        if seq_length != max_position_embeddings:
            position_table = flow.slice(position_table,
                                        begin=[None, 0, 0],
                                        size=[None, seq_length, -1])
        output = output + position_table

    output = _LayerNorm(output, embedding_size)
    output = _Dropout(output, dropout_prob)
    return output


def _EmbeddingLookup(input_ids_blob,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings"):
    embedding_table = flow.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        dtype=flow.float,
        initializer=CreateInitializer(initializer_range))
    output = flow.gather(params=embedding_table, indices=input_ids_blob, axis=0)
    # the table is returned as well so it can be reused as the LM output weights
    return output, embedding_table


def GetActivation(name):
    if name == 'linear':
        return None
    elif name == 'relu':
        return flow.keras.activations.relu
    elif name == 'tanh':
        return flow.keras.activations.tanh
    elif name == 'gelu':
        return flow.keras.activations.gelu
    else:
        raise Exception("unsupported activation")
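A note on the additive mask used in `_AttentionLayer` above: the 0/1 attention mask is rescaled as `(mask - 1.0) * 10000.0`, giving 0 for visible key positions and -10000 for padding, and is added to the raw attention scores so that padded positions receive essentially zero weight after the softmax. A minimal NumPy sketch of the same arithmetic (NumPy stands in for `flow` here; the toy shapes and values are made up for illustration):

import numpy as np

# toy scores for a single head: [from_seq=2, to_seq=3]
scores = np.array([[2.0, 1.0, 0.5],
                   [0.3, 0.7, 0.9]])
mask = np.array([1.0, 1.0, 0.0])        # last key position is padding

addr = (mask - 1.0) * 10000.0           # 0 where visible, -10000 where padded
masked_scores = scores + addr           # broadcasts across query positions

probs = np.exp(masked_scores)
probs /= probs.sum(axis=-1, keepdims=True)
print(probs)                            # last column is ~0: padding gets no attention

This is also why `_CreateAttentionMaskFromInputMask` only needs the to-side mask: adding the `[batch, 1, to_seq]` mask to a `[from_seq, to_seq]` zero blob broadcasts it across all query positions.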
bert_benchmark/pretrain.py (new file, mode 100644)
import oneflow as flow
import bert as bert_util
import oneflow.core.operator.op_conf_pb2 as op_conf_util


def PreTrain(input_ids_blob,
             input_mask_blob,
             token_type_ids_blob,
             masked_lm_positions_blob,
             masked_lm_ids_blob,
             masked_lm_weights_blob,
             next_sentence_label_blob,
             vocab_size,
             seq_length=512,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_act='gelu',
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             max_position_embeddings=512,
             type_vocab_size=16,
             max_predictions_per_seq=20,
             initializer_range=0.02):
    backbone = bert_util.BertBackbone(
        input_ids_blob=input_ids_blob,
        input_mask_blob=input_mask_blob,
        token_type_ids_blob=token_type_ids_blob,
        vocab_size=vocab_size,
        seq_length=seq_length,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        hidden_act=hidden_act,
        hidden_dropout_prob=hidden_dropout_prob,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        max_position_embeddings=max_position_embeddings,
        type_vocab_size=type_vocab_size,
        initializer_range=initializer_range)

    # masked-LM head reuses the word embedding table as output weights
    (lm_loss, _, _) = _AddMaskedLanguageModelLoss(
        input_blob=backbone.sequence_output(),
        output_weights_blob=backbone.embedding_table(),
        positions_blob=masked_lm_positions_blob,
        label_id_blob=masked_lm_ids_blob,
        label_weight_blob=masked_lm_weights_blob,
        seq_length=seq_length,
        hidden_size=hidden_size,
        vocab_size=vocab_size,
        max_predictions_per_seq=max_predictions_per_seq,
        hidden_act=bert_util.GetActivation(hidden_act),
        initializer_range=initializer_range)
    pooled_output = PooledOutput(backbone.sequence_output(), hidden_size,
                                 initializer_range)
    (ns_loss, _, _) = _AddNextSentenceOutput(
        input_blob=pooled_output,
        label_blob=next_sentence_label_blob,
        hidden_size=hidden_size,
        initializer_range=initializer_range)
    with flow.deprecated.variable_scope("cls-loss"):
        total_loss = lm_loss + ns_loss
    return total_loss


def PooledOutput(sequence_output, hidden_size, initializer_range):
    with flow.deprecated.variable_scope("bert-pooler"):
        # "pool" by taking the hidden state of the first ([CLS]) token
        first_token_tensor = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
        first_token_tensor = flow.reshape(first_token_tensor, [-1, hidden_size])
        pooled_output = bert_util._FullyConnected(
            first_token_tensor,
            input_size=hidden_size,
            units=hidden_size,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name='dense')
        pooled_output = flow.keras.activations.tanh(pooled_output)
    return pooled_output


def _AddMaskedLanguageModelLoss(input_blob,
                                output_weights_blob,
                                positions_blob,
                                label_id_blob,
                                label_weight_blob,
                                seq_length,
                                hidden_size,
                                vocab_size,
                                max_predictions_per_seq,
                                hidden_act,
                                initializer_range):

    with flow.deprecated.variable_scope("other"):
        # batch-average number of real masked predictions per sequence
        sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob, axis=[-1])
        ones = sum_label_weight_blob * 0.0 + 1.0
        sum_label_weight_blob = flow.math.reduce_sum(sum_label_weight_blob)
        batch_size = flow.math.reduce_sum(ones)
        sum_label_weight_blob = sum_label_weight_blob / batch_size
    with flow.deprecated.variable_scope("cls-predictions"):
        input_blob = _GatherIndexes(input_blob, positions_blob, seq_length,
                                    hidden_size)
        with flow.deprecated.variable_scope("transform"):
            if callable(hidden_act):
                act_fn = op_conf_util.kNone
            else:
                act_fn = hidden_act
            input_blob = bert_util._FullyConnected(
                input_blob,
                input_size=hidden_size,
                units=hidden_size,
                activation=act_fn,
                weight_initializer=bert_util.CreateInitializer(initializer_range),
                name='dense')
            if callable(hidden_act):
                input_blob = hidden_act(input_blob)
            input_blob = bert_util._LayerNorm(input_blob, hidden_size)
        output_bias = flow.get_variable(
            name="output_bias",
            shape=[vocab_size],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(1.0))
        logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias)
        label_id_blob = flow.reshape(label_id_blob, [-1])
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_id_blob)
        pre_example_loss = flow.reshape(pre_example_loss,
                                        [-1, max_predictions_per_seq])
        numerator = pre_example_loss * label_weight_blob
        with flow.deprecated.variable_scope("loss"):
            numerator = flow.math.reduce_sum(numerator, axis=[-1])
            denominator = sum_label_weight_blob + 1e-5
            loss = numerator / denominator
    return loss, pre_example_loss, logit_blob


def _GatherIndexes(sequence_blob, positions_blob, seq_length, hidden_size):
    # pick out the hidden states at the masked positions
    output = flow.gather(params=sequence_blob, indices=positions_blob,
                         axis=2, batch_dims=2)
    output = flow.reshape(output, [-1, hidden_size])
    return output


def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range):
    with flow.deprecated.variable_scope("cls-seq_relationship"):
        # binary classifier over the pooled [CLS] representation
        output_weight_blob = flow.get_variable(
            name="output_weights",
            shape=[2, hidden_size],
            dtype=input_blob.dtype,
            initializer=bert_util.CreateInitializer(initializer_range))
        output_bias_blob = flow.get_variable(
            name="output_bias",
            shape=[2],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(0.0))
        logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_blob)
        loss = pre_example_loss
    return loss, pre_example_loss, logit_blob
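The weighting in `_AddMaskedLanguageModelLoss` is easy to misread: each sequence's loss is summed over its real masked positions (where `label_weight` is 1.0) and divided by the batch-average number of masked positions rather than by `max_predictions_per_seq`, and `ones = sum_label_weight_blob * 0.0 + 1.0` is just a device-side way to materialize the dynamic batch size for that average. A small NumPy sketch of the same bookkeeping, mirroring the reductions above (toy values):

import numpy as np

pre_example_loss = np.array([[1.0, 2.0, 3.0, 4.0],
                             [2.0, 2.0, 2.0, 2.0]])   # [batch, max_predictions_per_seq]
label_weight = np.array([[1.0, 1.0, 1.0, 0.0],
                         [1.0, 1.0, 0.0, 0.0]])       # 1.0 marks a real masked position

per_seq_count = label_weight.sum(axis=-1)             # [3., 2.]
batch_size = (per_seq_count * 0.0 + 1.0).sum()        # 2.0, same trick as the code above
mean_count = per_seq_count.sum() / batch_size         # 2.5 masked positions per sequence

numerator = (pre_example_loss * label_weight).sum(axis=-1)   # [6., 4.]
loss = numerator / (mean_count + 1e-5)                       # ~[2.4, 1.6]
print(loss)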
bert_benchmark/run_pretraining.py (new file, mode 100644)
import os
import sys
import time
import argparse
import shutil
import numpy as np
from datetime import datetime

import oneflow as flow

from pretrain import PreTrain  # , Eval

_DATA_DIR = '/dataset/bert/of_wiki_seq_len_128'
_MODEL_LOAD = "/dataset/model_zoo/bert_new_snapshot/of_L-12_H-768_A-12_random_init"
_MODEL_SAVE_DIR = "./model_save-{}".format(
    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S")))
NODE_LIST = "192.168.1.15,192.168.1.16"

parser = argparse.ArgumentParser(description="flags for bert")

# resource
parser.add_argument("--device_num_per_node", type=int, default=1)
parser.add_argument("--node_num", type=int, default=1)
parser.add_argument("--node_list", type=str, default=NODE_LIST)

# train
parser.add_argument("--learning_rate", type=float, default=1e-4,
                    help="Learning rate")
parser.add_argument("--weight_l2", type=float, default=0.01,
                    help="weight l2 decay parameter")
parser.add_argument("--batch_size_per_device", type=int, default=24)
parser.add_argument("--iter_num", type=int, default=10,
                    help="total iterations to run")
parser.add_argument("--log_every_n_iter", type=int, default=1,
                    help="print loss every n iteration")
parser.add_argument("--train_dir", type=str, default=_DATA_DIR)
parser.add_argument("--data_part_num", type=int, default=32,
                    help="data part number in dataset")
parser.add_argument("--model_load_dir", type=str, default=_MODEL_LOAD)
parser.add_argument("--model_save_dir", type=str, default=_MODEL_SAVE_DIR)

# bert
parser.add_argument("--seq_length", type=int, default=512)
parser.add_argument("--max_predictions_per_seq", type=int, default=80)
parser.add_argument("--num_hidden_layers", type=int, default=24)
parser.add_argument("--num_attention_heads", type=int, default=16)
parser.add_argument("--max_position_embeddings", type=int, default=512)
parser.add_argument("--type_vocab_size", type=int, default=2)
parser.add_argument("--vocab_size", type=int, default=30522)
parser.add_argument("--attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--hidden_size_per_head", type=int, default=64)

args = parser.parse_args()


def _blob_conf(name, shape, dtype=flow.int32):
    return flow.data.BlobConf(name=name, shape=shape, dtype=dtype,
                              codec=flow.data.RawCodec())


def BertDecoder(data_dir, batch_size, data_part_num, seq_length,
                max_predictions_per_seq):
    blob_confs = []
    blob_confs.append(_blob_conf('input_ids', [seq_length]))
    blob_confs.append(_blob_conf('next_sentence_labels', [1]))
    blob_confs.append(_blob_conf('input_mask', [seq_length]))
    blob_confs.append(_blob_conf('segment_ids', [seq_length]))
    blob_confs.append(_blob_conf('masked_lm_ids', [max_predictions_per_seq]))
    blob_confs.append(_blob_conf('masked_lm_positions', [max_predictions_per_seq]))
    blob_confs.append(_blob_conf('masked_lm_weights', [max_predictions_per_seq],
                                 flow.float))
    return flow.data.decode_ofrecord(data_dir, blob_confs,
                                     batch_size=batch_size,
                                     name="decode",
                                     data_part_num=data_part_num)


def BuildPreTrainNet(batch_size,
                     data_part_num,
                     seq_length=128,
                     max_position_embeddings=512,
                     num_hidden_layers=12,
                     num_attention_heads=12,
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     vocab_size=30522,
                     type_vocab_size=2,
                     max_predictions_per_seq=20):
    hidden_size = 64 * num_attention_heads  # H = 64, size per head
    intermediate_size = hidden_size * 4

    decoders = BertDecoder(args.train_dir, batch_size, data_part_num,
                           seq_length, max_predictions_per_seq)

    # decoded blobs follow the blob_confs order above
    input_ids = decoders[0]
    next_sentence_labels = decoders[1]
    input_mask = decoders[2]
    token_type_ids = decoders[3]  # 'segment_ids'
    masked_lm_ids = decoders[4]
    masked_lm_positions = decoders[5]
    masked_lm_weights = decoders[6]

    return PreTrain(input_ids,
                    input_mask,
                    token_type_ids,
                    masked_lm_positions,
                    masked_lm_ids,
                    masked_lm_weights,
                    next_sentence_labels,
                    vocab_size,
                    seq_length=seq_length,
                    hidden_size=hidden_size,
                    num_hidden_layers=num_hidden_layers,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    hidden_act="gelu",
                    hidden_dropout_prob=hidden_dropout_prob,
                    attention_probs_dropout_prob=attention_probs_dropout_prob,
                    max_position_embeddings=max_position_embeddings,
                    type_vocab_size=type_vocab_size,
                    max_predictions_per_seq=max_predictions_per_seq,
                    initializer_range=0.02)


_BERT_MODEL_UPDATE_CONF = dict(
    learning_rate_decay=dict(
        polynomial_conf=dict(
            decay_batches=100000,
            end_learning_rate=0.0,
        )
    ),
    warmup_conf=dict(
        linear_conf=dict(
            warmup_batches=1000,
            start_multiplier=0,
        )
    ),
    clip_conf=dict(
        clip_by_global_norm=dict(
            clip_norm=1.0,
        )
    ),
    adam_conf=dict(
        epsilon=1e-6
    ),
)


@flow.function
def PretrainJob():
    total_device_num = args.node_num * args.device_num_per_node
    batch_size = total_device_num * args.batch_size_per_device

    flow.config.train.primary_lr(args.learning_rate)
    flow.config.train.model_update_conf(_BERT_MODEL_UPDATE_CONF)
    flow.config.train.weight_l2(args.weight_l2)

    loss = BuildPreTrainNet(
        batch_size,
        args.data_part_num,
        seq_length=args.seq_length,
        max_position_embeddings=args.max_position_embeddings,
        num_hidden_layers=args.num_hidden_layers,
        num_attention_heads=args.num_attention_heads,
        hidden_dropout_prob=args.hidden_dropout_prob,
        attention_probs_dropout_prob=args.attention_probs_dropout_prob,
        vocab_size=args.vocab_size,
        type_vocab_size=args.type_vocab_size,
        max_predictions_per_seq=args.max_predictions_per_seq)
    flow.losses.add_loss(loss)
    return loss


cur_step = 0


def AsyncGetCallback(result):
    global cur_step
    print('{:>12} {:>.10f} {:.2f}'.format(cur_step, result.mean(), time.time()))
    cur_step += 1


if __name__ == '__main__':
    for arg in vars(args):
        print('{} = {}'.format(arg, getattr(args, arg)))

    start_time = time.time()

    flow.config.gpu_device_num(args.device_num_per_node)
    flow.config.ctrl_port(9788)
    flow.config.data_port(9789)
    flow.config.default_data_type(flow.float)
    flow.config.enable_inplace(False)

    if args.node_num > 1:
        flow.config.ctrl_port(12138)
        nodes = []
        for n in args.node_list.strip().split(","):
            addr_dict = {}
            addr_dict["addr"] = n
            nodes.append(addr_dict)
        flow.config.machine(nodes)

    check_point = flow.train.CheckPoint()
    if args.model_load_dir != '':
        assert os.path.isdir(args.model_load_dir)
        check_point.load(args.model_load_dir)
        print('init model from {}'.format(args.model_load_dir))
    else:
        check_point.init()
        print('init model on demand')

    fmt_str = "{:>12} {:>12} {:>12.10f}"
    print('{:>12} {:14} {}'.format("step", "loss", "time"))
    train_start_time = time.time()
    step_time = []
    for step in range(args.iter_num):
        loss_mean = PretrainJob().get().mean()
        step_time.append(time.time())
        train_step_time = step_time[step] - step_time[step - 1]
        print(fmt_str.format(step, loss_mean, train_step_time))

        if args.model_save_dir != '':
            if not os.path.exists(args.model_save_dir):
                os.makedirs(args.model_save_dir)
            assert args.log_every_n_iter > 0
            if step % args.log_every_n_iter == 0:
                snapshot_save_path = os.path.join(args.model_save_dir,
                                                  'snapshot_%d' % (step + 1))
                check_point.save(snapshot_save_path)

    total_time = step_time[-1] - start_time
    train_time = step_time[-1] - train_start_time
    init_time = train_start_time - start_time
    mean_batch_time = (step_time[-1] - step_time[0]) / (args.iter_num - 1)
    total_batch_size = (args.node_num * args.device_num_per_node
                        * args.batch_size_per_device)
    throughput = total_batch_size / mean_batch_time
    print('total time', total_time)
    print('init time', init_time)
    # includes model init and first-batch compute time
    print('first loss time', step_time[0] - start_time)
    print('train time', train_time)
    print('last - first loss time', step_time[-1] - step_time[0])
    print('average batch time', mean_batch_time)
    print('samples/sec', throughput)
    print('destroy time', time.time() - step_time[-1])
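Two remarks on the script above. First, the flag defaults (`--seq_length=512`, `--max_predictions_per_seq=80`) do not match the `of_wiki_seq_len_128` dataset that `_DATA_DIR` points at, so runs against that data presumably need `--seq_length=128 --max_predictions_per_seq=20` passed explicitly. Second, `_BERT_MODEL_UPDATE_CONF` describes linear warmup over 1,000 batches followed by polynomial decay to zero over 100,000 batches; below is a rough sketch of the implied schedule, assuming the polynomial decay uses power 1.0 (linear) and that decay simply takes over once warmup ends (how OneFlow's runtime actually composes the two may differ, and the helper `lr_at` is ours, not an OneFlow API):

def lr_at(batch, base_lr=1e-4, warmup_batches=1000,
          decay_batches=100000, end_lr=0.0, start_multiplier=0.0):
    # Warmup: multiplier ramps linearly from start_multiplier to 1.
    if batch < warmup_batches:
        frac = batch / float(warmup_batches)
        return base_lr * (start_multiplier + (1.0 - start_multiplier) * frac)
    # Decay: power-1.0 polynomial, i.e. linear, from base_lr down to end_lr.
    step = min(batch, decay_batches)
    return end_lr + (base_lr - end_lr) * (1.0 - step / float(decay_batches))

for b in (0, 500, 1000, 50000, 100000):
    print(b, lr_at(b))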