Commit 15d80f3f authored by ShawnXuan

support bert

Parent 5f3653db
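# bert.py -- BERT backbone model, written against the legacy OneFlow Python API
# (imported below as `bert_util` by pretrain.py).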
import oneflow as flow
import oneflow.core.common.data_type_pb2 as data_type_util
import oneflow.core.operator.op_conf_pb2 as op_conf_util
import math
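# Builds the full BERT graph: embedding lookup and post-processing under the
# "embeddings" variable scope, then the transformer encoder stack under "encoder".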
class BertBackbone(object):
def __init__(self,
input_ids_blob,
input_mask_blob,
token_type_ids_blob,
vocab_size,
seq_length=512,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
initializer_range=0.02):
with flow.deprecated.variable_scope("bert"):
with flow.deprecated.variable_scope("embeddings"):
(self.embedding_output_, self.embedding_table_) = _EmbeddingLookup(
input_ids_blob=input_ids_blob,
vocab_size=vocab_size,
embedding_size=hidden_size,
initializer_range=initializer_range,
word_embedding_name="word_embeddings")
self.embedding_output_ = _EmbeddingPostprocessor(
input_blob=self.embedding_output_,
seq_length=seq_length,
embedding_size=hidden_size,
use_token_type=True,
token_type_ids_blob=token_type_ids_blob,
token_type_vocab_size=type_vocab_size,
token_type_embedding_name="token_type_embeddings",
use_position_embeddings=True,
position_embedding_name="position_embeddings",
initializer_range=initializer_range,
max_position_embeddings=max_position_embeddings,
dropout_prob=hidden_dropout_prob)
with flow.deprecated.variable_scope("encoder"):
attention_mask_blob = _CreateAttentionMaskFromInputMask(
input_mask_blob, from_seq_length=seq_length, to_seq_length=seq_length)
self.all_encoder_layers_ = _TransformerModel(
input_blob=self.embedding_output_,
attention_mask_blob=attention_mask_blob,
seq_length=seq_length,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
intermediate_act_fn=GetActivation(hidden_act),
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
initializer_range=initializer_range,
do_return_all_layers=False)
self.sequence_output_ = self.all_encoder_layers_[-1]
def embedding_output(self): return self.embedding_output_
def all_encoder_layers(self): return self.all_encoder_layers_
def sequence_output(self): return self.sequence_output_
def embedding_table(self): return self.embedding_table_
def CreateInitializer(std):
return flow.truncated_normal(std)
def _Gelu(in_blob):
return flow.keras.activations.gelu(in_blob)
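# Transformer encoder stack: num_hidden_layers blocks of self-attention plus a
# feed-forward network, each followed by dropout, a residual connection and layer
# normalization. Blobs stay 2-D ([batch*seq, hidden]) between layers and are
# reshaped back to [batch, seq, hidden] at the end.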
def _TransformerModel(input_blob,
attention_mask_blob,
seq_length,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
intermediate_act_fn=_Gelu,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
initializer_range=0.02,
do_return_all_layers=False):
assert hidden_size % num_attention_heads == 0
attention_head_size = int(hidden_size / num_attention_heads)
input_width = hidden_size
prev_output_blob = flow.reshape(input_blob, (-1, input_width))
all_layer_output_blobs = []
for layer_idx in range(num_hidden_layers):
with flow.deprecated.variable_scope("layer_%d"%layer_idx):
layer_input_blob = prev_output_blob
with flow.deprecated.variable_scope("attention"):
with flow.deprecated.variable_scope("self"):
attention_output_blob = _AttentionLayer(
from_blob=layer_input_blob,
to_blob=layer_input_blob,
attention_mask_blob=attention_mask_blob,
num_attention_heads=num_attention_heads,
size_per_head=attention_head_size,
attention_probs_dropout_prob=attention_probs_dropout_prob,
initializer_range=initializer_range,
do_return_2d_tensor=True,
from_seq_length=seq_length,
to_seq_length=seq_length)
with flow.deprecated.variable_scope("output"):
attention_output_blob = _FullyConnected(
attention_output_blob,
input_size=num_attention_heads * attention_head_size,
units=hidden_size,
weight_initializer=CreateInitializer(initializer_range),
name='dense')
attention_output_blob = _Dropout(attention_output_blob, hidden_dropout_prob)
attention_output_blob = attention_output_blob + layer_input_blob
attention_output_blob = _LayerNorm(attention_output_blob, hidden_size)
with flow.deprecated.variable_scope("intermediate"):
if callable(intermediate_act_fn):
act_fn = op_conf_util.kNone
else:
act_fn = intermediate_act_fn
intermediate_output_blob = _FullyConnected(
attention_output_blob,
input_size=num_attention_heads * attention_head_size,
units=intermediate_size,
activation=act_fn,
weight_initializer=CreateInitializer(initializer_range),
name='dense')
if callable(intermediate_act_fn):
intermediate_output_blob = intermediate_act_fn(intermediate_output_blob)
with flow.deprecated.variable_scope("output"):
layer_output_blob = _FullyConnected(
intermediate_output_blob,
input_size=intermediate_size,
units=hidden_size,
weight_initializer=CreateInitializer(initializer_range),
name='dense')
layer_output_blob = _Dropout(layer_output_blob, hidden_dropout_prob)
layer_output_blob = layer_output_blob + attention_output_blob
layer_output_blob = _LayerNorm(layer_output_blob, hidden_size)
prev_output_blob = layer_output_blob
all_layer_output_blobs.append(layer_output_blob)
input_shape = (-1, seq_length, hidden_size)
if do_return_all_layers:
final_output_blobs = []
for layer_output_blob in all_layer_output_blobs:
final_output_blob = flow.reshape(layer_output_blob, input_shape)
final_output_blobs.append(final_output_blob)
return final_output_blobs
else:
final_output_blob = flow.reshape(prev_output_blob, input_shape)
return [final_output_blob]
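# Multi-head scaled dot-product attention. Q/K/V are dense projections of the 2-D
# from/to blobs; the attention mask becomes an additive bias of -10000.0 at padded
# positions before the softmax.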
def _AttentionLayer(from_blob,
to_blob,
attention_mask_blob,
num_attention_heads=1,
size_per_head=512,
query_act=op_conf_util.kNone,
key_act=op_conf_util.kNone,
value_act=op_conf_util.kNone,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
do_return_2d_tensor=False,
batch_size=None,
from_seq_length=None,
to_seq_length=None):
def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
output_blob = flow.reshape(input_blob, [-1, seq_length, num_attention_heads, width])
output_blob = flow.transpose(output_blob, perm=[0, 2, 1, 3])
return output_blob
from_blob_2d = flow.reshape(from_blob, [-1, num_attention_heads * size_per_head])
to_blob_2d = flow.reshape(to_blob, [-1, num_attention_heads * size_per_head])
query_blob = _FullyConnected(
from_blob_2d,
input_size=num_attention_heads * size_per_head,
units=num_attention_heads * size_per_head,
activation=query_act,
name="query",
weight_initializer=CreateInitializer(initializer_range))
key_blob = _FullyConnected(
to_blob_2d,
input_size=num_attention_heads * size_per_head,
units=num_attention_heads * size_per_head,
activation=key_act,
name="key",
weight_initializer=CreateInitializer(initializer_range))
value_blob = _FullyConnected(
to_blob_2d,
input_size=num_attention_heads * size_per_head,
units=num_attention_heads * size_per_head,
activation=value_act,
name="value",
weight_initializer=CreateInitializer(initializer_range))
query_blob = TransposeForScores(query_blob, num_attention_heads, from_seq_length, size_per_head)
key_blob = TransposeForScores(key_blob, num_attention_heads, to_seq_length, size_per_head)
attention_scores_blob = flow.matmul(query_blob, key_blob, transpose_b=True)
attention_scores_blob = attention_scores_blob * (1.0 / math.sqrt(float(size_per_head)))
attention_mask_blob = flow.reshape(attention_mask_blob, [-1, 1, from_seq_length, to_seq_length])
attention_mask_blob = flow.cast(attention_mask_blob, dtype=flow.float)
addr_blob = (attention_mask_blob - 1.0) * 10000.0
attention_scores_blob = attention_scores_blob + addr_blob
attention_probs_blob = flow.nn.softmax(attention_scores_blob)
attention_probs_blob = _Dropout(attention_probs_blob, attention_probs_dropout_prob)
value_blob = flow.reshape(value_blob, [-1, to_seq_length, num_attention_heads, size_per_head])
value_blob = flow.transpose(value_blob, perm=[0, 2, 1, 3])
context_blob = flow.matmul(attention_probs_blob, value_blob)
context_blob = flow.transpose(context_blob, perm=[0, 2, 1, 3])
if do_return_2d_tensor:
context_blob = flow.reshape(context_blob, [-1, num_attention_heads * size_per_head])
else:
context_blob = flow.reshape(context_blob, [-1, from_seq_length, num_attention_heads * size_per_head])
return context_blob
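# Dense layer: output = input * weight + bias, with variables named
# '<name>-weight' and '<name>-bias'.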
def _FullyConnected(input_blob, input_size, units, activation=None, name=None,
weight_initializer=None):
weight_blob = flow.get_variable(
name=name + '-weight',
shape=[input_size, units],
dtype=input_blob.dtype,
initializer=weight_initializer)
bias_blob = flow.get_variable(
name=name + '-bias',
shape=[units],
dtype=input_blob.dtype,
initializer=flow.constant_initializer(0.0))
output_blob = flow.matmul(input_blob, weight_blob)
output_blob = flow.nn.bias_add(output_blob, bias_blob)
return output_blob
def _Dropout(input_blob, dropout_prob):
if dropout_prob == 0.0:
return input_blob
return flow.nn.dropout(input_blob, rate=dropout_prob)
def _LayerNorm(input_blob, hidden_size):
return flow.layers.layer_norm(input_blob, name='LayerNorm', begin_norm_axis=-1, begin_params_axis=-1)
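# Broadcasts a [batch, to_seq_length] padding mask to a
# [batch, from_seq_length, to_seq_length] attention mask.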
def _CreateAttentionMaskFromInputMask(to_mask_blob, from_seq_length, to_seq_length):
output = flow.cast(to_mask_blob, dtype=flow.float)
output = flow.reshape(output, [-1, 1, to_seq_length])
zeros = flow.constant(0.0, dtype=flow.float, shape=[from_seq_length, to_seq_length])
output = zeros + output
return output
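# Adds token-type embeddings (gathered from a [type_vocab, hidden] table) and
# position embeddings (a slice of a [1, max_position, hidden] table) to the word
# embeddings, then applies layer normalization and dropout.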
def _EmbeddingPostprocessor(input_blob,
seq_length,
embedding_size,
use_token_type=False,
token_type_ids_blob=None,
token_type_vocab_size=16,
token_type_embedding_name="token_type_embeddings",
use_position_embeddings=True,
position_embedding_name="position_embeddings",
initializer_range=0.02,
max_position_embeddings=512,
dropout_prob=0.1):
output = input_blob
if use_token_type:
assert token_type_ids_blob is not None
token_type_table = flow.get_variable(name=token_type_embedding_name,
shape=[token_type_vocab_size, embedding_size],
dtype=input_blob.dtype,
initializer=CreateInitializer(initializer_range))
token_type_embeddings = flow.gather(params=token_type_table, indices=token_type_ids_blob, axis=0)
output = output + token_type_embeddings
if use_position_embeddings:
position_table = flow.get_variable(name=position_embedding_name,
shape=[1, max_position_embeddings, embedding_size],
dtype=input_blob.dtype,
initializer=CreateInitializer(initializer_range))
assert seq_length <= max_position_embeddings
if seq_length != max_position_embeddings:
position_table = flow.slice(position_table, begin=[None, 0, 0], size=[None, seq_length, -1])
output = output + position_table
output = _LayerNorm(output, embedding_size)
output = _Dropout(output, dropout_prob)
return output
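# Word embedding lookup. The embedding table is also returned so it can be reused
# (tied weights) as the output projection of the masked-LM head.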
def _EmbeddingLookup(input_ids_blob,
vocab_size,
embedding_size=128,
initializer_range=0.02,
word_embedding_name="word_embeddings"):
embedding_table = flow.get_variable(name=word_embedding_name, shape=[vocab_size, embedding_size],
dtype=flow.float,
initializer=CreateInitializer(initializer_range))
output = flow.gather(params=embedding_table, indices=input_ids_blob, axis=0)
return output, embedding_table
def GetActivation(name):
if name == 'linear':
return None
elif name == 'relu':
return flow.keras.activations.relu
elif name == 'tanh':
return flow.keras.activations.tanh
elif name == 'gelu':
return flow.keras.activations.gelu
else:
raise Exception("unsupported activation")
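# pretrain.py -- masked-LM and next-sentence-prediction losses on top of BertBackbone.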
import oneflow as flow
import bert as bert_util
import oneflow.core.operator.op_conf_pb2 as op_conf_util
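# Pre-training graph: runs the BERT backbone, then adds the masked-LM loss (using the
# tied embedding table as output weights) and the next-sentence loss, returning their sum.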
def PreTrain(input_ids_blob,
input_mask_blob,
token_type_ids_blob,
masked_lm_positions_blob,
masked_lm_ids_blob,
masked_lm_weights_blob,
next_sentence_label_blob,
vocab_size,
seq_length=512,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act='gelu',
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
max_predictions_per_seq=20,
initializer_range=0.02):
backbone = bert_util.BertBackbone(
input_ids_blob=input_ids_blob,
input_mask_blob=input_mask_blob,
token_type_ids_blob=token_type_ids_blob,
vocab_size=vocab_size,
seq_length=seq_length,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
hidden_act=hidden_act,
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
max_position_embeddings=max_position_embeddings,
type_vocab_size=type_vocab_size,
initializer_range=initializer_range)
(lm_loss, _, _) = _AddMaskedLanguageModelLoss(
input_blob=backbone.sequence_output(),
output_weights_blob=backbone.embedding_table(),
positions_blob=masked_lm_positions_blob,
label_id_blob=masked_lm_ids_blob,
label_weight_blob=masked_lm_weights_blob,
seq_length=seq_length,
hidden_size=hidden_size,
vocab_size=vocab_size,
max_predictions_per_seq=max_predictions_per_seq,
hidden_act=bert_util.GetActivation(hidden_act),
initializer_range=initializer_range)
pooled_output = PooledOutput(backbone.sequence_output(), hidden_size, initializer_range)
(ns_loss, _, _) = _AddNextSentenceOutput(
input_blob=pooled_output,
label_blob=next_sentence_label_blob,
hidden_size=hidden_size,
initializer_range=initializer_range)
with flow.deprecated.variable_scope("cls-loss"):
total_loss = lm_loss + ns_loss
return total_loss
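# Pools the hidden state of the first ([CLS]) token through a dense layer with tanh.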
def PooledOutput(sequence_output, hidden_size, initializer_range):
with flow.deprecated.variable_scope("bert-pooler"):
first_token_tensor = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
first_token_tensor = flow.reshape(first_token_tensor, [-1, hidden_size])
pooled_output = bert_util._FullyConnected(
first_token_tensor,
input_size=hidden_size,
units=hidden_size,
weight_initializer=bert_util.CreateInitializer(initializer_range),
name='dense')
pooled_output = flow.keras.activations.tanh(pooled_output)
return pooled_output
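# Masked-LM head: gather the hidden states at the masked positions, apply the transform
# (dense + activation + layer norm), project onto the vocabulary with the shared
# embedding table plus an output bias, and return a sparse softmax cross-entropy
# weighted by the masked-LM label weights.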
def _AddMaskedLanguageModelLoss(input_blob,
output_weights_blob,
positions_blob,
label_id_blob,
label_weight_blob,
seq_length,
hidden_size,
vocab_size,
max_predictions_per_seq,
hidden_act,
initializer_range):
with flow.deprecated.variable_scope("other"):
sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob, axis=[-1])
ones = sum_label_weight_blob * 0.0 + 1.0
sum_label_weight_blob = flow.math.reduce_sum(sum_label_weight_blob)
batch_size = flow.math.reduce_sum(ones)
sum_label_weight_blob = sum_label_weight_blob / batch_size
with flow.deprecated.variable_scope("cls-predictions"):
input_blob = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size)
with flow.deprecated.variable_scope("transform"):
if callable(hidden_act):
act_fn = op_conf_util.kNone
else:
act_fn = hidden_act
input_blob = bert_util._FullyConnected(
input_blob,
input_size=hidden_size,
units=hidden_size,
activation=act_fn,
weight_initializer=bert_util.CreateInitializer(initializer_range),
name='dense')
if callable(hidden_act):
input_blob = hidden_act(input_blob)
input_blob = bert_util._LayerNorm(input_blob, hidden_size)
output_bias = flow.get_variable(name="output_bias", shape=[vocab_size], dtype=input_blob.dtype,
initializer=flow.constant_initializer(1.0))
logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True)
logit_blob = flow.nn.bias_add(logit_blob, output_bias)
label_id_blob = flow.reshape(label_id_blob, [-1])
pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(logits=logit_blob,
labels=label_id_blob)
pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq])
numerator = pre_example_loss * label_weight_blob
with flow.deprecated.variable_scope("loss"):
numerator = flow.math.reduce_sum(numerator, axis=[-1])
denominator = sum_label_weight_blob + 1e-5
loss = numerator / denominator
return loss, pre_example_loss, logit_blob
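# Gathers the encoder outputs at the masked positions and flattens them
# to a 2-D [-1, hidden_size] blob.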
def _GatherIndexes(sequence_blob, positions_blob, seq_length, hidden_size):
output = flow.gather(params=sequence_blob, indices=positions_blob, axis=2, batch_dims=2)
output = flow.reshape(output, [-1, hidden_size])
return output
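# Next-sentence-prediction head: a 2-way classifier over the pooled [CLS] output.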
def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range):
with flow.deprecated.variable_scope("cls-seq_relationship"):
output_weight_blob = flow.get_variable(name="output_weights", shape=[2, hidden_size],
dtype=input_blob.dtype, initializer=bert_util.CreateInitializer(initializer_range))
output_bias_blob = flow.get_variable( name="output_bias", shape=[2],
dtype=input_blob.dtype, initializer=flow.constant_initializer(0.0))
logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True)
logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(logits=logit_blob,
labels=label_blob)
loss = pre_example_loss
return loss, pre_example_loss, logit_blob
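# Pre-training driver script: argument parsing, OFRecord decoding, and the training loop.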
import os
import sys
import time
import argparse
import shutil
import numpy as np
from datetime import datetime
import oneflow as flow
from pretrain import PreTrain#, Eval
_DATA_DIR = '/dataset/bert/of_wiki_seq_len_128'
_MODEL_LOAD = "/dataset/model_zoo/bert_new_snapshot/of_L-12_H-768_A-12_random_init"
_MODEL_SAVE_DIR = "./model_save-{}".format(
str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
)
NODE_LIST = "192.168.1.15,192.168.1.16"
parser = argparse.ArgumentParser(description="flags for bert")
# resource
parser.add_argument("--device_num_per_node", type=int, default=1)
parser.add_argument("--node_num", type=int, default=1)
parser.add_argument("--node_list", type=str, default=NODE_LIST)
# train
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
parser.add_argument("--weight_l2", type=float, default=0.01, help="weight l2 decay parameter")
parser.add_argument("--batch_size_per_device", type=int, default=24)
parser.add_argument("--iter_num", type=int, default=10, help="total iterations to run")
parser.add_argument("--log_every_n_iter", type=int, default=1, help="print loss every n iteration")
parser.add_argument("--train_dir", type=str, default=_DATA_DIR)
parser.add_argument("--data_part_num", type=int, default=32, help="data part number in dataset")
parser.add_argument("--model_load_dir", type=str, default=_MODEL_LOAD)
parser.add_argument("--model_save_dir", type=str, default=_MODEL_SAVE_DIR)
# bert
parser.add_argument("--seq_length", type=int, default=512)
parser.add_argument("--max_predictions_per_seq", type=int, default=80)
parser.add_argument("--num_hidden_layers", type=int, default=24)
parser.add_argument("--num_attention_heads", type=int, default=16)
parser.add_argument("--max_position_embeddings", type=int, default=512)
parser.add_argument("--type_vocab_size", type=int, default=2)
parser.add_argument("--vocab_size", type=int, default=30522)
parser.add_argument("--attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--hidden_size_per_head", type=int, default=64)
args = parser.parse_args()
def _blob_conf(name, shape, dtype=flow.int32):
return flow.data.BlobConf(name=name, shape=shape, dtype=dtype, codec=flow.data.RawCodec())
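# Decodes one batch of OFRecord data into the input/label blobs declared below
# (fixed-shape RawCodec fields).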
def BertDecoder(data_dir, batch_size, data_part_num, seq_length, max_predictions_per_seq):
blob_confs = []
blob_confs.append(_blob_conf('input_ids', [seq_length]))
blob_confs.append(_blob_conf('next_sentence_labels', [1]))
blob_confs.append(_blob_conf('input_mask', [seq_length]))
blob_confs.append(_blob_conf('segment_ids', [seq_length]))
blob_confs.append(_blob_conf('masked_lm_ids', [max_predictions_per_seq]))
blob_confs.append(_blob_conf('masked_lm_positions', [max_predictions_per_seq]))
blob_confs.append(_blob_conf('masked_lm_weights', [max_predictions_per_seq], flow.float))
return flow.data.decode_ofrecord(data_dir, blob_confs,
batch_size=batch_size,
name="decode",
data_part_num=data_part_num)
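# Builds the pre-training network from the decoded blobs; the hidden size is derived
# from the number of attention heads (64 units per head) and the feed-forward
# (intermediate) size is 4x the hidden size.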
def BuildPreTrainNet(batch_size, data_part_num, seq_length=128, max_position_embeddings=512,
num_hidden_layers=12, num_attention_heads=12,
hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1,
vocab_size=30522, type_vocab_size=2, max_predictions_per_seq=20):
    hidden_size = 64 * num_attention_heads  # 64 hidden units per attention head
intermediate_size = hidden_size * 4
decoders = BertDecoder(args.train_dir, batch_size, data_part_num, seq_length,
max_predictions_per_seq)
    # Map the decoded blobs by the order they are declared in BertDecoder.
    input_ids = decoders[0]
    next_sentence_labels = decoders[1]
    input_mask = decoders[2]
    token_type_ids = decoders[3]  # 'segment_ids' blob
    masked_lm_ids = decoders[4]
    masked_lm_positions = decoders[5]
    masked_lm_weights = decoders[6]
return PreTrain(input_ids,
input_mask,
token_type_ids,
masked_lm_positions,
masked_lm_ids,
masked_lm_weights,
next_sentence_labels,
vocab_size,
seq_length=seq_length,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
hidden_act="gelu",
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
max_position_embeddings=max_position_embeddings,
type_vocab_size=type_vocab_size,
max_predictions_per_seq=max_predictions_per_seq,
initializer_range=0.02)
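# Optimizer settings: Adam (epsilon 1e-6) with polynomial learning-rate decay over
# 100000 batches, 1000 batches of linear warmup, and gradient clipping by global norm 1.0.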
_BERT_MODEL_UPDATE_CONF = dict(
learning_rate_decay = dict(
polynomial_conf = dict(
decay_batches = 100000,
end_learning_rate = 0.0,
)
),
warmup_conf = dict(
linear_conf = dict(
warmup_batches = 1000,
start_multiplier = 0,
)
),
clip_conf = dict(
clip_by_global_norm = dict(
clip_norm = 1.0,
)
),
adam_conf = dict(
epsilon = 1e-6
),
)
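# Training job (legacy @flow.function API): computes the global batch size, sets the
# learning rate, optimizer conf and weight L2 decay, builds the net and registers its loss.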
@flow.function
def PretrainJob():
total_device_num = args.node_num * args.device_num_per_node
batch_size = total_device_num * args.batch_size_per_device
flow.config.train.primary_lr(args.learning_rate)
flow.config.train.model_update_conf(_BERT_MODEL_UPDATE_CONF)
flow.config.train.weight_l2(args.weight_l2)
loss = BuildPreTrainNet(batch_size, args.data_part_num,
seq_length=args.seq_length,
max_position_embeddings=args.max_position_embeddings,
num_hidden_layers=args.num_hidden_layers,
num_attention_heads=args.num_attention_heads,
hidden_dropout_prob=args.hidden_dropout_prob,
attention_probs_dropout_prob=args.attention_probs_dropout_prob,
vocab_size=args.vocab_size,
type_vocab_size=args.type_vocab_size,
max_predictions_per_seq=args.max_predictions_per_seq)
flow.losses.add_loss(loss)
return loss
cur_step = 0
def AsyncGetCallback(result):
global cur_step
print('{:>12} {:>.10f} {:.2f}'.format(cur_step, result.mean(), time.time()))
cur_step += 1
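# Main: configure devices and ports (plus the machine list for multi-node runs), load or
# initialize a checkpoint, run iter_num training steps, snapshot periodically, and report timing.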
if __name__ == '__main__':
for arg in vars(args):
print('{} = {}'.format(arg, getattr(args, arg)))
start_time = time.time()
flow.config.gpu_device_num(args.device_num_per_node)
flow.config.ctrl_port(9788)
flow.config.data_port(9789)
flow.config.default_data_type(flow.float)
flow.config.enable_inplace(False)
if args.node_num > 1:
flow.config.ctrl_port(12138)
nodes = []
for n in args.node_list.strip().split(","):
addr_dict = {}
addr_dict["addr"] = n
nodes.append(addr_dict)
flow.config.machine(nodes)
check_point = flow.train.CheckPoint()
if args.model_load_dir != '':
assert os.path.isdir(args.model_load_dir)
check_point.load(args.model_load_dir)
print('init model from {}'.format(args.model_load_dir))
else:
check_point.init()
print('init model on demand')
    fmt_str = "{:>12} {:>12} {:>12.10f}"
    print('{:>12} {:>12} {:>12}'.format("step", "loss", "time"))
train_start_time = time.time()
step_time = []
for step in range(args.iter_num):
loss_mean = PretrainJob().get().mean()
step_time.append(time.time())
        train_step_time = step_time[step] - (step_time[step - 1] if step > 0 else train_start_time)
print(fmt_str.format(step, loss_mean, train_step_time))
if args.model_save_dir != '':
if not os.path.exists(args.model_save_dir):
os.makedirs(args.model_save_dir)
assert args.log_every_n_iter > 0
if step % args.log_every_n_iter == 0:
snapshot_save_path = os.path.join(args.model_save_dir, 'snapshot_%d'%(step+1))
check_point.save(snapshot_save_path)
total_time = step_time[-1] - start_time
train_time = step_time[-1] - train_start_time
init_time = train_start_time - start_time
mean_batch_time = (step_time[-1] - step_time[0]) / (args.iter_num - 1)
total_batch_size = args.node_num * args.device_num_per_node * args.batch_size_per_device
throughput = total_batch_size / mean_batch_time
print('total time', total_time)
print('init time', init_time)
    print('first loss time', step_time[0] - start_time)  # includes model init and first-batch compute time
print('train time', train_time)
print('last - first loss time', step_time[-1] - step_time[0])
print('average batch time', mean_batch_time)
print('samples/sec', throughput)
print('destroy time', time.time() - step_time[-1])
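# Example launch (a sketch; the script filename and paths are placeholders, the flags are
# the ones defined by the argparse setup above):
#   python run_pretraining.py \
#       --device_num_per_node=1 \
#       --iter_num=10 \
#       --seq_length=128 \
#       --train_dir=/path/to/ofrecord \
#       --model_load_dir='' \
#       --model_save_dir=./model_save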