Commit 83f0a576 authored by Chen Chen, committed by saberkun

Internal change

PiperOrigin-RevId: 282096004
Parent 986ffac4
......@@ -94,7 +94,8 @@ def run_customized_training_loop(
metric_fn=None,
init_checkpoint=None,
custom_callbacks=None,
run_eagerly=False):
run_eagerly=False,
sub_model_export_name=None):
"""Run BERT pretrain model training using low-level API.
Arguments:
......@@ -131,6 +132,11 @@ def run_customized_training_loop(
methods are invoked during training.
run_eagerly: Whether to run model training in pure eager execution. This
should be disabled for TPUStrategy.
sub_model_export_name: If not None, will export `sub_model` returned by
`model_fn` into checkpoint files. The name of each intermediate checkpoint
file is {sub_model_export_name}_step_{step}.ckpt and the last
checkpoint's name is {sub_model_export_name}.ckpt;
if None, `sub_model` will not be exported as a checkpoint.
Returns:
Trained model.
......@@ -139,6 +145,8 @@ def run_customized_training_loop(
ValueError: (1) When model returned by `model_fn` does not have optimizer
attribute or when required parameters are set to None. (2) eval args are
not specified correctly. (3) metric_fn must be a callable if specified.
(4) `sub_model_export_name` is specified, but `sub_model` returned
by `model_fn` is None.
"""
if _sentinel is not None:
......@@ -191,6 +199,10 @@ def run_customized_training_loop(
if not hasattr(model, 'optimizer'):
raise ValueError('User should set optimizer attribute to model '
'inside `model_fn`.')
if sub_model_export_name and sub_model is None:
raise ValueError('sub_model_export_name is specified as %s, but '
'sub_model is None.' % sub_model_export_name)
optimizer = model.optimizer
use_float16 = isinstance(
optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer)
......@@ -326,6 +338,9 @@ def run_customized_training_loop(
# Training loop starts here.
checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
sub_model_checkpoint = tf.train.Checkpoint(
model=sub_model) if sub_model_export_name else None
latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
if latest_checkpoint_file:
logging.info(
......@@ -382,7 +397,10 @@ def run_customized_training_loop(
if current_step < total_training_steps:
_save_checkpoint(checkpoint, model_dir,
checkpoint_name.format(step=current_step))
if sub_model_export_name:
_save_checkpoint(
sub_model_checkpoint, model_dir,
'%s_step_%d.ckpt' % (sub_model_export_name, current_step))
if eval_input_fn:
logging.info('Running evaluation after step: %s.', current_step)
_run_evaluation(current_step,
......@@ -393,6 +411,9 @@ def run_customized_training_loop(
_save_checkpoint(checkpoint, model_dir,
checkpoint_name.format(step=current_step))
if sub_model_export_name:
_save_checkpoint(sub_model_checkpoint, model_dir,
'%s.ckpt' % sub_model_export_name)
if eval_input_fn:
logging.info('Running final evaluation after training is complete.')
......
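For orientation, a minimal usage sketch of the new argument (not part of the change); the `strategy`, `model_fn`, `loss_fn` and `model_dir` keyword names are assumed from the surrounding function signature rather than shown in this diff.

# Hypothetical call; strategy/model_fn/loss_fn/model_dir are placeholder names.
trained_model = run_customized_training_loop(
    strategy=strategy,
    model_fn=model_fn,            # assumed to return (training_model, sub_model)
    loss_fn=loss_fn,
    model_dir=model_dir,
    train_input_fn=train_input_fn,
    steps_per_epoch=1000,
    steps_per_loop=200,
    epochs=2,
    sub_model_export_name='pretrained/bert_model')
# Intermediate checkpoints are then written as
# pretrained/bert_model_step_{step}.ckpt and the final one as
# pretrained/bert_model.ckpt, alongside the regular training checkpoints.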
......@@ -77,37 +77,6 @@ def export_bert_model(model_export_path: typing.Text,
model.save(model_export_path, include_optimizer=False, save_format='tf')
def export_pretraining_checkpoint(
checkpoint_dir: typing.Text,
model: tf.keras.Model,
checkpoint_name: typing.Optional[
typing.Text] = 'pretrained/bert_model.ckpt'):
"""Exports BERT model for as a checkpoint without optimizer.
Arguments:
checkpoint_dir: Path to where training model checkpoints are stored.
model: Keras model object to export.
checkpoint_name: File name or suffix path to export pretrained checkpoint.
Raises:
ValueError when either checkpoint_dir or model is not specified.
"""
if not checkpoint_dir:
raise ValueError('checkpoint_dir must be specified.')
if not isinstance(model, tf.keras.Model):
raise ValueError('model must be a tf.keras.Model object.')
checkpoint = tf.train.Checkpoint(model=model)
latest_checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
assert latest_checkpoint_file
logging.info('Checkpoint file %s found and restoring from '
'checkpoint', latest_checkpoint_file)
status = checkpoint.restore(latest_checkpoint_file)
status.assert_existing_objects_matched().expect_partial()
saved_path = checkpoint.save(os.path.join(checkpoint_dir, checkpoint_name))
logging.info('Exporting the model as a new TF checkpoint: %s', saved_path)
class BertModelCheckpoint(tf.keras.callbacks.Callback):
"""Keras callback that saves model at the end of every epoch."""
......
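The removed helper boils down to the following pattern: restore model weights from the latest training checkpoint, then re-save only the model without optimizer slots. This is a sketch of that pattern, not the exact upstream code.

import os
import tensorflow as tf

def export_model_only_checkpoint(checkpoint_dir, model,
                                 checkpoint_name='pretrained/bert_model.ckpt'):
  # Track only the model, so optimizer variables in the training checkpoint
  # are deliberately left out of the exported checkpoint.
  checkpoint = tf.train.Checkpoint(model=model)
  latest_checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
  assert latest_checkpoint_file, 'No checkpoint found in %s' % checkpoint_dir
  status = checkpoint.restore(latest_checkpoint_file)
  status.assert_existing_objects_matched().expect_partial()
  return checkpoint.save(os.path.join(checkpoint_dir, checkpoint_name))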
......@@ -126,16 +126,9 @@ def run_customized_training(strategy,
train_input_fn=train_input_fn,
steps_per_epoch=steps_per_epoch,
steps_per_loop=steps_per_loop,
epochs=epochs)
epochs=epochs,
sub_model_export_name='pretrained/bert_model')
# Creates the BERT core model outside distribution strategy scope.
_, core_model = bert_models.pretrain_model(bert_config, max_seq_length,
max_predictions_per_seq)
# Restores the core model from model checkpoints and gets a new checkpoint that
# contains only the core model.
model_saving_utils.export_pretraining_checkpoint(
checkpoint_dir=model_dir, model=core_model)
return trained_model
......
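A hedged sketch of consuming the checkpoint now produced by `sub_model_export_name`; the directory layout ({model_dir}/pretrained/bert_model*.ckpt) is an assumption about how `_save_checkpoint` joins the name onto `model_dir`.

import os
import tensorflow as tf

# Rebuild a fresh core model and restore the exported encoder weights into it.
_, core_model = bert_models.pretrain_model(bert_config, max_seq_length,
                                           max_predictions_per_seq)
sub_model_checkpoint = tf.train.Checkpoint(model=core_model)
latest = tf.train.latest_checkpoint(os.path.join(model_dir, 'pretrained'))
if latest:
  sub_model_checkpoint.restore(latest).assert_existing_objects_matched()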
......@@ -18,139 +18,26 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import tensorflow as tf
import tensorflow_hub as hub
from official.modeling import tf_utils
from official.nlp import bert_modeling as modeling
from official.nlp.modeling import losses
from official.nlp.modeling import networks
from official.nlp.modeling.networks import bert_classifier
from official.nlp.modeling.networks import bert_pretrainer
from official.nlp.modeling.networks import bert_span_labeler
def gather_indexes(sequence_tensor, positions):
"""Gathers the vectors at the specific positions.
Args:
sequence_tensor: Sequence output of `BertModel` layer of shape
(`batch_size`, `seq_length`, num_hidden) where num_hidden is the number of
hidden units of the `BertModel` layer.
positions: Position ids of tokens in the sequence to mask for pretraining,
with dimension (batch_size, max_predictions_per_seq) where
`max_predictions_per_seq` is the maximum number of tokens to mask out and
predict per sequence.
Returns:
Masked out sequence tensor of shape (batch_size * max_predictions_per_seq,
num_hidden).
"""
sequence_shape = tf_utils.get_shape_list(
sequence_tensor, name='sequence_output_tensor')
batch_size = sequence_shape[0]
seq_length = sequence_shape[1]
width = sequence_shape[2]
flat_offsets = tf.keras.backend.reshape(
tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
flat_positions = tf.keras.backend.reshape(positions + flat_offsets, [-1])
flat_sequence_tensor = tf.keras.backend.reshape(
sequence_tensor, [batch_size * seq_length, width])
output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
return output_tensor
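A minimal sketch (with toy shapes) of the flat-offset gather that `gather_indexes` performs:

import tensorflow as tf

batch_size, seq_length, hidden = 2, 4, 3
sequence_tensor = tf.reshape(
    tf.range(batch_size * seq_length * hidden, dtype=tf.float32),
    [batch_size, seq_length, hidden])
positions = tf.constant([[0, 2], [1, 3]], dtype=tf.int32)  # masked positions

# Offset each example's positions into the flattened [batch*seq, hidden] tensor.
flat_offsets = tf.reshape(
    tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
flat_positions = tf.reshape(positions + flat_offsets, [-1])
flat_sequence = tf.reshape(sequence_tensor, [batch_size * seq_length, hidden])
gathered = tf.gather(flat_sequence, flat_positions)  # [batch*num_masked, hidden]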
class BertPretrainLayer(tf.keras.layers.Layer):
"""Wrapper layer for pre-training a BERT model.
This layer wraps an existing `bert_layer` which is a Keras Layer.
It outputs `sequence_output` from TransformerBlock sub-layer and
`sentence_output` which are suitable for feeding into a BertPretrainLoss
layer. This layer can be used along with an unsupervised input to
pre-train the embeddings for `bert_layer`.
"""
def __init__(self,
config,
bert_layer,
initializer=None,
float_type=tf.float32,
**kwargs):
super(BertPretrainLayer, self).__init__(**kwargs)
self.config = copy.deepcopy(config)
self.float_type = float_type
self.embedding_table = bert_layer.embedding_lookup.embeddings
self.num_next_sentence_label = 2
if initializer:
self.initializer = initializer
else:
self.initializer = tf.keras.initializers.TruncatedNormal(
stddev=self.config.initializer_range)
def build(self, unused_input_shapes):
"""Implements build() for the layer."""
self.output_bias = self.add_weight(
shape=[self.config.vocab_size],
name='predictions/output_bias',
initializer=tf.keras.initializers.Zeros())
self.lm_dense = tf.keras.layers.Dense(
self.config.hidden_size,
activation=tf_utils.get_activation(self.config.hidden_act),
kernel_initializer=self.initializer,
name='predictions/transform/dense')
self.lm_layer_norm = tf.keras.layers.LayerNormalization(
axis=-1, epsilon=1e-12, name='predictions/transform/LayerNorm')
# Next sentence binary classification dense layer including bias to match
# TF1.x BERT variable shapes.
with tf.name_scope('seq_relationship'):
self.next_seq_weights = self.add_weight(
shape=[self.num_next_sentence_label, self.config.hidden_size],
name='output_weights',
initializer=self.initializer)
self.next_seq_bias = self.add_weight(
shape=[self.num_next_sentence_label],
name='output_bias',
initializer=tf.keras.initializers.Zeros())
super(BertPretrainLayer, self).build(unused_input_shapes)
def __call__(self,
pooled_output,
sequence_output=None,
masked_lm_positions=None,
**kwargs):
inputs = tf_utils.pack_inputs(
[pooled_output, sequence_output, masked_lm_positions])
return super(BertPretrainLayer, self).__call__(inputs, **kwargs)
def call(self, inputs):
"""Implements call() for the layer."""
unpacked_inputs = tf_utils.unpack_inputs(inputs)
pooled_output = unpacked_inputs[0]
sequence_output = unpacked_inputs[1]
masked_lm_positions = unpacked_inputs[2]
mask_lm_input_tensor = gather_indexes(sequence_output, masked_lm_positions)
lm_output = self.lm_dense(mask_lm_input_tensor)
lm_output = self.lm_layer_norm(lm_output)
lm_output = tf.matmul(lm_output, self.embedding_table, transpose_b=True)
lm_output = tf.nn.bias_add(lm_output, self.output_bias)
lm_output = tf.nn.log_softmax(lm_output, axis=-1)
logits = tf.matmul(pooled_output, self.next_seq_weights, transpose_b=True)
logits = tf.nn.bias_add(logits, self.next_seq_bias)
sentence_output = tf.nn.log_softmax(logits, axis=-1)
return (lm_output, sentence_output)
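The masked-LM head above ties its output projection to the input embedding table; a minimal standalone sketch of that tied projection, with illustrative shapes:

import tensorflow as tf

hidden_size, vocab_size = 8, 16
embedding_table = tf.random.normal([vocab_size, hidden_size])
hidden_states = tf.random.normal([5, hidden_size])   # e.g. gathered masked positions
output_bias = tf.zeros([vocab_size])

# Reuse the embedding table as the output weights (transpose_b) and add a bias.
logits = tf.matmul(hidden_states, embedding_table, transpose_b=True) + output_bias
log_probs = tf.nn.log_softmax(logits, axis=-1)        # [5, vocab_size]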
class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
"""Returns layer that computes custom loss and metrics for pretraining."""
def __init__(self, bert_config, **kwargs):
def __init__(self, vocab_size, **kwargs):
super(BertPretrainLossAndMetricLayer, self).__init__(**kwargs)
self.config = copy.deepcopy(bert_config)
self._vocab_size = vocab_size
self.config = {
'vocab_size': vocab_size,
}
def __call__(self,
lm_output,
......@@ -167,8 +54,8 @@ class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
self).__call__(inputs, **kwargs)
def _add_metrics(self, lm_output, lm_labels, lm_label_weights,
lm_per_example_loss, sentence_output, sentence_labels,
sentence_per_example_loss):
lm_example_loss, sentence_output, sentence_labels,
next_sentence_loss):
"""Adds metrics."""
masked_lm_accuracy = tf.keras.metrics.sparse_categorical_accuracy(
lm_labels, lm_output)
......@@ -178,8 +65,6 @@ class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
self.add_metric(
masked_lm_accuracy, name='masked_lm_accuracy', aggregation='mean')
lm_example_loss = tf.reshape(lm_per_example_loss, [-1])
lm_example_loss = tf.reduce_mean(lm_example_loss * lm_label_weights)
self.add_metric(lm_example_loss, name='lm_example_loss', aggregation='mean')
next_sentence_accuracy = tf.keras.metrics.sparse_categorical_accuracy(
......@@ -189,9 +74,8 @@ class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
name='next_sentence_accuracy',
aggregation='mean')
next_sentence_mean_loss = tf.reduce_mean(sentence_per_example_loss)
self.add_metric(
next_sentence_mean_loss, name='next_sentence_loss', aggregation='mean')
next_sentence_loss, name='next_sentence_loss', aggregation='mean')
def call(self, inputs):
"""Implements call() for the layer."""
......@@ -199,31 +83,21 @@ class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
lm_output = unpacked_inputs[0]
sentence_output = unpacked_inputs[1]
lm_label_ids = unpacked_inputs[2]
lm_label_ids = tf.keras.backend.reshape(lm_label_ids, [-1])
lm_label_ids_one_hot = tf.keras.backend.one_hot(lm_label_ids,
self.config.vocab_size)
lm_label_weights = tf.keras.backend.cast(unpacked_inputs[3], tf.float32)
lm_label_weights = tf.keras.backend.reshape(lm_label_weights, [-1])
lm_per_example_loss = -tf.keras.backend.sum(
lm_output * lm_label_ids_one_hot, axis=[-1])
numerator = tf.keras.backend.sum(lm_label_weights * lm_per_example_loss)
denominator = tf.keras.backend.sum(lm_label_weights) + 1e-5
mask_label_loss = numerator / denominator
sentence_labels = unpacked_inputs[4]
sentence_labels = tf.keras.backend.reshape(sentence_labels, [-1])
sentence_label_one_hot = tf.keras.backend.one_hot(sentence_labels, 2)
per_example_loss_sentence = -tf.keras.backend.sum(
sentence_label_one_hot * sentence_output, axis=-1)
sentence_loss = tf.keras.backend.mean(per_example_loss_sentence)
mask_label_loss = losses.weighted_sparse_categorical_crossentropy_loss(
labels=lm_label_ids, predictions=lm_output, weights=lm_label_weights)
sentence_loss = losses.weighted_sparse_categorical_crossentropy_loss(
labels=sentence_labels, predictions=sentence_output)
loss = mask_label_loss + sentence_loss
batch_shape = tf.slice(tf.keras.backend.shape(sentence_labels), [0], [1])
# TODO(hongkuny): Avoid the hack and switch to add_loss.
final_loss = tf.fill(
tf.keras.backend.shape(per_example_loss_sentence), loss)
final_loss = tf.fill(batch_shape, loss)
self._add_metrics(lm_output, lm_label_ids, lm_label_weights,
lm_per_example_loss, sentence_output, sentence_labels,
per_example_loss_sentence)
mask_label_loss, sentence_output, sentence_labels,
sentence_loss)
return final_loss
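For reference, the inlined math that the `losses.weighted_sparse_categorical_crossentropy_loss` calls replace can be sketched as follows; this is an illustrative re-expression of the removed code, not the library implementation.

import tensorflow as tf

def weighted_sparse_xent(labels, log_probs, weights=None):
  """labels: [N] int ids; log_probs: [N, num_classes] log-softmax outputs."""
  num_classes = tf.shape(log_probs)[-1]
  one_hot = tf.one_hot(labels, num_classes, dtype=log_probs.dtype)
  per_example_loss = -tf.reduce_sum(one_hot * log_probs, axis=-1)
  if weights is None:
    return tf.reduce_mean(per_example_loss)
  weights = tf.cast(weights, per_example_loss.dtype)
  return tf.reduce_sum(weights * per_example_loss) / (
      tf.reduce_sum(weights) + 1e-5)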
......@@ -268,13 +142,12 @@ def pretrain_model(bert_config,
seq_length: Maximum sequence length of the training data.
max_predictions_per_seq: Maximum number of tokens in sequence to mask out
and use for pretraining.
initializer: Initializer for weights in BertPretrainLayer.
initializer: Initializer for weights in BertPretrainer.
Returns:
Pretraining model as well as core BERT submodel from which to save
weights after pretraining.
"""
input_word_ids = tf.keras.layers.Input(
shape=(seq_length,), name='input_word_ids', dtype=tf.int32)
input_mask = tf.keras.layers.Input(
......@@ -285,38 +158,34 @@ def pretrain_model(bert_config,
shape=(max_predictions_per_seq,),
name='masked_lm_positions',
dtype=tf.int32)
masked_lm_ids = tf.keras.layers.Input(
shape=(max_predictions_per_seq,), name='masked_lm_ids', dtype=tf.int32)
masked_lm_weights = tf.keras.layers.Input(
shape=(max_predictions_per_seq,),
name='masked_lm_weights',
dtype=tf.int32)
next_sentence_labels = tf.keras.layers.Input(
shape=(1,), name='next_sentence_labels', dtype=tf.int32)
masked_lm_ids = tf.keras.layers.Input(
shape=(max_predictions_per_seq,), name='masked_lm_ids', dtype=tf.int32)
bert_submodel_name = 'bert_model'
bert_submodel = modeling.get_bert_model(
input_word_ids,
input_mask,
input_type_ids,
name=bert_submodel_name,
config=bert_config)
pooled_output = bert_submodel.outputs[0]
sequence_output = bert_submodel.outputs[1]
pretrain_layer = BertPretrainLayer(
bert_config,
bert_submodel.get_layer(bert_submodel_name),
transformer_encoder = _get_transformer_encoder(bert_config, seq_length)
if initializer is None:
initializer = tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range)
pretrainer_model = bert_pretrainer.BertPretrainer(
network=transformer_encoder,
num_classes=2, # The next sentence prediction label has two classes.
num_token_predictions=max_predictions_per_seq,
initializer=initializer,
name='cls')
lm_output, sentence_output = pretrain_layer(pooled_output, sequence_output,
masked_lm_positions)
output='predictions')
lm_output, sentence_output = pretrainer_model(
[input_word_ids, input_mask, input_type_ids, masked_lm_positions])
pretrain_loss_layer = BertPretrainLossAndMetricLayer(bert_config)
pretrain_loss_layer = BertPretrainLossAndMetricLayer(
vocab_size=bert_config.vocab_size)
output_loss = pretrain_loss_layer(lm_output, sentence_output, masked_lm_ids,
masked_lm_weights, next_sentence_labels)
return tf.keras.Model(
keras_model = tf.keras.Model(
inputs={
'input_word_ids': input_word_ids,
'input_mask': input_mask,
......@@ -326,7 +195,8 @@ def pretrain_model(bert_config,
'masked_lm_weights': masked_lm_weights,
'next_sentence_labels': next_sentence_labels,
},
outputs=output_loss), bert_submodel
outputs=output_loss)
return keras_model, transformer_encoder
class BertSquadLogitsLayer(tf.keras.layers.Layer):
......
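Finally, a hedged end-to-end sketch of the refactored `pretrain_model` API; the config values are placeholders, and the `modeling.BertConfig` construction is an assumption based on the `bert_modeling` import above.

# Illustrative only; a real bert_config is normally loaded from a config JSON.
bert_config = modeling.BertConfig(vocab_size=30522)
pretrain_net, encoder = pretrain_model(
    bert_config, seq_length=128, max_predictions_per_seq=20)
# `pretrain_net` is the Keras model driven by the customized training loop;
# `encoder` is the transformer encoder sub-model exported through
# sub_model_export_name in run_pretraining.py.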