diff --git a/README.md b/README.md
index 93003b5ed59fc3a7dfe8473f8f0019a6f7c72724..17630d6822359e81f92659b25b738695b625ea9e 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ Mongolian \*\*\*\*\**
 
 We uploaded a new multilingual model which does *not* perform any normalization
 on the input (no lower casing, accent stripping, or Unicode normalization), and
-additionally inclues Thai and Mongolian.
+additionally includes Thai and Mongolian.
 
 **It is recommended to use this version for developing multilingual models,
 especially on languages with non-Latin alphabets.**
@@ -38,8 +38,9 @@ repository.
 
 We have made two new BERT models available:
 
-*   **[`BERT-Base, Multilingual`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**:
-    102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Base, Multilingual`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)
+    (Not recommended, use `Multilingual Cased` instead)**: 102 languages,
+    12-layer, 768-hidden, 12-heads, 110M parameters
 *   **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
     Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
     parameters
@@ -228,8 +229,9 @@ The links to the models are here (right-click, 'Save link as...' on the name):
     24-layer, 1024-hidden, 16-heads, 340M parameters
 *   **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**:
     104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-*   **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**:
-    102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)
+    (Not recommended, use `Multilingual Cased` instead)**: 102 languages,
+    12-layer, 768-hidden, 12-heads, 110M parameters
 *   **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
     Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
     parameters
diff --git a/create_pretraining_data.py b/create_pretraining_data.py
index 9ba79ddb6f63f2e53aa9dfb10c6efdd0272192aa..1715ea36e3df720f22c3d2d235bdaec6bc63d16b 100644
--- a/create_pretraining_data.py
+++ b/create_pretraining_data.py
@@ -20,9 +20,8 @@ from __future__ import print_function
 
 import collections
 import random
-
-import tokenization
 import tensorflow as tf
+import tokenization
 
 flags = tf.flags
 
diff --git a/multilingual.md b/multilingual.md
index e355ec8fc21d24788e63e6d2f0f4b4e4b5a1308d..f1480ffda01ddefb980bcb5b70e9e24dd5d199d3 100644
--- a/multilingual.md
+++ b/multilingual.md
@@ -297,7 +297,7 @@ chosen because they are the top 100 languages with the largest Wikipedias:
 *   Volapük
 *   Waray-Waray
 *   Welsh
-*   West
+*   West Frisian
 *   Western Punjabi
 *   Yoruba
 
diff --git a/optimization.py b/optimization.py
index b40348b3820f99aa09944fdc79066557fae64a81..d33dabd91dd965781d3dd912e0727ba23d7c2918 100644
--- a/optimization.py
+++ b/optimization.py
@@ -76,6 +76,9 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
   train_op = optimizer.apply_gradients(
       zip(grads, tvars), global_step=global_step)
 
+  # Normally the global step update is done inside of `apply_gradients`.
+  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
+  # a different optimizer, you should probably take this line out.
   new_global_step = global_step + 1
   train_op = tf.group(train_op, [global_step.assign(new_global_step)])
   return train_op
@@ -137,7 +140,7 @@ class AdamWeightDecayOptimizer(tf.train.Optimizer):
       # the correct way of using L2 regularization/weight decay with Adam,
       # since that will interact with the m and v parameters in strange ways.
       #
-      # Instead we want ot decay the weights in a manner that doesn't interact
+      # Instead we want to decay the weights in a manner that doesn't interact
       # with the m/v parameters. This is equivalent to adding the square
       # of the weights to the loss with plain (non-momentum) SGD.
       if self._do_use_weight_decay(param_name):
diff --git a/run_classifier.py b/run_classifier.py
index a45fac8444f3d3b7b82d2c5054672a0a97900d84..817b14720c235a3199b8419c3543bf312f544a5c 100644
--- a/run_classifier.py
+++ b/run_classifier.py
@@ -145,14 +145,33 @@ class InputExample(object):
     self.label = label
 
 
+class PaddingInputExample(object):
+  """Fake example so the num input examples is a multiple of the batch size.
+
+  When running eval/predict on the TPU, we need to pad the number of examples
+  to be a multiple of the batch size, because the TPU requires a fixed batch
+  size. The alternative is to drop the last batch, which is bad because it means
+  the entire output data won't be generated.
+
+  We use this class instead of `None` because treating `None` as padding
+  batches could cause silent errors.
+  """
+
+
 class InputFeatures(object):
   """A single set of features of data."""
 
-  def __init__(self, input_ids, input_mask, segment_ids, label_id):
+  def __init__(self,
+               input_ids,
+               input_mask,
+               segment_ids,
+               label_id,
+               is_real_example=True):
     self.input_ids = input_ids
     self.input_mask = input_mask
     self.segment_ids = segment_ids
     self.label_id = label_id
+    self.is_real_example = is_real_example
 
 
 class DataProcessor(object):
@@ -358,6 +377,15 @@ class ColaProcessor(DataProcessor):
 def convert_single_example(ex_index, example, label_list, max_seq_length,
                            tokenizer):
   """Converts a single `InputExample` into a single `InputFeatures`."""
+
+  if isinstance(example, PaddingInputExample):
+    return InputFeatures(
+        input_ids=[0] * max_seq_length,
+        input_mask=[0] * max_seq_length,
+        segment_ids=[0] * max_seq_length,
+        label_id=0,
+        is_real_example=False)
+
   label_map = {}
   for (i, label) in enumerate(label_list):
     label_map[label] = i
@@ -393,7 +421,7 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
   # it easier for the model to learn the concept of sequences.
   #
   # For classification tasks, the first vector (corresponding to [CLS]) is
-  # used as as the "sentence vector". Note that this only makes sense because
+  # used as the "sentence vector". Note that this only makes sense because
   # the entire model is fine-tuned.
   tokens = []
   segment_ids = []
@@ -443,7 +471,8 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
       input_ids=input_ids,
       input_mask=input_mask,
       segment_ids=segment_ids,
-      label_id=label_id)
+      label_id=label_id,
+      is_real_example=True)
   return feature
 
 
@@ -469,9 +498,12 @@ def file_based_convert_examples_to_features(
     features["input_mask"] = create_int_feature(feature.input_mask)
     features["segment_ids"] = create_int_feature(feature.segment_ids)
     features["label_ids"] = create_int_feature([feature.label_id])
+    features["is_real_example"] = create_int_feature(
+        [int(feature.is_real_example)])
 
     tf_example = tf.train.Example(features=tf.train.Features(feature=features))
     writer.write(tf_example.SerializeToString())
+  writer.close()
 
 
 def file_based_input_fn_builder(input_file, seq_length, is_training,
@@ -483,6 +515,7 @@ def file_based_input_fn_builder(input_file, seq_length, is_training,
       "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
       "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
       "label_ids": tf.FixedLenFeature([], tf.int64),
+      "is_real_example": tf.FixedLenFeature([], tf.int64),
   }
 
   def _decode_record(record, name_to_features):
@@ -599,6 +632,11 @@ def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
     input_mask = features["input_mask"]
     segment_ids = features["segment_ids"]
     label_ids = features["label_ids"]
+    is_real_example = None
+    if "is_real_example" in features:
+      is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
+    else:
+      is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)
 
     is_training = (mode == tf.estimator.ModeKeys.TRAIN)
 
@@ -643,16 +681,18 @@ def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
           scaffold_fn=scaffold_fn)
     elif mode == tf.estimator.ModeKeys.EVAL:
 
-      def metric_fn(per_example_loss, label_ids, logits):
+      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
         predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
-        accuracy = tf.metrics.accuracy(label_ids, predictions)
-        loss = tf.metrics.mean(per_example_loss)
+        accuracy = tf.metrics.accuracy(
+            labels=label_ids, predictions=predictions, weights=is_real_example)
+        loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
         return {
             "eval_accuracy": accuracy,
             "eval_loss": loss,
         }
 
-      eval_metrics = (metric_fn, [per_example_loss, label_ids, logits])
+      eval_metrics = (metric_fn,
+                      [per_example_loss, label_ids, logits, is_real_example])
       output_spec = tf.contrib.tpu.TPUEstimatorSpec(
           mode=mode,
           loss=total_loss,
@@ -660,7 +700,9 @@ def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
           scaffold_fn=scaffold_fn)
     else:
       output_spec = tf.contrib.tpu.TPUEstimatorSpec(
-          mode=mode, predictions=probabilities, scaffold_fn=scaffold_fn)
+          mode=mode,
+          predictions={"probabilities": probabilities},
+          scaffold_fn=scaffold_fn)
     return output_spec
 
   return model_fn
@@ -748,6 +790,9 @@ def main(_):
       "xnli": XnliProcessor,
   }
 
+  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
+                                                FLAGS.init_checkpoint)
+
   if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
     raise ValueError(
         "At least one of `do_train`, `do_eval` or `do_predict' must be True.")
@@ -836,12 +881,24 @@ def main(_):
 
   if FLAGS.do_eval:
     eval_examples = processor.get_dev_examples(FLAGS.data_dir)
+    num_actual_eval_examples = len(eval_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on. These do NOT count towards the metric (all tf.metrics
+      # support a per-instance weight, and these get a weight of 0.0).
+      while len(eval_examples) % FLAGS.eval_batch_size != 0:
+        eval_examples.append(PaddingInputExample())
+
     eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
     file_based_convert_examples_to_features(
         eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
 
     tf.logging.info("***** Running evaluation *****")
-    tf.logging.info("  Num examples = %d", len(eval_examples))
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(eval_examples), num_actual_eval_examples,
+                    len(eval_examples) - num_actual_eval_examples)
     tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
 
     # This tells the estimator to run through the entire set.
@@ -849,9 +906,8 @@ def main(_):
     # However, if running eval on the TPU, you will need to specify the
     # number of steps.
     if FLAGS.use_tpu:
-      # Eval will be slightly WRONG on the TPU because it will truncate
-      # the last batch.
-      eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
 
     eval_drop_remainder = True if FLAGS.use_tpu else False
     eval_input_fn = file_based_input_fn_builder(
@@ -871,20 +927,26 @@ def main(_):
 
   if FLAGS.do_predict:
     predict_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_predict_examples = len(predict_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on.
+      while len(predict_examples) % FLAGS.predict_batch_size != 0:
+        predict_examples.append(PaddingInputExample())
+
     predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
     file_based_convert_examples_to_features(predict_examples, label_list,
                                             FLAGS.max_seq_length, tokenizer,
                                             predict_file)
 
     tf.logging.info("***** Running prediction*****")
-    tf.logging.info("  Num examples = %d", len(predict_examples))
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(predict_examples), num_actual_predict_examples,
+                    len(predict_examples) - num_actual_predict_examples)
     tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
 
-    if FLAGS.use_tpu:
-      # Warning: According to tpu_estimator.py Prediction on TPU is an
-      # experimental feature and hence not supported here
-      raise ValueError("Prediction in TPU not supported")
-
     predict_drop_remainder = True if FLAGS.use_tpu else False
     predict_input_fn = file_based_input_fn_builder(
         input_file=predict_file,
@@ -896,11 +958,18 @@ def main(_):
 
     output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
     with tf.gfile.GFile(output_predict_file, "w") as writer:
+      num_written_lines = 0
       tf.logging.info("***** Predict results *****")
-      for prediction in result:
+      for (i, prediction) in enumerate(result):
+        probabilities = prediction["probabilities"]
+        if i >= num_actual_predict_examples:
+          break
         output_line = "\t".join(
-            str(class_probability) for class_probability in prediction) + "\n"
+            str(class_probability)
+            for class_probability in probabilities) + "\n"
         writer.write(output_line)
+        num_written_lines += 1
+    assert num_written_lines == num_actual_predict_examples
 
 
 if __name__ == "__main__":
diff --git a/run_squad.py b/run_squad.py
index 72c5f43f9826caaec71efaa16cb7dd5cf93db553..edd4c3ed9863ea350dd8de330dbe9e84609a048c 100644
--- a/run_squad.py
+++ b/run_squad.py
@@ -1096,6 +1096,9 @@ class FeatureWriter(object):
 
 def validate_flags_or_throw(bert_config):
   """Validate the input FLAGS or throw an exception."""
+  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
+                                                FLAGS.init_checkpoint)
+
   if not FLAGS.do_train and not FLAGS.do_predict:
     raise ValueError("At least one of `do_train` or `do_predict` must be True.")
 
diff --git a/tokenization.py b/tokenization.py
index 49e902604f683aa75d83b563e781b0d59718a7ab..dc476a69859af4029005d6ef31ddfdeb1c5d547d 100644
--- a/tokenization.py
+++ b/tokenization.py
@@ -19,11 +19,62 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import re
 import unicodedata
 import six
 import tensorflow as tf
 
 
+def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
+  """Checks whether the casing config is consistent with the checkpoint name."""
+
+  # The casing has to be passed in by the user and there is no explicit check
+  # as to whether it matches the checkpoint. The casing information probably
+  # should have been stored in the bert_config.json file, but it's not, so
+  # we have to heuristically detect it to validate.
+
+  if not init_checkpoint:
+    return
+
+  m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
+  if m is None:
+    return
+
+  model_name = m.group(1)
+
+  lower_models = [
+      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
+      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
+  ]
+
+  cased_models = [
+      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
+      "multi_cased_L-12_H-768_A-12"
+  ]
+
+  is_bad_config = False
+  if model_name in lower_models and not do_lower_case:
+    is_bad_config = True
+    actual_flag = "False"
+    case_name = "lowercased"
+    opposite_flag = "True"
+
+  if model_name in cased_models and do_lower_case:
+    is_bad_config = True
+    actual_flag = "True"
+    case_name = "cased"
+    opposite_flag = "False"
+
+  if is_bad_config:
+    raise ValueError(
+        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
+        "However, `%s` seems to be a %s model, so you "
+        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
+        "how the model was pre-trained. If this error is wrong, please "
+        "just comment out this check." % (actual_flag, init_checkpoint,
+                                          model_name, case_name, opposite_flag))
+
+
 def convert_to_unicode(text):
   """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
   if six.PY3:
@@ -84,7 +135,10 @@ def load_vocab(vocab_file):
 
 def convert_by_vocab(vocab, items):
   """Converts a sequence of [tokens|ids] using the vocab."""
-  return [vocab[item] for item in items]
+  output = []
+  for item in items:
+    output.append(vocab[item])
+  return output
 
 
 def convert_tokens_to_ids(vocab, tokens):
diff --git a/tokenization_test.py b/tokenization_test.py
index 0d6d3949452b211d3caa4990b56791fa22991def..64ebec80c0996d5670625e2629461619dc7d1ba9 100644
--- a/tokenization_test.py
+++ b/tokenization_test.py
@@ -18,9 +18,9 @@ from __future__ import print_function
 
 import os
 import tempfile
-
-import tokenization
+import six
 import tensorflow as tf
+import tokenization
 
 
 class TokenizationTest(tf.test.TestCase):
@@ -31,11 +31,11 @@ class TokenizationTest(tf.test.TestCase):
         "##ing", ","
     ]
     with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
-      import six
       if six.PY2:
         vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
       else:
-        vocab_writer.write("".join([x + "\n" for x in vocab_tokens]).encode("utf-8"))
+        vocab_writer.write("".join(
+            [x + "\n" for x in vocab_tokens]).encode("utf-8"))
 
       vocab_file = vocab_writer.name
 
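Note (editor's illustration, not part of the patch): the run_classifier.py changes above pad the eval/predict inputs to a multiple of the batch size with `PaddingInputExample`s, tag every feature with `is_real_example`, and pass that flag as a metric weight so padded rows contribute 0.0 to accuracy and loss. The sketch below mirrors that idea in plain Python with no TensorFlow dependency; the `Example`, `pad_to_batch_size`, and `weighted_accuracy` names are hypothetical and exist only for this illustration.

    # A minimal sketch of the TPU padding scheme, assuming nothing beyond the
    # ideas in the patch. Names below are illustrative, not from the repo.

    class Example(object):
      def __init__(self, label, is_real_example=True):
        self.label = label
        self.is_real_example = is_real_example


    def pad_to_batch_size(examples, batch_size):
      """Appends fake examples until len(examples) is a multiple of batch_size."""
      padded = list(examples)
      while len(padded) % batch_size != 0:
        padded.append(Example(label=0, is_real_example=False))
      return padded


    def weighted_accuracy(predictions, labels, weights):
      """Accuracy with per-example weights; padded rows get weight 0.0."""
      correct = sum(w for p, l, w in zip(predictions, labels, weights) if p == l)
      total = sum(weights)
      return correct / total if total else 0.0


    if __name__ == "__main__":
      examples = pad_to_batch_size([Example(1), Example(0), Example(1)], batch_size=8)
      labels = [ex.label for ex in examples]
      weights = [1.0 if ex.is_real_example else 0.0 for ex in examples]
      predictions = [1] * len(examples)  # a dummy "model" that always predicts 1
      # 8 examples total, but accuracy is computed over the 3 real ones: 2/3.
      print(len(examples), weighted_accuracy(predictions, labels, weights))

With the input padded this way, `drop_remainder=True` on the TPU no longer discards real examples, and the final assert in the `do_predict` branch checks that exactly the actual (non-padding) examples were written to test_results.tsv.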