Commit 1373e294 authored by xyzhou-puck

add leveldb reader for bert

Parent b2f94aa8
bert_config_path: "./config/bert_config.json"
init_checkpoint: None
init_pretraining_params: None
checkpoints: "./saved_model"
epoch: 3
learning_rate: 0.0001
lr_scheduler: "linear_warmup_decay"
weight_decay: 0.01
warmup_proportion: 0.1
save_steps: 100000
validation_steps: 100000
loss_scaling: 1.0
skip_steps: 100
data_dir: None
vocab_path: None
max_seq_len: 512
batch_size: 32
in_tokens: False
do_lower_case: True
random_seed: 5512
use_cuda: False
shuffle: True
do_train: True
do_test: True
use_data_parallel: False
verbose: False
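The fine-tuning script below reads these settings through hapi's Config helper. A minimal sketch of that loading step, assuming Config exposes the YAML keys as attributes after build() (the flags in the run script at the end override individual values):

from hapi.configure import Config

config = Config(yaml_file="./bert.yaml")
config.build()    # resolve the YAML defaults (plus any command-line overrides)
config.Print()    # dump the effective settings
print(config.learning_rate, config.max_seq_len, config.use_cuda)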
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT fine-tuning in Paddle Dygraph Mode."""
import paddle.fluid as fluid
from hapi.metrics import Accuracy
from hapi.configure import Config
from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
from cls import ClsModelLayer
import hapi.text.tokenizer.tokenization as tokenization
from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample


def train():
    config = Config(yaml_file="./bert.yaml")
    config.build()
    config.Print()

    device = set_device("gpu" if config.use_cuda else "cpu")
    fluid.enable_dygraph(device)

    bert_config = BertConfig(config.bert_config_path)
    bert_config.print_config()

    trainer_count = fluid.dygraph.parallel.Env().nranks

    tokenizer = tokenization.FullTokenizer(
        vocab_file=config.vocab_path, do_lower_case=config.do_lower_case)

    def mnli_line_processor(line_id, line):
        # Skip the MNLI TSV header row.
        if line_id == "0":
            return None
        uid = tokenization.convert_to_unicode(line[0])
        text_a = tokenization.convert_to_unicode(line[8])
        text_b = tokenization.convert_to_unicode(line[9])
        label = tokenization.convert_to_unicode(line[-1])
        if label not in ["contradiction", "entailment", "neutral"]:
            label = "contradiction"
        return BertInputExample(
            uid=uid, text_a=text_a, text_b=text_b, label=label)

    bert_dataloader = BertDataLoader(
        "./data/glue_data/MNLI/train.tsv",
        tokenizer, ["contradiction", "entailment", "neutral"],
        max_seq_length=64,
        batch_size=32,
        line_processor=mnli_line_processor,
        mode="leveldb")

    num_train_examples = len(bert_dataloader.dataset)
    max_train_steps = config.epoch * num_train_examples // config.batch_size // trainer_count
    warmup_steps = int(max_train_steps * config.warmup_proportion)

    print("Trainer count: %d" % trainer_count)
    print("Num train examples: %d" % num_train_examples)
    print("Max train steps: %d" % max_train_steps)
    print("Num warmup steps: %d" % warmup_steps)

    inputs = [
        Input([None, None], 'int64', name='src_ids'),
        Input([None, None], 'int64', name='pos_ids'),
        Input([None, None], 'int64', name='sent_ids'),
        Input([None, None], 'float32', name='input_mask')
    ]
    labels = [Input([None, 1], 'int64', name='label')]

    cls_model = ClsModelLayer(
        config,
        bert_config,
        len(["contradiction", "entailment", "neutral"]),
        is_training=True,
        return_pooled_out=True)

    optimizer = Optimizer(
        warmup_steps=warmup_steps,
        num_train_steps=max_train_steps,
        learning_rate=config.learning_rate,
        model_cls=cls_model,
        weight_decay=config.weight_decay,
        scheduler=config.lr_scheduler,
        loss_scaling=config.loss_scaling,
        parameter_list=cls_model.parameters())

    cls_model.prepare(
        optimizer,
        SoftmaxWithCrossEntropy(),
        Accuracy(topk=(1, 2)),
        inputs,
        labels,
        device=device)

    cls_model.bert_layer.init_parameters(
        config.init_pretraining_params, verbose=config.verbose)

    cls_model.fit(train_data=bert_dataloader.dataloader, epochs=config.epoch)

    return cls_model


if __name__ == '__main__':
    cls_model = train()
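For comparison, a sketch of building the same loader with the existing in-memory reader. It reuses the tokenizer and mnli_line_processor defined inside train(); only the mode argument differs from the leveldb variant above, and the dev-set path is an assumption (the standard GLUE MNLI dev file), not part of this commit:

# Sketch only: same call as in train(), but with the all_in_memory reader.
# "./data/glue_data/MNLI/dev_matched.tsv" is an assumed path for illustration.
dev_dataloader = BertDataLoader(
    "./data/glue_data/MNLI/dev_matched.tsv",
    tokenizer, ["contradiction", "entailment", "neutral"],
    max_seq_length=64,
    batch_size=32,
    line_processor=mnli_line_processor,
    mode="all_in_memory")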
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, Layer
from hapi.text.bert import BertEncoder
from hapi.model import Model


class ClsModelLayer(Model):
    """
    Classification model: a BERT encoder followed by a single linear layer
    over the pooled [CLS] representation.
    """

    def __init__(self,
                 args,
                 config,
                 num_labels,
                 is_training=True,
                 return_pooled_out=True,
                 use_fp16=False):
        super(ClsModelLayer, self).__init__()
        self.config = config
        self.is_training = is_training
        self.use_fp16 = use_fp16
        self.loss_scaling = args.loss_scaling

        self.bert_layer = BertEncoder(
            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
        self.cls_fc = Linear(
            input_dim=self.config["hidden_size"],
            output_dim=num_labels,
            param_attr=fluid.ParamAttr(
                name="cls_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))

    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
        """
        Run the encoder and return the classification logits.
        """
        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
                                                     sentence_ids, input_mask)
        cls_feats = fluid.layers.dropout(
            x=next_sent_feat,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")
        logits = self.cls_fc(cls_feats)
        return logits
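ClsModelLayer returns raw logits; the loss is attached separately via cls_model.prepare(..., SoftmaxWithCrossEntropy(), ...) in bert_classifier.py. A standalone sketch of the equivalent computation with fluid's fused softmax-with-cross-entropy op, assuming hapi's SoftmaxWithCrossEntropy behaves like it:

import numpy as np
import paddle.fluid as fluid

fluid.enable_dygraph()
# Fake logits for a batch of 4 examples over the 3 MNLI labels.
logits = fluid.dygraph.to_variable(np.random.rand(4, 3).astype("float32"))
label = fluid.dygraph.to_variable(np.array([[0], [2], [1], [1]], dtype="int64"))
loss = fluid.layers.softmax_with_cross_entropy(logits, label)
print(fluid.layers.mean(loss).numpy())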
This diff is collapsed.
#!/bin/bash
BERT_BASE_PATH="./data/pretrained_models/uncased_L-12_H-768_A-12/"
TASK_NAME='MNLI'
DATA_PATH="./data/glue_data/MNLI/"
CKPT_PATH="./data/saved_model/mnli_models"
export CUDA_VISIBLE_DEVICES=7
# start fine-tuning
python3.7 bert_classifier.py \
--use_cuda true \
--do_train true \
--do_test true \
--batch_size 64 \
--init_pretraining_params ${BERT_BASE_PATH}/dygraph_params/ \
--data_dir ${DATA_PATH} \
--vocab_path ${BERT_BASE_PATH}/vocab.txt \
--checkpoints ${CKPT_PATH} \
--save_steps 1000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--validation_steps 100 \
--epoch 3 \
--max_seq_len 128 \
--bert_config_path ${BERT_BASE_PATH}/bert_config.json \
--learning_rate 5e-5 \
--skip_steps 10 \
--shuffle true
@@ -19,6 +19,7 @@ import csv
import glob
import tarfile
import itertools
import leveldb
from functools import partial
import numpy as np
@@ -167,10 +168,14 @@ class SingleSentenceDataset(Dataset):
        assert isinstance(mode,
                          str), "mode of SingleSentenceDataset should be str"
        assert mode in [
            "all_in_memory", "leveldb"
        ], "mode of SingleSentenceDataset should be in [all_in_memory, leveldb], but get" % mode
            "all_in_memory", "leveldb", "streaming"
        ], "mode of SingleSentenceDataset should be in [all_in_memory, leveldb, streaming], but got %s" % mode

        self.delimiter = None
        self.mode = mode
        self.examples = []
        self._db = None
        self._line_processor = None

    def load_all_data_in_memory(self,
                                input_file,
@@ -202,13 +207,87 @@ class SingleSentenceDataset(Dataset):
                tokenizer)
            self.examples.append(input_feature)

    def prepare_leveldb(self,
                        input_file,
                        leveldb_file,
                        label_list,
                        max_seq_length,
                        tokenizer,
                        line_processor=None,
                        delimiter="\t",
                        quotechar=None):
        def default_line_processor(line_id, line):
            assert len(line) == 2
            text_a = line[0]
            label = line[1]
            return BertInputExample(
                str(line_id), text_a=text_a, text_b=None, label=label)

        if line_processor is None:
            line_processor = default_line_processor

        if not os.path.exists(leveldb_file):
            print("putting data %s into leveldb %s" %
                  (input_file, leveldb_file))
            _example_num = 0
            _db = leveldb.LevelDB(leveldb_file, create_if_missing=True)
            with io.open(input_file, "r", encoding="utf8") as f:
                reader = csv.reader(
                    f, delimiter=delimiter, quotechar=quotechar)
                line_id = 0
                for (_line_id, line) in enumerate(reader):
                    # Skip lines the processor rejects (e.g. the header row).
                    if line_processor(str(_line_id), line) is None:
                        continue
                    line_str = delimiter.join(line)
                    _db.Put(
                        str(line_id).encode("utf8"), line_str.encode("utf8"))
                    line_id += 1
                    _example_num += 1
            _db.Put("_example_num_".encode("utf8"),
                    str(_example_num).encode("utf8"))
        else:
            _db = leveldb.LevelDB(leveldb_file, create_if_missing=False)

        self.label_list = label_list
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        self.delimiter = delimiter
        self._db = _db
        self._line_processor = line_processor

    def __getitem__(self, idx):
        return self.examples[idx].input_ids, self.examples[
            idx].pos_ids, self.examples[idx].segment_ids, self.examples[
                idx].label_id
        if self.mode == "all_in_memory":
            return self.examples[idx].input_ids, self.examples[
                idx].pos_ids, self.examples[idx].segment_ids, self.examples[
                    idx].label_id

        if self.mode == "leveldb":
            assert self._db is not None, "you should call prepare_leveldb before you run the dataloader"
            line_str = self._db.Get(str(idx).encode("utf8"))
            line_str = line_str.decode("utf8")
            line = line_str.split(self.delimiter)
            input_example = self._line_processor(str(idx + 1), line)
            input_example = convert_single_example(
                str(idx + 1), input_example, self.label_list,
                self.max_seq_length, self.tokenizer)
            return input_example.input_ids, input_example.pos_ids, input_example.segment_ids, input_example.label_id

    def __len__(self):
        return len(self.examples)
        if self.mode == "all_in_memory":
            return len(self.examples)

        if self.mode == "leveldb":
            assert self._db is not None, "you should call prepare_leveldb before you run the dataloader"
            example_num = self._db.Get("_example_num_".encode("utf8"))
            example_num = example_num.decode("utf8")
            return int(example_num)
class SentencePairDataset(Dataset):
@@ -299,6 +378,7 @@ class BertDataLoader(object):
                 shuffle=False,
                 drop_last=False,
                 mode="all_in_memory",
                 leveldb_file="./leveldb",
                 line_processor=None,
                 delimiter="\t",
                 quotechar=None,
@@ -314,8 +394,10 @@ class BertDataLoader(object):
                input_file, label_list, max_seq_length, tokenizer,
                line_processor, delimiter, quotechar)
        elif mode == "leveldb":
            #TODO add leveldb reader
            pass
            #prepare_leveldb(self, input_file, leveldb_file, label_list, max_seq_length, tokenizer, line_processor=None, delimiter="\t", quotechar=None):
            self.dataset.prepare_leveldb(input_file, leveldb_file, label_list,
                                         max_seq_length, tokenizer,
                                         line_processor, delimiter, quotechar)
        else:
            raise ValueError("mode should be in [all_in_memory, leveldb]")
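A small, standalone sketch of the cache layout that prepare_leveldb creates: each usable TSV line is stored under its 0-based example index, and the reserved key _example_num_ holds the total example count. This only inspects an already-built ./leveldb directory (the default leveldb_file) using the same leveldb module the reader imports; it is for illustration, not part of the commit:

import leveldb

db = leveldb.LevelDB("./leveldb", create_if_missing=False)

# Reserved key written once after the cache is filled.
num_examples = int(db.Get("_example_num_".encode("utf8")).decode("utf8"))
# Each example is the original delimiter-joined TSV line, keyed by its index.
first_row = db.Get("0".encode("utf8")).decode("utf8").split("\t")

print("examples in cache:", num_examples)
print("first cached row (truncated):", first_row[:3])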