Merge pull request #48 from xyzhou-puck/master

refine text.py

Merge pull request #48 from xyzhou-puck/master
refine text.py
e0f5c55d · pkpk · GitHub · ed14907e · 1fac53aa · e0f5c55d
10 changed files
--- a/examples/bert/bert_classifier.py
+++ b/examples/bert/bert_classifier.py
@@ -16,14 +16,60 @@
 import paddle.fluid as fluid
 from hapi.metrics import Accuracy
 from hapi.configure import Config
+from hapi.text.bert import BertEncoder
+from paddle.fluid.dygraph import Linear, Layer
 from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
-from cls import ClsModelLayer
 import hapi.text.tokenizer.tokenization as tokenization
 from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample
-def train():
+class ClsModelLayer(Model):
+    """
+    Classification model: a BertEncoder followed by dropout and a linear
+    layer that produces `num_labels` outputs per example.
+    """
+    def __init__(self,
+                 args,
+                 config,
+                 num_labels,
+                 return_pooled_out=True,
+                 use_fp16=False):
+        """
+        Build the encoder and the classification layer.
+
+        Args:
+            args: run configuration; only `args.loss_scaling` is read here.
+            config: BERT configuration, indexed by key
+                (`config["hidden_size"]` is the input width of `cls_fc`).
+            num_labels: output dimension of the classification layer.
+            return_pooled_out: accepted but not used in this class; the
+                encoder is always created with `return_pooled_out=True`.
+            use_fp16: stored on the instance and forwarded to `BertEncoder`.
+        """
+        super(ClsModelLayer, self).__init__()
+        self.config = config
+        self.use_fp16 = use_fp16
+        self.loss_scaling = args.loss_scaling
+        self.bert_layer = BertEncoder(
+            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
+        # Classification layer with fixed parameter names "cls_out_w" /
+        # "cls_out_b"; weights use TruncatedNormal(scale=0.02), bias starts at 0.
+        self.cls_fc = Linear(
+            input_dim=self.config["hidden_size"],
+            output_dim=num_labels,
+            param_attr=fluid.ParamAttr(
+                name="cls_out_w",
+                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+            bias_attr=fluid.ParamAttr(
+                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
+    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
+        """
+        Run the encoder on the four inputs, apply dropout (probability 0.1,
+        "upscale_in_train") to the encoder's second output, and return the
+        result of the classification layer.
+        """
+        # enc_output is not used below; only next_sent_feat feeds the classifier.
+        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
+                                                     sentence_ids, input_mask)
+        cls_feats = fluid.layers.dropout(
+            x=next_sent_feat,
+            dropout_prob=0.1,
+            dropout_implementation="upscale_in_train")
+        pred = self.cls_fc(cls_feats)
+        return pred
+def main():
    config = Config(yaml_file="./bert.yaml")
    config.build()
@@ -35,8 +81,6 @@ def train():
    bert_config = BertConfig(config.bert_config_path)
    bert_config.print_config()
-    trainer_count = fluid.dygraph.parallel.Env().nranks
    tokenizer = tokenization.FullTokenizer(
        vocab_file=config.vocab_path, do_lower_case=config.do_lower_case)
@@ -52,14 +96,24 @@ def train():
        return BertInputExample(
            uid=uid, text_a=text_a, text_b=text_b, label=label)
-    bert_dataloader = BertDataLoader(
+    train_dataloader = BertDataLoader(
        "./data/glue_data/MNLI/train.tsv",
        tokenizer, ["contradiction", "entailment", "neutral"],
-        max_seq_length=64,
+        max_seq_length=config.max_seq_len,
-        batch_size=32,
+        batch_size=config.batch_size,
        line_processor=mnli_line_processor)
-    num_train_examples = len(bert_dataloader.dataset)
+    dev_dataloader = BertDataLoader(
+        "./data/glue_data/MNLI/dev_matched.tsv",
+        tokenizer, ["contradiction", "entailment", "neutral"],
+        max_seq_length=config.max_seq_len,
+        batch_size=config.batch_size,
+        line_processor=mnli_line_processor,
+        shuffle=False,
+        phase="predict")
+    trainer_count = fluid.dygraph.parallel.Env().nranks
+    num_train_examples = len(train_dataloader.dataset)
    max_train_steps = config.epoch * num_train_examples // config.batch_size // trainer_count
    warmup_steps = int(max_train_steps * config.warmup_proportion)
@@ -82,7 +136,6 @@ def train():
        config,
        bert_config,
        len(["contradiction", "entailment", "neutral"]),
-        is_training=True,
        return_pooled_out=True)
    optimizer = Optimizer(
@@ -106,10 +159,15 @@ def train():
    cls_model.bert_layer.init_parameters(
        config.init_pretraining_params, verbose=config.verbose)
-    cls_model.fit(train_data=bert_dataloader.dataloader, epochs=config.epoch)
+    # do train
+    cls_model.fit(train_data=train_dataloader.dataloader,
+                  epochs=config.epoch,
+                  save_dir=config.checkpoints)
-    return cls_model
+    # do eval
+    # Evaluate on the MNLI dev_matched loader built earlier in this function
+    # as `dev_dataloader`; the visible hunks define no `test_dataloader`, so
+    # the previous reference would raise NameError.
+    cls_model.evaluate(
+        eval_data=dev_dataloader.dataloader, batch_size=config.batch_size)
 if __name__ == '__main__':
-    cls_model = train()
+    main()
--- a/examples/bert/cls.py
+++ b/examples/bert/cls.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"dygraph transformer layers"
-import six
-import json
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.dygraph import Linear, Layer
-from hapi.text.bert import BertEncoder
-from hapi.model import Model
-class ClsModelLayer(Model):
-    """
-    classify model
-    """
-    def __init__(self,
-                 args,
-                 config,
-                 num_labels,
-                 is_training=True,
-                 return_pooled_out=True,
-                 use_fp16=False):
-        super(ClsModelLayer, self).__init__()
-        self.config = config
-        self.is_training = is_training
-        self.use_fp16 = use_fp16
-        self.loss_scaling = args.loss_scaling
-        self.bert_layer = BertEncoder(
-            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
-        self.cls_fc = Linear(
-            input_dim=self.config["hidden_size"],
-            output_dim=num_labels,
-            param_attr=fluid.ParamAttr(
-                name="cls_out_w",
-                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
-            bias_attr=fluid.ParamAttr(
-                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
-    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
-        """
-        forward
-        """
-        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
-                                                     sentence_ids, input_mask)
-        cls_feats = fluid.layers.dropout(
-            x=next_sent_feat,
-            dropout_prob=0.1,
-            dropout_implementation="upscale_in_train")
-        logits = self.cls_fc(cls_feats)
-        return logits
--- a/examples/bert_leveldb/bert.yaml
+++ b/examples/bert_leveldb/bert.yaml
@@ -18,7 +18,7 @@ batch_size: 32
 in_tokens: False
 do_lower_case: True
 random_seed: 5512
-use_cuda: False
+use_cuda: True
 shuffle: True
 do_train: True
 do_test: True

--- a/examples/bert_leveldb/bert_classifier.py
+++ b/examples/bert_leveldb/bert_classifier.py
@@ -16,14 +16,60 @@
 import paddle.fluid as fluid
 from hapi.metrics import Accuracy
 from hapi.configure import Config
+from hapi.text.bert import BertEncoder
+from paddle.fluid.dygraph import Linear, Layer
 from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
-from cls import ClsModelLayer
 import hapi.text.tokenizer.tokenization as tokenization
 from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample
-def train():
+class ClsModelLayer(Model):
+    """
+    Classification model: a BertEncoder followed by dropout and a linear
+    layer that produces `num_labels` outputs per example.
+    """
+    def __init__(self,
+                 args,
+                 config,
+                 num_labels,
+                 return_pooled_out=True,
+                 use_fp16=False):
+        """
+        Build the encoder and the classification layer.
+
+        Args:
+            args: run configuration; only `args.loss_scaling` is read here.
+            config: BERT configuration, indexed by key
+                (`config["hidden_size"]` is the input width of `cls_fc`).
+            num_labels: output dimension of the classification layer.
+            return_pooled_out: accepted but not used in this class; the
+                encoder is always created with `return_pooled_out=True`.
+            use_fp16: stored on the instance and forwarded to `BertEncoder`.
+        """
+        super(ClsModelLayer, self).__init__()
+        self.config = config
+        self.use_fp16 = use_fp16
+        self.loss_scaling = args.loss_scaling
+        self.bert_layer = BertEncoder(
+            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
+        # Classification layer with fixed parameter names "cls_out_w" /
+        # "cls_out_b"; weights use TruncatedNormal(scale=0.02), bias starts at 0.
+        self.cls_fc = Linear(
+            input_dim=self.config["hidden_size"],
+            output_dim=num_labels,
+            param_attr=fluid.ParamAttr(
+                name="cls_out_w",
+                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+            bias_attr=fluid.ParamAttr(
+                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
+    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
+        """
+        Run the encoder on the four inputs, apply dropout (probability 0.1,
+        "upscale_in_train") to the encoder's second output, and return the
+        result of the classification layer.
+        """
+        # enc_output is not used below; only next_sent_feat feeds the classifier.
+        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
+                                                     sentence_ids, input_mask)
+        cls_feats = fluid.layers.dropout(
+            x=next_sent_feat,
+            dropout_prob=0.1,
+            dropout_implementation="upscale_in_train")
+        pred = self.cls_fc(cls_feats)
+        return pred
+def main():
    config = Config(yaml_file="./bert.yaml")
    config.build()
@@ -35,8 +81,6 @@ def train():
    bert_config = BertConfig(config.bert_config_path)
    bert_config.print_config()
-    trainer_count = fluid.dygraph.parallel.Env().nranks
    tokenizer = tokenization.FullTokenizer(
        vocab_file=config.vocab_path, do_lower_case=config.do_lower_case)
@@ -52,15 +96,26 @@ def train():
        return BertInputExample(
            uid=uid, text_a=text_a, text_b=text_b, label=label)
-    bert_dataloader = BertDataLoader(
+    train_dataloader = BertDataLoader(
        "./data/glue_data/MNLI/train.tsv",
        tokenizer, ["contradiction", "entailment", "neutral"],
-        max_seq_length=64,
+        max_seq_length=config.max_seq_len,
-        batch_size=32,
+        batch_size=config.batch_size,
        line_processor=mnli_line_processor,
-        mode="leveldb")
+        mode="leveldb",
+        phase="train")
-    num_train_examples = len(bert_dataloader.dataset)
+    dev_dataloader = BertDataLoader(
+        "./data/glue_data/MNLI/dev_matched.tsv",
+        tokenizer, ["contradiction", "entailment", "neutral"],
+        max_seq_length=config.max_seq_len,
+        batch_size=config.batch_size,
+        line_processor=mnli_line_processor,
+        shuffle=False,
+        phase="predict")
+    trainer_count = fluid.dygraph.parallel.Env().nranks
+    num_train_examples = len(train_dataloader.dataset)
    max_train_steps = config.epoch * num_train_examples // config.batch_size // trainer_count
    warmup_steps = int(max_train_steps * config.warmup_proportion)
@@ -83,7 +138,6 @@ def train():
        config,
        bert_config,
        len(["contradiction", "entailment", "neutral"]),
-        is_training=True,
        return_pooled_out=True)
    optimizer = Optimizer(
@@ -107,10 +161,15 @@ def train():
    cls_model.bert_layer.init_parameters(
        config.init_pretraining_params, verbose=config.verbose)
-    cls_model.fit(train_data=bert_dataloader.dataloader, epochs=config.epoch)
+    # do train
+    cls_model.fit(train_data=train_dataloader.dataloader,
+                  epochs=config.epoch,
+                  save_dir=config.checkpoints)
-    return cls_model
+    # do eval
+    # Evaluate on the MNLI dev_matched loader built earlier in this function
+    # as `dev_dataloader`; the visible hunks define no `test_dataloader`, so
+    # the previous reference would raise NameError.
+    cls_model.evaluate(
+        eval_data=dev_dataloader.dataloader, batch_size=config.batch_size)
 if __name__ == '__main__':
-    cls_model = train()
+    main()
--- a/examples/bert_leveldb/cls.py
+++ b/examples/bert_leveldb/cls.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"dygraph transformer layers"
-import six
-import json
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.dygraph import Linear, Layer
-from hapi.text.bert import BertEncoder
-from hapi.model import Model
-class ClsModelLayer(Model):
-    """
-    classify model
-    """
-    def __init__(self,
-                 args,
-                 config,
-                 num_labels,
-                 is_training=True,
-                 return_pooled_out=True,
-                 use_fp16=False):
-        super(ClsModelLayer, self).__init__()
-        self.config = config
-        self.is_training = is_training
-        self.use_fp16 = use_fp16
-        self.loss_scaling = args.loss_scaling
-        self.bert_layer = BertEncoder(
-            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
-        self.cls_fc = Linear(
-            input_dim=self.config["hidden_size"],
-            output_dim=num_labels,
-            param_attr=fluid.ParamAttr(
-                name="cls_out_w",
-                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
-            bias_attr=fluid.ParamAttr(
-                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
-    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
-        """
-        forward
-        """
-        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
-                                                     sentence_ids, input_mask)
-        cls_feats = fluid.layers.dropout(
-            x=next_sent_feat,
-            dropout_prob=0.1,
-            dropout_implementation="upscale_in_train")
-        logits = self.cls_fc(cls_feats)
-        return logits
--- a/examples/bert_leveldb/nohup.out
+++ b/examples/bert_leveldb/nohup.out
--- a/examples/bert_leveldb/run_classifier_multi_gpu.sh
+++ b/examples/bert_leveldb/run_classifier_multi_gpu.sh
+#!/bin/bash
+BERT_BASE_PATH="./data/pretrained_models/uncased_L-12_H-768_A-12/"
+TASK_NAME='MNLI'
+DATA_PATH="./data/glue_data/MNLI/"
+CKPT_PATH="./data/saved_model/mnli_models"
+# start fine-tuning
+python3.7 -m paddle.distributed.launch --started_port 8899 --selected_gpus=0,1,2,3 bert_classifier.py\
+    --use_cuda true \
+    --do_train true \
+    --do_test true \
+    --batch_size 64 \
+    --init_pretraining_params ${BERT_BASE_PATH}/dygraph_params/ \
+    --data_dir ${DATA_PATH} \
+    --vocab_path ${BERT_BASE_PATH}/vocab.txt \
+    --checkpoints ${CKPT_PATH} \
+    --save_steps 1000 \
+    --weight_decay  0.01 \
+    --warmup_proportion 0.1 \
+    --validation_steps 100 \
+    --epoch 3 \
+    --max_seq_len 128 \
+    --bert_config_path ${BERT_BASE_PATH}/bert_config.json \
+    --learning_rate 5e-5 \
+    --skip_steps 10 \
+    --shuffle true
--- a/examples/bert_leveldb/run_classifier_single_gpu.sh
+++ b/examples/bert_leveldb/run_classifier_single_gpu.sh
@@ -4,7 +4,7 @@ TASK_NAME='MNLI'
 DATA_PATH="./data/glue_data/MNLI/"
 CKPT_PATH="./data/saved_model/mnli_models"
-export CUDA_VISIBLE_DEVICES=7
+export CUDA_VISIBLE_DEVICES=0
 # start fine-tuning
 python3.7 bert_classifier.py\

--- a/hapi/text/bert/dataloader.py
+++ b/hapi/text/bert/dataloader.py
@@ -30,6 +30,7 @@ from hapi.distributed import DistributedBatchSampler
 from hapi.text.bert.data_processor import DataProcessor, XnliProcessor, ColaProcessor, MrpcProcessor, MnliProcessor
 from hapi.text.bert.batching import prepare_batch_data
 import hapi.text.tokenizer.tokenization as tokenization
+from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
 __all__ = [
    'BertInputExample', 'BertInputFeatures', 'SingleSentenceDataset',
@@ -227,6 +228,9 @@ class SingleSentenceDataset(Dataset):
        if line_processor is None:
            line_processor = default_line_processor
+        if ParallelEnv().nranks > 1:
+            leveldb_file = leveldb_file + "_" + str(ParallelEnv().local_rank)
        if not os.path.exists(leveldb_file):
            print("putting data %s into leveldb %s" %
                  (input_file, leveldb_file))
@@ -384,7 +388,12 @@ class BertDataLoader(object):
                 quotechar=None,
                 device=fluid.CPUPlace(),
                 num_workers=0,
-                 return_list=True):
+                 return_list=True,
+                 phase="train"):
+        assert phase in [
+            "train", "predict", "test"
+        ], "phase of BertDataLoader should be in [train, predict, test], but get %s" % phase
        self.dataset = SingleSentenceDataset(tokenizer, label_list,
                                             max_seq_length, mode)
@@ -394,15 +403,21 @@ class BertDataLoader(object):
                input_file, label_list, max_seq_length, tokenizer,
                line_processor, delimiter, quotechar)
        elif mode == "leveldb":
-            #prepare_leveldb(self, input_file, leveldb_file, label_list, max_seq_length, tokenizer, line_processor=None, delimiter="\t", quotechar=None):
            self.dataset.prepare_leveldb(input_file, leveldb_file, label_list,
                                         max_seq_length, tokenizer,
                                         line_processor, delimiter, quotechar)
        else:
            raise ValueError("mode should be in [all_in_memory, leveldb]")
+        if phase == "train":
            self.sampler = DistributedBatchSampler(
                self.dataset, batch_size, shuffle=shuffle, drop_last=drop_last)
+        elif phase == "test" or phase == "predict":
+            self.sampler = BatchSampler(
+                dataset=self.dataset,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                drop_last=drop_last)
        self.dataloader = DataLoader(
            dataset=self.dataset,

--- a/hapi/text/text.py
+++ b/hapi/text/text.py