Commit 1373e294 authored by xyzhou-puck

add leveldb reader for bert

Parent b2f94aa8
bert_config_path: "./config/bert_config.json"
init_checkpoint: None
init_pretraining_params: None
checkpoints: "./saved_model"
epoch: 3
learning_rate: 0.0001
lr_scheduler: "linear_warmup_decay"
weight_decay: 0.01
warmup_proportion: 0.1
save_steps: 100000
validation_steps: 100000
loss_scaling: 1.0
skip_steps: 100
data_dir: None
vocab_path: None
max_seq_len: 512
batch_size: 32
in_tokens: False
do_lower_case: True
random_seed: 5512
use_cuda: False
shuffle: True
do_train: True
do_test: True
use_data_parallel: False
verbose: False
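The fine-tuning script below reads these settings through hapi's Config helper. A minimal sketch of that loading step, assuming Config exposes the YAML keys as attributes after build() (the flags in the run script at the end override individual values):

from hapi.configure import Config

config = Config(yaml_file="./bert.yaml")
config.build()    # resolve the YAML defaults (plus any command-line overrides)
config.Print()    # dump the effective settings
print(config.learning_rate, config.max_seq_len, config.use_cuda)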
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT fine-tuning in Paddle Dygraph Mode."""
import paddle.fluid as fluid
from hapi.metrics import Accuracy
from hapi.configure import Config
from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
from cls import ClsModelLayer
import hapi.text.tokenizer.tokenization as tokenization
from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample


def train():
    config = Config(yaml_file="./bert.yaml")
    config.build()
    config.Print()

    device = set_device("gpu" if config.use_cuda else "cpu")
    fluid.enable_dygraph(device)

    bert_config = BertConfig(config.bert_config_path)
    bert_config.print_config()

    trainer_count = fluid.dygraph.parallel.Env().nranks

    tokenizer = tokenization.FullTokenizer(
        vocab_file=config.vocab_path, do_lower_case=config.do_lower_case)

    def mnli_line_processor(line_id, line):
        # Skip the MNLI TSV header row.
        if line_id == "0":
            return None
        uid = tokenization.convert_to_unicode(line[0])
        text_a = tokenization.convert_to_unicode(line[8])
        text_b = tokenization.convert_to_unicode(line[9])
        label = tokenization.convert_to_unicode(line[-1])
        if label not in ["contradiction", "entailment", "neutral"]:
            label = "contradiction"
        return BertInputExample(
            uid=uid, text_a=text_a, text_b=text_b, label=label)

    bert_dataloader = BertDataLoader(
        "./data/glue_data/MNLI/train.tsv",
        tokenizer, ["contradiction", "entailment", "neutral"],
        max_seq_length=64,
        batch_size=32,
        line_processor=mnli_line_processor,
        mode="leveldb")

    num_train_examples = len(bert_dataloader.dataset)
    max_train_steps = config.epoch * num_train_examples // config.batch_size // trainer_count
    warmup_steps = int(max_train_steps * config.warmup_proportion)

    print("Trainer count: %d" % trainer_count)
    print("Num train examples: %d" % num_train_examples)
    print("Max train steps: %d" % max_train_steps)
    print("Num warmup steps: %d" % warmup_steps)

    inputs = [
        Input([None, None], 'int64', name='src_ids'),
        Input([None, None], 'int64', name='pos_ids'),
        Input([None, None], 'int64', name='sent_ids'),
        Input([None, None], 'float32', name='input_mask')
    ]
    labels = [Input([None, 1], 'int64', name='label')]

    cls_model = ClsModelLayer(
        config,
        bert_config,
        len(["contradiction", "entailment", "neutral"]),
        is_training=True,
        return_pooled_out=True)

    optimizer = Optimizer(
        warmup_steps=warmup_steps,
        num_train_steps=max_train_steps,
        learning_rate=config.learning_rate,
        model_cls=cls_model,
        weight_decay=config.weight_decay,
        scheduler=config.lr_scheduler,
        loss_scaling=config.loss_scaling,
        parameter_list=cls_model.parameters())

    cls_model.prepare(
        optimizer,
        SoftmaxWithCrossEntropy(),
        Accuracy(topk=(1, 2)),
        inputs,
        labels,
        device=device)

    cls_model.bert_layer.init_parameters(
        config.init_pretraining_params, verbose=config.verbose)

    cls_model.fit(train_data=bert_dataloader.dataloader, epochs=config.epoch)

    return cls_model


if __name__ == '__main__':
    cls_model = train()
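For comparison, a sketch of building the same loader with the existing in-memory reader. It reuses the tokenizer and mnli_line_processor defined inside train(); only the mode argument differs from the leveldb variant above, and the dev-set path is an assumption (the standard GLUE MNLI dev file), not part of this commit:

# Sketch only: same call as in train(), but with the all_in_memory reader.
# "./data/glue_data/MNLI/dev_matched.tsv" is an assumed path for illustration.
dev_dataloader = BertDataLoader(
    "./data/glue_data/MNLI/dev_matched.tsv",
    tokenizer, ["contradiction", "entailment", "neutral"],
    max_seq_length=64,
    batch_size=32,
    line_processor=mnli_line_processor,
    mode="all_in_memory")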
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, Layer
from hapi.text.bert import BertEncoder
from hapi.model import Model


class ClsModelLayer(Model):
    """
    Classification model: a BERT encoder followed by a single linear layer
    over the pooled [CLS] representation.
    """

    def __init__(self,
                 args,
                 config,
                 num_labels,
                 is_training=True,
                 return_pooled_out=True,
                 use_fp16=False):
        super(ClsModelLayer, self).__init__()
        self.config = config
        self.is_training = is_training
        self.use_fp16 = use_fp16
        self.loss_scaling = args.loss_scaling

        self.bert_layer = BertEncoder(
            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
        self.cls_fc = Linear(
            input_dim=self.config["hidden_size"],
            output_dim=num_labels,
            param_attr=fluid.ParamAttr(
                name="cls_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))

    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
        """
        Run the encoder and return the classification logits.
        """
        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
                                                     sentence_ids, input_mask)
        cls_feats = fluid.layers.dropout(
            x=next_sent_feat,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")
        logits = self.cls_fc(cls_feats)
        return logits
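ClsModelLayer returns raw logits; the loss is attached separately via cls_model.prepare(..., SoftmaxWithCrossEntropy(), ...) in bert_classifier.py. A standalone sketch of the equivalent computation with fluid's fused softmax-with-cross-entropy op, assuming hapi's SoftmaxWithCrossEntropy behaves like it:

import numpy as np
import paddle.fluid as fluid

fluid.enable_dygraph()
# Fake logits for a batch of 4 examples over the 3 MNLI labels.
logits = fluid.dygraph.to_variable(np.random.rand(4, 3).astype("float32"))
label = fluid.dygraph.to_variable(np.array([[0], [2], [1], [1]], dtype="int64"))
loss = fluid.layers.softmax_with_cross_entropy(logits, label)
print(fluid.layers.mean(loss).numpy())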
This diff is collapsed.
#!/bin/bash
BERT_BASE_PATH="./data/pretrained_models/uncased_L-12_H-768_A-12/"
TASK_NAME='MNLI'
DATA_PATH="./data/glue_data/MNLI/"
CKPT_PATH="./data/saved_model/mnli_models"
export CUDA_VISIBLE_DEVICES=7
# start fine-tuning
python3.7 bert_classifier.py \
--use_cuda true \
--do_train true \
--do_test true \
--batch_size 64 \
--init_pretraining_params ${BERT_BASE_PATH}/dygraph_params/ \
--data_dir ${DATA_PATH} \
--vocab_path ${BERT_BASE_PATH}/vocab.txt \
--checkpoints ${CKPT_PATH} \
--save_steps 1000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--validation_steps 100 \
--epoch 3 \
--max_seq_len 128 \
--bert_config_path ${BERT_BASE_PATH}/bert_config.json \
--learning_rate 5e-5 \
--skip_steps 10 \
--shuffle true
@@ -19,6 +19,7 @@ import csv
import glob
import tarfile
import itertools
import leveldb
from functools import partial
import numpy as np
@@ -167,10 +168,14 @@ class SingleSentenceDataset(Dataset):
        assert isinstance(mode,
                          str), "mode of SingleSentenceDataset should be str"
        assert mode in [
            "all_in_memory", "leveldb"
        ], "mode of SingleSentenceDataset should be in [all_in_memory, leveldb], but get" % mode
            "all_in_memory", "leveldb", "streaming"
        ], "mode of SingleSentenceDataset should be in [all_in_memory, leveldb, streaming], but got %s" % mode

        self.delimiter = None
        self.mode = mode
        self.examples = []
        self._db = None
        self._line_processor = None

    def load_all_data_in_memory(self,
                                input_file,
@@ -202,13 +207,87 @@ class SingleSentenceDataset(Dataset):
                tokenizer)
            self.examples.append(input_feature)

    def prepare_leveldb(self,
                        input_file,
                        leveldb_file,
                        label_list,
                        max_seq_length,
                        tokenizer,
                        line_processor=None,
                        delimiter="\t",
                        quotechar=None):
        def default_line_processor(line_id, line):
            assert len(line) == 2
            text_a = line[0]
            label = line[1]
            return BertInputExample(
                str(line_id), text_a=text_a, text_b=None, label=label)

        if line_processor is None:
            line_processor = default_line_processor

        if not os.path.exists(leveldb_file):
            print("putting data %s into leveldb %s" %
                  (input_file, leveldb_file))
            _example_num = 0
            _db = leveldb.LevelDB(leveldb_file, create_if_missing=True)
            with io.open(input_file, "r", encoding="utf8") as f:
                reader = csv.reader(
                    f, delimiter=delimiter, quotechar=quotechar)
                line_id = 0
                for (_line_id, line) in enumerate(reader):
                    # Skip lines the processor rejects (e.g. the header row).
                    if line_processor(str(_line_id), line) is None:
                        continue
                    line_str = delimiter.join(line)
                    _db.Put(
                        str(line_id).encode("utf8"), line_str.encode("utf8"))
                    line_id += 1
                    _example_num += 1
            _db.Put("_example_num_".encode("utf8"),
                    str(_example_num).encode("utf8"))
        else:
            _db = leveldb.LevelDB(leveldb_file, create_if_missing=False)

        self.label_list = label_list
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        self.delimiter = delimiter
        self._db = _db
        self._line_processor = line_processor

    def __getitem__(self, idx):
        return self.examples[idx].input_ids, self.examples[
            idx].pos_ids, self.examples[idx].segment_ids, self.examples[
                idx].label_id
        if self.mode == "all_in_memory":
            return self.examples[idx].input_ids, self.examples[
                idx].pos_ids, self.examples[idx].segment_ids, self.examples[
                    idx].label_id

        if self.mode == "leveldb":
            assert self._db is not None, "you should call prepare_leveldb before you run the dataloader"
            line_str = self._db.Get(str(idx).encode("utf8"))
            line_str = line_str.decode("utf8")
            line = line_str.split(self.delimiter)
            input_example = self._line_processor(str(idx + 1), line)
            input_example = convert_single_example(
                str(idx + 1), input_example, self.label_list,
                self.max_seq_length, self.tokenizer)
            return input_example.input_ids, input_example.pos_ids, input_example.segment_ids, input_example.label_id

    def __len__(self):
        return len(self.examples)
        if self.mode == "all_in_memory":
            return len(self.examples)

        if self.mode == "leveldb":
            assert self._db is not None, "you should call prepare_leveldb before you run the dataloader"
            example_num = self._db.Get("_example_num_".encode("utf8"))
            example_num = example_num.decode("utf8")
            return int(example_num)
class SentencePairDataset(Dataset):
@@ -299,6 +378,7 @@ class BertDataLoader(object):
                 shuffle=False,
                 drop_last=False,
                 mode="all_in_memory",
                 leveldb_file="./leveldb",
                 line_processor=None,
                 delimiter="\t",
                 quotechar=None,
@@ -314,8 +394,10 @@ class BertDataLoader(object):
                input_file, label_list, max_seq_length, tokenizer,
                line_processor, delimiter, quotechar)
        elif mode == "leveldb":
            #TODO add leveldb reader
            pass
            #prepare_leveldb(self, input_file, leveldb_file, label_list, max_seq_length, tokenizer, line_processor=None, delimiter="\t", quotechar=None):
            self.dataset.prepare_leveldb(input_file, leveldb_file, label_list,
                                         max_seq_length, tokenizer,
                                         line_processor, delimiter, quotechar)
        else:
            raise ValueError("mode should be in [all_in_memory, leveldb]")
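A small, standalone sketch of the cache layout that prepare_leveldb creates: each usable TSV line is stored under its 0-based example index, and the reserved key _example_num_ holds the total example count. This only inspects an already-built ./leveldb directory (the default leveldb_file) using the same leveldb module the reader imports; it is for illustration, not part of the commit:

import leveldb

db = leveldb.LevelDB("./leveldb", create_if_missing=False)

# Reserved key written once after the cache is filled.
num_examples = int(db.Get("_example_num_".encode("utf8")).decode("utf8"))
# Each example is the original delimiter-joined TSV line, keyed by its index.
first_row = db.Get("0".encode("utf8")).decode("utf8").split("\t")

print("examples in cache:", num_examples)
print("first cached row (truncated):", first_row[:3])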