Commit 74ec0647 authored by Zeyu Chen

reorg finetune and reader

Parent 810cdd3a
......@@ -49,6 +49,7 @@ if __name__ == '__main__':
# Setup runing config for PaddleHub Finetune API
config = hub.RunConfig(
eval_interval=10,
use_cuda=True,
num_epoch=args.num_epoch,
batch_size=args.batch_size,
......
......@@ -40,6 +40,3 @@ from .finetune.finetune import finetune_and_eval
from .finetune.config import RunConfig
from .finetune.strategy import BERTFinetuneStrategy
from .finetune.strategy import DefaultStrategy
from .reader import BERTTokenizeReader
from .reader.cv_reader import ImageClassificationReader
......@@ -13,3 +13,4 @@
# limitations under the License.
from . import utils
from .utils import get_running_device_info
......@@ -17,6 +17,8 @@ from __future__ import division
from __future__ import print_function
import os
import time
import multiprocessing
import hashlib
import paddle
......@@ -185,6 +187,17 @@ def is_yaml_file(file_path):
return get_file_ext(file_path) == ".yml"
def get_running_device_info(config):
if config.use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
return place, dev_count
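# A minimal usage sketch (assuming a config object exposing `use_cuda`, such
# as the hub.RunConfig constructed in the finetune demo above):
#
#     place, dev_count = get_running_device_info(config)
#     exe = fluid.Executor(place=place)
#
# On CPU, the device count falls back to the CPU_NUM environment variable,
# then to multiprocessing.cpu_count().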
if __name__ == "__main__":
print(is_yaml_file("test.yml"))
print(is_csv_file("test.yml"))
......
......@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from datetime import datetime
import time
from datetime import datetime
from paddlehub.finetune.strategy import DefaultStrategy
from paddlehub.common.logger import logger
......
......@@ -12,6 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
import paddlehub as hub
from paddlehub.common.logger import logger
def evaluate_cls_task(task, data_reader, feed_list, phase="test", config=None):
logger.info("Evaluation on {} dataset start".format(phase))
......@@ -20,7 +28,7 @@ def evaluate_cls_task(task, data_reader, feed_list, phase="test", config=None):
loss = task.variable("loss")
accuracy = task.variable("accuracy")
batch_size = config.batch_size
place, dev_count = _get_running_device_info(config)
place, dev_count = hub.common.get_running_device_info(config)
exe = fluid.Executor(place=place)
with fluid.program_guard(inference_program):
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
......@@ -64,7 +72,7 @@ def evaluate_seq_labeling_task(task,
logger.info("Evaluation on {} dataset start".format(phase))
inference_program = task.inference_program()
batch_size = config.batch_size
place, dev_count = _get_running_device_info(config)
place, dev_count = hub.common.get_running_device_info(config)
exe = fluid.Executor(place=place)
num_labels = len(data_reader.get_labels())
with fluid.program_guard(inference_program):
......
......@@ -18,10 +18,10 @@ from __future__ import print_function
import os
import time
import multiprocessing
import paddle
import paddle.fluid as fluid
import paddlehub as hub
import numpy as np
from paddlehub.common.logger import logger
......@@ -29,18 +29,6 @@ from paddlehub.finetune.strategy import BERTFinetuneStrategy, DefaultStrategy
from paddlehub.finetune.checkpoint import load_checkpoint, save_checkpoint
from paddlehub.finetune.evaluate import evaluate_cls_task, evaluate_seq_labeling_task
from visualdl import LogWriter
import paddlehub as hub
def _get_running_device_info(config):
if config.use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
return place, dev_count
def _do_memory_optimization(task, config):
......@@ -80,7 +68,7 @@ def _finetune_seq_label_task(task,
num_epoch = config.num_epoch
batch_size = config.batch_size
place, dev_count = _get_running_device_info(config)
place, dev_count = hub.common.get_running_device_info(config)
with fluid.program_guard(main_program, startup_program):
exe = fluid.Executor(place=place)
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
......@@ -177,7 +165,7 @@ def _finetune_cls_task(task, data_reader, feed_list, config=None,
log_writter = LogWriter(
os.path.join(config.checkpoint_dir, "vdllog"), sync_cycle=10)
place, dev_count = _get_running_device_info(config)
place, dev_count = hub.common.get_running_device_info(config)
with fluid.program_guard(main_program, startup_program):
exe = fluid.Executor(place=place)
data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
......
......@@ -12,6 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .nlp_reader import BERTTokenizeReader
from .task_reader import ClassifyReader
from .task_reader import SequenceLabelReader
from .nlp_reader import ClassifyReader
from .nlp_reader import SequenceLabelReader
from .cv_reader import ImageClassificationReader
......@@ -12,33 +12,52 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import types
import csv
import json
import numpy as np
from collections import namedtuple
#from paddlehub import dataset
from paddlehub.reader import tokenization
from paddlehub.reader.batching import prepare_batch_data
from .batching import pad_batch_data
class BERTTokenizeReader(object):
"""Base class for data converters for sequence classification data sets."""
class BaseReader(object):
def __init__(self,
dataset,
vocab_path,
max_seq_len,
label_map_config=None,
max_seq_len=512,
do_lower_case=True,
in_tokens=False,
random_seed=None):
self.dataset = dataset
self.max_seq_len = max_seq_len
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self.vocab = self.tokenizer.vocab
self.dataset = dataset
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.in_tokens = in_tokens
np.random.seed(random_seed)
# generate label map
self.label_map = {}
for index, label in enumerate(self.dataset.get_labels()):
self.label_map[label] = index
print("Dataset label map = {}".format(self.label_map))
self.current_example = 0
self.current_epoch = 0
self.num_examples = 0
# if label_map_config:
# with open(label_map_config) as f:
# self.label_map = json.load(f)
# else:
# self.label_map = None
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
def get_train_examples(self):
......@@ -61,41 +80,140 @@ class BERTTokenizeReader(object):
"""Gets the list of labels for this data set."""
return self.dataset.get_labels()
def convert_example(self, index, example, labels, max_seq_len, tokenizer):
"""Converts a single `InputExample` into a single `InputFeatures`."""
feature = convert_single_example(index, example, labels, max_seq_len,
tokenizer)
return feature
def generate_instance(self, feature):
"""
        Generate an instance from the given feature.
Args:
feature: InputFeatures(object). A single set of features of data.
"""
position_ids = list(range(len(feature.input_ids)))
return [
feature.input_ids, feature.segment_ids, position_ids,
feature.label_id
]
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_example, self.current_epoch
def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
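        # Illustrative example: with max_length=8, tokens_a of length 6 and
        # tokens_b of length 3, one token is popped from tokens_a, leaving
        # 5 + 3 = 8 <= 8.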
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def _convert_example_to_record(self, example, max_seq_length, tokenizer):
"""Converts a single `Example` into a single `Record`."""
text_a = tokenization.convert_to_unicode(example.text_a)
tokens_a = tokenizer.tokenize(text_a)
tokens_b = None
if example.text_b is not None:
#if "text_b" in example._fields:
text_b = tokenization.convert_to_unicode(example.text_b)
tokens_b = tokenizer.tokenize(text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
# The convention in BERT/ERNIE is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
text_type_ids = []
tokens.append("[CLS]")
text_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
text_type_ids.append(0)
tokens.append("[SEP]")
text_type_ids.append(0)
def generate_batch_data(self,
batch_data,
total_token_num,
return_input_mask=True,
return_max_len=False,
return_num_token=False):
return prepare_batch_data(
batch_data,
total_token_num,
max_seq_len=self.max_seq_len,
pad_id=self.vocab["[PAD]"],
cls_id=self.vocab["[CLS]"],
sep_id=self.vocab["[SEP]"],
return_input_mask=return_input_mask,
return_max_len=return_max_len,
return_num_token=return_num_token)
if tokens_b:
for token in tokens_b:
tokens.append(token)
text_type_ids.append(1)
tokens.append("[SEP]")
text_type_ids.append(1)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
if self.label_map:
label_id = self.label_map[example.label]
else:
label_id = example.label
# Record = namedtuple(
# 'Record',
# ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])
# qid = None
# if "qid" in example._fields:
# qid = example.qid
# record = Record(
# token_ids=token_ids,
# text_type_ids=text_type_ids,
# position_ids=position_ids,
# label_id=label_id,
# qid=qid)
Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_id'])
record = Record(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids,
label_id=label_id)
return record
def _prepare_batch_data(self, examples, batch_size, phase=None):
"""generate batch records"""
batch_records, max_len = [], 0
for index, example in enumerate(examples):
if phase == "train":
self.current_example = index
record = self._convert_example_to_record(example, self.max_seq_len,
self.tokenizer)
max_len = max(max_len, len(record.token_ids))
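            # Two batching modes: with in_tokens=True, batch_size caps the
            # padded token count ((len(batch_records) + 1) * running max
            # length); otherwise it caps the number of examples per batch.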
if self.in_tokens:
to_append = (len(batch_records) + 1) * max_len <= batch_size
else:
to_append = len(batch_records) < batch_size
if to_append:
batch_records.append(record)
else:
yield self._pad_batch_records(batch_records)
batch_records, max_len = [record], len(record.token_ids)
if batch_records:
yield self._pad_batch_records(batch_records)
# def get_num_examples(self, input_file):
# examples = self._read_tsv(input_file)
# return len(examples)
def get_num_examples(self, phase):
"""Get number of examples for train, dev or test."""
......@@ -106,15 +224,7 @@ class BERTTokenizeReader(object):
return self.num_examples[phase]
def data_generator(self, batch_size, phase='train', shuffle=True):
"""
Generate data for train, dev/val or test.
Args:
batch_size: int. The batch size of generated data.
phase: string. The phase for which to generate data.
            epoch: int. Total epochs to generate data.
shuffle: bool. Whether to shuffle examples.
"""
if phase == 'train':
examples = self.get_train_examples()
self.num_examples['train'] = len(examples)
......@@ -128,169 +238,164 @@ class BERTTokenizeReader(object):
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'test'].")
def instance_reader():
"""
convert a single instance to BERT input feature
"""
def wrapper():
if shuffle:
np.random.shuffle(examples)
for (index, example) in enumerate(examples):
feature = self.convert_example(index, example,
self.get_labels(),
self.max_seq_len, self.tokenizer)
instance = self.generate_instance(feature)
yield instance
def batch_reader(reader, batch_size):
batch, total_token_num, max_len = [], 0, 0
for instance in reader():
token_ids, sent_ids, pos_ids, label = instance[:4]
max_len = max(max_len, len(token_ids))
batch.append(instance)
total_token_num += len(token_ids)
if len(batch) == batch_size:
yield batch, total_token_num
batch, total_token_num, max_len = [], 0, 0
if len(batch) > 0:
yield batch, total_token_num
def wrapper():
for batch_data, total_token_num in batch_reader(
instance_reader, batch_size):
batch_data = self.generate_batch_data(
batch_data,
total_token_num,
return_input_mask=True,
return_max_len=True,
return_num_token=False)
for batch_data in self._prepare_batch_data(
examples, batch_size, phase=phase):
yield [batch_data]
return wrapper
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
def convert_single_example_to_unicode(guid, single_example):
text_a = tokenization.convert_to_unicode(single_example[0])
text_b = tokenization.convert_to_unicode(single_example[1])
label = tokenization.convert_to_unicode(single_example[2])
return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
def convert_single_example(ex_index, example, label_list, max_seq_length,
tokenizer):
"""Converts a single `InputExample` into a single `InputFeatures`."""
label_map = {}
for (i, label) in enumerate(label_list):
label_map[label] = i
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
class ClassifyReader(BaseReader):
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
batch_labels = [record.label_id for record in batch_records]
batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1])
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# if batch_records[0].qid:
# batch_qids = [record.qid for record in batch_records]
# batch_qids = np.array(batch_qids).astype("int64").reshape([-1, 1])
# else:
# batch_qids = np.array([]).astype("int64").reshape([-1, 1])
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# padding
padded_token_ids, input_mask = pad_batch_data(
batch_token_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id,
return_input_mask=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
label_id = label_map[example.label]
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_labels
]
feature = InputFeatures(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id)
return feature
return return_list
def convert_examples_to_features(examples, label_list, max_seq_length,
tokenizer):
"""Convert a set of `InputExample`s to a list of `InputFeatures`."""
class SequenceLabelReader(BaseReader):
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
batch_label_ids = [record.label_ids for record in batch_records]
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
print("Writing example %d of %d" % (ex_index, len(examples)))
# padding
padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
batch_token_ids,
pad_idx=self.pad_id,
max_seq_len=self.max_seq_len,
return_input_mask=True,
return_seq_lens=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
padded_label_ids = pad_batch_data(
batch_label_ids,
max_seq_len=self.max_seq_len,
pad_idx=len(self.label_map) - 1)
feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_label_ids, batch_seq_lens
]
return return_list
def _reseg_token_label(self, tokens, labels, tokenizer):
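        # Align labels with wordpiece tokenization. Illustrative example:
        # "jacksonville" labeled "B-LOC" splits into ["jack", "##son",
        # "##ville"] and is relabeled ["B-LOC", "I-LOC", "I-LOC"].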
assert len(tokens) == len(labels)
ret_tokens = []
ret_labels = []
for token, label in zip(tokens, labels):
sub_token = tokenizer.tokenize(token)
if len(sub_token) == 0:
continue
ret_tokens.extend(sub_token)
ret_labels.append(label)
if len(sub_token) < 2:
continue
sub_label = label
if label.startswith("B-"):
sub_label = "I-" + label[2:]
ret_labels.extend([sub_label] * (len(sub_token) - 1))
assert len(ret_tokens) == len(ret_labels)
return ret_tokens, ret_labels
def _convert_example_to_record(self, example, max_seq_length, tokenizer):
        # tokens and labels are assumed to be joined with the \2 control
        # character, as in ERNIE-style sequence labeling data
        tokens = tokenization.convert_to_unicode(example.text_a).split(u"\2")
        labels = tokenization.convert_to_unicode(example.label).split(u"\2")
tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
if len(tokens) > max_seq_length - 2:
tokens = tokens[0:(max_seq_length - 2)]
labels = labels[0:(max_seq_length - 2)]
tokens = ["[CLS]"] + tokens + ["[SEP]"]
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
text_type_ids = [0] * len(token_ids)
no_entity_id = len(self.label_map) - 1
        label_ids = [no_entity_id] + [
            self.label_map[label] for label in labels
        ] + [no_entity_id]
Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
record = Record(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids,
label_ids=label_ids)
return record
class ExtractEmbeddingReader(BaseReader):
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
# padding
padded_token_ids, input_mask, seq_lens = pad_batch_data(
batch_token_ids,
pad_idx=self.pad_id,
max_seq_len=self.max_seq_len,
return_input_mask=True,
return_seq_lens=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids,
pad_idx=self.pad_id,
max_seq_len=self.max_seq_len)
padded_position_ids = pad_batch_data(
batch_position_ids,
pad_idx=self.pad_id,
max_seq_len=self.max_seq_len)
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
input_mask, seq_lens
]
features.append(feature)
return features
return return_list
if __name__ == '__main__':
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
import json
import numpy as np
from collections import namedtuple
from paddlehub.reader import tokenization
from .batching import pad_batch_data
class BaseReader(object):
def __init__(self,
dataset,
vocab_path,
label_map_config=None,
max_seq_len=512,
do_lower_case=True,
in_tokens=False,
random_seed=None):
self.max_seq_len = max_seq_len
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self.vocab = self.tokenizer.vocab
self.dataset = dataset
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.in_tokens = in_tokens
np.random.seed(random_seed)
# generate label map
self.label_map = {}
for index, label in enumerate(self.dataset.get_labels()):
self.label_map[label] = index
print("Dataset label map = {}".format(self.label_map))
self.current_example = 0
self.current_epoch = 0
self.num_examples = 0
# if label_map_config:
# with open(label_map_config) as f:
# self.label_map = json.load(f)
# else:
# self.label_map = None
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
def get_train_examples(self):
"""Gets a collection of `InputExample`s for the train set."""
return self.dataset.get_train_examples()
def get_dev_examples(self):
"""Gets a collection of `InputExample`s for the dev set."""
return self.dataset.get_dev_examples()
def get_val_examples(self):
"""Gets a collection of `InputExample`s for the val set."""
return self.dataset.get_val_examples()
def get_test_examples(self):
"""Gets a collection of `InputExample`s for prediction."""
return self.dataset.get_test_examples()
def get_labels(self):
"""Gets the list of labels for this data set."""
return self.dataset.get_labels()
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_example, self.current_epoch
def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def _convert_example_to_record(self, example, max_seq_length, tokenizer):
"""Converts a single `Example` into a single `Record`."""
text_a = tokenization.convert_to_unicode(example.text_a)
tokens_a = tokenizer.tokenize(text_a)
tokens_b = None
if example.text_b is not None:
#if "text_b" in example._fields:
text_b = tokenization.convert_to_unicode(example.text_b)
tokens_b = tokenizer.tokenize(text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
# The convention in BERT/ERNIE is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
text_type_ids = []
tokens.append("[CLS]")
text_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
text_type_ids.append(0)
tokens.append("[SEP]")
text_type_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
text_type_ids.append(1)
tokens.append("[SEP]")
text_type_ids.append(1)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
if self.label_map:
label_id = self.label_map[example.label]
else:
label_id = example.label
# Record = namedtuple(
# 'Record',
# ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])
# qid = None
# if "qid" in example._fields:
# qid = example.qid
# record = Record(
# token_ids=token_ids,
# text_type_ids=text_type_ids,
# position_ids=position_ids,
# label_id=label_id,
# qid=qid)
Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_id'])
record = Record(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids,
label_id=label_id)
return record
def _prepare_batch_data(self, examples, batch_size, phase=None):
"""generate batch records"""
batch_records, max_len = [], 0
for index, example in enumerate(examples):
if phase == "train":
self.current_example = index
record = self._convert_example_to_record(example, self.max_seq_len,
self.tokenizer)
max_len = max(max_len, len(record.token_ids))
if self.in_tokens:
to_append = (len(batch_records) + 1) * max_len <= batch_size
else:
to_append = len(batch_records) < batch_size
if to_append:
batch_records.append(record)
else:
yield self._pad_batch_records(batch_records)
batch_records, max_len = [record], len(record.token_ids)
if batch_records:
yield self._pad_batch_records(batch_records)
# def get_num_examples(self, input_file):
# examples = self._read_tsv(input_file)
# return len(examples)
def get_num_examples(self, phase):
"""Get number of examples for train, dev or test."""
if phase not in ['train', 'val', 'dev', 'test']:
raise ValueError(
"Unknown phase, which should be in ['train', 'val'/'dev', 'test']."
)
return self.num_examples[phase]
def data_generator(self, batch_size, phase='train', shuffle=True):
if phase == 'train':
examples = self.get_train_examples()
self.num_examples['train'] = len(examples)
elif phase == 'val' or phase == 'dev':
examples = self.get_dev_examples()
self.num_examples['dev'] = len(examples)
elif phase == 'test':
examples = self.get_test_examples()
self.num_examples['test'] = len(examples)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'test'].")
def wrapper():
if shuffle:
np.random.shuffle(examples)
for batch_data in self._prepare_batch_data(
examples, batch_size, phase=phase):
yield [batch_data]
return wrapper
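# A minimal usage sketch (hedged: `dataset` and the vocab path are
# placeholders, assuming a PaddleHub NLP dataset object that implements
# get_train_examples() and get_labels()):
#
#     reader = ClassifyReader(
#         dataset=dataset, vocab_path="vocab.txt", max_seq_len=128)
#     train_generator = reader.data_generator(batch_size=32, phase="train")
#     for batch in train_generator():
#         pass  # each yield wraps one padded batch: [padded_token_ids, ...]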
class ClassifyReader(BaseReader):
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
batch_labels = [record.label_id for record in batch_records]
batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1])
# if batch_records[0].qid:
# batch_qids = [record.qid for record in batch_records]
# batch_qids = np.array(batch_qids).astype("int64").reshape([-1, 1])
# else:
# batch_qids = np.array([]).astype("int64").reshape([-1, 1])
# padding
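        # pad_batch_data pads each sequence to max_seq_len with pad_id and,
        # with return_input_mask=True, also returns a mask with 1 for real
        # tokens and 0 for padding, so only real tokens are attended to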
padded_token_ids, input_mask = pad_batch_data(
batch_token_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id,
return_input_mask=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_labels
]
return return_list
class SequenceLabelReader(BaseReader):
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
batch_label_ids = [record.label_ids for record in batch_records]
# padding
padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
batch_token_ids,
pad_idx=self.pad_id,
max_seq_len=self.max_seq_len,
return_input_mask=True,
return_seq_lens=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
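        # labels are padded with the last label id, which this reader treats
        # as the no-entity tag (see no_entity_id in _convert_example_to_record)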
padded_label_ids = pad_batch_data(
batch_label_ids,
max_seq_len=self.max_seq_len,
pad_idx=len(self.label_map) - 1)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_label_ids, batch_seq_lens
]
return return_list
def _reseg_token_label(self, tokens, labels, tokenizer):
assert len(tokens) == len(labels)
ret_tokens = []
ret_labels = []
for token, label in zip(tokens, labels):
sub_token = tokenizer.tokenize(token)
if len(sub_token) == 0:
continue
ret_tokens.extend(sub_token)
ret_labels.append(label)
if len(sub_token) < 2:
continue
sub_label = label
if label.startswith("B-"):
sub_label = "I-" + label[2:]
ret_labels.extend([sub_label] * (len(sub_token) - 1))
assert len(ret_tokens) == len(ret_labels)
return ret_tokens, ret_labels
def _convert_example_to_record(self, example, max_seq_length, tokenizer):
        # tokens and labels are assumed to be joined with the \2 control
        # character, as in ERNIE-style sequence labeling data
        tokens = tokenization.convert_to_unicode(example.text_a).split(u"\2")
        labels = tokenization.convert_to_unicode(example.label).split(u"\2")
tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
if len(tokens) > max_seq_length - 2:
tokens = tokens[0:(max_seq_length - 2)]
labels = labels[0:(max_seq_length - 2)]
tokens = ["[CLS]"] + tokens + ["[SEP]"]
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
text_type_ids = [0] * len(token_ids)
no_entity_id = len(self.label_map) - 1
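        # bracket the label sequence so the [CLS] and [SEP] positions carry
        # the no-entity tag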
        label_ids = [no_entity_id] + [
            self.label_map[label] for label in labels
        ] + [no_entity_id]
Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
record = Record(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids,
label_ids=label_ids)
return record
class ExtractEmbeddingReader(BaseReader):
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
# padding
padded_token_ids, input_mask, seq_lens = pad_batch_data(
batch_token_ids,
pad_idx=self.pad_id,
max_seq_len=self.max_seq_len,
return_input_mask=True,
return_seq_lens=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids,
pad_idx=self.pad_id,
max_seq_len=self.max_seq_len)
padded_position_ids = pad_batch_data(
batch_position_ids,
pad_idx=self.pad_id,
max_seq_len=self.max_seq_len)
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
input_mask, seq_lens
]
return return_list
if __name__ == '__main__':
pass