Commit c380150e authored by W wuzewu

Add nlp module

Parent 6acb2dd4
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
import re
import six
from typing import Any, List, Tuple
import paddle
import numpy as np
from paddle.fluid.core import PaddleTensor, AnalysisConfig, create_paddle_predictor
from paddlehub.compat import paddle_utils
from paddlehub.compat.task.transformer_emb_task import TransformerEmbeddingTask
from paddlehub.compat.task.config import RunConfig
from paddlehub.compat.task.reader import ClassifyReader
from paddlehub.module.module import runnable, RunModule
from paddlehub.utils.parser import txt_parser
from paddlehub.utils.utils import sys_stdin_encoding
class DataFormatError(Exception):
def __init__(self, *args):
self.args = args
class NLPBaseModule(RunModule):
def get_vocab_path(self):
'''
        Get the path to the vocabulary which was used to pretrain the model.
        Returns:
            self.vocab_path(str): the path to the vocabulary
'''
return self.vocab_path
class NLPPredictionModule(NLPBaseModule):
def _set_config(self):
'''predictor config setting'''
cpu_config = AnalysisConfig(self.pretrained_model_path)
cpu_config.disable_glog_info()
cpu_config.disable_gpu()
self.cpu_predictor = create_paddle_predictor(cpu_config)
        try:
            _places = os.environ['CUDA_VISIBLE_DEVICES']
            int(_places[0])
            use_gpu = True
        except (KeyError, ValueError, IndexError):
            use_gpu = False
if use_gpu:
gpu_config = AnalysisConfig(self.pretrained_model_path)
gpu_config.disable_glog_info()
gpu_config.enable_use_gpu(memory_pool_init_size_mb=500, device_id=0)
self.gpu_predictor = create_paddle_predictor(gpu_config)
    def texts2tensor(self, texts: List[dict]) -> PaddleTensor:
        '''
        Transform the input texts to a PaddleTensor.
        Args:
            texts(list): each element is a dict that must contain a 'processed' key whose value is a list of word ids, such as
                         texts = [{'processed': [23, 89, 43, 906]}]
        Returns:
            tensor(PaddleTensor): tensor with the texts data
        '''
lod = [0]
data = []
for i, text in enumerate(texts):
data += text['processed']
lod.append(len(text['processed']) + lod[i])
tensor = PaddleTensor(np.array(data).astype('int64'))
tensor.name = 'words'
tensor.lod = [lod]
tensor.shape = [lod[-1], 1]
return tensor
    def to_unicode(self, texts: List) -> List:
        '''
        Convert each element of texts from str to unicode in Python 2.7.
        Args:
            texts(list): each element's type is str in Python 2.7
        Returns:
            texts(list): each element's type is unicode in Python 2.7
        '''
if six.PY2:
unicode_texts = []
for text in texts:
if isinstance(text, six.string_types):
                    unicode_texts.append(text.decode(sys_stdin_encoding()))
else:
unicode_texts.append(text)
texts = unicode_texts
return texts
@runnable
def run_cmd(self, argvs: List[Any]):
'''Run as a command'''
self.parser = argparse.ArgumentParser(
description='Run the %s module.' % self.name,
prog='hub run %s' % self.name,
usage='%(prog)s',
add_help=True)
self.arg_input_group = self.parser.add_argument_group(title='Input options', description='Input data. Required')
self.arg_config_group = self.parser.add_argument_group(
title='Config options', description='Run configuration for controlling module behavior, not required.')
self.add_module_config_arg()
self.add_module_input_arg()
args = self.parser.parse_args(argvs)
try:
input_data = self.check_input_data(args)
        except (DataFormatError, RuntimeError):
self.parser.print_help()
return None
results = self.predict(texts=input_data, use_gpu=args.use_gpu, batch_size=args.batch_size)
return results
def add_module_config_arg(self):
'''Add the command config options'''
self.arg_config_group.add_argument(
            '--use_gpu', type=ast.literal_eval, default=False, help='whether to use GPU for prediction')
self.arg_config_group.add_argument('--batch_size', type=int, default=1, help='batch size for prediction')
def add_module_input_arg(self):
'''Add the command input options'''
        self.arg_input_group.add_argument('--input_file', type=str, default=None, help='file containing the input data')
self.arg_input_group.add_argument('--input_text', type=str, default=None, help='text to predict')
def check_input_data(self, args):
input_data = []
if args.input_file:
if not os.path.exists(args.input_file):
raise FileNotFoundError('File %s does not exist.' % args.input_file)
else:
input_data = txt_parser.parse(args.input_file, use_strip=True)
elif args.input_text:
input_data = [args.input_text]
return input_data
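# Usage sketch (hedged): drive run_cmd with an argv-style list. The module name
# 'senta_bilstm' below is illustrative and not part of this changeset; any
# NLPPredictionModule subclass installed through PaddleHub should work the same way.
if __name__ == '__main__':
    import paddlehub as hub

    module = hub.Module(name='senta_bilstm')  # illustrative module name
    results = module.run_cmd(['--input_text', 'the weather is nice today', '--use_gpu', 'False'])
    print(results)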
class TransformerModule(NLPBaseModule):
'''
    Transformer module base class, which can be used by BERT, ERNIE, RoBERTa and so on.
'''
def __init__(self,
name: str = None,
directory: str = None,
module_dir: List = None,
version: str = None,
max_seq_len: int = 128,
**kwargs):
if not directory:
return
super(TransformerModule, self).__init__(
name=name, directory=directory, module_dir=module_dir, version=version, **kwargs)
self.max_seq_len = max_seq_len
def init_pretraining_params(self, exe: paddle.static.Executor, pretraining_params_path: str,
main_program: paddle.static.Program):
        assert os.path.exists(pretraining_params_path), '[{}] cannot be found.'.format(pretraining_params_path)
def existed_params(var):
if not isinstance(var, paddle.fluid.framework.Parameter):
return False
return os.path.exists(os.path.join(pretraining_params_path, var.name))
paddle.io.load(
executor=exe,
model_path=pretraining_params_path,
program=main_program,
var_list=main_program.all_parameters())
def param_prefix(self) -> str:
return '@HUB_%s@' % self.name
def context(
self,
max_seq_len: int = None,
trainable: bool = True,
num_slots: int = 1,
) -> Tuple[dict, dict, paddle.static.Program]:
        '''
        Get the inputs, outputs and program of the pre-trained module.
        Args:
            max_seq_len (int): It will limit the total sequence returned so that it has a maximum length.
            trainable (bool): Whether to fine-tune the pre-trained module parameters or not.
            num_slots(int): The number of data slots fed to the model, selected from the following options:
                - 1(default): one text is fed to the model, e.g. the module is used for a sentence classification task.
                - 2: two texts are fed to the model, e.g. the module is used for a text matching task (point-wise).
                - 3: three texts are fed to the model, e.g. the module is used for a text matching task (pair-wise).
        Returns: inputs, outputs, program.
            inputs is a dict with keys named input_ids, position_ids, segment_ids, input_mask and task_ids.
            outputs is a dict with two keys named pooled_output and sequence_output.
        '''
assert num_slots >= 1 and num_slots <= 3, 'num_slots must be 1, 2, or 3, but the input is %d' % num_slots
if not max_seq_len:
max_seq_len = self.max_seq_len
assert max_seq_len <= self.MAX_SEQ_LEN and max_seq_len >= 1, 'max_seq_len({}) should be in the range of [1, {}]'.format(
max_seq_len, self.MAX_SEQ_LEN)
module_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(module_program, startup_program):
with paddle.fluid.unique_name.guard():
input_ids = paddle.data(name='input_ids', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
position_ids = paddle.data(name='position_ids', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
segment_ids = paddle.data(name='segment_ids', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
input_mask = paddle.data(name='input_mask', shape=[-1, max_seq_len, 1], dtype='float32', lod_level=0)
pooled_output, sequence_output = self.net(input_ids, position_ids, segment_ids, input_mask)
data_list = [(input_ids, position_ids, segment_ids, input_mask)]
output_name_list = [(pooled_output.name, sequence_output.name)]
if num_slots > 1:
input_ids_2 = paddle.data(
name='input_ids_2', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
position_ids_2 = paddle.data(
name='position_ids_2', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
segment_ids_2 = paddle.data(
name='segment_ids_2', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
input_mask_2 = paddle.data(
name='input_mask_2', shape=[-1, max_seq_len, 1], dtype='float32', lod_level=0)
pooled_output_2, sequence_output_2 = self.net(input_ids_2, position_ids_2, segment_ids_2,
input_mask_2)
data_list.append((input_ids_2, position_ids_2, segment_ids_2, input_mask_2))
output_name_list.append((pooled_output_2.name, sequence_output_2.name))
if num_slots > 2:
input_ids_3 = paddle.data(
name='input_ids_3', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
position_ids_3 = paddle.data(
name='position_ids_3', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
segment_ids_3 = paddle.data(
name='segment_ids_3', shape=[-1, max_seq_len, 1], dtype='int64', lod_level=0)
input_mask_3 = paddle.data(
name='input_mask_3', shape=[-1, max_seq_len, 1], dtype='float32', lod_level=0)
pooled_output_3, sequence_output_3 = self.net(input_ids_3, position_ids_3, segment_ids_3,
input_mask_3)
data_list.append((input_ids_3, position_ids_3, segment_ids_3, input_mask_3))
output_name_list.append((pooled_output_3.name, sequence_output_3.name))
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
# To be compatible with the module v1
vars = filter(
lambda var: var not in [
'input_ids', 'position_ids', 'segment_ids', 'input_mask', 'input_ids_2', 'position_ids_2',
'segment_ids_2', 'input_mask_2', 'input_ids_3', 'position_ids_3', 'segment_ids_3', 'input_mask_3'
], list(module_program.global_block().vars.keys()))
paddle_utils.add_vars_prefix(program=module_program, prefix=self.param_prefix(), vars=vars)
self.init_pretraining_params(exe, self.params_path, main_program=module_program)
self.params_layer = {}
for param in module_program.global_block().iter_parameters():
param.trainable = trainable
match = re.match(r'.*layer_(\d+).*', param.name)
if match:
# layer num begins from 0
layer = match.group(1)
self.params_layer[param.name] = int(layer)
inputs = {}
outputs = {}
for index, data in enumerate(data_list):
if index == 0:
inputs['input_ids'] = data[0]
inputs['position_ids'] = data[1]
inputs['segment_ids'] = data[2]
inputs['input_mask'] = data[3]
outputs['pooled_output'] = module_program.global_block().vars[self.param_prefix() +
output_name_list[0][0]]
outputs['sequence_output'] = module_program.global_block().vars[self.param_prefix() +
output_name_list[0][1]]
else:
inputs['input_ids_%s' % (index + 1)] = data[0]
inputs['position_ids_%s' % (index + 1)] = data[1]
inputs['segment_ids_%s' % (index + 1)] = data[2]
inputs['input_mask_%s' % (index + 1)] = data[3]
outputs['pooled_output_%s' % (index + 1)] = module_program.global_block().vars[
self.param_prefix() + output_name_list[index][0]]
outputs['sequence_output_%s' % (index + 1)] = module_program.global_block().vars[
self.param_prefix() + output_name_list[index][1]]
return inputs, outputs, module_program
def get_embedding(self, texts: List[str], max_seq_len: int = 512, use_gpu: bool = False, batch_size: int = 1):
'''
        Get the pooled_output and sequence_output of the input texts.
        Warning: this method depends on the Paddle Inference Library, so it may not work properly in PaddlePaddle <= 1.6.2.
        Args:
            texts (list): each element is a text sample; each sample includes text_a and text_b, where text_b can be omitted,
                for example: [[sample0_text_a, sample0_text_b], [sample1_text_a, sample1_text_b], ...]
            max_seq_len (int): the max sequence length.
            use_gpu (bool): whether to use the GPU, default False.
            batch_size (int): the data batch size, default 1.
        Returns:
            pooled_outputs(list): each element is a numpy array holding the pooled feature of one text sample.
            sequence_outputs(list): each element is a numpy array holding the token-level features of one text sample.
'''
if not hasattr(self,
'emb_job') or self.emb_job['batch_size'] != batch_size or self.emb_job['use_gpu'] != use_gpu:
inputs, outputs, program = self.context(trainable=True, max_seq_len=max_seq_len)
reader = ClassifyReader(
dataset=None,
vocab_path=self.get_vocab_path(),
max_seq_len=max_seq_len,
                sp_model_path=self.get_spm_path(),
                word_dict_path=self.get_word_dict_path())
feed_list = [
inputs['input_ids'].name,
inputs['position_ids'].name,
inputs['segment_ids'].name,
inputs['input_mask'].name,
]
pooled_feature, seq_feature = outputs['pooled_output'], outputs['sequence_output']
config = RunConfig(use_data_parallel=False, use_cuda=use_gpu, batch_size=batch_size)
self.emb_job = {}
self.emb_job['task'] = TransformerEmbeddingTask(
pooled_feature=pooled_feature,
seq_feature=seq_feature,
feed_list=feed_list,
data_reader=reader,
config=config,
)
self.emb_job['batch_size'] = batch_size
self.emb_job['use_gpu'] = use_gpu
return self.emb_job['task'].predict(data=texts, return_result=True, accelerate_mode=True)
def get_spm_path(self) -> str:
if hasattr(self, 'spm_path'):
return self.spm_path
return None
def get_word_dict_path(self) -> str:
if hasattr(self, 'word_dict_path'):
return self.word_dict_path
return None
def get_params_layer(self) -> dict:
if not hasattr(self, 'params_layer'):
raise AttributeError('The module context has not been initialized. '
'Please call context() before using get_params_layer')
return self.params_layer
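# Usage sketch (hedged) for TransformerModule.context() and get_embedding(). It
# assumes a pretrained transformer module such as 'ernie' has been installed via
# PaddleHub; the module name is illustrative, not defined in this changeset.
if __name__ == '__main__':
    import paddlehub as hub

    module = hub.Module(name='ernie')  # illustrative module name
    inputs, outputs, program = module.context(max_seq_len=128, trainable=False)
    print(sorted(inputs.keys()))  # ['input_ids', 'input_mask', 'position_ids', 'segment_ids']
    # Each sample is [text_a] or [text_a, text_b]; per the TransformerEmbeddingTask
    # post-processing, results hold one [pooled, sequence] pair per sample.
    results = module.get_embedding(texts=[['Hello world'], ['text a', 'text b']], use_gpu=False)
    print(len(results))  # 2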
This diff is collapsed.
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''Mask, padding and batching.'''
from typing import List, Union
import numpy as np
def pad_batch_data(insts: List,
pad_idx: int = 0,
max_seq_len: int = 128,
return_pos: bool = False,
return_input_mask: bool = False,
return_max_len: bool = False,
return_num_token: bool = False,
return_seq_lens: bool = False) -> Union[List, np.ndarray]:
'''
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and input mask.
'''
return_list = []
#max_len = max(len(inst) for inst in insts)
max_len = max_seq_len
    # Any token included in the dict can be used for padding, since the padding
    # tokens' loss will be masked out by weights and have no effect on parameter gradients.
inst_data = np.array([list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts])
return_list += [inst_data.astype('int64').reshape([-1, max_len, 1])]
# position data
if return_pos:
inst_pos = np.array([list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) for inst in insts])
return_list += [inst_pos.astype('int64').reshape([-1, max_len, 1])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype('float32')]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
if return_seq_lens:
seq_lens = np.array([len(inst) for inst in insts])
return_list += [seq_lens.astype('int64').reshape([-1, 1])]
return return_list if len(return_list) > 1 else return_list[0]
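# Minimal runnable example of pad_batch_data: two instances padded to length 6,
# returning the padded ids, the attention input mask and the sequence lengths.
if __name__ == '__main__':
    demo_insts = [[5, 6, 7], [8, 9]]
    padded, mask, seq_lens = pad_batch_data(
        demo_insts, pad_idx=0, max_seq_len=6, return_input_mask=True, return_seq_lens=True)
    print(padded.shape)      # (2, 6, 1)
    print(mask[1].ravel())   # [1. 1. 0. 0. 0. 0.]
    print(seq_lens.ravel())  # [3 2]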
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
class RunConfig(object):
''' This class specifies the configurations for PaddleHub to finetune '''
def __init__(self,
log_interval: int = 10,
eval_interval: int = 100,
use_data_parallel: bool = True,
save_ckpt_interval: int = None,
use_cuda: bool = True,
checkpoint_dir: str = None,
num_epoch: int = 1,
batch_size: int = 32):
''' Construct finetune Config '''
self.log_interval = log_interval
self.eval_interval = eval_interval
self.save_ckpt_interval = save_ckpt_interval
self.use_cuda = use_cuda
self.num_epoch = num_epoch
self.batch_size = batch_size
self.use_data_parallel = use_data_parallel
if checkpoint_dir is None:
now = int(time.time())
time_str = time.strftime('%Y%m%d%H%M%S', time.localtime(now))
self.checkpoint_dir = 'ckpt_' + time_str
else:
self.checkpoint_dir = checkpoint_dir
def __repr__(self):
return 'config with num_epoch={}, batch_size={}, use_cuda={}, checkpoint_dir={} '.format(
self.num_epoch, self.batch_size, self.use_cuda, self.checkpoint_dir)
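# Short example of constructing a RunConfig; the values below are illustrative.
if __name__ == '__main__':
    demo_config = RunConfig(use_cuda=False, num_epoch=3, batch_size=16, checkpoint_dir='ckpt_demo')
    print(demo_config)  # config with num_epoch=3, batch_size=16, use_cuda=False, checkpoint_dir=ckpt_demo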
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
from collections import OrderedDict
from typing import Callable
class TaskHooks(object):
    '''TaskHooks manages the handler functions that run on specific task events.'''
def __init__(self):
self._registered_hooks = {
'build_env_start_event': OrderedDict(),
'build_env_end_event': OrderedDict(),
'finetune_start_event': OrderedDict(),
'finetune_end_event': OrderedDict(),
'predict_start_event': OrderedDict(),
'predict_end_event': OrderedDict(),
'eval_start_event': OrderedDict(),
'eval_end_event': OrderedDict(),
'log_interval_event': OrderedDict(),
'save_ckpt_interval_event': OrderedDict(),
'eval_interval_event': OrderedDict(),
'run_step_event': OrderedDict(),
}
self._hook_params_num = {
'build_env_start_event': 1,
'build_env_end_event': 1,
'finetune_start_event': 1,
'finetune_end_event': 2,
'predict_start_event': 1,
'predict_end_event': 2,
'eval_start_event': 1,
'eval_end_event': 2,
'log_interval_event': 2,
'save_ckpt_interval_event': 1,
'eval_interval_event': 1,
'run_step_event': 2,
}
def add(self, hook_type: str, name: str = None, func: Callable = None):
'''
        Add a handler function for a specific event.
        Args:
            hook_type (str): the specific event name
            name (str): the handler function name, default None
            func (func): the handler function, default None
'''
if not func or not callable(func):
            raise TypeError('The hook function is empty or not callable')
        if name is None:
name = 'hook_%s' % id(func)
# check validity
if not isinstance(name, str) or name.strip() == '':
raise TypeError('The hook name must be a non-empty string')
if hook_type not in self._registered_hooks:
raise ValueError('hook_type: %s does not exist' % (hook_type))
if name in self._registered_hooks[hook_type]:
            raise ValueError('name: %s already exists in hook_type: %s, use the modify method to change it' % (name, hook_type))
else:
args_num = len(inspect.getfullargspec(func).args)
if args_num != self._hook_params_num[hook_type]:
                raise ValueError('The hook function for hook_type: %s must take exactly %i parameters' %
                                 (hook_type, self._hook_params_num[hook_type]))
self._registered_hooks[hook_type][name] = func
def delete(self, hook_type: str, name: str):
'''
        Delete the handler function of a specific event.
        Args:
            hook_type (str): the specific event name
            name (str): the handler function name
'''
if self.exist(hook_type, name):
del self._registered_hooks[hook_type][name]
else:
raise ValueError(
'No hook_type: %s exists or name: %s does not exist in hook_type: %s' % (hook_type, name, hook_type))
def modify(self, hook_type: str, name: str, func: Callable):
'''
        Modify the handler function of a specific event.
        Args:
            hook_type (str): the specific event name
            name (str): the handler function name
            func (func): the new handler function
'''
if not (isinstance(name, str) and callable(func)):
raise TypeError('The hook name must be a string, and the hook function must be a function')
if self.exist(hook_type, name):
self._registered_hooks[hook_type][name] = func
else:
raise ValueError(
'No hook_type: %s exists or name: %s does not exist in hook_type: %s' % (hook_type, name, hook_type))
def exist(self, hook_type: str, name: str) -> bool:
'''
        Check whether the handler function of a specific event exists.
        Args:
            hook_type (str): the specific event name
            name (str): the handler function name
Returns:
bool: True or False
'''
if hook_type not in self._registered_hooks \
or name not in self._registered_hooks[hook_type]:
return False
else:
return True
def info(self, show_default: bool = False) -> str:
'''
        Get the hooks information, including the source code.
        Args:
            show_default (bool): whether to show the information of the PaddleHub default hooks, default False
Returns:
str: the formatted string of the hooks information
'''
# formatted output the source code
ret = ''
for hook_type, hooks in self._registered_hooks.items():
already_print_type = False
for name, func in hooks.items():
if name == 'default' and not show_default:
continue
if not already_print_type:
ret += 'hook_type: %s{\n' % hook_type
already_print_type = True
source = inspect.getsource(func)
ret += ' name: %s{\n' % name
for line in source.split('\n'):
ret += ' %s\n' % line
ret += ' }\n'
if already_print_type:
ret += '}\n'
if not ret:
            ret = 'No customized hooks have been defined; you can set show_default=True to see the default hooks information.'
return ret
def __getitem__(self, hook_type: str) -> OrderedDict:
return self._registered_hooks[hook_type]
def __repr__(self) -> str:
return self.info(show_default=False)
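# Runnable sketch of registering and removing a hook. The handler's parameter
# count must match _hook_params_num: 'run_step_event' expects exactly 2.
if __name__ == '__main__':
    hooks = TaskHooks()

    def log_step(task, run_state):
        print('one step finished')

    hooks.add('run_step_event', name='log_step', func=log_step)
    print(hooks.exist('run_step_event', 'log_step'))  # True
    hooks.delete('run_step_event', 'log_step')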
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import namedtuple
from typing import Callable, Generator, Generic, List
import numpy as np
from paddlehub.utils.log import logger
from paddlehub.compat.task import tokenization
from paddlehub.compat.task.batch import pad_batch_data
class InputExample(object):
'''
    Input data structure for BERT/ERNIE. It can represent single-sequence tasks
    such as text classification and sequence labeling, as well as sequence-pair
    tasks such as dialog tasks.
'''
def __init__(self, guid: int, text_a: str, text_b: str = None, label: str = None):
        '''Constructs an InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
                Must be specified only for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
'''
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label
def __str__(self):
if self.text_b is None:
return 'text={}\tlabel={}'.format(self.text_a, self.label)
else:
            return 'text_a={}\ttext_b={}\tlabel={}'.format(self.text_a, self.text_b, self.label)
class BaseReader(object):
def __init__(self, dataset: Generic, random_seed: int = None):
self.dataset = dataset
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
np.random.seed(random_seed)
# generate label map
self.label_map = {}
try:
for index, label in enumerate(self.dataset.get_labels()):
self.label_map[label] = index
logger.info('Dataset label map = {}'.format(self.label_map))
        except (AttributeError, TypeError):
            # some datasets, like SQuAD, have label_list=None
            logger.info('Dataset is None or it does not have any labels, label map = {}'.format(self.label_map))
def get_train_examples(self) -> List:
return self.dataset.get_train_examples()
def get_dev_examples(self) -> List:
return self.dataset.get_dev_examples()
def get_test_examples(self) -> List:
return self.dataset.get_test_examples()
def data_generator(self) -> Generic:
raise NotImplementedError
class BaseNLPReader(BaseReader):
def __init__(self,
vocab_path: str,
dataset: Generic = None,
max_seq_len: int = 512,
do_lower_case: bool = True,
random_seed: int = None,
sp_model_path: str = None,
word_dict_path: str = None,
in_tokens: bool = False):
super(BaseNLPReader, self).__init__(dataset, random_seed)
self.max_seq_len = max_seq_len
if sp_model_path and word_dict_path:
self.tokenizer = tokenization.WSSPTokenizer(vocab_path, sp_model_path, word_dict_path, ws=True, lower=True)
else:
self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path, do_lower_case=do_lower_case)
self.vocab = self.tokenizer.vocab
self.pad_id = self.vocab['[PAD]']
self.cls_id = self.vocab['[CLS]']
self.sep_id = self.vocab['[SEP]']
self.mask_id = self.vocab['[MASK]']
self.in_tokens = in_tokens
self.Record_With_Label_Id = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_id'])
self.Record_Wo_Label_Id = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids'])
def _truncate_seq_pair(self, tokens_a: List, tokens_b: List, max_length: int):
'''Truncates a sequence pair in place to the maximum length.'''
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def _convert_example_to_record(self,
example: InputExample,
max_seq_length: int,
tokenizer: Generic,
phase: str = None) -> namedtuple:
'''Converts a single `Example` into a single `Record`.'''
text_a = tokenization.convert_to_unicode(example.text_a)
tokens_a = tokenizer.tokenize(text_a)
tokens_b = None
if example.text_b is not None:
#if 'text_b' in example._fields:
text_b = tokenization.convert_to_unicode(example.text_b)
tokens_b = tokenizer.tokenize(text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with '- 3'
self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with '- 2'
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
# The convention in BERT/ERNIE is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where 'type_ids' are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the 'sentence vector'. Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
text_type_ids = []
tokens.append('[CLS]')
text_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
text_type_ids.append(0)
tokens.append('[SEP]')
text_type_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
text_type_ids.append(1)
tokens.append('[SEP]')
text_type_ids.append(1)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
if self.label_map:
if example.label not in self.label_map:
                raise KeyError('example.label = {} is not in the label_map'.format(example.label))
label_id = self.label_map[example.label]
else:
label_id = example.label
if phase != 'predict':
record = self.Record_With_Label_Id(
token_ids=token_ids, text_type_ids=text_type_ids, position_ids=position_ids, label_id=label_id)
else:
record = self.Record_Wo_Label_Id(
token_ids=token_ids, text_type_ids=text_type_ids, position_ids=position_ids)
return record
def _pad_batch_records(self, batch_records: List, phase: str):
raise NotImplementedError
def _prepare_batch_data(self, examples: List, batch_size: int, phase: str = None) -> Generator:
'''generate batch records'''
batch_records, max_len = [], 0
for index, example in enumerate(examples):
if phase == 'train':
self.current_example = index
record = self._convert_example_to_record(example, self.max_seq_len, self.tokenizer, phase)
max_len = max(max_len, len(record.token_ids))
if self.in_tokens:
to_append = (len(batch_records) + 1) * max_len <= batch_size
else:
to_append = len(batch_records) < batch_size
if to_append:
batch_records.append(record)
else:
yield self._pad_batch_records(batch_records, phase)
batch_records, max_len = [record], len(record.token_ids)
if batch_records:
yield self._pad_batch_records(batch_records, phase)
def data_generator(self,
batch_size: int = 1,
phase: str = 'train',
shuffle: bool = True,
data: List = None,
return_list: bool = True) -> Callable:
if phase != 'predict' and not self.dataset:
            raise ValueError('The dataset is None, which is not allowed.')
if phase == 'train':
shuffle = True
examples = self.get_train_examples()
self.num_examples['train'] = len(examples)
elif phase == 'val' or phase == 'dev':
shuffle = False
examples = self.get_dev_examples()
self.num_examples['dev'] = len(examples)
elif phase == 'test':
shuffle = False
examples = self.get_test_examples()
self.num_examples['test'] = len(examples)
elif phase == 'predict':
shuffle = False
examples = []
seq_id = 0
for item in data:
# set label in order to run the program
if self.dataset:
label = list(self.label_map.keys())[0]
else:
label = 0
if len(item) == 1:
item_i = InputExample(guid=seq_id, text_a=item[0], label=label)
elif len(item) == 2:
item_i = InputExample(guid=seq_id, text_a=item[0], text_b=item[1], label=label)
else:
                    raise ValueError('Each input sample must contain 1 or 2 texts, but got %d.' % len(item))
examples.append(item_i)
seq_id += 1
else:
raise ValueError('Unknown phase, which should be in [\'train\', \'dev\', \'test\', \'predict\'].')
def wrapper():
if shuffle:
np.random.shuffle(examples)
for batch_data in self._prepare_batch_data(examples, batch_size, phase=phase):
if return_list:
# for DataFeeder
yield [batch_data]
else:
# for DataLoader
yield batch_data
return wrapper
class ClassifyReader(BaseNLPReader):
def _pad_batch_records(self, batch_records: List, phase: str = None) -> List:
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
batch_token_ids,
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id,
return_input_mask=True,
return_seq_lens=True)
padded_text_type_ids = pad_batch_data(batch_text_type_ids, max_seq_len=self.max_seq_len, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(batch_position_ids, max_seq_len=self.max_seq_len, pad_idx=self.pad_id)
return_list = [padded_token_ids, padded_position_ids, padded_text_type_ids, input_mask, batch_seq_lens]
if phase != 'predict':
batch_labels = [record.label_id for record in batch_records]
batch_labels = np.array(batch_labels).astype('int64').reshape([-1, 1])
return_list += [batch_labels]
return return_list
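# Prediction-phase sketch (hedged) for ClassifyReader. 'vocab.txt' is a placeholder
# path for a real BERT/ERNIE-style vocabulary file containing [PAD]/[CLS]/[SEP]/[MASK].
if __name__ == '__main__':
    reader = ClassifyReader(vocab_path='vocab.txt', dataset=None, max_seq_len=128)
    generator = reader.data_generator(
        batch_size=2, phase='predict', data=[['text a'], ['text a', 'text b']])
    for batch in generator():
        # with return_list=True, each yield wraps
        # [token_ids, position_ids, text_type_ids, input_mask, seq_lens]
        print(len(batch[0]))  # 5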
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import time
from typing import Any
import paddle
class RunState(object):
'''
RunState is used to save the result of every running step
Args:
        length (int): the number of fetch results
'''
def __init__(self, length: int):
self.run_time_begin = time.time()
self.run_step = 0
self.run_examples = 0
self.run_results = [0] * length
self.run_time_used = 0
self.run_speed = 0.0
def __add__(self, other):
self.run_step += other.run_step
self.run_examples += other.run_examples
for index in range(len(self.run_results)):
self.run_results[index] += other.run_results[index]
return self
def update(self):
self.run_time_used = time.time() - self.run_time_begin
self.run_speed = self.run_step / self.run_time_used
return self
class RunEnv(object):
'''RunEnv saves the running environment of the train/dev/predict phase, including program, reader, metrics and so on.'''
def __init__(self):
self.current_epoch = 0
self.current_step = 0
self.main_program = None
self.start_program = None
self.main_program_compiled = None
self.py_reader = None
self.generator = None
self.loss = None
self.labels = None
self.metrics = None
self.is_inititalized = False
self.UNG = copy.deepcopy(paddle.fluid.unique_name.generator)
def __setattr__(self, key: str, value: Any):
self.__dict__[key] = value
def __getattr__(self, key: str) -> Any:
return self.__dict__[key]
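# Small runnable example of aggregating RunState objects across steps.
if __name__ == '__main__':
    total = RunState(length=1)
    step = RunState(length=1)
    step.run_step, step.run_examples, step.run_results = 1, 32, [0.5]
    total += step    # __add__ accumulates steps, examples and fetch results
    total.update()   # refreshes run_time_used and run_speed
    print(total.run_step, total.run_examples, total.run_results)  # 1 32 [0.5]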
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''Tokenization classes.'''
import collections
import io
import pickle
import unicodedata
from typing import List, Union
def convert_to_unicode(text: Union[str, bytes]) -> str:
'''Converts `text` to Unicode (if it's not already), assuming utf-8 input.'''
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode('utf-8', 'ignore')
else:
raise ValueError('Unsupported type: {}'.format(type(text)))
def load_vocab(vocab_file: str) -> List:
'''Loads a vocabulary file into a dictionary.'''
vocab = collections.OrderedDict()
with io.open(vocab_file, 'r', encoding='UTF-8') as file:
for num, line in enumerate(file):
items = convert_to_unicode(line.strip()).split('\t')
if len(items) > 2:
break
token = items[0]
index = items[1] if len(items) == 2 else num
token = token.strip()
vocab[token] = int(index)
return vocab
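# Runnable example of the vocabulary format accepted by load_vocab: one token per
# line, with an optional tab-separated index (the line number is used otherwise).
if __name__ == '__main__':
    import os
    import tempfile

    with tempfile.NamedTemporaryFile('w', suffix='.vocab', delete=False, encoding='utf-8') as vocab_file:
        vocab_file.write('[PAD]\n[UNK]\nhello\nworld\n')
        vocab_path = vocab_file.name
    demo_vocab = load_vocab(vocab_path)
    print(demo_vocab['hello'])  # 2 (its line number, since no explicit index is given)
    os.remove(vocab_path)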
def convert_by_vocab(vocab: collections.OrderedDict, items: List[str]) -> List:
'''Converts a sequence of [tokens|ids] using the vocab.'''
output = []
for item in items:
output.append(vocab[item])
return output
def convert_tokens_to_ids(vocab: collections.OrderedDict, tokens: List[str]) -> List:
return convert_by_vocab(vocab, tokens)
def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text: str) -> List:
    '''Runs basic whitespace cleaning and splitting on a piece of text.'''
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
'''Runs end-to-end tokenziation.'''
def __init__(self, vocab_file: str, do_lower_case: bool = True, use_sentence_piece_vocab: bool = False):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.use_sentence_piece_vocab = use_sentence_piece_vocab
self.wordpiece_tokenizer = WordpieceTokenizer(
vocab=self.vocab, use_sentence_piece_vocab=self.use_sentence_piece_vocab)
def tokenize(self, text: str) -> List:
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens: List) -> List:
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids: List) -> List:
return convert_by_vocab(self.inv_vocab, ids)
class WSSPTokenizer(object):
def __init__(self, vocab_file: str, sp_model_dir: str, word_dict: str, ws: bool = True, lower: bool = True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.ws = ws
self.lower = lower
self.dict = pickle.load(open(word_dict, 'rb'))
import sentencepiece as spm
self.sp_model = spm.SentencePieceProcessor()
self.window_size = 5
self.sp_model.Load(sp_model_dir)
def cut(self, chars: List) -> List:
words = []
idx = 0
while idx < len(chars):
matched = False
for i in range(self.window_size, 0, -1):
cand = chars[idx:idx + i]
if cand in self.dict:
words.append(cand)
matched = True
break
if not matched:
i = 1
words.append(chars[idx])
idx += i
return words
def tokenize(self, text: Union[str, bytes], unk_token: str = '[UNK]') -> List:
text = convert_to_unicode(text)
if self.ws:
text = [s for s in self.cut(text) if s != ' ']
else:
text = text.split(' ')
if self.lower:
text = [s.lower() for s in text]
text = ' '.join(text)
tokens = self.sp_model.EncodeAsPieces(text)
in_vocab_tokens = []
for token in tokens:
if token in self.vocab:
in_vocab_tokens.append(token)
else:
in_vocab_tokens.append(unk_token)
return in_vocab_tokens
def convert_tokens_to_ids(self, tokens: List) -> List:
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids: List) -> List:
return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
'''Runs basic tokenization (punctuation splitting, lower casing, etc.).'''
def __init__(self, do_lower_case: bool = True):
'''Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
'''
self.do_lower_case = do_lower_case
def tokenize(self, text: Union[str, bytes]) -> List:
'''Tokenizes a piece of text.'''
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(' '.join(split_tokens))
return output_tokens
def _run_strip_accents(self, text: str) -> str:
'''Strips accents from a piece of text.'''
text = unicodedata.normalize('NFD', text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == 'Mn':
continue
output.append(char)
return ''.join(output)
def _run_split_on_punc(self, text: str) -> List:
'''Splits punctuation on a piece of text.'''
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return [''.join(x) for x in output]
def _tokenize_chinese_chars(self, text: str) -> str:
'''Adds whitespace around any CJK character.'''
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(' ')
output.append(char)
output.append(' ')
else:
output.append(char)
return ''.join(output)
def _is_chinese_char(self, cp: int) -> bool:
'''Checks whether CP is the codepoint of a CJK character.'''
# This defines a 'chinese character' as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
        # like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or (cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text: str) -> str:
'''Performs invalid character removal and whitespace cleanup on text.'''
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(' ')
else:
output.append(char)
return ''.join(output)
class WordpieceTokenizer(object):
'''Runs WordPiece tokenziation.'''
def __init__(self,
vocab: collections.OrderedDict,
unk_token: str = '[UNK]',
max_input_chars_per_word: int = 100,
use_sentence_piece_vocab: bool = False):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
self.use_sentence_piece_vocab = use_sentence_piece_vocab
def tokenize(self, text: Union[str, bytes]) -> List:
'''Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = 'unaffable'
output = ['un', '##aff', '##able']
Args:
text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
'''
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = ''.join(chars[start:end])
if start == 0 and self.use_sentence_piece_vocab:
substr = u'\u2581' + substr
if start > 0 and not self.use_sentence_piece_vocab:
substr = '##' + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
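# Runnable toy example of the greedy longest-match-first algorithm above, using a
# four-entry vocabulary; it reproduces the docstring's 'unaffable' example.
if __name__ == '__main__':
    demo_vocab = collections.OrderedDict([('un', 0), ('##aff', 1), ('##able', 2), ('[UNK]', 3)])
    wordpiece = WordpieceTokenizer(vocab=demo_vocab)
    print(wordpiece.tokenize('unaffable'))  # ['un', '##aff', '##able']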
def _is_whitespace(char: str) -> bool:
'''Checks whether `chars` is a whitespace character.'''
    # \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == ' ' or char == '\t' or char == '\n' or char == '\r':
return True
cat = unicodedata.category(char)
if cat == 'Zs':
return True
return False
def _is_control(char: str) -> bool:
'''Checks whether `chars` is a control character.'''
# These are technically control characters but we count them as whitespace
# characters.
if char == '\t' or char == '\n' or char == '\r':
return False
cat = unicodedata.category(char)
if cat.startswith('C'):
return True
return False
def _is_punctuation(char: str) -> bool:
'''Checks whether `chars` is a punctuation character.'''
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as '^', '$', and '`' are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith('P'):
return True
return False
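# Runnable example of BasicTokenizer: it lower-cases, strips accents, splits
# punctuation and isolates CJK characters (placed here so the helper functions
# above are already defined when the file runs as a script).
if __name__ == '__main__':
    basic = BasicTokenizer(do_lower_case=True)
    print(basic.tokenize('Hello, 世界!'))  # ['hello', ',', '世', '界', '!']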
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Generic, List
import paddle
import numpy as np
from paddlehub.compat.task.config import RunConfig
from paddlehub.compat.task.base_task import BaseTask
from paddlehub.compat.task.task_utils import RunState
class TransformerEmbeddingTask(BaseTask):
def __init__(self,
pooled_feature: paddle.Variable,
seq_feature: paddle.Variable,
feed_list: List[str],
data_reader: Generic,
config: RunConfig = None):
main_program = pooled_feature.block.program
super(TransformerEmbeddingTask, self).__init__(
main_program=main_program, config=config, feed_list=feed_list, data_reader=data_reader, metrics_choices=[])
self.pooled_feature = pooled_feature
self.seq_feature = seq_feature
def _build_net(self) -> List[paddle.Variable]:
        # ClassifyReader will return the sequence length of an input text
self.seq_len = paddle.data(name='seq_len', shape=[1], dtype='int64', lod_level=0)
return [self.pooled_feature, self.seq_feature]
def _postprocessing(self, run_states: List[RunState]) -> List[List[np.ndarray]]:
results = []
for batch_state in run_states:
batch_result = batch_state.run_results
batch_pooled_features = batch_result[0]
batch_seq_features = batch_result[1]
for i in range(len(batch_pooled_features)):
results.append([batch_pooled_features[i], batch_seq_features[i]])
return results
@property
def feed_list(self) -> List[str]:
feed_list = [varname for varname in self._base_feed_list] + [self.seq_len.name]
return feed_list
@property
def fetch_list(self) -> List[str]:
fetch_list = [output.name for output in self.outputs] + [self.seq_len.name]
return fetch_list