diff --git a/.gitignore b/.gitignore
index c49200d1dd623f2f0fd00f084d8ce67228426b9a..a515d1af7ad686f7a5afef4a425692cfbd3c12ff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 *.pyc
 __pycache__
 pretrain_model
+pretrain
+output*
 output_model
 build
 dist
diff --git a/demo/demo3/run.py b/demo/demo3/run.py
index b2487c10e0b8cd8031113db791d698a45d89ee73..6bc400e9be6e6f886588fe7bbf4d4fe82193405b 100644
--- a/demo/demo3/run.py
+++ b/demo/demo3/run.py
@@ -15,7 +15,6 @@ if __name__ == '__main__':
     config = json.load(open('./pretrain/ernie/ernie_config.json'))
     # ernie = palm.backbone.ERNIE(...)
     ernie = palm.backbone.ERNIE.from_config(config)
-    # pred_ernie = palm.backbone.ERNIE.from_config(config, phase='pred')
 
     # cls_reader2 = palm.reader.cls(train_file_topic, vocab_path, batch_size, max_seqlen)
     # cls_reader3 = palm.reader.cls(train_file_subj, vocab_path, batch_size, max_seqlen)
@@ -30,7 +29,6 @@ if __name__ == '__main__':
     print(cls_reader.outputs_attr)
     # Create a task head, e.g. classification, matching or machine reading comprehension. Each head takes required/optional task-specific arguments. Note that heads are decoupled from readers: any pairing is valid as long as the reader provides the dataset-side fields the head depends on.
     cls_head = palm.head.Classify(4, 1024, 0.1)
-    # cls_pred_head = palm.head.Classify(4, 1024, 0.1, phase='pred')
 
     # Create a trainer from the reader and the task head. A trainer represents one training task: it maintains the training progress and the task's key information, performs validity checks, and controls rules such as model saving and loading for the task.
     trainer = palm.Trainer('senti_cls', cls_reader, cls_head)
@@ -64,7 +62,12 @@ if __name__ == '__main__':
     #     print(trainer.train_one_step(next(iterator_fn())))
     # trainer.train_one_epoch()
-    trainer.train(iterator_fn, print_steps=1, save_steps=5, save_path='outputs/ckpt')
+    # Build the prediction graph so that an inference model can be saved during training.
+    pred_ernie = palm.backbone.ERNIE.from_config(config, phase='pred')
+    cls_pred_head = palm.head.Classify(4, 1024, phase='pred')
+    trainer.build_predict_head(cls_pred_head, pred_ernie)
+
+    trainer.train(iterator_fn, print_steps=1, save_steps=5, save_path='outputs', save_type='ckpt,predict')
 
     # trainer.save()
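With `save_type='ckpt,predict'`, the trainer periodically exports an inference model via `fluid.io.save_inference_model` (see the `Trainer.save` change below). A minimal sketch of loading such an exported model back for inference, assuming the PaddlePaddle 1.x fluid API; the `outputs/pred.step20` directory name and the feed arrays are illustrative placeholders, not part of this patch:

```python
# Loading sketch (assumption: a model was exported to e.g. 'outputs/pred.step20').
import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
# load_inference_model returns the pruned inference program plus the names of
# the feed variables and the fetch targets that were recorded at export time.
infer_prog, feed_names, fetch_targets = fluid.io.load_inference_model(
    'outputs/pred.step20', exe)

# `feed` must supply one ndarray/LoDTensor per feed name, e.g. token ids,
# position ids, segment ids and input mask for an ERNIE classifier:
# results = exe.run(infer_prog,
#                   feed=dict(zip(feed_names, batch_arrays)),
#                   fetch_list=fetch_targets)
```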
diff --git a/interface.py b/interface.py
deleted file mode 100644
index b8c3f78b716944528018a596aca0a73325f8b8d7..0000000000000000000000000000000000000000
--- a/interface.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# -*- coding: UTF-8 -*-
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""v1.1"""
-
-class reader(object):
-    """interface of data manager."""
-
-    def __init__(self, config):
-        assert isinstance(config, dict)
-
-    # @property
-    # def inputs_attr(self):
-    #     """Describes the attributes of the reader's input objects: each object's name, shape and dtype. For scalar objects (str, int, float, ...) the shape is the empty list []; any dimension of variable length is set to -1.
-    #     Return:
-    #         dict. Attribute descriptions of the input objects. For example,
-    #         a text classification task may need the input text and its label id:
-    #         {"text": ([], 'str'),
-    #          "label": ([], 'int')}
-    #         a tagging task may need the token sequence and the corresponding tags:
-    #         {"tokens", ([-1], 'str'),
-    #          "tags", ([-1], 'str')}
-    #         a machine reading comprehension task may need the context, the question, the answer, the answer span boundaries, etc.:
-    #         {"paragraph", ([], 'str'),
-    #          "question", ([], 'str'),
-    #          "start_position", ([], 'int')
-    #     """
-    #     raise NotImplementedError()
-
-    @property
-    def outputs_attr(self):
-        """Describes the attributes of the reader's output objects (the objects it yields): each object's name, shape and dtype. For scalar objects (str, int, float, ...) the shape is the empty list []; any dimension of variable length is set to -1.
-        Note: when training with a mini-batch gradient descent strategy, regular input objects should carry a batch_size dimension (usually -1).
-        Return:
-            dict. Attribute descriptions of the output objects. For example,
-            for text classification and matching tasks, the yielded output may contain the following objects (downstream backbones and tasks access them on demand):
-            {"token_ids": ([-1, max_len], 'int64'),
-             "input_ids": ([-1, max_len], 'int64'),
-             "segment_ids": ([-1, max_len], 'int64'),
-             "input_mask": ([-1, max_len], 'float32'),
-             "label": ([-1], 'int')}
-        """
-        raise NotImplementedError()
-
-    # def parse_line(self):
-    #     """Internally the framework describes each sample with a dict whose keys are those of inputs_attr and whose values conform to the corresponding attribute descriptions.
-    #     This function parses a text line into such a dict-typed sample. The default parse_line reads JSON-formatted dataset files, one JSON-described sample per line.
-    #     Users can override this method to adapt other dataset formats, e.g. csv or even tfrecord files.
-    #     """
-    #     raise NotImplementedError()
-    #
-    # def tokenize(self, line):
-    #     """The framework ships with built-in tokenizers such as the word piece tokenizer; the one in use is selected via the tokenizer hyperparameter. If no built-in tokenizer fits, users can override this method to plug in a custom one.
-    #     Args:
-    #         - line: a unicode string.
-    #     Return:
-    #         a list of tokens
-    #     """
-    #     raise NotImplementedError()
-
-    def iterator(self):
-        """Dataset traversal interface. Note that when traversal reaches the end of the dataset, this interface should reset automatically, i.e. start a new pass from the head of the dataset.
-        Yield:
-            (dict) elements that meet the requirements in output_templete
-        """
-        raise NotImplementedError()
-
-    @property
-    def num_examples(self):
-        """Number of samples in the dataset, i.e. the number of samples the iterator generates per epoch. Note that with strategies that may change the sample count (e.g. sliding windows), this interface should return the actual number of samples at runtime."""
-        raise NotImplementedError()
-
-
-
-class backbone(object):
-    """interface of backbone model."""
-
-    def __init__(self, config, phase):
-        """
-        Args:
-            config: dict. The hyperparameters defined in the multi-task config file plus the pretrained model config file.
-            phase: str. The running phase; currently train and predict are supported.
-        """
-        assert isinstance(config, dict)
-
-    @property
-    def inputs_attr(self):
-        """Describes the attributes of the input objects the backbone needs from the reader: each object's name, shape and dtype. For scalar objects (str, int, float, ...) the shape is the empty list []; any dimension of variable length is set to -1.
-        Return:
-            dict. Attribute descriptions of the input objects. For example,
-            for text classification and matching tasks, the reader objects the bert backbone depends on mainly include:
-            {"token_ids": ([-1, max_len], 'int64'),
-             "input_ids": ([-1, max_len], 'int64'),
-             "segment_ids": ([-1, max_len], 'int64'),
-             "input_mask": ([-1, max_len], 'float32')}"""
-        raise NotImplementedError()
-
-    @property
-    def outputs_attr(self):
-        """Describes the attributes of the backbone's output objects: each object's name, shape and dtype. For scalar objects (str, int, float, ...) the shape is the empty list []; any dimension of variable length is set to -1.
-        Return:
-            dict. Attribute descriptions of the output objects. For example,
-            for text classification and matching tasks, the bert backbone's output may contain:
-            {"word_emb": ([-1, max_seqlen, word_emb_size], 'float32'),
-             "sentence_emb": ([-1, hidden_size], 'float32'),
-             "sim_vec": ([-1, hidden_size], 'float32')}"""
-        raise NotImplementedError()
-
-    def build(self, inputs):
-        """Builds the backbone's computation graph: maps static-graph Variable inputs that match inputs_attr to static-graph Variable outputs that match outputs_attr.
-        Args:
-            inputs: dict. Maps the object names in inputs_attr to computation-graph Variables; inputs contains at least the objects defined in inputs_attr.
-        Return:
-            the graph Variables to output. The output objects are added to the fetch_list, so that their runtime values are obtained at every train/inference step and passed to the postprocess method for user-side handling.
-        """
-        raise NotImplementedError()
-
-
-
-
-class task_paradigm(object):
-
-    def __init__(self, config, phase, backbone_config):
-        """
-            config: dict. The hyperparameters defined in the task instance config plus the multi-task config file.
-            phase: str. The running phase; currently train and predict are supported.
-        """
-
-    @property
-    def inputs_attrs(self):
-        """Describes the attributes of the input objects the task_layer reads from input collections such as reader and backbone. The first-level key is the collection name, e.g. backbone or reader (more flexible inputs will be supported later); the second-level key describes each object in the collection: its name, shape and dtype. For scalar objects (str, int, float, ...) the shape is the empty list []; any dimension of variable length is set to -1.
-        Return:
-            dict. Attribute descriptions of each collection and its input objects."""
-        raise NotImplementedError()
-
-    @property
-    def outputs_attr(self):
-        """Describes the attributes of the task's output objects: each object's name, shape and dtype. The output objects are added to the fetch_list, so that their runtime values are obtained at every train/inference step and passed to the postprocess method for user-side handling.
-        For scalar objects (str, int, float, ...) the shape is the empty list []; any dimension of variable length is set to -1.
-        Return:
-            dict. Attribute descriptions of the output objects. Note that during training an output object named loss is required.
-        """
-
-        raise NotImplementedError()
-
-    @property
-    def epoch_inputs_attrs(self):
-        return {}
-
-    def build(self, inputs, scope_name=""):
-        """Builds the task_layer's computation graph: maps static-graph Variables from the input collections that match inputs_attrs to static-graph Variable outputs that match outputs_attr.
-        Args:
-            inputs: dict. Maps the object names in inputs_attrs to computation-graph Variables; inputs contains at least the objects defined in inputs_attr.
-        Return:
-            the graph Variables to output. The output objects are added to the fetch_list, so that their runtime values are obtained at every train/inference step and passed to the postprocess method for user-side handling.
-
-        """
-        raise NotImplementedError()
-
-    def postprocess(self, rt_outputs):
-        """Post-processes the task_layer's runtime results on the current batch after each train/inference step. Note that besides the outputs of build, rt_outputs automatically contains the computed loss as well."""
-        pass
-
-    def epoch_postprocess(self, post_inputs):
-        pass
-
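The deleted file above was the only prose documentation of the reader contract. A minimal toy reader against that contract may clarify it; the class name, the constant shapes and the sample format below are invented for illustration, assuming only what the docstrings state (outputs_attr declares what iterator() yields, iterator() auto-rewinds, num_examples reports the per-epoch count):

```python
# A toy reader honoring the deleted interface (illustrative sketch).
class ToyClsReader(object):

    def __init__(self, samples):
        # samples: list of (token_ids, label) pairs prepared offline.
        self._samples = samples

    @property
    def outputs_attr(self):
        # -1 marks variable-length dims (here: batch size, sequence length).
        return {"token_ids": ([-1, -1], 'int64'),
                "label": ([-1], 'int64')}

    def iterator(self):
        while True:  # auto-rewind: restart from the head once exhausted
            for token_ids, label in self._samples:
                yield {"token_ids": [token_ids], "label": [label]}

    @property
    def num_examples(self):
        return len(self._samples)
```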
diff --git a/paddlepalm/trainer.py b/paddlepalm/trainer.py
index 13e1a198ea70d7711a4b8777fb38e8ccefbafee1..01be7e14d0ebadfc0d93c07a7d56d251c6e5c2e8 100644
--- a/paddlepalm/trainer.py
+++ b/paddlepalm/trainer.py
@@ -38,7 +38,7 @@ class Trainer(object):
         self._reader = reader
         self._pred_reader = None
         self._task_head = task_head
-        self._pred_head = pred_head
+        self._pred_head = None
 
         # if save_predict_model:
         #     self._save_predict_model = True
@@ -89,20 +89,24 @@ class Trainer(object):
         self._lock = False
         self._build_forward = False
 
-    def build_predict_head(self, pred_backbone, pred_prog=None, pred_init_prog=None):
+    def build_predict_head(self, pred_head, pred_backbone, pred_prog=None, pred_init_prog=None):
+        self._pred_head = pred_head
+        # self._pred_reader = self._reader.clone(phase='pred')
         pred_task_attr_from_reader = helper.encode_inputs(self._pred_head.inputs_attrs['reader'], self.name)
         # pred_task_attr_from_reader = self._pred_head.inputs_attrs['reader']
 
         # _check_io(pred_backbone.inputs_attr, pred_reader.outputs_attr, in_name=bb_name+'_backbone', out_name='reader.pred')
         # _check_io(pred_parad.inputs_attrs['reader'], pred_reader.outputs_attr, in_name='task_paradigm.pred.reader', out_name='reader.pred')
         # _check_io(pred_parad.inputs_attrs['backbone'], pred_backbone.outputs_attr, in_name='task_paradigm.pred.backbone', out_name=bb_name+'_backbone')
-        pred_input_names, pred_shape_and_dtypes, _ = reader_helper.merge_input_attrs(backbone.inputs_attr, pred_task_attr_from_reader, insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
+        pred_input_names, pred_shape_and_dtypes, _ = reader_helper.merge_input_attrs(pred_backbone.inputs_attr, pred_task_attr_from_reader, insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
         pred_input_attrs = [[i, j, k] for i, (j,k) in zip(pred_input_names, pred_shape_and_dtypes)]
 
         if pred_prog is None:
             pred_prog = fluid.Program()
+        self._pred_prog = pred_prog
         if pred_init_prog is None:
             pred_init_prog = fluid.Program()
+        self._pred_init_prog = pred_init_prog
         with fluid.program_guard(pred_prog, pred_init_prog):
             pred_net_inputs = reader_helper.create_net_inputs(pred_input_attrs)
             # pred_bb_output_vars = pred_backbone.build(pred_net_inputs, scope_name='__paddlepalm_')
@@ -121,8 +125,6 @@ class Trainer(object):
 
         self._build_head(pred_task_inputs, phase='pred', scope=scope)
 
-
-
     def build_forward(self, backbone, pred_backbone=None, train_prog=None, train_init_prog=None, pred_prog=None, pred_init_prog=None):
 
         # assert self._backbone is not None, "backbone is required for Trainer to build net forward to run with single task mode"
@@ -154,7 +156,6 @@ class Trainer(object):
             print('joint input shape and dtypes:')
             print(joint_shape_and_dtypes)
 
-
         input_attrs = [[i, j, k] for i, (j,k) in zip(input_names, shape_and_dtypes)]
 
         if train_prog is None:
@@ -172,6 +173,7 @@ class Trainer(object):
             # bb_output_vars = self._backbone.build(net_inputs, scope_name='__paddlepalm_')
             bb_output_vars = backbone.build(net_inputs)
             assert sorted(bb_output_vars.keys()) == sorted(backbone.outputs_attr.keys())
+            # self._bb_output_vars.keys
 
         # fluid.framework.switch_main_program(train_prog)
 
@@ -293,10 +295,14 @@ class Trainer(object):
         pass
 
     def train(self, iterator, save_path=None, save_steps=None, save_type='ckpt', print_steps=5):
+        """
+        Argument:
+            save_type: a comma-separated combination of 'ckpt', 'predict' and 'pretrain', e.g. 'ckpt,predict'.
+        """
         save_type = save_type.split(',')
         if 'predict' in save_type:
-            assert self._pred_head is not None, "Predict head not found! You should call set_predict_head first if you want to save predict model."
+            assert self._pred_head is not None, "Predict head not found! You should call build_predict_head first if you want to save the predict model."
             assert save_path is not None and save_steps is not None, 'save_path and save_steps is required to save model.'
             save_predict = True
             if not os.path.exists(save_path):
@@ -369,11 +375,11 @@ class Trainer(object):
                     # cur_task.save()
 
                 if (save_predict or save_ckpt) and self._cur_train_step % save_steps == 0:
-                    if save_predict_model:
-                        self.save(save_path, suffix='pred.step'+str(global_step))
+                    if save_predict:
+                        self.save(save_path, suffix='pred.step'+str(self._cur_train_step))
                     if save_ckpt:
-                        fluid.io.save_persistables(self.exe, os.path.join(save_path, 'ckpt.step'+str(global_step)), self._train_prog)
-                        print('checkpoint has been saved at '+os.path.join(save_path, 'ckpt.step'+str(global_step)))
+                        fluid.io.save_persistables(self._exe, os.path.join(save_path, 'ckpt.step'+str(self._cur_train_step)), self._train_prog)
+                        print('checkpoint has been saved at '+os.path.join(save_path, 'ckpt.step'+str(self._cur_train_step)))
 
                 # save_path = os.path.join(main_conf['save_path'], 'ckpt',
                 #                          "step_" + str(global_step))
@@ -422,7 +428,7 @@ class Trainer(object):
             dirpath = save_path
             self._pred_input_varname_list = [str(i) for i in self._pred_input_varname_list]
 
-            prog = fluid.default_main_program().clone()
+            prog = self._pred_prog.clone()
             fluid.io.save_inference_model(dirpath, self._pred_input_varname_list, self._pred_fetch_var_list, self._exe, prog)
 
             conf = {}
diff --git a/reader/__init__.py b/reader/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
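The two save paths in the trainer differ: 'ckpt' snapshots every persistable variable of the training program via `fluid.io.save_persistables` (parameters plus optimizer state), while 'predict' exports a pruned inference graph via `fluid.io.save_inference_model`. A small sketch of resuming training from such a checkpoint, assuming the Paddle 1.x fluid API; the step-20 directory name is a placeholder:

```python
# Resume-from-checkpoint sketch (assumes training previously wrote
# outputs/ckpt.step20 via fluid.io.save_persistables).
import os
import paddle.fluid as fluid

def restore_checkpoint(exe, train_prog, save_path, step):
    ckpt_dir = os.path.join(save_path, 'ckpt.step' + str(step))
    # load_persistables restores all persistable variables of train_prog
    # from the checkpoint directory.
    fluid.io.load_persistables(exe, ckpt_dir, main_program=train_prog)

# restore_checkpoint(exe, trainer._train_prog, 'outputs', 20)
```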
diff --git a/reader/cls.py b/reader/cls.py
deleted file mode 100644
index 1ecf6cbf7ffd5c6aea62297a292ca2e014232053..0000000000000000000000000000000000000000
--- a/reader/cls.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# -*- coding: UTF-8 -*-
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddlepalm.interface import reader
-from paddlepalm.reader.utils.reader4ernie import ClassifyReader
-
-class Reader(reader):
-
-    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
-        """
-        Args:
-            phase: train, eval, pred
-        """
-
-        self._is_training = phase == 'train'
-
-        reader = ClassifyReader(config['vocab_path'],
-                                max_seq_len=config['max_seq_len'],
-                                do_lower_case=config.get('do_lower_case', False),
-                                for_cn=config.get('for_cn', False),
-                                random_seed=config.get('seed', None))
-        self._reader = reader
-        self._dev_count = dev_count
-
-        self._batch_size = config['batch_size']
-        self._max_seq_len = config['max_seq_len']
-        self._num_classes = config['n_classes']
-
-        if phase == 'train':
-            self._input_file = config['train_file']
-            self._num_epochs = None  # keep the iterator from terminating
-            self._shuffle = config.get('shuffle', True)
-            # self._shuffle_buffer = config.get('shuffle_buffer', 5000)
-        elif phase == 'eval':
-            self._input_file = config['dev_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-        elif phase == 'pred':
-            self._input_file = config['pred_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-
-        self._phase = phase
-        # self._batch_size =
-        self._print_first_n = config.get('print_first_n', 0)
-
-
-    @property
-    def outputs_attr(self):
-        if self._is_training:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32'],
-                    "label_ids": [[-1,1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64']
-                    }
-        else:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32']
-                    }
-
-
-    def load_data(self):
-        self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
-
-    def iterator(self):
-
-        def list_to_dict(x):
-            names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
-                'label_ids', 'unique_ids']
-            outputs = {n: i for n,i in zip(names, x)}
-            del outputs['unique_ids']
-            if not self._is_training:
-                del outputs['label_ids']
-            return outputs
-
-        for batch in self._data_generator():
-            yield list_to_dict(batch)
-
-    def get_epoch_outputs(self):
-        return {'examples': self._reader.get_examples(self._phase),
-                'features': self._reader.get_features(self._phase)}
-
-    @property
-    def num_examples(self):
-        return self._reader.get_num_examples(phase=self._phase)
-
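The deleted readers were all config-dict driven. A minimal sketch of how the classification reader above was instantiated, based only on the keys its `__init__` reads; the concrete paths and sizes are placeholders:

```python
# Sketch of the deleted config-driven Reader API (paths/sizes are placeholders).
config = {
    'vocab_path': 'pretrain/ernie/vocab.txt',   # placeholder path
    'train_file': 'data/train.tsv',             # placeholder path
    'batch_size': 16,
    'max_seq_len': 128,
    'n_classes': 4,
}
reader = Reader(config, phase='train')
reader.load_data()                # builds the underlying data generator
batch = next(reader.iterator())   # dict keyed as declared in outputs_attr
```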
diff --git a/reader/match.py b/reader/match.py
deleted file mode 100644
index a77b241e5337dfe4a2352b9cd9d1d2d000032a22..0000000000000000000000000000000000000000
--- a/reader/match.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# -*- coding: UTF-8 -*-
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddlepalm.interface import reader
-from paddlepalm.reader.utils.reader4ernie import ClassifyReader
-
-def match(vocab_path, max_seq_len, do_lower_case=True, phase, dev_count=1):
-    config={
-        xxx}
-
-    return Reader(config())
-
-class Reader(reader):
-
-    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
-        """
-        Args:
-            phase: train, eval, pred
-        """
-
-        self._is_training = phase == 'train'
-
-        reader = ClassifyReader(config['vocab_path'],
-                                max_seq_len=config['max_seq_len'],
-                                do_lower_case=config.get('do_lower_case', True),
-                                for_cn=config.get('for_cn', False),
-                                random_seed=config.get('seed', None))
-        self._reader = reader
-        self._dev_count = dev_count
-
-        self._batch_size = config['batch_size']
-        self._max_seq_len = config['max_seq_len']
-        if phase == 'train':
-            self._input_file = config['train_file']
-            self._num_epochs = None  # keep the iterator from terminating
-            self._shuffle = config.get('shuffle', True)
-            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
-        elif phase == 'eval':
-            self._input_file = config['dev_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-        elif phase == 'pred':
-            self._input_file = config['pred_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-
-        self._phase = phase
-        # self._batch_size =
-        self._print_first_n = config.get('print_first_n', 1)
-
-
-    @property
-    def outputs_attr(self):
-        if self._is_training:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32'],
-                    "label_ids": [[-1,1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64']
-                    }
-        else:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32']
-                    }
-
-
-    def load_data(self):
-        self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
-
-    def iterator(self):
-
-        def list_to_dict(x):
-            names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
-                'label_ids', 'unique_ids']
-            outputs = {n: i for n,i in zip(names, x)}
-            del outputs['unique_ids']
-            if not self._is_training:
-                del outputs['label_ids']
-            return outputs
-
-        for batch in self._data_generator():
-            yield list_to_dict(batch)
-
-    @property
-    def num_examples(self):
-        return self._reader.get_num_examples(phase=self._phase)
-
diff --git a/reader/mlm.py b/reader/mlm.py
deleted file mode 100644
index eb09c5e438c1348fc70d6f45d8a3af5e07201eea..0000000000000000000000000000000000000000
--- a/reader/mlm.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# -*- coding: UTF-8 -*-
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddlepalm.interface import reader
-from paddlepalm.reader.utils.reader4ernie import MaskLMReader
-import numpy as np
-
-class Reader(reader):
-
-    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
-        """
-        Args:
-            phase: train, eval, pred
-        """
-
-        self._is_training = phase == 'train'
-
-        reader = MaskLMReader(config['vocab_path'],
-                              max_seq_len=config['max_seq_len'],
-                              do_lower_case=config.get('do_lower_case', False),
-                              for_cn=config.get('for_cn', False),
-                              random_seed=config.get('seed', None))
-        self._reader = reader
-        self._dev_count = dev_count
-
-        self._batch_size = config['batch_size']
-        self._max_seq_len = config['max_seq_len']
-        if phase == 'train':
-            self._input_file = config['train_file']
-            self._num_epochs = None  # keep the iterator from terminating
-            self._shuffle = config.get('shuffle', True)
-            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
-        elif phase == 'eval':
-            self._input_file = config['dev_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-        elif phase == 'pred':
-            self._input_file = config['pred_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-
-        self._phase = phase
-        # self._batch_size =
-        self._print_first_n = config.get('print_first_n', 1)
-
-
-    @property
-    def outputs_attr(self):
-        return {"token_ids": [[-1, -1, 1], 'int64'],
-                "position_ids": [[-1, -1, 1], 'int64'],
-                "segment_ids": [[-1, -1, 1], 'int64'],
-                "input_mask": [[-1, -1, 1], 'float32'],
-                "task_ids": [[-1, -1, 1], 'int64'],
-                "mask_label": [[-1, 1], 'int64'],
-                "mask_pos": [[-1, 1], 'int64'],
-                }
-
-
-    def load_data(self):
-        self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
-
-    def iterator(self):
-
-        def list_to_dict(x):
-            names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask',
-                'task_ids', 'mask_label', 'mask_pos']
-            outputs = {n: i for n,i in zip(names, x)}
-            # outputs['batchsize_x_seqlen'] = [self._batch_size * len(outputs['token_ids'][0]) - 1]
-            return outputs
-
-        for batch in self._data_generator():
-            # print(np.shape(list_to_dict(batch)['token_ids']))
-            # print(list_to_dict(batch)['mask_label'].tolist())
-            yield list_to_dict(batch)
-
-    def get_epoch_outputs(self):
-        return {'examples': self._reader.get_examples(self._phase),
-                'features': self._reader.get_features(self._phase)}
-
-    @property
-    def num_examples(self):
-        return self._reader.get_num_examples(phase=self._phase)
-
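The MaskLM reader above relies on the threshold scheme of `mask()` in reader/utils/mlm_batching.py (deleted below): a token is selected for prediction when its uniform draw is at most 0.15, and the sub-ranges split that 15% into roughly 80% [MASK] replacement, 10% random-id replacement and 10% kept unchanged. A small sketch of that decision rule with the same constants, for clarity (the function name is ours, the thresholds come from the deleted code):

```python
import numpy as np

# Decision rule mirroring mask() in the deleted mlm_batching.py:
#   p > 0.15          -> not selected for prediction      (85% of tokens)
#   0.03 < p <= 0.15  -> replace with [MASK]              (12% = 80% of the 15%)
#   0.015 < p <= 0.03 -> replace with a random token id   (1.5% = 10% of the 15%)
#   p <= 0.015        -> keep the token, still predict it (1.5% = 10% of the 15%)
def masking_action(p):
    if p > 0.15:
        return 'keep, not predicted'
    elif p > 0.03:
        return 'mask'
    elif p > 0.015:
        return 'random-replace'
    else:
        return 'keep, predicted'

print([masking_action(p) for p in np.random.rand(10)])
```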
diff --git a/reader/mrc.py b/reader/mrc.py
deleted file mode 100644
index 03c560c19206aa6587157a0701d948111c35cc8d..0000000000000000000000000000000000000000
--- a/reader/mrc.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# -*- coding: UTF-8 -*-
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddlepalm.interface import reader
-from paddlepalm.reader.utils.reader4ernie import MRCReader
-
-class Reader(reader):
-
-    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
-        """
-        Args:
-            phase: train, eval, pred
-        """
-
-        self._is_training = phase == 'train'
-
-        reader = MRCReader(config['vocab_path'],
-                           max_seq_len=config['max_seq_len'],
-                           do_lower_case=config.get('do_lower_case', False),
-                           tokenizer='FullTokenizer',
-                           for_cn=config.get('for_cn', False),
-                           doc_stride=config['doc_stride'],
-                           max_query_length=config['max_query_len'],
-                           random_seed=config.get('seed', None))
-        self._reader = reader
-        self._dev_count = dev_count
-
-        self._batch_size = config['batch_size']
-        self._max_seq_len = config['max_seq_len']
-        if phase == 'train':
-            self._input_file = config['train_file']
-            # self._num_epochs = config['num_epochs']
-            self._num_epochs = None  # keep the iterator from terminating
-            self._shuffle = config.get('shuffle', True)
-            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
-        if phase == 'eval':
-            self._input_file = config['dev_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-        elif phase == 'pred':
-            self._input_file = config['pred_file']
-            self._num_epochs = 1
-            self._shuffle = False
-            self._batch_size = config.get('pred_batch_size', self._batch_size)
-
-        self._phase = phase
-        # self._batch_size =
-        self._print_first_n = config.get('print_first_n', 1)
-
-        # TODO: without slide window version
-        self._with_slide_window = config.get('with_slide_window', False)
-
-
-    @property
-    def outputs_attr(self):
-        if self._is_training:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32'],
-                    "start_positions": [[-1, 1], 'int64'],
-                    "end_positions": [[-1, 1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64']
-                    }
-        else:
-            return {"token_ids": [[-1, -1, 1], 'int64'],
-                    "position_ids": [[-1, -1, 1], 'int64'],
-                    "segment_ids": [[-1, -1, 1], 'int64'],
-                    "task_ids": [[-1, -1, 1], 'int64'],
-                    "input_mask": [[-1, -1, 1], 'float32'],
-                    "unique_ids": [[-1, 1], 'int64']
-                    }
-
-    @property
-    def epoch_outputs_attr(self):
-        if not self._is_training:
-            return {"examples": None,
-                    "features": None}
-
-    def load_data(self):
-        self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
-
-    def iterator(self):
-
-        def list_to_dict(x):
-            names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
-                'start_positions', 'end_positions', 'unique_ids']
-            outputs = {n: i for n,i in zip(names, x)}
-            if self._is_training:
-                del outputs['unique_ids']
-            else:
-                del outputs['start_positions']
-                del outputs['end_positions']
-            return outputs
-
-        for batch in self._data_generator():
-            yield list_to_dict(batch)
-
-    def get_epoch_outputs(self):
-        return
{'examples': self._reader.get_examples(self._phase), - 'features': self._reader.get_features(self._phase)} - - @property - def num_examples(self): - return self._reader.get_num_examples(phase=self._phase) - diff --git a/reader/utils/__init__.py b/reader/utils/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/reader/utils/batching4bert.py b/reader/utils/batching4bert.py deleted file mode 100644 index daeb25ae9e0fd2dfd4abe021453a71ccd790d562..0000000000000000000000000000000000000000 --- a/reader/utils/batching4bert.py +++ /dev/null @@ -1,184 +0,0 @@ -# -*- coding: UTF-8 -*- -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Mask, padding and batching.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -import numpy as np - - -def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3): - """ - Add mask for batch_tokens, return out, mask_label, mask_pos; - Note: mask_pos responding the batch_tokens after padded; - """ - max_len = max([len(sent) for sent in batch_tokens]) - mask_label = [] - mask_pos = [] - prob_mask = np.random.rand(total_token_num) - # Note: the first token is [CLS], so [low=1] - replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num) - pre_sent_len = 0 - prob_index = 0 - for sent_index, sent in enumerate(batch_tokens): - mask_flag = False - prob_index += pre_sent_len - for token_index, token in enumerate(sent): - prob = prob_mask[prob_index + token_index] - if prob > 0.15: - continue - elif 0.03 < prob <= 0.15: - # mask - if token != SEP and token != CLS: - mask_label.append(sent[token_index]) - sent[token_index] = MASK - mask_flag = True - mask_pos.append(sent_index * max_len + token_index) - elif 0.015 < prob <= 0.03: - # random replace - if token != SEP and token != CLS: - mask_label.append(sent[token_index]) - sent[token_index] = replace_ids[prob_index + token_index] - mask_flag = True - mask_pos.append(sent_index * max_len + token_index) - else: - # keep the original token - if token != SEP and token != CLS: - mask_label.append(sent[token_index]) - mask_pos.append(sent_index * max_len + token_index) - pre_sent_len = len(sent) - # ensure at least mask one word in a sentence - while not mask_flag: - token_index = int(np.random.randint(1, high=len(sent) - 1, size=1)) - if sent[token_index] != SEP and sent[token_index] != CLS: - mask_label.append(sent[token_index]) - sent[token_index] = MASK - mask_flag = True - mask_pos.append(sent_index * max_len + token_index) - mask_label = np.array(mask_label).astype("int64").reshape([-1, 1]) - mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1]) - return batch_tokens, mask_label, mask_pos - - -def prepare_batch_data(insts, - total_token_num, - max_len=None, - voc_size=0, - pad_id=None, - cls_id=None, - sep_id=None, - mask_id=None, - return_input_mask=True, - return_max_len=True, - 
return_num_token=False): - """ - 1. generate Tensor of data - 2. generate Tensor of position - 3. generate self attention mask, [shape: batch_size * max_len * max_len] - """ - batch_src_ids = [inst[0] for inst in insts] - batch_sent_ids = [inst[1] for inst in insts] - batch_pos_ids = [inst[2] for inst in insts] - labels_list = [] - # compatible with mrqa, whose example includes start/end positions, - # or unique id - for i in range(3, len(insts[0]), 1): - labels = [inst[i] for inst in insts] - labels = np.array(labels).astype("int64").reshape([-1, 1]) - labels_list.append(labels) - # First step: do mask without padding - if mask_id >= 0: - out, mask_label, mask_pos = mask( - batch_src_ids, - total_token_num, - vocab_size=voc_size, - CLS=cls_id, - SEP=sep_id, - MASK=mask_id) - else: - out = batch_src_ids - # Second step: padding - src_id, self_input_mask = pad_batch_data( - out, - max_len=max_len, - pad_idx=pad_id, return_input_mask=True) - pos_id = pad_batch_data( - batch_pos_ids, - max_len=max_len, - pad_idx=pad_id, - return_pos=False, - return_input_mask=False) - sent_id = pad_batch_data( - batch_sent_ids, - max_len=max_len, - pad_idx=pad_id, - return_pos=False, - return_input_mask=False) - if mask_id >= 0: - return_list = [ - src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos - ] + labels_list - else: - return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list - return return_list if len(return_list) > 1 else return_list[0] - - -def pad_batch_data(insts, - max_len=None, - pad_idx=0, - return_pos=False, - return_input_mask=False, - return_max_len=False, - return_num_token=False): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and input mask. - """ - return_list = [] - if max_len is None: - max_len = max(len(inst) for inst in insts) - # Any token included in dict can be used to pad, since the paddings' loss - # will be masked out by weights and make no effect on parameter gradients. - inst_data = np.array([ - list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts - ]) - return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])] - # position data - if return_pos: - inst_pos = np.array([ - list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) - for inst in insts - ]) - return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])] - if return_input_mask: - # This is used to avoid attention on paddings. - input_mask_data = np.array([[1] * len(inst) + [0] * - (max_len - len(inst)) for inst in insts]) - input_mask_data = np.expand_dims(input_mask_data, axis=-1) - return_list += [input_mask_data.astype("float32")] - if return_max_len: - return_list += [max_len] - if return_num_token: - num_token = 0 - for inst in insts: - num_token += len(inst) - return_list += [num_token] - return return_list if len(return_list) > 1 else return_list[0] - - -if __name__ == "__main__": - pass - - diff --git a/reader/utils/batching4ernie.py b/reader/utils/batching4ernie.py deleted file mode 100644 index d3d13573c38af3d7d6e7027cbff06969b449b722..0000000000000000000000000000000000000000 --- a/reader/utils/batching4ernie.py +++ /dev/null @@ -1,175 +0,0 @@ -# -*- coding: UTF-8 -*- -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Mask, padding and batching.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from six.moves import xrange - - -def mask(batch_tokens, - seg_labels, - mask_word_tags, - total_token_num, - vocab_size, - CLS=1, - SEP=2, - MASK=3): - """ - Add mask for batch_tokens, return out, mask_label, mask_pos; - Note: mask_pos responding the batch_tokens after padded; - """ - max_len = max([len(sent) for sent in batch_tokens]) - mask_label = [] - mask_pos = [] - prob_mask = np.random.rand(total_token_num) - # Note: the first token is [CLS], so [low=1] - replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num) - pre_sent_len = 0 - prob_index = 0 - for sent_index, sent in enumerate(batch_tokens): - mask_flag = False - mask_word = mask_word_tags[sent_index] - prob_index += pre_sent_len - if mask_word: - beg = 0 - for token_index, token in enumerate(sent): - seg_label = seg_labels[sent_index][token_index] - if seg_label == 1: - continue - if beg == 0: - if seg_label != -1: - beg = token_index - continue - - prob = prob_mask[prob_index + beg] - if prob > 0.15: - pass - else: - for index in xrange(beg, token_index): - prob = prob_mask[prob_index + index] - base_prob = 1.0 - if index == beg: - base_prob = 0.15 - if base_prob * 0.2 < prob <= base_prob: - mask_label.append(sent[index]) - sent[index] = MASK - mask_flag = True - mask_pos.append(sent_index * max_len + index) - elif base_prob * 0.1 < prob <= base_prob * 0.2: - mask_label.append(sent[index]) - sent[index] = replace_ids[prob_index + index] - mask_flag = True - mask_pos.append(sent_index * max_len + index) - else: - mask_label.append(sent[index]) - mask_pos.append(sent_index * max_len + index) - - if seg_label == -1: - beg = 0 - else: - beg = token_index - else: - for token_index, token in enumerate(sent): - prob = prob_mask[prob_index + token_index] - if prob > 0.15: - continue - elif 0.03 < prob <= 0.15: - # mask - if token != SEP and token != CLS: - mask_label.append(sent[token_index]) - sent[token_index] = MASK - mask_flag = True - mask_pos.append(sent_index * max_len + token_index) - elif 0.015 < prob <= 0.03: - # random replace - if token != SEP and token != CLS: - mask_label.append(sent[token_index]) - sent[token_index] = replace_ids[prob_index + - token_index] - mask_flag = True - mask_pos.append(sent_index * max_len + token_index) - else: - # keep the original token - if token != SEP and token != CLS: - mask_label.append(sent[token_index]) - mask_pos.append(sent_index * max_len + token_index) - - pre_sent_len = len(sent) - - mask_label = np.array(mask_label).astype("int64").reshape([-1, 1]) - mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1]) - return batch_tokens, mask_label, mask_pos - - -def pad_batch_data(insts, - pad_idx=0, - return_pos=False, - return_input_mask=False, - return_max_len=False, - return_num_token=False, - return_seq_lens=False): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. 
- """ - return_list = [] - max_len = max(len(inst) for inst in insts) - # Any token included in dict can be used to pad, since the paddings' loss - # will be masked out by weights and make no effect on parameter gradients. - - inst_data = np.array( - [inst + list([pad_idx] * (max_len - len(inst))) for inst in insts]) - return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])] - - # position data - if return_pos: - inst_pos = np.array([ - list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) - for inst in insts - ]) - - return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])] - - if return_input_mask: - # This is used to avoid attention on paddings. - input_mask_data = np.array([[1] * len(inst) + [0] * - (max_len - len(inst)) for inst in insts]) - input_mask_data = np.expand_dims(input_mask_data, axis=-1) - return_list += [input_mask_data.astype("float32")] - - if return_max_len: - return_list += [max_len] - - if return_num_token: - num_token = 0 - for inst in insts: - num_token += len(inst) - return_list += [num_token] - - if return_seq_lens: - seq_lens = np.array([len(inst) for inst in insts]) - return_list += [seq_lens.astype("int64").reshape([-1, 1])] - - return return_list if len(return_list) > 1 else return_list[0] - - -if __name__ == "__main__": - - pass diff --git a/reader/utils/mlm_batching.py b/reader/utils/mlm_batching.py deleted file mode 100644 index 991d02d3b50c9b3c10b0cebd3d12f6762cb91f01..0000000000000000000000000000000000000000 --- a/reader/utils/mlm_batching.py +++ /dev/null @@ -1,177 +0,0 @@ -# -*- coding: UTF-8 -*- -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Mask, padding and batching."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import numpy as np
-
-
-def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
-    """
-    Add mask for batch_tokens, return out, mask_label, mask_pos;
-    Note: mask_pos responding the batch_tokens after padded;
-    """
-    max_len = max([len(sent) for sent in batch_tokens])
-    mask_label = []
-    mask_pos = []
-    prob_mask = np.random.rand(total_token_num)
-    # Note: the first token is [CLS], so [low=1]
-    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
-    pre_sent_len = 0
-    prob_index = 0
-    for sent_index, sent in enumerate(batch_tokens):
-        mask_flag = False
-        prob_index += pre_sent_len
-        for token_index, token in enumerate(sent):
-            prob = prob_mask[prob_index + token_index]
-            if prob > 0.15:
-                continue
-            elif 0.03 < prob <= 0.15:
-                # mask
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    sent[token_index] = MASK
-                    mask_flag = True
-                    mask_pos.append(sent_index * max_len + token_index)
-            elif 0.015 < prob <= 0.03:
-                # random replace
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    sent[token_index] = replace_ids[prob_index + token_index]
-                    mask_flag = True
-                    mask_pos.append(sent_index * max_len + token_index)
-            else:
-                # keep the original token
-                if token != SEP and token != CLS:
-                    mask_label.append(sent[token_index])
-                    mask_pos.append(sent_index * max_len + token_index)
-        pre_sent_len = len(sent)
-        # ensure at least mask one word in a sentence
-        while not mask_flag:
-            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
-            if sent[token_index] != SEP and sent[token_index] != CLS:
-                mask_label.append(sent[token_index])
-                sent[token_index] = MASK
-                mask_flag = True
-                mask_pos.append(sent_index * max_len + token_index)
-    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
-    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
-    return batch_tokens, mask_label, mask_pos
-
-
-def prepare_batch_data(insts,
-                       total_token_num,
-                       max_len=None,
-                       voc_size=0,
-                       pad_id=None,
-                       cls_id=None,
-                       sep_id=None,
-                       mask_id=None,
-                       task_id=0,
-                       return_input_mask=True,
-                       return_max_len=True,
-                       return_num_token=False):
-    """
-    1. generate Tensor of data
-    2. generate Tensor of position
-    3. generate self attention mask, [shape: batch_size * max_len * max_len]
-    """
-    batch_src_ids = [inst[0] for inst in insts]
-    batch_sent_ids = [inst[1] for inst in insts]
-    batch_pos_ids = [inst[2] for inst in insts]
-
-    # Should these two steps be swapped??? Otherwise the word embedding unrolled in the task layer is based on the padded batch, so the word indices no longer match the indices of the unpadded sequences.
- # First step: do mask without padding - out, mask_label, mask_pos = mask( - batch_src_ids, - total_token_num, - vocab_size=voc_size, - CLS=cls_id, - SEP=sep_id, - MASK=mask_id) - # Second step: padding - src_id, self_input_mask = pad_batch_data( - out, - max_len=max_len, - pad_idx=pad_id, return_input_mask=True) - - pos_id = pad_batch_data( - batch_pos_ids, - max_len=max_len, - pad_idx=pad_id, - return_pos=False, - return_input_mask=False) - sent_id = pad_batch_data( - batch_sent_ids, - max_len=max_len, - pad_idx=pad_id, - return_pos=False, - return_input_mask=False) - task_ids = np.ones_like( - src_id, dtype="int64") * task_id - return_list = [ - src_id, pos_id, sent_id, self_input_mask, task_ids, mask_label, mask_pos - ] - return return_list if len(return_list) > 1 else return_list[0] - - -def pad_batch_data(insts, - max_len=None, - pad_idx=0, - return_pos=False, - return_input_mask=False, - return_max_len=False, - return_num_token=False): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and input mask. - """ - return_list = [] - if max_len is None: - max_len = max(len(inst) for inst in insts) - # Any token included in dict can be used to pad, since the paddings' loss - # will be masked out by weights and make no effect on parameter gradients. - inst_data = np.array([ - list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts - ]) - return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])] - # position data - if return_pos: - inst_pos = np.array([ - list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) - for inst in insts - ]) - return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])] - if return_input_mask: - # This is used to avoid attention on paddings. - input_mask_data = np.array([[1] * len(inst) + [0] * - (max_len - len(inst)) for inst in insts]) - input_mask_data = np.expand_dims(input_mask_data, axis=-1) - return_list += [input_mask_data.astype("float32")] - if return_max_len: - return_list += [max_len] - if return_num_token: - num_token = 0 - for inst in insts: - num_token += len(inst) - return_list += [num_token] - return return_list if len(return_list) > 1 else return_list[0] - - -if __name__ == "__main__": - pass - - diff --git a/reader/utils/mrqa_helper.py b/reader/utils/mrqa_helper.py deleted file mode 100644 index e4f8bf53f0d51bde88148b10e40af95dbf0a0e0a..0000000000000000000000000000000000000000 --- a/reader/utils/mrqa_helper.py +++ /dev/null @@ -1,84 +0,0 @@ -# -*- coding: UTF-8 -*- -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -class MRQAExample(object): - """A single training/test example for simple sequence classification. - - For examples without an answer, the start and end position are -1. 
- """ - - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=False): - self.qas_id = qas_id - self.question_text = question_text - self.doc_tokens = doc_tokens - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = "" - s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) - s += ", question_text: %s" % ( - tokenization.printable_text(self.question_text)) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.start_position: - s += ", end_position: %d" % (self.end_position) - if self.start_position: - s += ", is_impossible: %r" % (self.is_impossible) - return s - - -class MRQAFeature(object): - """A single set of features of data.""" - - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - start_position=None, - end_position=None, - is_impossible=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - diff --git a/reader/utils/reader4ernie.py b/reader/utils/reader4ernie.py deleted file mode 100644 index 37b6396dd80a6e158bf06e295894a2094dfd16f6..0000000000000000000000000000000000000000 --- a/reader/utils/reader4ernie.py +++ /dev/null @@ -1,995 +0,0 @@ -# -*- coding: UTF-8 -*- -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import absolute_import - -import sys -import os -import json -import random -import logging -import numpy as np -import six -from io import open -from collections import namedtuple - -import paddlepalm.tokenizer.ernie_tokenizer as tokenization -from paddlepalm.reader.utils.batching4ernie import pad_batch_data -from paddlepalm.reader.utils.mlm_batching import prepare_batch_data - - -log = logging.getLogger(__name__) - -if six.PY3: - import io - sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') - sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') - - -def csv_reader(fd, delimiter='\t'): - def gen(): - for i in fd: - yield i.rstrip('\n').split(delimiter) - return gen() - - -class BaseReader(object): - def __init__(self, - vocab_path, - label_map_config=None, - max_seq_len=512, - do_lower_case=True, - in_tokens=False, - is_inference=False, - random_seed=None, - tokenizer="FullTokenizer", - is_classify=True, - is_regression=False, - for_cn=True, - task_id=0): - self.max_seq_len = max_seq_len - self.tokenizer = tokenization.FullTokenizer( - vocab_file=vocab_path, do_lower_case=do_lower_case) - self.vocab = self.tokenizer.vocab - self.pad_id = self.vocab["[PAD]"] - self.cls_id = self.vocab["[CLS]"] - self.sep_id = self.vocab["[SEP]"] - self.mask_id = self.vocab["[MASK]"] - self.in_tokens = in_tokens - self.is_inference = is_inference - self.for_cn = for_cn - self.task_id = task_id - - np.random.seed(random_seed) - - self.is_classify = is_classify - self.is_regression = is_regression - self.current_example = 0 - self.current_epoch = 0 - self.num_examples = 0 - - self.examples = {} - - if label_map_config: - with open(label_map_config, encoding='utf8') as f: - self.label_map = json.load(f) - else: - self.label_map = None - - def get_train_progress(self): - """Gets progress for training phase.""" - return self.current_example, self.current_epoch - - def _read_tsv(self, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, 'r', encoding='utf8') as f: - reader = csv_reader(f) - headers = next(reader) - Example = namedtuple('Example', headers) - - examples = [] - for line in reader: - example = Example(*line) - examples.append(example) - return examples - - def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. 
- while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - def _convert_example_to_record(self, example, max_seq_length, tokenizer): - """Converts a single `Example` into a single `Record`.""" - - text_a = tokenization.convert_to_unicode(example.text_a) - tokens_a = tokenizer.tokenize(text_a) - tokens_b = None - - has_text_b = False - if isinstance(example, dict): - has_text_b = "text_b" in example.keys() - else: - has_text_b = "text_b" in example._fields - - if has_text_b: - text_b = tokenization.convert_to_unicode(example.text_b) - tokens_b = tokenizer.tokenize(text_b) - - if tokens_b: - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[0:(max_seq_length - 2)] - - # The convention in BERT/ERNIE is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = [] - text_type_ids = [] - tokens.append("[CLS]") - text_type_ids.append(0) - for token in tokens_a: - tokens.append(token) - text_type_ids.append(0) - tokens.append("[SEP]") - text_type_ids.append(0) - - if tokens_b: - for token in tokens_b: - tokens.append(token) - text_type_ids.append(1) - tokens.append("[SEP]") - text_type_ids.append(1) - - token_ids = tokenizer.convert_tokens_to_ids(tokens) - position_ids = list(range(len(token_ids))) - - if self.is_inference: - Record = namedtuple('Record', - ['token_ids', 'text_type_ids', 'position_ids']) - record = Record( - token_ids=token_ids, - text_type_ids=text_type_ids, - position_ids=position_ids) - else: - if self.label_map: - label_id = self.label_map[example.label] - else: - label_id = example.label - - Record = namedtuple('Record', [ - 'token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid' - ]) - - qid = None - if "qid" in example._fields: - qid = example.qid - - record = Record( - token_ids=token_ids, - text_type_ids=text_type_ids, - position_ids=position_ids, - label_id=label_id, - qid=qid) - return record - - def _prepare_batch_data(self, examples, batch_size, phase=None): - """generate batch records""" - batch_records, max_len = [], 0 - if len(examples) < batch_size: - raise Exception('CLS dataset contains too few samples. 
Expect more than '+str(batch_size)) - for index, example in enumerate(examples): - if phase == "train": - self.current_example = index - record = self._convert_example_to_record(example, self.max_seq_len, - self.tokenizer) - max_len = max(max_len, len(record.token_ids)) - if self.in_tokens: - to_append = (len(batch_records) + 1) * max_len <= batch_size - else: - to_append = len(batch_records) < batch_size - if to_append: - batch_records.append(record) - else: - yield self._pad_batch_records(batch_records) - batch_records, max_len = [record], len(record.token_ids) - - if phase == 'pred' and batch_records: - yield self._pad_batch_records(batch_records) - - def get_num_examples(self, input_file=None, phase=None): - if self.examples is not None: - if phase is None: - phase = 'all' - return len(self.examples[phase]) - else: - assert input_file is not None, "Argument input_file should be given or the data_generator should be created when this func is called." - examples = self._read_tsv(input_file) - return len(examples) - - def data_generator(self, - input_file, - batch_size, - epoch, - dev_count=1, - shuffle=True, - phase=None): - examples = self._read_tsv(input_file) - if phase is None: - phase = 'all' - self.examples[phase] = examples - - def wrapper(): - all_dev_batches = [] - if epoch is None: - num_epochs = 99999999 - else: - num_epochs = epoch - for epoch_index in range(num_epochs): - if phase == "train": - self.current_example = 0 - self.current_epoch = epoch_index - if shuffle: - np.random.shuffle(examples) - - for batch_data in self._prepare_batch_data( - examples, batch_size, phase=phase): - if len(all_dev_batches) < dev_count: - all_dev_batches.append(batch_data) - if len(all_dev_batches) == dev_count: - for batch in all_dev_batches: - yield batch - all_dev_batches = [] - def f(): - for i in wrapper(): - yield i - - # def f(): - # try: - # for i in wrapper(): - # yield i - # except Exception as e: - # import traceback - # traceback.print_exc() - - return f - - -class MaskLMReader(BaseReader): - - def _convert_example_to_record(self, example, max_seq_length, tokenizer): - """Converts a single `Example` into a single `Record`.""" - - text_a = tokenization.convert_to_unicode(example.text_a) - tokens_a = tokenizer.tokenize(text_a) - tokens_b = None - - has_text_b = False - if isinstance(example, dict): - has_text_b = "text_b" in example.keys() - else: - has_text_b = "text_b" in example._fields - - if has_text_b: - text_b = tokenization.convert_to_unicode(example.text_b) - tokens_b = tokenizer.tokenize(text_b) - - if tokens_b: - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[0:(max_seq_length - 2)] - - # The convention in BERT/ERNIE is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). 
This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens = []
-        text_type_ids = []
-        tokens.append("[CLS]")
-        text_type_ids.append(0)
-        for token in tokens_a:
-            tokens.append(token)
-            text_type_ids.append(0)
-        tokens.append("[SEP]")
-        text_type_ids.append(0)
-
-        if tokens_b:
-            for token in tokens_b:
-                tokens.append(token)
-                text_type_ids.append(1)
-            tokens.append("[SEP]")
-            text_type_ids.append(1)
-
-        token_ids = tokenizer.convert_tokens_to_ids(tokens)
-        position_ids = list(range(len(token_ids)))
-
-        # Record = namedtuple('Record',
-        #                     ['token_ids', 'text_type_ids', 'position_ids'])
-        # record = Record(
-        #     token_ids=token_ids,
-        #     text_type_ids=text_type_ids,
-        #     position_ids=position_ids)
-
-        return [token_ids, text_type_ids, position_ids]
-
-    def batch_reader(self, examples, batch_size, in_tokens, phase):
-        batch = []
-        total_token_num = 0
-        if len(examples) < batch_size:
-            raise Exception('MaskLM dataset contains too few samples. Expect more than '+str(batch_size))
-        for e in examples:
-            parsed_line = self._convert_example_to_record(e, self.max_seq_len, self.tokenizer)
-            to_append = len(batch) < batch_size
-            if to_append:
-                batch.append(parsed_line)
-                total_token_num += len(parsed_line[0])
-            else:
-                yield batch, total_token_num
-                batch = [parsed_line]
-                total_token_num = len(parsed_line[0])
-
-        if len(batch) > 0 and phase == 'pred':
-            yield batch, total_token_num
-
-    def data_generator(self,
-                       input_file,
-                       batch_size,
-                       epoch,
-                       dev_count=1,
-                       shuffle=True,
-                       phase=None):
-        examples = self._read_tsv(input_file)
-        if phase is None:
-            phase = 'all'
-        self.examples[phase] = examples
-
-        def wrapper():
-            all_dev_batches = []
-            if epoch is None:
-                num_epochs = 99999999
-            else:
-                num_epochs = epoch
-            for epoch_index in range(num_epochs):
-                if phase == "train":
-                    self.current_example = 0
-                    self.current_epoch = epoch_index
-                if shuffle:
-                    np.random.shuffle(examples)
-
-                all_dev_batches = []
-                for batch_data, num_tokens in self.batch_reader(examples,
-                        batch_size, self.in_tokens, phase=phase):
-                    batch_data = prepare_batch_data(
-                        batch_data,
-                        num_tokens,
-                        voc_size=len(self.vocab),
-                        pad_id=self.pad_id,
-                        cls_id=self.cls_id,
-                        sep_id=self.sep_id,
-                        mask_id=self.mask_id,
-                        # max_len=self.max_seq_len,  # NOTE: padding to the maximum length
-                        # would misalign mask_pos with the actual positions, because
-                        # mask_pos is computed against the longest sequence in the batch.
-                        return_input_mask=True,
-                        return_max_len=False,
-                        return_num_token=False)
-
-                    if len(all_dev_batches) < dev_count:
-                        all_dev_batches.append(batch_data)
-                    if len(all_dev_batches) == dev_count:
-                        for batch in all_dev_batches:
-                            yield batch
-                        all_dev_batches = []
-
-        return wrapper
-
-
-class ClassifyReader(BaseReader):
-    def _read_tsv(self, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, 'r', encoding='utf8') as f:
-            reader = csv_reader(f)
-            headers = next(reader)
-            text_indices = [
-                index for index, h in enumerate(headers) if h != "label"
-            ]
-            Example = namedtuple('Example', headers)
-
-            examples = []
-            for line in reader:
-                for index, text in enumerate(line):
-                    if index in text_indices:
-                        if self.for_cn:
-                            line[index] = text.replace(' ', '')
-                        else:
-                            line[index] = text
-                example = Example(*line)
-                examples.append(example)
-            return examples
-
-    def _pad_batch_records(self, batch_records):
-        batch_token_ids = [record.token_ids for record in batch_records]
-        batch_text_type_ids = [record.text_type_ids for record in batch_records]
-        batch_position_ids = [record.position_ids for record in batch_records]
-
-        if not self.is_inference:
-            batch_labels = [record.label_id for record in batch_records]
-            if self.is_classify:
-                batch_labels = np.array(batch_labels).astype("int64").reshape(
-                    [-1, 1])
-            elif self.is_regression:
-                batch_labels = np.array(batch_labels).astype("float32").reshape(
-                    [-1, 1])
-
-            if batch_records[0].qid:
-                batch_qids = [record.qid for record in batch_records]
-                batch_qids = np.array(batch_qids).astype("int64").reshape(
-                    [-1, 1])
-            else:
-                batch_qids = np.array([]).astype("int64").reshape([-1, 1])
-
-        # padding
-        padded_token_ids, input_mask = pad_batch_data(
-            batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
-        padded_text_type_ids = pad_batch_data(
-            batch_text_type_ids, pad_idx=self.pad_id)
-        padded_position_ids = pad_batch_data(
-            batch_position_ids, pad_idx=self.pad_id)
-        padded_task_ids = np.ones_like(
-            padded_token_ids, dtype="int64") * self.task_id
-
-        return_list = [
-            padded_token_ids, padded_text_type_ids, padded_position_ids,
-            padded_task_ids, input_mask
-        ]
-        if not self.is_inference:
-            return_list += [batch_labels, batch_qids]
-
-        return return_list
-
-
-class SequenceLabelReader(BaseReader):
-    def _pad_batch_records(self, batch_records):
-        batch_token_ids = [record.token_ids for record in batch_records]
-        batch_text_type_ids = [record.text_type_ids for record in batch_records]
-        batch_position_ids = [record.position_ids for record in batch_records]
-        batch_label_ids = [record.label_ids for record in batch_records]
-
-        # padding
-        padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
-            batch_token_ids,
-            pad_idx=self.pad_id,
-            return_input_mask=True,
-            return_seq_lens=True)
-        padded_text_type_ids = pad_batch_data(
-            batch_text_type_ids, pad_idx=self.pad_id)
-        padded_position_ids = pad_batch_data(
-            batch_position_ids, pad_idx=self.pad_id)
-        padded_label_ids = pad_batch_data(
-            batch_label_ids, pad_idx=len(self.label_map) - 1)
-        padded_task_ids = np.ones_like(
-            padded_token_ids, dtype="int64") * self.task_id
-
-        return_list = [
-            padded_token_ids, padded_text_type_ids, padded_position_ids,
-            padded_task_ids, input_mask, padded_label_ids, batch_seq_lens
-        ]
-        return return_list
-
-    def _reseg_token_label(self, tokens, labels, tokenizer):
-        assert len(tokens) == len(labels)
-        ret_tokens = []
-        ret_labels = []
-        for token, label in zip(tokens, labels):
-            sub_token = tokenizer.tokenize(token)
-            if len(sub_token) == 0:
-                continue
-            ret_tokens.extend(sub_token)
-            if len(sub_token) == 1:
-                ret_labels.append(label)
-                continue
-
-            if label == "O" or label.startswith("I-"):
-                ret_labels.extend([label] * len(sub_token))
-            elif label.startswith("B-"):
-                i_label = "I-" + label[2:]
-                ret_labels.extend([label] + [i_label] * (len(sub_token) - 1))
-            elif label.startswith("S-"):
-                b_label = "B-" + label[2:]
-                e_label = "E-" + label[2:]
-                i_label = "I-" + label[2:]
-                ret_labels.extend([b_label] + [i_label] * (len(sub_token) - 2) + [e_label])
-            elif label.startswith("E-"):
-                i_label = "I-" + label[2:]
-                ret_labels.extend([i_label] * (len(sub_token) - 1) + [label])
-
-        assert len(ret_tokens) == len(ret_labels)
-        return ret_tokens, ret_labels
-
-    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
-        tokens = tokenization.convert_to_unicode(example.text_a).split(u"\2")
-        labels = 
tokenization.convert_to_unicode(example.label).split(u"") - tokens, labels = self._reseg_token_label(tokens, labels, tokenizer) - - if len(tokens) > max_seq_length - 2: - tokens = tokens[0:(max_seq_length - 2)] - labels = labels[0:(max_seq_length - 2)] - - tokens = ["[CLS]"] + tokens + ["[SEP]"] - token_ids = tokenizer.convert_tokens_to_ids(tokens) - position_ids = list(range(len(token_ids))) - text_type_ids = [0] * len(token_ids) - no_entity_id = len(self.label_map) - 1 - label_ids = [no_entity_id] + [ - self.label_map[label] for label in labels - ] + [no_entity_id] - - Record = namedtuple( - 'Record', - ['token_ids', 'text_type_ids', 'position_ids', 'label_ids']) - record = Record( - token_ids=token_ids, - text_type_ids=text_type_ids, - position_ids=position_ids, - label_ids=label_ids) - return record - - -class ExtractEmbeddingReader(BaseReader): - def _pad_batch_records(self, batch_records): - batch_token_ids = [record.token_ids for record in batch_records] - batch_text_type_ids = [record.text_type_ids for record in batch_records] - batch_position_ids = [record.position_ids for record in batch_records] - - # padding - padded_token_ids, input_mask, seq_lens = pad_batch_data( - batch_token_ids, - pad_idx=self.pad_id, - return_input_mask=True, - return_seq_lens=True) - padded_text_type_ids = pad_batch_data( - batch_text_type_ids, pad_idx=self.pad_id) - padded_position_ids = pad_batch_data( - batch_position_ids, pad_idx=self.pad_id) - padded_task_ids = np.ones_like( - padded_token_ids, dtype="int64") * self.task_id - - return_list = [ - padded_token_ids, padded_text_type_ids, padded_position_ids, - padded_task_ids, input_mask, seq_lens - ] - - return return_list - - -class MRCReader(BaseReader): - def __init__(self, - vocab_path, - label_map_config=None, - max_seq_len=512, - do_lower_case=True, - in_tokens=False, - random_seed=None, - tokenizer="FullTokenizer", - is_classify=True, - is_regression=False, - for_cn=True, - task_id=0, - doc_stride=128, - max_query_length=64, - remove_noanswer=True): - self.max_seq_len = max_seq_len - self.tokenizer = tokenization.FullTokenizer( - vocab_file=vocab_path, do_lower_case=do_lower_case) - self.vocab = self.tokenizer.vocab - self.pad_id = self.vocab["[PAD]"] - self.cls_id = self.vocab["[CLS]"] - self.sep_id = self.vocab["[SEP]"] - self.in_tokens = in_tokens - self.for_cn = for_cn - self.task_id = task_id - self.doc_stride = doc_stride - self.max_query_length = max_query_length - self.examples = {} - self.features = {} - self.remove_noanswer = remove_noanswer - - if random_seed is not None: - np.random.seed(random_seed) - - self.current_example = 0 - self.current_epoch = 0 - self.num_examples = 0 - - self.Example = namedtuple('Example', - ['qas_id', 'question_text', 'doc_tokens', 'orig_answer_text', - 'start_position', 'end_position']) - self.Feature = namedtuple("Feature", ["unique_id", "example_index", "doc_span_index", - "tokens", "token_to_orig_map", "token_is_max_context", - "token_ids", "position_ids", "text_type_ids", - "start_position", "end_position"]) - self.DocSpan = namedtuple("DocSpan", ["start", "length"]) - - def _read_json(self, input_file, is_training): - examples = [] - with open(input_file, "r", encoding='utf8') as f: - input_data = json.load(f)["data"] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_pos = None - end_pos = None - orig_answer_text = None - - if is_training: - if 
len(qa["answers"]) != 1: - raise ValueError( - "For training, each question should have exactly 1 answer." - ) - - answer = qa["answers"][0] - orig_answer_text = answer["text"] - answer_offset = answer["answer_start"] - answer_length = len(orig_answer_text) - doc_tokens = [ - paragraph_text[:answer_offset], - paragraph_text[answer_offset:answer_offset + - answer_length], - paragraph_text[answer_offset + answer_length:] - ] - - start_pos = 1 - end_pos = 1 - - actual_text = " ".join(doc_tokens[start_pos:(end_pos - + 1)]) - if actual_text.find(orig_answer_text) == -1: - log.info("Could not find answer: '%s' vs. '%s'", - actual_text, orig_answer_text) - continue - else: - doc_tokens = tokenization.tokenize_chinese_chars( - paragraph_text) - - example = self.Example( - qas_id=qas_id, - question_text=question_text, - doc_tokens=doc_tokens, - orig_answer_text=orig_answer_text, - start_position=start_pos, - end_position=end_pos) - examples.append(example) - - return examples - - def _improve_answer_span(self, doc_tokens, input_start, input_end, - tokenizer, orig_answer_text): - tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - - def _check_is_max_context(self, doc_spans, cur_span_index, position): - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, - num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - def _convert_example_to_feature(self, examples, max_seq_length, tokenizer, - is_training, remove_noanswer=True): - features = [] - unique_id = 1000000000 - - print('converting examples to features...') - for (example_index, example) in enumerate(examples): - if example_index % 1000 == 0: - print('processing {}th example...'.format(example_index)) - query_tokens = tokenizer.tokenize(example.question_text) - if len(query_tokens) > self.max_query_length: - query_tokens = query_tokens[0:self.max_query_length] - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - tok_start_position = None - tok_end_position = None - if is_training: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + - 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - (tok_start_position, - tok_end_position) = self._improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, - tokenizer, example.orig_answer_text) - - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > 
max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(self.DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, self.doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - text_type_ids = [] - tokens.append("[CLS]") - text_type_ids.append(0) - for token in query_tokens: - tokens.append(token) - text_type_ids.append(0) - tokens.append("[SEP]") - text_type_ids.append(0) - - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[ - split_token_index] - - is_max_context = self._check_is_max_context( - doc_spans, doc_span_index, split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - text_type_ids.append(1) - tokens.append("[SEP]") - text_type_ids.append(1) - - token_ids = tokenizer.convert_tokens_to_ids(tokens) - position_ids = list(range(len(token_ids))) - start_position = None - end_position = None - if is_training: - doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - if remove_noanswer: - continue - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - - feature = self.Feature( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - token_ids=token_ids, - position_ids=position_ids, - text_type_ids=text_type_ids, - start_position=start_position, - end_position=end_position) - features.append(feature) - - unique_id += 1 - - return features - - def _prepare_batch_data(self, records, batch_size, phase=None): - """generate batch records""" - batch_records, max_len = [], 0 - - if len(records) < batch_size: - raise Exception('mrc dataset contains too few samples. 
Expect more than '+str(batch_size)) - - for index, record in enumerate(records): - if phase == "train": - self.current_example = index - max_len = max(max_len, len(record.token_ids)) - if self.in_tokens: - to_append = (len(batch_records) + 1) * max_len <= batch_size - else: - to_append = len(batch_records) < batch_size - if to_append: - batch_records.append(record) - else: - yield self._pad_batch_records(batch_records, phase == "train") - batch_records, max_len = [record], len(record.token_ids) - - if phase == 'pred' and batch_records: - yield self._pad_batch_records(batch_records, phase == "train") - - def _pad_batch_records(self, batch_records, is_training): - batch_token_ids = [record.token_ids for record in batch_records] - batch_text_type_ids = [record.text_type_ids for record in batch_records] - batch_position_ids = [record.position_ids for record in batch_records] - if is_training: - batch_start_position = [ - record.start_position for record in batch_records - ] - batch_end_position = [ - record.end_position for record in batch_records - ] - batch_start_position = np.array(batch_start_position).astype( - "int64").reshape([-1, 1]) - batch_end_position = np.array(batch_end_position).astype( - "int64").reshape([-1, 1]) - - else: - batch_size = len(batch_token_ids) - batch_start_position = np.zeros( - shape=[batch_size, 1], dtype="int64") - batch_end_position = np.zeros(shape=[batch_size, 1], dtype="int64") - - batch_unique_ids = [record.unique_id for record in batch_records] - batch_unique_ids = np.array(batch_unique_ids).astype("int64").reshape( - [-1, 1]) - - # padding - padded_token_ids, input_mask = pad_batch_data( - batch_token_ids, pad_idx=self.pad_id, return_input_mask=True) - padded_text_type_ids = pad_batch_data( - batch_text_type_ids, pad_idx=self.pad_id) - padded_position_ids = pad_batch_data( - batch_position_ids, pad_idx=self.pad_id) - padded_task_ids = np.ones_like( - padded_token_ids, dtype="int64") * self.task_id - - return_list = [ - padded_token_ids, padded_text_type_ids, padded_position_ids, - padded_task_ids, input_mask, batch_start_position, - batch_end_position, batch_unique_ids - ] - - return return_list - - def get_num_examples(self, phase): - return len(self.features[phase]) - - def get_features(self, phase): - return self.features[phase] - - def get_examples(self, phase): - return self.examples[phase] - - def data_generator(self, - input_file, - batch_size, - epoch, - dev_count=1, - shuffle=True, - phase=None): - - examples = self.examples.get(phase, None) - features = self.features.get(phase, None) - if not examples: - examples = self._read_json(input_file, phase == "train") - features = self._convert_example_to_feature( - examples, self.max_seq_len, self.tokenizer, phase == "train", remove_noanswer=self.remove_noanswer) - self.examples[phase] = examples - self.features[phase] = features - - def wrapper(): - all_dev_batches = [] - if epoch is None: - num_epochs = 99999999 - else: - num_epochs = epoch - for epoch_index in range(num_epochs): - if phase == "train": - self.current_example = 0 - self.current_epoch = epoch_index - if phase == "train" and shuffle: - np.random.shuffle(features) - - for batch_data in self._prepare_batch_data( - features, batch_size, phase=phase): - if len(all_dev_batches) < dev_count: - all_dev_batches.append(batch_data) - if len(all_dev_batches) == dev_count: - for batch in all_dev_batches: - yield batch - all_dev_batches = [] - - return wrapper - - -if __name__ == '__main__': - pass diff --git a/tasktype/__init__.py 
b/tasktype/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/tasktype/cls.py b/tasktype/cls.py
deleted file mode 100644
index 6cbacf79dd12622c4d952c29040c0c42768e2d11..0000000000000000000000000000000000000000
--- a/tasktype/cls.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# -*- coding: UTF-8 -*-
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-from paddle.fluid import layers
-from paddlepalm.interface import task_paradigm
-import numpy as np
-import os
-
-class TaskParadigm(task_paradigm):
-    '''
-    classification
-    '''
-    def __init__(self, config, phase, backbone_config=None):
-        self._is_training = phase == 'train'
-        self._hidden_size = backbone_config['hidden_size']
-        self.num_classes = config['n_classes']
-
-        if 'initializer_range' in config:
-            self._param_initializer = config['initializer_range']
-        else:
-            self._param_initializer = fluid.initializer.TruncatedNormal(
-                scale=backbone_config.get('initializer_range', 0.02))
-        if 'dropout_prob' in config:
-            self._dropout_prob = config['dropout_prob']
-        else:
-            self._dropout_prob = backbone_config.get('hidden_dropout_prob', 0.0)
-        self._pred_output_path = config.get('pred_output_path', None)
-        self._preds = []
-
-    @property
-    def inputs_attrs(self):
-        if self._is_training:
-            reader = {"label_ids": [[-1, 1], 'int64']}
-        else:
-            reader = {}
-        bb = {"sentence_embedding": [[-1, self._hidden_size], 'float32']}
-        return {'reader': reader, 'backbone': bb}
-
-    @property
-    def outputs_attrs(self):
-        if self._is_training:
-            return {'loss': [[1], 'float32']}
-        else:
-            return {'logits': [[-1, self.num_classes], 'float32']}
-
-    def build(self, inputs, scope_name=''):
-        sent_emb = inputs['backbone']['sentence_embedding']
-        if self._is_training:
-            label_ids = inputs['reader']['label_ids']
-            sent_emb = fluid.layers.dropout(
-                x=sent_emb,
-                dropout_prob=self._dropout_prob,
-                dropout_implementation="upscale_in_train")
-
-        logits = fluid.layers.fc(
-            input=sent_emb,
-            size=self.num_classes,
-            param_attr=fluid.ParamAttr(
-                name=scope_name+"cls_out_w",
-                initializer=self._param_initializer),
-            bias_attr=fluid.ParamAttr(
-                name=scope_name+"cls_out_b", initializer=fluid.initializer.Constant(0.)))
-
-        if self._is_training:
-            loss = fluid.layers.softmax_with_cross_entropy(
-                logits=logits, label=label_ids)
-            loss = layers.mean(loss)
-            return {"loss": loss}
-        else:
-            return {"logits":logits}
-
-    def postprocess(self, rt_outputs):
-        if not self._is_training:
-            logits = rt_outputs['logits']
-            preds = np.argmax(logits, -1)
-            self._preds.extend(preds.tolist())
-
-    def epoch_postprocess(self, post_inputs):
-        # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs
-        if not self._is_training:
-            if self._pred_output_path is None:
-                raise ValueError('argument pred_output_path not found in config. 
Please add it into config dict/file.') - with open(os.path.join(self._pred_output_path, 'predictions.json'), 'w') as writer: - for p in self._preds: - writer.write(str(p)+'\n') - print('Predictions saved at '+os.path.join(self._pred_output_path, 'predictions.json')) - - diff --git a/tasktype/match.py b/tasktype/match.py deleted file mode 100644 index ee0d175b01e09ede242aa7fe404366dc48804580..0000000000000000000000000000000000000000 --- a/tasktype/match.py +++ /dev/null @@ -1,105 +0,0 @@ -# -*- coding: UTF-8 -*- -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.fluid as fluid -from paddle.fluid import layers -from paddlepalm.interface import task_paradigm -import numpy as np -import os - -class TaskParadigm(task_paradigm): - ''' - matching - ''' - def __init__(self, config, phase, backbone_config=None): - self._is_training = phase == 'train' - self._hidden_size = backbone_config['hidden_size'] - - if 'initializer_range' in config: - self._param_initializer = config['initializer_range'] - else: - self._param_initializer = fluid.initializer.TruncatedNormal( - scale=backbone_config.get('initializer_range', 0.02)) - if 'dropout_prob' in config: - self._dropout_prob = config['dropout_prob'] - else: - self._dropout_prob = backbone_config.get('hidden_dropout_prob', 0.0) - - self._pred_output_path = config.get('pred_output_path', None) - self._preds = [] - - - @property - def inputs_attrs(self): - if self._is_training: - reader = {"label_ids": [[-1, 1], 'int64']} - else: - reader = {} - bb = {"sentence_pair_embedding": [[-1, self._hidden_size], 'float32']} - return {'reader': reader, 'backbone': bb} - - @property - def outputs_attrs(self): - if self._is_training: - return {"loss": [[1], 'float32']} - else: - return {"logits": [[-1, 2], 'float32']} - - def build(self, inputs, scope_name=""): - if self._is_training: - labels = inputs["reader"]["label_ids"] - cls_feats = inputs["backbone"]["sentence_pair_embedding"] - - if self._is_training: - cls_feats = fluid.layers.dropout( - x=cls_feats, - dropout_prob=self._dropout_prob, - dropout_implementation="upscale_in_train") - - logits = fluid.layers.fc( - input=cls_feats, - size=2, - param_attr=fluid.ParamAttr( - name=scope_name+"cls_out_w", - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr( - name=scope_name+"cls_out_b", - initializer=fluid.initializer.Constant(0.))) - - if self._is_training: - ce_loss, probs = fluid.layers.softmax_with_cross_entropy( - logits=logits, label=labels, return_softmax=True) - loss = fluid.layers.mean(x=ce_loss) - return {'loss': loss} - else: - return {'logits': logits} - - def postprocess(self, rt_outputs): - if not self._is_training: - logits = rt_outputs['logits'] - preds = np.argmax(logits, -1) - self._preds.extend(preds.tolist()) - - def epoch_postprocess(self, post_inputs): - # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs - if not self._is_training: - if 
self._pred_output_path is None: - raise ValueError('argument pred_output_path not found in config. Please add it into config dict/file.') - with open(os.path.join(self._pred_output_path, 'predictions.json'), 'w') as writer: - for p in self._preds: - writer.write(str(p)+'\n') - print('Predictions saved at '+os.path.join(self._pred_output_path, 'predictions.json')) - - diff --git a/tasktype/mlm.py b/tasktype/mlm.py deleted file mode 100644 index ec86dd151e8b0f86c345120f4a5907f0afb91d5c..0000000000000000000000000000000000000000 --- a/tasktype/mlm.py +++ /dev/null @@ -1,110 +0,0 @@ -# -*- coding: UTF-8 -*- -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.fluid as fluid -from paddlepalm.interface import task_paradigm -from paddle.fluid import layers -from paddlepalm.backbone.utils.transformer import pre_process_layer - -class TaskParadigm(task_paradigm): - ''' - matching - ''' - def __init__(self, config, phase, backbone_config=None): - self._is_training = phase == 'train' - self._emb_size = backbone_config['hidden_size'] - self._hidden_size = backbone_config['hidden_size'] - self._vocab_size = backbone_config['vocab_size'] - self._hidden_act = backbone_config['hidden_act'] - self._initializer_range = backbone_config['initializer_range'] - - @property - def inputs_attrs(self): - reader = { - "mask_label": [[-1, 1], 'int64'], - "mask_pos": [[-1, 1], 'int64']} - if not self._is_training: - del reader['mask_label'] - del reader['batchsize_x_seqlen'] - bb = { - "encoder_outputs": [[-1, -1, self._hidden_size], 'float32'], - "embedding_table": [[-1, self._vocab_size, self._emb_size], 'float32']} - return {'reader': reader, 'backbone': bb} - - @property - def outputs_attrs(self): - if self._is_training: - return {"loss": [[1], 'float32']} - else: - return {"logits": [[-1], 'float32']} - - def build(self, inputs, scope_name=""): - mask_pos = inputs["reader"]["mask_pos"] - if self._is_training: - mask_label = inputs["reader"]["mask_label"] - max_position = inputs["reader"]["batchsize_x_seqlen"] - 1 - mask_pos = fluid.layers.elementwise_min(mask_pos, max_position) - mask_pos.stop_gradient = True - - word_emb = inputs["backbone"]["embedding_table"] - enc_out = inputs["backbone"]["encoder_outputs"] - - emb_size = word_emb.shape[-1] - - _param_initializer = fluid.initializer.TruncatedNormal( - scale=self._initializer_range) - - reshaped_emb_out = fluid.layers.reshape( - x=enc_out, shape=[-1, emb_size]) - - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - - # transform: fc - mask_trans_feat = fluid.layers.fc( - input=mask_feat, - size=emb_size, - act=self._hidden_act, - param_attr=fluid.ParamAttr( - name=scope_name+'mask_lm_trans_fc.w_0', - initializer=_param_initializer), - bias_attr=fluid.ParamAttr(name=scope_name+'mask_lm_trans_fc.b_0')) - # transform: layer norm - mask_trans_feat = pre_process_layer( - mask_trans_feat, 'n', name=scope_name+'mask_lm_trans') - - 
mask_lm_out_bias_attr = fluid.ParamAttr( - name=scope_name+"mask_lm_out_fc.b_0", - initializer=fluid.initializer.Constant(value=0.0)) - - fc_out = fluid.layers.matmul( - x=mask_trans_feat, - y=word_emb, - transpose_y=True) - fc_out += fluid.layers.create_parameter( - shape=[self._vocab_size], - dtype='float32', - attr=mask_lm_out_bias_attr, - is_bias=True) - - if self._is_training: - mask_lm_loss = fluid.layers.softmax_with_cross_entropy( - logits=fc_out, label=mask_label) - loss = fluid.layers.mean(mask_lm_loss) - return {'loss': loss} - else: - return {'logits': fc_out} - - diff --git a/tasktype/mrc.py b/tasktype/mrc.py deleted file mode 100644 index b1f0b5688d18fc9c84156a1570f24815febba17f..0000000000000000000000000000000000000000 --- a/tasktype/mrc.py +++ /dev/null @@ -1,492 +0,0 @@ -# -*- coding: UTF-8 -*- -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.fluid as fluid -from paddlepalm.interface import task_paradigm -import collections -import numpy as np -import os -import math -import six -import paddlepalm.tokenizer.ernie_tokenizer as tokenization -import json - -RawResult = collections.namedtuple("RawResult", - ["unique_id", "start_logits", "end_logits"]) - -class TaskParadigm(task_paradigm): - """""" - - def __init__(self, config, phase, backbone_config=None): - - self._is_training = phase == 'train' - self._max_sequence_length = config['max_seq_len'] - self._hidden_size = backbone_config['hidden_size'] - self._pred_results = [] - - if phase == 'pred': - self._max_answer_length = config.get('max_answer_len', None) - self._null_score_diff_threshold = config.get('null_score_diff_threshold', 0.0) - self._n_best_size = config.get('n_best_size', 20) - self._pred_output_path = config.get('pred_output_path', None) - self._verbose = config.get('verbose', False) - self._with_negative = config.get('with_negative', False) - self._do_lower_case = config.get('do_lower_case', False) - - - @property - def inputs_attrs(self): - if self._is_training: - reader = {"start_positions": [[-1, 1], 'int64'], - "end_positions": [[-1, 1], 'int64'], - } - else: - reader = {'unique_ids': [[-1, 1], 'int64']} - bb = {"encoder_outputs": [[-1, -1, self._hidden_size], 'float32']} - return {'reader': reader, 'backbone': bb} - - @property - def epoch_inputs_attrs(self): - if not self._is_training: - from_reader = {'examples': None, 'features': None} - return {'reader': from_reader} - - @property - def outputs_attr(self): - if self._is_training: - return {'loss': [[1], 'float32']} - else: - return {'start_logits': [[-1, -1, 1], 'float32'], - 'end_logits': [[-1, -1, 1], 'float32'], - 'unique_ids': [[-1, 1], 'int64']} - - - def build(self, inputs, scope_name=""): - if self._is_training: - start_positions = inputs['reader']['start_positions'] - end_positions = inputs['reader']['end_positions'] - max_position = inputs["reader"]["seqlen"] - 1 - start_positions = fluid.layers.elementwise_min(start_positions, max_position) - end_positions = 
fluid.layers.elementwise_min(end_positions, max_position) - start_positions.stop_gradient = True - end_positions.stop_gradient = True - else: - unique_id = inputs['reader']['unique_ids'] - - enc_out = inputs['backbone']['encoder_outputs'] - logits = fluid.layers.fc( - input=enc_out, - size=2, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=scope_name+"cls_squad_out_w", - initializer=fluid.initializer.TruncatedNormal(scale=0.02)), - bias_attr=fluid.ParamAttr( - name=scope_name+"cls_squad_out_b", initializer=fluid.initializer.Constant(0.))) - - logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1]) - start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0) - - def _compute_single_loss(logits, positions): - """Compute start/end loss for mrc model""" - loss = fluid.layers.softmax_with_cross_entropy( - logits=logits, label=positions) - loss = fluid.layers.mean(x=loss) - return loss - - if self._is_training: - start_loss = _compute_single_loss(start_logits, start_positions) - end_loss = _compute_single_loss(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2.0 - return {'loss': total_loss} - else: - return {'start_logits': start_logits, - 'end_logits': end_logits, - 'unique_ids': unique_id} - - - def postprocess(self, rt_outputs): - """this func will be called after each step(batch) of training/evaluating/predicting process.""" - if not self._is_training: - unique_ids = np.squeeze(rt_outputs['unique_ids'], -1) - start_logits = rt_outputs['start_logits'] - end_logits = rt_outputs['end_logits'] - for idx in range(len(unique_ids)): - - if unique_ids[idx] < 0: - continue - if len(self._pred_results) % 1000 == 0: - print("Predicting example: {}".format(len(self._pred_results))) - uid = int(unique_ids[idx]) - - s = [float(x) for x in start_logits[idx].flat] - e = [float(x) for x in end_logits[idx].flat] - self._pred_results.append( - RawResult( - unique_id=uid, - start_logits=s, - end_logits=e)) - - def epoch_postprocess(self, post_inputs): - """(optional interface) this func will be called after evaluation/predicting process and each epoch during training process.""" - - if not self._is_training: - if self._pred_output_path is None: - raise ValueError('argument pred_output_path not found in config. 
Please add it into config dict/file.')
-            examples = post_inputs['reader']['examples']
-            features = post_inputs['reader']['features']
-            if not os.path.exists(self._pred_output_path):
-                os.makedirs(self._pred_output_path)
-            output_prediction_file = os.path.join(self._pred_output_path, "predictions.json")
-            output_nbest_file = os.path.join(self._pred_output_path, "nbest_predictions.json")
-            output_null_log_odds_file = os.path.join(self._pred_output_path, "null_odds.json")
-            _write_predictions(examples, features, self._pred_results,
-                               self._n_best_size, self._max_answer_length,
-                               self._do_lower_case, output_prediction_file,
-                               output_nbest_file, output_null_log_odds_file,
-                               self._with_negative,
-                               self._null_score_diff_threshold, self._verbose)
-
-
-def _write_predictions(all_examples, all_features, all_results, n_best_size,
-                       max_answer_length, do_lower_case, output_prediction_file,
-                       output_nbest_file, output_null_log_odds_file,
-                       with_negative, null_score_diff_threshold,
-                       verbose):
-    """Write final predictions to the json file and log-odds of null if needed."""
-    print("Writing predictions to: %s" % (output_prediction_file))
-    print("Writing nbest to: %s" % (output_nbest_file))
-
-    example_index_to_features = collections.defaultdict(list)
-    for feature in all_features:
-        example_index_to_features[feature.example_index].append(feature)
-
-    unique_id_to_result = {}
-    for result in all_results:
-        unique_id_to_result[result.unique_id] = result
-
-    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction", [
-            "feature_index", "start_index", "end_index", "start_logit",
-            "end_logit"
-        ])
-
-    all_predictions = collections.OrderedDict()
-    all_nbest_json = collections.OrderedDict()
-    scores_diff_json = collections.OrderedDict()
-
-    for (example_index, example) in enumerate(all_examples):
-        features = example_index_to_features[example_index]
-
-        prelim_predictions = []
-        # keep track of the minimum score of null start+end of position 0
-        score_null = 1000000  # large and positive
-        min_null_feature_index = 0  # the paragraph slice with min null score
-        null_start_logit = 0  # the start logit at the slice with min null score
-        null_end_logit = 0  # the end logit at the slice with min null score
-        for (feature_index, feature) in enumerate(features):
-            result = unique_id_to_result[feature.unique_id]
-            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
-            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
-            # if we could have irrelevant answers, get the min score of irrelevant
-            if with_negative:
-                feature_null_score = result.start_logits[0] + result.end_logits[0]
-                if feature_null_score < score_null:
-                    score_null = feature_null_score
-                    min_null_feature_index = feature_index
-                    null_start_logit = result.start_logits[0]
-                    null_end_logit = result.end_logits[0]
-            for start_index in start_indexes:
-                for end_index in end_indexes:
-                    # We could hypothetically create invalid predictions, e.g., predict
-                    # that the start of the span is in the question. We throw out all
-                    # invalid predictions.
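-                    # Concretely, a span is dropped if either end falls outside
-                    # this feature's tokens, maps back into the question (i.e.
-                    # is missing from token_to_orig_map), does not start inside
-                    # the max-context window for this doc span, is reversed, or
-                    # is longer than max_answer_length.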
-                    if start_index >= len(feature.tokens):
-                        continue
-                    if end_index >= len(feature.tokens):
-                        continue
-                    if start_index not in feature.token_to_orig_map:
-                        continue
-                    if end_index not in feature.token_to_orig_map:
-                        continue
-                    if not feature.token_is_max_context.get(start_index, False):
-                        continue
-                    if end_index < start_index:
-                        continue
-                    length = end_index - start_index + 1
-                    if length > max_answer_length:
-                        continue
-                    prelim_predictions.append(
-                        _PrelimPrediction(
-                            feature_index=feature_index,
-                            start_index=start_index,
-                            end_index=end_index,
-                            start_logit=result.start_logits[start_index],
-                            end_logit=result.end_logits[end_index]))
-
-        if with_negative:
-            prelim_predictions.append(
-                _PrelimPrediction(
-                    feature_index=min_null_feature_index,
-                    start_index=0,
-                    end_index=0,
-                    start_logit=null_start_logit,
-                    end_logit=null_end_logit))
-        prelim_predictions = sorted(
-            prelim_predictions,
-            key=lambda x: (x.start_logit + x.end_logit),
-            reverse=True)
-
-        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-            "NbestPrediction", ["text", "start_logit", "end_logit"])
-
-        seen_predictions = {}
-        nbest = []
-        for pred in prelim_predictions:
-            if len(nbest) >= n_best_size:
-                break
-            feature = features[pred.feature_index]
-            if pred.start_index > 0:  # this is a non-null prediction
-                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
-                orig_doc_start = feature.token_to_orig_map[pred.start_index]
-                orig_doc_end = feature.token_to_orig_map[pred.end_index]
-                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
-                tok_text = " ".join(tok_tokens)
-
-                # De-tokenize WordPieces that have been split off.
-                tok_text = tok_text.replace(" ##", "")
-                tok_text = tok_text.replace("##", "")
-
-                # Clean whitespace
-                tok_text = tok_text.strip()
-                tok_text = " ".join(tok_text.split())
-                orig_text = " ".join(orig_tokens)
-
-                final_text = _get_final_text(tok_text, orig_text, do_lower_case,
-                                             verbose)
-                if final_text in seen_predictions:
-                    continue
-
-                seen_predictions[final_text] = True
-            else:
-                final_text = ""
-                seen_predictions[final_text] = True
-
-            nbest.append(
-                _NbestPrediction(
-                    text=final_text,
-                    start_logit=pred.start_logit,
-                    end_logit=pred.end_logit))
-
-        # if we didn't include the empty option in the n-best, include it
-        if with_negative:
-            if "" not in seen_predictions:
-                nbest.append(
-                    _NbestPrediction(
-                        text="",
-                        start_logit=null_start_logit,
-                        end_logit=null_end_logit))
-        # In very rare edge cases we could have no valid predictions. So we
-        # just create a nonce prediction in this case to avoid failure.
-        if not nbest:
-            nbest.append(
-                _NbestPrediction(
-                    text="empty", start_logit=0.0, end_logit=0.0))
-
-        assert len(nbest) >= 1
-
-        total_scores = []
-        best_non_null_entry = None
-        for entry in nbest:
-            total_scores.append(entry.start_logit + entry.end_logit)
-            if not best_non_null_entry:
-                if entry.text:
-                    best_non_null_entry = entry
-        # debug
-        if best_non_null_entry is None:
-            print("Warning: all n-best entries are empty; no best non-null entry found.")
-
-        probs = _compute_softmax(total_scores)
-
-        nbest_json = []
-        for (i, entry) in enumerate(nbest):
-            output = collections.OrderedDict()
-            output["text"] = entry.text
-            output["probability"] = probs[i]
-            output["start_logit"] = entry.start_logit
-            output["end_logit"] = entry.end_logit
-            nbest_json.append(output)
-
-        assert len(nbest_json) >= 1
-
-        if not with_negative:
-            all_predictions[example.qas_id] = nbest_json[0]["text"]
-        else:
-            # predict "" iff the null score - the score of best non-null > threshold
-            score_diff = score_null - best_non_null_entry.start_logit - (
-                best_non_null_entry.end_logit)
-            scores_diff_json[example.qas_id] = score_diff
-            if score_diff > null_score_diff_threshold:
-                all_predictions[example.qas_id] = ""
-            else:
-                all_predictions[example.qas_id] = best_non_null_entry.text
-
-        all_nbest_json[example.qas_id] = nbest_json
-
-    with open(output_prediction_file, "w") as writer:
-        writer.write(json.dumps(all_predictions, indent=4) + "\n")
-
-    with open(output_nbest_file, "w") as writer:
-        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
-
-    if with_negative:
-        with open(output_null_log_odds_file, "w") as writer:
-            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
-
-
-def _get_final_text(pred_text, orig_text, do_lower_case, verbose):
-    """Project the tokenized prediction back to the original text."""
-
-    # When we created the data, we kept track of the alignment between original
-    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
-    # now `orig_text` contains the span of our original text corresponding to the
-    # span that we predicted.
-    #
-    # However, `orig_text` may contain extra characters that we don't want in
-    # our prediction.
-    #
-    # For example, let's say:
-    #   pred_text = steve smith
-    #   orig_text = Steve Smith's
-    #
-    # We don't want to return `orig_text` because it contains the extra "'s".
-    #
-    # We don't want to return `pred_text` because it's already been normalized
-    # (the MRQA eval script also does punctuation stripping/lower casing but
-    # our tokenizer does additional normalization like stripping accent
-    # characters).
-    #
-    # What we really want to return is "Steve Smith".
-    #
-    # Therefore, we have to apply a semi-complicated alignment heuristic between
-    # `pred_text` and `orig_text` to get a character-to-character alignment. This
-    # can fail in certain cases in which case we just return `orig_text`.
-
-    def _strip_spaces(text):
-        ns_chars = []
-        ns_to_s_map = collections.OrderedDict()
-        for (i, c) in enumerate(text):
-            if c == " ":
-                continue
-            ns_to_s_map[len(ns_chars)] = i
-            ns_chars.append(c)
-        ns_text = "".join(ns_chars)
-        return (ns_text, ns_to_s_map)
-
-    # We first tokenize `orig_text`, strip whitespace from the result
-    # and `pred_text`, and check if they are the same length. If they are
-    # NOT the same length, the heuristic has failed. If they are the same
-    # length, we assume the characters are one-to-one aligned.
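-    # BasicTokenizer applies the same basic normalization (whitespace and
-    # punctuation splitting, optional lower-casing) that `pred_text` has
-    # already been through, which keeps the two strings character-comparable.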
- tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) - - tok_text = " ".join(tokenizer.tokenize(orig_text)) - - start_position = tok_text.find(pred_text) - if start_position == -1: - if verbose: - print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) - return orig_text - end_position = start_position + len(pred_text) - 1 - - (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) - (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) - - if len(orig_ns_text) != len(tok_ns_text): - if verbose: - print("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) - return orig_text - - # We then project the characters in `pred_text` back to `orig_text` using - # the character-to-character alignment. - tok_s_to_ns_map = {} - for (i, tok_index) in six.iteritems(tok_ns_to_s_map): - tok_s_to_ns_map[tok_index] = i - - orig_start_position = None - if start_position in tok_s_to_ns_map: - ns_start_position = tok_s_to_ns_map[start_position] - if ns_start_position in orig_ns_to_s_map: - orig_start_position = orig_ns_to_s_map[ns_start_position] - - if orig_start_position is None: - if verbose: - print("Couldn't map start position") - return orig_text - - orig_end_position = None - if end_position in tok_s_to_ns_map: - ns_end_position = tok_s_to_ns_map[end_position] - if ns_end_position in orig_ns_to_s_map: - orig_end_position = orig_ns_to_s_map[ns_end_position] - - if orig_end_position is None: - if verbose: - print("Couldn't map end position") - return orig_text - - output_text = orig_text[orig_start_position:(orig_end_position + 1)] - return output_text - - -def _get_best_indexes(logits, n_best_size): - """Get the n-best logits from a list.""" - index_and_score = sorted( - enumerate(logits), key=lambda x: x[1], reverse=True) - - best_indexes = [] - for i in range(len(index_and_score)): - if i >= n_best_size: - break - best_indexes.append(index_and_score[i][0]) - return best_indexes - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs - -
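The two helpers directly above, `_get_best_indexes` and `_compute_softmax`, are plain Python with no Paddle dependency, so they can be sanity-checked standalone. A minimal sketch, with names shortened here and bodies intended to mirror the originals:

import math

def get_best_indexes(logits, n_best_size):
    # indexes of the n_best_size largest logits, best first
    ranked = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
    return [idx for idx, _ in ranked[:n_best_size]]

def compute_softmax(scores):
    # numerically stable softmax over a plain list of floats
    if not scores:
        return []
    max_score = max(scores)
    exps = [math.exp(s - max_score) for s in scores]
    total = sum(exps)
    return [e / total for e in exps]

if __name__ == '__main__':
    start_logits = [0.1, 2.3, -1.0, 4.2]
    assert get_best_indexes(start_logits, 2) == [3, 1]
    probs = compute_softmax([4.2, 2.3])
    assert abs(sum(probs) - 1.0) < 1e-9  # softmax probabilities sum to 1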