diff --git a/README.md b/README.md index e51a2dff77e464b4c94b89ceb9ab2af7700d125c..9d3a638bd19d894c3d4cba64ae66fd43780ad45c 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ cd PALM && python setup.py install **环境依赖** -- Python >= 2.7 (即将支持python3) +- Python >= 2.7 - cuda >= 9.0 - cudnn >= 7.0 - PaddlePaddle >= 1.6.1 (请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装) diff --git a/demo/demo3/pretrain b/demo/demo3/pretrain deleted file mode 120000 index b67f0fca255ccbeb0e9feb6b93309a81b63162ad..0000000000000000000000000000000000000000 --- a/demo/demo3/pretrain +++ /dev/null @@ -1 +0,0 @@ -../../pretrain/ \ No newline at end of file diff --git a/download_models.py b/download_models.py new file mode 100644 index 0000000000000000000000000000000000000000..e3cfd80dc4d8c8fbcb1b1d9e5eb9887db1382ae9 --- /dev/null +++ b/download_models.py @@ -0,0 +1,34 @@ +# -*- coding: UTF-8 -*- +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddlepalm as palm +import sys +import argparse + +# create parser +parser = argparse.ArgumentParser(prog='download_models.py', usage='python %(prog)s -l | -d [-h]\n\nFor example,\n\tpython %(prog)s -d bert-en-uncased-large ',description = 'Download pretrain models for initializing params of backbones. ') +parser1= parser.add_argument_group("required arguments") +parser1.add_argument('-l','--list', action = 'store_true', help = 'show the list of available pretrain models', default = False) +parser1.add_argument('-d','--download', action = 'store', help = 'download pretrain models. 
The available pretrain models can be listed by run "python download_models.py -l"') +args = parser.parse_args() + +if(args.list): + palm.downloader.ls('pretrain') +elif(args.download): + print('download~~~') + print(args.download) + palm.downloader.download('pretrain', args.download) +else: + print (parser.parse_args(['-h'])) diff --git a/paddlepalm/_downloader.py b/paddlepalm/_downloader.py index 1b8de4b629a491148e43b71f96cb70c0542d15d4..b36cbcc94df9769dc4c86a87956b9b345418786f 100644 --- a/paddlepalm/_downloader.py +++ b/paddlepalm/_downloader.py @@ -34,13 +34,16 @@ _items = { 'pretrain': {'ernie-en-uncased-large': 'https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz', 'bert-en-uncased-large': 'https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz', 'bert-en-uncased-base': 'https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz', + 'ernie-ch-uncased-base':'https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz', + 'roberta-cn-base': 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz', + 'roberta-cn-large': 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz', 'utils': None}, 'reader': {'utils': None}, 'backbone': {'utils': None}, 'tasktype': {'utils': None}, } -def _download(item, scope, path, silent=False): +def _download(item, scope, path, silent=False, convert=False): data_url = _items[item][scope] if data_url == None: return @@ -100,9 +103,10 @@ def _download(item, scope, path, silent=False): os.removedirs(source_path) if not silent: print ('done!') - if not silent: - print ('Converting params...', end=" ") - _convert(data_dir, silent) + if convert: + if not silent: + print ('Converting params...', end=" ") + _convert(data_dir, silent) if not silent: print ('done!') diff --git a/paddlepalm/backbone/bert.py b/paddlepalm/backbone/bert.py index 06080e1df7e4c8a0b0419c13249228cdf059fc47..2872db727fcc065298c88ccab3dedefbba171817 100644 --- a/paddlepalm/backbone/bert.py +++ b/paddlepalm/backbone/bert.py @@ -29,22 +29,31 @@ from paddlepalm.backbone.base_backbone import Backbone class BERT(Backbone): - def __init__(hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \ + def __init__(self, hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \ max_position_embeddings, type_vocab_size, hidden_act, hidden_dropout_prob, \ - attention_probs_dropout_prob, initializer_range, phase='train'): - config = {} - config['hidden_size'] = hidden_size - config['num_hidden_layers'] = num_hidden_layers - config['num_attention_heads'] = num_attention_heads - config['vocab_size'] = vocab_size - config['max_position_embeddings'] = max_position_embeddings - config['type_vocab_size'] = sent_type_vocab_size - config['hidden_act'] = hidden_act - config['hidden_dropout_prob'] = hidden_dropout_prob - config['attention_probs_dropout_prob'] = attention_probs_dropout_prob - config['initializer_range'] = initializer_range - - self.from_config(config, phase=phase) + attention_probs_dropout_prob, initializer_range, is_pairwise=False, phase='train'): + + self._emb_size = hidden_size + self._n_layer = num_hidden_layers + self._n_head = num_attention_heads + self._voc_size = vocab_size + self._max_position_seq_len = max_position_embeddings + self._sent_types = type_vocab_size + + + self._hidden_act = hidden_act + self._prepostprocess_dropout = hidden_dropout_prob + self._attention_dropout = attention_probs_dropout_prob + + self._word_emb_name = "word_embedding" + self._pos_emb_name = "pos_embedding" + 
self._sent_emb_name = "sent_embedding" + self._task_emb_name = "task_embedding" + self._emb_dtype = "float32" + self._phase = phase + self._is_pairwise = is_pairwise + self._param_initializer = fluid.initializer.TruncatedNormal( + scale=initializer_range) @classmethod def from_config(self, config, phase='train'): @@ -62,40 +71,57 @@ class BERT(Backbone): "{} is required to initialize ERNIE".format('attention_probs_dropout_prob') assert 'initializer_range' in config, "{} is required to initialize ERNIE".format('initializer_range') - # self._is_training = phase == 'train' # backbone一般不用关心运行阶段,因为outputs在任何阶段基本不会变 - self._emb_size = config["hidden_size"] - self._n_layer = config["num_hidden_layers"] - self._n_head = config["num_attention_heads"] - self._voc_size = config["vocab_size"] - self._max_position_seq_len = config["max_position_embeddings"] - self._sent_types = config["type_vocab_size"] - self._hidden_act = config["hidden_act"] - self._prepostprocess_dropout = config["hidden_dropout_prob"] - self._attention_dropout = config["attention_probs_dropout_prob"] - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. - self._param_initializer = fluid.initializer.TruncatedNormal( - scale=config["initializer_range"]) + hidden_size = config['hidden_size'] + num_hidden_layers = config['num_hidden_layers'] + num_attention_heads = config['num_attention_heads'] + vocab_size = config['vocab_size'] + max_position_embeddings = config['max_position_embeddings'] + if 'sent_type_vocab_size' in config: + sent_type_vocab_size = config['sent_type_vocab_size'] + else: + sent_type_vocab_size = config['type_vocab_size'] + + hidden_act = config['hidden_act'] + hidden_dropout_prob = config['hidden_dropout_prob'] + attention_probs_dropout_prob = config['attention_probs_dropout_prob'] + initializer_range = config['initializer_range'] + if 'is_pairwise' in config: + is_pairwise = config['is_pairwise'] + else: + is_pairwise = False + + return self(hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \ + max_position_embeddings, sent_type_vocab_size, \ + hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise, phase) @property def inputs_attr(self): - return {"token_ids": [[-1, -1], 'int64'], - "position_ids": [[-1, -1], 'int64'], - "segment_ids": [[-1, -1], 'int64'], - "input_mask": [[-1, -1, 1], 'float32']} + ret = {"token_ids": [[-1, -1], 'int64'], + "position_ids": [[-1, -1], 'int64'], + "segment_ids": [[-1, -1], 'int64'], + "input_mask": [[-1, -1, 1], 'float32'], + } + if self._is_pairwise and self._phase=='train': + ret.update({"token_ids_neg": [[-1, -1], 'int64'], + "position_ids_neg": [[-1, -1], 'int64'], + "segment_ids_neg": [[-1, -1], 'int64'], + "input_mask_neg": [[-1, -1, 1], 'float32'], + }) + return ret @property def outputs_attr(self): - return {"word_embedding": [[-1, -1, self._emb_size], 'float32'], - "embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'], - "encoder_outputs": [[-1, -1, self._emb_size], 'float32'], - "sentence_embedding": [[-1, self._emb_size], 'float32'], - "sentence_pair_embedding": [[-1, self._emb_size], 'float32']} + ret = {"word_embedding": [[-1, -1, self._emb_size], 'float32'], + "embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'], + "encoder_outputs": [[-1, -1, self._emb_size], 'float32'], + 
"sentence_embedding": [[-1, self._emb_size], 'float32'], + "sentence_pair_embedding": [[-1, self._emb_size], 'float32']} + if self._is_pairwise and self._phase == 'train': + ret.update({"word_embedding_neg": [[-1, -1, self._emb_size], 'float32'], + "encoder_outputs_neg": [[-1, -1, self._emb_size], 'float32'], + "sentence_embedding_neg": [[-1, self._emb_size], 'float32'], + "sentence_pair_embedding_neg": [[-1, self._emb_size], 'float32']}) + return ret def build(self, inputs, scope_name=""): src_ids = inputs['token_ids'] @@ -104,83 +130,111 @@ class BERT(Backbone): input_mask = inputs['input_mask'] self._emb_dtype = 'float32' - # padding id in vocabulary must be set to 0 - emb_out = fluid.embedding( - input=src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=scope_name+self._word_emb_name, initializer=self._param_initializer), - is_sparse=False) - - # fluid.global_scope().find_var('backbone-word_embedding').get_tensor() - embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name) - - position_emb_out = fluid.embedding( - input=pos_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=scope_name+self._pos_emb_name, initializer=self._param_initializer)) - - sent_emb_out = fluid.embedding( - sent_ids, - size=[self._sent_types, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=scope_name+self._sent_emb_name, initializer=self._param_initializer)) - - emb_out = emb_out + position_emb_out - emb_out = emb_out + sent_emb_out - - emb_out = pre_process_layer( - emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder') - - self_attn_mask = fluid.layers.matmul( - x=input_mask, y=input_mask, transpose_y=True) - - self_attn_mask = fluid.layers.scale( - x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) - n_head_self_attn_mask = fluid.layers.stack( - x=[self_attn_mask] * self._n_head, axis=1) - n_head_self_attn_mask.stop_gradient = True - - enc_out = encoder( - enc_input=emb_out, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._emb_size // self._n_head, - d_value=self._emb_size // self._n_head, - d_model=self._emb_size, - d_inner_hid=self._emb_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd="", - postprocess_cmd="dan", - param_initializer=self._param_initializer, - name=scope_name+'encoder') + input_buffer = {} + output_buffer = {} + input_buffer['base'] = [src_ids, pos_ids, sent_ids, input_mask] + output_buffer['base'] = {} + + if self._is_pairwise and self._phase =='train': + src_ids = inputs['token_ids_neg'] + pos_ids = inputs['position_ids_neg'] + sent_ids = inputs['segment_ids_neg'] + input_mask = inputs['input_mask_neg'] + input_buffer['neg'] = [src_ids, pos_ids, sent_ids, input_mask] + output_buffer['neg'] = {} - next_sent_feat = fluid.layers.slice( - input=enc_out, axes=[1], starts=[0], ends=[1]) - next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]]) - next_sent_feat = fluid.layers.fc( - input=next_sent_feat, - size=self._emb_size, - act="tanh", - param_attr=fluid.ParamAttr( - name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer), - bias_attr=scope_name+"pooled_fc.b_0") - - return {'embedding_table': embedding_table, - 'word_embedding': emb_out, - 
'encoder_outputs': enc_out, - 'sentence_embedding': next_sent_feat, - 'sentence_pair_embedding': next_sent_feat} - + for key, (src_ids, pos_ids, sent_ids, input_mask) in input_buffer.items(): + # padding id in vocabulary must be set to 0 + emb_out = fluid.embedding( + input=src_ids, + size=[self._voc_size, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=scope_name+self._word_emb_name, initializer=self._param_initializer), + is_sparse=False) + + # fluid.global_scope().find_var('backbone-word_embedding').get_tensor() + embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name) + + position_emb_out = fluid.embedding( + input=pos_ids, + size=[self._max_position_seq_len, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=scope_name+self._pos_emb_name, initializer=self._param_initializer)) + + sent_emb_out = fluid.embedding( + sent_ids, + size=[self._sent_types, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=scope_name+self._sent_emb_name, initializer=self._param_initializer)) + + emb_out = emb_out + position_emb_out + emb_out = emb_out + sent_emb_out + + emb_out = pre_process_layer( + emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder') + + self_attn_mask = fluid.layers.matmul( + x=input_mask, y=input_mask, transpose_y=True) + + self_attn_mask = fluid.layers.scale( + x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) + n_head_self_attn_mask = fluid.layers.stack( + x=[self_attn_mask] * self._n_head, axis=1) + n_head_self_attn_mask.stop_gradient = True + + enc_out = encoder( + enc_input=emb_out, + attn_bias=n_head_self_attn_mask, + n_layer=self._n_layer, + n_head=self._n_head, + d_key=self._emb_size // self._n_head, + d_value=self._emb_size // self._n_head, + d_model=self._emb_size, + d_inner_hid=self._emb_size * 4, + prepostprocess_dropout=self._prepostprocess_dropout, + attention_dropout=self._attention_dropout, + relu_dropout=0, + hidden_act=self._hidden_act, + preprocess_cmd="", + postprocess_cmd="dan", + param_initializer=self._param_initializer, + name=scope_name+'encoder') + + + next_sent_feat = fluid.layers.slice( + input=enc_out, axes=[1], starts=[0], ends=[1]) + next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]]) + next_sent_feat = fluid.layers.fc( + input=next_sent_feat, + size=self._emb_size, + act="tanh", + param_attr=fluid.ParamAttr( + name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer), + bias_attr=scope_name+"pooled_fc.b_0") + output_buffer[key]['word_embedding'] = emb_out + output_buffer[key]['encoder_outputs'] = enc_out + output_buffer[key]['sentence_embedding'] = next_sent_feat + output_buffer[key]['sentence_pair_embedding'] = next_sent_feat + + ret = {} + ret['embedding_table'] = embedding_table + ret['word_embedding'] = output_buffer['base']['word_embedding'] + ret['encoder_outputs'] = output_buffer['base']['encoder_outputs'] + ret['sentence_embedding'] = output_buffer['base']['sentence_embedding'] + ret['sentence_pair_embedding'] = output_buffer['base']['sentence_pair_embedding'] + + if self._is_pairwise and self._phase == 'train': + ret['word_embedding_neg'] = output_buffer['neg']['word_embedding'] + ret['encoder_outputs_neg'] = output_buffer['neg']['encoder_outputs'] + ret['sentence_embedding_neg'] = output_buffer['neg']['sentence_embedding'] + ret['sentence_pair_embedding_neg'] = output_buffer['neg']['sentence_pair_embedding'] + + return ret + def 
postprocess(self, rt_outputs): pass diff --git a/paddlepalm/backbone/ernie.py b/paddlepalm/backbone/ernie.py index 615766aa3a585dc081dc8da4bcddd40acfbd0976..bbfbde65bb1f7fabfff30384b17edcc9387c8478 100644 --- a/paddlepalm/backbone/ernie.py +++ b/paddlepalm/backbone/ernie.py @@ -31,10 +31,10 @@ class ERNIE(Backbone): def __init__(self, hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \ max_position_embeddings, sent_type_vocab_size, task_type_vocab_size, \ - hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, phase='train'): + hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise=False, phase='train'): # self._is_training = phase == 'train' # backbone一般不用关心运行阶段,因为outputs在任何阶段基本不会变 - + self._emb_size = hidden_size self._n_layer = num_hidden_layers self._n_head = num_attention_heads @@ -53,7 +53,8 @@ class ERNIE(Backbone): self._sent_emb_name = "sent_embedding" self._task_emb_name = "task_embedding" self._emb_dtype = "float32" - + self._is_pairwise = is_pairwise + self._phase=phase self._param_initializer = fluid.initializer.TruncatedNormal( scale=initializer_range) @@ -65,7 +66,7 @@ class ERNIE(Backbone): assert 'vocab_size' in config, "{} is required to initialize ERNIE".format('vocab_size') assert 'max_position_embeddings' in config, "{} is required to initialize ERNIE".format('max_position_embeddings') assert 'sent_type_vocab_size' in config or 'type_vocab_size' in config, "{} is required to initialize ERNIE".format('sent_type_vocab_size') - assert 'task_type_vocab_size' in config, "{} is required to initialize ERNIE".format('task_type_vocab_size') + # assert 'task_type_vocab_size' in config, "{} is required to initialize ERNIE".format('task_type_vocab_size') assert 'hidden_act' in config, "{} is required to initialize ERNIE".format('hidden_act') assert 'hidden_dropout_prob' in config, "{} is required to initialize ERNIE".format('hidden_dropout_prob') assert 'attention_probs_dropout_prob' in config, "{} is required to initialize ERNIE".format('attention_probs_dropout_prob') @@ -80,126 +81,175 @@ class ERNIE(Backbone): sent_type_vocab_size = config['sent_type_vocab_size'] else: sent_type_vocab_size = config['type_vocab_size'] - task_type_vocab_size = config['task_type_vocab_size'] + if 'task_type_vocab_size' in config: + task_type_vocab_size = config['task_type_vocab_size'] + else: + task_type_vocab_size = config['type_vocab_size'] hidden_act = config['hidden_act'] hidden_dropout_prob = config['hidden_dropout_prob'] attention_probs_dropout_prob = config['attention_probs_dropout_prob'] initializer_range = config['initializer_range'] + if 'is_pairwise' in config: + is_pairwise = config['is_pairwise'] + else: + is_pairwise = False return cls(hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \ max_position_embeddings, sent_type_vocab_size, task_type_vocab_size, \ - hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, phase=phase) + hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise, phase=phase) @property def inputs_attr(self): - return {"token_ids": [[-1, -1], 'int64'], - "position_ids": [[-1, -1], 'int64'], - "segment_ids": [[-1, -1], 'int64'], - "input_mask": [[-1, -1, 1], 'float32'], - "task_ids": [[-1,-1], 'int64']} + ret = {"token_ids": [[-1, -1], 'int64'], + "position_ids": [[-1, -1], 'int64'], + "segment_ids": [[-1, -1], 'int64'], + "input_mask": [[-1, -1, 1], 'float32'], + "task_ids": [[-1,-1], 'int64']} + if 
self._is_pairwise and self._phase=='train': + ret.update({"token_ids_neg": [[-1, -1], 'int64'], + "position_ids_neg": [[-1, -1], 'int64'], + "segment_ids_neg": [[-1, -1], 'int64'], + "input_mask_neg": [[-1, -1, 1], 'float32'], + "task_ids_neg": [[-1,-1], 'int64'] + }) + return ret + @property def outputs_attr(self): - return {"word_embedding": [[-1, -1, self._emb_size], 'float32'], - "embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'], - "encoder_outputs": [[-1, -1, self._emb_size], 'float32'], - "sentence_embedding": [[-1, self._emb_size], 'float32'], - "sentence_pair_embedding": [[-1, self._emb_size], 'float32']} + ret = {"word_embedding": [[-1, -1, self._emb_size], 'float32'], + "embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'], + "encoder_outputs": [[-1, -1, self._emb_size], 'float32'], + "sentence_embedding": [[-1, self._emb_size], 'float32'], + "sentence_pair_embedding": [[-1, self._emb_size], 'float32']} + if self._is_pairwise and self._phase == 'train': + ret.update({"word_embedding_neg": [[-1, -1, self._emb_size], 'float32'], + "encoder_outputs_neg": [[-1, -1, self._emb_size], 'float32'], + "sentence_embedding_neg": [[-1, self._emb_size], 'float32'], + "sentence_pair_embedding_neg": [[-1, self._emb_size], 'float32']}) + return ret def build(self, inputs, scope_name=""): - src_ids = inputs['token_ids'] pos_ids = inputs['position_ids'] sent_ids = inputs['segment_ids'] input_mask = inputs['input_mask'] task_ids = inputs['task_ids'] - # padding id in vocabulary must be set to 0 - emb_out = fluid.embedding( - input=src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=scope_name+self._word_emb_name, initializer=self._param_initializer), - is_sparse=False) - - # fluid.global_scope().find_var('backbone-word_embedding').get_tensor() - embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name) + input_buffer = {} + output_buffer = {} + input_buffer['base'] = [src_ids, pos_ids, sent_ids, input_mask, task_ids] + output_buffer['base'] = {} + + if self._is_pairwise and self._phase =='train': + src_ids = inputs['token_ids_neg'] + pos_ids = inputs['position_ids_neg'] + sent_ids = inputs['segment_ids_neg'] + input_mask = inputs['input_mask_neg'] + task_ids = inputs['task_ids_neg'] + input_buffer['neg'] = [src_ids, pos_ids, sent_ids, input_mask, task_ids] + output_buffer['neg'] = {} + + for key, (src_ids, pos_ids, sent_ids, input_mask, task_ids) in input_buffer.items(): + # padding id in vocabulary must be set to 0 + emb_out = fluid.embedding( + input=src_ids, + size=[self._voc_size, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=scope_name+self._word_emb_name, initializer=self._param_initializer), + is_sparse=False) - position_emb_out = fluid.embedding( - input=pos_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=scope_name+self._pos_emb_name, initializer=self._param_initializer)) - - sent_emb_out = fluid.embedding( - sent_ids, - size=[self._sent_types, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=scope_name+self._sent_emb_name, initializer=self._param_initializer)) - - emb_out = emb_out + position_emb_out - emb_out = emb_out + sent_emb_out - - task_emb_out = fluid.embedding( - task_ids, - size=[self._task_types, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=scope_name+self._task_emb_name, - 
initializer=self._param_initializer)) - - emb_out = emb_out + task_emb_out - - emb_out = pre_process_layer( - emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder') - - self_attn_mask = fluid.layers.matmul( - x=input_mask, y=input_mask, transpose_y=True) - - self_attn_mask = fluid.layers.scale( - x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) - n_head_self_attn_mask = fluid.layers.stack( - x=[self_attn_mask] * self._n_head, axis=1) - n_head_self_attn_mask.stop_gradient = True - - enc_out = encoder( - enc_input=emb_out, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._emb_size // self._n_head, - d_value=self._emb_size // self._n_head, - d_model=self._emb_size, - d_inner_hid=self._emb_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd="", - postprocess_cmd="dan", - param_initializer=self._param_initializer, - name=scope_name+'encoder') - + # fluid.global_scope().find_var('backbone-word_embedding').get_tensor() + embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name) + + position_emb_out = fluid.embedding( + input=pos_ids, + size=[self._max_position_seq_len, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=scope_name+self._pos_emb_name, initializer=self._param_initializer)) + + sent_emb_out = fluid.embedding( + sent_ids, + size=[self._sent_types, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=scope_name+self._sent_emb_name, initializer=self._param_initializer)) + + emb_out = emb_out + position_emb_out + emb_out = emb_out + sent_emb_out + + task_emb_out = fluid.embedding( + task_ids, + size=[self._task_types, self._emb_size], + dtype=self._emb_dtype, + param_attr=fluid.ParamAttr( + name=scope_name+self._task_emb_name, + initializer=self._param_initializer)) + + emb_out = emb_out + task_emb_out + + emb_out = pre_process_layer( + emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder') + + self_attn_mask = fluid.layers.matmul( + x=input_mask, y=input_mask, transpose_y=True) + + self_attn_mask = fluid.layers.scale( + x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False) + n_head_self_attn_mask = fluid.layers.stack( + x=[self_attn_mask] * self._n_head, axis=1) + n_head_self_attn_mask.stop_gradient = True + + enc_out = encoder( + enc_input=emb_out, + attn_bias=n_head_self_attn_mask, + n_layer=self._n_layer, + n_head=self._n_head, + d_key=self._emb_size // self._n_head, + d_value=self._emb_size // self._n_head, + d_model=self._emb_size, + d_inner_hid=self._emb_size * 4, + prepostprocess_dropout=self._prepostprocess_dropout, + attention_dropout=self._attention_dropout, + relu_dropout=0, + hidden_act=self._hidden_act, + preprocess_cmd="", + postprocess_cmd="dan", + param_initializer=self._param_initializer, + name=scope_name+'encoder') + + next_sent_feat = fluid.layers.slice( + input=enc_out, axes=[1], starts=[0], ends=[1]) + next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]]) + next_sent_feat = fluid.layers.fc( + input=next_sent_feat, + size=self._emb_size, + act="tanh", + param_attr=fluid.ParamAttr( + name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer), + bias_attr=scope_name+"pooled_fc.b_0") + + output_buffer[key]['word_embedding'] = emb_out + output_buffer[key]['encoder_outputs'] = enc_out + 
output_buffer[key]['sentence_embedding'] = next_sent_feat + output_buffer[key]['sentence_pair_embedding'] = next_sent_feat + + ret = {} + ret['embedding_table'] = embedding_table + ret['word_embedding'] = output_buffer['base']['word_embedding'] + ret['encoder_outputs'] = output_buffer['base']['encoder_outputs'] + ret['sentence_embedding'] = output_buffer['base']['sentence_embedding'] + ret['sentence_pair_embedding'] = output_buffer['base']['sentence_pair_embedding'] + + if self._is_pairwise and self._phase == 'train': + ret['word_embedding_neg'] = output_buffer['neg']['word_embedding'] + ret['encoder_outputs_neg'] = output_buffer['neg']['encoder_outputs'] + ret['sentence_embedding_neg'] = output_buffer['neg']['sentence_embedding'] + ret['sentence_pair_embedding_neg'] = output_buffer['neg']['sentence_pair_embedding'] - next_sent_feat = fluid.layers.slice( - input=enc_out, axes=[1], starts=[0], ends=[1]) - next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]]) - next_sent_feat = fluid.layers.fc( - input=next_sent_feat, - size=self._emb_size, - act="tanh", - param_attr=fluid.ParamAttr( - name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer), - bias_attr=scope_name+"pooled_fc.b_0") - - return {'embedding_table': embedding_table, - 'word_embedding': emb_out, - 'encoder_outputs': enc_out, - 'sentence_embedding': next_sent_feat, - 'sentence_pair_embedding': next_sent_feat} + return ret def postprocess(self, rt_outputs): pass diff --git a/paddlepalm/distribute/reader.py b/paddlepalm/distribute/reader.py index 8b36569b90152643712a33a49c238873df5757fd..fedbc84d24bbd42308eaabdeb6ddbed75e6e1cae 100644 --- a/paddlepalm/distribute/reader.py +++ b/paddlepalm/distribute/reader.py @@ -24,7 +24,6 @@ def yield_pieces(data, distribute_strategy, batch_size): assert isinstance(data, list), "the input data must be a list or dict, and contained with multiple tensors." data_list = data ds_list = distribute_strategy - stride = batch_size // dev_count p = stride # while p < len(data_list) + stride: @@ -57,7 +56,8 @@ def yield_pieces(data, distribute_strategy, batch_size): # print(len(temp)) yield temp -def data_feeder(reader, postprocess_fn=None, prefetch_steps=2, phase='train'): + +def data_feeder(reader, postprocess_fn=None, prefetch_steps=2): if postprocess_fn is None: def postprocess_fn(batch): return batch diff --git a/paddlepalm/head/__init__.py b/paddlepalm/head/__init__.py index 97e3abd5ac55044342675236fea32ed5434059a9..5a52e4c3b8a41fd203cf60fe2a6b2ab40fa319b2 100644 --- a/paddlepalm/head/__init__.py +++ b/paddlepalm/head/__init__.py @@ -1,5 +1,6 @@ from cls import Classify -# from match import Match -# from mrc import MRC -# from mlm import MaskLM +from match import Match +from ner import SequenceLabel +from mrc import MRC +from mlm import MaskLM diff --git a/paddlepalm/head/match.py b/paddlepalm/head/match.py index ee0d175b01e09ede242aa7fe404366dc48804580..a5366c9ffac94991b7814ac2aea71257772f9cd8 100644 --- a/paddlepalm/head/match.py +++ b/paddlepalm/head/match.py @@ -13,41 +13,66 @@ # See the License for the specific language governing permissions and # limitations under the License. 
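For reference: the `computeHingeLoss` helper added to this file just below expresses the standard pairwise ranking hinge loss, max(0, margin - pos + neg), with fluid ops. A minimal NumPy sketch of the same formula — independent of Paddle and not part of the patch, names illustrative:

```python
import numpy as np

def hinge_loss(pos_score, neg_score, margin=0.5):
    # max(0, margin - pos + neg): zero once the positive pair outscores
    # the negative pair by at least `margin`
    return np.maximum(0.0, margin - pos_score + neg_score)

pos = np.array([0.9, 0.4])            # scores of matched (positive) pairs
neg = np.array([0.2, 0.6])            # scores of sampled negative pairs
print(hinge_loss(pos, neg).mean())    # the Match head averages this over the batch -> 0.35
```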
+ import paddle.fluid as fluid from paddle.fluid import layers -from paddlepalm.interface import task_paradigm +from paddlepalm.head.base_head import Head import numpy as np import os +import json + + +def computeHingeLoss(pos, neg, margin): + loss_part1 = fluid.layers.elementwise_sub( + fluid.layers.fill_constant_batch_size_like( + input=pos, shape=[-1, 1], value=margin, dtype='float32'), pos) + loss_part2 = fluid.layers.elementwise_add(loss_part1, neg) + loss_part3 = fluid.layers.elementwise_max( + fluid.layers.fill_constant_batch_size_like( + input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'), loss_part2) + return loss_part3 -class TaskParadigm(task_paradigm): + +class Match(Head): ''' matching ''' - def __init__(self, config, phase, backbone_config=None): + + def __init__(self, num_classes, input_dim, dropout_prob=0.0, param_initializer_range=0.02, \ + learning_strategy='pointwise', margin=0.5, phase='train'): + + """ + Args: + phase: train, eval, pred + lang: en, ch, ... + learning_strategy: pointwise, pairwise + """ + self._is_training = phase == 'train' - self._hidden_size = backbone_config['hidden_size'] + self._hidden_size = input_dim + + self._num_classes = num_classes - if 'initializer_range' in config: - self._param_initializer = config['initializer_range'] - else: - self._param_initializer = fluid.initializer.TruncatedNormal( - scale=backbone_config.get('initializer_range', 0.02)) - if 'dropout_prob' in config: - self._dropout_prob = config['dropout_prob'] - else: - self._dropout_prob = backbone_config.get('hidden_dropout_prob', 0.0) + self._dropout_prob = dropout_prob if phase == 'train' else 0.0 + self._param_initializer = fluid.initializer.TruncatedNormal( + scale=param_initializer_range) + self._learning_strategy = learning_strategy + self._margin = margin - self._pred_output_path = config.get('pred_output_path', None) + self._preds = [] - + self._preds_logits = [] @property def inputs_attrs(self): - if self._is_training: - reader = {"label_ids": [[-1, 1], 'int64']} - else: - reader = {} + reader = {} bb = {"sentence_pair_embedding": [[-1, self._hidden_size], 'float32']} + if self._is_training: + if self._learning_strategy == 'pointwise': + reader["label_ids"] = [[-1], 'int64'] + elif self._learning_strategy == 'pairwise': + bb["sentence_pair_embedding_neg"] = [[-1, self._hidden_size], 'float32'] + return {'reader': reader, 'backbone': bb} @property @@ -55,51 +80,110 @@ class TaskParadigm(task_paradigm): if self._is_training: return {"loss": [[1], 'float32']} else: - return {"logits": [[-1, 2], 'float32']} + if self._learning_strategy=='paiwise': + return {"probs": [[-1, 1], 'float32']} + else: + return {"logits": [[-1, 2], 'float32'], + "probs": [[-1, 2], 'float32']} def build(self, inputs, scope_name=""): - if self._is_training: - labels = inputs["reader"]["label_ids"] - cls_feats = inputs["backbone"]["sentence_pair_embedding"] + # inputs + cls_feats = inputs["backbone"]["sentence_pair_embedding"] if self._is_training: cls_feats = fluid.layers.dropout( x=cls_feats, dropout_prob=self._dropout_prob, dropout_implementation="upscale_in_train") + if self._learning_strategy == 'pairwise': + cls_feats_neg = inputs["backbone"]["sentence_pair_embedding_neg"] + cls_feats_neg = fluid.layers.dropout( + x=cls_feats_neg, + dropout_prob=self._dropout_prob, + dropout_implementation="upscale_in_train") + elif self._learning_strategy == 'pointwise': + labels = inputs["reader"]["label_ids"] + + # loss + # for pointwise + if self._learning_strategy == 'pointwise': + logits = 
fluid.layers.fc( + input=cls_feats, + size=self._num_classes, + param_attr=fluid.ParamAttr( + name=scope_name+"cls_out_w", + initializer=self._param_initializer), + bias_attr=fluid.ParamAttr( + name=scope_name+"cls_out_b", + initializer=fluid.initializer.Constant(0.))) + probs = fluid.layers.softmax(logits) + if self._is_training: + ce_loss = fluid.layers.cross_entropy( + input=probs, label=labels) + loss = fluid.layers.mean(x=ce_loss) + return {'loss': loss} + # for pred + else: + return {'logits': logits, + 'probs': probs} + # for pairwise + elif self._learning_strategy == 'pairwise': + pos_score = fluid.layers.fc( + input=cls_feats, + size=1, + act = "sigmoid", + param_attr=fluid.ParamAttr( + name=scope_name+"cls_out_w_pr", + initializer=self._param_initializer), + bias_attr=fluid.ParamAttr( + name=scope_name+"cls_out_b_pr", + initializer=fluid.initializer.Constant(0.))) + pos_score = fluid.layers.reshape(x=pos_score, shape=[-1, 1], inplace=True) - logits = fluid.layers.fc( - input=cls_feats, - size=2, - param_attr=fluid.ParamAttr( - name=scope_name+"cls_out_w", - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr( - name=scope_name+"cls_out_b", - initializer=fluid.initializer.Constant(0.))) + if self._is_training: + neg_score = fluid.layers.fc( + input=cls_feats_neg, + size=1, + act = "sigmoid", + param_attr=fluid.ParamAttr( + name=scope_name+"cls_out_w_pr", + initializer=self._param_initializer), + bias_attr=fluid.ParamAttr( + name=scope_name+"cls_out_b_pr", + initializer=fluid.initializer.Constant(0.))) + neg_score = fluid.layers.reshape(x=neg_score, shape=[-1, 1], inplace=True) + + loss = fluid.layers.mean(computeHingeLoss(pos_score, neg_score, self._margin)) + return {'loss': loss} + # for pred + else: + return {'probs': pos_score} + - if self._is_training: - ce_loss, probs = fluid.layers.softmax_with_cross_entropy( - logits=logits, label=labels, return_softmax=True) - loss = fluid.layers.mean(x=ce_loss) - return {'loss': loss} - else: - return {'logits': logits} - def postprocess(self, rt_outputs): + def batch_postprocess(self, rt_outputs): if not self._is_training: - logits = rt_outputs['logits'] - preds = np.argmax(logits, -1) - self._preds.extend(preds.tolist()) - - def epoch_postprocess(self, post_inputs): + probs = [] + logits = [] + probs = rt_outputs['probs'] + self._preds.extend(probs.tolist()) + if self._learning_strategy == 'pointwise': + logits = rt_outputs['logits'] + self._preds_logits.extend(logits.tolist()) + + def epoch_postprocess(self, post_inputs, output_dir=None): # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs if not self._is_training: - if self._pred_output_path is None: - raise ValueError('argument pred_output_path not found in config. Please add it into config dict/file.') - with open(os.path.join(self._pred_output_path, 'predictions.json'), 'w') as writer: - for p in self._preds: - writer.write(str(p)+'\n') - print('Predictions saved at '+os.path.join(self._pred_output_path, 'predictions.json')) - - + if output_dir is None: + raise ValueError('argument output_dir not found in config. 
Please add it into config dict/file.') + with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer: + for i in range(len(self._preds)): + if self._learning_strategy == 'pointwise': + label = 0 if self._preds[i][0] > self._preds[i][1] else 1 + result = {'index': i, 'label': label, 'logits': self._preds_logits[i], 'probs': self._preds[i]} + elif self._learning_strategy == 'pairwise': + label = 0 if self._preds[i][0] < 0.5 else 1 + result = {'index': i, 'label': label, 'probs': self._preds[i][0]} + result = json.dumps(result) + writer.write(result+'\n') + print('Predictions saved at '+os.path.join(output_dir, 'predictions.json')) \ No newline at end of file diff --git a/paddlepalm/head/mlm.py b/paddlepalm/head/mlm.py index ec86dd151e8b0f86c345120f4a5907f0afb91d5c..871820e99e20a34682d7a2dd8c2f628e25a5fc3c 100644 --- a/paddlepalm/head/mlm.py +++ b/paddlepalm/head/mlm.py @@ -14,30 +14,39 @@ # limitations under the License. import paddle.fluid as fluid -from paddlepalm.interface import task_paradigm +from paddlepalm.head.base_head import Head from paddle.fluid import layers +import numpy as np +import os from paddlepalm.backbone.utils.transformer import pre_process_layer -class TaskParadigm(task_paradigm): +class MaskLM(Head): ''' - matching + mlm ''' - def __init__(self, config, phase, backbone_config=None): + def __init__(self, input_dim, vocab_size, hidden_act, initializer_range, dropout_prob=0.0, \ + param_initializer_range=0.02, phase='train'): self._is_training = phase == 'train' - self._emb_size = backbone_config['hidden_size'] - self._hidden_size = backbone_config['hidden_size'] - self._vocab_size = backbone_config['vocab_size'] - self._hidden_act = backbone_config['hidden_act'] - self._initializer_range = backbone_config['initializer_range'] + self._emb_size = input_dim + self._hidden_size = input_dim + self._dropout_prob = dropout_prob if phase == 'train' else 0.0 + self._param_initializer = fluid.initializer.TruncatedNormal( + scale=param_initializer_range) + self._preds = [] + + self._vocab_size = vocab_size + self._hidden_act = hidden_act + self._initializer_range = initializer_range @property def inputs_attrs(self): reader = { - "mask_label": [[-1, 1], 'int64'], - "mask_pos": [[-1, 1], 'int64']} + "token_ids":[[-1, -1], 'int64'], + "mask_label": [[-1], 'int64'], + "mask_pos": [[-1], 'int64'], + } if not self._is_training: del reader['mask_label'] - del reader['batchsize_x_seqlen'] bb = { "encoder_outputs": [[-1, -1, self._hidden_size], 'float32'], "embedding_table": [[-1, self._vocab_size, self._emb_size], 'float32']} @@ -54,7 +63,13 @@ class TaskParadigm(task_paradigm): mask_pos = inputs["reader"]["mask_pos"] if self._is_training: mask_label = inputs["reader"]["mask_label"] - max_position = inputs["reader"]["batchsize_x_seqlen"] - 1 + l1 = fluid.layers.shape(inputs["reader"]["token_ids"] )[0] + # bxs = inputs["reader"]["token_ids"].shape[2].value + l2 = fluid.layers.shape(inputs["reader"]["token_ids"][0])[0] + bxs = (l1*l2).astype(np.int64) + # max_position = inputs["reader"]["batchsize_x_seqlen"] - 1 + max_position = bxs - 1 + mask_pos = fluid.layers.elementwise_min(mask_pos, max_position) mask_pos.stop_gradient = True @@ -100,11 +115,31 @@ class TaskParadigm(task_paradigm): is_bias=True) if self._is_training: - mask_lm_loss = fluid.layers.softmax_with_cross_entropy( - logits=fc_out, label=mask_label) + inputs = fluid.layers.softmax(fc_out) + mask_lm_loss = fluid.layers.cross_entropy( + input=inputs, label=mask_label) loss = fluid.layers.mean(mask_lm_loss) return 
{'loss': loss} else: return {'logits': fc_out} + def batch_postprocess(self, rt_outputs): + if not self._is_training: + logits = rt_outputs['logits'] + preds = np.argmax(logits, -1) + self._preds.extend(preds.tolist()) + return preds + + def epoch_postprocess(self, post_inputs, output_dir=None): + # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs + if not self._is_training: + if output_dir is None: + for p in self._preds: + print(p) + else: + with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer: + for p in self._preds: + writer.write(str(p)+'\n') + print('Predictions saved at '+os.path.join(output_dir, 'predictions.json')) + diff --git a/paddlepalm/head/mrc.py b/paddlepalm/head/mrc.py index b1f0b5688d18fc9c84156a1570f24815febba17f..b1f25c07a0d782601c155b03d78cda7443b1c60b 100644 --- a/paddlepalm/head/mrc.py +++ b/paddlepalm/head/mrc.py @@ -14,7 +14,7 @@ # limitations under the License. import paddle.fluid as fluid -from paddlepalm.interface import task_paradigm +from paddlepalm.head.base_head import Head import collections import numpy as np import os @@ -26,34 +26,37 @@ import json RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"]) -class TaskParadigm(task_paradigm): - """""" +class MRC(Head): + """ + Machine Reading Comprehension + """ + + def __init__(self, max_query_len, input_dim, pred_output_path=None, verbose=False, with_negative=False, do_lower_case=False, max_ans_len=None, null_score_diff_threshold=0.0, n_best_size=20, phase='train'): - def __init__(self, config, phase, backbone_config=None): - self._is_training = phase == 'train' - self._max_sequence_length = config['max_seq_len'] - self._hidden_size = backbone_config['hidden_size'] + self._hidden_size = input_dim + self._max_sequence_length = max_query_len + self._pred_results = [] - if phase == 'pred': - self._max_answer_length = config.get('max_answer_len', None) - self._null_score_diff_threshold = config.get('null_score_diff_threshold', 0.0) - self._n_best_size = config.get('n_best_size', 20) - self._pred_output_path = config.get('pred_output_path', None) - self._verbose = config.get('verbose', False) - self._with_negative = config.get('with_negative', False) - self._do_lower_case = config.get('do_lower_case', False) + output_dir = pred_output_path + self._max_answer_length = max_ans_len + self._null_score_diff_threshold = null_score_diff_threshold + self._n_best_size = n_best_size + output_dir = pred_output_path + self._verbose = verbose + self._with_negative = with_negative + self._do_lower_case = do_lower_case @property def inputs_attrs(self): if self._is_training: - reader = {"start_positions": [[-1, 1], 'int64'], - "end_positions": [[-1, 1], 'int64'], + reader = {"start_positions": [[-1], 'int64'], + "end_positions": [[-1], 'int64'], } else: - reader = {'unique_ids': [[-1, 1], 'int64']} + reader = {'unique_ids': [[-1], 'int64']} bb = {"encoder_outputs": [[-1, -1, self._hidden_size], 'float32']} return {'reader': reader, 'backbone': bb} @@ -70,21 +73,26 @@ class TaskParadigm(task_paradigm): else: return {'start_logits': [[-1, -1, 1], 'float32'], 'end_logits': [[-1, -1, 1], 'float32'], - 'unique_ids': [[-1, 1], 'int64']} + 'unique_ids': [[-1], 'int64']} def build(self, inputs, scope_name=""): if self._is_training: start_positions = inputs['reader']['start_positions'] end_positions = inputs['reader']['end_positions'] - max_position = inputs["reader"]["seqlen"] - 1 - start_positions = 
fluid.layers.elementwise_min(start_positions, max_position) - end_positions = fluid.layers.elementwise_min(end_positions, max_position) + # max_position = inputs["reader"]["seqlen"] - 1 + # start_positions = fluid.layers.elementwise_min(start_positions, max_position) + # end_positions = fluid.layers.elementwise_min(end_positions, max_position) start_positions.stop_gradient = True end_positions.stop_gradient = True else: unique_id = inputs['reader']['unique_ids'] + # It's used to help fetch variable 'unique_ids' that will be removed in the future + helper_constant = fluid.layers.fill_constant(shape=[1], value=1, dtype='int64') + fluid.layers.elementwise_mul(unique_id, helper_constant) + + enc_out = inputs['backbone']['encoder_outputs'] logits = fluid.layers.fc( input=enc_out, @@ -100,9 +108,11 @@ class TaskParadigm(task_paradigm): start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0) def _compute_single_loss(logits, positions): - """Compute start/end loss for mrc model""" - loss = fluid.layers.softmax_with_cross_entropy( - logits=logits, label=positions) + """Compute start/en + d loss for mrc model""" + inputs = fluid.layers.softmax(logits) + loss = fluid.layers.cross_entropy( + input=inputs, label=positions) loss = fluid.layers.mean(x=loss) return loss @@ -117,10 +127,10 @@ class TaskParadigm(task_paradigm): 'unique_ids': unique_id} - def postprocess(self, rt_outputs): + def batch_postprocess(self, rt_outputs): """this func will be called after each step(batch) of training/evaluating/predicting process.""" if not self._is_training: - unique_ids = np.squeeze(rt_outputs['unique_ids'], -1) + unique_ids = rt_outputs['unique_ids'] start_logits = rt_outputs['start_logits'] end_logits = rt_outputs['end_logits'] for idx in range(len(unique_ids)): @@ -139,19 +149,19 @@ class TaskParadigm(task_paradigm): start_logits=s, end_logits=e)) - def epoch_postprocess(self, post_inputs): + def epoch_postprocess(self, post_inputs, output_dir=None): """(optional interface) this func will be called after evaluation/predicting process and each epoch during training process.""" if not self._is_training: - if self._pred_output_path is None: - raise ValueError('argument pred_output_path not found in config. Please add it into config dict/file.') + if output_dir is None: + raise ValueError('argument output_dir not found in config. 
Please add it into config dict/file.') examples = post_inputs['reader']['examples'] features = post_inputs['reader']['features'] - if not os.path.exists(self._pred_output_path): - os.makedirs(self._pred_output_path) - output_prediction_file = os.path.join(self._pred_output_path, "predictions.json") - output_nbest_file = os.path.join(self._pred_output_path, "nbest_predictions.json") - output_null_log_odds_file = os.path.join(self._pred_output_path, "null_odds.json") + if not os.path.exists(output_dir): + os.makedirs(output_dir) + output_prediction_file = os.path.join(output_dir, "predictions.json") + output_nbest_file = os.path.join(output_dir, "nbest_predictions.json") + output_null_log_odds_file = os.path.join(output_dir, "null_odds.json") _write_predictions(examples, features, self._pred_results, self._n_best_size, self._max_answer_length, self._do_lower_case, output_prediction_file, @@ -194,8 +204,9 @@ def _write_predictions(all_examples, all_features, all_results, n_best_size, # keep track of the minimum score of null start+end of position 0 score_null = 1000000 # large and positive min_null_feature_index = 0 # the paragraph slice with min mull score - null_start_logit = 0 # the start logit at the slice with min null score + ull_start_logit = 0 # the start logit at the slice with min null score null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): result = unique_id_to_result[feature.unique_id] start_indexes = _get_best_indexes(result.start_logits, n_best_size) diff --git a/paddlepalm/head/ner.py b/paddlepalm/head/ner.py new file mode 100644 index 0000000000000000000000000000000000000000..dfec122d623a8b8a54463de1b75b328f7402afa5 --- /dev/null +++ b/paddlepalm/head/ner.py @@ -0,0 +1,126 @@ +# -*- coding: UTF-8 -*- +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +from paddle.fluid import layers +from paddlepalm.head.base_head import Head +import numpy as np +import os +import math + +class SequenceLabel(Head): + ''' + Sequence label + ''' + def __init__(self, num_classes, input_dim, dropout_prob=0.0, learning_rate=1e-3, \ + param_initializer_range=0.02, phase='train'): + + """ + Args: + phase: train, eval, pred + lang: en, ch, ... 
+ """ + + self._is_training = phase == 'train' + self._hidden_size = input_dim + + self.num_classes = num_classes + + self._dropout_prob = dropout_prob if phase == 'train' else 0.0 + self._param_initializer = fluid.initializer.TruncatedNormal( + scale=param_initializer_range) + + self.learning_rate = learning_rate + self._preds = [] + + + @property + def inputs_attrs(self): + reader = {} + bb = {"encoder_outputs": [[-1, -1, -1], 'float32']} + if self._is_training: + reader["label_ids"] = [[-1, -1], 'int64'] + reader["seq_lens"] = [[-1], 'int64'] + return {'reader': reader, 'backbone': bb} + + @property + def outputs_attrs(self): + if self._is_training: + return {'loss': [[1], 'float32']} + else: + return {'emission': [[-1, self.num_classes], 'float32']} + + def build(self, inputs, scope_name=''): + token_emb = inputs['backbone']['encoder_outputs'] + if self._is_training: + label_ids = inputs['reader']['label_ids'] + seq_lens = inputs['reader']['seq_lens'] + + emission = fluid.layers.fc( + size=self.num_classes, + input=token_emb, + param_attr=fluid.ParamAttr( + initializer=self._param_initializer, + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4)), + bias_attr=fluid.ParamAttr( + name=scope_name+"cls_out_b", initializer=fluid.initializer.Constant(0.)), + num_flatten_dims=2) + + if self._is_training: + + # compute loss + crf_cost = fluid.layers.linear_chain_crf( + input=emission, + label=label_ids, + param_attr=fluid.ParamAttr( + name=scope_name+'crfw', learning_rate=self.learning_rate), + length=seq_lens) + + avg_cost = fluid.layers.mean(x=crf_cost) + crf_decode = fluid.layers.crf_decoding( + input=emission, + param_attr=fluid.ParamAttr(name=scope_name+'crfw'), + length=seq_lens) + + (precision, recall, f1_score, num_infer_chunks, num_label_chunks, + num_correct_chunks) = fluid.layers.chunk_eval( + input=crf_decode, + label=label_ids, + chunk_scheme="IOB", + num_chunk_types=int(math.ceil((self.num_classes - 1) / 2.0)), + seq_length=seq_lens) + chunk_evaluator = fluid.metrics.ChunkEvaluator() + chunk_evaluator.reset() + + return {"loss": avg_cost} + else: + return {"emission": emission} + + def batch_postprocess(self, rt_outputs): + if not self._is_training: + emission = rt_outputs['emission'] + preds = np.argmax(emission, -1) + self._preds.extend(preds.tolist()) + + def epoch_postprocess(self, post_inputs, output_dir=None): + # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs + if not self._is_training: + if output_dir is None: + raise ValueError('argument output_dir not found in config. 
Please add it into config dict/file.') + with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer: + for p in self._preds: + writer.write(str(p)+'\n') + print('Predictions saved at '+os.path.join(output_dir, 'predictions.json')) diff --git a/paddlepalm/reader/__init__.py b/paddlepalm/reader/__init__.py index 10421172976e81883393d88a399414aa53427659..81dc0833e2e677704b4651e2fff5d02d697408f4 100644 --- a/paddlepalm/reader/__init__.py +++ b/paddlepalm/reader/__init__.py @@ -1,3 +1,6 @@ from cls import ClassifyReader - +from match import MatchReader +from ner import SequenceLabelReader +from mrc import MrcReader +from mlm import MaskLMReader diff --git a/paddlepalm/reader/base_reader.py b/paddlepalm/reader/base_reader.py index ddad7130dfdea5e103dc6e1ad7fcce48f3d30e60..b35e1d158e9c7b018fdbcf427a9f3079c32468c6 100644 --- a/paddlepalm/reader/base_reader.py +++ b/paddlepalm/reader/base_reader.py @@ -42,7 +42,6 @@ class Reader(object): self._register.add(attr_name) def register_with(self, backbone): - print(backbone) for attr in backbone.inputs_attr: self.require_attr(attr) self._registered_backbone = backbone diff --git a/paddlepalm/reader/match.py b/paddlepalm/reader/match.py index d6be0f82a4bac255341f3c58a14430c078314bf1..7f25c33bcff6ef1dacbad0f4bad9475b4fb8fb9f 100644 --- a/paddlepalm/reader/match.py +++ b/paddlepalm/reader/match.py @@ -13,87 +13,104 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddlepalm.interface import reader -from paddlepalm.reader.utils.reader4ernie import ClassifyReader +from paddlepalm.reader.base_reader import Reader +from paddlepalm.reader.utils.reader4ernie import ClassifyReader as CLSReader -class Reader(reader): + +class MatchReader(Reader): - def __init__(self, config, phase='train', dev_count=1, print_prefix=''): - """ + def __init__(self, vocab_path, max_len, tokenizer='wordpiece', lang='en', seed=None, \ + do_lower_case=False, learning_strategy='pointwise', phase='train', dev_count=1, print_prefix=''): # 需要什么加什么 + """ Args: phase: train, eval, pred - """ + lang: en, ch, ... + learning_strategy: pointwise, pairwise + """ - self._is_training = phase == 'train' + Reader.__init__(self, phase) - reader = ClassifyReader(config['vocab_path'], - max_seq_len=config['max_seq_len'], - do_lower_case=config.get('do_lower_case', True), - for_cn=config.get('for_cn', False), - random_seed=config.get('seed', None)) - self._reader = reader - self._dev_count = dev_count + assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)." + assert phase in ['train', 'predict'], "supported phase: train, predict." 
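An illustrative usage sketch of the refactored reader (not part of the patch): the vocab and data paths are placeholders, and `load_data` uses the new signature added further down in this file's diff.

```python
from paddlepalm.reader import MatchReader

# pointwise matching: the reader registers label_ids;
# learning_strategy='pairwise' would register the *_neg fields instead
match_reader = MatchReader('pretrain/BERT-en-uncased-base/vocab.txt', max_len=128,
                           lang='en', learning_strategy='pointwise', phase='train')
match_reader.load_data('data/match/train.tsv', batch_size=32, num_epochs=2)
```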
- self._batch_size = config['batch_size'] - self._max_seq_len = config['max_seq_len'] + for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese' + + self._register.add('token_ids') if phase == 'train': - self._input_file = config['train_file'] - self._num_epochs = None # 防止iteartor终止 - self._shuffle = config.get('shuffle', True) - self._shuffle_buffer = config.get('shuffle_buffer', 5000) - elif phase == 'eval': - self._input_file = config['dev_file'] - self._num_epochs = 1 - self._shuffle = False - self._batch_size = config.get('pred_batch_size', self._batch_size) - elif phase == 'pred': - self._input_file = config['pred_file'] - self._num_epochs = 1 - self._shuffle = False - self._batch_size = config.get('pred_batch_size', self._batch_size) + if learning_strategy == 'pointwise': + self._register.add('label_ids') + if learning_strategy == 'pairwise': + self._register.add('token_ids_neg') + self._register.add('position_ids_neg') + self._register.add('segment_ids_neg') + self._register.add('input_mask_neg') + self._register.add('task_ids_neg') + + self._is_training = phase == 'train' + self._learning_strategy = learning_strategy + + match_reader = CLSReader(vocab_path, + max_seq_len=max_len, + do_lower_case=do_lower_case, + for_cn=for_cn, + random_seed=seed, + learning_strategy = learning_strategy) + + self._reader = match_reader + self._dev_count = dev_count self._phase = phase - # self._batch_size = - self._print_first_n = config.get('print_first_n', 1) @property def outputs_attr(self): - if self._is_training: - return {"token_ids": [[-1, -1], 'int64'], - "position_ids": [[-1, -1], 'int64'], - "segment_ids": [[-1, -1], 'int64'], - "input_mask": [[-1, -1, 1], 'float32'], - "label_ids": [[-1], 'int64'], - "task_ids": [[-1, -1], 'int64'] - } - else: - return {"token_ids": [[-1, -1], 'int64'], - "position_ids": [[-1, -1], 'int64'], - "segment_ids": [[-1, -1], 'int64'], - "task_ids": [[-1, -1], 'int64'], - "input_mask": [[-1, -1, 1], 'float32'] - } - - - def load_data(self): - self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase) - - def iterator(self): - - def list_to_dict(x): - names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', - 'label_ids', 'unique_ids'] - outputs = {n: i for n,i in zip(names, x)} - del outputs['unique_ids'] - if not self._is_training: - del outputs['label_ids'] - return outputs + attrs = {"token_ids": [[-1, -1], 'int64'], + "position_ids": [[-1, -1], 'int64'], + "segment_ids": [[-1, -1], 'int64'], + "input_mask": [[-1, -1, 1], 'float32'], + "task_ids": [[-1, -1], 'int64'], + "label_ids": [[-1], 'int64'], + "token_ids_neg": [[-1, -1], 'int64'], + "position_ids_neg": [[-1, -1], 'int64'], + "segment_ids_neg": [[-1, -1], 'int64'], + "input_mask_neg": [[-1, -1, 1], 'float32'], + "task_ids_neg": [[-1, -1], 'int64'] + } + return self._get_registed_attrs(attrs) + + + def load_data(self, input_file, batch_size, num_epochs=None, \ + file_format='tsv', shuffle_train=True): + self._batch_size = batch_size + self._num_epochs = num_epochs + self._data_generator = self._reader.data_generator( \ + input_file, batch_size, num_epochs if self._phase == 'train' else 1, \ + shuffle=shuffle_train if self._phase == 'train' else False, \ + phase=self._phase) + + def _iterator(self): + + + names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', 'label_ids', \ + 'token_ids_neg', 'segment_ids_neg', 'position_ids_neg', 
'task_ids_neg', 'input_mask_neg'] + + if self._learning_strategy == 'pairwise': + names.remove('label_ids') + for batch in self._data_generator(): - yield list_to_dict(batch) + outputs = {n: i for n,i in zip(names, batch)} + ret = {} + # TODO: move runtime shape check here + for attr in self.outputs_attr.keys(): + ret[attr] = outputs[attr] + yield ret @property def num_examples(self): return self._reader.get_num_examples(phase=self._phase) + @property + def num_epochs(self): + return self._num_epochs + diff --git a/paddlepalm/reader/mlm.py b/paddlepalm/reader/mlm.py index e4dff3477f3ffd56864bcbaed439f7bd03716377..0e49804774bb075d8d32f8a9ff20173bcb9a6925 100644 --- a/paddlepalm/reader/mlm.py +++ b/paddlepalm/reader/mlm.py @@ -13,79 +13,79 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddlepalm.interface import reader -from paddlepalm.reader.utils.reader4ernie import MaskLMReader +from paddlepalm.reader.base_reader import Reader +from paddlepalm.reader.utils.reader4ernie import MaskLMReader as MLMReader import numpy as np -class Reader(reader): +class MaskLMReader(Reader): - def __init__(self, config, phase='train', dev_count=1, print_prefix=''): + def __init__(self, vocab_path, max_len, tokenizer='wordpiece', \ + lang='en', seed=None, do_lower_case=False, phase='train', dev_count=1, print_prefix=''): """ Args: phase: train, eval, pred - """ + """ - self._is_training = phase == 'train' - reader = MaskLMReader(config['vocab_path'], - max_seq_len=config['max_seq_len'], - do_lower_case=config.get('do_lower_case', False), - for_cn=config.get('for_cn', False), - random_seed=config.get('seed', None)) - self._reader = reader - self._dev_count = dev_count + Reader.__init__(self, phase) - self._batch_size = config['batch_size'] - self._max_seq_len = config['max_seq_len'] + assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)." + assert phase in ['train', 'predict'], "supported phase: train, predict." 
+ + for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese' + + self._register.add('token_ids') + self._register.add('mask_pos') if phase == 'train': - self._input_file = config['train_file'] - self._num_epochs = None # 防止iteartor终止 - self._shuffle = config.get('shuffle', True) - self._shuffle_buffer = config.get('shuffle_buffer', 5000) - elif phase == 'eval': - self._input_file = config['dev_file'] - self._num_epochs = 1 - self._shuffle = False - self._batch_size = config.get('pred_batch_size', self._batch_size) - elif phase == 'pred': - self._input_file = config['pred_file'] - self._num_epochs = 1 - self._shuffle = False - self._batch_size = config.get('pred_batch_size', self._batch_size) + self._register.add('mask_label') + self._is_training = phase == 'train' + + mlm_reader = MLMReader(vocab_path, + max_seq_len=max_len, + do_lower_case=do_lower_case, + for_cn=for_cn, + random_seed=seed) + self._reader = mlm_reader self._phase = phase - # self._batch_size = - self._print_first_n = config.get('print_first_n', 1) + self._dev_count = dev_count @property def outputs_attr(self): - return {"token_ids": [[-1, -1], 'int64'], + attrs = {"token_ids": [[-1, -1], 'int64'], "position_ids": [[-1, -1], 'int64'], "segment_ids": [[-1, -1], 'int64'], "input_mask": [[-1, -1, 1], 'float32'], "task_ids": [[-1, -1], 'int64'], "mask_label": [[-1], 'int64'], - "mask_pos": [[-1], 'int64'], + "mask_pos": [[-1], 'int64'] } + return self._get_registed_attrs(attrs) - def load_data(self): - self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase) - def iterator(self): + def load_data(self, input_file, batch_size, num_epochs=None, \ + file_format='csv', shuffle_train=True): + self._batch_size = batch_size + self._num_epochs = num_epochs + self._data_generator = self._reader.data_generator( \ + input_file, batch_size, num_epochs if self._phase == 'train' else 1, \ + shuffle=shuffle_train if self._phase == 'train' else False, \ + phase=self._phase) - def list_to_dict(x): - names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask', - 'task_ids', 'mask_label', 'mask_pos'] - outputs = {n: i for n,i in zip(names, x)} - # outputs['batchsize_x_seqlen'] = [self._batch_size * len(outputs['token_ids'][0]) - 1] - return outputs + def _iterator(self): + names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask', + 'task_ids', 'mask_label', 'mask_pos'] for batch in self._data_generator(): - # print(np.shape(list_to_dict(batch)['token_ids'])) - # print(list_to_dict(batch)['mask_label'].tolist()) - yield list_to_dict(batch) + outputs = {n: i for n,i in zip(names, batch)} + ret = {} + # TODO: move runtime shape check here + for attr in self.outputs_attr.keys(): + ret[attr] = outputs[attr] + + yield ret def get_epoch_outputs(self): return {'examples': self._reader.get_examples(self._phase), @@ -95,3 +95,7 @@ class Reader(reader): def num_examples(self): return self._reader.get_num_examples(phase=self._phase) + @property + def num_epochs(self): + return self._num_epochs + diff --git a/paddlepalm/reader/mrc.py b/paddlepalm/reader/mrc.py index 2906b97ecb591fd6cc65f3a246c6d88e87dfccb8..61c9158647cbf2ea34b750451bd4b74b027e38ef 100644 --- a/paddlepalm/reader/mrc.py +++ b/paddlepalm/reader/mrc.py @@ -13,77 +13,66 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from paddlepalm.interface import reader +from paddlepalm.reader.base_reader import Reader from paddlepalm.reader.utils.reader4ernie import MRCReader +import numpy as np -class Reader(reader): - - def __init__(self, config, phase='train', dev_count=1, print_prefix=''): +class MrcReader(Reader): + + def __init__(self, vocab_path, max_len, max_query_len, doc_stride, tokenizer='FullTokenizer', lang='en', seed=None, do_lower_case=False, \ + remove_noanswer=True, phase='train', dev_count=1, print_prefix=''): """ Args: phase: train, eval, pred - """ + lang: en, ch, ... + """ - self._is_training = phase == 'train' + Reader.__init__(self, phase) - reader = MRCReader(config['vocab_path'], - max_seq_len=config['max_seq_len'], - do_lower_case=config.get('do_lower_case', False), - tokenizer='FullTokenizer', - for_cn=config.get('for_cn', False), - doc_stride=config['doc_stride'], - remove_noanswer=config.get('remove_noanswer', True), - max_query_length=config['max_query_len'], - random_seed=config.get('seed', None)) - self._reader = reader - self._dev_count = dev_count - self._batch_size = config['batch_size'] - self._max_seq_len = config['max_seq_len'] + assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)." + assert phase in ['train', 'predict'], "supported phase: train, predict." + + for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese' + + + self._register.add('token_ids') if phase == 'train': - self._input_file = config['train_file'] - # self._num_epochs = config['num_epochs'] - self._num_epochs = None # 防止iteartor终止 - self._shuffle = config.get('shuffle', True) - self._shuffle_buffer = config.get('shuffle_buffer', 5000) - if phase == 'eval': - self._input_file = config['dev_file'] - self._num_epochs = 1 - self._shuffle = False - self._batch_size = config.get('pred_batch_size', self._batch_size) - elif phase == 'pred': - self._input_file = config['pred_file'] - self._num_epochs = 1 - self._shuffle = False - self._batch_size = config.get('pred_batch_size', self._batch_size) + self._register.add("start_positions") + self._register.add("end_positions") + else: + self._register.add("unique_ids") + - self._phase = phase - # self._batch_size = - self._print_first_n = config.get('print_first_n', 1) + self._is_training = phase == 'train' - # TODO: without slide window version - self._with_slide_window = config.get('with_slide_window', False) + mrc_reader = MRCReader(vocab_path, + max_seq_len=max_len, + do_lower_case=do_lower_case, + tokenizer=tokenizer, + doc_stride=doc_stride, + remove_noanswer=remove_noanswer, + max_query_length=max_query_len, + for_cn=for_cn, + random_seed=seed) + self._reader = mrc_reader + self._phase = phase + self._dev_count = dev_count + @property def outputs_attr(self): - if self._is_training: - return {"token_ids": [[-1, -1], 'int64'], - "position_ids": [[-1, -1], 'int64'], - "segment_ids": [[-1, -1], 'int64'], - "input_mask": [[-1, -1, 1], 'float32'], - "start_positions": [[-1], 'int64'], - "end_positions": [[-1], 'int64'], - "task_ids": [[-1, -1], 'int64'] - } - else: - return {"token_ids": [[-1, -1], 'int64'], - "position_ids": [[-1, -1], 'int64'], - "segment_ids": [[-1, -1], 'int64'], - "task_ids": [[-1, -1], 'int64'], - "input_mask": [[-1, -1, 1], 'float32'], - "unique_ids": [[-1], 'int64'] - } + attrs = {"token_ids": [[-1, -1], 'int64'], + "position_ids": [[-1, -1], 'int64'], + "segment_ids": [[-1, -1], 'int64'], + "input_mask": [[-1, -1, 1], 'float32'], + "start_positions": [[-1], 'int64'], + "end_positions": 
[[-1], 'int64'], + "task_ids": [[-1, -1], 'int64'], + "unique_ids": [[-1], 'int64'] + } + return self._get_registed_attrs(attrs) @property def epoch_outputs_attr(self): @@ -91,26 +80,34 @@ class Reader(reader): return {"examples": None, "features": None} - def load_data(self): - self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase) - - def iterator(self): - - def list_to_dict(x): - names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', - 'start_positions', 'end_positions', 'unique_ids'] - outputs = {n: i for n,i in zip(names, x)} - if self._is_training: - del outputs['unique_ids'] - else: - del outputs['start_positions'] - del outputs['end_positions'] - return outputs - + def load_data(self, input_file, batch_size, num_epochs=None, file_format='csv', shuffle_train=True): + self._batch_size = batch_size + self._num_epochs = num_epochs + self._data_generator = self._reader.data_generator( \ + input_file, batch_size, num_epochs if self._phase == 'train' else 1, \ + shuffle=shuffle_train if self._phase == 'train' else False, \ + phase=self._phase) + def _iterator(self): + + names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', + 'start_positions', 'end_positions', 'unique_ids'] + + if self._is_training: + names.remove('unique_ids') + for batch in self._data_generator(): - yield list_to_dict(batch) + outputs = {n: i for n,i in zip(names, batch)} + ret = {} + # TODO: move runtime shape check here + for attr in self.outputs_attr.keys(): + ret[attr] = outputs[attr] + if not self._is_training: + assert 'unique_ids' in ret, ret + yield ret + def get_epoch_outputs(self): + return {'examples': self._reader.get_examples(self._phase), 'features': self._reader.get_features(self._phase)} @@ -118,3 +115,7 @@ class Reader(reader): def num_examples(self): return self._reader.get_num_examples(phase=self._phase) + @property + def num_epochs(self): + return self._num_epochs + diff --git a/paddlepalm/reader/ner.py b/paddlepalm/reader/ner.py new file mode 100644 index 0000000000000000000000000000000000000000..4a610e672e6eebcfdff98f013b8ffb22b07c5959 --- /dev/null +++ b/paddlepalm/reader/ner.py @@ -0,0 +1,97 @@ +# -*- coding: UTF-8 -*- +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddlepalm.reader.base_reader import Reader +from paddlepalm.reader.utils.reader4ernie import SequenceLabelReader as SLReader + +class SequenceLabelReader(Reader): + + def __init__(self, vocab_path, max_len, label_map_config, tokenizer='wordpiece', \ + lang='en', seed=None, do_lower_case=False, phase='train', dev_count=1, print_prefix=''): + """ + Args: + phase: train, eval, pred + lang: en, ch, ... + """ + + Reader.__init__(self, phase) + + assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)." 
+ assert phase in ['train', 'predict'], "supported phase: train, predict." + + for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese' + + self._register.add('token_ids') + self._register.add('seq_lens') + if phase == 'train': + self._register.add('label_ids') + + self._is_training = phase == 'train' + + ner_reader = SLReader(vocab_path, + max_seq_len=max_len, + do_lower_case=do_lower_case, + for_cn=for_cn, + random_seed=seed, + label_map_config=label_map_config) + self._reader = ner_reader + self._phase = phase + self._dev_count = dev_count + + + @property + def outputs_attr(self): + attrs = {"token_ids": [[-1, -1], 'int64'], + "position_ids": [[-1, -1], 'int64'], + "segment_ids": [[-1, -1], 'int64'], + "task_ids": [[-1, -1], 'int64'], + "input_mask": [[-1, -1, 1], 'float32'], + "seq_lens": [[-1], 'int64'], + "label_ids": [[-1, -1], 'int64']} + return self._get_registed_attrs(attrs) + + + def load_data(self, input_file, batch_size, num_epochs=None, \ + file_format='tsv', shuffle_train=True): + self._batch_size = batch_size + self._num_epochs = num_epochs + self._data_generator = self._reader.data_generator( \ + input_file, batch_size, num_epochs if self._phase == 'train' else 1, \ + shuffle=shuffle_train if self._phase == 'train' else False, \ + phase=self._phase) + + def _iterator(self): + + names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', + 'label_ids', 'seq_lens'] + for batch in self._data_generator(): + outputs = {n: i for n,i in zip(names, batch)} + ret = {} + # TODO: move runtime shape check here + for attr in self.outputs_attr.keys(): + ret[attr] = outputs[attr] + yield ret + + def get_epoch_outputs(self): + return {'examples': self._reader.get_examples(self._phase), + 'features': self._reader.get_features(self._phase)} + + @property + def num_examples(self): + return self._reader.get_num_examples(phase=self._phase) + + @property + def num_epochs(self): + return self._num_epochs diff --git a/paddlepalm/reader/utils/mlm_batching.py b/paddlepalm/reader/utils/mlm_batching.py index 8d6061d42a4ea096d17e4aeb6d3df394e852deb6..f824e44c9700682c2c90e4be440325f37bdfbb7a 100644 --- a/paddlepalm/reader/utils/mlm_batching.py +++ b/paddlepalm/reader/utils/mlm_batching.py @@ -19,57 +19,76 @@ from __future__ import print_function import numpy as np -def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3): +def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3, dev_count=1): """ Add mask for batch_tokens, return out, mask_label, mask_pos; Note: mask_pos responding the batch_tokens after padded; """ max_len = max([len(sent) for sent in batch_tokens]) - mask_label = [] - mask_pos = [] - prob_mask = np.random.rand(total_token_num) - # Note: the first token is [CLS], so [low=1] - replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num) - pre_sent_len = 0 - prob_index = 0 - for sent_index, sent in enumerate(batch_tokens): - mask_flag = False - prob_index += pre_sent_len - for token_index, token in enumerate(sent): - prob = prob_mask[prob_index + token_index] - if prob > 0.15: - continue - elif 0.03 < prob <= 0.15: - # mask - if token != SEP and token != CLS: + + multidev_batch_tokens = [] + multidev_mask_label = [] + multidev_mask_pos = [] + + big_batch_tokens = batch_tokens + stride = len(batch_tokens) // dev_count + if stride == 0: + return None, None, None + p = stride + + for i in range(dev_count): + batch_tokens = big_batch_tokens[p-stride:p] + p += stride + mask_label = [] + mask_pos = [] + prob_mask
= np.random.rand(total_token_num) + # Note: the first token is [CLS], so [low=1] + replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num) + pre_sent_len = 0 + prob_index = 0 + for sent_index, sent in enumerate(batch_tokens): + mask_flag = False + prob_index += pre_sent_len + for token_index, token in enumerate(sent): + prob = prob_mask[prob_index + token_index] + if prob > 0.15: + continue + elif 0.03 < prob <= 0.15: + # mask + if token != SEP and token != CLS: + mask_label.append(sent[token_index]) + sent[token_index] = MASK + mask_flag = True + mask_pos.append(sent_index * max_len + token_index) + elif 0.015 < prob <= 0.03: + # random replace + if token != SEP and token != CLS: + mask_label.append(sent[token_index]) + sent[token_index] = replace_ids[prob_index + token_index] + mask_flag = True + mask_pos.append(sent_index * max_len + token_index) + else: + # keep the original token + if token != SEP and token != CLS: + mask_label.append(sent[token_index]) + mask_pos.append(sent_index * max_len + token_index) + pre_sent_len = len(sent) + # ensure at least mask one word in a sentence + while not mask_flag: + token_index = int(np.random.randint(1, high=len(sent) - 1, size=1)) + if sent[token_index] != SEP and sent[token_index] != CLS: mask_label.append(sent[token_index]) sent[token_index] = MASK mask_flag = True mask_pos.append(sent_index * max_len + token_index) - elif 0.015 < prob <= 0.03: - # random replace - if token != SEP and token != CLS: - mask_label.append(sent[token_index]) - sent[token_index] = replace_ids[prob_index + token_index] - mask_flag = True - mask_pos.append(sent_index * max_len + token_index) - else: - # keep the original token - if token != SEP and token != CLS: - mask_label.append(sent[token_index]) - mask_pos.append(sent_index * max_len + token_index) - pre_sent_len = len(sent) - # ensure at least mask one word in a sentence - while not mask_flag: - token_index = int(np.random.randint(1, high=len(sent) - 1, size=1)) - if sent[token_index] != SEP and sent[token_index] != CLS: - mask_label.append(sent[token_index]) - sent[token_index] = MASK - mask_flag = True - mask_pos.append(sent_index * max_len + token_index) - mask_label = np.array(mask_label).astype("int64").reshape([-1]) - mask_pos = np.array(mask_pos).astype("int64").reshape([-1]) - return batch_tokens, mask_label, mask_pos + mask_label = np.array(mask_label).astype("int64").reshape([-1]) + mask_pos = np.array(mask_pos).astype("int64").reshape([-1]) + + multidev_batch_tokens.extend(batch_tokens) + multidev_mask_label.append(mask_label) + multidev_mask_pos.append(mask_pos) + + return multidev_batch_tokens, multidev_mask_label, multidev_mask_pos def prepare_batch_data(insts, @@ -83,7 +102,8 @@ def prepare_batch_data(insts, task_id=0, return_input_mask=True, return_max_len=True, - return_num_token=False): + return_num_token=False, + dev_count=1): """ 1. generate Tensor of data 2. 
generate Tensor of position @@ -101,7 +121,8 @@ def prepare_batch_data(insts, vocab_size=voc_size, CLS=cls_id, SEP=sep_id, - MASK=mask_id) + MASK=mask_id, + dev_count=dev_count) # Second step: padding src_id, self_input_mask = pad_batch_data( out, @@ -125,7 +146,7 @@ def prepare_batch_data(insts, return_list = [ src_id, pos_id, sent_id, self_input_mask, task_ids, mask_label, mask_pos ] - return return_list if len(return_list) > 1 else return_list[0] + return return_list def pad_batch_data(insts, diff --git a/paddlepalm/reader/utils/reader4ernie.py b/paddlepalm/reader/utils/reader4ernie.py index 3b47cb9b723a13d14167acfe93f17c3cbcfd204b..19d145c0e0987baae74f13fd3ef749694fa399dc 100644 --- a/paddlepalm/reader/utils/reader4ernie.py +++ b/paddlepalm/reader/utils/reader4ernie.py @@ -29,11 +29,15 @@ import six from io import open from collections import namedtuple +# from . import gpu_dev_count +gpu_dev_count=1 +import paddlepalm as palm import paddlepalm.tokenizer.ernie_tokenizer as tokenization from paddlepalm.reader.utils.batching4ernie import pad_batch_data from paddlepalm.reader.utils.mlm_batching import prepare_batch_data + log = logging.getLogger(__name__) if six.PY3: @@ -49,20 +53,23 @@ def csv_reader(fd, delimiter='\t'): return gen() -class BaseReader(object): +class Reader(object): def __init__(self, vocab_path, label_map_config=None, max_seq_len=512, - do_lower_case=False, + do_lower_case=True, in_tokens=False, is_inference=False, + learning_strategy='pointwise', random_seed=None, tokenizer="FullTokenizer", + phase='train', is_classify=True, is_regression=False, - for_cn=False, + for_cn=True, task_id=0): + assert phase in ['train', 'predict'], "supported phase: train, predict." self.max_seq_len = max_seq_len self.tokenizer = tokenization.FullTokenizer( vocab_file=vocab_path, do_lower_case=do_lower_case) @@ -72,7 +79,9 @@ class BaseReader(object): self.sep_id = self.vocab["[SEP]"] self.mask_id = self.vocab["[MASK]"] self.in_tokens = in_tokens + self.phase = phase self.is_inference = is_inference + self.learning_strategy = learning_strategy self.for_cn = for_cn self.task_id = task_id @@ -83,7 +92,6 @@ class BaseReader(object): self.current_example = 0 self.current_epoch = 0 self.num_examples = 0 - self.examples = {} if label_map_config: @@ -124,6 +132,7 @@ class BaseReader(object): tokens_a.pop() else: tokens_b.pop() + def _convert_example_to_record(self, example, max_seq_length, tokenizer): """Converts a single `Example` into a single `Record`.""" @@ -131,26 +140,33 @@ class BaseReader(object): text_a = tokenization.convert_to_unicode(example.text_a) tokens_a = tokenizer.tokenize(text_a) tokens_b = None - has_text_b = False + has_text_b_neg = False if isinstance(example, dict): has_text_b = "text_b" in example.keys() + has_text_b_neg = "text_b_neg" in example.keys() else: has_text_b = "text_b" in example._fields + has_text_b_neg = "text_b_neg" in example._fields if has_text_b: text_b = tokenization.convert_to_unicode(example.text_b) tokens_b = tokenizer.tokenize(text_b) - - if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. 
# Account for [CLS], [SEP], [SEP] with "- 3" self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + + if has_text_b_neg and self.phase == 'train': + tokens_a_neg = tokenizer.tokenize(text_a) + text_b_neg = tokenization.convert_to_unicode(example.text_b_neg) + tokens_b_neg = tokenizer.tokenize(text_b_neg) + self._truncate_seq_pair(tokens_a_neg, tokens_b_neg, max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: tokens_a = tokens_a[0:(max_seq_length - 2)] + # The convention in BERT/ERNIE is: # (a) For sequence pairs: @@ -173,6 +189,7 @@ class BaseReader(object): tokens = [] text_type_ids = [] tokens.append("[CLS]") + text_type_ids.append(0) for token in tokens_a: tokens.append(token) @@ -190,6 +207,29 @@ class BaseReader(object): token_ids = tokenizer.convert_tokens_to_ids(tokens) position_ids = list(range(len(token_ids))) + + if has_text_b_neg and self.phase == 'train': + tokens_neg = [] + text_type_ids_neg = [] + tokens_neg.append("[CLS]") + text_type_ids_neg.append(0) + for token in tokens_a_neg: + tokens_neg.append(token) + text_type_ids_neg.append(0) + tokens_neg.append("[SEP]") + text_type_ids_neg.append(0) + + if tokens_b_neg: + for token in tokens_b_neg: + tokens_neg.append(token) + text_type_ids_neg.append(1) + tokens_neg.append("[SEP]") + text_type_ids_neg.append(1) + + token_ids_neg = tokenizer.convert_tokens_to_ids(tokens_neg) + position_ids_neg = list(range(len(token_ids_neg))) + + if self.is_inference: Record = namedtuple('Record', ['token_ids', 'text_type_ids', 'position_ids']) @@ -198,25 +238,38 @@ class BaseReader(object): text_type_ids=text_type_ids, position_ids=position_ids) else: - if self.label_map: - label_id = self.label_map[example.label] - else: - label_id = example.label - - Record = namedtuple('Record', [ - 'token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid' - ]) - qid = None if "qid" in example._fields: qid = example.qid + if self.learning_strategy == 'pairwise' and self.phase == 'train': + Record = namedtuple('Record', + ['token_ids', 'text_type_ids', 'position_ids', 'token_ids_neg', 'text_type_ids_neg', 'position_ids_neg', 'qid']) + + record = Record( + token_ids=token_ids, + text_type_ids=text_type_ids, + position_ids=position_ids, + token_ids_neg=token_ids_neg, + text_type_ids_neg=text_type_ids_neg, + position_ids_neg=position_ids_neg, + qid=qid) + + else: + if self.label_map: + label_id = self.label_map[example.label] + else: + label_id = example.label - record = Record( - token_ids=token_ids, - text_type_ids=text_type_ids, - position_ids=position_ids, - label_id=label_id, - qid=qid) + Record = namedtuple('Record', [ + 'token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid' + ]) + + record = Record( + token_ids=token_ids, + text_type_ids=text_type_ids, + position_ids=position_ids, + label_id=label_id, + qid=qid) return record def _prepare_batch_data(self, examples, batch_size, phase='train'): @@ -228,7 +281,7 @@ class BaseReader(object): if phase == "train": self.current_example = index record = self._convert_example_to_record(example, self.max_seq_len, - self.tokenizer) + self.tokenizer) max_len = max(max_len, len(record.token_ids)) if self.in_tokens: to_append = (len(batch_records) + 1) * max_len <= batch_size @@ -240,7 +293,7 @@ class BaseReader(object): yield self._pad_batch_records(batch_records) batch_records, max_len = [record], len(record.token_ids) - if phase == 'pred' and batch_records: + if phase == 'predict' and batch_records: yield 
self._pad_batch_records(batch_records) def get_num_examples(self, input_file=None, phase='train'): @@ -283,6 +336,7 @@ class BaseReader(object): if len(all_dev_batches) == dev_count: for batch in all_dev_batches: yield batch + all_dev_batches = [] def f(): for i in wrapper(): @@ -299,71 +353,7 @@ class BaseReader(object): return f -class ClassifyReader(BaseReader): - def _read_tsv(self, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, 'r', encoding='utf8') as f: - reader = csv_reader(f) - headers = next(reader) - text_indices = [ - index for index, h in enumerate(headers) if h != "label" - ] - Example = namedtuple('Example', headers) - - examples = [] - for line in reader: - for index, text in enumerate(line): - if index in text_indices: - if self.for_cn: - line[index] = text.replace(' ', '') - else: - line[index] = text - example = Example(*line) - examples.append(example) - return examples - - def _pad_batch_records(self, batch_records): - batch_token_ids = [record.token_ids for record in batch_records] - batch_text_type_ids = [record.text_type_ids for record in batch_records] - batch_position_ids = [record.position_ids for record in batch_records] - - if not self.is_inference: - batch_labels = [record.label_id for record in batch_records] - if self.is_classify: - batch_labels = np.array(batch_labels).astype("int64").reshape( - [-1]) - elif self.is_regression: - batch_labels = np.array(batch_labels).astype("float32").reshape( - [-1]) - - if batch_records[0].qid: - batch_qids = [record.qid for record in batch_records] - batch_qids = np.array(batch_qids).astype("int64").reshape( - [-1]) - else: - batch_qids = np.array([]).astype("int64").reshape([-1]) - - # padding - padded_token_ids, input_mask = pad_batch_data( - batch_token_ids, pad_idx=self.pad_id, return_input_mask=True) - padded_text_type_ids = pad_batch_data( - batch_text_type_ids, pad_idx=self.pad_id) - padded_position_ids = pad_batch_data( - batch_position_ids, pad_idx=self.pad_id) - padded_task_ids = np.ones_like( - padded_token_ids, dtype="int64") * self.task_id - - return_list = [ - padded_token_ids, padded_text_type_ids, padded_position_ids, - padded_task_ids, input_mask - ] - if not self.is_inference: - return_list += [batch_labels, batch_qids] - - return return_list - - -class MaskLMReader(BaseReader): +class MaskLMReader(Reader): def _convert_example_to_record(self, example, max_seq_length, tokenizer): """Converts a single `Example` into a single `Record`.""" @@ -430,13 +420,6 @@ class MaskLMReader(BaseReader): token_ids = tokenizer.convert_tokens_to_ids(tokens) position_ids = list(range(len(token_ids))) - # Record = namedtuple('Record', - # ['token_ids', 'text_type_ids', 'position_ids']) - # record = Record( - # token_ids=token_ids, - # text_type_ids=text_type_ids, - # position_ids=position_ids) - return [token_ids, text_type_ids, position_ids] def batch_reader(self, examples, batch_size, in_tokens, phase): @@ -455,7 +438,7 @@ class MaskLMReader(BaseReader): batch = [parsed_line] total_token_num = len(parsed_line[0]) - if len(batch) > 0 and phase == 'pred': + if len(batch) > 0 and phase == 'predict': yield batch, total_token_num def data_generator(self, @@ -497,19 +480,102 @@ class MaskLMReader(BaseReader): # max_len=self.max_seq_len, # 注意,如果padding到最大长度,会导致mask_pos与实际位置不对应。因为mask pos是基于batch内最大长度来计算的。 return_input_mask=True, return_max_len=False, - return_num_token=False) + return_num_token=False, + # dev_count=gpu_dev_count) + dev_count=1) - if len(all_dev_batches) < 
dev_count: - all_dev_batches.append(batch_data) - if len(all_dev_batches) == dev_count: - for batch in all_dev_batches: - yield batch - all_dev_batches = [] + + # yield batch + for piece in palm.distribute.yield_pieces(batch_data, ['s', 's', 's', 's', 's', 'u', 'u'], batch_size): + yield piece return wrapper -class SequenceLabelReader(BaseReader): +class ClassifyReader(Reader): + def _read_tsv(self, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, 'r', encoding='utf8') as f: + reader = csv_reader(f) + headers = next(reader) + text_indices = [ + index for index, h in enumerate(headers) if h != "label" + ] + Example = namedtuple('Example', headers) + examples = [] + for line in reader: + for index, text in enumerate(line): + if index in text_indices: + if self.for_cn: + line[index] = text.replace(' ', '') + else: + line[index] = text + example = Example(*line) + examples.append(example) + return examples + + def _pad_batch_records(self, batch_records): + batch_token_ids = [record.token_ids for record in batch_records] + batch_text_type_ids = [record.text_type_ids for record in batch_records] + batch_position_ids = [record.position_ids for record in batch_records] + if self.phase=='train' and self.learning_strategy == 'pairwise': + batch_token_ids_neg = [record.token_ids_neg for record in batch_records] + batch_text_type_ids_neg = [record.text_type_ids_neg for record in batch_records] + batch_position_ids_neg = [record.position_ids_neg for record in batch_records] + + if not self.is_inference: + if not self.learning_strategy == 'pairwise': + batch_labels = [record.label_id for record in batch_records] + if self.is_classify: + batch_labels = np.array(batch_labels).astype("int64").reshape( + [-1]) + elif self.is_regression: + batch_labels = np.array(batch_labels).astype("float32").reshape( + [-1]) + + if batch_records[0].qid: + batch_qids = [record.qid for record in batch_records] + batch_qids = np.array(batch_qids).astype("int64").reshape( + [-1]) + else: + batch_qids = np.array([]).astype("int64").reshape([-1]) + + # padding + padded_token_ids, input_mask = pad_batch_data( + batch_token_ids, pad_idx=self.pad_id, return_input_mask=True) + padded_text_type_ids = pad_batch_data( + batch_text_type_ids, pad_idx=self.pad_id) + padded_position_ids = pad_batch_data( + batch_position_ids, pad_idx=self.pad_id) + padded_task_ids = np.ones_like( + padded_token_ids, dtype="int64") * self.task_id + + return_list = [ + padded_token_ids, padded_text_type_ids, padded_position_ids, + padded_task_ids, input_mask + ] + + if self.phase=='train': + if self.learning_strategy == 'pairwise': + padded_token_ids_neg, input_mask_neg = pad_batch_data( + batch_token_ids_neg, pad_idx=self.pad_id, return_input_mask=True) + padded_text_type_ids_neg = pad_batch_data( + batch_text_type_ids_neg, pad_idx=self.pad_id) + padded_position_ids_neg = pad_batch_data( + batch_position_ids_neg, pad_idx=self.pad_id) + padded_task_ids_neg = np.ones_like( + padded_token_ids_neg, dtype="int64") * self.task_id + + return_list += [padded_token_ids_neg, padded_text_type_ids_neg, \ + padded_position_ids_neg, padded_task_ids_neg, input_mask_neg] + + elif self.learning_strategy == 'pointwise': + return_list += [batch_labels] + + return return_list + + +class SequenceLabelReader(Reader): def _pad_batch_records(self, batch_records): batch_token_ids = [record.token_ids for record in batch_records] batch_text_type_ids = [record.text_type_ids for record in batch_records] @@ -550,19 +616,7 @@ class 
SequenceLabelReader(BaseReader): ret_labels.append(label) continue - if label == "O" or label.startswith("I-"): - ret_labels.extend([label] * len(sub_token)) - elif label.startswith("B-"): - i_label = "I-" + label[2:] - ret_labels.extend([label] + [i_label] * (len(sub_token) - 1)) - elif label.startswith("S-"): - b_laebl = "B-" + label[2:] - e_label = "E-" + label[2:] - i_label = "I-" + label[2:] - ret_labels.extend([b_laebl] + [i_label] * (len(sub_token) - 2) + [e_label]) - elif label.startswith("E-"): - i_label = "I-" + label[2:] - ret_labels.extend([i_label] * (len(sub_token) - 1) + [label]) + ret_labels.extend([label] * len(sub_token)) assert len(ret_tokens) == len(ret_labels) return ret_tokens, ret_labels @@ -581,6 +635,9 @@ class SequenceLabelReader(BaseReader): position_ids = list(range(len(token_ids))) text_type_ids = [0] * len(token_ids) no_entity_id = len(self.label_map) - 1 + labels = [ + label if label in self.label_map else u"O" for label in labels + ] label_ids = [no_entity_id] + [ self.label_map[label] for label in labels ] + [no_entity_id] @@ -596,7 +653,7 @@ class SequenceLabelReader(BaseReader): return record -class ExtractEmbeddingReader(BaseReader): +class ExtractEmbeddingReader(Reader): def _pad_batch_records(self, batch_records): batch_token_ids = [record.token_ids for record in batch_records] batch_text_type_ids = [record.text_type_ids for record in batch_records] @@ -623,7 +680,7 @@ class ExtractEmbeddingReader(BaseReader): return return_list -class MRCReader(BaseReader): +class MRCReader(Reader): def __init__(self, vocab_path, label_map_config=None, @@ -804,7 +861,7 @@ class MRCReader(BaseReader): if start_offset + length == len(all_doc_tokens): break start_offset += min(length, self.doc_stride) - + for (doc_span_index, doc_span) in enumerate(doc_spans): tokens = [] token_to_orig_map = {} @@ -888,11 +945,20 @@ class MRCReader(BaseReader): if to_append: batch_records.append(record) else: - yield self._pad_batch_records(batch_records, phase == "train") + # yield self._pad_batch_records(batch_records, phase == "train") + ds = ['s'] * 8 + for piece in palm.distribute.yield_pieces(\ + self._pad_batch_records(batch_records, phase == 'train'), + ds, batch_size): + yield piece batch_records, max_len = [record], len(record.token_ids) + + if phase == 'predict' and batch_records: + for piece in palm.distribute.yield_pieces(\ + self._pad_batch_records(batch_records, phase == 'train'), + ds, batch_size): + yield piece - if phase == 'pred' and batch_records: - yield self._pad_batch_records(batch_records, phase == "train") def _pad_batch_records(self, batch_records, is_training): batch_token_ids = [record.token_ids for record in batch_records] @@ -976,15 +1042,11 @@ class MRCReader(BaseReader): self.current_epoch = epoch_index if phase == "train" and shuffle: np.random.shuffle(features) - + for batch_data in self._prepare_batch_data( features, batch_size, phase=phase): - if len(all_dev_batches) < dev_count: - all_dev_batches.append(batch_data) - if len(all_dev_batches) == dev_count: - for batch in all_dev_batches: - yield batch - all_dev_batches = [] + + yield batch_data return wrapper diff --git a/paddlepalm/task_instance.py b/paddlepalm/task_instance.py index 091526912cdaa3db8f475f10cc4bf2409d2bd31e..bcf053b7a9bbc47008d01e39eb493ad5d543422e 100644 --- a/paddlepalm/task_instance.py +++ b/paddlepalm/task_instance.py @@ -71,10 +71,10 @@ class TaskInstance(object): self._train_finish = False # 存放不同运行阶段(train,eval,pred)的数据集reader,key为phase,value为Reader实例 - self._reader = {'train': 
None, 'eval': None, 'pred': None} + self._reader = {'train': None, 'eval': None, 'predict': None} self._input_layer = None self._inputname_to_varname = {} - self._task_layer = {'train': None, 'eval': None, 'pred': None} + self._task_layer = {'train': None, 'eval': None, 'predict': None} self._pred_input_name_list = [] self._pred_input_varname_list = [] self._pred_fetch_name_list = [] @@ -90,7 +90,7 @@ class TaskInstance(object): def build_task_layer(self, net_inputs, phase, scope=""): output_vars = self._task_layer[phase].build(net_inputs, scope_name=scope) - if phase == 'pred': + if phase == 'predict': if output_vars is not None: self._pred_fetch_name_list, self._pred_fetch_var_list = zip(*output_vars.items()) else: diff --git a/paddlepalm/utils/basic_helper.py b/paddlepalm/utils/basic_helper.py index dc15482f5eb5434d4cfeb745a0a5f6e8e3aed4af..a9d503472d855cc70db2ca6d7f8eddd31a69b223 100644 --- a/paddlepalm/utils/basic_helper.py +++ b/paddlepalm/utils/basic_helper.py @@ -3,6 +3,7 @@ import os import json import yaml from config_helper import PDConfig +import logging from paddle import fluid def get_basename(f): diff --git a/paddlepalm/utils/reader_helper.py b/paddlepalm/utils/reader_helper.py index 9e93dc5c83a5007830c15845c1c20a9bba68769c..2a62d034ebf65eeac05c9feeb20e69dac3de89cd 100644 --- a/paddlepalm/utils/reader_helper.py +++ b/paddlepalm/utils/reader_helper.py @@ -16,6 +16,7 @@ import os import sys import random +import logging import numpy as np import paddle from paddle import fluid
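Below is a minimal usage sketch of the reader API introduced by this patch. It is illustrative only and not part of the diff: the vocabulary and data file paths are placeholder assumptions, and in a real run the reader would be registered with a backbone and consumed by the trainer rather than inspected by hand.

# sketch of the MatchReader added in paddlepalm/reader/match.py (assumed paths below)
from paddlepalm.reader import MatchReader

# Construct a train-phase matching reader; 'vocab.txt' and 'train.tsv' are placeholders.
match_reader = MatchReader(vocab_path='vocab.txt', max_len=128, lang='en',
                           learning_strategy='pointwise', phase='train')

# Bind the input file and batching settings; shuffling is applied only in the train phase.
match_reader.load_data('train.tsv', batch_size=32, num_epochs=2)

# outputs_attr maps each registered field to its [shape, dtype]; the trainer uses it to
# build the feed list, and _iterator() (driven by the trainer) yields dicts of these fields.
print(match_reader.outputs_attr)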