Commit 174d3fec authored by xixiaoyao

Merge branch 'r0.3-api' of https://github.com/PaddlePaddle/PALM into r0.3-api

@@ -50,7 +50,7 @@ cd PALM && python setup.py install
**Environment dependencies**
- Python >= 2.7
- cuda >= 9.0
- cudnn >= 7.0
- PaddlePaddle >= 1.6.1 (please refer to the [installation guide](http://www.paddlepaddle.org/#quick-start) for installation)
...
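A quick sanity check that the PaddlePaddle requirement is satisfied (a minimal sketch, not part of the original README):

```python
import paddle
print(paddle.__version__)  # expect 1.6.1 or later
```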
../../pretrain/
\ No newline at end of file
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddlepalm as palm
import sys
import argparse
# create parser
parser = argparse.ArgumentParser(prog='download_models.py', usage='python %(prog)s -l | -d <model_name> [-h]\n\nFor example,\n\tpython %(prog)s -d bert-en-uncased-large ', description='Download pretrain models for initializing params of backbones.')
parser1 = parser.add_argument_group("required arguments")
parser1.add_argument('-l', '--list', action='store_true', help='show the list of available pretrain models', default=False)
parser1.add_argument('-d', '--download', action='store', help='download pretrain models. The available pretrain models can be listed by running "python download_models.py -l"')
args = parser.parse_args()

if args.list:
    palm.downloader.ls('pretrain')
elif args.download:
    print('downloading %s...' % args.download)
    palm.downloader.download('pretrain', args.download)
else:
    parser.print_help()
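The same downloader can also be driven directly from Python; a minimal sketch (the model name is one of the keys listed in `_items` below):

```python
import paddlepalm as palm

palm.downloader.ls('pretrain')                                 # list available pretrain models
palm.downloader.download('pretrain', 'ernie-ch-uncased-base')  # fetch one of them
```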
@@ -34,13 +34,16 @@ _items = {
    'pretrain': {'ernie-en-uncased-large': 'https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz',
                 'bert-en-uncased-large': 'https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz',
                 'bert-en-uncased-base': 'https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz',
                 'ernie-ch-uncased-base': 'https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz',
                 'roberta-cn-base': 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz',
                 'roberta-cn-large': 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz',
                 'utils': None},
    'reader': {'utils': None},
    'backbone': {'utils': None},
    'tasktype': {'utils': None},
}

def _download(item, scope, path, silent=False, convert=False):
    data_url = _items[item][scope]
    if data_url is None:
        return

@@ -100,6 +103,7 @@ def _download(item, scope, path, silent=False):
        os.removedirs(source_path)
    if not silent:
        print('done!')
    if convert:
        if not silent:
            print('Converting params...', end=" ")
        _convert(data_dir, silent)
...
@@ -29,22 +29,31 @@ from paddlepalm.backbone.base_backbone import Backbone

class BERT(Backbone):

    def __init__(self, hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
                 max_position_embeddings, type_vocab_size, hidden_act, hidden_dropout_prob, \
                 attention_probs_dropout_prob, initializer_range, is_pairwise=False, phase='train'):

        self._emb_size = hidden_size
        self._n_layer = num_hidden_layers
        self._n_head = num_attention_heads
        self._voc_size = vocab_size
        self._max_position_seq_len = max_position_embeddings
        self._sent_types = type_vocab_size

        self._hidden_act = hidden_act
        self._prepostprocess_dropout = hidden_dropout_prob
        self._attention_dropout = attention_probs_dropout_prob

        self._word_emb_name = "word_embedding"
        self._pos_emb_name = "pos_embedding"
        self._sent_emb_name = "sent_embedding"
        self._task_emb_name = "task_embedding"
        self._emb_dtype = "float32"
        self._phase = phase
        self._is_pairwise = is_pairwise
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=initializer_range)
    @classmethod
    def from_config(cls, config, phase='train'):

@@ -62,40 +71,57 @@ class BERT(Backbone):
            "{} is required to initialize ERNIE".format('attention_probs_dropout_prob')
        assert 'initializer_range' in config, "{} is required to initialize ERNIE".format('initializer_range')

        hidden_size = config['hidden_size']
        num_hidden_layers = config['num_hidden_layers']
        num_attention_heads = config['num_attention_heads']
        vocab_size = config['vocab_size']
        max_position_embeddings = config['max_position_embeddings']
        if 'sent_type_vocab_size' in config:
            sent_type_vocab_size = config['sent_type_vocab_size']
        else:
            sent_type_vocab_size = config['type_vocab_size']

        hidden_act = config['hidden_act']
        hidden_dropout_prob = config['hidden_dropout_prob']
        attention_probs_dropout_prob = config['attention_probs_dropout_prob']
        initializer_range = config['initializer_range']
        if 'is_pairwise' in config:
            is_pairwise = config['is_pairwise']
        else:
            is_pairwise = False

        return cls(hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
                   max_position_embeddings, sent_type_vocab_size, \
                   hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise, phase)
    @property
    def inputs_attr(self):
        ret = {"token_ids": [[-1, -1], 'int64'],
               "position_ids": [[-1, -1], 'int64'],
               "segment_ids": [[-1, -1], 'int64'],
               "input_mask": [[-1, -1, 1], 'float32'],
               }
        if self._is_pairwise and self._phase == 'train':
            ret.update({"token_ids_neg": [[-1, -1], 'int64'],
                        "position_ids_neg": [[-1, -1], 'int64'],
                        "segment_ids_neg": [[-1, -1], 'int64'],
                        "input_mask_neg": [[-1, -1, 1], 'float32'],
                        })
        return ret
    @property
    def outputs_attr(self):
        ret = {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
               "embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
               "encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
               "sentence_embedding": [[-1, self._emb_size], 'float32'],
               "sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
        if self._is_pairwise and self._phase == 'train':
            ret.update({"word_embedding_neg": [[-1, -1, self._emb_size], 'float32'],
                        "encoder_outputs_neg": [[-1, -1, self._emb_size], 'float32'],
                        "sentence_embedding_neg": [[-1, self._emb_size], 'float32'],
                        "sentence_pair_embedding_neg": [[-1, self._emb_size], 'float32']})
        return ret
    def build(self, inputs, scope_name=""):
        src_ids = inputs['token_ids']

@@ -104,6 +130,21 @@ class BERT(Backbone):
        input_mask = inputs['input_mask']
        self._emb_dtype = 'float32'

        input_buffer = {}
        output_buffer = {}
        input_buffer['base'] = [src_ids, pos_ids, sent_ids, input_mask]
        output_buffer['base'] = {}

        if self._is_pairwise and self._phase == 'train':
            src_ids = inputs['token_ids_neg']
            pos_ids = inputs['position_ids_neg']
            sent_ids = inputs['segment_ids_neg']
            input_mask = inputs['input_mask_neg']
            input_buffer['neg'] = [src_ids, pos_ids, sent_ids, input_mask]
            output_buffer['neg'] = {}

        for key, (src_ids, pos_ids, sent_ids, input_mask) in input_buffer.items():
            # padding id in vocabulary must be set to 0
            emb_out = fluid.embedding(
                input=src_ids,

@@ -174,12 +215,25 @@ class BERT(Backbone):
                param_attr=fluid.ParamAttr(
                    name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
                bias_attr=scope_name+"pooled_fc.b_0")

            output_buffer[key]['word_embedding'] = emb_out
            output_buffer[key]['encoder_outputs'] = enc_out
            output_buffer[key]['sentence_embedding'] = next_sent_feat
            output_buffer[key]['sentence_pair_embedding'] = next_sent_feat

        ret = {}
        ret['embedding_table'] = embedding_table
        ret['word_embedding'] = output_buffer['base']['word_embedding']
        ret['encoder_outputs'] = output_buffer['base']['encoder_outputs']
        ret['sentence_embedding'] = output_buffer['base']['sentence_embedding']
        ret['sentence_pair_embedding'] = output_buffer['base']['sentence_pair_embedding']
        if self._is_pairwise and self._phase == 'train':
            ret['word_embedding_neg'] = output_buffer['neg']['word_embedding']
            ret['encoder_outputs_neg'] = output_buffer['neg']['encoder_outputs']
            ret['sentence_embedding_neg'] = output_buffer['neg']['sentence_embedding']
            ret['sentence_pair_embedding_neg'] = output_buffer['neg']['sentence_pair_embedding']
        return ret

    def postprocess(self, rt_outputs):
        pass
...
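As a usage sketch (not part of this commit), the BERT backbone above can be built from a plain config dict; the keys below mirror the ones read by `from_config`, and the values are illustrative only:

```python
# Assumes BERT is the class defined above; all values are hypothetical examples.
bert_config = {
    'hidden_size': 768, 'num_hidden_layers': 12, 'num_attention_heads': 12,
    'vocab_size': 30522, 'max_position_embeddings': 512, 'type_vocab_size': 2,
    'hidden_act': 'gelu', 'hidden_dropout_prob': 0.1,
    'attention_probs_dropout_prob': 0.1, 'initializer_range': 0.02,
    'is_pairwise': False,
}
backbone = BERT.from_config(bert_config, phase='train')
print(backbone.inputs_attr)  # token_ids, position_ids, segment_ids, input_mask
```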
@@ -31,7 +31,7 @@ class ERNIE(Backbone):

    def __init__(self, hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
                 max_position_embeddings, sent_type_vocab_size, task_type_vocab_size, \
                 hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise=False, phase='train'):

        # self._is_training = phase == 'train'  # the backbone usually does not need to care about the phase, since its outputs are basically the same in every phase

@@ -53,7 +53,8 @@ class ERNIE(Backbone):
        self._sent_emb_name = "sent_embedding"
        self._task_emb_name = "task_embedding"
        self._emb_dtype = "float32"
        self._is_pairwise = is_pairwise
        self._phase = phase
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=initializer_range)

@@ -65,7 +66,7 @@ class ERNIE(Backbone):
        assert 'vocab_size' in config, "{} is required to initialize ERNIE".format('vocab_size')
        assert 'max_position_embeddings' in config, "{} is required to initialize ERNIE".format('max_position_embeddings')
        assert 'sent_type_vocab_size' in config or 'type_vocab_size' in config, "{} is required to initialize ERNIE".format('sent_type_vocab_size')
        # assert 'task_type_vocab_size' in config, "{} is required to initialize ERNIE".format('task_type_vocab_size')
        assert 'hidden_act' in config, "{} is required to initialize ERNIE".format('hidden_act')
        assert 'hidden_dropout_prob' in config, "{} is required to initialize ERNIE".format('hidden_dropout_prob')
        assert 'attention_probs_dropout_prob' in config, "{} is required to initialize ERNIE".format('attention_probs_dropout_prob')

@@ -80,40 +81,76 @@ class ERNIE(Backbone):
            sent_type_vocab_size = config['sent_type_vocab_size']
        else:
            sent_type_vocab_size = config['type_vocab_size']
        if 'task_type_vocab_size' in config:
            task_type_vocab_size = config['task_type_vocab_size']
        else:
            task_type_vocab_size = config['type_vocab_size']
        hidden_act = config['hidden_act']
        hidden_dropout_prob = config['hidden_dropout_prob']
        attention_probs_dropout_prob = config['attention_probs_dropout_prob']
        initializer_range = config['initializer_range']
        if 'is_pairwise' in config:
            is_pairwise = config['is_pairwise']
        else:
            is_pairwise = False

        return cls(hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
                   max_position_embeddings, sent_type_vocab_size, task_type_vocab_size, \
                   hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise, phase=phase)
    @property
    def inputs_attr(self):
        ret = {"token_ids": [[-1, -1], 'int64'],
               "position_ids": [[-1, -1], 'int64'],
               "segment_ids": [[-1, -1], 'int64'],
               "input_mask": [[-1, -1, 1], 'float32'],
               "task_ids": [[-1, -1], 'int64']}
        if self._is_pairwise and self._phase == 'train':
            ret.update({"token_ids_neg": [[-1, -1], 'int64'],
                        "position_ids_neg": [[-1, -1], 'int64'],
                        "segment_ids_neg": [[-1, -1], 'int64'],
                        "input_mask_neg": [[-1, -1, 1], 'float32'],
                        "task_ids_neg": [[-1, -1], 'int64']
                        })
        return ret
    @property
    def outputs_attr(self):
        ret = {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
               "embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
               "encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
               "sentence_embedding": [[-1, self._emb_size], 'float32'],
               "sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
        if self._is_pairwise and self._phase == 'train':
            ret.update({"word_embedding_neg": [[-1, -1, self._emb_size], 'float32'],
                        "encoder_outputs_neg": [[-1, -1, self._emb_size], 'float32'],
                        "sentence_embedding_neg": [[-1, self._emb_size], 'float32'],
                        "sentence_pair_embedding_neg": [[-1, self._emb_size], 'float32']})
        return ret
    def build(self, inputs, scope_name=""):
        src_ids = inputs['token_ids']
        pos_ids = inputs['position_ids']
        sent_ids = inputs['segment_ids']
        input_mask = inputs['input_mask']
        task_ids = inputs['task_ids']

        input_buffer = {}
        output_buffer = {}
        input_buffer['base'] = [src_ids, pos_ids, sent_ids, input_mask, task_ids]
        output_buffer['base'] = {}

        if self._is_pairwise and self._phase == 'train':
            src_ids = inputs['token_ids_neg']
            pos_ids = inputs['position_ids_neg']
            sent_ids = inputs['segment_ids_neg']
            input_mask = inputs['input_mask_neg']
            task_ids = inputs['task_ids_neg']
            input_buffer['neg'] = [src_ids, pos_ids, sent_ids, input_mask, task_ids]
            output_buffer['neg'] = {}

        for key, (src_ids, pos_ids, sent_ids, input_mask, task_ids) in input_buffer.items():
            # padding id in vocabulary must be set to 0
            emb_out = fluid.embedding(
                input=src_ids,

@@ -183,7 +220,6 @@ class ERNIE(Backbone):
                param_initializer=self._param_initializer,
                name=scope_name+'encoder')

            next_sent_feat = fluid.layers.slice(
                input=enc_out, axes=[1], starts=[0], ends=[1])
            next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])

@@ -195,11 +231,25 @@ class ERNIE(Backbone):
                    name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
                bias_attr=scope_name+"pooled_fc.b_0")

            output_buffer[key]['word_embedding'] = emb_out
            output_buffer[key]['encoder_outputs'] = enc_out
            output_buffer[key]['sentence_embedding'] = next_sent_feat
            output_buffer[key]['sentence_pair_embedding'] = next_sent_feat

        ret = {}
        ret['embedding_table'] = embedding_table
        ret['word_embedding'] = output_buffer['base']['word_embedding']
        ret['encoder_outputs'] = output_buffer['base']['encoder_outputs']
        ret['sentence_embedding'] = output_buffer['base']['sentence_embedding']
        ret['sentence_pair_embedding'] = output_buffer['base']['sentence_pair_embedding']
        if self._is_pairwise and self._phase == 'train':
            ret['word_embedding_neg'] = output_buffer['neg']['word_embedding']
            ret['encoder_outputs_neg'] = output_buffer['neg']['encoder_outputs']
            ret['sentence_embedding_neg'] = output_buffer['neg']['sentence_embedding']
            ret['sentence_pair_embedding_neg'] = output_buffer['neg']['sentence_pair_embedding']
        return ret

    def postprocess(self, rt_outputs):
        pass
...
@@ -24,7 +24,6 @@ def yield_pieces(data, distribute_strategy, batch_size):
        assert isinstance(data, list), "the input data must be a list or dict, and contained with multiple tensors."
        data_list = data
        ds_list = distribute_strategy
    stride = batch_size // dev_count
    p = stride
    # while p < len(data_list) + stride:

@@ -57,7 +56,8 @@ def yield_pieces(data, distribute_strategy, batch_size):
            # print(len(temp))
            yield temp


def data_feeder(reader, postprocess_fn=None, prefetch_steps=2, phase='train'):
    if postprocess_fn is None:
        def postprocess_fn(batch):
            return batch
...
from cls import Classify
from match import Match
from ner import SequenceLabel
from mrc import MRC
from mlm import MaskLM
@@ -13,41 +13,66 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.fluid as fluid
from paddle.fluid import layers
from paddlepalm.head.base_head import Head
import numpy as np
import os
import json


def computeHingeLoss(pos, neg, margin):
    loss_part1 = fluid.layers.elementwise_sub(
        fluid.layers.fill_constant_batch_size_like(
            input=pos, shape=[-1, 1], value=margin, dtype='float32'), pos)
    loss_part2 = fluid.layers.elementwise_add(loss_part1, neg)
    loss_part3 = fluid.layers.elementwise_max(
        fluid.layers.fill_constant_batch_size_like(
            input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'), loss_part2)
    return loss_part3
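For reference, `computeHingeLoss` above evaluates the standard pairwise hinge loss max(0, margin - pos + neg). A minimal NumPy sketch of the same computation (illustrative only, not part of the commit):

```python
import numpy as np

def hinge_loss_np(pos, neg, margin=0.5):
    # max(0, margin - pos + neg), computed per example
    return np.maximum(0.0, margin - pos + neg)

print(hinge_loss_np(np.array([0.9, 0.4]), np.array([0.2, 0.6])))  # [0.  0.7]
```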
class Match(Head):
    '''
    matching
    '''

    def __init__(self, num_classes, input_dim, dropout_prob=0.0, param_initializer_range=0.02, \
                 learning_strategy='pointwise', margin=0.5, phase='train'):
        """
        Args:
            phase: train, eval, pred
            lang: en, ch, ...
            learning_strategy: pointwise, pairwise
        """

        self._is_training = phase == 'train'
        self._hidden_size = input_dim
        self._num_classes = num_classes

        self._dropout_prob = dropout_prob if phase == 'train' else 0.0
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=param_initializer_range)
        self._learning_strategy = learning_strategy
        self._margin = margin

        self._preds = []
        self._preds_logits = []
    @property
    def inputs_attrs(self):
        reader = {}
        bb = {"sentence_pair_embedding": [[-1, self._hidden_size], 'float32']}
        if self._is_training:
            if self._learning_strategy == 'pointwise':
                reader["label_ids"] = [[-1], 'int64']
            elif self._learning_strategy == 'pairwise':
                bb["sentence_pair_embedding_neg"] = [[-1, self._hidden_size], 'float32']
        return {'reader': reader, 'backbone': bb}

    @property
@@ -55,51 +80,110 @@ class TaskParadigm(task_paradigm):
        if self._is_training:
            return {"loss": [[1], 'float32']}
        else:
            if self._learning_strategy == 'pairwise':
                return {"probs": [[-1, 1], 'float32']}
            else:
                return {"logits": [[-1, 2], 'float32'],
                        "probs": [[-1, 2], 'float32']}
    def build(self, inputs, scope_name=""):

        # inputs
        cls_feats = inputs["backbone"]["sentence_pair_embedding"]
        if self._is_training:
            cls_feats = fluid.layers.dropout(
                x=cls_feats,
                dropout_prob=self._dropout_prob,
                dropout_implementation="upscale_in_train")
            if self._learning_strategy == 'pairwise':
                cls_feats_neg = inputs["backbone"]["sentence_pair_embedding_neg"]
                cls_feats_neg = fluid.layers.dropout(
                    x=cls_feats_neg,
                    dropout_prob=self._dropout_prob,
                    dropout_implementation="upscale_in_train")
            elif self._learning_strategy == 'pointwise':
                labels = inputs["reader"]["label_ids"]

        # loss
        # for pointwise
        if self._learning_strategy == 'pointwise':
            logits = fluid.layers.fc(
                input=cls_feats,
                size=self._num_classes,
                param_attr=fluid.ParamAttr(
                    name=scope_name+"cls_out_w",
                    initializer=self._param_initializer),
                bias_attr=fluid.ParamAttr(
                    name=scope_name+"cls_out_b",
                    initializer=fluid.initializer.Constant(0.)))
            probs = fluid.layers.softmax(logits)
            if self._is_training:
                ce_loss = fluid.layers.cross_entropy(
                    input=probs, label=labels)
                loss = fluid.layers.mean(x=ce_loss)
                return {'loss': loss}
            # for pred
            else:
                return {'logits': logits,
                        'probs': probs}

        # for pairwise
        elif self._learning_strategy == 'pairwise':
            pos_score = fluid.layers.fc(
                input=cls_feats,
                size=1,
                act="sigmoid",
                param_attr=fluid.ParamAttr(
                    name=scope_name+"cls_out_w_pr",
                    initializer=self._param_initializer),
                bias_attr=fluid.ParamAttr(
                    name=scope_name+"cls_out_b_pr",
                    initializer=fluid.initializer.Constant(0.)))
            pos_score = fluid.layers.reshape(x=pos_score, shape=[-1, 1], inplace=True)

            if self._is_training:
                neg_score = fluid.layers.fc(
                    input=cls_feats_neg,
                    size=1,
                    act="sigmoid",
                    param_attr=fluid.ParamAttr(
                        name=scope_name+"cls_out_w_pr",
                        initializer=self._param_initializer),
                    bias_attr=fluid.ParamAttr(
                        name=scope_name+"cls_out_b_pr",
                        initializer=fluid.initializer.Constant(0.)))
                neg_score = fluid.layers.reshape(x=neg_score, shape=[-1, 1], inplace=True)

                loss = fluid.layers.mean(computeHingeLoss(pos_score, neg_score, self._margin))
                return {'loss': loss}
            # for pred
            else:
                return {'probs': pos_score}
    def batch_postprocess(self, rt_outputs):
        if not self._is_training:
            probs = []
            logits = []
            probs = rt_outputs['probs']
            self._preds.extend(probs.tolist())
            if self._learning_strategy == 'pointwise':
                logits = rt_outputs['logits']
                self._preds_logits.extend(logits.tolist())

    def epoch_postprocess(self, post_inputs, output_dir=None):
        # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs
        if not self._is_training:
            if output_dir is None:
                raise ValueError('argument output_dir not found in config. Please add it into config dict/file.')
            with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer:
                for i in range(len(self._preds)):
                    if self._learning_strategy == 'pointwise':
                        label = 0 if self._preds[i][0] > self._preds[i][1] else 1
                        result = {'index': i, 'label': label, 'logits': self._preds_logits[i], 'probs': self._preds[i]}
                    elif self._learning_strategy == 'pairwise':
                        label = 0 if self._preds[i][0] < 0.5 else 1
                        result = {'index': i, 'label': label, 'probs': self._preds[i][0]}
                    result = json.dumps(result)
                    writer.write(result+'\n')
            print('Predictions saved at '+os.path.join(output_dir, 'predictions.json'))
\ No newline at end of file
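For context, a minimal sketch of how the two learning strategies of the Match head above would be instantiated (argument values are illustrative, not prescribed by the commit):

```python
# input_dim should match the backbone's hidden size; values here are examples only.
pointwise_head = Match(num_classes=2, input_dim=768,
                       learning_strategy='pointwise', phase='train')
pairwise_head = Match(num_classes=2, input_dim=768, margin=0.5,
                      learning_strategy='pairwise', phase='train')
```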
@@ -14,30 +14,39 @@
# limitations under the License.

import paddle.fluid as fluid
from paddlepalm.head.base_head import Head
from paddle.fluid import layers
import numpy as np
import os
from paddlepalm.backbone.utils.transformer import pre_process_layer


class MaskLM(Head):
    '''
    mlm
    '''
    def __init__(self, input_dim, vocab_size, hidden_act, initializer_range, dropout_prob=0.0, \
                 param_initializer_range=0.02, phase='train'):
        self._is_training = phase == 'train'
        self._emb_size = input_dim
        self._hidden_size = input_dim
        self._dropout_prob = dropout_prob if phase == 'train' else 0.0
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=param_initializer_range)
        self._preds = []

        self._vocab_size = vocab_size
        self._hidden_act = hidden_act
        self._initializer_range = initializer_range
    @property
    def inputs_attrs(self):
        reader = {
            "token_ids": [[-1, -1], 'int64'],
            "mask_label": [[-1], 'int64'],
            "mask_pos": [[-1], 'int64'],
            }
        if not self._is_training:
            del reader['mask_label']
        bb = {
            "encoder_outputs": [[-1, -1, self._hidden_size], 'float32'],
            "embedding_table": [[-1, self._vocab_size, self._emb_size], 'float32']}
@@ -54,7 +63,13 @@ class TaskParadigm(task_paradigm):
        mask_pos = inputs["reader"]["mask_pos"]
        if self._is_training:
            mask_label = inputs["reader"]["mask_label"]
            l1 = fluid.layers.shape(inputs["reader"]["token_ids"])[0]
            # bxs = inputs["reader"]["token_ids"].shape[2].value
            l2 = fluid.layers.shape(inputs["reader"]["token_ids"][0])[0]
            bxs = (l1*l2).astype(np.int64)
            # max_position = inputs["reader"]["batchsize_x_seqlen"] - 1
            max_position = bxs - 1
            mask_pos = fluid.layers.elementwise_min(mask_pos, max_position)
            mask_pos.stop_gradient = True

@@ -100,11 +115,31 @@ class TaskParadigm(task_paradigm):
            is_bias=True)

        if self._is_training:
            inputs = fluid.layers.softmax(fc_out)
            mask_lm_loss = fluid.layers.cross_entropy(
                input=inputs, label=mask_label)
            loss = fluid.layers.mean(mask_lm_loss)
            return {'loss': loss}
        else:
            return {'logits': fc_out}
    def batch_postprocess(self, rt_outputs):
        if not self._is_training:
            logits = rt_outputs['logits']
            preds = np.argmax(logits, -1)
            self._preds.extend(preds.tolist())
            return preds

    def epoch_postprocess(self, post_inputs, output_dir=None):
        # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs
        if not self._is_training:
            if output_dir is None:
                for p in self._preds:
                    print(p)
            else:
                with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer:
                    for p in self._preds:
                        writer.write(str(p)+'\n')
                print('Predictions saved at '+os.path.join(output_dir, 'predictions.json'))
@@ -14,7 +14,7 @@
# limitations under the License.

import paddle.fluid as fluid
from paddlepalm.head.base_head import Head
import collections
import numpy as np
import os

@@ -26,34 +26,37 @@ import json
RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])


class MRC(Head):
    """
    Machine Reading Comprehension
    """

    def __init__(self, max_query_len, input_dim, pred_output_path=None, verbose=False, with_negative=False, do_lower_case=False, max_ans_len=None, null_score_diff_threshold=0.0, n_best_size=20, phase='train'):

        self._is_training = phase == 'train'
        self._hidden_size = input_dim
        self._max_sequence_length = max_query_len
        self._pred_results = []

        output_dir = pred_output_path
        self._max_answer_length = max_ans_len
        self._null_score_diff_threshold = null_score_diff_threshold
        self._n_best_size = n_best_size
        output_dir = pred_output_path
        self._verbose = verbose
        self._with_negative = with_negative
        self._do_lower_case = do_lower_case
    @property
    def inputs_attrs(self):
        if self._is_training:
            reader = {"start_positions": [[-1], 'int64'],
                      "end_positions": [[-1], 'int64'],
                      }
        else:
            reader = {'unique_ids': [[-1], 'int64']}
        bb = {"encoder_outputs": [[-1, -1, self._hidden_size], 'float32']}
        return {'reader': reader, 'backbone': bb}

@@ -70,21 +73,26 @@ class TaskParadigm(task_paradigm):
        else:
            return {'start_logits': [[-1, -1, 1], 'float32'],
                    'end_logits': [[-1, -1, 1], 'float32'],
                    'unique_ids': [[-1], 'int64']}
    def build(self, inputs, scope_name=""):
        if self._is_training:
            start_positions = inputs['reader']['start_positions']
            end_positions = inputs['reader']['end_positions']
            # max_position = inputs["reader"]["seqlen"] - 1
            # start_positions = fluid.layers.elementwise_min(start_positions, max_position)
            # end_positions = fluid.layers.elementwise_min(end_positions, max_position)
            start_positions.stop_gradient = True
            end_positions.stop_gradient = True
        else:
            unique_id = inputs['reader']['unique_ids']
            # It's used to help fetch variable 'unique_ids' that will be removed in the future
            helper_constant = fluid.layers.fill_constant(shape=[1], value=1, dtype='int64')
            fluid.layers.elementwise_mul(unique_id, helper_constant)

        enc_out = inputs['backbone']['encoder_outputs']
        logits = fluid.layers.fc(
            input=enc_out,

@@ -100,9 +108,11 @@ class TaskParadigm(task_paradigm):
        start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)

        def _compute_single_loss(logits, positions):
            """Compute start/end loss for mrc model"""
            inputs = fluid.layers.softmax(logits)
            loss = fluid.layers.cross_entropy(
                input=inputs, label=positions)
            loss = fluid.layers.mean(x=loss)
            return loss

@@ -117,10 +127,10 @@ class TaskParadigm(task_paradigm):
            'unique_ids': unique_id}
    def batch_postprocess(self, rt_outputs):
        """this func will be called after each step(batch) of training/evaluating/predicting process."""
        if not self._is_training:
            unique_ids = rt_outputs['unique_ids']
            start_logits = rt_outputs['start_logits']
            end_logits = rt_outputs['end_logits']
            for idx in range(len(unique_ids)):

@@ -139,19 +149,19 @@ class TaskParadigm(task_paradigm):
                    start_logits=s,
                    end_logits=e))

    def epoch_postprocess(self, post_inputs, output_dir=None):
        """(optional interface) this func will be called after evaluation/predicting process and each epoch during training process."""
        if not self._is_training:
            if output_dir is None:
                raise ValueError('argument output_dir not found in config. Please add it into config dict/file.')
            examples = post_inputs['reader']['examples']
            features = post_inputs['reader']['features']
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            output_prediction_file = os.path.join(output_dir, "predictions.json")
            output_nbest_file = os.path.join(output_dir, "nbest_predictions.json")
            output_null_log_odds_file = os.path.join(output_dir, "null_odds.json")
            _write_predictions(examples, features, self._pred_results,
                               self._n_best_size, self._max_answer_length,
                               self._do_lower_case, output_prediction_file,
@@ -194,8 +204,9 @@ def _write_predictions(all_examples, all_features, all_results, n_best_size,
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score

        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
...
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddle.fluid import layers
from paddlepalm.head.base_head import Head
import numpy as np
import os
import math
class SequenceLabel(Head):
    '''
    Sequence label
    '''
    def __init__(self, num_classes, input_dim, dropout_prob=0.0, learning_rate=1e-3, \
                 param_initializer_range=0.02, phase='train'):
        """
        Args:
            phase: train, eval, pred
            lang: en, ch, ...
        """
        self._is_training = phase == 'train'
        self._hidden_size = input_dim
        self.num_classes = num_classes

        self._dropout_prob = dropout_prob if phase == 'train' else 0.0
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=param_initializer_range)
        self.learning_rate = learning_rate

        self._preds = []

    @property
    def inputs_attrs(self):
        reader = {}
        bb = {"encoder_outputs": [[-1, -1, -1], 'float32']}
        if self._is_training:
            reader["label_ids"] = [[-1, -1], 'int64']
            reader["seq_lens"] = [[-1], 'int64']
        return {'reader': reader, 'backbone': bb}

    @property
    def outputs_attrs(self):
        if self._is_training:
            return {'loss': [[1], 'float32']}
        else:
            return {'emission': [[-1, self.num_classes], 'float32']}

    def build(self, inputs, scope_name=''):
        token_emb = inputs['backbone']['encoder_outputs']
        if self._is_training:
            label_ids = inputs['reader']['label_ids']
            seq_lens = inputs['reader']['seq_lens']

        emission = fluid.layers.fc(
            size=self.num_classes,
            input=token_emb,
            param_attr=fluid.ParamAttr(
                initializer=self._param_initializer,
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)),
            bias_attr=fluid.ParamAttr(
                name=scope_name+"cls_out_b", initializer=fluid.initializer.Constant(0.)),
            num_flatten_dims=2)

        if self._is_training:
            # compute loss
            crf_cost = fluid.layers.linear_chain_crf(
                input=emission,
                label=label_ids,
                param_attr=fluid.ParamAttr(
                    name=scope_name+'crfw', learning_rate=self.learning_rate),
                length=seq_lens)

            avg_cost = fluid.layers.mean(x=crf_cost)
            crf_decode = fluid.layers.crf_decoding(
                input=emission,
                param_attr=fluid.ParamAttr(name=scope_name+'crfw'),
                length=seq_lens)

            (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
             num_correct_chunks) = fluid.layers.chunk_eval(
                input=crf_decode,
                label=label_ids,
                chunk_scheme="IOB",
                num_chunk_types=int(math.ceil((self.num_classes - 1) / 2.0)),
                seq_length=seq_lens)
            chunk_evaluator = fluid.metrics.ChunkEvaluator()
            chunk_evaluator.reset()

            return {"loss": avg_cost}
        else:
            return {"emission": emission}

    def batch_postprocess(self, rt_outputs):
        if not self._is_training:
            emission = rt_outputs['emission']
            preds = np.argmax(emission, -1)
            self._preds.extend(preds.tolist())

    def epoch_postprocess(self, post_inputs, output_dir=None):
        # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs
        if not self._is_training:
            if output_dir is None:
                raise ValueError('argument output_dir not found in config. Please add it into config dict/file.')
            with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer:
                for p in self._preds:
                    writer.write(str(p)+'\n')
            print('Predictions saved at '+os.path.join(output_dir, 'predictions.json'))
from cls import ClassifyReader
from match import MatchReader
from ner import SequenceLabelReader
from mrc import MrcReader
from mlm import MaskLMReader
@@ -42,7 +42,6 @@ class Reader(object):
            self._register.add(attr_name)

    def register_with(self, backbone):
        for attr in backbone.inputs_attr:
            self.require_attr(attr)
        self._registered_backbone = backbone
...
@@ -13,87 +13,104 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import ClassifyReader as CLSReader


class MatchReader(Reader):

    def __init__(self, vocab_path, max_len, tokenizer='wordpiece', lang='en', seed=None, \
                 do_lower_case=False, learning_strategy='pointwise', phase='train', dev_count=1, print_prefix=''):  # add whatever arguments are needed
        """
        Args:
            phase: train, eval, pred
            lang: en, ch, ...
            learning_strategy: pointwise, pairwise
        """

        Reader.__init__(self, phase)

        assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
        assert phase in ['train', 'predict'], "supported phase: train, predict."

        for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'

        self._register.add('token_ids')
        if phase == 'train':
            if learning_strategy == 'pointwise':
                self._register.add('label_ids')
            if learning_strategy == 'pairwise':
                self._register.add('token_ids_neg')
                self._register.add('position_ids_neg')
                self._register.add('segment_ids_neg')
                self._register.add('input_mask_neg')
                self._register.add('task_ids_neg')

        self._is_training = phase == 'train'
        self._learning_strategy = learning_strategy

        match_reader = CLSReader(vocab_path,
                                 max_seq_len=max_len,
                                 do_lower_case=do_lower_case,
                                 for_cn=for_cn,
                                 random_seed=seed,
                                 learning_strategy=learning_strategy)
        self._reader = match_reader
        self._dev_count = dev_count
        self._phase = phase
    @property
    def outputs_attr(self):
        attrs = {"token_ids": [[-1, -1], 'int64'],
                 "position_ids": [[-1, -1], 'int64'],
                 "segment_ids": [[-1, -1], 'int64'],
                 "input_mask": [[-1, -1, 1], 'float32'],
                 "task_ids": [[-1, -1], 'int64'],
                 "label_ids": [[-1], 'int64'],
                 "token_ids_neg": [[-1, -1], 'int64'],
                 "position_ids_neg": [[-1, -1], 'int64'],
                 "segment_ids_neg": [[-1, -1], 'int64'],
                 "input_mask_neg": [[-1, -1, 1], 'float32'],
                 "task_ids_neg": [[-1, -1], 'int64']
                 }
        return self._get_registed_attrs(attrs)

    def load_data(self, input_file, batch_size, num_epochs=None, \
                  file_format='tsv', shuffle_train=True):
        self._batch_size = batch_size
        self._num_epochs = num_epochs
        self._data_generator = self._reader.data_generator( \
            input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
            shuffle=shuffle_train if self._phase == 'train' else False, \
            phase=self._phase)

    def _iterator(self):

        names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', 'label_ids', \
                 'token_ids_neg', 'segment_ids_neg', 'position_ids_neg', 'task_ids_neg', 'input_mask_neg']

        if self._learning_strategy == 'pairwise':
            names.remove('label_ids')

        for batch in self._data_generator():
            outputs = {n: i for n, i in zip(names, batch)}
            ret = {}
            # TODO: move runtime shape check here
            for attr in self.outputs_attr.keys():
                ret[attr] = outputs[attr]
            yield ret

    @property
    def num_examples(self):
        return self._reader.get_num_examples(phase=self._phase)

    @property
    def num_epochs(self):
        return self._num_epochs
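A minimal usage sketch of the MatchReader defined above; the vocab and data file paths are placeholders, not files from this repository:

```python
# Placeholder paths and illustrative hyper-parameters.
reader = MatchReader('vocab.txt', max_len=128, lang='en',
                     learning_strategy='pairwise', phase='train')
reader.load_data('train.tsv', batch_size=32, num_epochs=2)
print(reader.num_examples)
```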
@@ -13,79 +13,79 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import MaskLMReader as MLMReader
import numpy as np


class MaskLMReader(Reader):

    def __init__(self, vocab_path, max_len, tokenizer='wordpiece', \
                 lang='en', seed=None, do_lower_case=False, phase='train', dev_count=1, print_prefix=''):
        """
        Args:
            phase: train, eval, pred
        """

        Reader.__init__(self, phase)

        assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
        assert phase in ['train', 'predict'], "supported phase: train, predict."

        for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'

        self._register.add('token_ids')
        self._register.add('mask_pos')
        if phase == 'train':
            self._register.add('mask_label')
        self._is_training = phase == 'train'

        mlm_reader = MLMReader(vocab_path,
                               max_seq_len=max_len,
                               do_lower_case=do_lower_case,
                               for_cn=for_cn,
                               random_seed=seed)
        self._reader = mlm_reader

        self._phase = phase
        self._dev_count = dev_count
@property @property
def outputs_attr(self): def outputs_attr(self):
return {"token_ids": [[-1, -1], 'int64'], attrs = {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'], "position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'], "segment_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'], "input_mask": [[-1, -1, 1], 'float32'],
"task_ids": [[-1, -1], 'int64'], "task_ids": [[-1, -1], 'int64'],
"mask_label": [[-1], 'int64'], "mask_label": [[-1], 'int64'],
"mask_pos": [[-1], 'int64'], "mask_pos": [[-1], 'int64']
} }
return self._get_registed_attrs(attrs)
def load_data(self): def load_data(self, input_file, batch_size, num_epochs=None, \
self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase) file_format='csv', shuffle_train=True):
self._batch_size = batch_size
self._num_epochs = num_epochs
self._data_generator = self._reader.data_generator( \
input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
shuffle=shuffle_train if self._phase == 'train' else False, \
phase=self._phase)
def iterator(self): def _iterator(self):
def list_to_dict(x):
names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask', names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask',
'task_ids', 'mask_label', 'mask_pos'] 'task_ids', 'mask_label', 'mask_pos']
outputs = {n: i for n,i in zip(names, x)}
# outputs['batchsize_x_seqlen'] = [self._batch_size * len(outputs['token_ids'][0]) - 1]
return outputs
for batch in self._data_generator(): for batch in self._data_generator():
# print(np.shape(list_to_dict(batch)['token_ids'])) outputs = {n: i for n,i in zip(names, batch)}
# print(list_to_dict(batch)['mask_label'].tolist()) ret = {}
yield list_to_dict(batch) # TODO: move runtime shape check here
for attr in self.outputs_attr.keys():
ret[attr] = outputs[attr]
yield ret
def get_epoch_outputs(self): def get_epoch_outputs(self):
return {'examples': self._reader.get_examples(self._phase), return {'examples': self._reader.get_examples(self._phase),
...@@ -95,3 +95,7 @@ class Reader(reader): ...@@ -95,3 +95,7 @@ class Reader(reader):
def num_examples(self): def num_examples(self):
return self._reader.get_num_examples(phase=self._phase) return self._reader.get_num_examples(phase=self._phase)
@property
def num_epochs(self):
return self._num_epochs
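The refactor above replaces the old config-dict constructor with explicit keyword arguments plus a separate `load_data()` call. A minimal usage sketch under that API follows; the vocab path, data file, and sizes are placeholders, and the import path is assumed from the package layout rather than shown in this commit:

```python
# Hypothetical usage of the refactored reader API; file paths, sizes and the
# module path `paddlepalm.reader` are assumptions for illustration only.
from paddlepalm.reader import MaskLMReader

reader = MaskLMReader(vocab_path='pretrain/ernie-en-uncased-large/vocab.txt',
                      max_len=128, seed=42, phase='train')
# settings that used to live in the config dict now arrive at load time
reader.load_data('data/mlm/train.tsv', batch_size=32, num_epochs=2)

print(reader.outputs_attr)   # shapes/dtypes of the fields this reader emits
print(reader.num_examples)   # counted by the underlying MLMReader
```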
@@ -13,77 +13,66 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import MRCReader
import numpy as np


class MrcReader(Reader):

    def __init__(self, vocab_path, max_len, max_query_len, doc_stride, tokenizer='FullTokenizer', lang='en', seed=None, do_lower_case=False, \
                 remove_noanswer=True, phase='train', dev_count=1, print_prefix=''):
        """
        Args:
            phase: train, predict
            lang: en, cn, ...
        """

        Reader.__init__(self, phase)

        assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
        assert phase in ['train', 'predict'], "supported phase: train, predict."

        for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'

        self._register.add('token_ids')
        if phase == 'train':
            self._register.add("start_positions")
            self._register.add("end_positions")
        else:
            self._register.add("unique_ids")

        self._is_training = phase == 'train'

        mrc_reader = MRCReader(vocab_path,
                               max_seq_len=max_len,
                               do_lower_case=do_lower_case,
                               tokenizer=tokenizer,
                               doc_stride=doc_stride,
                               remove_noanswer=remove_noanswer,
                               max_query_length=max_query_len,
                               for_cn=for_cn,
                               random_seed=seed)
        self._reader = mrc_reader
        self._phase = phase
        self._dev_count = dev_count

    @property
    def outputs_attr(self):
        attrs = {"token_ids": [[-1, -1], 'int64'],
                 "position_ids": [[-1, -1], 'int64'],
                 "segment_ids": [[-1, -1], 'int64'],
                 "input_mask": [[-1, -1, 1], 'float32'],
                 "start_positions": [[-1], 'int64'],
                 "end_positions": [[-1], 'int64'],
                 "task_ids": [[-1, -1], 'int64'],
                 "unique_ids": [[-1], 'int64']
                 }
        return self._get_registed_attrs(attrs)

    @property
    def epoch_outputs_attr(self):
@@ -91,26 +80,34 @@ class Reader(reader):
        return {"examples": None,
                "features": None}

    def load_data(self, input_file, batch_size, num_epochs=None, file_format='csv', shuffle_train=True):
        self._batch_size = batch_size
        self._num_epochs = num_epochs
        self._data_generator = self._reader.data_generator( \
            input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
            shuffle=shuffle_train if self._phase == 'train' else False, \
            phase=self._phase)

    def _iterator(self):

        names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
                 'start_positions', 'end_positions', 'unique_ids']
        if self._is_training:
            names.remove('unique_ids')

        for batch in self._data_generator():
            outputs = {n: i for n, i in zip(names, batch)}
            ret = {}
            # TODO: move runtime shape check here
            for attr in self.outputs_attr.keys():
                ret[attr] = outputs[attr]
            if not self._is_training:
                assert 'unique_ids' in ret, ret
            yield ret

    def get_epoch_outputs(self):
        return {'examples': self._reader.get_examples(self._phase),
                'features': self._reader.get_features(self._phase)}

@@ -118,3 +115,7 @@ class Reader(reader):
    def num_examples(self):
        return self._reader.get_num_examples(phase=self._phase)

    @property
    def num_epochs(self):
        return self._num_epochs
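`doc_stride` controls the sliding-window split that the underlying MRCReader applies to passages longer than `max_len`. The self-contained sketch below illustrates only the windowing idea; it is not the library's implementation, and the numbers are arbitrary:

```python
# Standalone illustration of doc_stride-style windowing over a long token list.
# This mimics the idea only; MRCReader's actual feature building differs.
def sliding_windows(doc_tokens, max_tokens_for_doc, doc_stride):
    spans = []
    start = 0
    while start < len(doc_tokens):
        length = min(max_tokens_for_doc, len(doc_tokens) - start)
        spans.append((start, doc_tokens[start:start + length]))
        if start + length >= len(doc_tokens):
            break
        start += doc_stride
    return spans

tokens = ['tok%d' % i for i in range(20)]
for offset, span in sliding_windows(tokens, max_tokens_for_doc=8, doc_stride=4):
    print(offset, span)
```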
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import SequenceLabelReader as SLReader


class SequenceLabelReader(Reader):

    def __init__(self, vocab_path, max_len, label_map_config, tokenizer='wordpiece', \
                 lang='en', seed=None, do_lower_case=False, phase='train', dev_count=1, print_prefix=''):
        """
        Args:
            phase: train, predict
            lang: en, cn, ...
        """

        Reader.__init__(self, phase)

        assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
        assert phase in ['train', 'predict'], "supported phase: train, predict."

        for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'

        self._register.add('token_ids')
        self._register.add('seq_lens')
        if phase == 'train':
            self._register.add('label_ids')

        self._is_training = phase == 'train'

        ner_reader = SLReader(vocab_path,
                              max_seq_len=max_len,
                              do_lower_case=do_lower_case,
                              for_cn=for_cn,
                              random_seed=seed,
                              label_map_config=label_map_config)
        self._reader = ner_reader

        self._phase = phase
        self._dev_count = dev_count

    @property
    def outputs_attr(self):
        attrs = {"token_ids": [[-1, -1], 'int64'],
                 "position_ids": [[-1, -1], 'int64'],
                 "segment_ids": [[-1, -1], 'int64'],
                 "task_ids": [[-1, -1], 'int64'],
                 "input_mask": [[-1, -1, 1], 'float32'],
                 "seq_lens": [[-1], 'int64'],
                 "label_ids": [[-1, -1], 'int64']}
        return self._get_registed_attrs(attrs)

    def load_data(self, input_file, batch_size, num_epochs=None, \
                  file_format='tsv', shuffle_train=True):
        self._batch_size = batch_size
        self._num_epochs = num_epochs
        self._data_generator = self._reader.data_generator( \
            input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
            shuffle=shuffle_train if self._phase == 'train' else False, \
            phase=self._phase)

    def _iterator(self):

        names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
                 'label_ids', 'seq_lens']

        for batch in self._data_generator():
            outputs = {n: i for n, i in zip(names, batch)}
            ret = {}
            # TODO: move runtime shape check here
            for attr in self.outputs_attr.keys():
                ret[attr] = outputs[attr]
            yield ret

    def get_epoch_outputs(self):
        return {'examples': self._reader.get_examples(self._phase),
                'features': self._reader.get_features(self._phase)}

    @property
    def num_examples(self):
        return self._reader.get_num_examples(phase=self._phase)

    @property
    def num_epochs(self):
        return self._num_epochs
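`label_map_config` points the underlying SLReader at a label-to-id mapping; judging from `self.label_map[label]` and `no_entity_id = len(self.label_map) - 1` later in this commit, it is used as a dict keyed by label string. A hypothetical way to prepare such a file (the on-disk JSON layout is an assumption; only the loaded dict usage is visible in the diff):

```python
# Hypothetical label map for a BIO-style NER task; the JSON layout is assumed,
# only the label->id dict lookups appear in this commit.
import json

label_map = {"B-PER": 0, "I-PER": 1, "B-LOC": 2, "I-LOC": 3, "O": 4}
with open('data/ner/label_map.json', 'w') as f:
    json.dump(label_map, f)

# the last id then plays the role of no_entity_id (len(label_map) - 1 == 4)
```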
@@ -19,12 +19,26 @@ from __future__ import print_function
import numpy as np

def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3, dev_count=1):
"""
Add mask for batch_tokens, return out, mask_label, mask_pos;
Note: mask_pos corresponds to the batch_tokens after padding;
"""
max_len = max([len(sent) for sent in batch_tokens])

multidev_batch_tokens = []
multidev_mask_label = []
multidev_mask_pos = []

big_batch_tokens = batch_tokens
stride = len(batch_tokens) // dev_count
if stride == 0:
return None, None, None
p = stride

for i in range(dev_count):
batch_tokens = big_batch_tokens[p-stride:p]
p += stride
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
@@ -69,7 +83,12 @@ def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1])

multidev_batch_tokens.extend(batch_tokens)
multidev_mask_label.append(mask_label)
multidev_mask_pos.append(mask_pos)

return multidev_batch_tokens, multidev_mask_label, multidev_mask_pos
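With `dev_count > 1`, `mask()` now cuts the incoming batch into slices of `len(batch_tokens) // dev_count` items and accumulates one `mask_label`/`mask_pos` array per slice, so each device receives positions computed against its own sub-batch. A standalone sketch of just that slicing arithmetic (toy data, no masking logic):

```python
# Toy illustration of the per-device slicing used above; batch items are ints
# instead of token lists, and no masking is performed.
def split_for_devices(batch, dev_count):
    stride = len(batch) // dev_count
    if stride == 0:
        return None          # batch smaller than dev_count, as in mask()
    pieces, p = [], stride
    for _ in range(dev_count):
        pieces.append(batch[p - stride:p])
        p += stride
    return pieces

print(split_for_devices(list(range(10)), 2))  # [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
print(split_for_devices(list(range(10)), 3))  # trailing remainder is dropped
```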
def prepare_batch_data(insts,
@@ -83,7 +102,8 @@ def prepare_batch_data(insts,
task_id=0,
return_input_mask=True,
return_max_len=True,
return_num_token=False,
dev_count=1):
"""
1. generate Tensor of data
2. generate Tensor of position
@@ -101,7 +121,8 @@ def prepare_batch_data(insts,
vocab_size=voc_size,
CLS=cls_id,
SEP=sep_id,
MASK=mask_id,
dev_count=dev_count)
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out,
@@ -125,7 +146,7 @@ def prepare_batch_data(insts,
return_list = [
src_id, pos_id, sent_id, self_input_mask, task_ids, mask_label, mask_pos
]

return return_list

def pad_batch_data(insts,
......
@@ -29,11 +29,15 @@ import six
from io import open
from collections import namedtuple

# from . import gpu_dev_count
gpu_dev_count=1

import paddlepalm as palm
import paddlepalm.tokenizer.ernie_tokenizer as tokenization
from paddlepalm.reader.utils.batching4ernie import pad_batch_data
from paddlepalm.reader.utils.mlm_batching import prepare_batch_data

log = logging.getLogger(__name__)

if six.PY3:
@@ -49,20 +53,23 @@ def csv_reader(fd, delimiter='\t'):
return gen()

class Reader(object):
def __init__(self,
vocab_path,
label_map_config=None,
max_seq_len=512,
do_lower_case=True,
in_tokens=False,
is_inference=False,
learning_strategy='pointwise',
random_seed=None,
tokenizer="FullTokenizer",
phase='train',
is_classify=True,
is_regression=False,
for_cn=True,
task_id=0):

assert phase in ['train', 'predict'], "supported phase: train, predict."

self.max_seq_len = max_seq_len
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
@@ -72,7 +79,9 @@ class BaseReader(object):
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.in_tokens = in_tokens
self.phase = phase
self.is_inference = is_inference
self.learning_strategy = learning_strategy
self.for_cn = for_cn
self.task_id = task_id
@@ -83,7 +92,6 @@ class BaseReader(object):
self.current_example = 0
self.current_epoch = 0
self.num_examples = 0
self.examples = {}

if label_map_config:
@@ -125,33 +133,41 @@ class BaseReader(object):
else:
tokens_b.pop()

def _convert_example_to_record(self, example, max_seq_length, tokenizer):
"""Converts a single `Example` into a single `Record`."""

text_a = tokenization.convert_to_unicode(example.text_a)
tokens_a = tokenizer.tokenize(text_a)
tokens_b = None

has_text_b = False
has_text_b_neg = False
if isinstance(example, dict):
has_text_b = "text_b" in example.keys()
has_text_b_neg = "text_b_neg" in example.keys()
else:
has_text_b = "text_b" in example._fields
has_text_b_neg = "text_b_neg" in example._fields

if has_text_b:
text_b = tokenization.convert_to_unicode(example.text_b)
tokens_b = tokenizer.tokenize(text_b)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

if has_text_b_neg and self.phase == 'train':
tokens_a_neg = tokenizer.tokenize(text_a)
text_b_neg = tokenization.convert_to_unicode(example.text_b_neg)
tokens_b_neg = tokenizer.tokenize(text_b_neg)
self._truncate_seq_pair(tokens_a_neg, tokens_b_neg, max_seq_length - 3)

else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]

# The convention in BERT/ERNIE is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
@@ -173,6 +189,7 @@ class BaseReader(object):
tokens = []
text_type_ids = []
tokens.append("[CLS]")
text_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
@@ -190,6 +207,29 @@ class BaseReader(object):
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
if has_text_b_neg and self.phase == 'train':
tokens_neg = []
text_type_ids_neg = []
tokens_neg.append("[CLS]")
text_type_ids_neg.append(0)
for token in tokens_a_neg:
tokens_neg.append(token)
text_type_ids_neg.append(0)
tokens_neg.append("[SEP]")
text_type_ids_neg.append(0)
if tokens_b_neg:
for token in tokens_b_neg:
tokens_neg.append(token)
text_type_ids_neg.append(1)
tokens_neg.append("[SEP]")
text_type_ids_neg.append(1)
token_ids_neg = tokenizer.convert_tokens_to_ids(tokens_neg)
position_ids_neg = list(range(len(token_ids_neg)))
if self.is_inference:
Record = namedtuple('Record',
['token_ids', 'text_type_ids', 'position_ids'])
@@ -197,6 +237,23 @@ class BaseReader(object):
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids)
else:
qid = None
if "qid" in example._fields:
qid = example.qid
if self.learning_strategy == 'pairwise' and self.phase == 'train':
Record = namedtuple('Record',
['token_ids', 'text_type_ids', 'position_ids', 'token_ids_neg', 'text_type_ids_neg', 'position_ids_neg', 'qid'])
record = Record(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids,
token_ids_neg=token_ids_neg,
text_type_ids_neg=text_type_ids_neg,
position_ids_neg=position_ids_neg,
qid=qid)
else:
if self.label_map:
label_id = self.label_map[example.label]
@@ -207,10 +264,6 @@ class BaseReader(object):
'token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'
])

record = Record(
token_ids=token_ids,
text_type_ids=text_type_ids,
@@ -240,7 +293,7 @@ class BaseReader(object):
yield self._pad_batch_records(batch_records)
batch_records, max_len = [record], len(record.token_ids)

if phase == 'predict' and batch_records:
yield self._pad_batch_records(batch_records)

def get_num_examples(self, input_file=None, phase='train'):
@@ -283,6 +336,7 @@ class BaseReader(object):
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch

all_dev_batches = []

def f():
for i in wrapper():
@@ -299,71 +353,7 @@ class BaseReader(object):
return f
class MaskLMReader(Reader):

def _convert_example_to_record(self, example, max_seq_length, tokenizer):
"""Converts a single `Example` into a single `Record`."""
@@ -430,13 +420,6 @@ class MaskLMReader(BaseReader):
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))

return [token_ids, text_type_ids, position_ids]

def batch_reader(self, examples, batch_size, in_tokens, phase):
@@ -455,7 +438,7 @@ class MaskLMReader(BaseReader):
batch = [parsed_line]
total_token_num = len(parsed_line[0])

if len(batch) > 0 and phase == 'predict':
yield batch, total_token_num

def data_generator(self,
@@ -497,19 +480,102 @@ class MaskLMReader(BaseReader):
# max_len=self.max_seq_len,  # note: padding to the max sequence length would make mask_pos mismatch the actual positions, because mask_pos is computed from the max length within the batch.
return_input_mask=True,
return_max_len=False,
return_num_token=False,
# dev_count=gpu_dev_count)
dev_count=1)

# yield batch
for piece in palm.distribute.yield_pieces(batch_data, ['s', 's', 's', 's', 's', 'u', 'u'], batch_size):
yield piece

return wrapper

class ClassifyReader(Reader):
def _read_tsv(self, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, 'r', encoding='utf8') as f:
reader = csv_reader(f)
headers = next(reader)
text_indices = [
index for index, h in enumerate(headers) if h != "label"
]
Example = namedtuple('Example', headers)
examples = []
for line in reader:
for index, text in enumerate(line):
if index in text_indices:
if self.for_cn:
line[index] = text.replace(' ', '')
else:
line[index] = text
example = Example(*line)
examples.append(example)
return examples
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
batch_position_ids = [record.position_ids for record in batch_records]
if self.phase=='train' and self.learning_strategy == 'pairwise':
batch_token_ids_neg = [record.token_ids_neg for record in batch_records]
batch_text_type_ids_neg = [record.text_type_ids_neg for record in batch_records]
batch_position_ids_neg = [record.position_ids_neg for record in batch_records]
if not self.is_inference:
if not self.learning_strategy == 'pairwise':
batch_labels = [record.label_id for record in batch_records]
if self.is_classify:
batch_labels = np.array(batch_labels).astype("int64").reshape(
[-1])
elif self.is_regression:
batch_labels = np.array(batch_labels).astype("float32").reshape(
[-1])
if batch_records[0].qid:
batch_qids = [record.qid for record in batch_records]
batch_qids = np.array(batch_qids).astype("int64").reshape(
[-1])
else:
batch_qids = np.array([]).astype("int64").reshape([-1])
# padding
padded_token_ids, input_mask = pad_batch_data(
batch_token_ids, pad_idx=self.pad_id, return_input_mask=True)
padded_text_type_ids = pad_batch_data(
batch_text_type_ids, pad_idx=self.pad_id)
padded_position_ids = pad_batch_data(
batch_position_ids, pad_idx=self.pad_id)
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_text_type_ids, padded_position_ids,
padded_task_ids, input_mask
]
if self.phase=='train':
if self.learning_strategy == 'pairwise':
padded_token_ids_neg, input_mask_neg = pad_batch_data(
batch_token_ids_neg, pad_idx=self.pad_id, return_input_mask=True)
padded_text_type_ids_neg = pad_batch_data(
batch_text_type_ids_neg, pad_idx=self.pad_id)
padded_position_ids_neg = pad_batch_data(
batch_position_ids_neg, pad_idx=self.pad_id)
padded_task_ids_neg = np.ones_like(
padded_token_ids_neg, dtype="int64") * self.task_id
return_list += [padded_token_ids_neg, padded_text_type_ids_neg, \
padded_position_ids_neg, padded_task_ids_neg, input_mask_neg]
elif self.learning_strategy == 'pointwise':
return_list += [batch_labels]
return return_list
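Under the new `learning_strategy` switch, a pairwise training batch carries a positive and a negative candidate per query (the `*_neg` fields), while a pointwise batch carries labels instead. The sketch below spells out the two return layouts implied by the `return_list` construction above; the names are for illustration only, since the reader yields plain positional lists rather than dicts:

```python
# Field layouts implied by ClassifyReader._pad_batch_records; purely illustrative.
POINTWISE_TRAIN_FIELDS = [
    'padded_token_ids', 'padded_text_type_ids', 'padded_position_ids',
    'padded_task_ids', 'input_mask',
    'batch_labels',                      # appended when learning_strategy == 'pointwise'
]

PAIRWISE_TRAIN_FIELDS = [
    'padded_token_ids', 'padded_text_type_ids', 'padded_position_ids',
    'padded_task_ids', 'input_mask',
    # negative candidate, padded the same way as the positive one
    'padded_token_ids_neg', 'padded_text_type_ids_neg',
    'padded_position_ids_neg', 'padded_task_ids_neg', 'input_mask_neg',
]

print(len(POINTWISE_TRAIN_FIELDS), len(PAIRWISE_TRAIN_FIELDS))
```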
class SequenceLabelReader(Reader):
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
@@ -550,19 +616,7 @@ class SequenceLabelReader(BaseReader):
ret_labels.append(label)
continue

ret_labels.extend([label] * len(sub_token))

assert len(ret_tokens) == len(ret_labels)
return ret_tokens, ret_labels
@@ -581,6 +635,9 @@ class SequenceLabelReader(BaseReader):
position_ids = list(range(len(token_ids)))
text_type_ids = [0] * len(token_ids)
no_entity_id = len(self.label_map) - 1
labels = [
label if label in self.label_map else u"O" for label in labels
]
label_ids = [no_entity_id] + [
self.label_map[label] for label in labels
] + [no_entity_id]
@@ -596,7 +653,7 @@ class SequenceLabelReader(BaseReader):
return record
class ExtractEmbeddingReader(Reader):
def _pad_batch_records(self, batch_records):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
@@ -623,7 +680,7 @@ class ExtractEmbeddingReader(BaseReader):
return return_list

class MRCReader(Reader):
def __init__(self,
vocab_path,
label_map_config=None,
@@ -888,11 +945,20 @@ class MRCReader(BaseReader):
if to_append:
batch_records.append(record)
else:
# yield self._pad_batch_records(batch_records, phase == "train")
ds = ['s'] * 8
for piece in palm.distribute.yield_pieces(\
self._pad_batch_records(batch_records, phase == 'train'),
ds, batch_size):
yield piece

batch_records, max_len = [record], len(record.token_ids)

if phase == 'predict' and batch_records:
for piece in palm.distribute.yield_pieces(\
self._pad_batch_records(batch_records, phase == 'train'),
ds, batch_size):
yield piece

def _pad_batch_records(self, batch_records, is_training):
batch_token_ids = [record.token_ids for record in batch_records]
@@ -979,12 +1045,8 @@ class MRCReader(BaseReader):
for batch_data in self._prepare_batch_data(
features, batch_size, phase=phase):

yield batch_data

return wrapper
......
@@ -71,10 +71,10 @@ class TaskInstance(object):
self._train_finish = False

# dataset readers for the different running phases (train, eval, predict); keys are phase names, values are Reader instances
self._reader = {'train': None, 'eval': None, 'predict': None}
self._input_layer = None
self._inputname_to_varname = {}
self._task_layer = {'train': None, 'eval': None, 'predict': None}
self._pred_input_name_list = []
self._pred_input_varname_list = []
self._pred_fetch_name_list = []
@@ -90,7 +90,7 @@ class TaskInstance(object):
def build_task_layer(self, net_inputs, phase, scope=""):
output_vars = self._task_layer[phase].build(net_inputs, scope_name=scope)
if phase == 'predict':
if output_vars is not None:
self._pred_fetch_name_list, self._pred_fetch_var_list = zip(*output_vars.items())
else:
......
@@ -3,6 +3,7 @@ import os
import json
import yaml
from config_helper import PDConfig
import logging
from paddle import fluid

def get_basename(f):
......
@@ -16,6 +16,7 @@
import os
import sys
import random
import logging
import numpy as np
import paddle
from paddle import fluid
......