Commit 174d3fec, authored by xixiaoyao

Merge branch 'r0.3-api' of https://github.com/PaddlePaddle/PALM into r0.3-api

...@@ -50,7 +50,7 @@ cd PALM && python setup.py install
**Environment requirements**

- Python >= 2.7
- cuda >= 9.0
- cudnn >= 7.0
- PaddlePaddle >= 1.6.1 (please refer to the [installation guide](http://www.paddlepaddle.org/#quick-start) to install)
......
../../pretrain/
\ No newline at end of file
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddlepalm as palm
import sys
import argparse
# create parser
parser = argparse.ArgumentParser(prog='download_models.py', usage='python %(prog)s -l | -d <model_name> [-h]\n\nFor example,\n\tpython %(prog)s -d bert-en-uncased-large ',description = 'Download pretrain models for initializing params of backbones. ')
parser1 = parser.add_argument_group("required arguments")
parser1.add_argument('-l', '--list', action='store_true', help='show the list of available pretrain models', default=False)
parser1.add_argument('-d', '--download', action='store', help='download pretrain models. The available pretrain models can be listed by running "python download_models.py -l"')
args = parser.parse_args()

if args.list:
    palm.downloader.ls('pretrain')
elif args.download:
    print('Downloading pretrained model: ' + args.download)
    palm.downloader.download('pretrain', args.download)
else:
    parser.print_help()
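The same downloader can also be driven directly from Python instead of through this script. A minimal sketch, assuming the `paddlepalm` package is installed and using model names from the `_items['pretrain']` table below:

# --- usage sketch (illustrative, not part of the diff) ---
import paddlepalm as palm

# list the pretrained models available under the 'pretrain' scope
palm.downloader.ls('pretrain')

# fetch one of them by name, e.g. the uncased English BERT-base weights
palm.downloader.download('pretrain', 'bert-en-uncased-base')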
...@@ -34,13 +34,16 @@ _items = {
    'pretrain': {'ernie-en-uncased-large': 'https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz',
                 'bert-en-uncased-large': 'https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz',
                 'bert-en-uncased-base': 'https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz',
                 'ernie-ch-uncased-base': 'https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz',
                 'roberta-cn-base': 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz',
                 'roberta-cn-large': 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz',
                 'utils': None},
    'reader': {'utils': None},
    'backbone': {'utils': None},
    'tasktype': {'utils': None},
}

def _download(item, scope, path, silent=False, convert=False):
    data_url = _items[item][scope]
    if data_url == None:
        return

...@@ -100,9 +103,10 @@ def _download(item, scope, path, silent=False):
    os.removedirs(source_path)
    if not silent:
        print ('done!')
    if convert:
        if not silent:
            print ('Converting params...', end=" ")
        _convert(data_dir, silent)
    if not silent:
        print ('done!')
......
...@@ -29,22 +29,31 @@ from paddlepalm.backbone.base_backbone import Backbone

class BERT(Backbone):

    def __init__(self, hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
          max_position_embeddings, type_vocab_size, hidden_act, hidden_dropout_prob, \
          attention_probs_dropout_prob, initializer_range, is_pairwise=False, phase='train'):

        self._emb_size = hidden_size
        self._n_layer = num_hidden_layers
        self._n_head = num_attention_heads
        self._voc_size = vocab_size
        self._max_position_seq_len = max_position_embeddings
        self._sent_types = type_vocab_size

        self._hidden_act = hidden_act
        self._prepostprocess_dropout = hidden_dropout_prob
        self._attention_dropout = attention_probs_dropout_prob

        self._word_emb_name = "word_embedding"
        self._pos_emb_name = "pos_embedding"
        self._sent_emb_name = "sent_embedding"
        self._task_emb_name = "task_embedding"
        self._emb_dtype = "float32"
        self._phase = phase
        self._is_pairwise = is_pairwise
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=initializer_range)
    @classmethod
    def from_config(self, config, phase='train'):

...@@ -62,40 +71,57 @@ class BERT(Backbone):
            "{} is required to initialize ERNIE".format('attention_probs_dropout_prob')
        assert 'initializer_range' in config, "{} is required to initialize ERNIE".format('initializer_range')

        hidden_size = config['hidden_size']
        num_hidden_layers = config['num_hidden_layers']
        num_attention_heads = config['num_attention_heads']
        vocab_size = config['vocab_size']
        max_position_embeddings = config['max_position_embeddings']
        if 'sent_type_vocab_size' in config:
            sent_type_vocab_size = config['sent_type_vocab_size']
        else:
            sent_type_vocab_size = config['type_vocab_size']

        hidden_act = config['hidden_act']
        hidden_dropout_prob = config['hidden_dropout_prob']
        attention_probs_dropout_prob = config['attention_probs_dropout_prob']
        initializer_range = config['initializer_range']
        if 'is_pairwise' in config:
            is_pairwise = config['is_pairwise']
        else:
            is_pairwise = False

        return self(hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
              max_position_embeddings, sent_type_vocab_size, \
              hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise, phase)
    @property
    def inputs_attr(self):
        ret = {"token_ids": [[-1, -1], 'int64'],
               "position_ids": [[-1, -1], 'int64'],
               "segment_ids": [[-1, -1], 'int64'],
               "input_mask": [[-1, -1, 1], 'float32'],
               }
        if self._is_pairwise and self._phase == 'train':
            ret.update({"token_ids_neg": [[-1, -1], 'int64'],
                        "position_ids_neg": [[-1, -1], 'int64'],
                        "segment_ids_neg": [[-1, -1], 'int64'],
                        "input_mask_neg": [[-1, -1, 1], 'float32'],
                        })
        return ret
    @property
    def outputs_attr(self):
        ret = {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
               "embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
               "encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
               "sentence_embedding": [[-1, self._emb_size], 'float32'],
               "sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
        if self._is_pairwise and self._phase == 'train':
            ret.update({"word_embedding_neg": [[-1, -1, self._emb_size], 'float32'],
                        "encoder_outputs_neg": [[-1, -1, self._emb_size], 'float32'],
                        "sentence_embedding_neg": [[-1, self._emb_size], 'float32'],
                        "sentence_pair_embedding_neg": [[-1, self._emb_size], 'float32']})
        return ret
    def build(self, inputs, scope_name=""):
        src_ids = inputs['token_ids']

...@@ -104,83 +130,111 @@ class BERT(Backbone):
        input_mask = inputs['input_mask']
        self._emb_dtype = 'float32'

        input_buffer = {}
        output_buffer = {}
        input_buffer['base'] = [src_ids, pos_ids, sent_ids, input_mask]
        output_buffer['base'] = {}

        if self._is_pairwise and self._phase == 'train':
            src_ids = inputs['token_ids_neg']
            pos_ids = inputs['position_ids_neg']
            sent_ids = inputs['segment_ids_neg']
            input_mask = inputs['input_mask_neg']
            input_buffer['neg'] = [src_ids, pos_ids, sent_ids, input_mask]
            output_buffer['neg'] = {}

        for key, (src_ids, pos_ids, sent_ids, input_mask) in input_buffer.items():
            # padding id in vocabulary must be set to 0
            emb_out = fluid.embedding(
                input=src_ids,
                size=[self._voc_size, self._emb_size],
                dtype=self._emb_dtype,
                param_attr=fluid.ParamAttr(
                    name=scope_name+self._word_emb_name, initializer=self._param_initializer),
                is_sparse=False)

            # fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
            embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)

            position_emb_out = fluid.embedding(
                input=pos_ids,
                size=[self._max_position_seq_len, self._emb_size],
                dtype=self._emb_dtype,
                param_attr=fluid.ParamAttr(
                    name=scope_name+self._pos_emb_name, initializer=self._param_initializer))

            sent_emb_out = fluid.embedding(
                sent_ids,
                size=[self._sent_types, self._emb_size],
                dtype=self._emb_dtype,
                param_attr=fluid.ParamAttr(
                    name=scope_name+self._sent_emb_name, initializer=self._param_initializer))

            emb_out = emb_out + position_emb_out
            emb_out = emb_out + sent_emb_out

            emb_out = pre_process_layer(
                emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')

            self_attn_mask = fluid.layers.matmul(
                x=input_mask, y=input_mask, transpose_y=True)

            self_attn_mask = fluid.layers.scale(
                x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
            n_head_self_attn_mask = fluid.layers.stack(
                x=[self_attn_mask] * self._n_head, axis=1)
            n_head_self_attn_mask.stop_gradient = True

            enc_out = encoder(
                enc_input=emb_out,
                attn_bias=n_head_self_attn_mask,
                n_layer=self._n_layer,
                n_head=self._n_head,
                d_key=self._emb_size // self._n_head,
                d_value=self._emb_size // self._n_head,
                d_model=self._emb_size,
                d_inner_hid=self._emb_size * 4,
                prepostprocess_dropout=self._prepostprocess_dropout,
                attention_dropout=self._attention_dropout,
                relu_dropout=0,
                hidden_act=self._hidden_act,
                preprocess_cmd="",
                postprocess_cmd="dan",
                param_initializer=self._param_initializer,
                name=scope_name+'encoder')

            next_sent_feat = fluid.layers.slice(
                input=enc_out, axes=[1], starts=[0], ends=[1])
            next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
            next_sent_feat = fluid.layers.fc(
                input=next_sent_feat,
                size=self._emb_size,
                act="tanh",
                param_attr=fluid.ParamAttr(
                    name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
                bias_attr=scope_name+"pooled_fc.b_0")

            output_buffer[key]['word_embedding'] = emb_out
            output_buffer[key]['encoder_outputs'] = enc_out
            output_buffer[key]['sentence_embedding'] = next_sent_feat
            output_buffer[key]['sentence_pair_embedding'] = next_sent_feat

        ret = {}
        ret['embedding_table'] = embedding_table
        ret['word_embedding'] = output_buffer['base']['word_embedding']
        ret['encoder_outputs'] = output_buffer['base']['encoder_outputs']
        ret['sentence_embedding'] = output_buffer['base']['sentence_embedding']
        ret['sentence_pair_embedding'] = output_buffer['base']['sentence_pair_embedding']

        if self._is_pairwise and self._phase == 'train':
            ret['word_embedding_neg'] = output_buffer['neg']['word_embedding']
            ret['encoder_outputs_neg'] = output_buffer['neg']['encoder_outputs']
            ret['sentence_embedding_neg'] = output_buffer['neg']['sentence_embedding']
            ret['sentence_pair_embedding_neg'] = output_buffer['neg']['sentence_pair_embedding']

        return ret
    def postprocess(self, rt_outputs):
        pass
......
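To illustrate the new `is_pairwise` switch, here is a minimal construction sketch. The config keys mirror the asserts in `from_config`; the concrete values are typical BERT-base settings and, like the module path `paddlepalm.backbone.bert`, are assumptions for the example only:

# --- usage sketch (illustrative, not part of the diff) ---
from paddlepalm.backbone.bert import BERT

config = {
    'hidden_size': 768,
    'num_hidden_layers': 12,
    'num_attention_heads': 12,
    'vocab_size': 30522,
    'max_position_embeddings': 512,
    'type_vocab_size': 2,
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1,
    'attention_probs_dropout_prob': 0.1,
    'initializer_range': 0.02,
    'is_pairwise': True,  # adds the *_neg inputs/outputs in the train phase
}

bert = BERT.from_config(config, phase='train')
print(bert.inputs_attr.keys())  # now also lists token_ids_neg, position_ids_neg, ...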
...@@ -31,10 +31,10 @@ class ERNIE(Backbone):
    def __init__(self, hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
          max_position_embeddings, sent_type_vocab_size, task_type_vocab_size, \
          hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise=False, phase='train'):

        # self._is_training = phase == 'train'  # the backbone usually does not need to care about the phase, since its outputs rarely change across phases
        self._emb_size = hidden_size
        self._n_layer = num_hidden_layers
        self._n_head = num_attention_heads

...@@ -53,7 +53,8 @@ class ERNIE(Backbone):
        self._sent_emb_name = "sent_embedding"
        self._task_emb_name = "task_embedding"
        self._emb_dtype = "float32"
        self._is_pairwise = is_pairwise
        self._phase = phase

        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=initializer_range)
...@@ -65,7 +66,7 @@ class ERNIE(Backbone):
        assert 'vocab_size' in config, "{} is required to initialize ERNIE".format('vocab_size')
        assert 'max_position_embeddings' in config, "{} is required to initialize ERNIE".format('max_position_embeddings')
        assert 'sent_type_vocab_size' in config or 'type_vocab_size' in config, "{} is required to initialize ERNIE".format('sent_type_vocab_size')
        # assert 'task_type_vocab_size' in config, "{} is required to initialize ERNIE".format('task_type_vocab_size')
        assert 'hidden_act' in config, "{} is required to initialize ERNIE".format('hidden_act')
        assert 'hidden_dropout_prob' in config, "{} is required to initialize ERNIE".format('hidden_dropout_prob')
        assert 'attention_probs_dropout_prob' in config, "{} is required to initialize ERNIE".format('attention_probs_dropout_prob')

...@@ -80,126 +81,175 @@ class ERNIE(Backbone):
            sent_type_vocab_size = config['sent_type_vocab_size']
        else:
            sent_type_vocab_size = config['type_vocab_size']
        if 'task_type_vocab_size' in config:
            task_type_vocab_size = config['task_type_vocab_size']
        else:
            task_type_vocab_size = config['type_vocab_size']

        hidden_act = config['hidden_act']
        hidden_dropout_prob = config['hidden_dropout_prob']
        attention_probs_dropout_prob = config['attention_probs_dropout_prob']
        initializer_range = config['initializer_range']
        if 'is_pairwise' in config:
            is_pairwise = config['is_pairwise']
        else:
            is_pairwise = False

        return cls(hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
              max_position_embeddings, sent_type_vocab_size, task_type_vocab_size, \
              hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise, phase=phase)
    @property
    def inputs_attr(self):
        ret = {"token_ids": [[-1, -1], 'int64'],
               "position_ids": [[-1, -1], 'int64'],
               "segment_ids": [[-1, -1], 'int64'],
               "input_mask": [[-1, -1, 1], 'float32'],
               "task_ids": [[-1, -1], 'int64']}
        if self._is_pairwise and self._phase == 'train':
            ret.update({"token_ids_neg": [[-1, -1], 'int64'],
                        "position_ids_neg": [[-1, -1], 'int64'],
                        "segment_ids_neg": [[-1, -1], 'int64'],
                        "input_mask_neg": [[-1, -1, 1], 'float32'],
                        "task_ids_neg": [[-1, -1], 'int64']
                        })
        return ret
    @property
    def outputs_attr(self):
        ret = {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
               "embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
               "encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
               "sentence_embedding": [[-1, self._emb_size], 'float32'],
               "sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
        if self._is_pairwise and self._phase == 'train':
            ret.update({"word_embedding_neg": [[-1, -1, self._emb_size], 'float32'],
                        "encoder_outputs_neg": [[-1, -1, self._emb_size], 'float32'],
                        "sentence_embedding_neg": [[-1, self._emb_size], 'float32'],
                        "sentence_pair_embedding_neg": [[-1, self._emb_size], 'float32']})
        return ret
    def build(self, inputs, scope_name=""):

        src_ids = inputs['token_ids']
        pos_ids = inputs['position_ids']
        sent_ids = inputs['segment_ids']
        input_mask = inputs['input_mask']
        task_ids = inputs['task_ids']

        input_buffer = {}
        output_buffer = {}
        input_buffer['base'] = [src_ids, pos_ids, sent_ids, input_mask, task_ids]
        output_buffer['base'] = {}

        if self._is_pairwise and self._phase == 'train':
            src_ids = inputs['token_ids_neg']
            pos_ids = inputs['position_ids_neg']
            sent_ids = inputs['segment_ids_neg']
            input_mask = inputs['input_mask_neg']
            task_ids = inputs['task_ids_neg']
            input_buffer['neg'] = [src_ids, pos_ids, sent_ids, input_mask, task_ids]
            output_buffer['neg'] = {}

        for key, (src_ids, pos_ids, sent_ids, input_mask, task_ids) in input_buffer.items():
            # padding id in vocabulary must be set to 0
            emb_out = fluid.embedding(
                input=src_ids,
                size=[self._voc_size, self._emb_size],
                dtype=self._emb_dtype,
                param_attr=fluid.ParamAttr(
                    name=scope_name+self._word_emb_name, initializer=self._param_initializer),
                is_sparse=False)

            # fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
            embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)

            position_emb_out = fluid.embedding(
                input=pos_ids,
                size=[self._max_position_seq_len, self._emb_size],
                dtype=self._emb_dtype,
                param_attr=fluid.ParamAttr(
                    name=scope_name+self._pos_emb_name, initializer=self._param_initializer))

            sent_emb_out = fluid.embedding(
                sent_ids,
                size=[self._sent_types, self._emb_size],
                dtype=self._emb_dtype,
                param_attr=fluid.ParamAttr(
                    name=scope_name+self._sent_emb_name, initializer=self._param_initializer))

            emb_out = emb_out + position_emb_out
            emb_out = emb_out + sent_emb_out

            task_emb_out = fluid.embedding(
                task_ids,
                size=[self._task_types, self._emb_size],
                dtype=self._emb_dtype,
                param_attr=fluid.ParamAttr(
                    name=scope_name+self._task_emb_name,
                    initializer=self._param_initializer))

            emb_out = emb_out + task_emb_out

            emb_out = pre_process_layer(
                emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')

            self_attn_mask = fluid.layers.matmul(
                x=input_mask, y=input_mask, transpose_y=True)

            self_attn_mask = fluid.layers.scale(
                x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
            n_head_self_attn_mask = fluid.layers.stack(
                x=[self_attn_mask] * self._n_head, axis=1)
            n_head_self_attn_mask.stop_gradient = True

            enc_out = encoder(
                enc_input=emb_out,
                attn_bias=n_head_self_attn_mask,
                n_layer=self._n_layer,
                n_head=self._n_head,
                d_key=self._emb_size // self._n_head,
                d_value=self._emb_size // self._n_head,
                d_model=self._emb_size,
                d_inner_hid=self._emb_size * 4,
                prepostprocess_dropout=self._prepostprocess_dropout,
                attention_dropout=self._attention_dropout,
                relu_dropout=0,
                hidden_act=self._hidden_act,
                preprocess_cmd="",
                postprocess_cmd="dan",
                param_initializer=self._param_initializer,
                name=scope_name+'encoder')

            next_sent_feat = fluid.layers.slice(
                input=enc_out, axes=[1], starts=[0], ends=[1])
            next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
            next_sent_feat = fluid.layers.fc(
                input=next_sent_feat,
                size=self._emb_size,
                act="tanh",
                param_attr=fluid.ParamAttr(
                    name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
                bias_attr=scope_name+"pooled_fc.b_0")

            output_buffer[key]['word_embedding'] = emb_out
            output_buffer[key]['encoder_outputs'] = enc_out
            output_buffer[key]['sentence_embedding'] = next_sent_feat
            output_buffer[key]['sentence_pair_embedding'] = next_sent_feat

        ret = {}
        ret['embedding_table'] = embedding_table
        ret['word_embedding'] = output_buffer['base']['word_embedding']
        ret['encoder_outputs'] = output_buffer['base']['encoder_outputs']
        ret['sentence_embedding'] = output_buffer['base']['sentence_embedding']
        ret['sentence_pair_embedding'] = output_buffer['base']['sentence_pair_embedding']

        if self._is_pairwise and self._phase == 'train':
            ret['word_embedding_neg'] = output_buffer['neg']['word_embedding']
            ret['encoder_outputs_neg'] = output_buffer['neg']['encoder_outputs']
            ret['sentence_embedding_neg'] = output_buffer['neg']['sentence_embedding']
            ret['sentence_pair_embedding_neg'] = output_buffer['neg']['sentence_pair_embedding']

        return ret
    def postprocess(self, rt_outputs):
        pass
......
...@@ -24,7 +24,6 @@ def yield_pieces(data, distribute_strategy, batch_size):
        assert isinstance(data, list), "the input data must be a list or dict, and contained with multiple tensors."
        data_list = data
        ds_list = distribute_strategy
    stride = batch_size // dev_count
    p = stride
    # while p < len(data_list) + stride:

...@@ -57,7 +56,8 @@ def yield_pieces(data, distribute_strategy, batch_size):
            # print(len(temp))
        yield temp


def data_feeder(reader, postprocess_fn=None, prefetch_steps=2, phase='train'):
    if postprocess_fn is None:
        def postprocess_fn(batch):
            return batch
......
from cls import Classify
from match import Match
from ner import SequenceLabel
from mrc import MRC
from mlm import MaskLM
...@@ -13,41 +13,66 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.fluid as fluid
from paddle.fluid import layers
from paddlepalm.head.base_head import Head
import numpy as np
import os
import json

def computeHingeLoss(pos, neg, margin):
loss_part1 = fluid.layers.elementwise_sub(
fluid.layers.fill_constant_batch_size_like(
input=pos, shape=[-1, 1], value=margin, dtype='float32'), pos)
loss_part2 = fluid.layers.elementwise_add(loss_part1, neg)
loss_part3 = fluid.layers.elementwise_max(
fluid.layers.fill_constant_batch_size_like(
input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'), loss_part2)
return loss_part3
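`computeHingeLoss` above is the standard pairwise hinge loss max(0, margin - pos + neg), assembled from elementwise fluid ops. A rough numpy rendering of the same arithmetic, for intuition only:

# --- numpy sketch of the hinge loss (illustrative, not part of the diff) ---
import numpy as np

def compute_hinge_loss_np(pos, neg, margin):
    # pos, neg: [batch_size, 1] matching scores of positive and negative pairs
    return np.maximum(0.0, margin - pos + neg)

# a well-ordered pair stops contributing loss once pos exceeds neg by the margin
print(compute_hinge_loss_np(np.array([[0.9]]), np.array([[0.2]]), margin=0.5))  # [[0.]]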

class Match(Head):
    '''
    matching
    '''

    def __init__(self, num_classes, input_dim, dropout_prob=0.0, param_initializer_range=0.02, \
                 learning_strategy='pointwise', margin=0.5, phase='train'):
        """
        Args:
            phase: train, eval, pred
            lang: en, ch, ...
            learning_strategy: pointwise, pairwise
        """

        self._is_training = phase == 'train'
        self._hidden_size = input_dim
        self._num_classes = num_classes

        self._dropout_prob = dropout_prob if phase == 'train' else 0.0
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=param_initializer_range)
        self._learning_strategy = learning_strategy
        self._margin = margin

        self._preds = []
        self._preds_logits = []
    @property
    def inputs_attrs(self):
        reader = {}
        bb = {"sentence_pair_embedding": [[-1, self._hidden_size], 'float32']}
        if self._is_training:
            if self._learning_strategy == 'pointwise':
                reader["label_ids"] = [[-1], 'int64']
            elif self._learning_strategy == 'pairwise':
                bb["sentence_pair_embedding_neg"] = [[-1, self._hidden_size], 'float32']
        return {'reader': reader, 'backbone': bb}
    @property

...@@ -55,51 +80,110 @@ class TaskParadigm(task_paradigm):
        if self._is_training:
            return {"loss": [[1], 'float32']}
        else:
            if self._learning_strategy == 'pairwise':
                return {"probs": [[-1, 1], 'float32']}
            else:
                return {"logits": [[-1, 2], 'float32'],
                        "probs": [[-1, 2], 'float32']}
    def build(self, inputs, scope_name=""):

        # inputs
        cls_feats = inputs["backbone"]["sentence_pair_embedding"]
        if self._is_training:
            cls_feats = fluid.layers.dropout(
                x=cls_feats,
                dropout_prob=self._dropout_prob,
                dropout_implementation="upscale_in_train")
            if self._learning_strategy == 'pairwise':
                cls_feats_neg = inputs["backbone"]["sentence_pair_embedding_neg"]
                cls_feats_neg = fluid.layers.dropout(
                    x=cls_feats_neg,
                    dropout_prob=self._dropout_prob,
                    dropout_implementation="upscale_in_train")
            elif self._learning_strategy == 'pointwise':
                labels = inputs["reader"]["label_ids"]

        # loss
        # for pointwise
        if self._learning_strategy == 'pointwise':
            logits = fluid.layers.fc(
                input=cls_feats,
                size=self._num_classes,
                param_attr=fluid.ParamAttr(
                    name=scope_name+"cls_out_w",
                    initializer=self._param_initializer),
                bias_attr=fluid.ParamAttr(
                    name=scope_name+"cls_out_b",
                    initializer=fluid.initializer.Constant(0.)))
            probs = fluid.layers.softmax(logits)
            if self._is_training:
                ce_loss = fluid.layers.cross_entropy(
                    input=probs, label=labels)
                loss = fluid.layers.mean(x=ce_loss)
                return {'loss': loss}
            # for pred
            else:
                return {'logits': logits,
                        'probs': probs}

        # for pairwise
        elif self._learning_strategy == 'pairwise':
            pos_score = fluid.layers.fc(
                input=cls_feats,
                size=1,
                act="sigmoid",
                param_attr=fluid.ParamAttr(
                    name=scope_name+"cls_out_w_pr",
                    initializer=self._param_initializer),
                bias_attr=fluid.ParamAttr(
                    name=scope_name+"cls_out_b_pr",
                    initializer=fluid.initializer.Constant(0.)))
            pos_score = fluid.layers.reshape(x=pos_score, shape=[-1, 1], inplace=True)

            if self._is_training:
                neg_score = fluid.layers.fc(
                    input=cls_feats_neg,
                    size=1,
                    act="sigmoid",
                    param_attr=fluid.ParamAttr(
                        name=scope_name+"cls_out_w_pr",
                        initializer=self._param_initializer),
                    bias_attr=fluid.ParamAttr(
                        name=scope_name+"cls_out_b_pr",
                        initializer=fluid.initializer.Constant(0.)))
                neg_score = fluid.layers.reshape(x=neg_score, shape=[-1, 1], inplace=True)
                loss = fluid.layers.mean(computeHingeLoss(pos_score, neg_score, self._margin))
                return {'loss': loss}
            # for pred
            else:
                return {'probs': pos_score}
    def batch_postprocess(self, rt_outputs):
        if not self._is_training:
            probs = []
            logits = []
            probs = rt_outputs['probs']
            self._preds.extend(probs.tolist())
            if self._learning_strategy == 'pointwise':
                logits = rt_outputs['logits']
                self._preds_logits.extend(logits.tolist())

    def epoch_postprocess(self, post_inputs, output_dir=None):
        # there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs
        if not self._is_training:
            if output_dir is None:
                raise ValueError('argument output_dir not found in config. Please add it into config dict/file.')
            with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer:
                for i in range(len(self._preds)):
                    if self._learning_strategy == 'pointwise':
                        label = 0 if self._preds[i][0] > self._preds[i][1] else 1
                        result = {'index': i, 'label': label, 'logits': self._preds_logits[i], 'probs': self._preds[i]}
                    elif self._learning_strategy == 'pairwise':
                        label = 0 if self._preds[i][0] < 0.5 else 1
                        result = {'index': i, 'label': label, 'probs': self._preds[i][0]}
                    result = json.dumps(result)
                    writer.write(result+'\n')
            print('Predictions saved at '+os.path.join(output_dir, 'predictions.json'))
\ No newline at end of file
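A minimal sketch of constructing the rewritten head, assuming the module path `paddlepalm.head.match`; the 768-dim input and the other values are illustrative only:

# --- usage sketch (illustrative, not part of the diff) ---
from paddlepalm.head.match import Match

# pointwise: binary classification over the pooled sentence-pair embedding
pointwise_head = Match(num_classes=2, input_dim=768, dropout_prob=0.1,
                       learning_strategy='pointwise', phase='train')

# pairwise: hinge-loss ranking; needs a backbone built with is_pairwise=True
pairwise_head = Match(num_classes=2, input_dim=768, dropout_prob=0.1,
                      learning_strategy='pairwise', margin=0.5, phase='train')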
...@@ -14,30 +14,39 @@
# limitations under the License.

import paddle.fluid as fluid
from paddlepalm.head.base_head import Head
from paddle.fluid import layers
import numpy as np
import os
from paddlepalm.backbone.utils.transformer import pre_process_layer


class MaskLM(Head):
    '''
    mlm
    '''

    def __init__(self, input_dim, vocab_size, hidden_act, initializer_range, dropout_prob=0.0, \
                 param_initializer_range=0.02, phase='train'):

        self._is_training = phase == 'train'
        self._emb_size = input_dim
        self._hidden_size = input_dim
        self._dropout_prob = dropout_prob if phase == 'train' else 0.0
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=param_initializer_range)
        self._preds = []

        self._vocab_size = vocab_size
        self._hidden_act = hidden_act
        self._initializer_range = initializer_range
    @property
    def inputs_attrs(self):
        reader = {
            "token_ids": [[-1, -1], 'int64'],
            "mask_label": [[-1], 'int64'],
            "mask_pos": [[-1], 'int64'],
            }
        if not self._is_training:
            del reader['mask_label']
            del reader['batchsize_x_seqlen']
        bb = {
            "encoder_outputs": [[-1, -1, self._hidden_size], 'float32'],
            "embedding_table": [[-1, self._vocab_size, self._emb_size], 'float32']}

...@@ -54,7 +63,13 @@ class TaskParadigm(task_paradigm):
        mask_pos = inputs["reader"]["mask_pos"]
        if self._is_training:
            mask_label = inputs["reader"]["mask_label"]
            l1 = fluid.layers.shape(inputs["reader"]["token_ids"])[0]
            # bxs = inputs["reader"]["token_ids"].shape[2].value
            l2 = fluid.layers.shape(inputs["reader"]["token_ids"][0])[0]
            bxs = (l1*l2).astype(np.int64)
            # max_position = inputs["reader"]["batchsize_x_seqlen"] - 1
            max_position = bxs - 1
            mask_pos = fluid.layers.elementwise_min(mask_pos, max_position)
            mask_pos.stop_gradient = True

...@@ -100,11 +115,31 @@ class TaskParadigm(task_paradigm):
            is_bias=True)

        if self._is_training:
            inputs = fluid.layers.softmax(fc_out)
            mask_lm_loss = fluid.layers.cross_entropy(
                input=inputs, label=mask_label)
            loss = fluid.layers.mean(mask_lm_loss)
            return {'loss': loss}
        else:
            return {'logits': fc_out}
def batch_postprocess(self, rt_outputs):
if not self._is_training:
logits = rt_outputs['logits']
preds = np.argmax(logits, -1)
self._preds.extend(preds.tolist())
return preds
def epoch_postprocess(self, post_inputs, output_dir=None):
# there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs
if not self._is_training:
if output_dir is None:
for p in self._preds:
print(p)
else:
with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer:
for p in self._preds:
writer.write(str(p)+'\n')
print('Predictions saved at '+os.path.join(output_dir, 'predictions.json'))
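The rewritten train branch above derives `max_position` from the runtime shape of `token_ids` (batch_size * seq_len - 1) and clips the flattened mask positions to it. A small numpy illustration of that bound, for intuition only:

# --- numpy sketch of the mask_pos clipping (illustrative, not part of the diff) ---
import numpy as np

batch_size, seq_len = 4, 128
mask_pos = np.array([3, 130, 600], dtype=np.int64)  # positions into the flattened [batch*seqlen] axis
max_position = batch_size * seq_len - 1             # 511
print(np.minimum(mask_pos, max_position))           # [  3 130 511]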
...@@ -14,7 +14,7 @@
# limitations under the License.

import paddle.fluid as fluid
from paddlepalm.head.base_head import Head
import collections
import numpy as np
import os

...@@ -26,34 +26,37 @@ import json

RawResult = collections.namedtuple("RawResult",
    ["unique_id", "start_logits", "end_logits"])


class MRC(Head):
    """
    Machine Reading Comprehension
    """

    def __init__(self, max_query_len, input_dim, pred_output_path=None, verbose=False, with_negative=False, do_lower_case=False, max_ans_len=None, null_score_diff_threshold=0.0, n_best_size=20, phase='train'):

        self._is_training = phase == 'train'
        self._hidden_size = input_dim
        self._max_sequence_length = max_query_len
        self._pred_results = []

        output_dir = pred_output_path
        self._max_answer_length = max_ans_len
        self._null_score_diff_threshold = null_score_diff_threshold
        self._n_best_size = n_best_size
        output_dir = pred_output_path
        self._verbose = verbose
        self._with_negative = with_negative
        self._do_lower_case = do_lower_case
    @property
    def inputs_attrs(self):
        if self._is_training:
            reader = {"start_positions": [[-1], 'int64'],
                      "end_positions": [[-1], 'int64'],
                      }
        else:
            reader = {'unique_ids': [[-1], 'int64']}
        bb = {"encoder_outputs": [[-1, -1, self._hidden_size], 'float32']}
        return {'reader': reader, 'backbone': bb}

...@@ -70,21 +73,26 @@ class TaskParadigm(task_paradigm):
        else:
            return {'start_logits': [[-1, -1, 1], 'float32'],
                    'end_logits': [[-1, -1, 1], 'float32'],
                    'unique_ids': [[-1], 'int64']}

    def build(self, inputs, scope_name=""):
        if self._is_training:
            start_positions = inputs['reader']['start_positions']
            end_positions = inputs['reader']['end_positions']
            # max_position = inputs["reader"]["seqlen"] - 1
            # start_positions = fluid.layers.elementwise_min(start_positions, max_position)
            # end_positions = fluid.layers.elementwise_min(end_positions, max_position)
            start_positions.stop_gradient = True
            end_positions.stop_gradient = True
        else:
            unique_id = inputs['reader']['unique_ids']
            # It's used to help fetch variable 'unique_ids' that will be removed in the future
            helper_constant = fluid.layers.fill_constant(shape=[1], value=1, dtype='int64')
            fluid.layers.elementwise_mul(unique_id, helper_constant)

        enc_out = inputs['backbone']['encoder_outputs']
        logits = fluid.layers.fc(
            input=enc_out,

...@@ -100,9 +108,11 @@ class TaskParadigm(task_paradigm):
        start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)

        def _compute_single_loss(logits, positions):
            """Compute start/end loss for mrc model"""
            inputs = fluid.layers.softmax(logits)
            loss = fluid.layers.cross_entropy(
                input=inputs, label=positions)
            loss = fluid.layers.mean(x=loss)
            return loss

...@@ -117,10 +127,10 @@ class TaskParadigm(task_paradigm):
                'unique_ids': unique_id}

    def batch_postprocess(self, rt_outputs):
        """this func will be called after each step(batch) of training/evaluating/predicting process."""
        if not self._is_training:
            unique_ids = rt_outputs['unique_ids']
            start_logits = rt_outputs['start_logits']
            end_logits = rt_outputs['end_logits']
            for idx in range(len(unique_ids)):

...@@ -139,19 +149,19 @@ class TaskParadigm(task_paradigm):
                    start_logits=s,
                    end_logits=e))

    def epoch_postprocess(self, post_inputs, output_dir=None):
        """(optional interface) this func will be called after evaluation/predicting process and each epoch during training process."""
        if not self._is_training:
            if output_dir is None:
                raise ValueError('argument output_dir not found in config. Please add it into config dict/file.')
            examples = post_inputs['reader']['examples']
            features = post_inputs['reader']['features']
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            output_prediction_file = os.path.join(output_dir, "predictions.json")
            output_nbest_file = os.path.join(output_dir, "nbest_predictions.json")
            output_null_log_odds_file = os.path.join(output_dir, "null_odds.json")
            _write_predictions(examples, features, self._pred_results,
                               self._n_best_size, self._max_answer_length,
                               self._do_lower_case, output_prediction_file,

...@@ -194,8 +204,9 @@ def _write_predictions(all_examples, all_features, all_results, n_best_size,
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score
        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
......
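A short construction sketch for the new MRC head, assuming the module path `paddlepalm.head.mrc`; the sizes are illustrative only:

# --- usage sketch (illustrative, not part of the diff) ---
from paddlepalm.head.mrc import MRC

mrc_head = MRC(max_query_len=64, input_dim=768, max_ans_len=30,
               n_best_size=20, do_lower_case=True, phase='predict')
print(mrc_head.inputs_attrs['reader'])  # {'unique_ids': [[-1], 'int64']} outside of training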
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddle.fluid import layers
from paddlepalm.head.base_head import Head
import numpy as np
import os
import math
class SequenceLabel(Head):
'''
Sequence label
'''
def __init__(self, num_classes, input_dim, dropout_prob=0.0, learning_rate=1e-3, \
param_initializer_range=0.02, phase='train'):
"""
Args:
phase: train, eval, pred
lang: en, ch, ...
"""
self._is_training = phase == 'train'
self._hidden_size = input_dim
self.num_classes = num_classes
self._dropout_prob = dropout_prob if phase == 'train' else 0.0
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=param_initializer_range)
self.learning_rate = learning_rate
self._preds = []
@property
def inputs_attrs(self):
reader = {}
bb = {"encoder_outputs": [[-1, -1, -1], 'float32']}
if self._is_training:
reader["label_ids"] = [[-1, -1], 'int64']
reader["seq_lens"] = [[-1], 'int64']
return {'reader': reader, 'backbone': bb}
@property
def outputs_attrs(self):
if self._is_training:
return {'loss': [[1], 'float32']}
else:
return {'emission': [[-1, self.num_classes], 'float32']}
def build(self, inputs, scope_name=''):
token_emb = inputs['backbone']['encoder_outputs']
if self._is_training:
label_ids = inputs['reader']['label_ids']
seq_lens = inputs['reader']['seq_lens']
emission = fluid.layers.fc(
size=self.num_classes,
input=token_emb,
param_attr=fluid.ParamAttr(
initializer=self._param_initializer,
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
bias_attr=fluid.ParamAttr(
name=scope_name+"cls_out_b", initializer=fluid.initializer.Constant(0.)),
num_flatten_dims=2)
if self._is_training:
# compute loss
crf_cost = fluid.layers.linear_chain_crf(
input=emission,
label=label_ids,
param_attr=fluid.ParamAttr(
name=scope_name+'crfw', learning_rate=self.learning_rate),
length=seq_lens)
avg_cost = fluid.layers.mean(x=crf_cost)
crf_decode = fluid.layers.crf_decoding(
input=emission,
param_attr=fluid.ParamAttr(name=scope_name+'crfw'),
length=seq_lens)
(precision, recall, f1_score, num_infer_chunks, num_label_chunks,
num_correct_chunks) = fluid.layers.chunk_eval(
input=crf_decode,
label=label_ids,
chunk_scheme="IOB",
num_chunk_types=int(math.ceil((self.num_classes - 1) / 2.0)),
seq_length=seq_lens)
chunk_evaluator = fluid.metrics.ChunkEvaluator()
chunk_evaluator.reset()
return {"loss": avg_cost}
else:
return {"emission": emission}
def batch_postprocess(self, rt_outputs):
if not self._is_training:
emission = rt_outputs['emission']
preds = np.argmax(emission, -1)
self._preds.extend(preds.tolist())
def epoch_postprocess(self, post_inputs, output_dir=None):
# there is no post_inputs needed and not declared in epoch_inputs_attrs, hence no elements exist in post_inputs
if not self._is_training:
if output_dir is None:
raise ValueError('argument output_dir not found in config. Please add it into config dict/file.')
with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer:
for p in self._preds:
writer.write(str(p)+'\n')
print('Predictions saved at '+os.path.join(output_dir, 'predictions.json'))
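The SequenceLabel head above puts a linear-chain CRF on top of the encoder outputs; note that chunk_eval derives the number of chunk types as ceil((num_classes - 1) / 2) under the IOB scheme. A construction sketch, assuming the module path `paddlepalm.head.ner`; values are illustrative only:

# --- usage sketch (illustrative, not part of the diff) ---
from paddlepalm.head.ner import SequenceLabel

# 7 classes = 3 chunk types * 2 (B-/I-) + 1 for O, matching the IOB setup in chunk_eval
seq_label_head = SequenceLabel(num_classes=7, input_dim=768,
                               dropout_prob=0.1, learning_rate=1e-3, phase='train')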
from cls import ClassifyReader
from match import MatchReader
from ner import SequenceLabelReader
from mrc import MrcReader
from mlm import MaskLMReader
...@@ -42,7 +42,6 @@ class Reader(object):
        self._register.add(attr_name)

    def register_with(self, backbone):
        for attr in backbone.inputs_attr:
            self.require_attr(attr)
        self._registered_backbone = backbone
......
...@@ -13,87 +13,104 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import ClassifyReader as CLSReader


class MatchReader(Reader):

    def __init__(self, vocab_path, max_len, tokenizer='wordpiece', lang='en', seed=None, \
                 do_lower_case=False, learning_strategy='pointwise', phase='train', dev_count=1, print_prefix=''):  # add more arguments here as needed
        """
        Args:
            phase: train, eval, pred
            lang: en, ch, ...
            learning_strategy: pointwise, pairwise
        """

        Reader.__init__(self, phase)

        assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
        assert phase in ['train', 'predict'], "supported phase: train, predict."

        for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'

        self._register.add('token_ids')
        if phase == 'train':
            if learning_strategy == 'pointwise':
                self._register.add('label_ids')
            if learning_strategy == 'pairwise':
                self._register.add('token_ids_neg')
                self._register.add('position_ids_neg')
                self._register.add('segment_ids_neg')
                self._register.add('input_mask_neg')
                self._register.add('task_ids_neg')

        self._is_training = phase == 'train'
        self._learning_strategy = learning_strategy

        match_reader = CLSReader(vocab_path,
                                 max_seq_len=max_len,
                                 do_lower_case=do_lower_case,
                                 for_cn=for_cn,
                                 random_seed=seed,
                                 learning_strategy=learning_strategy)

        self._reader = match_reader
        self._dev_count = dev_count
        self._phase = phase

    @property
    def outputs_attr(self):
        attrs = {"token_ids": [[-1, -1], 'int64'],
                 "position_ids": [[-1, -1], 'int64'],
                 "segment_ids": [[-1, -1], 'int64'],
                 "input_mask": [[-1, -1, 1], 'float32'],
                 "task_ids": [[-1, -1], 'int64'],
                 "label_ids": [[-1], 'int64'],
                 "token_ids_neg": [[-1, -1], 'int64'],
                 "position_ids_neg": [[-1, -1], 'int64'],
                 "segment_ids_neg": [[-1, -1], 'int64'],
                 "input_mask_neg": [[-1, -1, 1], 'float32'],
                 "task_ids_neg": [[-1, -1], 'int64']
                 }
        return self._get_registed_attrs(attrs)
}
def load_data(self, input_file, batch_size, num_epochs=None, \
file_format='tsv', shuffle_train=True):
def load_data(self): self._batch_size = batch_size
self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase) self._num_epochs = num_epochs
self._data_generator = self._reader.data_generator( \
def iterator(self): input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
shuffle=shuffle_train if self._phase == 'train' else False, \
def list_to_dict(x): phase=self._phase)
names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
'label_ids', 'unique_ids'] def _iterator(self):
outputs = {n: i for n,i in zip(names, x)}
del outputs['unique_ids']
if not self._is_training: names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', 'label_ids', \
del outputs['label_ids'] 'token_ids_neg', 'segment_ids_neg', 'position_ids_neg', 'task_ids_neg', 'input_mask_neg']
return outputs
if self._learning_strategy == 'pairwise':
names.remove('label_ids')
for batch in self._data_generator(): for batch in self._data_generator():
yield list_to_dict(batch) outputs = {n: i for n,i in zip(names, batch)}
ret = {}
# TODO: move runtime shape check here
for attr in self.outputs_attr.keys():
ret[attr] = outputs[attr]
yield ret
@property @property
def num_examples(self): def num_examples(self):
return self._reader.get_num_examples(phase=self._phase) return self._reader.get_num_examples(phase=self._phase)
@property
def num_epochs(self):
return self._num_epochs
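A hedged usage sketch for the MatchReader above, assuming it is exposed through the reader package __init__ shown earlier; the vocab path, data file and batch size are hypothetical. With learning_strategy='pairwise' the reader registers the negative-example fields (token_ids_neg, position_ids_neg, ...) instead of label_ids:

from paddlepalm.reader import MatchReader   # assumed export, per the __init__.py change above

match_reader = MatchReader('pretrain/ernie-en-uncased-large/vocab.txt',   # hypothetical path
                           max_len=128,
                           learning_strategy='pairwise',
                           phase='train')
match_reader.load_data('data/match/train.tsv', batch_size=32, num_epochs=2)
# The trainer drives match_reader._iterator(); each yielded dict carries exactly the keys
# listed in outputs_attr for the registered fields (here the *_neg fields, no label_ids).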
...@@ -13,79 +13,79 @@ ...@@ -13,79 +13,79 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddlepalm.interface import reader from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import MaskLMReader from paddlepalm.reader.utils.reader4ernie import MaskLMReader as MLMReader
import numpy as np import numpy as np
class Reader(reader): class MaskLMReader(Reader):
def __init__(self, config, phase='train', dev_count=1, print_prefix=''): def __init__(self, vocab_path, max_len, tokenizer='wordpiece', \
lang='en', seed=None, do_lower_case=False, phase='train', dev_count=1, print_prefix=''):
""" """
Args: Args:
phase: train, eval, pred phase: train, eval, pred
""" """
self._is_training = phase == 'train'
reader = MaskLMReader(config['vocab_path'], Reader.__init__(self, phase)
max_seq_len=config['max_seq_len'],
do_lower_case=config.get('do_lower_case', False),
for_cn=config.get('for_cn', False),
random_seed=config.get('seed', None))
self._reader = reader
self._dev_count = dev_count
self._batch_size = config['batch_size'] assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
self._max_seq_len = config['max_seq_len'] assert phase in ['train', 'predict'], "supported phase: train, predict."
for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'
self._register.add('token_ids')
self._register.add('mask_pos')
if phase == 'train': if phase == 'train':
self._input_file = config['train_file'] self._register.add('mask_label')
self._num_epochs = None # 防止iteartor终止 self._is_training = phase == 'train'
self._shuffle = config.get('shuffle', True)
self._shuffle_buffer = config.get('shuffle_buffer', 5000) mlm_reader = MLMReader(vocab_path,
elif phase == 'eval': max_seq_len=max_len,
self._input_file = config['dev_file'] do_lower_case=do_lower_case,
self._num_epochs = 1 for_cn=for_cn,
self._shuffle = False random_seed=seed)
self._batch_size = config.get('pred_batch_size', self._batch_size) self._reader = mlm_reader
elif phase == 'pred':
self._input_file = config['pred_file']
self._num_epochs = 1
self._shuffle = False
self._batch_size = config.get('pred_batch_size', self._batch_size)
self._phase = phase self._phase = phase
# self._batch_size = self._dev_count = dev_count
self._print_first_n = config.get('print_first_n', 1)
@property @property
def outputs_attr(self): def outputs_attr(self):
return {"token_ids": [[-1, -1], 'int64'], attrs = {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'], "position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'], "segment_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'], "input_mask": [[-1, -1, 1], 'float32'],
"task_ids": [[-1, -1], 'int64'], "task_ids": [[-1, -1], 'int64'],
"mask_label": [[-1], 'int64'], "mask_label": [[-1], 'int64'],
"mask_pos": [[-1], 'int64'], "mask_pos": [[-1], 'int64']
} }
return self._get_registed_attrs(attrs)
def load_data(self):
self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
def iterator(self): def load_data(self, input_file, batch_size, num_epochs=None, \
file_format='csv', shuffle_train=True):
self._batch_size = batch_size
self._num_epochs = num_epochs
self._data_generator = self._reader.data_generator( \
input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
shuffle=shuffle_train if self._phase == 'train' else False, \
phase=self._phase)
def list_to_dict(x): def _iterator(self):
names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask',
'task_ids', 'mask_label', 'mask_pos']
outputs = {n: i for n,i in zip(names, x)}
# outputs['batchsize_x_seqlen'] = [self._batch_size * len(outputs['token_ids'][0]) - 1]
return outputs
names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask',
'task_ids', 'mask_label', 'mask_pos']
for batch in self._data_generator(): for batch in self._data_generator():
# print(np.shape(list_to_dict(batch)['token_ids'])) outputs = {n: i for n,i in zip(names, batch)}
# print(list_to_dict(batch)['mask_label'].tolist()) ret = {}
yield list_to_dict(batch) # TODO: move runtime shape check here
for attr in self.outputs_attr.keys():
ret[attr] = outputs[attr]
yield ret
def get_epoch_outputs(self): def get_epoch_outputs(self):
return {'examples': self._reader.get_examples(self._phase), return {'examples': self._reader.get_examples(self._phase),
...@@ -95,3 +95,7 @@ class Reader(reader): ...@@ -95,3 +95,7 @@ class Reader(reader):
def num_examples(self): def num_examples(self):
return self._reader.get_num_examples(phase=self._phase) return self._reader.get_num_examples(phase=self._phase)
@property
def num_epochs(self):
return self._num_epochs
...@@ -13,77 +13,66 @@ ...@@ -13,77 +13,66 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddlepalm.interface import reader from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import MRCReader from paddlepalm.reader.utils.reader4ernie import MRCReader
import numpy as np
class Reader(reader): class MrcReader(Reader):
def __init__(self, config, phase='train', dev_count=1, print_prefix=''): def __init__(self, vocab_path, max_len, max_query_len, doc_stride, tokenizer='FullTokenizer', lang='en', seed=None, do_lower_case=False, \
remove_noanswer=True, phase='train', dev_count=1, print_prefix=''):
""" """
Args: Args:
phase: train, eval, pred phase: train, eval, pred
""" lang: en, ch, ...
"""
self._is_training = phase == 'train' Reader.__init__(self, phase)
reader = MRCReader(config['vocab_path'],
max_seq_len=config['max_seq_len'],
do_lower_case=config.get('do_lower_case', False),
tokenizer='FullTokenizer',
for_cn=config.get('for_cn', False),
doc_stride=config['doc_stride'],
remove_noanswer=config.get('remove_noanswer', True),
max_query_length=config['max_query_len'],
random_seed=config.get('seed', None))
self._reader = reader
self._dev_count = dev_count
self._batch_size = config['batch_size'] assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
self._max_seq_len = config['max_seq_len'] assert phase in ['train', 'predict'], "supported phase: train, predict."
for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'
self._register.add('token_ids')
if phase == 'train': if phase == 'train':
self._input_file = config['train_file'] self._register.add("start_positions")
# self._num_epochs = config['num_epochs'] self._register.add("end_positions")
self._num_epochs = None # 防止iteartor终止 else:
self._shuffle = config.get('shuffle', True) self._register.add("unique_ids")
self._shuffle_buffer = config.get('shuffle_buffer', 5000)
if phase == 'eval':
self._input_file = config['dev_file']
self._num_epochs = 1
self._shuffle = False
self._batch_size = config.get('pred_batch_size', self._batch_size)
elif phase == 'pred':
self._input_file = config['pred_file']
self._num_epochs = 1
self._shuffle = False
self._batch_size = config.get('pred_batch_size', self._batch_size)
self._phase = phase self._is_training = phase == 'train'
# self._batch_size =
self._print_first_n = config.get('print_first_n', 1)
# TODO: without slide window version mrc_reader = MRCReader(vocab_path,
self._with_slide_window = config.get('with_slide_window', False) max_seq_len=max_len,
do_lower_case=do_lower_case,
tokenizer=tokenizer,
doc_stride=doc_stride,
remove_noanswer=remove_noanswer,
max_query_length=max_query_len,
for_cn=for_cn,
random_seed=seed)
self._reader = mrc_reader
self._phase = phase
self._dev_count = dev_count
@property @property
def outputs_attr(self): def outputs_attr(self):
if self._is_training: attrs = {"token_ids": [[-1, -1], 'int64'],
return {"token_ids": [[-1, -1], 'int64'], "position_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'], "segment_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'], "input_mask": [[-1, -1, 1], 'float32'],
"input_mask": [[-1, -1, 1], 'float32'], "start_positions": [[-1], 'int64'],
"start_positions": [[-1], 'int64'], "end_positions": [[-1], 'int64'],
"end_positions": [[-1], 'int64'], "task_ids": [[-1, -1], 'int64'],
"task_ids": [[-1, -1], 'int64'] "unique_ids": [[-1], 'int64']
} }
else: return self._get_registed_attrs(attrs)
return {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"task_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"unique_ids": [[-1], 'int64']
}
@property @property
def epoch_outputs_attr(self): def epoch_outputs_attr(self):
...@@ -91,26 +80,34 @@ class Reader(reader): ...@@ -91,26 +80,34 @@ class Reader(reader):
return {"examples": None, return {"examples": None,
"features": None} "features": None}
def load_data(self): def load_data(self, input_file, batch_size, num_epochs=None, file_format='csv', shuffle_train=True):
self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase) self._batch_size = batch_size
self._num_epochs = num_epochs
def iterator(self): self._data_generator = self._reader.data_generator( \
input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
def list_to_dict(x): shuffle=shuffle_train if self._phase == 'train' else False, \
names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', phase=self._phase)
'start_positions', 'end_positions', 'unique_ids'] def _iterator(self):
outputs = {n: i for n,i in zip(names, x)}
if self._is_training: names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
del outputs['unique_ids'] 'start_positions', 'end_positions', 'unique_ids']
else:
del outputs['start_positions'] if self._is_training:
del outputs['end_positions'] names.remove('unique_ids')
return outputs
for batch in self._data_generator(): for batch in self._data_generator():
yield list_to_dict(batch) outputs = {n: i for n,i in zip(names, batch)}
ret = {}
# TODO: move runtime shape check here
for attr in self.outputs_attr.keys():
ret[attr] = outputs[attr]
if not self._is_training:
assert 'unique_ids' in ret, ret
yield ret
def get_epoch_outputs(self): def get_epoch_outputs(self):
return {'examples': self._reader.get_examples(self._phase), return {'examples': self._reader.get_examples(self._phase),
'features': self._reader.get_features(self._phase)} 'features': self._reader.get_features(self._phase)}
...@@ -118,3 +115,7 @@ class Reader(reader): ...@@ -118,3 +115,7 @@ class Reader(reader):
def num_examples(self): def num_examples(self):
return self._reader.get_num_examples(phase=self._phase) return self._reader.get_num_examples(phase=self._phase)
@property
def num_epochs(self):
return self._num_epochs
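A similar hedged sketch for the MrcReader above (paths and sizes are hypothetical). doc_stride controls the sliding window over passages longer than max_len, and at phase='predict' the reader registers unique_ids so each feature can be matched back to the examples/features returned by get_epoch_outputs() when writing answers:

from paddlepalm.reader import MrcReader   # assumed export, per the __init__.py change above

mrc_reader = MrcReader('pretrain/vocab.txt',       # hypothetical path
                       max_len=512,
                       max_query_len=64,
                       doc_stride=128,
                       phase='predict')
mrc_reader.load_data('data/mrc/dev.json', batch_size=8)   # one epoch, no shuffling at predict time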
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import SequenceLabelReader as SLReader
class SequenceLabelReader(Reader):
def __init__(self, vocab_path, max_len, label_map_config, tokenizer='wordpiece', \
lang='en', seed=None, do_lower_case=False, phase='train', dev_count=1, print_prefix=''):
"""
Args:
phase: train, predict
lang: en, cn, ...
"""
Reader.__init__(self, phase)
assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
assert phase in ['train', 'predict'], "supported phase: train, predict."
for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'
self._register.add('token_ids')
self._register.add('seq_lens')
if phase == 'train':
self._register.add('label_ids')
self._is_training = phase == 'train'
ner_reader = SLReader(vocab_path,
max_seq_len=max_len,
do_lower_case=do_lower_case,
for_cn=for_cn,
random_seed=seed,
label_map_config=label_map_config)
self._reader = ner_reader
self._phase = phase
self._dev_count = dev_count
@property
def outputs_attr(self):
attrs = {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"task_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"seq_lens": [[-1], 'int64'],
"label_ids": [[-1, -1], 'int64']}
return self._get_registed_attrs(attrs)
def load_data(self, input_file, batch_size, num_epochs=None, \
file_format='tsv', shuffle_train=True):
self._batch_size = batch_size
self._num_epochs = num_epochs
self._data_generator = self._reader.data_generator( \
input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
shuffle=shuffle_train if self._phase == 'train' else False, \
phase=self._phase)
def _iterator(self):
names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
'label_ids', 'seq_lens']
for batch in self._data_generator():
outputs = {n: i for n,i in zip(names, batch)}
ret = {}
# TODO: move runtime shape check here
for attr in self.outputs_attr.keys():
ret[attr] = outputs[attr]
yield ret
def get_epoch_outputs(self):
return {'examples': self._reader.get_examples(self._phase),
'features': self._reader.get_features(self._phase)}
@property
def num_examples(self):
return self._reader.get_num_examples(phase=self._phase)
@property
def num_epochs(self):
return self._num_epochs
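For completeness, a hedged usage sketch of the new SequenceLabelReader (vocab, label map and data paths are hypothetical). At phase='predict' the reader never registers label_ids, and load_data always runs a single pass without shuffling, which matches the CRF head earlier in this commit needing only the token embeddings and seq_lens at inference:

from paddlepalm.reader import SequenceLabelReader   # assumed export, per the __init__.py change above

ner_reader = SequenceLabelReader('pretrain/vocab.txt',               # hypothetical path
                                 max_len=128,
                                 label_map_config='data/ner/label_map.json',
                                 lang='en',
                                 phase='predict')
ner_reader.load_data('data/ner/test.tsv', batch_size=16)
print(ner_reader.outputs_attr)   # 'label_ids' is absent: it is only registered when phase == 'train'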
...@@ -19,57 +19,76 @@ from __future__ import print_function ...@@ -19,57 +19,76 @@ from __future__ import print_function
import numpy as np import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3): def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3, dev_count=1):
""" """
Add mask for batch_tokens, return out, mask_label, mask_pos; Add mask for batch_tokens, return out, mask_label, mask_pos;
Note: mask_pos responding the batch_tokens after padded; Note: mask_pos responding the batch_tokens after padded;
""" """
max_len = max([len(sent) for sent in batch_tokens]) max_len = max([len(sent) for sent in batch_tokens])
mask_label = []
mask_pos = [] multidev_batch_tokens = []
prob_mask = np.random.rand(total_token_num) multidev_mask_label = []
# Note: the first token is [CLS], so [low=1] multidev_mask_pos = []
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0 big_batch_tokens = batch_tokens
prob_index = 0 stride = len(batch_tokens) // dev_count
for sent_index, sent in enumerate(batch_tokens): if stride == 0:
mask_flag = False return None, None, None
prob_index += pre_sent_len p = stride
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index] for i in range(dev_count):
if prob > 0.15: batch_tokens = big_batch_tokens[p-stride:p]
continue p += stride
elif 0.03 < prob <= 0.15: mask_label = []
# mask mask_pos = []
if token != SEP and token != CLS: prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so [low=1]
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
prob_index += pre_sent_len
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index]
if prob > 0.15:
continue
elif 0.03 < prob <= 0.15:
# mask
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03:
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
# ensure at least mask one word in a sentence
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index]) mask_label.append(sent[token_index])
sent[token_index] = MASK sent[token_index] = MASK
mask_flag = True mask_flag = True
mask_pos.append(sent_index * max_len + token_index) mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03: mask_label = np.array(mask_label).astype("int64").reshape([-1])
# random replace mask_pos = np.array(mask_pos).astype("int64").reshape([-1])
if token != SEP and token != CLS:
mask_label.append(sent[token_index]) multidev_batch_tokens.extend(batch_tokens)
sent[token_index] = replace_ids[prob_index + token_index] multidev_mask_label.append(mask_label)
mask_flag = True multidev_mask_pos.append(mask_pos)
mask_pos.append(sent_index * max_len + token_index)
else: return multidev_batch_tokens, multidev_mask_label, multidev_mask_pos
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
# ensure at least mask one word in a sentence
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1])
return batch_tokens, mask_label, mask_pos
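The rewritten mask() keeps the original per-token scheme and only adds the split across dev_count devices. Each token draws a uniform number: above 0.15 it is left untouched; in (0.03, 0.15] it becomes [MASK] (12% of tokens); in (0.015, 0.03] it is replaced by a random vocabulary id (1.5%); at or below 0.015 it is kept but still predicted (1.5%), i.e. the usual 80/10/10 split inside the 15% selection. A standalone numpy sketch of just that band logic (editor's illustration):

import numpy as np

def masking_band(prob):
    """Return the action mask() takes for one uniform draw."""
    if prob > 0.15:
        return 'untouched'        # ~85%: not a prediction target
    elif prob > 0.03:
        return 'mask'             # ~12%: token replaced by [MASK]
    elif prob > 0.015:
        return 'random_replace'   # ~1.5%: token replaced by a random vocab id
    else:
        return 'keep_original'    # ~1.5%: token kept but still predicted

draws = np.random.rand(100000)
bands, counts = np.unique([masking_band(p) for p in draws], return_counts=True)
print(dict(zip(bands, counts)))   # roughly 85000 / 12000 / 1500 / 1500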
def prepare_batch_data(insts, def prepare_batch_data(insts,
...@@ -83,7 +102,8 @@ def prepare_batch_data(insts, ...@@ -83,7 +102,8 @@ def prepare_batch_data(insts,
task_id=0, task_id=0,
return_input_mask=True, return_input_mask=True,
return_max_len=True, return_max_len=True,
return_num_token=False): return_num_token=False,
dev_count=1):
""" """
1. generate Tensor of data 1. generate Tensor of data
2. generate Tensor of position 2. generate Tensor of position
...@@ -101,7 +121,8 @@ def prepare_batch_data(insts, ...@@ -101,7 +121,8 @@ def prepare_batch_data(insts,
vocab_size=voc_size, vocab_size=voc_size,
CLS=cls_id, CLS=cls_id,
SEP=sep_id, SEP=sep_id,
MASK=mask_id) MASK=mask_id,
dev_count=dev_count)
# Second step: padding # Second step: padding
src_id, self_input_mask = pad_batch_data( src_id, self_input_mask = pad_batch_data(
out, out,
...@@ -125,7 +146,7 @@ def prepare_batch_data(insts, ...@@ -125,7 +146,7 @@ def prepare_batch_data(insts,
return_list = [ return_list = [
src_id, pos_id, sent_id, self_input_mask, task_ids, mask_label, mask_pos src_id, pos_id, sent_id, self_input_mask, task_ids, mask_label, mask_pos
] ]
return return_list if len(return_list) > 1 else return_list[0] return return_list
def pad_batch_data(insts, def pad_batch_data(insts,
......
...@@ -71,10 +71,10 @@ class TaskInstance(object): ...@@ -71,10 +71,10 @@ class TaskInstance(object):
self._train_finish = False self._train_finish = False
# holds the dataset reader for each run phase (train, eval, pred); key is the phase, value is a Reader instance # holds the dataset reader for each run phase (train, eval, pred); key is the phase, value is a Reader instance
self._reader = {'train': None, 'eval': None, 'pred': None} self._reader = {'train': None, 'eval': None, 'predict': None}
self._input_layer = None self._input_layer = None
self._inputname_to_varname = {} self._inputname_to_varname = {}
self._task_layer = {'train': None, 'eval': None, 'pred': None} self._task_layer = {'train': None, 'eval': None, 'predict': None}
self._pred_input_name_list = [] self._pred_input_name_list = []
self._pred_input_varname_list = [] self._pred_input_varname_list = []
self._pred_fetch_name_list = [] self._pred_fetch_name_list = []
...@@ -90,7 +90,7 @@ class TaskInstance(object): ...@@ -90,7 +90,7 @@ class TaskInstance(object):
def build_task_layer(self, net_inputs, phase, scope=""): def build_task_layer(self, net_inputs, phase, scope=""):
output_vars = self._task_layer[phase].build(net_inputs, scope_name=scope) output_vars = self._task_layer[phase].build(net_inputs, scope_name=scope)
if phase == 'pred': if phase == 'predict':
if output_vars is not None: if output_vars is not None:
self._pred_fetch_name_list, self._pred_fetch_var_list = zip(*output_vars.items()) self._pred_fetch_name_list, self._pred_fetch_var_list = zip(*output_vars.items())
else: else:
......
...@@ -3,6 +3,7 @@ import os ...@@ -3,6 +3,7 @@ import os
import json import json
import yaml import yaml
from config_helper import PDConfig from config_helper import PDConfig
import logging
from paddle import fluid from paddle import fluid
def get_basename(f): def get_basename(f):
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
import os import os
import sys import sys
import random import random
import logging
import numpy as np import numpy as np
import paddle import paddle
from paddle import fluid from paddle import fluid
......