Unverified commit 3cf78f84, authored by Xiaoyao Xi, committed by GitHub

Merge pull request #48 from wangxiao1021/adapt components to APIs

adapt components to APIs
......@@ -50,7 +50,7 @@ cd PALM && python setup.py install
**Environment Requirements**
- Python >= 2.7 (Python 3 support coming soon)
- Python >= 2.7
- cuda >= 9.0
- cudnn >= 7.0
- PaddlePaddle >= 1.6.1 (see the [installation guide](http://www.paddlepaddle.org/#quick-start) for setup instructions)
......
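A quick way to confirm that the PaddlePaddle requirement above is satisfied (a minimal sketch, not part of the repository):

import paddle

installed = tuple(int(x) for x in paddle.__version__.split('.')[:3])
assert installed >= (1, 6, 1), 'PaddlePaddle >= 1.6.1 is required, found ' + paddle.__version__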
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""v1.1
BERT model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import fluid
from paddle.fluid import layers
from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
from paddlepalm.interface import backbone
class Model(backbone):
def __init__(self, config, phase):
        # self._is_training = phase == 'train'  # the backbone generally does not need to care about the run phase, since its outputs stay essentially the same in every phase
self._emb_size = config["hidden_size"]
self._n_layer = config["num_hidden_layers"]
self._n_head = config["num_attention_heads"]
self._voc_size = config["vocab_size"]
self._max_position_seq_len = config["max_position_embeddings"]
self._sent_types = config["type_vocab_size"]
self._hidden_act = config["hidden_act"]
self._prepostprocess_dropout = config["hidden_dropout_prob"]
self._attention_dropout = config["attention_probs_dropout_prob"]
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
        # Initialize all weights with a truncated normal initializer; all biases
        # are initialized to zero by default.
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config["initializer_range"])
@property
def inputs_attr(self):
return {"token_ids": [[-1, -1, 1], 'int64'],
"position_ids": [[-1, -1, 1], 'int64'],
"segment_ids": [[-1, -1, 1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32']}
@property
def outputs_attr(self):
return {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
"embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
"encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
"sentence_embedding": [[-1, self._emb_size], 'float32'],
"sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
def build(self, inputs, scope_name=""):
src_ids = inputs['token_ids']
pos_ids = inputs['position_ids']
sent_ids = inputs['segment_ids']
input_mask = inputs['input_mask']
self._emb_dtype = 'float32'
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
# fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
position_emb_out = fluid.layers.embedding(
input=pos_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.layers.embedding(
sent_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._sent_emb_name, initializer=self._param_initializer))
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
emb_out = pre_process_layer(
emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_out = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
name=scope_name+'encoder')
next_sent_feat = fluid.layers.slice(
input=enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
bias_attr=scope_name+"pooled_fc.b_0")
return {'embedding_table': embedding_table,
'word_embedding': emb_out,
'encoder_outputs': enc_out,
'sentence_embedding': next_sent_feat,
'sentence_pair_embedding': next_sent_feat}
def postprocess(self, rt_outputs):
pass
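The self-attention mask built above converts the float input_mask into an additive attention bias: matmul(input_mask, input_mask^T) is 1 for pairs of real tokens and 0 when either position is padding, and scale(..., scale=10000.0, bias=-1.0, bias_after_scale=False) maps those values to 0 and -10000. A small numpy sketch of the same arithmetic (illustrative values, not part of the repository):

import numpy as np

# one sequence of length 4 with one padded position, shaped [batch, seq_len, 1]
input_mask = np.array([[[1.], [1.], [1.], [0.]]], dtype='float32')

pair_mask = np.matmul(input_mask, input_mask.transpose(0, 2, 1))  # [batch, seq_len, seq_len]
attn_bias = (pair_mask - 1.0) * 10000.0  # bias_after_scale=False: out = scale * (x + bias)

print(attn_bias[0])  # 0 for real-token pairs, -10000 wherever the padded token is involved;
                     # the backbone then stacks this bias n_head times along axis 1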
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ernie model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from paddle import fluid
from paddle.fluid import layers
from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
from paddlepalm.interface import backbone
class Model(backbone):
def __init__(self,
config,
phase):
        # self._is_training = phase == 'train'  # the backbone generally does not need to care about the run phase, since its outputs stay essentially the same in every phase
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
if config['sent_type_vocab_size']:
self._sent_types = config['sent_type_vocab_size']
else:
self._sent_types = config['type_vocab_size']
self._task_types = config['task_type_vocab_size']
self._hidden_act = config['hidden_act']
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._task_emb_name = "task_embedding"
self._emb_dtype = "float32"
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
@property
def inputs_attr(self):
return {"token_ids": [[-1, -1, 1], 'int64'],
"position_ids": [[-1, -1, 1], 'int64'],
"segment_ids": [[-1, -1, 1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"task_ids": [[-1,-1, 1], 'int64']}
@property
def outputs_attr(self):
return {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
"embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
"encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
"sentence_embedding": [[-1, self._emb_size], 'float32'],
"sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
def build(self, inputs, scope_name=""):
src_ids = inputs['token_ids']
pos_ids = inputs['position_ids']
sent_ids = inputs['segment_ids']
input_mask = inputs['input_mask']
task_ids = inputs['task_ids']
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
# fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
position_emb_out = fluid.layers.embedding(
input=pos_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.layers.embedding(
sent_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._sent_emb_name, initializer=self._param_initializer))
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
task_emb_out = fluid.layers.embedding(
task_ids,
size=[self._task_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._task_emb_name,
initializer=self._param_initializer))
emb_out = emb_out + task_emb_out
emb_out = pre_process_layer(
emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_out = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
name=scope_name+'encoder')
next_sent_feat = fluid.layers.slice(
input=enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
bias_attr=scope_name+"pooled_fc.b_0")
return {'embedding_table': embedding_table,
'word_embedding': emb_out,
'encoder_outputs': enc_out,
'sentence_embedding': next_sent_feat,
'sentence_pair_embedding': next_sent_feat}
def postprocess(self, rt_outputs):
pass
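Relative to the BERT backbone above, this ERNIE variant additionally embeds task_ids through a task_type_vocab_size-sized table and prefers sent_type_vocab_size over type_vocab_size. A sketch of the config dict this __init__ reads; the numeric values are illustrative (roughly ERNIE-base scale) and not taken from this commit:

ernie_config = {
    'hidden_size': 768,
    'num_hidden_layers': 12,
    'num_attention_heads': 12,
    'vocab_size': 18000,
    'max_position_embeddings': 513,
    'sent_type_vocab_size': 4,       # when falsy, the code falls back to 'type_vocab_size'
    'type_vocab_size': 4,
    'task_type_vocab_size': 16,
    'hidden_act': 'relu',
    'hidden_dropout_prob': 0.1,
    'attention_probs_dropout_prob': 0.1,
    'initializer_range': 0.02,
}
model = Model(ernie_config, phase='train')  # the class defined directly above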
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from functools import partial
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layer_helper import LayerHelper as LayerHelper
from functools import reduce # py3
def layer_norm(x, begin_norm_axis=1, epsilon=1e-6, param_attr=None, bias_attr=None):
helper = LayerHelper('layer_norm', **locals())
mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
variance = layers.reduce_mean(layers.square(shift_x), dim=begin_norm_axis, keep_dim=True)
r_stdev = layers.rsqrt(variance + epsilon)
norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)
param_shape = [reduce(lambda x, y: x * y, norm_x.shape[begin_norm_axis:])]
param_dtype = norm_x.dtype
scale = helper.create_parameter(
attr=param_attr,
shape=param_shape,
dtype=param_dtype,
default_initializer=fluid.initializer.Constant(1.))
bias = helper.create_parameter(
attr=bias_attr,
shape=param_shape,
dtype=param_dtype,
is_bias=True,
default_initializer=fluid.initializer.Constant(0.))
out = layers.elementwise_mul(x=norm_x, y=scale, axis=-1)
out = layers.elementwise_add(x=out, y=bias, axis=-1)
return out
def multi_head_attention(queries,
keys,
values,
attn_bias,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
param_initializer=None,
name='multi_head_att'):
"""
    Multi-Head Attention. Note that attn_bias is added to the logits before
    computing the softmax activation, masking out selected positions so that
    they are not considered in the attention weights.
"""
keys = queries if keys is None else keys
values = keys if values is None else values
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
raise ValueError(
"Inputs: quries, keys and values should all be 3-D tensors.")
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(input=queries,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_query_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_query_fc.b_0')
k = layers.fc(input=keys,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_key_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_key_fc.b_0')
v = layers.fc(input=values,
size=d_value * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_value_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_value_fc.b_0')
return q, k, v
def __split_heads(x, n_head):
"""
        Reshape the last dimension of the input tensor x into two dimensions
        and then transpose. Specifically, given an input tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim], output a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
hidden_size = x.shape[-1]
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
reshaped = layers.reshape(
x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
        # permute the dimensions into:
# [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
        Transpose and then reshape the last two dimensions of the input tensor x
        so that it becomes one dimension, which is the reverse of __split_heads.
"""
if len(x.shape) == 3: return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
return layers.reshape(
x=trans_x,
shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
inplace=True)
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
"""
Scaled Dot-Product Attention
"""
scaled_q = layers.scale(x=q, scale=d_key**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if dropout_rate:
weights = layers.dropout(
weights,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.matmul(weights, v)
return out
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
if cache is not None: # use cache and concat time steps
# Since the inplace reshape in __split_heads changes the shape of k and
# v, which is the cache input for next time step, reshape the cache
# input from the previous time step first.
k = cache["k"] = layers.concat(
[layers.reshape(
cache["k"], shape=[0, 0, d_model]), k], axis=1)
v = cache["v"] = layers.concat(
[layers.reshape(
cache["v"], shape=[0, 0, d_model]), v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
dropout_rate)
out = __combine_heads(ctx_multiheads)
# Project back to the model size.
proj_out = layers.fc(input=out,
size=d_model,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_output_fc.w_0',
initializer=param_initializer),
bias_attr=name + '_output_fc.b_0')
return proj_out
def positionwise_feed_forward(x,
d_inner_hid,
d_hid,
dropout_rate,
hidden_act,
param_initializer=None,
name='ffn'):
"""
Position-wise Feed-Forward Networks.
This module consists of two linear transformations with a ReLU activation
in between, which is applied to each position separately and identically.
"""
hidden = layers.fc(input=x,
size=d_inner_hid,
num_flatten_dims=2,
act=hidden_act,
param_attr=fluid.ParamAttr(
name=name + '_fc_0.w_0',
initializer=param_initializer),
bias_attr=name + '_fc_0.b_0')
if dropout_rate:
hidden = layers.dropout(
hidden,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = layers.fc(input=hidden,
size=d_hid,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name=name + '_fc_1.w_0', initializer=param_initializer),
bias_attr=name + '_fc_1.b_0')
return out
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
name=''):
"""
    Add residual connection, layer normalization and dropout to the out tensor
    optionally, according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise
feed-forward networks.
"""
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out else out
elif cmd == "n": # add layer normalization
out_dtype = out.dtype
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float32")
out = layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + '_layer_norm_scale',
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
name=name + '_layer_norm_bias',
initializer=fluid.initializer.Constant(0.)))
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float16")
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out,
dropout_prob=dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
return out
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
def encoder_layer(enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""The encoder layers that can be stacked to form a deep encoder.
This module consits of a multi-head (self) attention followed by
position-wise feed-forward networks and both the two components companied
with the post_process_layer to add residual connection, layer normalization
and droput.
"""
attn_output = multi_head_attention(
pre_process_layer(
enc_input,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_att'),
None,
None,
attn_bias,
d_key,
d_value,
d_model,
n_head,
attention_dropout,
param_initializer=param_initializer,
name=name + '_multi_head_att')
attn_output = post_process_layer(
enc_input,
attn_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_att')
ffd_output = positionwise_feed_forward(
pre_process_layer(
attn_output,
preprocess_cmd,
prepostprocess_dropout,
name=name + '_pre_ffn'),
d_inner_hid,
d_model,
relu_dropout,
hidden_act,
param_initializer=param_initializer,
name=name + '_ffn')
return post_process_layer(
attn_output,
ffd_output,
postprocess_cmd,
prepostprocess_dropout,
name=name + '_post_ffn')
def encoder(enc_input,
attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=''):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
for i in range(n_layer):
enc_output = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + '_layer_' + str(i))
enc_input = enc_output
enc_output = pre_process_layer(
enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
return enc_output
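The one-character process_cmd strings drive pre_post_process_layer above: 'a' adds the residual, 'n' applies layer normalization, 'd' applies dropout, executed left to right. The BERT/ERNIE backbones in this change call the encoder with preprocess_cmd="" and postprocess_cmd="dan", and wrap the summed embeddings with pre_process_layer(..., 'nd', ...). A tiny sketch spelling out one command string (illustration only):

def describe(process_cmd):
    """Spell out what pre_post_process_layer does for a given command string."""
    steps = {'a': 'out = out + prev_out   (residual connection)',
             'n': 'out = layer_norm(out)  (layer normalization)',
             'd': 'out = dropout(out)     (dropout)'}
    return [steps[c] for c in process_cmd]

for step in describe('dan'):   # the postprocess_cmd used after attention and FFN above
    print(step)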
../../pretrain/
\ No newline at end of file
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddlepalm as palm
import sys
import argparse
# create parser
parser = argparse.ArgumentParser(prog='download_models.py', usage='python %(prog)s -l | -d <model_name> [-h]\n\nFor example,\n\tpython %(prog)s -d bert-en-uncased-large ',description = 'Download pretrain models for initializing params of backbones. ')
parser1= parser.add_argument_group("required arguments")
parser1.add_argument('-l','--list', action = 'store_true', help = 'show the list of available pretrain models', default = False)
parser1.add_argument('-d','--download', action = 'store', help = 'download pretrain models. The available pretrain models can be listed by running "python download_models.py -l"')
args = parser.parse_args()
if(args.list):
palm.downloader.ls('pretrain')
elif(args.download):
print('download~~~')
print(args.download)
palm.downloader.download('pretrain', args.download)
else:
print (parser.parse_args(['-h']))
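Besides the command line shown in the usage string, the same downloader calls used by this script can be issued directly (a sketch using only the functions invoked above):

import paddlepalm as palm

palm.downloader.ls('pretrain')                                 # list available pretrain models
palm.downloader.download('pretrain', 'bert-en-uncased-base')   # fetch one of the listed models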
......@@ -34,13 +34,16 @@ _items = {
'pretrain': {'ernie-en-uncased-large': 'https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz',
'bert-en-uncased-large': 'https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz',
'bert-en-uncased-base': 'https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz',
'ernie-ch-uncased-base':'https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz',
'roberta-cn-base': 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz',
'roberta-cn-large': 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz',
'utils': None},
'reader': {'utils': None},
'backbone': {'utils': None},
'tasktype': {'utils': None},
}
def _download(item, scope, path, silent=False):
def _download(item, scope, path, silent=False, convert=False):
data_url = _items[item][scope]
if data_url == None:
return
......@@ -100,9 +103,10 @@ def _download(item, scope, path, silent=False):
os.removedirs(source_path)
if not silent:
print ('done!')
if not silent:
print ('Converting params...', end=" ")
_convert(data_dir, silent)
if convert:
if not silent:
print ('Converting params...', end=" ")
_convert(data_dir, silent)
if not silent:
print ('done!')
......
......@@ -15,7 +15,7 @@
"""v1.1"""
class BaseBackbone(object):
class Backbone(object):
"""interface of backbone model."""
def __init__(self, config, phase):
......
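A minimal skeleton of a user-defined backbone against the renamed Backbone interface, mirroring the methods the BERT/ERNIE implementations below provide; the input/output keys and shapes are placeholders:

from paddlepalm.backbone.base_backbone import Backbone

class MyBackbone(Backbone):
    """Sketch only: the dict keys and shapes below are placeholders."""
    def __init__(self, config, phase):
        self._emb_size = config['hidden_size']

    @property
    def inputs_attr(self):
        return {"token_ids": [[-1, -1], 'int64']}

    @property
    def outputs_attr(self):
        return {"encoder_outputs": [[-1, -1, self._emb_size], 'float32']}

    def build(self, inputs, scope_name=""):
        # consume the fields declared in inputs_attr and return a dict
        # whose keys/shapes match outputs_attr
        raise NotImplementedError

    def postprocess(self, rt_outputs):
        pass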
......@@ -23,28 +23,37 @@ from paddle import fluid
from paddle.fluid import layers
from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
from paddlepalm.backbone.base_backbone import BaseBackbone
from paddlepalm.backbone.base_backbone import Backbone
class BERT(BaseBackbone):
class BERT(Backbone):
def __init__(hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
def __init__(self, hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
max_position_embeddings, type_vocab_size, hidden_act, hidden_dropout_prob, \
attention_probs_dropout_prob, initializer_range, phase='train'):
config = {}
config['hidden_size'] = hidden_size
config['num_hidden_layers'] = num_hidden_layers
config['num_attention_heads'] = num_attention_heads
config['vocab_size'] = vocab_size
config['max_position_embeddings'] = max_position_embeddings
config['type_vocab_size'] = sent_type_vocab_size
config['hidden_act'] = hidden_act
config['hidden_dropout_prob'] = hidden_dropout_prob
config['attention_probs_dropout_prob'] = attention_probs_dropout_prob
config['initializer_range'] = initializer_range
self.from_config(config, phase=phase)
attention_probs_dropout_prob, initializer_range, is_pairwise=False, phase='train'):
self._emb_size = hidden_size
self._n_layer = num_hidden_layers
self._n_head = num_attention_heads
self._voc_size = vocab_size
self._max_position_seq_len = max_position_embeddings
self._sent_types = type_vocab_size
self._hidden_act = hidden_act
self._prepostprocess_dropout = hidden_dropout_prob
self._attention_dropout = attention_probs_dropout_prob
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._task_emb_name = "task_embedding"
self._emb_dtype = "float32"
self._phase = phase
self._is_pairwise = is_pairwise
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=initializer_range)
@classmethod
def from_config(self, config, phase='train'):
......@@ -62,40 +71,57 @@ class BERT(BaseBackbone):
"{} is required to initialize ERNIE".format('attention_probs_dropout_prob')
assert 'initializer_range' in config, "{} is required to initialize ERNIE".format('initializer_range')
        # self._is_training = phase == 'train'  # the backbone generally does not need to care about the run phase, since its outputs stay essentially the same in every phase
self._emb_size = config["hidden_size"]
self._n_layer = config["num_hidden_layers"]
self._n_head = config["num_attention_heads"]
self._voc_size = config["vocab_size"]
self._max_position_seq_len = config["max_position_embeddings"]
self._sent_types = config["type_vocab_size"]
self._hidden_act = config["hidden_act"]
self._prepostprocess_dropout = config["hidden_dropout_prob"]
self._attention_dropout = config["attention_probs_dropout_prob"]
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
        # Initialize all weights with a truncated normal initializer; all biases
        # are initialized to zero by default.
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config["initializer_range"])
hidden_size = config['hidden_size']
num_hidden_layers = config['num_hidden_layers']
num_attention_heads = config['num_attention_heads']
vocab_size = config['vocab_size']
max_position_embeddings = config['max_position_embeddings']
if 'sent_type_vocab_size' in config:
sent_type_vocab_size = config['sent_type_vocab_size']
else:
sent_type_vocab_size = config['type_vocab_size']
hidden_act = config['hidden_act']
hidden_dropout_prob = config['hidden_dropout_prob']
attention_probs_dropout_prob = config['attention_probs_dropout_prob']
initializer_range = config['initializer_range']
if 'is_pairwise' in config:
is_pairwise = config['is_pairwise']
else:
is_pairwise = False
return self(hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
max_position_embeddings, sent_type_vocab_size, \
hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise, phase)
@property
def inputs_attr(self):
return {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32']}
ret = {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
}
if self._is_pairwise and self._phase=='train':
ret.update({"token_ids_neg": [[-1, -1], 'int64'],
"position_ids_neg": [[-1, -1], 'int64'],
"segment_ids_neg": [[-1, -1], 'int64'],
"input_mask_neg": [[-1, -1, 1], 'float32'],
})
return ret
@property
def outputs_attr(self):
return {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
"embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
"encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
"sentence_embedding": [[-1, self._emb_size], 'float32'],
"sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
ret = {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
"embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
"encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
"sentence_embedding": [[-1, self._emb_size], 'float32'],
"sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
if self._is_pairwise and self._phase == 'train':
ret.update({"word_embedding_neg": [[-1, -1, self._emb_size], 'float32'],
"encoder_outputs_neg": [[-1, -1, self._emb_size], 'float32'],
"sentence_embedding_neg": [[-1, self._emb_size], 'float32'],
"sentence_pair_embedding_neg": [[-1, self._emb_size], 'float32']})
return ret
def build(self, inputs, scope_name=""):
src_ids = inputs['token_ids']
......@@ -104,83 +130,111 @@ class BERT(BaseBackbone):
input_mask = inputs['input_mask']
self._emb_dtype = 'float32'
# padding id in vocabulary must be set to 0
emb_out = fluid.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
# fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
position_emb_out = fluid.embedding(
input=pos_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.embedding(
sent_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._sent_emb_name, initializer=self._param_initializer))
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
emb_out = pre_process_layer(
emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_out = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
name=scope_name+'encoder')
input_buffer = {}
output_buffer = {}
input_buffer['base'] = [src_ids, pos_ids, sent_ids, input_mask]
output_buffer['base'] = {}
if self._is_pairwise and self._phase =='train':
src_ids = inputs['token_ids_neg']
pos_ids = inputs['position_ids_neg']
sent_ids = inputs['segment_ids_neg']
input_mask = inputs['input_mask_neg']
input_buffer['neg'] = [src_ids, pos_ids, sent_ids, input_mask]
output_buffer['neg'] = {}
next_sent_feat = fluid.layers.slice(
input=enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
bias_attr=scope_name+"pooled_fc.b_0")
return {'embedding_table': embedding_table,
'word_embedding': emb_out,
'encoder_outputs': enc_out,
'sentence_embedding': next_sent_feat,
'sentence_pair_embedding': next_sent_feat}
for key, (src_ids, pos_ids, sent_ids, input_mask) in input_buffer.items():
# padding id in vocabulary must be set to 0
emb_out = fluid.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
# fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
position_emb_out = fluid.embedding(
input=pos_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.embedding(
sent_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._sent_emb_name, initializer=self._param_initializer))
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
emb_out = pre_process_layer(
emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_out = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
name=scope_name+'encoder')
next_sent_feat = fluid.layers.slice(
input=enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
bias_attr=scope_name+"pooled_fc.b_0")
output_buffer[key]['word_embedding'] = emb_out
output_buffer[key]['encoder_outputs'] = enc_out
output_buffer[key]['sentence_embedding'] = next_sent_feat
output_buffer[key]['sentence_pair_embedding'] = next_sent_feat
ret = {}
ret['embedding_table'] = embedding_table
ret['word_embedding'] = output_buffer['base']['word_embedding']
ret['encoder_outputs'] = output_buffer['base']['encoder_outputs']
ret['sentence_embedding'] = output_buffer['base']['sentence_embedding']
ret['sentence_pair_embedding'] = output_buffer['base']['sentence_pair_embedding']
if self._is_pairwise and self._phase == 'train':
ret['word_embedding_neg'] = output_buffer['neg']['word_embedding']
ret['encoder_outputs_neg'] = output_buffer['neg']['encoder_outputs']
ret['sentence_embedding_neg'] = output_buffer['neg']['sentence_embedding']
ret['sentence_pair_embedding_neg'] = output_buffer['neg']['sentence_pair_embedding']
return ret
def postprocess(self, rt_outputs):
pass
......
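A hedged sketch of building this backbone from a standard bert_config.json through the from_config classmethod shown above; the import path and file name are assumptions:

import json
from paddlepalm.backbone.bert import BERT   # module path assumed

with open('bert_config.json') as f:          # a standard BERT config file (assumed)
    config = json.load(f)

bert = BERT.from_config(config, phase='train')             # pointwise setup

config['is_pairwise'] = True                               # pairwise matching setup
bert_pairwise = BERT.from_config(config, phase='train')
print(sorted(bert_pairwise.inputs_attr))                   # now includes token_ids_neg, input_mask_neg, ...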
......@@ -24,17 +24,17 @@ from paddle import fluid
from paddle.fluid import layers
from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
from paddlepalm.backbone.base_backbone import BaseBackbone
from paddlepalm.backbone.base_backbone import Backbone
class ERNIE(BaseBackbone):
class ERNIE(Backbone):
def __init__(self, hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
max_position_embeddings, sent_type_vocab_size, task_type_vocab_size, \
hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, phase='train'):
hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise=False, phase='train'):
        # self._is_training = phase == 'train'  # the backbone generally does not need to care about the run phase, since its outputs stay essentially the same in every phase
self._emb_size = hidden_size
self._n_layer = num_hidden_layers
self._n_head = num_attention_heads
......@@ -53,7 +53,8 @@ class ERNIE(BaseBackbone):
self._sent_emb_name = "sent_embedding"
self._task_emb_name = "task_embedding"
self._emb_dtype = "float32"
self._is_pairwise = is_pairwise
self._phase=phase
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=initializer_range)
......@@ -65,7 +66,7 @@ class ERNIE(BaseBackbone):
assert 'vocab_size' in config, "{} is required to initialize ERNIE".format('vocab_size')
assert 'max_position_embeddings' in config, "{} is required to initialize ERNIE".format('max_position_embeddings')
assert 'sent_type_vocab_size' in config or 'type_vocab_size' in config, "{} is required to initialize ERNIE".format('sent_type_vocab_size')
assert 'task_type_vocab_size' in config, "{} is required to initialize ERNIE".format('task_type_vocab_size')
# assert 'task_type_vocab_size' in config, "{} is required to initialize ERNIE".format('task_type_vocab_size')
assert 'hidden_act' in config, "{} is required to initialize ERNIE".format('hidden_act')
assert 'hidden_dropout_prob' in config, "{} is required to initialize ERNIE".format('hidden_dropout_prob')
assert 'attention_probs_dropout_prob' in config, "{} is required to initialize ERNIE".format('attention_probs_dropout_prob')
......@@ -80,126 +81,175 @@ class ERNIE(BaseBackbone):
sent_type_vocab_size = config['sent_type_vocab_size']
else:
sent_type_vocab_size = config['type_vocab_size']
task_type_vocab_size = config['task_type_vocab_size']
if 'task_type_vocab_size' in config:
task_type_vocab_size = config['task_type_vocab_size']
else:
task_type_vocab_size = config['type_vocab_size']
hidden_act = config['hidden_act']
hidden_dropout_prob = config['hidden_dropout_prob']
attention_probs_dropout_prob = config['attention_probs_dropout_prob']
initializer_range = config['initializer_range']
if 'is_pairwise' in config:
is_pairwise = config['is_pairwise']
else:
is_pairwise = False
return cls(hidden_size, num_hidden_layers, num_attention_heads, vocab_size, \
max_position_embeddings, sent_type_vocab_size, task_type_vocab_size, \
hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, phase=phase)
hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, initializer_range, is_pairwise, phase=phase)
@property
def inputs_attr(self):
return {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"task_ids": [[-1,-1], 'int64']}
ret = {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"task_ids": [[-1,-1], 'int64']}
if self._is_pairwise and self._phase=='train':
ret.update({"token_ids_neg": [[-1, -1], 'int64'],
"position_ids_neg": [[-1, -1], 'int64'],
"segment_ids_neg": [[-1, -1], 'int64'],
"input_mask_neg": [[-1, -1, 1], 'float32'],
"task_ids_neg": [[-1,-1], 'int64']
})
return ret
@property
def outputs_attr(self):
return {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
"embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
"encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
"sentence_embedding": [[-1, self._emb_size], 'float32'],
"sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
ret = {"word_embedding": [[-1, -1, self._emb_size], 'float32'],
"embedding_table": [[-1, self._voc_size, self._emb_size], 'float32'],
"encoder_outputs": [[-1, -1, self._emb_size], 'float32'],
"sentence_embedding": [[-1, self._emb_size], 'float32'],
"sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
if self._is_pairwise and self._phase == 'train':
ret.update({"word_embedding_neg": [[-1, -1, self._emb_size], 'float32'],
"encoder_outputs_neg": [[-1, -1, self._emb_size], 'float32'],
"sentence_embedding_neg": [[-1, self._emb_size], 'float32'],
"sentence_pair_embedding_neg": [[-1, self._emb_size], 'float32']})
return ret
def build(self, inputs, scope_name=""):
src_ids = inputs['token_ids']
pos_ids = inputs['position_ids']
sent_ids = inputs['segment_ids']
input_mask = inputs['input_mask']
task_ids = inputs['task_ids']
# padding id in vocabulary must be set to 0
emb_out = fluid.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
# fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
input_buffer = {}
output_buffer = {}
input_buffer['base'] = [src_ids, pos_ids, sent_ids, input_mask, task_ids]
output_buffer['base'] = {}
if self._is_pairwise and self._phase =='train':
src_ids = inputs['token_ids_neg']
pos_ids = inputs['position_ids_neg']
sent_ids = inputs['segment_ids_neg']
input_mask = inputs['input_mask_neg']
task_ids = inputs['task_ids_neg']
input_buffer['neg'] = [src_ids, pos_ids, sent_ids, input_mask, task_ids]
output_buffer['neg'] = {}
for key, (src_ids, pos_ids, sent_ids, input_mask, task_ids) in input_buffer.items():
# padding id in vocabulary must be set to 0
emb_out = fluid.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
position_emb_out = fluid.embedding(
input=pos_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.embedding(
sent_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._sent_emb_name, initializer=self._param_initializer))
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
task_emb_out = fluid.embedding(
task_ids,
size=[self._task_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._task_emb_name,
initializer=self._param_initializer))
emb_out = emb_out + task_emb_out
emb_out = pre_process_layer(
emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_out = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
name=scope_name+'encoder')
# fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
position_emb_out = fluid.embedding(
input=pos_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._pos_emb_name, initializer=self._param_initializer))
sent_emb_out = fluid.embedding(
sent_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._sent_emb_name, initializer=self._param_initializer))
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
task_emb_out = fluid.embedding(
task_ids,
size=[self._task_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(
name=scope_name+self._task_emb_name,
initializer=self._param_initializer))
emb_out = emb_out + task_emb_out
emb_out = pre_process_layer(
emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_out = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
name=scope_name+'encoder')
next_sent_feat = fluid.layers.slice(
input=enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
bias_attr=scope_name+"pooled_fc.b_0")
output_buffer[key]['word_embedding'] = emb_out
output_buffer[key]['encoder_outputs'] = enc_out
output_buffer[key]['sentence_embedding'] = next_sent_feat
output_buffer[key]['sentence_pair_embedding'] = next_sent_feat
ret = {}
ret['embedding_table'] = embedding_table
ret['word_embedding'] = output_buffer['base']['word_embedding']
ret['encoder_outputs'] = output_buffer['base']['encoder_outputs']
ret['sentence_embedding'] = output_buffer['base']['sentence_embedding']
ret['sentence_pair_embedding'] = output_buffer['base']['sentence_pair_embedding']
if self._is_pairwise and self._phase == 'train':
ret['word_embedding_neg'] = output_buffer['neg']['word_embedding']
ret['encoder_outputs_neg'] = output_buffer['neg']['encoder_outputs']
ret['sentence_embedding_neg'] = output_buffer['neg']['sentence_embedding']
ret['sentence_pair_embedding_neg'] = output_buffer['neg']['sentence_pair_embedding']
next_sent_feat = fluid.layers.slice(
input=enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(
name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
bias_attr=scope_name+"pooled_fc.b_0")
return {'embedding_table': embedding_table,
'word_embedding': emb_out,
'encoder_outputs': enc_out,
'sentence_embedding': next_sent_feat,
'sentence_pair_embedding': next_sent_feat}
return ret
def postprocess(self, rt_outputs):
pass
......
......@@ -24,7 +24,6 @@ def yield_pieces(data, distribute_strategy, batch_size):
assert isinstance(data, list), "the input data must be a list or dict, and contained with multiple tensors."
data_list = data
ds_list = distribute_strategy
stride = batch_size // dev_count
p = stride
# while p < len(data_list) + stride:
......@@ -57,7 +56,8 @@ def yield_pieces(data, distribute_strategy, batch_size):
# print(len(temp))
yield temp
def data_feeder(reader, postprocess_fn=None, prefetch_steps=2, phase='train'):
def data_feeder(reader, postprocess_fn=None, prefetch_steps=2):
if postprocess_fn is None:
def postprocess_fn(batch):
return batch
......
from cls import Classify
# from match import Match
# from mrc import MRC
# from mlm import MaskLM
from match import Match
from ner import SequenceLabel
from mrc import MRC
from mlm import MaskLM
......@@ -16,7 +16,7 @@
import os
import json
class BaseHead(object):
class Head(object):
def __init__(self, phase='train'):
"""
......
......@@ -13,41 +13,66 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddle.fluid import layers
from paddlepalm.interface import task_paradigm
from paddlepalm.head.base_head import Head
import numpy as np
import os
import json
def computeHingeLoss(pos, neg, margin):
loss_part1 = fluid.layers.elementwise_sub(
fluid.layers.fill_constant_batch_size_like(
input=pos, shape=[-1, 1], value=margin, dtype='float32'), pos)
loss_part2 = fluid.layers.elementwise_add(loss_part1, neg)
loss_part3 = fluid.layers.elementwise_max(
fluid.layers.fill_constant_batch_size_like(
input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'), loss_part2)
return loss_part3
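# computeHingeLoss above is the pairwise hinge loss max(0, margin - pos + neg),
# computed elementwise over the batch. A worked example with assumed values:
#   margin = 0.5, pos = 0.8, neg = 0.6  ->  max(0, 0.5 - 0.8 + 0.6) = 0.3
#   margin = 0.5, pos = 0.9, neg = 0.1  ->  max(0, 0.5 - 0.9 + 0.1) = 0.0  (pair already separated by the margin)
# The Match head below averages this quantity when learning_strategy == 'pairwise'.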
class TaskParadigm(task_paradigm):
class Match(Head):
'''
matching
'''
def __init__(self, config, phase, backbone_config=None):
def __init__(self, num_classes, input_dim, dropout_prob=0.0, param_initializer_range=0.02, \
learning_strategy='pointwise', margin=0.5, phase='train'):
"""
Args:
phase: train, eval, pred
lang: en, ch, ...
learning_strategy: pointwise, pairwise
"""
self._is_training = phase == 'train'
self._hidden_size = backbone_config['hidden_size']
self._hidden_size = input_dim
self._num_classes = num_classes
if 'initializer_range' in config:
self._param_initializer = config['initializer_range']
else:
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=backbone_config.get('initializer_range', 0.02))
if 'dropout_prob' in config:
self._dropout_prob = config['dropout_prob']
else:
self._dropout_prob = backbone_config.get('hidden_dropout_prob', 0.0)
self._dropout_prob = dropout_prob if phase == 'train' else 0.0
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=param_initializer_range)
self._learning_strategy = learning_strategy
self._margin = margin
self._pred_output_path = config.get('pred_output_path', None)
self._preds = []
self._preds_logits = []
@property
def inputs_attrs(self):
if self._is_training:
reader = {"label_ids": [[-1, 1], 'int64']}
else:
reader = {}
reader = {}
bb = {"sentence_pair_embedding": [[-1, self._hidden_size], 'float32']}
if self._is_training:
if self._learning_strategy == 'pointwise':
reader["label_ids"] = [[-1], 'int64']
elif self._learning_strategy == 'pairwise':
bb["sentence_pair_embedding_neg"] = [[-1, self._hidden_size], 'float32']
return {'reader': reader, 'backbone': bb}
@property
......@@ -55,51 +80,110 @@ class TaskParadigm(task_paradigm):
if self._is_training:
return {"loss": [[1], 'float32']}
else:
return {"logits": [[-1, 2], 'float32']}
            if self._learning_strategy == 'pairwise':
return {"probs": [[-1, 1], 'float32']}
else:
return {"logits": [[-1, 2], 'float32'],
"probs": [[-1, 2], 'float32']}
def build(self, inputs, scope_name=""):
if self._is_training:
labels = inputs["reader"]["label_ids"]
cls_feats = inputs["backbone"]["sentence_pair_embedding"]
# inputs
cls_feats = inputs["backbone"]["sentence_pair_embedding"]
if self._is_training:
cls_feats = fluid.layers.dropout(
x=cls_feats,
dropout_prob=self._dropout_prob,
dropout_implementation="upscale_in_train")
if self._learning_strategy == 'pairwise':
cls_feats_neg = inputs["backbone"]["sentence_pair_embedding_neg"]
cls_feats_neg = fluid.layers.dropout(
x=cls_feats_neg,
dropout_prob=self._dropout_prob,
dropout_implementation="upscale_in_train")
elif self._learning_strategy == 'pointwise':
labels = inputs["reader"]["label_ids"]
# loss
# for pointwise
if self._learning_strategy == 'pointwise':
logits = fluid.layers.fc(
input=cls_feats,
size=self._num_classes,
param_attr=fluid.ParamAttr(
name=scope_name+"cls_out_w",
initializer=self._param_initializer),
bias_attr=fluid.ParamAttr(
name=scope_name+"cls_out_b",
initializer=fluid.initializer.Constant(0.)))
probs = fluid.layers.softmax(logits)
if self._is_training:
ce_loss = fluid.layers.cross_entropy(
input=probs, label=labels)
loss = fluid.layers.mean(x=ce_loss)
return {'loss': loss}
# for pred
else:
return {'logits': logits,
'probs': probs}
# for pairwise
elif self._learning_strategy == 'pairwise':
pos_score = fluid.layers.fc(
input=cls_feats,
size=1,
act = "sigmoid",
param_attr=fluid.ParamAttr(
name=scope_name+"cls_out_w_pr",
initializer=self._param_initializer),
bias_attr=fluid.ParamAttr(
name=scope_name+"cls_out_b_pr",
initializer=fluid.initializer.Constant(0.)))
pos_score = fluid.layers.reshape(x=pos_score, shape=[-1, 1], inplace=True)
logits = fluid.layers.fc(
input=cls_feats,
size=2,
param_attr=fluid.ParamAttr(
name=scope_name+"cls_out_w",
initializer=self._param_initializer),
bias_attr=fluid.ParamAttr(
name=scope_name+"cls_out_b",
initializer=fluid.initializer.Constant(0.)))
if self._is_training:
neg_score = fluid.layers.fc(
input=cls_feats_neg,
size=1,
act = "sigmoid",
param_attr=fluid.ParamAttr(
name=scope_name+"cls_out_w_pr",
initializer=self._param_initializer),
bias_attr=fluid.ParamAttr(
name=scope_name+"cls_out_b_pr",
initializer=fluid.initializer.Constant(0.)))
neg_score = fluid.layers.reshape(x=neg_score, shape=[-1, 1], inplace=True)
loss = fluid.layers.mean(computeHingeLoss(pos_score, neg_score, self._margin))
return {'loss': loss}
# for pred
else:
return {'probs': pos_score}
if self._is_training:
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=logits, label=labels, return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
return {'loss': loss}
else:
return {'logits': logits}
def postprocess(self, rt_outputs):
def batch_postprocess(self, rt_outputs):
if not self._is_training:
logits = rt_outputs['logits']
preds = np.argmax(logits, -1)
self._preds.extend(preds.tolist())
def epoch_postprocess(self, post_inputs):
probs = []
logits = []
probs = rt_outputs['probs']
self._preds.extend(probs.tolist())
if self._learning_strategy == 'pointwise':
logits = rt_outputs['logits']
self._preds_logits.extend(logits.tolist())
def epoch_postprocess(self, post_inputs, output_dir=None):
        # no post_inputs are needed here (none are declared in epoch_inputs_attrs), so post_inputs is empty
if not self._is_training:
if self._pred_output_path is None:
raise ValueError('argument pred_output_path not found in config. Please add it into config dict/file.')
with open(os.path.join(self._pred_output_path, 'predictions.json'), 'w') as writer:
for p in self._preds:
writer.write(str(p)+'\n')
print('Predictions saved at '+os.path.join(self._pred_output_path, 'predictions.json'))
if output_dir is None:
raise ValueError('argument output_dir not found in config. Please add it into config dict/file.')
with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer:
for i in range(len(self._preds)):
if self._learning_strategy == 'pointwise':
label = 0 if self._preds[i][0] > self._preds[i][1] else 1
result = {'index': i, 'label': label, 'logits': self._preds_logits[i], 'probs': self._preds[i]}
elif self._learning_strategy == 'pairwise':
label = 0 if self._preds[i][0] < 0.5 else 1
result = {'index': i, 'label': label, 'probs': self._preds[i][0]}
result = json.dumps(result)
writer.write(result+'\n')
print('Predictions saved at '+os.path.join(output_dir, 'predictions.json'))
\ No newline at end of file
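A hedged sketch of constructing this head directly with the new keyword-style constructor; the import path and argument values are illustrative:

from paddlepalm.head.match import Match   # module path assumed

# pointwise binary matching
head = Match(num_classes=2, input_dim=768, dropout_prob=0.1,
             learning_strategy='pointwise', phase='train')

# pairwise ranking with the hinge loss above; consumes sentence_pair_embedding_neg
# from a pairwise backbone and produces a single margin-based loss
head_pr = Match(num_classes=2, input_dim=768, dropout_prob=0.1,
                learning_strategy='pairwise', margin=0.5, phase='train')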
......@@ -14,30 +14,39 @@
# limitations under the License.
import paddle.fluid as fluid
from paddlepalm.interface import task_paradigm
from paddlepalm.head.base_head import Head
from paddle.fluid import layers
import numpy as np
import os
from paddlepalm.backbone.utils.transformer import pre_process_layer
class TaskParadigm(task_paradigm):
class MaskLM(Head):
'''
matching
mlm
'''
def __init__(self, config, phase, backbone_config=None):
def __init__(self, input_dim, vocab_size, hidden_act, initializer_range, dropout_prob=0.0, \
param_initializer_range=0.02, phase='train'):
self._is_training = phase == 'train'
self._emb_size = backbone_config['hidden_size']
self._hidden_size = backbone_config['hidden_size']
self._vocab_size = backbone_config['vocab_size']
self._hidden_act = backbone_config['hidden_act']
self._initializer_range = backbone_config['initializer_range']
self._emb_size = input_dim
self._hidden_size = input_dim
self._dropout_prob = dropout_prob if phase == 'train' else 0.0
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=param_initializer_range)
self._preds = []
self._vocab_size = vocab_size
self._hidden_act = hidden_act
self._initializer_range = initializer_range
@property
def inputs_attrs(self):
reader = {
"mask_label": [[-1, 1], 'int64'],
"mask_pos": [[-1, 1], 'int64']}
"token_ids":[[-1, -1], 'int64'],
"mask_label": [[-1], 'int64'],
"mask_pos": [[-1], 'int64'],
}
if not self._is_training:
del reader['mask_label']
bb = {
"encoder_outputs": [[-1, -1, self._hidden_size], 'float32'],
"embedding_table": [[-1, self._vocab_size, self._emb_size], 'float32']}
......@@ -54,7 +63,13 @@ class TaskParadigm(task_paradigm):
mask_pos = inputs["reader"]["mask_pos"]
if self._is_training:
mask_label = inputs["reader"]["mask_label"]
max_position = inputs["reader"]["batchsize_x_seqlen"] - 1
l1 = fluid.layers.shape(inputs["reader"]["token_ids"] )[0]
# bxs = inputs["reader"]["token_ids"].shape[2].value
l2 = fluid.layers.shape(inputs["reader"]["token_ids"][0])[0]
bxs = (l1*l2).astype(np.int64)
# max_position = inputs["reader"]["batchsize_x_seqlen"] - 1
max_position = bxs - 1
mask_pos = fluid.layers.elementwise_min(mask_pos, max_position)
mask_pos.stop_gradient = True
......@@ -100,11 +115,31 @@ class TaskParadigm(task_paradigm):
is_bias=True)
if self._is_training:
mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
logits=fc_out, label=mask_label)
inputs = fluid.layers.softmax(fc_out)
mask_lm_loss = fluid.layers.cross_entropy(
input=inputs, label=mask_label)
loss = fluid.layers.mean(mask_lm_loss)
return {'loss': loss}
else:
return {'logits': fc_out}
def batch_postprocess(self, rt_outputs):
if not self._is_training:
logits = rt_outputs['logits']
preds = np.argmax(logits, -1)
self._preds.extend(preds.tolist())
return preds
def epoch_postprocess(self, post_inputs, output_dir=None):
# no post_inputs are needed here (none are declared in epoch_inputs_attrs), so post_inputs is empty
if not self._is_training:
if output_dir is None:
for p in self._preds:
print(p)
else:
with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer:
for p in self._preds:
writer.write(str(p)+'\n')
print('Predictions saved at '+os.path.join(output_dir, 'predictions.json'))
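A minimal sketch of constructing this head from the signature above; the import path and the concrete dimensions are assumptions (a BERT-base-style backbone), not prescribed by the diff:
from paddlepalm.head.mlm import MaskLM   # module path assumed
mlm_head = MaskLM(
    input_dim=768,            # hidden size of the backbone encoder (assumed)
    vocab_size=30522,         # must match the backbone's word embedding table (assumed)
    hidden_act='gelu',
    initializer_range=0.02,
    dropout_prob=0.1,
    phase='train')
# inputs_attrs declares which reader/backbone fields build() expects
print(mlm_head.inputs_attrs)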
......@@ -14,7 +14,7 @@
# limitations under the License.
import paddle.fluid as fluid
from paddlepalm.interface import task_paradigm
from paddlepalm.head.base_head import Head
import collections
import numpy as np
import os
......@@ -26,34 +26,37 @@ import json
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
class TaskParadigm(task_paradigm):
""""""
class MRC(Head):
"""
Machine Reading Comprehension
"""
def __init__(self, max_query_len, input_dim, pred_output_path=None, verbose=False, with_negative=False, do_lower_case=False, max_ans_len=None, null_score_diff_threshold=0.0, n_best_size=20, phase='train'):
def __init__(self, config, phase, backbone_config=None):
self._is_training = phase == 'train'
self._max_sequence_length = config['max_seq_len']
self._hidden_size = backbone_config['hidden_size']
self._hidden_size = input_dim
self._max_sequence_length = max_query_len
self._pred_results = []
if phase == 'pred':
self._max_answer_length = config.get('max_answer_len', None)
self._null_score_diff_threshold = config.get('null_score_diff_threshold', 0.0)
self._n_best_size = config.get('n_best_size', 20)
self._pred_output_path = config.get('pred_output_path', None)
self._verbose = config.get('verbose', False)
self._with_negative = config.get('with_negative', False)
self._do_lower_case = config.get('do_lower_case', False)
output_dir = pred_output_path
self._max_answer_length = max_ans_len
self._null_score_diff_threshold = null_score_diff_threshold
self._n_best_size = n_best_size
output_dir = pred_output_path
self._verbose = verbose
self._with_negative = with_negative
self._do_lower_case = do_lower_case
@property
def inputs_attrs(self):
if self._is_training:
reader = {"start_positions": [[-1, 1], 'int64'],
"end_positions": [[-1, 1], 'int64'],
reader = {"start_positions": [[-1], 'int64'],
"end_positions": [[-1], 'int64'],
}
else:
reader = {'unique_ids': [[-1, 1], 'int64']}
reader = {'unique_ids': [[-1], 'int64']}
bb = {"encoder_outputs": [[-1, -1, self._hidden_size], 'float32']}
return {'reader': reader, 'backbone': bb}
......@@ -70,21 +73,26 @@ class TaskParadigm(task_paradigm):
else:
return {'start_logits': [[-1, -1, 1], 'float32'],
'end_logits': [[-1, -1, 1], 'float32'],
'unique_ids': [[-1, 1], 'int64']}
'unique_ids': [[-1], 'int64']}
def build(self, inputs, scope_name=""):
if self._is_training:
start_positions = inputs['reader']['start_positions']
end_positions = inputs['reader']['end_positions']
max_position = inputs["reader"]["seqlen"] - 1
start_positions = fluid.layers.elementwise_min(start_positions, max_position)
end_positions = fluid.layers.elementwise_min(end_positions, max_position)
# max_position = inputs["reader"]["seqlen"] - 1
# start_positions = fluid.layers.elementwise_min(start_positions, max_position)
# end_positions = fluid.layers.elementwise_min(end_positions, max_position)
start_positions.stop_gradient = True
end_positions.stop_gradient = True
else:
unique_id = inputs['reader']['unique_ids']
# Dummy op that keeps 'unique_ids' referenced in the graph so it is not pruned and can still be fetched
helper_constant = fluid.layers.fill_constant(shape=[1], value=1, dtype='int64')
fluid.layers.elementwise_mul(unique_id, helper_constant)
enc_out = inputs['backbone']['encoder_outputs']
logits = fluid.layers.fc(
input=enc_out,
......@@ -100,9 +108,11 @@ class TaskParadigm(task_paradigm):
start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
def _compute_single_loss(logits, positions):
"""Compute start/end loss for mrc model"""
loss = fluid.layers.softmax_with_cross_entropy(
logits=logits, label=positions)
"""Compute start/en
d loss for mrc model"""
inputs = fluid.layers.softmax(logits)
loss = fluid.layers.cross_entropy(
input=inputs, label=positions)
loss = fluid.layers.mean(x=loss)
return loss
......@@ -117,10 +127,10 @@ class TaskParadigm(task_paradigm):
'unique_ids': unique_id}
def postprocess(self, rt_outputs):
def batch_postprocess(self, rt_outputs):
"""this func will be called after each step(batch) of training/evaluating/predicting process."""
if not self._is_training:
unique_ids = np.squeeze(rt_outputs['unique_ids'], -1)
unique_ids = rt_outputs['unique_ids']
start_logits = rt_outputs['start_logits']
end_logits = rt_outputs['end_logits']
for idx in range(len(unique_ids)):
......@@ -139,19 +149,19 @@ class TaskParadigm(task_paradigm):
start_logits=s,
end_logits=e))
def epoch_postprocess(self, post_inputs):
def epoch_postprocess(self, post_inputs, output_dir=None):
"""(optional interface) this func will be called after evaluation/predicting process and each epoch during training process."""
if not self._is_training:
if self._pred_output_path is None:
raise ValueError('argument pred_output_path not found in config. Please add it into config dict/file.')
if output_dir is None:
raise ValueError('argument output_dir is required at prediction time. Please pass it to epoch_postprocess.')
examples = post_inputs['reader']['examples']
features = post_inputs['reader']['features']
if not os.path.exists(self._pred_output_path):
os.makedirs(self._pred_output_path)
output_prediction_file = os.path.join(self._pred_output_path, "predictions.json")
output_nbest_file = os.path.join(self._pred_output_path, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(self._pred_output_path, "null_odds.json")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
output_prediction_file = os.path.join(output_dir, "predictions.json")
output_nbest_file = os.path.join(output_dir, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(output_dir, "null_odds.json")
_write_predictions(examples, features, self._pred_results,
self._n_best_size, self._max_answer_length,
self._do_lower_case, output_prediction_file,
......@@ -194,8 +204,9 @@ def _write_predictions(all_examples, all_features, all_results, n_best_size,
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min mull score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
......
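For the MRC head above, most constructor arguments only matter at prediction time, when epoch_postprocess assembles the n-best answers. A minimal sketch with assumed values (the import path is likewise an assumption):
from paddlepalm.head.mrc import MRC   # module path assumed
mrc_head = MRC(
    max_query_len=64,
    input_dim=768,                     # backbone hidden size (assumed)
    max_ans_len=30,                    # cap on decoded answer span length
    n_best_size=20,                    # candidates kept per example
    null_score_diff_threshold=0.0,     # only relevant when with_negative=True
    with_negative=False,               # True for unanswerable-question datasets
    do_lower_case=True,
    phase='predict')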
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddle.fluid import layers
from paddlepalm.head.base_head import Head
import numpy as np
import os
import math
class SequenceLabel(Head):
'''
Sequence label
'''
def __init__(self, num_classes, input_dim, dropout_prob=0.0, learning_rate=1e-3, \
param_initializer_range=0.02, phase='train'):
"""
Args:
phase: train, eval, pred
lang: en, cn, ...
"""
self._is_training = phase == 'train'
self._hidden_size = input_dim
self.num_classes = num_classes
self._dropout_prob = dropout_prob if phase == 'train' else 0.0
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=param_initializer_range)
self.learning_rate = learning_rate
self._preds = []
@property
def inputs_attrs(self):
reader = {}
bb = {"encoder_outputs": [[-1, -1, -1], 'float32']}
if self._is_training:
reader["label_ids"] = [[-1, -1], 'int64']
reader["seq_lens"] = [[-1], 'int64']
return {'reader': reader, 'backbone': bb}
@property
def outputs_attrs(self):
if self._is_training:
return {'loss': [[1], 'float32']}
else:
return {'emission': [[-1, self.num_classes], 'float32']}
def build(self, inputs, scope_name=''):
token_emb = inputs['backbone']['encoder_outputs']
if self._is_training:
label_ids = inputs['reader']['label_ids']
seq_lens = inputs['reader']['seq_lens']
emission = fluid.layers.fc(
size=self.num_classes,
input=token_emb,
param_attr=fluid.ParamAttr(
initializer=self._param_initializer,
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)),
bias_attr=fluid.ParamAttr(
name=scope_name+"cls_out_b", initializer=fluid.initializer.Constant(0.)),
num_flatten_dims=2)
if self._is_training:
# compute loss
crf_cost = fluid.layers.linear_chain_crf(
input=emission,
label=label_ids,
param_attr=fluid.ParamAttr(
name=scope_name+'crfw', learning_rate=self.learning_rate),
length=seq_lens)
avg_cost = fluid.layers.mean(x=crf_cost)
crf_decode = fluid.layers.crf_decoding(
input=emission,
param_attr=fluid.ParamAttr(name=scope_name+'crfw'),
length=seq_lens)
(precision, recall, f1_score, num_infer_chunks, num_label_chunks,
num_correct_chunks) = fluid.layers.chunk_eval(
input=crf_decode,
label=label_ids,
chunk_scheme="IOB",
num_chunk_types=int(math.ceil((self.num_classes - 1) / 2.0)),
seq_length=seq_lens)
chunk_evaluator = fluid.metrics.ChunkEvaluator()
chunk_evaluator.reset()
return {"loss": avg_cost}
else:
return {"emission": emission}
def batch_postprocess(self, rt_outputs):
if not self._is_training:
emission = rt_outputs['emission']
preds = np.argmax(emission, -1)
self._preds.extend(preds.tolist())
def epoch_postprocess(self, post_inputs, output_dir=None):
# no post_inputs are needed here (none are declared in epoch_inputs_attrs), so post_inputs is empty
if not self._is_training:
if output_dir is None:
raise ValueError('argument output_dir is required at prediction time. Please pass it to epoch_postprocess.')
with open(os.path.join(output_dir, 'predictions.json'), 'w') as writer:
for p in self._preds:
writer.write(str(p)+'\n')
print('Predictions saved at '+os.path.join(output_dir, 'predictions.json'))
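The num_chunk_types passed to chunk_eval above follows from the IOB scheme: each entity type contributes a B- and an I- tag, and the remaining class is O. A quick check of that arithmetic with an assumed three-entity-type tag set:
import math
# Assumed tag set: O, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC  ->  7 classes
num_classes = 7
num_chunk_types = int(math.ceil((num_classes - 1) / 2.0))
assert num_chunk_types == 3   # PER, ORG, LOC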
from cls import ClassifyReader
from match import MatchReader
from ner import SequenceLabelReader
from mrc import MrcReader
from mlm import MaskLMReader
......@@ -14,7 +14,7 @@
# limitations under the License.
"""v1.1"""
from copy import copy
class BaseReader(object):
class Reader(object):
"""interface of data manager."""
def __init__(self, phase='train'):
......@@ -42,7 +42,6 @@ class BaseReader(object):
self._register.add(attr_name)
def register_with(self, backbone):
print(backbone)
for attr in backbone.inputs_attr:
self.require_attr(attr)
self._registered_backbone = backbone
......
......@@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlepalm.reader.base_reader import BaseReader
from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import ClassifyReader as CLSReader
class ClassifyReader(BaseReader):
class ClassifyReader(Reader):
def __init__(self, vocab_path, max_len, tokenizer='wordpiece', \
lang='en', seed=None, do_lower_case=False, phase='train'):
......@@ -29,7 +29,7 @@ class ClassifyReader(BaseReader):
"""
BaseReader.__init__(self, phase)
Reader.__init__(self, phase)
assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
assert phase in ['train', 'predict'], "supported phase: train, predict."
......
......@@ -13,87 +13,104 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlepalm.interface import reader
from paddlepalm.reader.utils.reader4ernie import ClassifyReader
from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import ClassifyReader as CLSReader
class Reader(reader):
class MatchReader(Reader):
def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
"""
def __init__(self, vocab_path, max_len, tokenizer='wordpiece', lang='en', seed=None, \
do_lower_case=False, learning_strategy='pointwise', phase='train', dev_count=1, print_prefix=''): # add whatever arguments are needed
"""
Args:
phase: train, eval, pred
"""
lang: en, cn, ...
learning_strategy: pointwise, pairwise
"""
self._is_training = phase == 'train'
Reader.__init__(self, phase)
reader = ClassifyReader(config['vocab_path'],
max_seq_len=config['max_seq_len'],
do_lower_case=config.get('do_lower_case', True),
for_cn=config.get('for_cn', False),
random_seed=config.get('seed', None))
self._reader = reader
self._dev_count = dev_count
assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
assert phase in ['train', 'predict'], "supported phase: train, predict."
self._batch_size = config['batch_size']
self._max_seq_len = config['max_seq_len']
for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'
self._register.add('token_ids')
if phase == 'train':
self._input_file = config['train_file']
self._num_epochs = None # prevent the iterator from terminating
self._shuffle = config.get('shuffle', True)
self._shuffle_buffer = config.get('shuffle_buffer', 5000)
elif phase == 'eval':
self._input_file = config['dev_file']
self._num_epochs = 1
self._shuffle = False
self._batch_size = config.get('pred_batch_size', self._batch_size)
elif phase == 'pred':
self._input_file = config['pred_file']
self._num_epochs = 1
self._shuffle = False
self._batch_size = config.get('pred_batch_size', self._batch_size)
if learning_strategy == 'pointwise':
self._register.add('label_ids')
if learning_strategy == 'pairwise':
self._register.add('token_ids_neg')
self._register.add('position_ids_neg')
self._register.add('segment_ids_neg')
self._register.add('input_mask_neg')
self._register.add('task_ids_neg')
self._is_training = phase == 'train'
self._learning_strategy = learning_strategy
match_reader = CLSReader(vocab_path,
max_seq_len=max_len,
do_lower_case=do_lower_case,
for_cn=for_cn,
random_seed=seed,
learning_strategy = learning_strategy)
self._reader = match_reader
self._dev_count = dev_count
self._phase = phase
# self._batch_size =
self._print_first_n = config.get('print_first_n', 1)
@property
def outputs_attr(self):
if self._is_training:
return {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"label_ids": [[-1], 'int64'],
"task_ids": [[-1, -1], 'int64']
}
else:
return {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"task_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32']
}
def load_data(self):
self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
def iterator(self):
def list_to_dict(x):
names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
'label_ids', 'unique_ids']
outputs = {n: i for n,i in zip(names, x)}
del outputs['unique_ids']
if not self._is_training:
del outputs['label_ids']
return outputs
attrs = {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"task_ids": [[-1, -1], 'int64'],
"label_ids": [[-1], 'int64'],
"token_ids_neg": [[-1, -1], 'int64'],
"position_ids_neg": [[-1, -1], 'int64'],
"segment_ids_neg": [[-1, -1], 'int64'],
"input_mask_neg": [[-1, -1, 1], 'float32'],
"task_ids_neg": [[-1, -1], 'int64']
}
return self._get_registed_attrs(attrs)
def load_data(self, input_file, batch_size, num_epochs=None, \
file_format='tsv', shuffle_train=True):
self._batch_size = batch_size
self._num_epochs = num_epochs
self._data_generator = self._reader.data_generator( \
input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
shuffle=shuffle_train if self._phase == 'train' else False, \
phase=self._phase)
def _iterator(self):
names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', 'label_ids', \
'token_ids_neg', 'segment_ids_neg', 'position_ids_neg', 'task_ids_neg', 'input_mask_neg']
if self._learning_strategy == 'pairwise':
names.remove('label_ids')
for batch in self._data_generator():
yield list_to_dict(batch)
outputs = {n: i for n,i in zip(names, batch)}
ret = {}
# TODO: move runtime shape check here
for attr in self.outputs_attr.keys():
ret[attr] = outputs[attr]
yield ret
@property
def num_examples(self):
return self._reader.get_num_examples(phase=self._phase)
@property
def num_epochs(self):
return self._num_epochs
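Tying the pieces of MatchReader together, a minimal sketch of the intended call sequence; file paths and batch sizes are placeholders, and the import path is assumed:
from paddlepalm.reader import MatchReader   # import path assumed
# 'pairwise' registers token_ids_neg/position_ids_neg/... instead of label_ids
match_reader = MatchReader('vocab.txt', max_len=128,
                           learning_strategy='pairwise', phase='train')
# match_reader.register_with(backbone)   # also registers the backbone's required inputs
match_reader.load_data('train.tsv', batch_size=32, num_epochs=2)
# _iterator() then yields dicts keyed by the registered attrs in outputs_attr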
......@@ -13,79 +13,79 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlepalm.interface import reader
from paddlepalm.reader.utils.reader4ernie import MaskLMReader
from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import MaskLMReader as MLMReader
import numpy as np
class Reader(reader):
class MaskLMReader(Reader):
def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
def __init__(self, vocab_path, max_len, tokenizer='wordpiece', \
lang='en', seed=None, do_lower_case=False, phase='train', dev_count=1, print_prefix=''):
"""
Args:
phase: train, eval, pred
"""
"""
self._is_training = phase == 'train'
reader = MaskLMReader(config['vocab_path'],
max_seq_len=config['max_seq_len'],
do_lower_case=config.get('do_lower_case', False),
for_cn=config.get('for_cn', False),
random_seed=config.get('seed', None))
self._reader = reader
self._dev_count = dev_count
Reader.__init__(self, phase)
self._batch_size = config['batch_size']
self._max_seq_len = config['max_seq_len']
assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
assert phase in ['train', 'predict'], "supported phase: train, predict."
for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'
self._register.add('token_ids')
self._register.add('mask_pos')
if phase == 'train':
self._input_file = config['train_file']
self._num_epochs = None # prevent the iterator from terminating
self._shuffle = config.get('shuffle', True)
self._shuffle_buffer = config.get('shuffle_buffer', 5000)
elif phase == 'eval':
self._input_file = config['dev_file']
self._num_epochs = 1
self._shuffle = False
self._batch_size = config.get('pred_batch_size', self._batch_size)
elif phase == 'pred':
self._input_file = config['pred_file']
self._num_epochs = 1
self._shuffle = False
self._batch_size = config.get('pred_batch_size', self._batch_size)
self._register.add('mask_label')
self._is_training = phase == 'train'
mlm_reader = MLMReader(vocab_path,
max_seq_len=max_len,
do_lower_case=do_lower_case,
for_cn=for_cn,
random_seed=seed)
self._reader = mlm_reader
self._phase = phase
# self._batch_size =
self._print_first_n = config.get('print_first_n', 1)
self._dev_count = dev_count
@property
def outputs_attr(self):
return {"token_ids": [[-1, -1], 'int64'],
attrs = {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"task_ids": [[-1, -1], 'int64'],
"mask_label": [[-1], 'int64'],
"mask_pos": [[-1], 'int64'],
"mask_pos": [[-1], 'int64']
}
return self._get_registed_attrs(attrs)
def load_data(self):
self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
def iterator(self):
def load_data(self, input_file, batch_size, num_epochs=None, \
file_format='csv', shuffle_train=True):
self._batch_size = batch_size
self._num_epochs = num_epochs
self._data_generator = self._reader.data_generator( \
input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
shuffle=shuffle_train if self._phase == 'train' else False, \
phase=self._phase)
def list_to_dict(x):
names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask',
'task_ids', 'mask_label', 'mask_pos']
outputs = {n: i for n,i in zip(names, x)}
# outputs['batchsize_x_seqlen'] = [self._batch_size * len(outputs['token_ids'][0]) - 1]
return outputs
def _iterator(self):
names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask',
'task_ids', 'mask_label', 'mask_pos']
for batch in self._data_generator():
# print(np.shape(list_to_dict(batch)['token_ids']))
# print(list_to_dict(batch)['mask_label'].tolist())
yield list_to_dict(batch)
outputs = {n: i for n,i in zip(names, batch)}
ret = {}
# TODO: move runtime shape check here
for attr in self.outputs_attr.keys():
ret[attr] = outputs[attr]
yield ret
def get_epoch_outputs(self):
return {'examples': self._reader.get_examples(self._phase),
......@@ -95,3 +95,7 @@ class Reader(reader):
def num_examples(self):
return self._reader.get_num_examples(phase=self._phase)
@property
def num_epochs(self):
return self._num_epochs
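Note that mask_pos yielded by this reader indexes into the flattened [batch_size * seq_len] view of the encoder output, which is why the MaskLM head clamps it to batch_size * seq_len - 1 before gathering. A small worked example of that flat indexing:
# Token at sentence 1, position 3 in a batch padded to seq_len 5:
sent_index, token_index, max_len = 1, 3, 5
flat_index = sent_index * max_len + token_index
assert flat_index == 8   # row 8 of the reshaped [batch_size * seq_len, hidden] tensor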
......@@ -13,77 +13,66 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlepalm.interface import reader
from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import MRCReader
import numpy as np
class Reader(reader):
def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
class MrcReader(Reader):
def __init__(self, vocab_path, max_len, max_query_len, doc_stride, tokenizer='FullTokenizer', lang='en', seed=None, do_lower_case=False, \
remove_noanswer=True, phase='train', dev_count=1, print_prefix=''):
"""
Args:
phase: train, eval, pred
"""
lang: en, cn, ...
"""
self._is_training = phase == 'train'
Reader.__init__(self, phase)
reader = MRCReader(config['vocab_path'],
max_seq_len=config['max_seq_len'],
do_lower_case=config.get('do_lower_case', False),
tokenizer='FullTokenizer',
for_cn=config.get('for_cn', False),
doc_stride=config['doc_stride'],
remove_noanswer=config.get('remove_noanswer', True),
max_query_length=config['max_query_len'],
random_seed=config.get('seed', None))
self._reader = reader
self._dev_count = dev_count
self._batch_size = config['batch_size']
self._max_seq_len = config['max_seq_len']
assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
assert phase in ['train', 'predict'], "supported phase: train, predict."
for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'
self._register.add('token_ids')
if phase == 'train':
self._input_file = config['train_file']
# self._num_epochs = config['num_epochs']
self._num_epochs = None # prevent the iterator from terminating
self._shuffle = config.get('shuffle', True)
self._shuffle_buffer = config.get('shuffle_buffer', 5000)
if phase == 'eval':
self._input_file = config['dev_file']
self._num_epochs = 1
self._shuffle = False
self._batch_size = config.get('pred_batch_size', self._batch_size)
elif phase == 'pred':
self._input_file = config['pred_file']
self._num_epochs = 1
self._shuffle = False
self._batch_size = config.get('pred_batch_size', self._batch_size)
self._register.add("start_positions")
self._register.add("end_positions")
else:
self._register.add("unique_ids")
self._phase = phase
# self._batch_size =
self._print_first_n = config.get('print_first_n', 1)
self._is_training = phase == 'train'
# TODO: without slide window version
self._with_slide_window = config.get('with_slide_window', False)
mrc_reader = MRCReader(vocab_path,
max_seq_len=max_len,
do_lower_case=do_lower_case,
tokenizer=tokenizer,
doc_stride=doc_stride,
remove_noanswer=remove_noanswer,
max_query_length=max_query_len,
for_cn=for_cn,
random_seed=seed)
self._reader = mrc_reader
self._phase = phase
self._dev_count = dev_count
@property
def outputs_attr(self):
if self._is_training:
return {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"start_positions": [[-1], 'int64'],
"end_positions": [[-1], 'int64'],
"task_ids": [[-1, -1], 'int64']
}
else:
return {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"task_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"unique_ids": [[-1], 'int64']
}
attrs = {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"start_positions": [[-1], 'int64'],
"end_positions": [[-1], 'int64'],
"task_ids": [[-1, -1], 'int64'],
"unique_ids": [[-1], 'int64']
}
return self._get_registed_attrs(attrs)
@property
def epoch_outputs_attr(self):
......@@ -91,26 +80,34 @@ class Reader(reader):
return {"examples": None,
"features": None}
def load_data(self):
self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
def iterator(self):
def list_to_dict(x):
names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
'start_positions', 'end_positions', 'unique_ids']
outputs = {n: i for n,i in zip(names, x)}
if self._is_training:
del outputs['unique_ids']
else:
del outputs['start_positions']
del outputs['end_positions']
return outputs
def load_data(self, input_file, batch_size, num_epochs=None, file_format='csv', shuffle_train=True):
self._batch_size = batch_size
self._num_epochs = num_epochs
self._data_generator = self._reader.data_generator( \
input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
shuffle=shuffle_train if self._phase == 'train' else False, \
phase=self._phase)
def _iterator(self):
names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
'start_positions', 'end_positions', 'unique_ids']
if self._is_training:
names.remove('unique_ids')
for batch in self._data_generator():
yield list_to_dict(batch)
outputs = {n: i for n,i in zip(names, batch)}
ret = {}
# TODO: move runtime shape check here
for attr in self.outputs_attr.keys():
ret[attr] = outputs[attr]
if not self._is_training:
assert 'unique_ids' in ret, ret
yield ret
def get_epoch_outputs(self):
return {'examples': self._reader.get_examples(self._phase),
'features': self._reader.get_features(self._phase)}
......@@ -118,3 +115,7 @@ class Reader(reader):
def num_examples(self):
return self._reader.get_num_examples(phase=self._phase)
@property
def num_epochs(self):
return self._num_epochs
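The registration logic above is the only phase-dependent part of MrcReader: training registers start_positions/end_positions, while prediction registers unique_ids so that epoch_postprocess can join results back to the examples. A minimal construction sketch with placeholder paths (import path assumed):
from paddlepalm.reader import MrcReader   # import path assumed
train_reader = MrcReader('vocab.txt', max_len=384, max_query_len=64,
                         doc_stride=128, phase='train')
pred_reader = MrcReader('vocab.txt', max_len=384, max_query_len=64,
                        doc_stride=128, phase='predict')
# train:   outputs_attr includes start_positions / end_positions
# predict: outputs_attr includes unique_ids instead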
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlepalm.reader.base_reader import Reader
from paddlepalm.reader.utils.reader4ernie import SequenceLabelReader as SLReader
class SequenceLabelReader(Reader):
def __init__(self, vocab_path, max_len, label_map_config, tokenizer='wordpiece', \
lang='en', seed=None, do_lower_case=False, phase='train', dev_count=1, print_prefix=''):
"""
Args:
phase: train, eval, pred
lang: en, cn, ...
"""
Reader.__init__(self, phase)
assert lang.lower() in ['en', 'cn', 'english', 'chinese'], "supported language: en (English), cn (Chinese)."
assert phase in ['train', 'predict'], "supported phase: train, predict."
for_cn = lang.lower() == 'cn' or lang.lower() == 'chinese'
self._register.add('token_ids')
self._register.add('seq_lens')
if phase == 'train':
self._register.add('label_ids')
self._is_training = phase == 'train'
ner_reader = SLReader(vocab_path,
max_seq_len=max_len,
do_lower_case=do_lower_case,
for_cn=for_cn,
random_seed=seed,
label_map_config=label_map_config)
self._reader = ner_reader
self._phase = phase
self._dev_count = dev_count
@property
def outputs_attr(self):
attrs = {"token_ids": [[-1, -1], 'int64'],
"position_ids": [[-1, -1], 'int64'],
"segment_ids": [[-1, -1], 'int64'],
"task_ids": [[-1, -1], 'int64'],
"input_mask": [[-1, -1, 1], 'float32'],
"seq_lens": [[-1], 'int64'],
"label_ids": [[-1, -1], 'int64']}
return self._get_registed_attrs(attrs)
def load_data(self, input_file, batch_size, num_epochs=None, \
file_format='tsv', shuffle_train=True):
self._batch_size = batch_size
self._num_epochs = num_epochs
self._data_generator = self._reader.data_generator( \
input_file, batch_size, num_epochs if self._phase == 'train' else 1, \
shuffle=shuffle_train if self._phase == 'train' else False, \
phase=self._phase)
def _iterator(self):
names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask',
'label_ids', 'seq_lens']
for batch in self._data_generator():
outputs = {n: i for n,i in zip(names, batch)}
ret = {}
# TODO: move runtime shape check here
for attr in self.outputs_attr.keys():
ret[attr] = outputs[attr]
yield ret
def get_epoch_outputs(self):
return {'examples': self._reader.get_examples(self._phase),
'features': self._reader.get_features(self._phase)}
@property
def num_examples(self):
return self._reader.get_num_examples(phase=self._phase)
@property
def num_epochs(self):
return self._num_epochs
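label_map_config is passed straight through to the underlying SequenceLabelReader; a plausible setup is a JSON file mapping each IOB tag to an integer id, though the exact file format is an assumption here:
import json
# Assumed label map file: IOB tag -> id (illustrative, not taken from the diff)
label_map = {"B-PER": 0, "I-PER": 1, "B-ORG": 2, "I-ORG": 3, "B-LOC": 4, "I-LOC": 5, "O": 6}
with open('label_map.json', 'w') as f:
    json.dump(label_map, f)
# ner_reader = SequenceLabelReader('vocab.txt', max_len=128,
#                                  label_map_config='label_map.json', phase='train')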
......@@ -19,57 +19,76 @@ from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3, dev_count=1):
"""
Add masks to batch_tokens; return the masked tokens, mask_label and mask_pos.
Note: mask_pos refers to positions in batch_tokens after padding.
"""
max_len = max([len(sent) for sent in batch_tokens])
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so [low=1]
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
prob_index += pre_sent_len
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index]
if prob > 0.15:
continue
elif 0.03 < prob <= 0.15:
# mask
if token != SEP and token != CLS:
multidev_batch_tokens = []
multidev_mask_label = []
multidev_mask_pos = []
big_batch_tokens = batch_tokens
stride = len(batch_tokens) // dev_count
if stride == 0:
return None, None, None
p = stride
for i in range(dev_count):
batch_tokens = big_batch_tokens[p-stride:p]
p += stride
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so [low=1]
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
prob_index += pre_sent_len
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index]
if prob > 0.15:
continue
elif 0.03 < prob <= 0.15:
# mask
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03:
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
# ensure that at least one word in each sentence is masked
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03:
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
# ensure that at least one word in each sentence is masked
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1])
return batch_tokens, mask_label, mask_pos
mask_label = np.array(mask_label).astype("int64").reshape([-1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1])
multidev_batch_tokens.extend(batch_tokens)
multidev_mask_label.append(mask_label)
multidev_mask_pos.append(mask_pos)
return multidev_batch_tokens, multidev_mask_label, multidev_mask_pos
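The thresholds in the loop above implement the usual BERT-style budget: roughly 15% of tokens are selected, and of those about 80% become [MASK], 10% are replaced by a random id, and 10% keep their original id while still being predicted. A quick standalone check of that split:
import numpy as np
probs = np.random.rand(1000000)
selected = probs <= 0.15
masked   = (probs > 0.03) & (probs <= 0.15)    # -> [MASK]
replaced = (probs > 0.015) & (probs <= 0.03)   # -> random token id
kept     = probs <= 0.015                      # -> original id, still predicted
print(selected.mean())                         # ~0.15
print(masked.mean() / selected.mean(),         # ~0.80
      replaced.mean() / selected.mean(),       # ~0.10
      kept.mean() / selected.mean())           # ~0.10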
def prepare_batch_data(insts,
......@@ -83,7 +102,8 @@ def prepare_batch_data(insts,
task_id=0,
return_input_mask=True,
return_max_len=True,
return_num_token=False):
return_num_token=False,
dev_count=1):
"""
1. generate Tensor of data
2. generate Tensor of position
......@@ -101,7 +121,8 @@ def prepare_batch_data(insts,
vocab_size=voc_size,
CLS=cls_id,
SEP=sep_id,
MASK=mask_id)
MASK=mask_id,
dev_count=dev_count)
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out,
......@@ -125,7 +146,7 @@ def prepare_batch_data(insts,
return_list = [
src_id, pos_id, sent_id, self_input_mask, task_ids, mask_label, mask_pos
]
return return_list if len(return_list) > 1 else return_list[0]
return return_list
def pad_batch_data(insts,
......
......@@ -71,10 +71,10 @@ class TaskInstance(object):
self._train_finish = False
# holds the dataset reader for each run phase (train, eval, pred); key is the phase, value is a Reader instance
self._reader = {'train': None, 'eval': None, 'pred': None}
self._reader = {'train': None, 'eval': None, 'predict': None}
self._input_layer = None
self._inputname_to_varname = {}
self._task_layer = {'train': None, 'eval': None, 'pred': None}
self._task_layer = {'train': None, 'eval': None, 'predict': None}
self._pred_input_name_list = []
self._pred_input_varname_list = []
self._pred_fetch_name_list = []
......@@ -90,7 +90,7 @@ class TaskInstance(object):
def build_task_layer(self, net_inputs, phase, scope=""):
output_vars = self._task_layer[phase].build(net_inputs, scope_name=scope)
if phase == 'pred':
if phase == 'predict':
if output_vars is not None:
self._pred_fetch_name_list, self._pred_fetch_var_list = zip(*output_vars.items())
else:
......
......@@ -3,6 +3,7 @@ import os
import json
import yaml
from config_helper import PDConfig
import logging
from paddle import fluid
def get_basename(f):
......
......@@ -16,6 +16,7 @@
import os
import sys
import random
import logging
import numpy as np
import paddle
from paddle import fluid
......