Unverified commit 2fa3e51a, authored by 0YuanZhang0, committed by GitHub

fix_python3_bug (#3461)

Parent 313d0666
@@ -9,7 +9,7 @@ Model ensemble can improve the generalization of MRC models. However, such appro
- Python >= 2.7
- cuda >= 9.0
- cudnn >= 7.0
- PaddlePaddle >= 1.6 Please refer to the [Installation Guide](http://www.paddlepaddle.org/#quick-start)
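For reference, a GPU build of PaddlePaddle can usually be installed with pip; the exact package and version spec below are only an example and depend on your CUDA/cuDNN setup, so follow the Installation Guide above when in doubt:
```
# example only: pick the 1.6.x build (or .post variant) matching your CUDA/cuDNN versions
pip install paddlepaddle-gpu==1.6.1
```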
### Data and Models Preparation
Users can directly download the data and the trained knowledge_distillation models we provide:
...
@@ -18,6 +18,7 @@ from __future__ import division
from __future__ import print_function
from functools import partial
from functools import reduce
import numpy as np
import paddle.fluid as fluid
...
@@ -15,7 +15,7 @@ PALM user guide: [README.md](https://github.com/PaddlePaddle/PALM/blob/master/RE
- Python >= 2.7
- cuda >= 9.0
- cudnn >= 7.0
- PaddlePaddle >= 1.6 Please refer to the [Installation Guide](http://www.paddlepaddle.org/#quick-start)
### Data Preparation
#### Get data directly:
...
@@ -12,6 +12,17 @@ bash wget_server_inference_model.sh
```
## Start server
We can choose the GPU card for the bert server or the xlnet server by setting the environment variable CUDA_VISIBLE_DEVICES:
```
export CUDA_VISIBLE_DEVICES=1
```
In the main_server.py file we set the server ports for the bert and xlnet models, as shown below. If port 5118 or 5120 is occupied, please switch to an idle port.
```
url_1 = 'http://127.0.0.1:5118' # url for model1
url_2 = 'http://127.0.0.1:5120' # url for model2
```
Start the server:
```
bash start.sh
```
cd bert_server
export CUDA_VISIBLE_DEVICES=1
sh start.sh
cd ../xlnet_server
export CUDA_VISIBLE_DEVICES=2
sh serve.sh
cd ..
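If you prefer to launch the two backend servers by hand, a minimal sketch (assuming the bert_server/ and xlnet_server/ directories above) is to pin each one to its own card via CUDA_VISIBLE_DEVICES; the card ids here are examples:
```
# sketch: launch each backend server on its own GPU card
cd bert_server && CUDA_VISIBLE_DEVICES=0 sh start.sh
cd ../xlnet_server && CUDA_VISIBLE_DEVICES=1 sh serve.sh
cd ..
```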
...
@@ -41,144 +41,148 @@ def positional_embedding(pos_seq, inv_freq, bsz=None):
def positionwise_ffn(inp, d_model, d_inner, dropout_prob, param_initializer=None,
                     act_type='relu', name='ff'):
    """Position-wise Feed-forward Network."""
    if act_type not in ['relu', 'gelu']:
        raise ValueError('Unsupported activation type {}'.format(act_type))

    output = fluid.layers.fc(input=inp, size=d_inner, act=act_type,
                num_flatten_dims=2,
                param_attr=fluid.ParamAttr(
                    name=name+'_layer_1_weight', initializer=param_initializer),
                bias_attr=name+'_layer_1_bias')
    output = fluid.layers.dropout(output, dropout_prob=dropout_prob,
                dropout_implementation="upscale_in_train", is_test=False)
    output = fluid.layers.fc(output, size=d_model,
                num_flatten_dims=2,
                param_attr=fluid.ParamAttr(
                    name=name+'_layer_2_weight', initializer=param_initializer),
                bias_attr=name+'_layer_2_bias')
    output = fluid.layers.dropout(output, dropout_prob=dropout_prob,
                dropout_implementation="upscale_in_train", is_test=False)
    output = fluid.layers.layer_norm(output + inp, begin_norm_axis=len(output.shape)-1,
                epsilon=1e-12,
                param_attr=fluid.ParamAttr(name=name+'_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(name+'_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
    return output
def head_projection(h, d_model, n_head, d_head, param_initializer, name=''):
    """Project hidden states to a specific head with a 4D-shape."""
    proj_weight = fluid.layers.create_parameter(
        shape=[d_model, n_head, d_head],
        dtype=h.dtype,
        attr=fluid.ParamAttr(name=name+'_weight', initializer=param_initializer),
        is_bias=False)

    # ibh,hnd->ibnd
    head = fluid.layers.mul(x=h, y=proj_weight, x_num_col_dims=2, y_num_col_dims=1)
    return head
def post_attention(h, attn_vec, d_model, n_head, d_head, dropout,
                   param_initializer, residual=True, name=''):
    """Post-attention processing."""
    # post-attention projection (back to `d_model`)
    proj_o = fluid.layers.create_parameter(
        shape=[d_model, n_head, d_head],
        dtype=h.dtype,
        attr=fluid.ParamAttr(name=name+'_o_weight', initializer=param_initializer),
        is_bias=False)

    # ibnd,hnd->ibh
    proj_o = fluid.layers.transpose(proj_o, perm=[1, 2, 0])
    attn_out = fluid.layers.mul(x=attn_vec, y=proj_o, x_num_col_dims=2, y_num_col_dims=2)

    attn_out = fluid.layers.dropout(attn_out, dropout_prob=dropout,
                dropout_implementation="upscale_in_train", is_test=False)
    if residual:
        output = fluid.layers.layer_norm(attn_out + h, begin_norm_axis=len(attn_out.shape)-1,
                    epsilon=1e-12,
                    param_attr=fluid.ParamAttr(name=name+'_layer_norm_scale',
                        initializer=fluid.initializer.Constant(1.)),
                    bias_attr=fluid.ParamAttr(name+'_layer_norm_bias',
                        initializer=fluid.initializer.Constant(0.)))
    else:
        output = fluid.layers.layer_norm(attn_out, begin_norm_axis=len(attn_out.shape)-1,
                    epsilon=1e-12,
                    param_attr=fluid.ParamAttr(name=name+'_layer_norm_scale',
                        initializer=fluid.initializer.Constant(1.)),
                    bias_attr=fluid.ParamAttr(name+'_layer_norm_bias',
                        initializer=fluid.initializer.Constant(0.)))

    return output
def abs_attn_core(q_head, k_head, v_head, attn_mask, dropatt, scale):
    """Core absolute positional attention operations."""
    attn_score = einsum4x4('ibnd,jbnd->ijbn', q_head, k_head)
    attn_score *= scale
    if attn_mask is not None:
        attn_score = attn_score - 1e30 * attn_mask

    # attention probability
    attn_prob = fluid.layers.softmax(attn_score, axis=1)
    attn_prob = fluid.layers.dropout(attn_prob, dropout_prob=dropatt,
                dropout_implementation="upscale_in_train", is_test=False)

    # attention output
    attn_vec = einsum4x4('ijbn,jbnd->ibnd', attn_prob, v_head)

    return attn_vec
def rel_attn_core(q_head, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat,
                  r_w_bias, r_r_bias, r_s_bias, attn_mask, dropatt,
                  scale):
    """Core relative positional attention operations."""
    ## content based attention score
    ac = einsum4x4('ibnd,jbnd->ijbn', fluid.layers.elementwise_add(q_head, r_w_bias, 2), k_head_h)

    # position based attention score
    bd = einsum4x4('ibnd,jbnd->ijbn', fluid.layers.elementwise_add(q_head, r_r_bias, 2), k_head_r)
    #klen = fluid.layers.slice(fluid.layers.shape(ac), axes=[0], starts=[1], ends=[2])
    bd = rel_shift(bd, klen=ac.shape[1])

    # segment based attention score
    if seg_mat is None:
        ef = 0
    else:
        ef = 0
        """
        bsz = fluid.layers.slice(fluid.layers.shape(q_head), axes=[0], starts=[1], ends=[2])
        bsz.stop_gradient = True
        """
        #seg_embed = fluid.layers.unsqueeze(input=seg_embed, axes=[0])
        seg_embed = fluid.layers.stack([seg_embed]*q_head.shape[0], axis=0)
        ef = einsum4x4('ibnd,isnd->ibns', fluid.layers.elementwise_add(q_head, r_s_bias, 2), seg_embed)
        ef = einsum4x4('ijbs,ibns->ijbn', seg_mat, ef)

    # merge attention scores and perform masking
    attn_score = (ac + bd + ef) * scale
    if attn_mask is not None:
        # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask
        attn_score = attn_score - 1e30 * attn_mask

    # attention probability
    #attn_prob = fluid.layers.softmax(attn_score, axis=1)
    attn_score = fluid.layers.transpose(attn_score, [0, 2, 3, 1])
    attn_prob = fluid.layers.softmax(attn_score)
    attn_prob = fluid.layers.transpose(attn_prob, [0, 3, 1, 2])
    attn_prob = fluid.layers.dropout(attn_prob, dropatt,
                dropout_implementation="upscale_in_train")

    # attention output
    attn_vec = einsum4x4('ijbn,jbnd->ibnd', attn_prob, v_head_h)

    return attn_vec
def rel_shift(x, klen=-1):
    """perform relative shift to form the relative attention score."""
...
@@ -192,67 +196,69 @@ def rel_shift(x, klen=-1):
def _cache_mem(curr_out, prev_mem, mem_len, reuse_len=None):
    """cache hidden states into memory."""
    if mem_len is None or mem_len == 0:
        return None
    else:
        if reuse_len is not None and reuse_len > 0:
            curr_out = curr_out[:reuse_len]

        if prev_mem is None:
            new_mem = curr_out[-mem_len:]
        else:
            # concatenate previous memory with the current output along the time axis
            new_mem = fluid.layers.concat([prev_mem, curr_out], axis=0)[-mem_len:]

        new_mem.stop_gradient = True
        return new_mem
def relative_positional_encoding(qlen, klen, d_model, clamp_len, attn_type,
                                 bi_data, bsz=None, dtype=None):
    """create relative positional encoding."""
    freq_seq = fluid.layers.range(0, d_model, 2.0, 'float32')
    if dtype is not None and dtype != 'float32':
        freq_seq = fluid.layers.cast(freq_seq, dtype=dtype)
    inv_freq = 1 / (10000 ** (freq_seq / d_model))

    if attn_type == 'bi':
        beg, end = klen, -qlen
    elif attn_type == 'uni':
        beg, end = klen, -1
    else:
        raise ValueError('Unknown `attn_type` {}.'.format(attn_type))

    if bi_data:
        fwd_pos_seq = fluid.layers.range(beg, end, -1.0, 'float32')
        bwd_pos_seq = fluid.layers.range(-beg, -end, 1.0, 'float32')

        if dtype is not None and dtype != 'float32':
            fwd_pos_seq = fluid.layers.cast(fwd_pos_seq, dtype='float32')
            bwd_pos_seq = fluid.layers.cast(bwd_pos_seq, dtype='float32')

        if clamp_len > 0:
            fwd_pos_seq = fluid.layers.clip(fwd_pos_seq, -clamp_len, clamp_len)
            bwd_pos_seq = fluid.layers.clip(bwd_pos_seq, -clamp_len, clamp_len)

        if bsz is not None:
            # With bi_data, the batch size should be divisible by 2.
            assert bsz % 2 == 0
            fwd_pos_emb = positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
            bwd_pos_emb = positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
        else:
            fwd_pos_emb = positional_embedding(fwd_pos_seq, inv_freq)
            bwd_pos_emb = positional_embedding(bwd_pos_seq, inv_freq)

        pos_emb = fluid.layers.concat([fwd_pos_emb, bwd_pos_emb], axis=1)
    else:
        fwd_pos_seq = fluid.layers.range(beg, end, -1.0, 'float32')
        if dtype is not None and dtype != 'float32':
            fwd_pos_seq = fluid.layers.cast(fwd_pos_seq, dtype=dtype)
        if clamp_len > 0:
            fwd_pos_seq = fluid.layers.clip(fwd_pos_seq, -clamp_len, clamp_len)
        pos_emb = positional_embedding(fwd_pos_seq, inv_freq, bsz)

    fluid.layers.reshape(pos_emb, [2*qlen, -1, d_model], inplace=True)
    return pos_emb
def rel_multihead_attn(h, r, r_w_bias, r_r_bias, seg_mat, r_s_bias, seg_embed,
                       attn_mask, mems, d_model, n_head, d_head, dropout,
@@ -299,58 +305,58 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
                   use_fp16=False, name='', **kwargs):
    """
    Defines a Transformer-XL computation graph with additional
    support for XLNet.

    Args:
    inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
    seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
    input_mask: float32 Tensor in shape [len, bsz], the input mask.
        0 for real tokens and 1 for padding.
    mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
        from previous batches. The length of the list equals n_layer.
        If None, no memory is used.
    perm_mask: float32 Tensor in shape [len, len, bsz].
        If perm_mask[i, j, k] = 0, i attends to j in batch k;
        if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
        If None, each position attends to all the others.
    target_mapping: float32 Tensor in shape [num_predict, len, bsz].
        If target_mapping[i, j, k] = 1, the i-th prediction in batch k is
        on the j-th token.
        Only used during pretraining for partial prediction.
        Set to None during finetuning.
    inp_q: float32 Tensor in shape [len, bsz].
        1 for tokens with losses and 0 for tokens without losses.
        Only used during pretraining for two-stream attention.
        Set to None during finetuning.
    n_layer: int, the number of layers.
    d_model: int, the hidden size.
    n_head: int, the number of attention heads.
    d_head: int, the dimension size of each attention head.
    d_inner: int, the hidden size in feed-forward layers.
    ff_activation: str, "relu" or "gelu".
    untie_r: bool, whether to untie the biases in attention.
    n_token: int, the vocab size.
    is_training: bool, whether in training mode.
    use_tpu: bool, whether TPUs are used.
    use_fp16: bool, use float16 instead of float32.
    dropout: float, dropout rate.
    dropatt: float, dropout rate on attention probabilities.
    init: str, the initialization scheme, either "normal" or "uniform".
    init_range: float, initialize the parameters with a uniform distribution
        in [-init_range, init_range]. Only effective when init="uniform".
    init_std: float, initialize the parameters with a normal distribution
        with mean 0 and stddev init_std. Only effective when init="normal".
    mem_len: int, the number of tokens to cache.
    reuse_len: int, the number of tokens in the current batch to be cached
        and reused in the future.
    bi_data: bool, whether to use bidirectional input pipeline.
        Usually set to True during pretraining and False during finetuning.
    clamp_len: int, clamp all relative distances larger than clamp_len.
        -1 means no clamping.
    same_length: bool, whether to use the same attention length for each token.
    summary_type: str, "last", "first", "mean", or "attn". The method
        to pool the input to get a vector representation.
    initializer: A tf initializer.
    scope: scope name for the computation graph.
    """
    print('memory input {}'.format(mems))
    data_type = "float16" if use_fp16 else "float32"
@@ -365,7 +371,7 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
    # causal attention mask
    if attn_type == 'uni':
        attn_mask = fluid.layers.create_global_var(
            name='attn_mask',
            shape=[qlen, klen, 1, 1],
            value=0.0,
            dtype=data_type, persistable=True)
@@ -413,21 +419,21 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
        non_tgt_mask = None

    if untie_r:
        r_w_bias = fluid.layers.create_parameter(shape=[n_layer, n_head, d_head], dtype=data_type,
                        attr=fluid.ParamAttr(name=name+'_r_w_bias', initializer=initializer),
                        is_bias=True)
        r_w_bias = [fluid.layers.slice(r_w_bias, axes=[0], starts=[i], ends=[i+1]) for i in range(n_layer)]
        r_w_bias = [fluid.layers.squeeze(r_w_bias[i], axes=[0]) for i in range(n_layer)]
        r_r_bias = fluid.layers.create_parameter(shape=[n_layer, n_head, d_head], dtype=data_type,
                        attr=fluid.ParamAttr(name=name+'_r_r_bias', initializer=initializer),
                        is_bias=True)
        r_r_bias = [fluid.layers.slice(r_r_bias, axes=[0], starts=[i], ends=[i+1]) for i in range(n_layer)]
        r_r_bias = [fluid.layers.squeeze(r_r_bias[i], axes=[0]) for i in range(n_layer)]
    else:
        r_w_bias = fluid.layers.create_parameter(shape=[n_head, d_head], dtype=data_type,
                        attr=fluid.ParamAttr(name=name+'_r_w_bias', initializer=initializer),
                        is_bias=True)
        r_r_bias = fluid.layers.create_parameter(shape=[n_head, d_head], dtype=data_type,
                        attr=fluid.ParamAttr(name=name+'_r_r_bias', initializer=initializer),
                        is_bias=True)
@@ -442,28 +448,28 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
        param_attr=fluid.ParamAttr(name=name+'_word_embedding', initializer=initializer))

    if inp_q is not None:
        pass
    output_h = fluid.layers.dropout(word_emb_k, dropout_prob=dropout,
                    dropout_implementation="upscale_in_train")

    if inp_q is not None:
        pass

    if seg_id is not None:
        if untie_r:
            r_s_bias = fluid.layers.create_parameter(shape=[n_layer, n_head, d_head], dtype=data_type,
                            attr=fluid.ParamAttr(name=name+'_r_s_bias', initializer=initializer),
                            is_bias=True)
            r_s_bias = [fluid.layers.slice(r_s_bias, axes=[0], starts=[i], ends=[i+1]) for i in range(n_layer)]
            r_s_bias = [fluid.layers.squeeze(r_s_bias[i], axes=[0]) for i in range(n_layer)]
        else:
            r_s_bias = fluid.layers.create_parameter(shape=[n_head, d_head], dtype=data_type,
                            attr=fluid.ParamAttr(name=name+'_r_s_bias', initializer=initializer),
                            is_bias=True)

        seg_embed = fluid.layers.create_parameter(shape=[n_layer, 2, n_head, d_head],
                        dtype=data_type, attr=fluid.ParamAttr(name=name+'_seg_embed',
                        initializer=initializer))
        seg_embed = [fluid.layers.slice(seg_embed, axes=[0], starts=[i], ends=[i+1]) for i in range(n_layer)]
        seg_embed = [fluid.layers.squeeze(seg_embed[i], axes=[0]) for i in range(n_layer)]
@@ -497,7 +503,7 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
    pos_emb.stop_gradient = True

    ##### Attention layers
    if mems is None:
        mems = [None] * n_layer
    for i in range(n_layer):
        # cache new mems
        #new_mems.append(_cache_mem(output_h, mems[i], mem_len, reuse_len))
...
@@ -548,6 +554,7 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
        new_mems = None

    return output, new_mems, lookup_table
def lm_loss(hidden, target, n_token, d_model, initializer, lookup_table=None,
            tie_weight=False, bi_data=True):
@@ -578,53 +585,54 @@ def summarize_sequence(summary_type, hidden, d_model, n_head, d_head, dropout,
                       dropatt, input_mask, is_training, initializer,
                       scope=None, reuse=None, use_proj=True):
    """
    Different classification tasks may or may not share the same parameters
    to summarize the sequence features.

    If shared, one can keep the `scope` to the default value `None`.
    Otherwise, one should specify a different `scope` for each task.
    """

    with tf.variable_scope(scope, 'sequnece_summary', reuse=reuse):
        if summary_type == 'last':
            summary = hidden[-1]
        elif summary_type == 'first':
            summary = hidden[0]
        elif summary_type == 'mean':
            summary = tf.reduce_mean(hidden, axis=0)
        elif summary_type == 'attn':
            bsz = tf.shape(hidden)[1]

            summary_bias = tf.get_variable('summary_bias', [d_model],
                                           dtype=hidden.dtype,
                                           initializer=initializer)
            summary_bias = tf.tile(summary_bias[None, None], [1, bsz, 1])

            if input_mask is not None:
                input_mask = input_mask[None, :, :, None]

            summary = multihead_attn(summary_bias, hidden, hidden, input_mask,
                                     d_model, n_head, d_head, dropout, dropatt,
                                     is_training, initializer, residual=False)
            summary = summary[0]
        else:
            raise ValueError('Unsupported summary type {}'.format(summary_type))

        # use another projection as in BERT
        if use_proj:
            summary = tf.layers.dense(
                summary,
                d_model,
                activation=tf.tanh,
                initializer=initializer,
                name='summary')

        # dropout
        summary = tf.layers.dropout(
            summary, dropout, training=is_training,
            name='dropout')

    return summary
def classification_loss(hidden, labels, n_class, initializer, name, reuse=None,
                        return_logits=False):
...
@@ -641,10 +649,10 @@ def classification_loss(hidden, labels, n_class, initializer, name, reuse=None,
        param_attr=fluid.ParamAttr(name=name+'_logits', initializer=initializer))

    one_hot_target = fluid.layers.one_hot(labels, depth=n_class, dtype=hidden.dtype)
    loss = -fluid.layers.reduce_sum(fluid.layers.log_softmax(logits) * one_hot_target, -1)

    if return_logits:
        return loss, logits

    return loss
@@ -661,6 +669,6 @@ def regression_loss(hidden, labels, initializer, name='transformer',
    loss = fluid.layers.square(logits - labels)

    if return_logits:
        return loss, logits

    return loss
@@ -12,126 +12,126 @@ SPIECE_UNDERLINE = '▁'
def printable_text(text):
    """Returns text encoded in a way suitable for print or `tf.logging`."""

    # These functions want `str` for both Python2 and Python3, but in one case
    # it's a Unicode string and in the other it's a byte string.
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text
        elif isinstance(text, unicode):
            return text.encode("utf-8")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python2 or Python 3?")
def print_(*args):
    new_args = []
    for arg in args:
        if isinstance(arg, list):
            s = [printable_text(i) for i in arg]
            s = ' '.join(s)
            new_args.append(s)
        else:
            new_args.append(printable_text(arg))
    print(*new_args)
def preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False):
    if remove_space:
        outputs = ' '.join(inputs.strip().split())
    else:
        outputs = inputs
    outputs = outputs.replace("``", '"').replace("''", '"')

    if six.PY2 and isinstance(outputs, str):
        outputs = outputs.decode('utf-8')

    if not keep_accents:
        outputs = unicodedata.normalize('NFKD', outputs)
        outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
    if lower:
        outputs = outputs.lower()

    return outputs
def encode_pieces(sp_model, text, return_unicode=True, sample=False):
    # return_unicode is used only for py2

    # note(zhiliny): in some systems, sentencepiece only accepts str for py2
    if six.PY2 and isinstance(text, unicode):
        text = text.encode('utf-8')

    if not sample:
        pieces = sp_model.EncodeAsPieces(text)
    else:
        pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1)
    new_pieces = []
    for piece in pieces:
        if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
            cur_pieces = sp_model.EncodeAsPieces(
                piece[:-1].replace(SPIECE_UNDERLINE, ''))
            if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                if len(cur_pieces[0]) == 1:
                    cur_pieces = cur_pieces[1:]
                else:
                    cur_pieces[0] = cur_pieces[0][1:]
            cur_pieces.append(piece[-1])
            new_pieces.extend(cur_pieces)
        else:
            new_pieces.append(piece)

    # note(zhiliny): convert back to unicode for py2
    if six.PY2 and return_unicode:
        ret_pieces = []
        for piece in new_pieces:
            if isinstance(piece, str):
                piece = piece.decode('utf-8')
            ret_pieces.append(piece)
        new_pieces = ret_pieces

    return new_pieces
def encode_ids(sp_model, text, sample=False):
    pieces = encode_pieces(sp_model, text, return_unicode=False, sample=sample)
    ids = [sp_model.PieceToId(piece) for piece in pieces]
    return ids
if __name__ == '__main__':
    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.load('sp10m.uncased.v3.model')

    print_(u'I was born in 2000, and this is falsé.')
    print_(u'ORIGINAL', sp.EncodeAsPieces(u'I was born in 2000, and this is falsé.'))
    print_(u'OURS', encode_pieces(sp, u'I was born in 2000, and this is falsé.'))
    print(encode_ids(sp, u'I was born in 2000, and this is falsé.'))
    print_('')
    prepro_func = partial(preprocess_text, lower=True)
    print_(prepro_func('I was born in 2000, and this is falsé.'))
    print_('ORIGINAL', sp.EncodeAsPieces(prepro_func('I was born in 2000, and this is falsé.')))
    print_('OURS', encode_pieces(sp, prepro_func('I was born in 2000, and this is falsé.')))
    print(encode_ids(sp, prepro_func('I was born in 2000, and this is falsé.')))
    print_('')
    print_('I was born in 2000, and this is falsé.')
    print_('ORIGINAL', sp.EncodeAsPieces('I was born in 2000, and this is falsé.'))
    print_('OURS', encode_pieces(sp, 'I was born in 2000, and this is falsé.'))
    print(encode_ids(sp, 'I was born in 2000, and this is falsé.'))
    print_('')
    print_('I was born in 92000, and this is falsé.')
    print_('ORIGINAL', sp.EncodeAsPieces('I was born in 92000, and this is falsé.'))
    print_('OURS', encode_pieces(sp, 'I was born in 92000, and this is falsé.'))
    print(encode_ids(sp, 'I was born in 92000, and this is falsé.'))