PaddlePaddle / hapi

Commit 48d8a390
Authored May 11, 2020 by guosheng

Add api docs for TransformerEncoder.

Parent: cf752eba

Showing 1 changed file with 65 additions and 43 deletions (+65 / -43)

hapi/text/text.py (+65 / -43)
@@ -2367,10 +2367,20 @@ class TransformerCell(Layer):

             import paddle
             import paddle.fluid as fluid
+            from paddle.fluid.dygraph import Embedding
             from paddle.incubate.hapi.text import TransformerCell
             from paddle.incubate.hapi.text import TransformerBeamSearchDecoder

-            embedder = Embedding(size=[1000, 128])
+            class Embedder(fluid.dygraph.Layer):
+                def __init__(self):
+                    self.word_embedder = Embedding(size=[1000, 128])
+                    self.pos_embedder = Embedding(size=[500, 128])
+
+                def forward(self, inputs):
+                    word, position = inputs
+                    return self.word_embedder(word) + self.pos_embedder(position)
+
+            embedder = Embedder()
             output_layer = Linear(128, 1000)
             decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
             transformer_cell = TransformerCell(decoder, embedder, output_layer)
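Note: the updated example replaces the bare `Embedding` with an `Embedder` layer that sums word and position embeddings, which is what `TransformerCell` expects from its embedder. A self-contained sketch of the same pattern follows; the `super(Embedder, self).__init__()` call, the `Linear`/`TransformerDecoder` imports, and the dygraph guard are additions assumed here so the snippet constructs cleanly, not part of the committed docstring.

    import paddle.fluid as fluid
    from paddle.fluid.dygraph import Embedding, Linear
    from paddle.incubate.hapi.text import TransformerDecoder, TransformerCell

    class Embedder(fluid.dygraph.Layer):
        def __init__(self):
            super(Embedder, self).__init__()
            # word and position embeddings share the model width d_model=128
            self.word_embedder = Embedding(size=[1000, 128])
            self.pos_embedder = Embedding(size=[500, 128])

        def forward(self, inputs):
            word, position = inputs
            return self.word_embedder(word) + self.pos_embedder(position)

    with fluid.dygraph.guard():
        embedder = Embedder()
        output_layer = Linear(128, 1000)  # project d_model=128 to a 1000-word vocab
        decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
        transformer_cell = TransformerCell(decoder, embedder, output_layer)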
@@ -2392,7 +2402,7 @@ class TransformerCell(Layer):

             enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
                 enc_output, beam_size=4)
             trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
-                trg_src_attn_bias, self.beam_size)
+                trg_src_attn_bias, beam_size=4)
             static_caches = decoder.prepare_static_cache(enc_output)
             outputs = dynamic_decoder(
                 inits=caches,
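Note: the example is also fixed to pass `beam_size=4` instead of the undefined `self.beam_size`. For context, `tile_beam_merge_with_batch` repeats each batch entry `beam_size` times and folds the beam dimension into the batch dimension, so encoder-side tensors line up with the beam-expanded decoder batch. A rough numpy equivalent of that reshaping (an illustration only, not the Paddle implementation):

    import numpy as np

    def tile_beam_merge_with_batch_np(x, beam_size):
        # [batch, ...] -> [batch, 1, ...] -> [batch, beam, ...] -> [batch * beam, ...]
        x = np.expand_dims(x, axis=1)
        x = np.tile(x, [1, beam_size] + [1] * (x.ndim - 2))
        return x.reshape((-1,) + x.shape[2:])

    enc_output = np.random.rand(2, 4, 128).astype("float32")
    print(tile_beam_merge_with_batch_np(enc_output, beam_size=4).shape)  # (8, 4, 128)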
@@ -2517,10 +2527,20 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):

             import paddle
             import paddle.fluid as fluid
+            from paddle.fluid.dygraph import Embedding
             from paddle.incubate.hapi.text import TransformerCell
             from paddle.incubate.hapi.text import TransformerBeamSearchDecoder

-            embedder = Embedding(size=[1000, 128])
+            class Embedder(fluid.dygraph.Layer):
+                def __init__(self):
+                    self.word_embedder = Embedding(size=[1000, 128])
+                    self.pos_embedder = Embedding(size=[500, 128])
+
+                def forward(self, inputs):
+                    word, position = inputs
+                    return self.word_embedder(word) + self.pos_embedder(position)
+
+            embedder = Embedder()
             output_layer = Linear(128, 1000)
             decoder = TransformerDecoder(2, 2, 64, 64, 128, 512)
             transformer_cell = TransformerCell(decoder, embedder, output_layer)
@@ -2542,7 +2562,7 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):

             enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
                 enc_output, beam_size=4)
             trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
-                trg_src_attn_bias, self.beam_size)
+                trg_src_attn_bias, beam_size=4)
             static_caches = decoder.prepare_static_cache(enc_output)
             outputs = dynamic_decoder(
                 inits=caches,
@@ -2944,53 +2964,33 @@ class TransformerEncoder(Layer):
     """
     TransformerEncoder is a stack of N encoder layers.
-    Applies a stacked multi-layer gated recurrent unit (GRU) RNN to an input
-    sequence.

     Parameters:
         n_layer (int): The number of encoder layers to be stacked.
-        n_head (int): The number of heads in the multi-head attention(MHA).
-        d_key (int): The number of heads in the multi-head attention. Mostly .
-        d_value (int): The number of heads in the multiheadattention.
+        n_head (int): The number of heads in multi-head attention(MHA).
+        d_key (int): The feature size to transformer queries and keys as in
+            multi-head attention. Mostly it equals to `d_model // n_head`.
+        d_value (int): The feature size to transformer values as in multi-head
+            attention. Mostly it equals to `d_model // n_head`.
         d_model (int): The expected feature size in the input and output.
         d_inner_hid (int): The hidden layer size in the feedforward network(FFN).
         prepostprocess_dropout (float, optional): The dropout probability used
             in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1
         attention_dropout (float, optional): The dropout probability used
             in MHA to drop some attention target. Default 0.1
-        relu_dropout (float, optional): The dropout probability used in FFN
-            in MHA to drop some attention target. Default 0.1
+        relu_dropout (float, optional): The dropout probability used after FFN
+            activition. Default 0.1
         preprocess_cmd (str, optional): The process applied before each MHA and
-            FFN sub-layer, and it also would be applied. It should be a string
-            that includes `d`, `a`, `n` as , where `d` for dropout, `a` for add
-            residual connection, `n` for layer normalization.
-            network. Default `n`.
+            FFN sub-layer, and it also would be applied on output of the last
+            stacked layer. It should be a string composed of `d`, `a`, `n`,
+            where `d` for dropout, `a` for add residual connection, `n` for
+            layer normalization. Default `n`.
+        postprocess_cmd (str, optional): The process applied after each MHA and
+            FFN sub-layer. Same as `preprocess_cmd`. It should be a string
+            composed of `d`, `a`, `n`, where `d` for dropout, `a` for add
+            residual connection, `n` for layer normalization. Default `da`.
         ffn_fc1_act (str, optional): The activation function in the feedforward
             network. Default relu.
-        dropout(float|list|tuple, optional): The dropout probability after each
-            GRU. It also can be a list or tuple, including dropout probabilities
-            for the corresponding GRU. Default 0.0
-        is_reverse (bool, optional): Indicate whether to calculate in the reverse
-            order of input sequences. Default: `False`.
-        time_major (bool, optional): Indicate the data layout of Tensor included
-            in `input` and `output` tensors. If `False`, the data layout would
-            be batch major with shape `[batch_size, sequence_length, ...]`. If
-            `True`, the data layout would be time major with shape
-            `[sequence_length, batch_size, ...]`. Default: `False`.
-        param_attr (list|tuple|ParamAttr): A list, tuple or something can be
-            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
-            a list or tuple, it's length must equal to `num_layers`. Otherwise,
-            construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`.
-            Default None.
-        bias_attr (list|tuple|ParamAttr): A list, tuple or something can be
-            converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is
-            a list or tuple, it's length must equal to `num_layers`. Otherwise,
-            construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`.
-            Default None.
-        dtype(string, optional): The data type used in this cell. It can be
-            float32 or float64. Default float32.

     Examples:

         .. code-block:: python
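Note: `preprocess_cmd` / `postprocess_cmd` are command strings interpreted character by character by `PrePostProcessLayer` (see the last hunk below). A rough numpy illustration of that idea, with the dropout rate and epsilon as illustrative assumptions rather than the layer's actual implementation:

    import numpy as np

    def apply_cmd(x, residual, cmd, dropout_rate=0.1, training=False):
        # `d` -> dropout, `a` -> add residual connection, `n` -> layer normalization
        for ch in cmd:
            if ch == "d" and training:
                mask = (np.random.rand(*x.shape) >= dropout_rate).astype(x.dtype)
                x = x * mask / (1.0 - dropout_rate)
            elif ch == "a" and residual is not None:
                x = x + residual
            elif ch == "n":
                mean = x.mean(axis=-1, keepdims=True)
                std = x.std(axis=-1, keepdims=True)
                x = (x - mean) / (std + 1e-5)
        return x

    x = np.random.rand(2, 4, 128).astype("float32")
    residual = np.random.rand(2, 4, 128).astype("float32")
    pre = apply_cmd(x, None, "n")        # preprocess_cmd="n": normalize before the sub-layer
    post = apply_cmd(x, residual, "da")  # postprocess_cmd="da": dropout, then add residual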
@@ -2999,9 +2999,12 @@ class TransformerEncoder(Layer):

             import paddle.fluid as fluid
             from paddle.incubate.hapi.text import TransformerEncoder

-            inputs = paddle.rand((2, 4, 32))
-            gru = TransformerEncoder(n_layers=2, input_size=32, hidden_size=64,)
-            outputs, _ = gru(inputs) # [2, 4, 32]
+            # encoder input: [batch_size, src_len, d_model]
+            enc_input = paddle.rand((2, 4, 32))
+            # self attention bias: [batch_size, n_head, src_len, src_len]
+            attn_bias = paddle.rand((2, 2, 4, 4))
+            encoder = TransformerEncoder(2, 2, 64, 64, 128, 512)
+            enc_output = encoder(inputs, attn_bias) # [2, 4, 32]
     """

     def __init__(self,
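Note: the new example still passes `inputs` (a leftover name) to `encoder(...)` and pairs a 32-wide input with `d_model=128`; presumably `enc_input` and matching feature sizes are intended. A corrected, self-contained sketch under that assumption (the `fluid.enable_dygraph()` call is also an assumption, since the hunk omits the surrounding setup):

    import paddle
    import paddle.fluid as fluid
    from paddle.incubate.hapi.text import TransformerEncoder

    fluid.enable_dygraph()

    # encoder input: [batch_size, src_len, d_model]
    enc_input = paddle.rand((2, 4, 128))
    # self attention bias: [batch_size, n_head, src_len, src_len]
    attn_bias = paddle.rand((2, 2, 4, 4))

    # n_layer=2, n_head=2, d_key=64, d_value=64, d_model=128, d_inner_hid=512
    encoder = TransformerEncoder(2, 2, 64, 64, 128, 512)
    enc_output = encoder(enc_input, attn_bias)  # same shape as enc_input: [2, 4, 128]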
@@ -3040,7 +3043,26 @@ class TransformerEncoder(Layer):
         self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
                                              prepostprocess_dropout)

-    def forward(self, enc_input, attn_bias):
+    def forward(self, enc_input, attn_bias=None):
+        """
+        Applies a stack of N Transformer encoder layers on input sequences.
+
+        Parameters:
+            enc_input (Variable): The input of Transformer encoder. It is a tensor
+                with shape `[batch_size, sequence_length, d_model]`. The data
+                type should be float32 or float64.
+            attn_bias(Variable, optional): A tensor used in encoder self attention
+                to mask out attention on unwanted positions, usually the paddings. It
+                is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`,
+                where the unwanted positions have `-INF` values and the others
+                have 0 values. It can be None for inference. The data type should
+                be float32 or float64. It can be None when nothing wanted to be
+                masked out. Default None
+
+        Returns:
+            Variable: The output of Transformer encoder. It is a tensor that has \
+                the same shape and data type as `enc_input`.
+        """
         for encoder_layer in self.encoder_layers:
             enc_output = encoder_layer(enc_input, attn_bias)
             enc_input = enc_output
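Note: per the new `forward` docstring, `attn_bias` marks padded positions with `-INF` (in practice a large negative number) and everything else with 0, broadcast over heads and query positions. A numpy sketch of building such a bias from per-example source lengths (the lengths and the `-1e9` stand-in for `-INF` are illustrative assumptions):

    import numpy as np

    batch_size, n_head, src_len = 2, 2, 4
    src_lens = [4, 2]  # hypothetical valid lengths; positions beyond them are padding

    attn_bias = np.zeros((batch_size, n_head, src_len, src_len), dtype="float32")
    for i, length in enumerate(src_lens):
        attn_bias[i, :, :, length:] = -1e9  # mask attention to padded key positions

    # attn_bias can then be converted with fluid.dygraph.to_variable and passed to
    # TransformerEncoder.forward as shown in the docstring example above.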