Unverified commit 4c5cfdea, authored by liu zhengxi, committed by GitHub

fix paddle.nn.Transformer api (#27391)

Parent commit: d726fd5e
...@@ -474,6 +474,141 @@ class TestTransformer(unittest.TestCase):
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
def test_transformer_attr_1(self):
batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
mode="decoder_layer")
# batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
with fluid.dygraph.guard(fluid.CPUPlace()):
transformer = Transformer(
d_model,
n_head,
dim_feedforward=dim_feedforward,
dropout=dropout,
weight_attr=[None],
bias_attr=[False])
src = paddle.to_variable(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
def test_transformer_attr_2(self):
batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
mode="decoder_layer")
# batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
with fluid.dygraph.guard(fluid.CPUPlace()):
transformer = Transformer(
d_model,
n_head,
dim_feedforward=dim_feedforward,
dropout=dropout,
weight_attr=[None, None],
bias_attr=[False, False])
src = paddle.to_variable(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
def test_transformer_attr_3(self):
batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
mode="decoder_layer")
# batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
with fluid.dygraph.guard(fluid.CPUPlace()):
transformer = Transformer(
d_model,
n_head,
dim_feedforward=dim_feedforward,
dropout=dropout,
weight_attr=[None, None, None],
bias_attr=[False, False, True])
src = paddle.to_variable(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
def test_transformer_attr_boolean(self):
batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
mode="decoder_layer")
# batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
with fluid.dygraph.guard(fluid.CPUPlace()):
transformer = Transformer(
d_model,
n_head,
dim_feedforward=dim_feedforward,
dropout=dropout,
bias_attr=False)
src = paddle.to_variable(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
if __name__ == "__main__":
unittest.main()
...@@ -53,7 +53,22 @@ def _convert_param_attr_to_list(param_attr, n):
if isinstance(param_attr, (list, tuple)):
assert len(param_attr) == n, (
"length of param_attr should be %d when it is a list/tuple" % n)
param_attrs = []
for attr in param_attr:
if isinstance(attr, bool):
if attr:
param_attrs.append(ParamAttr._to_attr(None))
else:
param_attrs.append(False)
else:
param_attrs.append(ParamAttr._to_attr(attr))
# param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]
elif isinstance(param_attr, bool):
param_attrs = []
if param_attr:
param_attrs = [ParamAttr._to_attr(None) for i in range(n)]
else:
param_attrs = [False] * n
else:
param_attrs = []
attr = ParamAttr._to_attr(param_attr)
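The branches added in this hunk let a boolean stand in for a ParamAttr: True falls back to the default parameter attribute, False disables the parameter entirely. Below is a minimal standalone sketch of that semantics only, not the Paddle source; to_attr is a placeholder for ParamAttr._to_attr and the final branch is simplified.

# Standalone sketch of the bool handling added above; `to_attr` is a stand-in
# for ParamAttr._to_attr and simply wraps its input here.
def to_attr(attr):
    return {"param_attr": attr}  # placeholder for ParamAttr._to_attr(attr)

def convert_param_attr_to_list(param_attr, n):
    if isinstance(param_attr, (list, tuple)):
        assert len(param_attr) == n, (
            "length of param_attr should be %d when it is a list/tuple" % n)
        param_attrs = []
        for attr in param_attr:
            if isinstance(attr, bool):
                # True -> default parameter attribute; False -> no trainable parameter
                param_attrs.append(to_attr(None) if attr else False)
            else:
                param_attrs.append(to_attr(attr))
    elif isinstance(param_attr, bool):
        param_attrs = [to_attr(None) for _ in range(n)] if param_attr else [False] * n
    else:
        # simplified: the real helper replicates one converted attribute n times
        param_attrs = [to_attr(param_attr)] * n
    return param_attrs

print(convert_param_attr_to_list(False, 3))          # [False, False, False]
print(convert_param_attr_to_list([None, False], 2))  # [{'param_attr': None}, False]
print(convert_param_attr_to_list(True, 2))           # two default attributes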
...@@ -417,7 +432,7 @@ class TransformerEncoderLayer(Layer):
Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.
Default: None, which means the default weight parameter property is used.
See usage for details in :code:`ParamAttr` .
bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property.
If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.
Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.
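With the documented bool form, a bias-free layer can be requested directly. A hedged usage sketch, assuming a recent Paddle release (the new tests earlier in this diff use the older paddle.to_variable / fluid.dygraph.guard idiom instead of the calls below):

import paddle
from paddle.nn import TransformerEncoderLayer

# bias_attr=False disables the bias parameters of both MHA and the FFN linears.
encoder_layer = TransformerEncoderLayer(
    d_model=128, nhead=2, dim_feedforward=512, bias_attr=False)
src = paddle.rand([2, 4, 128])   # [batch_size, seq_len, d_model]
out = encoder_layer(src)         # same shape as src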
...@@ -986,22 +1001,31 @@ class Transformer(Layer):
Otherwise, no pre-process and post-process includes dropout, residual
connection, layer normalization. Default False
weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
If it is a tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3,
`weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]`
would be used as `weight_attr` for cross attention of `TransformerDecoder`,
and `weight_attr[2]` would be used as `weight_attr` for linear in FFN.
If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention
and cross attention, and `weight_attr[1]` would be used as `weight_attr` for
linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr`
for self attention, cross attention and linear in FFN. Otherwise,
the three sub-layers all use it as `weight_attr` to create parameters.
Default: None, which means the default weight parameter property is used.
See usage for details
in :code:`ParamAttr` .
bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
If it is a tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3,
`bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]`
would be used as `bias_attr` for cross attention of `TransformerDecoder`,
and `bias_attr[2]` would be used as `bias_attr` for linear in FFN.
If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention
and cross attention, and `bias_attr[1]` would be used as `bias_attr` for
linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr`
for self attention, cross attention and linear in FFN. Otherwise,
the three sub-layers all use it as `bias_attr` to create parameters.
The `False` value means the corresponding layer would not have trainable
bias parameter. See usage for details in :code:`ParamAttr` .
Default: None, which means the default bias parameter property is used.
custom_encoder (Layer): If custom encoder is provided, use it as the encoder.
Default None
custom_decoder (Layer): If custom decoder is provided, use it as the decoder.
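A hedged usage sketch of the 1/2/3-element forms documented above; the hyperparameters are arbitrary, and the bias_attr values mirror the new tests earlier in this diff.

import paddle
from paddle.nn import Transformer

# Three elements: [self attention, cross attention, linear in FFN].
model_3 = Transformer(d_model=64, nhead=8, dim_feedforward=256,
                      weight_attr=[None, None, None],
                      bias_attr=[False, False, True])

# Two elements: index 0 covers self and cross attention, index 1 the FFN linears.
model_2 = Transformer(d_model=64, nhead=8, bias_attr=[False, False])

# One element, or a plain bool/ParamAttr, applies to all three sub-layers.
model_1 = Transformer(d_model=64, nhead=8, bias_attr=False)

src = paddle.rand([4, 8, 64])    # [batch_size, source_length, d_model]
tgt = paddle.rand([4, 8, 64])    # [batch_size, target_length, d_model]
out = model_1(src, tgt)          # [4, 8, 64]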
...@@ -1049,13 +1073,51 @@ class Transformer(Layer):
custom_decoder=None):
super(Transformer, self).__init__()
if isinstance(bias_attr, (list, tuple)):
if len(bias_attr) == 1:
encoder_bias_attr = [bias_attr[0]] * 2
decoder_bias_attr = [bias_attr[0]] * 3
elif len(bias_attr) == 2:
encoder_bias_attr = bias_attr
decoder_bias_attr = [bias_attr[0], bias_attr[0], bias_attr[-1]]
elif len(bias_attr) == 3:
encoder_bias_attr = [bias_attr[0], bias_attr[-1]]
decoder_bias_attr = bias_attr
else:
assert False, (
"length of bias_attr should be 1 or 2 or 3 when it is a list/tuple"
)
else:
encoder_bias_attr = bias_attr
decoder_bias_attr = bias_attr
if isinstance(weight_attr, (list, tuple)):
if len(weight_attr) == 1:
encoder_weight_attr = [weight_attr[0]] * 2
decoder_weight_attr = [weight_attr[0]] * 3
elif len(weight_attr) == 2:
encoder_weight_attr = weight_attr
decoder_weight_attr = [
weight_attr[0], weight_attr[0], weight_attr[-1]
]
elif len(weight_attr) == 3:
encoder_weight_attr = [weight_attr[0], weight_attr[-1]]
decoder_weight_attr = weight_attr
else:
assert False, (
"length of weight_attr should be 1 or 2 or 3 when it is a list/tuple"
)
else:
encoder_weight_attr = weight_attr
decoder_weight_attr = weight_attr
if custom_encoder is not None:
self.encoder = custom_encoder
else:
encoder_layer = TransformerEncoderLayer(
d_model, nhead, dim_feedforward, dropout, activation,
attn_dropout, act_dropout, normalize_before,
encoder_weight_attr, encoder_bias_attr)
encoder_norm = LayerNorm(d_model)
self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
encoder_norm)
...@@ -1065,8 +1127,8 @@ class Transformer(Layer):
else:
decoder_layer = TransformerDecoderLayer(
d_model, nhead, dim_feedforward, dropout, activation,
attn_dropout, act_dropout, normalize_before,
decoder_weight_attr, decoder_bias_attr)
decoder_norm = LayerNorm(d_model)
self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers,
decoder_norm)
......
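For reference, the expansion added to Transformer.__init__ above maps one list of attributes onto the encoder layer's two slots (MHA, FFN) and the decoder layer's three slots (self attention, cross attention, FFN). A standalone sketch of just that mapping, with placeholder strings instead of ParamAttr objects:

# Standalone sketch of the weight_attr/bias_attr expansion shown above; values
# are passed through untouched, so plain strings work as placeholders.
def expand_attr(attr):
    if isinstance(attr, (list, tuple)):
        if len(attr) == 1:
            return [attr[0]] * 2, [attr[0]] * 3
        if len(attr) == 2:
            return list(attr), [attr[0], attr[0], attr[-1]]
        if len(attr) == 3:
            return [attr[0], attr[-1]], list(attr)
        raise AssertionError(
            "length of weight_attr/bias_attr should be 1 or 2 or 3 when it is a list/tuple")
    return attr, attr  # a single ParamAttr/bool/None is shared by encoder and decoder

print(expand_attr(["w0"]))              # (['w0', 'w0'], ['w0', 'w0', 'w0'])
print(expand_attr(["w0", "w1"]))        # (['w0', 'w1'], ['w0', 'w0', 'w1'])
print(expand_attr(["w0", "w1", "w2"]))  # (['w0', 'w2'], ['w0', 'w1', 'w2'])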