diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index 36bc83647965e53b86c37e06b2d157d3555c554e..b0b6e62a602aaeadd1d60c23e2bc280d5b88cc96 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -402,9 +402,8 @@ class MultiHeadAttention(Layer):
             q, k, v, cache = self._prepare_qkv(query, key, value, cache)
 
         # scale dot product attention
-        # TODO(guosheng): use tensor.matmul, however it doesn't support `alpha`
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+        product = paddle.matmul(
+            x=q * (self.head_dim**-0.5), y=k, transpose_y=True)
         if attn_mask is not None:
             # Support bool or int mask
             attn_mask = _convert_attention_mask(attn_mask, product.dtype)
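
Note (not part of the diff): paddle.matmul has no `alpha` argument, so the 1/sqrt(head_dim) scale is folded into `q` before the product instead of being applied by the matmul op. A minimal sketch of why the two forms agree numerically; the shapes and head_dim below are illustrative assumptions, not values from the patch:

    # Sketch: scaling q before the matmul equals scaling the product after it.
    import paddle

    head_dim = 64
    # [batch, num_heads, seq_len, head_dim], arbitrary example shapes
    q = paddle.randn([2, 8, 16, head_dim])
    k = paddle.randn([2, 8, 16, head_dim])

    # Old behavior: the fluid layers.matmul applied alpha after the product.
    scaled_after = paddle.matmul(q, k, transpose_y=True) * (head_dim**-0.5)
    # New behavior from the patch: scale q first, then matmul.
    scaled_before = paddle.matmul(q * (head_dim**-0.5), k, transpose_y=True)

    # Equal up to floating-point rounding.
    print(paddle.allclose(scaled_after, scaled_before, atol=1e-5))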