diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index 36bc83647965e53b86c37e06b2d157d3555c554e..b0b6e62a602aaeadd1d60c23e2bc280d5b88cc96 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -402,9 +402,8 @@ class MultiHeadAttention(Layer):
             q, k, v, cache = self._prepare_qkv(query, key, value, cache)
 
         # scale dot product attention
-        # TODO(guosheng): use tensor.matmul, however it doesn't support `alpha`
-        product = layers.matmul(
-            x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
+        product = paddle.matmul(
+            x=q * (self.head_dim**-0.5), y=k, transpose_y=True)
         if attn_mask is not None:
             # Support bool or int mask
             attn_mask = _convert_attention_mask(attn_mask, product.dtype)
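
Note (not part of the diff): paddle.matmul has no `alpha` argument, so the 1/sqrt(head_dim) scale is folded into `q` before the product instead of being applied by the matmul op. A minimal sketch of why the two forms agree numerically; the shapes and head_dim below are illustrative assumptions, not values from the patch:

    # Sketch: scaling q before the matmul equals scaling the product after it.
    import paddle

    head_dim = 64
    # [batch, num_heads, seq_len, head_dim], arbitrary example shapes
    q = paddle.randn([2, 8, 16, head_dim])
    k = paddle.randn([2, 8, 16, head_dim])

    # Old behavior: the fluid layers.matmul applied alpha after the product.
    scaled_after = paddle.matmul(q, k, transpose_y=True) * (head_dim**-0.5)
    # New behavior from the patch: scale q first, then matmul.
    scaled_before = paddle.matmul(q * (head_dim**-0.5), k, transpose_y=True)

    # Equal up to floating-point rounding.
    print(paddle.allclose(scaled_after, scaled_before, atol=1e-5))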