transpose in matmul

f9e3eaa0 · Hui Zhang · 3d7ca938 · f9e3eaa0
隐藏空白更改
内联并排

Showing 1 changed file with 7 additions and 4 deletions

paddlespeech/s2t/modules/attention.py paddlespeech/s2t/modules/attention.py +7 -4

未找到文件。
--- a/paddlespeech/s2t/modules/attention.py
+++ b/paddlespeech/s2t/modules/attention.py
@@ -188,8 +188,9 @@ class MultiHeadedAttention(nn.Layer):
        #   non-trivial to calculate `next_cache_start` here.
        new_cache = paddle.concat((k, v), axis=-1)

-        scores = paddle.matmul(q,
-                               k.transpose([0, 1, 3, 2])) / math.sqrt(self.d_k)
+        # scores = paddle.matmul(q,
+                            #    k.transpose([0, 1, 3, 2])) / math.sqrt(self.d_k)
+        scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k)
        return self.forward_attention(v, scores, mask), new_cache


@@ -309,11 +310,13 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
        # first compute matrix a and matrix c
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        # (batch, head, time1, time2)
-        matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2]))
+        # matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2]))
+        matrix_ac = paddle.matmul(q_with_bias_u, k, transpose_y=True)

        # compute matrix b and matrix d
        # (batch, head, time1, time2)
-        matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2]))
+        # matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2]))
+        matrix_bd = paddle.matmul(q_with_bias_v, p, transpose_y=True)
        # Remove rel_shift since it is useless in speech recognition,
        # and it requires special attention for streaming.
        # matrix_bd = self.rel_shift(matrix_bd)