From e6e8fa74826e4526ed25446c722413cdb0171269 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 5 Jul 2021 12:29:08 +0000
Subject: [PATCH] paddle.broadcast_shape; log_softmax; equal(zeros); register_buffer

---
 deepspeech/exps/u2/model.py          |  2 +-
 deepspeech/models/u2.py              |  2 +-
 deepspeech/modules/attention.py      |  6 +++---
 deepspeech/utils/tensor_utils.py     | 12 +++++++++---
 third_party/paddle_audio/frontend.py | 15 +++++++--------
 5 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 6297f185..6ff71d71 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -616,7 +616,7 @@ class U2Tester(U2Trainer):
                     shape=[1, encoder_max_time, encoder_model_size],
                     dtype='float32'),  # encoder_out
             ])
-        logger.info(f"Export code: {static_model}")
+        logger.info(f"Export code: {static_model.main_program}")
         paddle.jit.save(static_model, self.args.export_path)
diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py
index c6a33ad5..9337a5bb 100644
--- a/deepspeech/models/u2.py
+++ b/deepspeech/models/u2.py
@@ -954,5 +954,5 @@ class U2InferModel(U2Model):
         # (num_hyps, max_hyps_len, vocab_size)
         decoder_out, _ = self.decoder.export(encoder_out, encoder_mask, hyps, hyps_masks)
-        decoder_out = paddle.nn.functional.log_softmax(decoder_out, dim=-1)
+        decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1)
         return decoder_out
diff --git a/deepspeech/modules/attention.py b/deepspeech/modules/attention.py
index 977cdd58..95ea8ebe 100644
--- a/deepspeech/modules/attention.py
+++ b/deepspeech/modules/attention.py
@@ -99,11 +99,11 @@ class MultiHeadedAttention(nn.Layer):
         """
         n_batch = value.shape[0]
         if mask is not None:
+            # TODO(Hui Zhang): slice not support `int`; paddle not has `scalar` tensor.
             mask = mask.unsqueeze(1).equal(
-                paddle.to_tensor(0, dtype=mask.dtype))  # (batch, 1, *, time2)
+                paddle.zeros([1], dtype=mask.dtype))  # (batch, 1, *, time2)
             scores = masked_fill(scores, mask, -float('inf'))
-            attn = paddle.softmax(
-                scores, axis=-1)
+            attn = paddle.softmax(scores, axis=-1)
             attn = masked_fill(attn, mask, 0.0)  # (batch, head, time1, time2)
         else:
             attn = paddle.softmax(
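Note (illustration, not part of the patch): the attention change above builds the padding mask by comparing against paddle.zeros([1], ...) instead of a scalar made with paddle.to_tensor(0, ...), then applies masked_fill before and after the softmax. A minimal dynamic-graph sketch of that pattern follows; the shapes are toy values and the inlined paddle.where stands in for the repo's masked_fill helper, so none of these names or sizes come from the patch itself.

    import paddle

    # toy attention scores and a padding mask; shapes assumed for illustration only
    scores = paddle.randn([2, 4, 5, 5])                # (batch, head, time1, time2)
    mask = paddle.to_tensor(
        [[[1, 1, 1, 1, 0]], [[1, 1, 1, 0, 0]]],
        dtype='int64')                                 # (batch, 1, time2); 1 = valid, 0 = pad

    # compare against zeros([1]) instead of a scalar tensor, as in the patched code
    pad_mask = mask.unsqueeze(1).equal(
        paddle.zeros([1], dtype=mask.dtype))           # (batch, 1, 1, time2), bool

    # fill padded positions with -inf before softmax, then zero them out afterwards
    pad_mask = pad_mask.broadcast_to(scores.shape)
    scores = paddle.where(pad_mask, paddle.full_like(scores, -float('inf')), scores)
    attn = paddle.nn.functional.softmax(scores, axis=-1)
    attn = paddle.where(pad_mask, paddle.zeros_like(attn), attn)

Comparing with a one-element zeros tensor keeps the comparison fully tensor-to-tensor, which is what the TODO in the diff hints at.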
"""Unility functions for Transformer.""" from typing import List -from typing import Tuple from typing import Optional +from typing import Tuple from typing import Union import paddle @@ -25,6 +25,7 @@ __all__ = ["masked_fill", "pad_sequence", "add_sos_eos", "th_accuracy"] logger = Log(__name__).getlog() + def is_broadcastable(shp1, shp2): for a, b in zip(shp1[::-1], shp2[::-1]): if a == 1 or b == 1 or a == b: @@ -33,17 +34,22 @@ def is_broadcastable(shp1, shp2): return False return True + def masked_fill(xs: paddle.Tensor, mask: paddle.Tensor, value: Union[float, int]): if paddle.in_dynamic_mode(): assert is_broadcastable(xs.shape, mask.shape) is True - bshape = paddle.broadcast_shape(xs.shape, mask.shape) + bshape = paddle.broadcast_shape(xs.shape, mask.shape) + else: + # TODO(Hui Zhang): support broadcast_shape in static graph + bshape = xs.shape mask = mask.broadcast_to(bshape) trues = paddle.ones_like(xs) * value xs = paddle.where(mask, trues, xs) return xs + def pad_sequence(sequences: List[paddle.Tensor], batch_first: bool=False, padding_value: float=0.0) -> paddle.Tensor: @@ -184,4 +190,4 @@ def th_accuracy(pad_outputs: paddle.Tensor, #TODO(Hui Zhang): sum not support bool type # denominator = paddle.sum(mask) denominator = paddle.sum(mask.astype(pad_targets.dtype)) - return float(numerator) / float(denominator) \ No newline at end of file + return float(numerator) / float(denominator) diff --git a/third_party/paddle_audio/frontend.py b/third_party/paddle_audio/frontend.py index 1b337732..674563ad 100644 --- a/third_party/paddle_audio/frontend.py +++ b/third_party/paddle_audio/frontend.py @@ -24,7 +24,7 @@ def frame(x: Tensor, hop_length : int Number of samples shifted between ajancent frames. clip : bool, optional - Whether to clip audio that does not fit into the last frame, by + Whether to clip audio that does not fit into the last frame, by default True Returns @@ -53,16 +53,16 @@ def frame(x: Tensor, class STFT(nn.Layer): - """A module for computing stft transformation in a differentiable way. - + """A module for computing stft transformation in a differentiable way. + Parameters ------------ n_fft : int Number of samples in a frame. - + hop_length : int Number of samples shifted between adjacent frames. - + win_length : int Length of the window. @@ -109,8 +109,7 @@ class STFT(nn.Layer): # (2 * n_bins, 1, kernel_size) # (C_out, C_in, kernel_size) w = np.expand_dims(w, 1) - weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype()) - self.register_buffer("weight", weight) + self.weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype()) def forward(self, x: Tensor, num_samples: Tensor) -> Tuple[Tensor, Tensor]: """Compute the stft transform. @@ -118,7 +117,7 @@ class STFT(nn.Layer): ------------ x : Tensor [shape=(B, T)] The input waveform. - num_samples : Tensor + num_samples : Tensor Number of samples of each waveform. Returns ------------ -- GitLab