From e6e8fa74826e4526ed25446c722413cdb0171269 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 5 Jul 2021 12:29:08 +0000
Subject: [PATCH] paddle.broadcast_shape; log_softmax; equal(zeros); register_buffer

---
 deepspeech/exps/u2/model.py          |  2 +-
 deepspeech/models/u2.py              |  2 +-
 deepspeech/modules/attention.py      |  6 +++---
 deepspeech/utils/tensor_utils.py     | 12 +++++++++---
 third_party/paddle_audio/frontend.py | 15 +++++++--------
 5 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 6297f185..6ff71d71 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -616,7 +616,7 @@ class U2Tester(U2Trainer):
                     shape=[1, encoder_max_time, encoder_model_size],
                     dtype='float32'),  # encoder_out
             ])
-        logger.info(f"Export code: {static_model}")
+        logger.info(f"Export code: {static_model.main_program}")
         paddle.jit.save(static_model, self.args.export_path)
diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py
index c6a33ad5..9337a5bb 100644
--- a/deepspeech/models/u2.py
+++ b/deepspeech/models/u2.py
@@ -954,5 +954,5 @@ class U2InferModel(U2Model):
         # (num_hyps, max_hyps_len, vocab_size)
         decoder_out, _ = self.decoder.export(encoder_out, encoder_mask, hyps, hyps_masks)
-        decoder_out = paddle.nn.functional.log_softmax(decoder_out, dim=-1)
+        decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1)
         return decoder_out
diff --git a/deepspeech/modules/attention.py b/deepspeech/modules/attention.py
index 977cdd58..95ea8ebe 100644
--- a/deepspeech/modules/attention.py
+++ b/deepspeech/modules/attention.py
@@ -99,11 +99,11 @@ class MultiHeadedAttention(nn.Layer):
         """
         n_batch = value.shape[0]
         if mask is not None:
+            # TODO(Hui Zhang): slice not support `int`; paddle not has `scalar` tensor.
             mask = mask.unsqueeze(1).equal(
-                paddle.to_tensor(0, dtype=mask.dtype))  # (batch, 1, *, time2)
+                paddle.zeros([1], dtype=mask.dtype))  # (batch, 1, *, time2)
             scores = masked_fill(scores, mask, -float('inf'))
-            attn = paddle.softmax(
-                scores, axis=-1)
+            attn = paddle.softmax(scores, axis=-1)
             attn = masked_fill(attn, mask, 0.0)  # (batch, head, time1, time2)
         else:
             attn = paddle.softmax(
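Note (illustration, not part of the patch): the attention change above builds the padding mask by comparing against paddle.zeros([1], ...) instead of a scalar made with paddle.to_tensor(0, ...), then applies masked_fill before and after the softmax. A minimal dynamic-graph sketch of that pattern follows; the shapes are toy values and the inlined paddle.where stands in for the repo's masked_fill helper, so none of these names or sizes come from the patch itself.

    import paddle

    # toy attention scores and a padding mask; shapes assumed for illustration only
    scores = paddle.randn([2, 4, 5, 5])                # (batch, head, time1, time2)
    mask = paddle.to_tensor(
        [[[1, 1, 1, 1, 0]], [[1, 1, 1, 0, 0]]],
        dtype='int64')                                 # (batch, 1, time2); 1 = valid, 0 = pad

    # compare against zeros([1]) instead of a scalar tensor, as in the patched code
    pad_mask = mask.unsqueeze(1).equal(
        paddle.zeros([1], dtype=mask.dtype))           # (batch, 1, 1, time2), bool

    # fill padded positions with -inf before softmax, then zero them out afterwards
    pad_mask = pad_mask.broadcast_to(scores.shape)
    scores = paddle.where(pad_mask, paddle.full_like(scores, -float('inf')), scores)
    attn = paddle.nn.functional.softmax(scores, axis=-1)
    attn = paddle.where(pad_mask, paddle.zeros_like(attn), attn)

Comparing with a one-element zeros tensor keeps the comparison fully tensor-to-tensor, which is what the TODO in the diff hints at.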
"""Unility functions for Transformer.""" from typing import List -from typing import Tuple from typing import Optional +from typing import Tuple from typing import Union import paddle @@ -25,6 +25,7 @@ __all__ = ["masked_fill", "pad_sequence", "add_sos_eos", "th_accuracy"] logger = Log(__name__).getlog() + def is_broadcastable(shp1, shp2): for a, b in zip(shp1[::-1], shp2[::-1]): if a == 1 or b == 1 or a == b: @@ -33,17 +34,22 @@ def is_broadcastable(shp1, shp2): return False return True + def masked_fill(xs: paddle.Tensor, mask: paddle.Tensor, value: Union[float, int]): if paddle.in_dynamic_mode(): assert is_broadcastable(xs.shape, mask.shape) is True - bshape = paddle.broadcast_shape(xs.shape, mask.shape) + bshape = paddle.broadcast_shape(xs.shape, mask.shape) + else: + # TODO(Hui Zhang): support broadcast_shape in static graph + bshape = xs.shape mask = mask.broadcast_to(bshape) trues = paddle.ones_like(xs) * value xs = paddle.where(mask, trues, xs) return xs + def pad_sequence(sequences: List[paddle.Tensor], batch_first: bool=False, padding_value: float=0.0) -> paddle.Tensor: @@ -184,4 +190,4 @@ def th_accuracy(pad_outputs: paddle.Tensor, #TODO(Hui Zhang): sum not support bool type # denominator = paddle.sum(mask) denominator = paddle.sum(mask.astype(pad_targets.dtype)) - return float(numerator) / float(denominator) \ No newline at end of file + return float(numerator) / float(denominator) diff --git a/third_party/paddle_audio/frontend.py b/third_party/paddle_audio/frontend.py index 1b337732..674563ad 100644 --- a/third_party/paddle_audio/frontend.py +++ b/third_party/paddle_audio/frontend.py @@ -24,7 +24,7 @@ def frame(x: Tensor, hop_length : int Number of samples shifted between ajancent frames. clip : bool, optional - Whether to clip audio that does not fit into the last frame, by + Whether to clip audio that does not fit into the last frame, by default True Returns @@ -53,16 +53,16 @@ def frame(x: Tensor, class STFT(nn.Layer): - """A module for computing stft transformation in a differentiable way. - + """A module for computing stft transformation in a differentiable way. + Parameters ------------ n_fft : int Number of samples in a frame. - + hop_length : int Number of samples shifted between adjacent frames. - + win_length : int Length of the window. @@ -109,8 +109,7 @@ class STFT(nn.Layer): # (2 * n_bins, 1, kernel_size) # (C_out, C_in, kernel_size) w = np.expand_dims(w, 1) - weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype()) - self.register_buffer("weight", weight) + self.weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype()) def forward(self, x: Tensor, num_samples: Tensor) -> Tuple[Tensor, Tensor]: """Compute the stft transform. @@ -118,7 +117,7 @@ class STFT(nn.Layer): ------------ x : Tensor [shape=(B, T)] The input waveform. - num_samples : Tensor + num_samples : Tensor Number of samples of each waveform. Returns ------------ -- GitLab