fix wav concat method

e522009d · 小湉湉 · 9224659c · e522009d · e522009d
隐藏空白更改
内联并排

Showing 2 changed files with 13 additions and 22 deletions

ernie-sat/inference.py ernie-sat/inference.py +2 -12

ernie-sat/mlm.py ernie-sat/mlm.py +11 -10

未找到文件。
--- a/ernie-sat/inference.py
+++ b/ernie-sat/inference.py
@@ -546,19 +546,9 @@ def decode_with_model(mlm_model: nn.Layer,
        text_seg_pos=feats['text_seg_pos'],
        span_bdy=new_span_bdy,
        use_teacher_forcing=use_teacher_forcing)
-    if 0 in output[0].shape and 0 not in output[-1].shape:
-        output_feat = paddle.concat(
-            output[1:-1] + [output[-1].squeeze()], axis=0)
-    elif 0 not in output[0].shape and 0 in output[-1].shape:
-        output_feat = paddle.concat(
-            [output[0].squeeze()] + output[1:-1], axis=0)
-    elif 0 in output[0].shape and 0 in output[-1].shape:
-        output_feat = paddle.concat(output[1:-1], axis=0)
-    else:
-        output_feat = paddle.concat(
-            [output[0].squeeze(0)] + output[1:-1] + [output[-1].squeeze(0)],
-            axis=0)

+    # 拼接音频
+    output_feat = paddle.concat(x=output, axis=0)
    wav_org, _ = librosa.load(wav_path, sr=fs)
    return wav_org, output_feat, old_span_bdy, new_span_bdy, fs, hop_length


--- a/ernie-sat/mlm.py
+++ b/ernie-sat/mlm.py
@@ -7,7 +7,6 @@ from typing import Optional
 from typing import Tuple
 from typing import Union

-import numpy as np
 import paddle
 import yaml
 from paddle import nn
@@ -395,13 +394,13 @@ class MLM(nn.Layer):
            use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
        '''
        Args:
-            speech (paddle.Tensor): input speech (B, Tmax, D).
-            text (paddle.Tensor): input text (B, Tmax2).
-            masked_pos (paddle.Tensor): masked position of input speech (B, Tmax)
-            speech_mask (paddle.Tensor): mask of speech (B, 1, Tmax).
-            text_mask (paddle.Tensor): mask of text (B, 1, Tmax2).
-            speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (B, Tmax).
-            text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (B, Tmax2).
+            speech (paddle.Tensor): input speech (1, Tmax, D).
+            text (paddle.Tensor): input text (1, Tmax2).
+            masked_pos (paddle.Tensor): masked position of input speech (1, Tmax)
+            speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax).
+            text_mask (paddle.Tensor): mask of text (1, 1, Tmax2).
+            speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax).
+            text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
            span_bdy (List[int]): masked mel boundary of input speech (2,)
            use_teacher_forcing (bool): whether to use teacher forcing
        Returns:
@@ -410,7 +409,6 @@ class MLM(nn.Layer):
                [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
        '''

-        outs = [speech[:, :span_bdy[0]]]
        z_cache = None
        if use_teacher_forcing:
            before_outs, zs, *_ = self.forward(
@@ -423,8 +421,11 @@ class MLM(nn.Layer):
                text_seg_pos=text_seg_pos)
            if zs is None:
                zs = before_outs
+
+            speech = speech.squeeze(0)
+            outs = [speech[:span_bdy[0]]]
            outs += [zs[0][span_bdy[0]:span_bdy[1]]]
-            outs += [speech[:, span_bdy[1]:]]
+            outs += [speech[span_bdy[1]:]]
            return outs
        return None