Merge pull request #1887 from jerryuhoo/fix_preprocess

[tts] normalize wav max value to 1 in preprocess, test=tts

Merge pull request #1887 from jerryuhoo/fix_preprocess
[tts] normalize wav max value to 1 in preprocess, test=tts
3474188b · 小湉湉 · GitHub · 1eec7b5e · 167aaa65 · 3474188b
4 changed file
--- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py
+++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
@@ -55,8 +55,11 @@ def process_sentence(config: Dict[str, Any],
    if utt_id in sentences:
        # reading, resampling may occur
        wav, _ = librosa.load(str(fp), sr=config.fs)
-        if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
+        if len(wav.shape) != 1:
            return record
+        max_value = np.abs(wav).max()
+        if max_value > 1.0:
+            wav = wav / max_value
        assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
        assert np.abs(wav).max(
        ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."

--- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
@@ -47,8 +47,11 @@ def process_sentence(config: Dict[str, Any],
    if utt_id in sentences:
        # reading, resampling may occur
        y, _ = librosa.load(str(fp), sr=config.fs)
-        if len(y.shape) != 1 or np.abs(y).max() > 1.0:
+        if len(y.shape) != 1:
            return record
+        max_value = np.abs(y).max()
+        if max_value > 1.0:
+            y = y / max_value
        assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
        assert np.abs(y).max(
        ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."

--- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py
+++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
@@ -47,8 +47,11 @@ def process_sentence(config: Dict[str, Any],
    if utt_id in sentences:
        # reading, resampling may occur
        wav, _ = librosa.load(str(fp), sr=config.fs)
-        if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
+        if len(wav.shape) != 1:
            return record
+        max_value = np.abs(wav).max()
+        if max_value > 1.0:
+            wav = wav / max_value
        assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
        assert np.abs(wav).max(
        ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."

--- a/paddlespeech/t2s/exps/tacotron2/preprocess.py
+++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py
@@ -51,8 +51,11 @@ def process_sentence(config: Dict[str, Any],
    if utt_id in sentences:
        # reading, resampling may occur
        wav, _ = librosa.load(str(fp), sr=config.fs)
-        if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
+        if len(wav.shape) != 1:
            return record
+        max_value = np.abs(wav).max()
+        if max_value > 1.0:
+            wav = wav / max_value
        assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
        assert np.abs(wav).max(
        ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."