[TTS] fix some bugs of ERNIE-SAT (#2378)

* fix ernie_sat, test=tts
* fix for comments, test=tts

[TTS] fix some bugs of ERNIE-SAT (#2378)
* fix ernie_sat, test=tts
* fix for comments, test=tts
80b18021 · 小湉湉 · GitHub · ec571bb0 · 80b18021 · 80b18021
5 changed files
--- a/examples/aishell3/ernie_sat/local/synthesize_e2e.sh
+++ b/examples/aishell3/ernie_sat/local/synthesize_e2e.sh
@@ -13,9 +13,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/synthesize_e2e.py \
        --task_name=synthesize \
-        --wav_path=source/SSB03540307.wav\
-        --old_str='请播放歌曲小苹果。' \
-        --new_str='歌曲真好听。' \
+        --wav_path=source/SSB03540307.wav \
+        --old_str='请播放歌曲小苹果' \
+        --new_str='歌曲真好听' \
        --source_lang=zh \
        --target_lang=zh \
        --erniesat_config=${config_path} \

--- a/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh
+++ b/examples/aishell3_vctk/ernie_sat/local/synthesize_e2e.sh
@@ -15,7 +15,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/synthesize_e2e.py \
        --task_name=synthesize \
        --wav_path=source/p243_313.wav \
-        --old_str='For that reason cover should not be given.' \
+        --old_str='For that reason cover should not be given' \
        --new_str='今天天气很好' \
        --source_lang=en \
        --target_lang=zh \
@@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/synthesize_e2e.py \
        --task_name=synthesize \
        --wav_path=source/SSB03540307.wav \
-        --old_str='请播放歌曲小苹果。' \
-        --new_str="Thank you!" \
+        --old_str='请播放歌曲小苹果' \
+        --new_str="Thank you" \
        --source_lang=zh \
        --target_lang=en \
        --erniesat_config=${config_path} \

--- a/examples/vctk/ernie_sat/local/synthesize_e2e.sh
+++ b/examples/vctk/ernie_sat/local/synthesize_e2e.sh
@@ -14,7 +14,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/synthesize_e2e.py \
        --task_name=synthesize \
        --wav_path=source/p243_313.wav \
-        --old_str='For that reason cover should not be given.' \
+        --old_str='For that reason cover should not be given' \
        --new_str='I love you very much do you love me' \
        --source_lang=en \
        --target_lang=en \
@@ -36,8 +36,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/synthesize_e2e.py \
        --task_name=edit \
        --wav_path=source/p243_313.wav \
-        --old_str='For that reason cover should not be given.' \
-        --new_str='For that reason cover is not impossible to be given.' \
+        --old_str='For that reason cover should not be given' \
+        --new_str='For that reason cover is not impossible to be given' \
        --source_lang=en \
        --target_lang=en \
        --erniesat_config=${config_path} \

--- a/paddlespeech/t2s/exps/ernie_sat/align.py
+++ b/paddlespeech/t2s/exps/ernie_sat/align.py
@@ -58,7 +58,7 @@ def _readtg(tg_path: str, lang: str='en', fs: int=24000, n_shift: int=300):
        durations[-2] += durations[-1]
        durations = durations[:-1]

-    # replace ' and 'sil' with 'sp'
+    # replace '' and 'sil' with 'sp'
    phones = ['sp' if (phn == '' or phn == 'sil') else phn for phn in phones]

    if lang == 'en':
@@ -195,7 +195,7 @@ def words2phns(text: str, lang='en'):
            wrd = wrd.upper()
        if (wrd not in ds):
            wrd2phns[str(index) + '_' + wrd] = 'spn'
-            phns.extend('spn')
+            phns.extend(['spn'])
        else:
            wrd2phns[str(index) + '_' + wrd] = word2phns_dict[wrd].split()
            phns.extend(word2phns_dict[wrd].split())

--- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
@@ -137,9 +137,6 @@ def prep_feats_with_dur(wav_path: str,
    new_wav = np.concatenate(
        [wav_org[:wav_left_idx], blank_wav, wav_org[wav_right_idx:]])

-    # 音频是正常遮住了
-    sf.write(str("mask_wav.wav"), new_wav, samplerate=fs)
-
    # 4. get old and new mel span to be mask
    old_span_bdy = get_span_bdy(
        mfa_start=mfa_start, mfa_end=mfa_end, span_to_repl=span_to_repl)
@@ -274,7 +271,8 @@ def get_wav(wav_path: str,
            new_str: str='',
            duration_adjust: bool=True,
            fs: int=24000,
-            n_shift: int=300):
+            n_shift: int=300,
+            task_name: str='synthesize'):

    outs = get_mlm_output(
        wav_path=wav_path,
@@ -298,9 +296,11 @@ def get_wav(wav_path: str,
    alt_wav = np.squeeze(alt_wav)

    old_time_bdy = [n_shift * x for x in old_span_bdy]
+    if task_name == 'edit':
        wav_replaced = np.concatenate(
            [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
-
+    else:
+        wav_replaced = alt_wav
    wav_dict = {"origin": wav_org, "output": wav_replaced}
    return wav_dict

@@ -356,7 +356,11 @@ def parse_args():
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")

    # ernie sat related
-    parser.add_argument("--task_name", type=str, help="task name")
+    parser.add_argument(
+        "--task_name",
+        type=str,
+        choices=['edit', 'synthesize'],
+        help="task name.")
    parser.add_argument("--wav_path", type=str, help="path of old wav")
    parser.add_argument("--old_str", type=str, help="old string")
    parser.add_argument("--new_str", type=str, help="new string")
@@ -410,10 +414,9 @@ if __name__ == '__main__':
    if args.task_name == 'edit':
        new_str = new_str
    elif args.task_name == 'synthesize':
-        new_str = old_str + new_str
+        new_str = old_str + ' ' + new_str
    else:
-        new_str = old_str + new_str
-    print("new_str:", new_str)
+        new_str = old_str + ' ' + new_str

    # Extractor
    mel_extractor = LogMelFBank(
@@ -467,7 +470,8 @@ if __name__ == '__main__':
        new_str=new_str,
        duration_adjust=args.duration_adjust,
        fs=erniesat_config.fs,
-        n_shift=erniesat_config.n_shift)
+        n_shift=erniesat_config.n_shift,
+        task_name=args.task_name)

    sf.write(
        args.output_name, wav_dict['output'], samplerate=erniesat_config.fs)