From f43d0260006e145a5a23dcb0f947ca4c62f83253 Mon Sep 17 00:00:00 2001 From: HuangLiangJie Date: Wed, 9 Nov 2022 12:30:13 +0800 Subject: [PATCH] Add rhythm tags for MFA, test=tts (#2615) * Add rhythm tags for MFA, test=tts --- examples/other/mfa/README.md | 3 + examples/other/mfa/local/generate_lexicon.py | 5 ++ examples/other/mfa/local/reorganize_baker.py | 73 +++++++++++++++++++- 3 files changed, 78 insertions(+), 3 deletions(-) diff --git a/examples/other/mfa/README.md b/examples/other/mfa/README.md index c24524ab..216d1275 100644 --- a/examples/other/mfa/README.md +++ b/examples/other/mfa/README.md @@ -4,3 +4,6 @@ Run the following script to get started, for more detail, please see `run.sh`. ```bash ./run.sh ``` +# Rhythm tags for MFA +If you want to get rhythm tags with duration through the MFA tool, add the flag `--rhy-with-duration` to the first two commands in `run.sh`. +Note that only the CSMSC dataset is supported so far, and we replace `#` with `sp` in the rhythm tags for MFA. diff --git a/examples/other/mfa/local/generate_lexicon.py b/examples/other/mfa/local/generate_lexicon.py index e9445665..3deb2470 100644 --- a/examples/other/mfa/local/generate_lexicon.py +++ b/examples/other/mfa/local/generate_lexicon.py @@ -182,12 +182,17 @@ if __name__ == "__main__": "--with-tone", action="store_true", help="whether to consider tone.") parser.add_argument( "--with-r", action="store_true", help="whether to consider erhua.") + parser.add_argument( + "--rhy-with-duration", + action="store_true", ) args = parser.parse_args() lexicon = generate_lexicon(args.with_tone, args.with_r) symbols = generate_symbols(lexicon) with open(args.output + ".lexicon", 'wt') as f: + if args.rhy_with_duration: + f.write("sp1 sp1\nsp2 sp2\nsp3 sp3\nsp4 sp4\n") for k, v in lexicon.items(): f.write(f"{k} {v}\n") diff --git a/examples/other/mfa/local/reorganize_baker.py b/examples/other/mfa/local/reorganize_baker.py index 153e01d1..0e0035bd 100644 --- a/examples/other/mfa/local/reorganize_baker.py 
+++ b/examples/other/mfa/local/reorganize_baker.py @@ -23,6 +23,7 @@ for more details. """ import argparse import os +import re import shutil from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -32,6 +33,22 @@ import librosa import soundfile as sf from tqdm import tqdm +repalce_dict = { + ";": "", + "。": "", + ":": "", + "—": "", + ")": "", + ",": "", + "“": "", + "(": "", + "、": "", + "…": "", + "!": "", + "?": "", + "”": "" +} + def get_transcripts(path: Union[str, Path]): transcripts = {} @@ -55,9 +72,13 @@ def resample_and_save(source, target, sr=16000): def reorganize_baker(root_dir: Union[str, Path], output_dir: Union[str, Path]=None, - resample_audio=False): + resample_audio=False, + rhy_dur=False): root_dir = Path(root_dir).expanduser() - transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt" + if rhy_dur: + transcript_path = root_dir / "ProsodyLabeling" / "000001-010000_rhy.txt" + else: + transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt" transcriptions = get_transcripts(transcript_path) wave_dir = root_dir / "Wave" @@ -92,6 +113,46 @@ def reorganize_baker(root_dir: Union[str, Path], print("Done!") +def insert_rhy(sentence_first, sentence_second): + sub = '#' + return_words = [] + sentence_first = sentence_first.translate(str.maketrans(repalce_dict)) + rhy_idx = [substr.start() for substr in re.finditer(sub, sentence_first)] + re_rhy_idx = [] + sentence_first_ = sentence_first.replace("#1", "").replace( + "#2", "").replace("#3", "").replace("#4", "") + sentence_seconds = sentence_second.split(" ") + for i, w in enumerate(rhy_idx): + re_rhy_idx.append(w - i * 2) + i = 0 + # print("re_rhy_idx: ", re_rhy_idx) + for sentence_s in (sentence_seconds): + return_words.append(sentence_s) + if i < len(re_rhy_idx) and len(return_words) - i == re_rhy_idx[i]: + return_words.append("sp" + sentence_first[rhy_idx[i] + 1:rhy_idx[i] + + 2]) + i = i + 1 + return return_words + + +def normalize_rhy(root_dir: Union[str, 
Path]): + root_dir = Path(root_dir).expanduser() + transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt" + target_transcript_path = root_dir / "ProsodyLabeling" / "000001-010000_rhy.txt" + + with open(transcript_path) as f: + lines = f.readlines() + + with open(target_transcript_path, 'wt') as f: + for i in range(0, len(lines), 2): + sentence_first = lines[i] #第一行直接保存 + f.write(sentence_first) + transcription = lines[i + 1].strip() + f.write("\t" + " ".join( + insert_rhy(sentence_first.split('\t')[1], transcription)) + + "\n") + + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Reorganize Baker dataset for MFA") @@ -104,6 +165,12 @@ if __name__ == "__main__": "--resample-audio", action="store_true", help="To resample audio files or just copy them") + parser.add_argument( + "--rhy-with-duration", + action="store_true", ) args = parser.parse_args() - reorganize_baker(args.root_dir, args.output_dir, args.resample_audio) + if args.rhy_with_duration: + normalize_rhy(args.root_dir) + reorganize_baker(args.root_dir, args.output_dir, args.resample_audio, + args.rhy_with_duration) -- GitLab