未验证 提交 f43d0260 编写于 作者: H HuangLiangJie 提交者: GitHub

Add rhythm tags for MFA, test=tts (#2615)

* Add rhythm tags for MFA, test=tts
上级 fd73a184
...@@ -4,3 +4,6 @@ Run the following script to get started, for more detail, please see `run.sh`. ...@@ -4,3 +4,6 @@ Run the following script to get started, for more detail, please see `run.sh`.
```bash ```bash
./run.sh ./run.sh
``` ```
# Rhythm tags for MFA
If you want to get rhythm tags with durations through the MFA tool, add the `--rhy-with-duration` flag to the first two commands in `run.sh`.
Note that only the CSMSC dataset is supported so far, and that `#` is replaced with `sp` in the rhythm tags for MFA.
...@@ -182,12 +182,17 @@ if __name__ == "__main__": ...@@ -182,12 +182,17 @@ if __name__ == "__main__":
"--with-tone", action="store_true", help="whether to consider tone.") "--with-tone", action="store_true", help="whether to consider tone.")
parser.add_argument( parser.add_argument(
"--with-r", action="store_true", help="whether to consider erhua.") "--with-r", action="store_true", help="whether to consider erhua.")
parser.add_argument(
"--rhy-with-duration",
action="store_true", )
args = parser.parse_args() args = parser.parse_args()
lexicon = generate_lexicon(args.with_tone, args.with_r) lexicon = generate_lexicon(args.with_tone, args.with_r)
symbols = generate_symbols(lexicon) symbols = generate_symbols(lexicon)
with open(args.output + ".lexicon", 'wt') as f: with open(args.output + ".lexicon", 'wt') as f:
if args.rhy_with_duration:
f.write("sp1 sp1\nsp2 sp2\nsp3 sp3\nsp4 sp4\n")
for k, v in lexicon.items(): for k, v in lexicon.items():
f.write(f"{k} {v}\n") f.write(f"{k} {v}\n")
......
...@@ -23,6 +23,7 @@ for more details. ...@@ -23,6 +23,7 @@ for more details.
""" """
import argparse import argparse
import os import os
import re
import shutil import shutil
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from pathlib import Path from pathlib import Path
...@@ -32,6 +33,22 @@ import librosa ...@@ -32,6 +33,22 @@ import librosa
import soundfile as sf import soundfile as sf
from tqdm import tqdm from tqdm import tqdm
# Punctuation that is deleted from the labelled text before rhythm tags are
# located: every key maps to "" so that str.translate() drops the character.
# Each mark that exists in both widths is listed in its half-width (ASCII) and
# full-width form, so the text is cleaned whichever form the transcript uses;
# a surviving punctuation mark would shift the tag offsets computed later.
# NOTE: the misspelled name is kept on purpose, insert_rhy() refers to it.
repalce_dict = {
    ";": "",
    ";": "",
    "。": "",
    ":": "",
    ":": "",
    "—": "",
    ")": "",
    ")": "",
    ",": "",
    ",": "",
    "“": "",
    "(": "",
    "(": "",
    "、": "",
    "…": "",
    "!": "",
    "!": "",
    "?": "",
    "?": "",
    "”": "",
}
def get_transcripts(path: Union[str, Path]): def get_transcripts(path: Union[str, Path]):
transcripts = {} transcripts = {}
...@@ -55,9 +72,13 @@ def resample_and_save(source, target, sr=16000): ...@@ -55,9 +72,13 @@ def resample_and_save(source, target, sr=16000):
def reorganize_baker(root_dir: Union[str, Path], def reorganize_baker(root_dir: Union[str, Path],
output_dir: Union[str, Path]=None, output_dir: Union[str, Path]=None,
resample_audio=False): resample_audio=False,
rhy_dur=False):
root_dir = Path(root_dir).expanduser() root_dir = Path(root_dir).expanduser()
transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt" if rhy_dur:
transcript_path = root_dir / "ProsodyLabeling" / "000001-010000_rhy.txt"
else:
transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt"
transcriptions = get_transcripts(transcript_path) transcriptions = get_transcripts(transcript_path)
wave_dir = root_dir / "Wave" wave_dir = root_dir / "Wave"
...@@ -92,6 +113,46 @@ def reorganize_baker(root_dir: Union[str, Path], ...@@ -92,6 +113,46 @@ def reorganize_baker(root_dir: Union[str, Path],
print("Done!") print("Done!")
def insert_rhy(sentence_first, sentence_second):
    """Interleave ``spN`` rhythm tokens into a space-separated token list.

    Args:
        sentence_first: Text containing rhythm tags written as ``#`` followed
            by one character (``#1`` .. ``#4`` in the labelled data).
            Punctuation listed in ``repalce_dict`` is removed before the tag
            positions are computed.
        sentence_second: Tokens separated by single spaces, presumably one
            pinyin syllable per remaining character of ``sentence_first``
            (verify against the transcript format).

    Returns:
        list[str]: The tokens of ``sentence_second`` in order, with
        ``"sp" + <tag character>`` inserted after the token whose 1-based
        position equals the number of non-tag characters preceding that tag.
        Tags whose position is never reached exactly are not emitted.
    """
    sentence_first = sentence_first.translate(str.maketrans(repalce_dict))
    # Offsets of every '#' in the punctuation-free text.
    rhy_idx = [pos for pos, char in enumerate(sentence_first) if char == '#']
    # Each earlier tag occupies two characters ('#' plus its level), so
    # removing 2 per earlier tag yields the count of ordinary characters
    # that precede this tag.
    re_rhy_idx = [pos - 2 * order for order, pos in enumerate(rhy_idx)]

    return_words = []
    i = 0  # index of the next rhythm tag waiting to be emitted
    for token_count, sentence_s in enumerate(
            sentence_second.split(" "), start=1):
        return_words.append(sentence_s)
        if i < len(re_rhy_idx) and token_count == re_rhy_idx[i]:
            # The character right after '#' is the rhythm level.
            return_words.append(
                "sp" + sentence_first[rhy_idx[i] + 1:rhy_idx[i] + 2])
            i += 1
    return return_words
def normalize_rhy(root_dir: Union[str, Path]):
    """Write a copy of the prosody transcript with rhythm tokens inserted.

    Reads ``ProsodyLabeling/000001-010000.txt`` under ``root_dir`` as
    consecutive pairs of lines and writes
    ``ProsodyLabeling/000001-010000_rhy.txt``. The first line of each pair is
    copied unchanged; the second line is replaced by a tab followed by the
    space-joined result of ``insert_rhy`` applied to the second tab-separated
    field of the first line and the stripped second line.

    Args:
        root_dir: Dataset root directory; ``~`` is expanded.

    Raises:
        IndexError: If the file has an odd number of lines or a first line
            of a pair contains no tab character.
    """
    root_dir = Path(root_dir).expanduser()
    transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt"
    target_transcript_path = root_dir / "ProsodyLabeling" / "000001-010000_rhy.txt"
    # Pin the encoding instead of relying on the locale default, which is
    # not UTF-8 on every platform.
    # NOTE(review): assumes the label file is UTF-8 encoded - confirm.
    with open(transcript_path, encoding="utf-8") as f:
        lines = f.readlines()
    with open(target_transcript_path, 'wt', encoding="utf-8") as f:
        for i in range(0, len(lines), 2):
            # The first line of the pair is saved as-is.
            sentence_first = lines[i]
            f.write(sentence_first)
            transcription = lines[i + 1].strip()
            tagged = insert_rhy(sentence_first.split('\t')[1], transcription)
            f.write("\t" + " ".join(tagged) + "\n")
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Reorganize Baker dataset for MFA") description="Reorganize Baker dataset for MFA")
...@@ -104,6 +165,12 @@ if __name__ == "__main__": ...@@ -104,6 +165,12 @@ if __name__ == "__main__":
"--resample-audio", "--resample-audio",
action="store_true", action="store_true",
help="To resample audio files or just copy them") help="To resample audio files or just copy them")
parser.add_argument(
"--rhy-with-duration",
action="store_true", )
args = parser.parse_args() args = parser.parse_args()
reorganize_baker(args.root_dir, args.output_dir, args.resample_audio) if args.rhy_with_duration:
normalize_rhy(args.root_dir)
reorganize_baker(args.root_dir, args.output_dir, args.resample_audio,
args.rhy_with_duration)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册