未验证 提交 054f16ec 编写于 作者: B blue-fish 提交者: GitHub

Add synthesizer preprocessing support for other datasets (#441)

Co-authored-by: NCorentin Jemine <corentin.jemine@gmail.com>
上级 eaf5ec44
......@@ -10,12 +10,12 @@ import numpy as np
import librosa
def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
skip_existing: bool, hparams):
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
skip_existing: bool, hparams, no_alignments: bool,
datasets_name: str, subfolders: str):
# Gather the input directories
dataset_root = datasets_root.joinpath("LibriSpeech")
input_dirs = [dataset_root.joinpath("train-clean-100"),
dataset_root.joinpath("train-clean-360")]
dataset_root = datasets_root.joinpath(datasets_name)
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
assert all(input_dir.exists() for input_dir in input_dirs)
......@@ -30,9 +30,9 @@ def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
# Preprocess the dataset
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
hparams=hparams)
hparams=hparams, no_alignments=no_alignments)
job = Pool(n_processes).imap(func, speaker_dirs)
for speaker_metadata in tqdm(job, "LibriSpeech", len(speaker_dirs), unit="speakers"):
for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
for metadatum in speaker_metadata:
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
metadata_file.close()
......@@ -51,32 +51,62 @@ def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams):
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
metadata = []
for book_dir in speaker_dir.glob("*"):
# Gather the utterance audios and texts
try:
alignments_fpath = next(book_dir.glob("*.alignment.txt"))
with alignments_fpath.open("r") as alignments_file:
alignments = [line.rstrip().split(" ") for line in alignments_file]
except StopIteration:
# A few alignment files will be missing
continue
# Iterate over each entry in the alignments file
for wav_fname, words, end_times in alignments:
wav_fpath = book_dir.joinpath(wav_fname + ".flac")
assert wav_fpath.exists()
words = words.replace("\"", "").split(",")
end_times = list(map(float, end_times.replace("\"", "").split(",")))
# Process each sub-utterance
wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
for i, (wav, text) in enumerate(zip(wavs, texts)):
sub_basename = "%s_%02d" % (wav_fname, i)
metadata.append(process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams))
if no_alignments:
# Gather the utterance audios and texts
# LibriTTS uses .wav but we will include extensions for compatibility with other datasets
extensions = ["*.wav", "*.flac", "*.mp3"]
for extension in extensions:
wav_fpaths = book_dir.glob(extension)
for wav_fpath in wav_fpaths:
# Load the audio waveform
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# Get the corresponding text
# Check for .txt (for compatibility with other datasets)
text_fpath = wav_fpath.with_suffix(".txt")
if not text_fpath.exists():
# Check for .normalized.txt (LibriTTS)
text_fpath = wav_fpath.with_suffix(".normalized.txt")
assert text_fpath.exists()
with text_fpath.open("r") as text_file:
text = "".join([line for line in text_file])
text = text.replace("\"", "")
text = text.strip()
# Process the utterance
metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
skip_existing, hparams))
else:
# Process alignment file (LibriSpeech support)
# Gather the utterance audios and texts
try:
alignments_fpath = next(book_dir.glob("*.alignment.txt"))
with alignments_fpath.open("r") as alignments_file:
alignments = [line.rstrip().split(" ") for line in alignments_file]
except StopIteration:
# A few alignment files will be missing
continue
# Iterate over each entry in the alignments file
for wav_fname, words, end_times in alignments:
wav_fpath = book_dir.joinpath(wav_fname + ".flac")
assert wav_fpath.exists()
words = words.replace("\"", "").split(",")
end_times = list(map(float, end_times.replace("\"", "").split(",")))
# Process each sub-utterance
wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
for i, (wav, text) in enumerate(zip(wavs, texts)):
sub_basename = "%s_%02d" % (wav_fname, i)
metadata.append(process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams))
return [m for m in metadata if m is not None]
......@@ -222,4 +252,4 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
\ No newline at end of file
from synthesizer.preprocess import preprocess_librispeech
from synthesizer.preprocess import preprocess_dataset
from synthesizer.hparams import hparams
from utils.argutils import print_args
from pathlib import Path
......@@ -26,6 +26,13 @@ if __name__ == "__main__":
"Hyperparameter overrides as a comma-separated list of name-value pairs")
parser.add_argument("--no_trim", action="store_true", help=\
"Preprocess audio without trimming silences (not recommended).")
parser.add_argument("--no_alignments", action="store_true", help=\
"Use this option when dataset does not include alignments\
(these are used to split long audio files into sub-utterances.)")
parser.add_argument("--datasets_name", type=str, default="LibriSpeech", help=\
"Name of the dataset directory to process.")
parser.add_argument("--subfolders", type=str, default="train-clean-100, train-clean-360", help=\
"Comma-separated list of subfolders to process inside your dataset directory")
args = parser.parse_args()
# Process the arguments
......@@ -49,4 +56,4 @@ if __name__ == "__main__":
# Preprocess the dataset
print_args(args, parser)
args.hparams = hparams.parse(args.hparams)
preprocess_librispeech(**vars(args))
preprocess_dataset(**vars(args))
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册