Add synthesizer preprocessing support for other datasets (#441)

Co-authored-by: N Corentin Jemine <corentin.jemine@gmail.com>

Add synthesizer preprocessing support for other datasets (#441)
Co-authored-by: N Corentin Jemine <corentin.jemine@gmail.com>
054f16ec · blue-fish · GitHub · eaf5ec44 · 054f16ec · 054f16ec
隐藏空白更改
内联并排

Showing 2 changed files with 71 additions and 34 deletions

synthesizer/preprocess.py synthesizer/preprocess.py +62 -32

synthesizer_preprocess_audio.py synthesizer_preprocess_audio.py +9 -2

未找到文件。
--- a/synthesizer/preprocess.py
+++ b/synthesizer/preprocess.py
@@ -10,12 +10,12 @@ import numpy as np
 import librosa


-def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int, 
-                           skip_existing: bool, hparams):
+def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
+                           skip_existing: bool, hparams, no_alignments: bool,
+                           datasets_name: str, subfolders: str):
    # Gather the input directories
-    dataset_root = datasets_root.joinpath("LibriSpeech")
-    input_dirs = [dataset_root.joinpath("train-clean-100"), 
-                  dataset_root.joinpath("train-clean-360")]
+    dataset_root = datasets_root.joinpath(datasets_name)
+    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
    print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)
    
@@ -30,9 +30,9 @@ def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
    # Preprocess the dataset
    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
    func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing, 
-                   hparams=hparams)
+                   hparams=hparams, no_alignments=no_alignments)
    job = Pool(n_processes).imap(func, speaker_dirs)
-    for speaker_metadata in tqdm(job, "LibriSpeech", len(speaker_dirs), unit="speakers"):
+    for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"):
        for metadatum in speaker_metadata:
            metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
    metadata_file.close()
@@ -51,32 +51,62 @@ def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int,
    print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))


-def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams):
+def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
    metadata = []
    for book_dir in speaker_dir.glob("*"):
-        # Gather the utterance audios and texts
-        try:
-            alignments_fpath = next(book_dir.glob("*.alignment.txt"))
-            with alignments_fpath.open("r") as alignments_file:
-                alignments = [line.rstrip().split(" ") for line in alignments_file]
-        except StopIteration:
-            # A few alignment files will be missing
-            continue
-        
-        # Iterate over each entry in the alignments file
-        for wav_fname, words, end_times in alignments:
-            wav_fpath = book_dir.joinpath(wav_fname + ".flac")
-            assert wav_fpath.exists()
-            words = words.replace("\"", "").split(",")
-            end_times = list(map(float, end_times.replace("\"", "").split(",")))
-            
-            # Process each sub-utterance
-            wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
-            for i, (wav, text) in enumerate(zip(wavs, texts)):
-                sub_basename = "%s_%02d" % (wav_fname, i)
-                metadata.append(process_utterance(wav, text, out_dir, sub_basename, 
-                                                  skip_existing, hparams))
-    
+        if no_alignments:
+            # Gather the utterance audios and texts
+            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
+            extensions = ["*.wav", "*.flac", "*.mp3"]
+            for extension in extensions:
+                wav_fpaths = book_dir.glob(extension)
+
+                for wav_fpath in wav_fpaths:
+                    # Load the audio waveform
+                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
+                    if hparams.rescale:
+                        wav = wav / np.abs(wav).max() * hparams.rescaling_max
+
+                    # Get the corresponding text
+                    # Check for .txt (for compatibility with other datasets)
+                    text_fpath = wav_fpath.with_suffix(".txt")
+                    if not text_fpath.exists():
+                        # Check for .normalized.txt (LibriTTS)
+                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
+                        assert text_fpath.exists()
+                    with text_fpath.open("r") as text_file:
+                        text = "".join([line for line in text_file])
+                        text = text.replace("\"", "")
+                        text = text.strip()
+
+                    # Process the utterance
+                    metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
+                                                      skip_existing, hparams))
+        else:
+            # Process alignment file (LibriSpeech support)
+            # Gather the utterance audios and texts
+            try:
+                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
+                with alignments_fpath.open("r") as alignments_file:
+                    alignments = [line.rstrip().split(" ") for line in alignments_file]
+            except StopIteration:
+                # A few alignment files will be missing
+                continue
+
+            # Iterate over each entry in the alignments file
+            for wav_fname, words, end_times in alignments:
+                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
+                assert wav_fpath.exists()
+                words = words.replace("\"", "").split(",")
+                end_times = list(map(float, end_times.replace("\"", "").split(",")))
+
+                # Process each sub-utterance
+                wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams)
+                for i, (wav, text) in enumerate(zip(wavs, texts)):
+                    sub_basename = "%s_%02d" % (wav_fname, i)
+                    metadata.append(process_utterance(wav, text, out_dir, sub_basename,
+                                                      skip_existing, hparams))
+
    return [m for m in metadata if m is not None]


@@ -222,4 +252,4 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    job = Pool(n_processes).imap(func, fpaths)
    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
-    
\ No newline at end of file
+    
--- a/synthesizer_preprocess_audio.py
+++ b/synthesizer_preprocess_audio.py
-from synthesizer.preprocess import preprocess_librispeech
+from synthesizer.preprocess import preprocess_dataset
 from synthesizer.hparams import hparams
 from utils.argutils import print_args
 from pathlib import Path
@@ -26,6 +26,13 @@ if __name__ == "__main__":
        "Hyperparameter overrides as a comma-separated list of name-value pairs")
    parser.add_argument("--no_trim", action="store_true", help=\
        "Preprocess audio without trimming silences (not recommended).")
+    parser.add_argument("--no_alignments", action="store_true", help=\
+        "Use this option when dataset does not include alignments\
+        (these are used to split long audio files into sub-utterances.)")
+    parser.add_argument("--datasets_name", type=str, default="LibriSpeech", help=\
+        "Name of the dataset directory to process.")
+    parser.add_argument("--subfolders", type=str, default="train-clean-100, train-clean-360", help=\
+        "Comma-separated list of subfolders to process inside your dataset directory")
    args = parser.parse_args()

    # Process the arguments
@@ -49,4 +56,4 @@ if __name__ == "__main__":
    # Preprocess the dataset
    print_args(args, parser)
    args.hparams = hparams.parse(args.hparams)
-    preprocess_librispeech(**vars(args))    
+    preprocess_dataset(**vars(args))