diff --git a/.gitignore b/.gitignore
index 9fe17bcebb32d017968834acfabd03bbae6a24d7..9e0ff35c82ab66e767b5019d7fb4bad78081bf6e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,9 @@ __pycache__/
 *.py[cod]
 *$py.class
 
+# vscode
+.vscode
+
 # C extensions
 *.so
 
diff --git a/parakeet/datasets/vctk.py b/parakeet/datasets/vctk.py
new file mode 100644
index 0000000000000000000000000000000000000000..8faad4f38a67f7d8c13a4fb88d007cbaaaab4e82
--- /dev/null
+++ b/parakeet/datasets/vctk.py
@@ -0,0 +1,94 @@
+from pathlib import Path
+
+import io
+
+import librosa
+import numpy as np
+import pandas as pd
+from ruamel.yaml import YAML
+
+from parakeet.g2p.en import text_to_sequence
+from parakeet.data.dataset import Dataset
+from parakeet.data.batch import TextIDBatcher, WavBatcher
+
+
+class VCTK(Dataset):
+    """VCTK multi-speaker English speech corpus.
+
+    Expects the standard layout under ``root``: transcriptions in
+    ``txt/<speaker>/<utterance>.txt`` and audio in
+    ``wav48/<speaker>/<utterance>.wav``. On first use a ``metadata.csv``
+    and a ``speaker_indices.yaml`` are generated inside ``root`` and
+    reused on later runs.
+    """
+
+    def __init__(self, root: Path):
+        self.root = root
+        self.text_root = self.root.joinpath("txt")
+        self.wav_root = self.root.joinpath("wav48")
+
+        # build the metadata files once; later runs just reload them
+        if not (self.root.joinpath("metadata.csv").exists() and
+                self.root.joinpath("speaker_indices.yaml").exists()):
+            self._prepare_metadata()
+        self.speaker_indices, self.metadata = self._load_metadata()
+
+    def _load_metadata(self):
+        """Load the speaker->index mapping and the utterance table."""
+        yaml = YAML(typ='safe')
+        speaker_indices = yaml.load(self.root.joinpath("speaker_indices.yaml"))
+        # header=0: the first row of metadata.csv holds the column names
+        # (header=1 would drop the first utterance and mislabel the columns)
+        metadata = pd.read_csv(self.root.joinpath("metadata.csv"),
+                               sep="|", quoting=3, header=0)
+        return speaker_indices, metadata
+
+    def _prepare_metadata(self):
+        """Scan the corpus and write metadata.csv / speaker_indices.yaml."""
+        metadata = []
+        speaker_to_index = {}
+        for i, speaker_folder in enumerate(self.text_root.iterdir()):
+            if speaker_folder.is_dir():
+                speaker_to_index[speaker_folder.name] = i
+                for text_file in speaker_folder.iterdir():
+                    if text_file.is_file():
+                        with io.open(str(text_file)) as f:
+                            transcription = f.read().strip()
+                        wav_file = text_file.with_suffix(".wav")
+                        metadata.append((wav_file.name,
+                                         speaker_folder.name,
+                                         transcription))
+        metadata = pd.DataFrame.from_records(
+            metadata, columns=["wave_file", "speaker", "text"])
+
+        # save them
+        yaml = YAML(typ='safe')
+        yaml.dump(speaker_to_index,
+                  self.root.joinpath("speaker_indices.yaml"))
+        metadata.to_csv(self.root.joinpath("metadata.csv"),
+                        sep="|", quoting=3, index=False)
+
+    def _get_example(self, metadatum):
+        """Turn one metadata row into (wav, speaker_id, phoneme_ids)."""
+        wave_file, speaker, text = metadatum
+        wav_path = self.wav_root.joinpath(speaker, wave_file)
+        wav, _ = librosa.load(str(wav_path), sr=None)
+        phoneme_seq = np.array(text_to_sequence(text))
+        return wav, self.speaker_indices[speaker], phoneme_seq
+
+    def __getitem__(self, index):
+        return self._get_example(self.metadata.iloc[index])
+
+    def __len__(self):
+        return len(self.metadata)
+
+    def _batch_examples(self, minibatch):
+        """Collate a list of examples into padded batch arrays."""
+        wav_batch, speaker_batch, phoneme_batch = [], [], []
+        for wav, speaker_id, phoneme_seq in minibatch:
+            wav_batch.append(wav)
+            speaker_batch.append(speaker_id)
+            phoneme_batch.append(phoneme_seq)
+        wav_batch = WavBatcher(pad_value=0.)(wav_batch)
+        speaker_batch = np.array(speaker_batch)
+        phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
+        return wav_batch, speaker_batch, phoneme_batch
diff --git a/setup.py b/setup.py
index c6e014800b1d0332cbecb2c3abb253f34f2a218b..2a164465c8b7d8c23c5382458bbd4a80a57c2b0a 100644
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,9 @@ setup_info = dict(
     license='Apache 2',
 
     install_requires=[
-        'numpy', 'nltk', 'inflect', 'librosa', 'unidecode', 'numba', 'tqdm', 'matplotlib', 'tensorboardX', 'tensorboard', 'scipy', 'tqdm',
+        'numpy', 'nltk', 'inflect', 'librosa', 'unidecode', 'numba',
+        'tqdm', 'matplotlib', 'tensorboardX', 'tensorboard', 'scipy',
+        'ruamel.yaml', 'pandas',
     ],
 
     # Package info
diff --git a/tests/test_ljspeech.py b/tests/test_ljspeech.py
index 5f837221bdb921a1cb49c31474ba619d7f10ed24..04db6a9997df0ce2c64a472a75a16cdabfd58df8 100644
--- a/tests/test_ljspeech.py
+++ b/tests/test_ljspeech.py
@@ -3,7 +3,7 @@
 from parakeet.data.datacargo import DataCargo
 from pathlib import Path
 
-LJSPEECH_ROOT = Path("/Users/chenfeiyu/projects/LJSpeech-1.1")
+LJSPEECH_ROOT = Path("/workspace/datasets/LJSpeech-1.1")
 ljspeech = LJSpeech(LJSPEECH_ROOT)
 ljspeech_cargo = DataCargo(ljspeech, batch_size=16, shuffle=True)
 for i, batch in enumerate(ljspeech_cargo):
diff --git a/tests/test_vctk.py b/tests/test_vctk.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f7d61e3c67954e0cb746f60cefa1211e20998a3
--- /dev/null
+++ b/tests/test_vctk.py
@@ -0,0 +1,11 @@
+from parakeet.datasets import vctk
+from pathlib import Path
+from parakeet.data.datacargo import DataCargo
+
+root = Path("/workspace/datasets/VCTK-Corpus")
+vctk_dataset = vctk.VCTK(root)
+vctk_cargo = DataCargo(vctk_dataset, batch_size=16, shuffle=True, drop_last=True)
+
+for i, batch in enumerate(vctk_cargo):
+    print(i)
+