vctk.py
import io
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
from ruamel.yaml import YAML

from parakeet.g2p.en import text_to_sequence
from parakeet.data.dataset import Dataset
from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, WavBatcher

class VCTK(Dataset):
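    """Dataset wrapper for the VCTK corpus.

    ``root`` is expected to follow the standard VCTK layout, with
    transcriptions under ``txt/<speaker>/`` and 48 kHz recordings under
    ``wav48/<speaker>/``. On first use, a ``metadata.csv`` file and a
    ``speaker_indices.yaml`` mapping are created inside ``root`` and are
    reused on subsequent runs.
    """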
    def __init__(self, root):
        assert isinstance(root, (str, Path)), "root should be a string or Path object"
        self.root = root if isinstance(root, Path) else Path(root)
        self.text_root = self.root.joinpath("txt")
        self.wav_root = self.root.joinpath("wav48")

        if not (self.root.joinpath("metadata.csv").exists() and 
                self.root.joinpath("speaker_indices.yaml").exists()):
            self._prepare_metadata()
        self.speaker_indices, self.metadata = self._load_metadata()

    def _load_metadata(self):
        yaml = YAML(typ='safe')
        speaker_indices = yaml.load(self.root.joinpath("speaker_indices.yaml"))
        # metadata.csv is written with a header row, so read it back with
        # header=0 (header=1 would silently drop the first record).
        metadata = pd.read_csv(self.root.joinpath("metadata.csv"),
                               sep="|", quoting=3, header=0)
        return speaker_indices, metadata

    def _prepare_metadata(self):
        metadata = []
        speaker_to_index = {}
        for i, speaker_folder in enumerate(self.text_root.iterdir()):
            if speaker_folder.is_dir():
                speaker_to_index[speaker_folder.name] = i
                for text_file in speaker_folder.iterdir():
                    if text_file.is_file():
                        with io.open(str(text_file)) as f:
                            transcription = f.read().strip()
                        # keep the record inside the is_file() check so a stray
                        # directory entry cannot reuse a stale transcription
                        wav_file = text_file.with_suffix(".wav")
                        metadata.append(
                            (wav_file.name, speaker_folder.name, transcription))
        metadata = pd.DataFrame.from_records(
            metadata, columns=["wave_file", "speaker", "text"])

        # persist the metadata and the speaker-to-index mapping
        yaml = YAML(typ='safe')
        yaml.dump(speaker_to_index, self.root.joinpath("speaker_indices.yaml"))
        metadata.to_csv(self.root.joinpath("metadata.csv"),
                        sep="|", quoting=3, index=False)

    def _get_example(self, metadatum):
        wave_file, speaker, text = metadatum
        wav_path = self.wav_root.joinpath(speaker, wave_file)
        # sr=None keeps the recording's native sample rate instead of resampling
        wav, sr = librosa.load(str(wav_path), sr=None)
        phoneme_seq = np.array(text_to_sequence(text))
        return wav, self.speaker_indices[speaker], phoneme_seq

    def __getitem__(self, index):
        metadatum = self.metadata.iloc[index]
        example = self._get_example(metadatum)
        return example

    def __len__(self):
        return len(self.metadata)

    def _batch_examples(self, minibatch):
        # split the examples into per-field lists, then pad each field to a
        # common length within the minibatch
        wav_batch, speaker_batch, phoneme_batch = [], [], []
        for example in minibatch:
            wav, speaker_id, phoneme_seq = example
            wav_batch.append(wav)
            speaker_batch.append(speaker_id)
            phoneme_batch.append(phoneme_seq)
        wav_batch = WavBatcher(pad_value=0.)(wav_batch)
        speaker_batch = np.array(speaker_batch)
        phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
        return wav_batch, speaker_batch, phoneme_batch
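

if __name__ == "__main__":
    # Usage sketch (added for illustration, not part of the original loader).
    # The corpus path below is a placeholder; point it at a VCTK-Corpus root
    # that contains the txt/ and wav48/ directories.
    vctk = VCTK("data/VCTK-Corpus")
    print(len(vctk), "utterances")

    # Fetch one example: (waveform, speaker index, phoneme id sequence).
    wav, speaker_id, phoneme_seq = vctk[0]
    print(wav.shape, speaker_id, phoneme_seq.shape)

    # Collate a few examples into padded batches (shapes assume the batchers
    # return numpy arrays padded to the longest item in the minibatch).
    wav_batch, speaker_batch, phoneme_batch = vctk._batch_examples(
        [vctk[i] for i in range(4)])
    print(wav_batch.shape, speaker_batch.shape, phoneme_batch.shape)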