# data.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import division
import os
import csv
from pathlib import Path
import numpy as np
from paddle import fluid
import pandas as pd
import librosa
from scipy import signal

import paddle.fluid.dygraph as dg

from parakeet.g2p.en import text_to_sequence, sequence_to_text
from parakeet.data import DatasetMixin, TransformDataset, FilterDataset, CacheDataset
from parakeet.data import DataCargo, PartialyRandomizedSimilarTimeLengthSampler, SequentialSampler, BucketSampler


class LJSpeechMetaData(DatasetMixin):
    """Index over the records listed in ``<root>/metadata.csv``.

    The metadata file is read as a ``|``-separated, header-less table with
    three columns: ``fname``, ``raw_text`` and ``normalized_text``. Quoting
    is disabled so quote characters inside the transcripts are kept as-is.
    Audio itself is not loaded here; only the path of each wav file under
    ``<root>/wavs`` is produced.
    """

    def __init__(self, root):
        """Read the metadata table located under ``root``.

        Args:
            root: dataset directory containing ``metadata.csv`` and ``wavs``.
        """
        self.root = Path(root)
        self._wav_dir = self.root / "wavs"
        self._table = pd.read_csv(
            self.root / "metadata.csv",
            sep="|",
            header=None,
            names=["fname", "raw_text", "normalized_text"],
            quoting=csv.QUOTE_NONE,
            encoding="utf-8")

    def get_example(self, i):
        """Return ``(wav_path, raw_text, normalized_text)`` for row ``i``.

        ``wav_path`` is a string: the wav directory joined with the row's
        ``fname`` plus the ``.wav`` suffix.
        """
        stem, raw_text, normalized_text = self._table.iloc[i]
        wav_path = self._wav_dir / (stem + ".wav")
        return str(wav_path), raw_text, normalized_text

    def __len__(self):
        """Number of rows in the metadata table."""
        return self._table.shape[0]


class Transform(object):
    """Convert one metadata record into model inputs and spectrogram targets.

    Calling the instance with ``(fname, raw_text, normalized_text)`` converts
    the normalized text into an id sequence with ``text_to_sequence`` and the
    wav file into a normalized linear spectrogram and a normalized mel
    spectrogram.

    Args:
        replace_pronunciation_prob: forwarded as the second argument of
            ``text_to_sequence``.
        sample_rate: sample rate the audio is loaded at; also used to build
            the mel filter bank.
        preemphasis: coefficient of the first-order pre-emphasis filter
            ``y[n] = x[n] - preemphasis * x[n-1]``.
        n_fft: FFT size of the STFT.
        win_length: window length of the STFT.
        hop_length: hop length of the STFT.
        fmin: lowest frequency of the mel filter bank.
        fmax: highest frequency of the mel filter bank.
        n_mels: number of mel bands.
        min_level_db: dB floor; amplitudes below it are raised to it, and it
            is the value mapped to 0 by the normalization.
        ref_level_db: dB offset subtracted before normalization.
        max_norm: upper bound of the normalized range.
        clip_norm: if true, normalized values are clipped to
            ``[0, max_norm]``.
    """

    def __init__(self,
                 replace_pronunciation_prob=0.,
                 sample_rate=22050,
                 preemphasis=.97,
                 n_fft=1024,
                 win_length=1024,
                 hop_length=256,
                 fmin=125,
                 fmax=7600,
                 n_mels=80,
                 min_level_db=-100,
                 ref_level_db=20,
                 max_norm=0.999,
                 clip_norm=True):
        self.replace_pronunciation_prob = replace_pronunciation_prob

        self.sample_rate = sample_rate
        self.preemphasis = preemphasis
        self.n_fft = n_fft
        self.win_length = win_length
        self.hop_length = hop_length

        self.fmin = fmin
        self.fmax = fmax
        self.n_mels = n_mels

        self.min_level_db = min_level_db
        self.ref_level_db = ref_level_db
        self.max_norm = max_norm
        self.clip_norm = clip_norm

    def _normalize_amplitude(self, magnitude):
        """Map a linear-amplitude spectrogram to the normalized dB range.

        The amplitude is floored at the value corresponding to
        ``min_level_db``, converted to dB, shifted by ``ref_level_db``,
        rescaled so that ``min_level_db`` maps to 0 and 0 dB maps to
        ``max_norm``, and optionally clipped to ``[0, max_norm]``.
        """
        # Amplitude equivalent of min_level_db (1e-5 for -100 dB); the floor
        # keeps log10 away from zero.
        amplitude_min = np.exp(self.min_level_db / 20 * np.log(10))
        level_db = 20 * np.log10(np.maximum(amplitude_min,
                                            magnitude)) - self.ref_level_db
        normalized = (level_db - self.min_level_db) / (-self.min_level_db)
        normalized = self.max_norm * normalized
        if self.clip_norm:
            normalized = np.clip(normalized, 0, self.max_norm)
        return normalized

    def __call__(self, in_data):
        """Process one ``(fname, raw_text, normalized_text)`` record.

        Returns:
            A tuple ``(mix_grapheme_phonemes, text_length, speaker_id,
            S_norm.T, S_mel_norm.T, n_frames)``: the text id sequence, its
            length, ``None`` as speaker id, the normalized linear and mel
            spectrograms transposed so that frames come first, and the
            number of frames.
        """
        fname, _, normalized_text = in_data

        # text processing
        mix_grapheme_phonemes = text_to_sequence(
            normalized_text, self.replace_pronunciation_prob)
        text_length = len(mix_grapheme_phonemes)
        # this transform does not produce a speaker id
        speaker_id = None

        # wave processing
        wav, _ = librosa.load(fname, sr=self.sample_rate)
        # preemphasis
        y = signal.lfilter([1., -self.preemphasis], [1.], wav)

        # STFT magnitude
        D = librosa.stft(
            y=y,
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length)
        S = np.abs(D)

        # linear spectrogram: to db and normalize
        S_norm = self._normalize_amplitude(S)

        # mel scale, then to db and normalize.
        # CAUTION: pass linear scale S, not dbscaled S.
        # sr must be given explicitly: otherwise librosa builds the mel
        # filter bank for its default 22050 Hz regardless of the rate the
        # audio was actually loaded at.
        S_mel = librosa.feature.melspectrogram(
            S=S,
            sr=self.sample_rate,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax,
            power=1.)
        S_mel_norm = self._normalize_amplitude(S_mel)

        # num_frames
        n_frames = S_mel_norm.shape[-1]  # CAUTION: original number of frames
        return (mix_grapheme_phonemes, text_length, speaker_id, S_norm.T,
                S_mel_norm.T, n_frames)


class DataCollector(object):
    """Collate a list of transformed examples into padded batch arrays.

    Each example is a 6-tuple ``(text_ids, text_length, speaker_id,
    lin_spec, mel_spec, num_frames)`` whose spectrograms are 2-D with
    frames on the first axis.

    Args:
        downsample_factor: stored as ``downsample_factor``.
        r: stored as ``frames_per_step``.

    ``downsample_factor * r`` is the number of spectrogram frames covered by
    one decoder step; it is used both as the alignment unit of the padded
    frame axis and as the number of zero frames put in front of every
    spectrogram.
    """

    def __init__(self, downsample_factor=4, r=1):
        frames_per_decoder_step = int(downsample_factor * r)
        self.downsample_factor = int(downsample_factor)
        self.frames_per_step = int(r)
        self._factor = frames_per_decoder_step
        # CAUTION: small diff here
        self._pad_begin = frames_per_decoder_step

    def __call__(self, examples):
        """Pad and stack ``examples``.

        Returns:
            A tuple ``(text_sequences, text_lengths, text_positions,
            mel_specs, lin_specs, frames, decoder_positions, done_flags)``.
            Integer arrays are int64, spectrograms and done flags float32.
            ``frames`` holds the unpadded frame counts. Positions start
            from 1; padded text positions are 0.
        """
        factor = self._factor
        lead = self._pad_begin
        n_examples = len(examples)

        # lengths
        text_lengths = np.array([ex[1] for ex in examples]).astype(np.int64)
        frames = np.array([ex[5] for ex in examples]).astype(np.int64)
        longest_text = int(np.max(text_lengths))

        # frame axis: round the longest example up to a multiple of
        # ``factor``, then add room for the leading zero frames
        total_frames = int(np.max(frames))
        overshoot = total_frames % factor
        if overshoot != 0:
            total_frames += factor - overshoot
        total_frames += lead
        decoder_steps = total_frames // factor

        # pad time sequence
        text_rows, lin_rows, mel_rows, done_rows = [], [], [], []
        for tokens, n_tokens, _, lin, mel, n_frames in examples:
            text_rows.append(
                np.pad(tokens, (0, longest_text - n_tokens), mode="constant"))

            # same padding for both spectrograms, time axis only
            time_pad = ((lead, total_frames - lead - n_frames), (0, 0))
            lin_rows.append(np.pad(lin, time_pad, mode="constant"))
            mel_rows.append(np.pad(mel, time_pad, mode="constant"))

            # 0 for the first ``n_frames // factor`` decoder steps (floor
            # division), 1 for all remaining steps
            live_steps = int(n_frames // factor)
            done_rows.append(
                np.concatenate([
                    np.zeros((live_steps, )),
                    np.ones((decoder_steps - live_steps, )),
                ]))

        text_sequences = np.array(text_rows).astype(np.int64)
        lin_specs = np.array(lin_rows).astype(np.float32)
        mel_specs = np.array(mel_rows).astype(np.float32)
        done_flags = np.array(done_rows).astype(np.float32)

        # text positions: 1..length inside each sequence, 0 on padding
        position_ids = np.arange(1, 1 + longest_text, dtype=np.int64)
        inside = position_ids <= text_lengths[:, np.newaxis]
        text_positions = position_ids * inside.astype(np.int64)

        # decoder_positions: 1..decoder_steps repeated for every example
        step_ids = np.arange(1, 1 + decoder_steps, dtype=np.int64)
        decoder_positions = np.repeat(
            step_ids[np.newaxis, :], n_examples, axis=0)

        return (text_sequences, text_lengths, text_positions, mel_specs,
                lin_specs, frames, decoder_positions, done_flags)


def make_data_loader(data_root, config):
    """Build the batched data loader for the dataset under ``data_root``.

    Args:
        data_root: dataset directory handed to ``LJSpeechMetaData``.
        config: nested mapping. The keys read are
            ``config["meta_data"]["min_text_length"]``, every ``Transform``
            argument under ``config["transform"]``,
            ``config["train"]["batch_size"]``, and
            ``config["model"]["downsample_factor"]`` /
            ``config["model"]["outputs_per_step"]``.

    Returns:
        The ``fluid.io.DataLoader`` created with ``from_generator``, fed by
        a ``DataCargo`` over the transformed dataset.

    Raises:
        KeyError: if any of the configuration keys above is missing.
    """
    # construct meta data
    meta = LJSpeechMetaData(data_root)

    # drop records whose normalized text is shorter than the minimum
    min_text_length = config["meta_data"]["min_text_length"]
    meta = FilterDataset(meta, lambda x: len(x[2]) >= min_text_length)

    # transform meta data records into model-ready examples
    c = config["transform"]
    transform_keys = ("replace_pronunciation_prob", "sample_rate",
                      "preemphasis", "n_fft", "win_length", "hop_length",
                      "fmin", "fmax", "n_mels", "min_level_db",
                      "ref_level_db", "max_norm", "clip_norm")
    transform = Transform(**{key: c[key] for key in transform_keys})
    ljspeech = TransformDataset(meta, transform)

    # use meta data's text length as a sort key for the sampler; computed on
    # the untransformed records so no audio has to be loaded for it
    batch_size = config["train"]["batch_size"]
    text_lengths = [len(example[2]) for example in meta]

    # The BucketSampler is the only sampler used; it is given the trainer
    # count and this process's rank from the parallel environment.
    env = dg.parallel.ParallelEnv()
    sampler = BucketSampler(
        text_lengths,
        batch_size,
        num_trainers=env.nranks,
        rank=env.local_rank)

    # some model hyperparameters affect how we process data
    model_config = config["model"]
    collector = DataCollector(
        downsample_factor=model_config["downsample_factor"],
        r=model_config["outputs_per_step"])
    ljspeech_loader = DataCargo(
        ljspeech, batch_fn=collector, batch_size=batch_size, sampler=sampler)
    loader = fluid.io.DataLoader.from_generator(capacity=10, return_list=True)
    loader.set_batch_generator(
        ljspeech_loader, places=fluid.framework._current_expected_place())
    return loader