# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data sources and batching utilities for DeepVoice3 (Paddle) training."""

import gc
import io
import platform
import random
from os.path import dirname, expanduser, join

import numpy as np
from nnmnkwii.datasets import FileSourceDataset, FileDataSource

# import global hyper parameters
from hparams import hparams
from deepvoice3_paddle import frontend, builder

_frontend = getattr(frontend, hparams.frontend)


def _pad(seq, max_len, constant_values=0):
    return np.pad(seq, (0, max_len - len(seq)),
                  mode="constant",
                  constant_values=constant_values)


def _pad_2d(x, max_len, b_pad=0):
    x = np.pad(x, [(b_pad, max_len - len(x) - b_pad), (0, 0)],
               mode="constant",
               constant_values=0)
    return x


class TextDataSource(FileDataSource):
    def __init__(self, data_root, speaker_id=None):
        self.data_root = data_root
        self.speaker_ids = None
        self.multi_speaker = False
        # If not None, filter by speaker_id
        self.speaker_id = speaker_id

    def collect_files(self):
        meta = join(self.data_root, "train.txt")
        with io.open(meta, "rt", encoding="utf-8") as f:
            lines = f.readlines()
        l = lines[0].split("|")
        assert len(l) == 4 or len(l) == 5
        self.multi_speaker = len(l) == 5
        texts = list(map(lambda l: l.split("|")[3], lines))
        if self.multi_speaker:
            speaker_ids = list(map(lambda l: int(l.split("|")[-1]), lines))
            # Filter by speaker_id: use a multi-speaker dataset as a
            # single-speaker dataset
            if self.speaker_id is not None:
                indices = np.array(speaker_ids) == self.speaker_id
                texts = list(np.array(texts)[indices])
                self.multi_speaker = False
                return texts
            return texts, speaker_ids
        else:
            return texts

    def collect_features(self, *args):
        if self.multi_speaker:
            text, speaker_id = args
        else:
            text = args[0]
        global _frontend
        if _frontend is None:
            _frontend = getattr(frontend, hparams.frontend)
        seq = _frontend.text_to_sequence(
            text, p=hparams.replace_pronunciation_prob)

        if platform.system() == "Windows":
            if hasattr(hparams, "gc_probability"):
                _frontend = None  # prevent memory leaks on Windows
                if np.random.rand() < hparams.gc_probability:
                    gc.collect()  # force garbage collection
                    print("GC done")

        if self.multi_speaker:
            return np.asarray(seq, dtype=np.int32), int(speaker_id)
        else:
            return np.asarray(seq, dtype=np.int32)


class _NPYDataSource(FileDataSource):
    def __init__(self, data_root, col, speaker_id=None):
        self.data_root = data_root
        self.col = col
        self.frame_lengths = []
        self.speaker_id = speaker_id

    def collect_files(self):
        meta = join(self.data_root, "train.txt")
        with io.open(meta, "rt", encoding="utf-8") as f:
            lines = f.readlines()
        l = lines[0].split("|")
        assert len(l) == 4 or len(l) == 5
        multi_speaker = len(l) == 5
        self.frame_lengths = list(map(lambda l: int(l.split("|")[2]), lines))

        paths = list(map(lambda l: l.split("|")[self.col], lines))
        paths = list(map(lambda f: join(self.data_root, f), paths))

        if multi_speaker and self.speaker_id is not None:
            speaker_ids = list(map(lambda l: int(l.split("|")[-1]), lines))
            # Filter by speaker_id: use a multi-speaker dataset as a
            # single-speaker dataset
            indices = np.array(speaker_ids) == self.speaker_id
            paths = list(np.array(paths)[indices])
            self.frame_lengths = list(np.array(self.frame_lengths)[indices])
            # cast numpy.int64 to plain int
            self.frame_lengths = list(map(int, self.frame_lengths))
        return paths

    def collect_features(self, path):
        return np.load(path)


class MelSpecDataSource(_NPYDataSource):
    def __init__(self, data_root, speaker_id=None):
        super(MelSpecDataSource, self).__init__(data_root, 1, speaker_id)


class LinearSpecDataSource(_NPYDataSource):
    def __init__(self, data_root, speaker_id=None):
        super(LinearSpecDataSource, self).__init__(data_root, 0, speaker_id)
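
# Usage sketch (illustrative, kept as a comment so nothing runs at import
# time): the data sources above are meant to be wrapped in nnmnkwii's
# FileSourceDataset. `data_root` below is an assumed path to the output of
# the preprocessing step, i.e. a directory containing train.txt and the
# saved .npy feature files.
#
#   X = FileSourceDataset(TextDataSource(data_root))
#   Mel = FileSourceDataset(MelSpecDataSource(data_root))
#   Y = FileSourceDataset(LinearSpecDataSource(data_root))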


class PartialyRandomizedSimilarTimeLengthSampler(object):
    """Partially randomized sampler.

    1. Sort indices by frame length.
    2. Pick a small group of consecutive indices and shuffle it.
    3. Permute the mini-batches.
    """

    def __init__(self,
                 lengths,
                 batch_size=16,
                 batch_group_size=None,
                 permutate=True):
        self.sorted_indices = np.argsort(lengths)
        self.lengths = np.array(lengths)[self.sorted_indices]
        self.batch_size = batch_size
        if batch_group_size is None:
            batch_group_size = min(batch_size * 32, len(self.lengths))
            if batch_group_size % batch_size != 0:
                batch_group_size -= batch_group_size % batch_size

        self.batch_group_size = batch_group_size
        assert batch_group_size % batch_size == 0
        self.permutate = permutate

    def __iter__(self):
        indices = self.sorted_indices.copy()
        batch_group_size = self.batch_group_size
        s, e = 0, 0
        for i in range(len(indices) // batch_group_size):
            s = i * batch_group_size
            e = s + batch_group_size
            random.shuffle(indices[s:e])

        # Permute batches
        if self.permutate:
            perm = np.arange(len(indices[:e]) // self.batch_size)
            random.shuffle(perm)
            indices[:e] = indices[:e].reshape(
                -1, self.batch_size)[perm, :].reshape(-1)

        # Handle the last, incomplete group
        s += batch_group_size
        if s < len(indices):
            random.shuffle(indices[s:])

        return iter(indices)

    def __len__(self):
        return len(self.sorted_indices)


class Dataset(object):
    def __init__(self, X, Mel, Y):
        self.X = X
        self.Mel = Mel
        self.Y = Y
        # alias
        self.multi_speaker = X.file_data_source.multi_speaker

    def __getitem__(self, idx):
        if self.multi_speaker:
            text, speaker_id = self.X[idx]
            return text, self.Mel[idx], self.Y[idx], speaker_id
        else:
            return self.X[idx], self.Mel[idx], self.Y[idx]

    def __len__(self):
        return len(self.X)
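
# Usage sketch (illustrative): bundle the wrapped sources into a Dataset and
# order examples with the sampler above, so each mini-batch groups utterances
# of similar length and padding stays small. `hparams.batch_size` is assumed
# to be defined in the hyper parameter module.
#
#   dataset = Dataset(X, Mel, Y)
#   sampler = PartialyRandomizedSimilarTimeLengthSampler(
#       Mel.file_data_source.frame_lengths, batch_size=hparams.batch_size)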


def make_loader(dataset, batch_size, shuffle, sampler, create_batch_fn,
                trainer_count, local_rank):
    assert not (
        shuffle and sampler
    ), "shuffle and sampler should not be set at the same time."
    num_samples = len(dataset)

    def wrapper():
        if sampler is None:
            # list() is required: random.shuffle cannot shuffle a range object
            ids = list(range(num_samples))
            if shuffle:
                random.shuffle(ids)
        else:
            ids = sampler
        batch, batches = [], []
        for idx in ids:
            batch.append(dataset[idx])
            if len(batch) >= batch_size:
                batches.append(batch)
                batch = []
            # Once every trainer has a batch, yield only the one belonging
            # to this trainer (data-parallel sharding).
            if len(batches) >= trainer_count:
                yield create_batch_fn(batches[local_rank])
                batches = []
        # Handle the last, incomplete batch
        if len(batch) > 0:
            batches.append(batch)
        if len(batches) >= trainer_count:
            yield create_batch_fn(batches[local_rank])

    return wrapper


def create_batch(batch):
    """Create a padded training batch from a list of examples."""
    r = hparams.outputs_per_step
    downsample_step = hparams.downsample_step
    multi_speaker = len(batch[0]) == 4

    # Lengths
    input_lengths = [len(x[0]) for x in batch]
    max_input_len = max(input_lengths)
    input_lengths = np.array(input_lengths, dtype=np.int64)

    target_lengths = [len(x[1]) for x in batch]
    max_target_len = max(target_lengths)
    target_lengths = np.array(target_lengths, dtype=np.int64)

    if max_target_len % (r * downsample_step) != 0:
        max_target_len += (r * downsample_step) - max_target_len % (
            r * downsample_step)
        assert max_target_len % (r * downsample_step) == 0

    # Zero-pad the beginning; imitates initial decoder states
    b_pad = r
    max_target_len += b_pad * downsample_step

    x_batch = np.array(
        [_pad(x[0], max_input_len) for x in batch], dtype=np.int64)

    mel_batch = np.array(
        [_pad_2d(
            x[1], max_target_len, b_pad=b_pad) for x in batch],
        dtype=np.float32)

    # down sampling is done here
    if downsample_step > 1:
        mel_batch = mel_batch[:, 0::downsample_step, :]
    mel_batch = np.expand_dims(np.transpose(mel_batch, axes=[0, 2, 1]), axis=2)

    y_batch = np.array(
        [_pad_2d(
            x[2], max_target_len, b_pad=b_pad) for x in batch],
        dtype=np.float32)
    y_batch = np.expand_dims(np.transpose(y_batch, axes=[0, 2, 1]), axis=2)

    # text positions
    text_positions = np.array(
        [_pad(np.arange(1, len(x[0]) + 1), max_input_len) for x in batch],
        dtype=np.int64)

    max_decoder_target_len = max_target_len // r // downsample_step

    # frame positions
    s, e = 1, max_decoder_target_len + 1
    frame_positions = np.tile(
        np.expand_dims(
            np.arange(
                s, e, dtype=np.int64), axis=0), (len(batch), 1))

    # done flags
    done = np.array([
        _pad(
            np.zeros(
                len(x[1]) // r // downsample_step - 1, dtype=np.float32),
            max_decoder_target_len,
            constant_values=1) for x in batch
    ])
    done = np.expand_dims(np.expand_dims(done, axis=1), axis=1)

    if multi_speaker:
        speaker_ids = np.array([x[3] for x in batch])
        return (x_batch, input_lengths, mel_batch, y_batch, text_positions,
                frame_positions, done, target_lengths, speaker_ids)
    else:
        speaker_ids = None
        return (x_batch, input_lengths, mel_batch, y_batch, text_positions,
                frame_positions, done, target_lengths)
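
# Putting it together (illustrative sketch): make_loader returns a generator
# factory; each call to the returned function yields batches produced by
# create_batch. trainer_count and local_rank shard the batch stream across
# data-parallel trainers; 1 and 0 are the single-device values.
#
#   train_loader = make_loader(
#       dataset,
#       batch_size=hparams.batch_size,
#       shuffle=False,
#       sampler=sampler,
#       create_batch_fn=create_batch,
#       trainer_count=1,
#       local_rank=0)
#   for batch in train_loader():
#       x, input_lengths, mel, y = batch[:4]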