Commit 6aac1827 authored by chenfeiyu

refactor for deep voice 3, update wavenet and clarinet to use enable_dygraph

Parent a4dd5acc
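The core pattern change in this commit, sketched below as a minimal illustration (the `device` variable is an assumption standing in for the scripts' `--device` argument, where `-1` selects the CPU): instead of wrapping model code in a `with dg.guard(place):` block, each script now calls `dg.enable_dygraph(place)` once, right after parsing the config, so the rest of the script runs in dygraph mode without extra indentation.

```python
import paddle.fluid.dygraph as dg
from paddle import fluid

device = -1  # assumed stand-in for --device: -1 means CPU, otherwise a GPU index
place = fluid.CPUPlace() if device == -1 else fluid.CUDAPlace(device)

# before: everything below had to be nested inside `with dg.guard(place):`
# after:  enable dygraph mode globally, then build the model and train at top level
dg.enable_dygraph(place)

# e.g. model = make_model(config); the training loop follows unindented
```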
......@@ -25,6 +25,7 @@ from tensorboardX import SummaryWriter
import paddle.fluid.dygraph as dg
from paddle import fluid
fluid.require_version('1.8.0')
from parakeet.modules.weight_norm import WeightNormWrapper
from parakeet.models.wavenet import WaveNet, UpsampleNet
......@@ -64,6 +65,13 @@ if __name__ == "__main__":
with open(args.config, 'rt') as f:
config = ruamel.yaml.safe_load(f)
if args.device == -1:
place = fluid.CPUPlace()
else:
place = fluid.CUDAPlace(args.device)
dg.enable_dygraph(place)
ljspeech_meta = LJSpeechMetaData(args.data)
data_config = config["data"]
......@@ -105,12 +113,6 @@ if __name__ == "__main__":
batch_size=1,
sampler=SequentialSampler(ljspeech_valid))
if args.device == -1:
place = fluid.CPUPlace()
else:
place = fluid.CUDAPlace(args.device)
with dg.guard(place):
# conditioner(upsampling net)
conditioner_config = config["conditioner"]
upsampling_factors = conditioner_config["upsampling_factors"]
......@@ -124,8 +126,8 @@ if __name__ == "__main__":
assert loss_type == "mog" and output_dim == 3, \
"the teacher wavenet should be a wavenet with single gaussian output"
teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim,
n_mels, filter_size, loss_type, log_scale_min)
teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim, n_mels,
filter_size, loss_type, log_scale_min)
# load & freeze upsample_net & teacher
freeze(teacher)
......@@ -152,8 +154,7 @@ if __name__ == "__main__":
# load parameters
if args.checkpoint is not None:
# load from args.checkpoint
iteration = io.load_parameters(
model, checkpoint_path=args.checkpoint)
iteration = io.load_parameters(model, checkpoint_path=args.checkpoint)
else:
# load from "args.output/checkpoints"
checkpoint_dir = os.path.join(args.output, "checkpoints")
......
......@@ -25,10 +25,11 @@ from tensorboardX import SummaryWriter
import paddle.fluid.dygraph as dg
from paddle import fluid
fluid.require_version('1.8.0')
from parakeet.models.wavenet import WaveNet, UpsampleNet
from parakeet.models.clarinet import STFT, Clarinet, ParallelWaveNet
from parakeet.data import TransformDataset, SliceDataset, RandomSampler, SequentialSampler, DataCargo
from parakeet.data import TransformDataset, SliceDataset, CacheDataset, RandomSampler, SequentialSampler, DataCargo
from parakeet.utils.layer_tools import summary, freeze
from parakeet.utils import io
......@@ -66,6 +67,13 @@ if __name__ == "__main__":
with open(args.config, 'rt') as f:
config = ruamel.yaml.safe_load(f)
if args.device == -1:
place = fluid.CPUPlace()
else:
place = fluid.CUDAPlace(args.device)
dg.enable_dygraph(place)
print("Command Line args: ")
for k, v in vars(args).items():
print("{}: {}".format(k, v))
......@@ -83,8 +91,9 @@ if __name__ == "__main__":
ljspeech = TransformDataset(ljspeech_meta, transform)
valid_size = data_config["valid_size"]
ljspeech_valid = SliceDataset(ljspeech, 0, valid_size)
ljspeech_train = SliceDataset(ljspeech, valid_size, len(ljspeech))
ljspeech_valid = CacheDataset(SliceDataset(ljspeech, 0, valid_size))
ljspeech_train = CacheDataset(
SliceDataset(ljspeech, valid_size, len(ljspeech)))
teacher_config = config["teacher"]
n_loop = teacher_config["n_loop"]
......@@ -113,12 +122,6 @@ if __name__ == "__main__":
make_output_tree(args.output)
if args.device == -1:
place = fluid.CPUPlace()
else:
place = fluid.CUDAPlace(args.device)
with dg.guard(place):
# conditioner(upsampling net)
conditioner_config = config["conditioner"]
upsampling_factors = conditioner_config["upsampling_factors"]
......@@ -132,8 +135,8 @@ if __name__ == "__main__":
assert loss_type == "mog" and output_dim == 3, \
"the teacher wavenet should be a wavenet with single gaussian output"
teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim,
n_mels, filter_size, loss_type, log_scale_min)
teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim, n_mels,
filter_size, loss_type, log_scale_min)
freeze(teacher)
student_config = config["student"]
......@@ -217,11 +220,9 @@ if __name__ == "__main__":
audios, mels, audio_starts, clip_kl=global_step > 500)
writer.add_scalar("learning_rate",
optim._learning_rate.step().numpy()[0],
global_step)
optim._learning_rate.step().numpy()[0], global_step)
for k, v in loss_dict.items():
writer.add_scalar("loss/{}".format(k),
v.numpy()[0], global_step)
writer.add_scalar("loss/{}".format(k), v.numpy()[0], global_step)
l = loss_dict["loss"]
step_loss = l.numpy()[0]
......
......@@ -23,6 +23,7 @@ The model consists of an encoder, a decoder and a converter (and a speaker embed
```text
├── data.py data_processing
├── model.py function to create model, criterion and optimizer
├── configs/ (example) configuration files
├── sentences.txt sample sentences
├── synthesis.py script to synthesize waveform from text
......@@ -34,19 +35,20 @@ The model consists of an encoder, a decoder and a converter (and a speaker embed
`train.py` and `synthesis.py` have 3 arguments in common: `--checkpoint`, `--iteration` and `output`.
1. `output` is the directory for saving results.
During training, checkpoints are saved in `checkpoints/` in `output` and the tensorboard log is saved in `log/` in `output`. Other possible outputs are saved in `states/` in `output`.
During synthesizing, audio files and other possible outputs are saved in `synthesis/` in `output`.
During training, checkpoints are saved in `checkpoints/` in `output` and the tensorboard log is saved in `log/` in `output`. Training states, including alignment plots, spectrogram plots and generated audio files, are saved in `states/` in `output`. In addition, we periodically evaluate the model with several given sentences; the alignment plots and generated audio files are saved in `eval/` in `output`.
During synthesizing, audio files and the alignment plots are saved in `synthesis/` in `output`.
So after training and synthesizing with the same output directory, the file structure of the output directory looks like this.
```text
├── checkpoints/ # checkpoint directory (including *.pdparams, *.pdopt and a text file `checkpoint` that records the latest checkpoint)
├── states/ # audio files generated at validation and other possible outputs
├── states/ # alignment plots, spectrogram plots and generated wavs at training
├── log/ # tensorboard log
└── synthesis/ # synthesized audio files and other possible outputs
├── eval/ # audio files and alignment plots generated at evaluation during training
└── synthesis/ # synthesized audio files and alignment plots
```
2. `--checkpoint` and `--iteration` are for loading from an existing checkpoint. Loading an existing checkpoint follows these rules (see the example below):
If `--checkpoint` is provided, the checkpoint specified by `--checkpoint` is loaded.
If `--checkpoint` is provided, the checkpoint at the path specified by `--checkpoint` is loaded.
If `--checkpoint` is not provided, we try to load the checkpoint specified by `--iteration` from the checkpoint directory. If `--iteration` is not provided either, we try to load the latest checkpoint from the checkpoint directory.
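For example (a hedged illustration; `<checkpoint_name>` is a placeholder for a checkpoint actually saved under `experiment/checkpoints/`):
```bash
# resume training from an explicit checkpoint file
python train.py \
    --config=configs/ljspeech.yaml \
    --data=./LJSpeech-1.1/ \
    --checkpoint=experiment/checkpoints/<checkpoint_name> \
    experiment

# or resume from the checkpoint saved at a specific iteration in experiment/checkpoints/
python train.py \
    --config=configs/ljspeech.yaml \
    --data=./LJSpeech-1.1/ \
    --iteration=10000 \
    experiment
```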
## Train
......@@ -100,6 +102,18 @@ python train.py \
experiment
```
To train the model in parallel on multiple GPUs, you can launch the training script with `paddle.distributed.launch`. For example, to train with GPUs `0,1,2,3`, you can use the example script below. Note that for parallel training, devices are specified with `--selected_gpus` passed to `paddle.distributed.launch`. In this case, `--device` passed to `train.py`, if specified, is ignored.
Example script:
```bash
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 \
train.py \
--config=configs/ljspeech.yaml \
--data=./LJSpeech-1.1/ \
experiment
```
You can monitor the training log with tensorboard, using the script below.
```bash
......
......@@ -17,13 +17,16 @@ import os
import csv
from pathlib import Path
import numpy as np
from paddle import fluid
import pandas as pd
import librosa
from scipy import signal, io
import six
from scipy import signal
import paddle.fluid.dygraph as dg
from parakeet.data import DatasetMixin, TransformDataset, FilterDataset
from parakeet.g2p.en import text_to_sequence, sequence_to_text
from parakeet.data import DatasetMixin, TransformDataset, FilterDataset, CacheDataset
from parakeet.data import DataCargo, PartialyRandomizedSimilarTimeLengthSampler, SequentialSampler, BucketSampler
class LJSpeechMetaData(DatasetMixin):
......@@ -50,7 +53,7 @@ class LJSpeechMetaData(DatasetMixin):
class Transform(object):
def __init__(self,
replace_pronounciation_prob=0.,
replace_pronunciation_prob=0.,
sample_rate=22050,
preemphasis=.97,
n_fft=1024,
......@@ -63,7 +66,7 @@ class Transform(object):
ref_level_db=20,
max_norm=0.999,
clip_norm=True):
self.replace_pronounciation_prob = replace_pronounciation_prob
self.replace_pronunciation_prob = replace_pronunciation_prob
self.sample_rate = sample_rate
self.preemphasis = preemphasis
......@@ -85,7 +88,7 @@ class Transform(object):
# text processing
mix_grapheme_phonemes = text_to_sequence(
normalized_text, self.replace_pronounciation_prob)
normalized_text, self.replace_pronunciation_prob)
text_length = len(mix_grapheme_phonemes)
# CAUTION: positions start from 1
speaker_id = None
......@@ -125,8 +128,8 @@ class Transform(object):
# num_frames
n_frames = S_mel_norm.shape[-1] # CAUTION: original number of frames
return (mix_grapheme_phonemes, text_length, speaker_id, S_norm,
S_mel_norm, n_frames)
return (mix_grapheme_phonemes, text_length, speaker_id, S_norm.T,
S_mel_norm.T, n_frames)
class DataCollector(object):
......@@ -166,12 +169,12 @@ class DataCollector(object):
),
mode="constant"))
lin_specs.append(
np.pad(S_norm, ((0, 0), (self._pad_begin, max_frames -
self._pad_begin - num_frames)),
np.pad(S_norm, ((self._pad_begin, max_frames - self._pad_begin
- num_frames), (0, 0)),
mode="constant"))
mel_specs.append(
np.pad(S_mel_norm, ((0, 0), (self._pad_begin, max_frames -
self._pad_begin - num_frames)),
np.pad(S_mel_norm, ((self._pad_begin, max_frames -
self._pad_begin - num_frames), (0, 0)),
mode="constant"))
done_flags.append(
np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )),
......@@ -180,10 +183,10 @@ class DataCollector(object):
mode="constant",
constant_values=1))
text_sequences = np.array(text_sequences).astype(np.int64)
lin_specs = np.transpose(np.array(lin_specs),
(0, 2, 1)).astype(np.float32)
mel_specs = np.transpose(np.array(mel_specs),
(0, 2, 1)).astype(np.float32)
lin_specs = np.array(lin_specs).astype(np.float32)
mel_specs = np.array(mel_specs).astype(np.float32)
# downsample here
done_flags = np.array(done_flags).astype(np.float32)
# text positions
......@@ -201,3 +204,54 @@ class DataCollector(object):
return (text_sequences, text_lengths, text_positions, mel_specs,
lin_specs, frames, decoder_positions, done_flags)
def make_data_loader(data_root, config):
# construct meta data
meta = LJSpeechMetaData(data_root)
# filter it!
min_text_length = config["meta_data"]["min_text_length"]
meta = FilterDataset(meta, lambda x: len(x[2]) >= min_text_length)
# transform meta data into training examples
c = config["transform"]
transform = Transform(
replace_pronunciation_prob=c["replace_pronunciation_prob"],
sample_rate=c["sample_rate"],
preemphasis=c["preemphasis"],
n_fft=c["n_fft"],
win_length=c["win_length"],
hop_length=c["hop_length"],
fmin=c["fmin"],
fmax=c["fmax"],
n_mels=c["n_mels"],
min_level_db=c["min_level_db"],
ref_level_db=c["ref_level_db"],
max_norm=c["max_norm"],
clip_norm=c["clip_norm"])
ljspeech = CacheDataset(TransformDataset(meta, transform))
# use meta data's text length as a sort key for the sampler
batch_size = config["train"]["batch_size"]
text_lengths = [len(example[2]) for example in meta]
sampler = PartialyRandomizedSimilarTimeLengthSampler(text_lengths,
batch_size)
env = dg.parallel.ParallelEnv()
num_trainers = env.nranks
local_rank = env.local_rank
sampler = BucketSampler(
text_lengths, batch_size, num_trainers=num_trainers, rank=local_rank)
# some model hyperparameters affect how we process data
model_config = config["model"]
collector = DataCollector(
downsample_factor=model_config["downsample_factor"],
r=model_config["outputs_per_step"])
ljspeech_loader = DataCargo(
ljspeech, batch_fn=collector, batch_size=batch_size, sampler=sampler)
loader = fluid.io.DataLoader.from_generator(capacity=10, return_list=True)
loader.set_batch_generator(
ljspeech_loader, places=fluid.framework._current_expected_place())
return loader
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import fluid
import paddle.fluid.initializer as I
import paddle.fluid.dygraph as dg
from parakeet.g2p import en
from parakeet.models.deepvoice3 import Encoder, Decoder, Converter, DeepVoice3, TTSLoss, ConvSpec, WindowRange
from parakeet.utils.layer_tools import summary, freeze
def make_model(config):
c = config["model"]
# speaker embedding
n_speakers = c["n_speakers"]
speaker_dim = c["speaker_embed_dim"]
if n_speakers > 1:
speaker_embed = dg.Embedding(
(n_speakers, speaker_dim),
param_attr=I.Normal(scale=c["speaker_embedding_weight_std"]))
else:
speaker_embed = None
# encoder
h = c["encoder_channels"]
k = c["kernel_size"]
encoder_convolutions = (
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(h, k, 9),
ConvSpec(h, k, 27),
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(h, k, 9),
ConvSpec(h, k, 27),
ConvSpec(h, k, 1),
ConvSpec(h, k, 3), )
encoder = Encoder(
n_vocab=en.n_vocab,
embed_dim=c["text_embed_dim"],
n_speakers=n_speakers,
speaker_dim=speaker_dim,
embedding_weight_std=c["embedding_weight_std"],
convolutions=encoder_convolutions,
dropout=c["dropout"])
if c["freeze_embedding"]:
freeze(encoder.embed)
# decoder
h = c["decoder_channels"]
k = c["kernel_size"]
prenet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3))
attentive_convolutions = (
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(h, k, 9),
ConvSpec(h, k, 27),
ConvSpec(h, k, 1), )
attention = [True, False, False, False, True]
force_monotonic_attention = [True, False, False, False, True]
window = WindowRange(c["window_backward"], c["window_ahead"])
decoder = Decoder(
n_speakers,
speaker_dim,
embed_dim=c["text_embed_dim"],
mel_dim=config["transform"]["n_mels"],
r=c["outputs_per_step"],
max_positions=c["max_positions"],
preattention=prenet_convolutions,
convolutions=attentive_convolutions,
attention=attention,
dropout=c["dropout"],
use_memory_mask=c["use_memory_mask"],
force_monotonic_attention=force_monotonic_attention,
query_position_rate=c["query_position_rate"],
key_position_rate=c["key_position_rate"],
window_range=window,
key_projection=c["key_projection"],
value_projection=c["value_projection"])
if not c["trainable_positional_encodings"]:
freeze(decoder.embed_keys_positions)
freeze(decoder.embed_query_positions)
# converter(postnet)
linear_dim = 1 + config["transform"]["n_fft"] // 2
h = c["converter_channels"]
k = c["kernel_size"]
postnet_convolutions = (
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(2 * h, k, 1),
ConvSpec(2 * h, k, 3), )
use_decoder_states = c["use_decoder_state_for_postnet_input"]
converter = Converter(
n_speakers,
speaker_dim,
in_channels=decoder.state_dim
if use_decoder_states else config["transform"]["n_mels"],
linear_dim=linear_dim,
time_upsampling=c["downsample_factor"],
convolutions=postnet_convolutions,
dropout=c["dropout"])
model = DeepVoice3(
encoder,
decoder,
converter,
speaker_embed,
use_decoder_states=use_decoder_states)
return model
def make_criterion(config):
# =========================loss=========================
loss_config = config["loss"]
transform_config = config["transform"]
model_config = config["model"]
priority_freq = loss_config["priority_freq"] # Hz
sample_rate = transform_config["sample_rate"]
linear_dim = 1 + transform_config["n_fft"] // 2
priority_bin = int(priority_freq / (0.5 * sample_rate) * linear_dim)
criterion = TTSLoss(
masked_weight=loss_config["masked_loss_weight"],
priority_bin=priority_bin,
priority_weight=loss_config["priority_freq_weight"],
binary_divergence_weight=loss_config["binary_divergence_weight"],
guided_attention_sigma=loss_config["guided_attention_sigma"],
downsample_factor=model_config["downsample_factor"],
r=model_config["outputs_per_step"])
return criterion
def make_optimizer(model, config):
# =========================lr_scheduler=========================
lr_config = config["lr_scheduler"]
warmup_steps = lr_config["warmup_steps"]
peak_learning_rate = lr_config["peak_learning_rate"]
lr_scheduler = dg.NoamDecay(1 / (warmup_steps * (peak_learning_rate)**2),
warmup_steps)
# =========================optimizer=========================
optim_config = config["optimizer"]
optim = fluid.optimizer.Adam(
lr_scheduler,
beta1=optim_config["beta1"],
beta2=optim_config["beta2"],
epsilon=optim_config["epsilon"],
parameter_list=model.parameters(),
grad_clip=fluid.clip.GradientClipByGlobalNorm(0.1))
return optim
......@@ -20,6 +20,7 @@ import numpy as np
import soundfile as sf
from paddle import fluid
fluid.require_version('1.8.0')
import paddle.fluid.layers as F
import paddle.fluid.dygraph as dg
from tensorboardX import SummaryWriter
......@@ -29,7 +30,8 @@ from parakeet.modules.weight_norm import WeightNormWrapper
from parakeet.utils.layer_tools import summary
from parakeet.utils import io
from utils import make_model, eval_model, plot_alignment
from model import make_model
from utils import make_evaluator
if __name__ == "__main__":
parser = argparse.ArgumentParser(
......@@ -61,101 +63,29 @@ if __name__ == "__main__":
else:
place = fluid.CUDAPlace(args.device)
with dg.guard(place):
# =========================model=========================
transform_config = config["transform"]
replace_pronounciation_prob = transform_config[
"replace_pronunciation_prob"]
sample_rate = transform_config["sample_rate"]
preemphasis = transform_config["preemphasis"]
n_fft = transform_config["n_fft"]
n_mels = transform_config["n_mels"]
model_config = config["model"]
downsample_factor = model_config["downsample_factor"]
r = model_config["outputs_per_step"]
n_speakers = model_config["n_speakers"]
speaker_dim = model_config["speaker_embed_dim"]
speaker_embed_std = model_config["speaker_embedding_weight_std"]
n_vocab = en.n_vocab
embed_dim = model_config["text_embed_dim"]
linear_dim = 1 + n_fft // 2
use_decoder_states = model_config[
"use_decoder_state_for_postnet_input"]
filter_size = model_config["kernel_size"]
encoder_channels = model_config["encoder_channels"]
decoder_channels = model_config["decoder_channels"]
converter_channels = model_config["converter_channels"]
dropout = model_config["dropout"]
padding_idx = model_config["padding_idx"]
embedding_std = model_config["embedding_weight_std"]
max_positions = model_config["max_positions"]
freeze_embedding = model_config["freeze_embedding"]
trainable_positional_encodings = model_config[
"trainable_positional_encodings"]
use_memory_mask = model_config["use_memory_mask"]
query_position_rate = model_config["query_position_rate"]
key_position_rate = model_config["key_position_rate"]
window_backward = model_config["window_backward"]
window_ahead = model_config["window_ahead"]
key_projection = model_config["key_projection"]
value_projection = model_config["value_projection"]
dv3 = make_model(
n_speakers, speaker_dim, speaker_embed_std, embed_dim, padding_idx,
embedding_std, max_positions, n_vocab, freeze_embedding,
filter_size, encoder_channels, n_mels, decoder_channels, r,
trainable_positional_encodings, use_memory_mask,
query_position_rate, key_position_rate, window_backward,
window_ahead, key_projection, value_projection, downsample_factor,
linear_dim, use_decoder_states, converter_channels, dropout)
summary(dv3)
dg.enable_dygraph(place)
model = make_model(config)
checkpoint_dir = os.path.join(args.output, "checkpoints")
if args.checkpoint is not None:
iteration = io.load_parameters(
dv3, checkpoint_path=args.checkpoint)
iteration = io.load_parameters(model, checkpoint_path=args.checkpoint)
else:
iteration = io.load_parameters(
dv3, checkpoint_dir=checkpoint_dir, iteration=args.iteration)
model, checkpoint_dir=checkpoint_dir, iteration=args.iteration)
# WARNING: don't forget to remove weight norm to re-compute each wrapped layer's weight
# removing weight norm also speeds up computation
for layer in dv3.sublayers():
for layer in model.sublayers():
if isinstance(layer, WeightNormWrapper):
layer.remove_weight_norm()
transform_config = config["transform"]
c = transform_config["replace_pronunciation_prob"]
sample_rate = transform_config["sample_rate"]
min_level_db = transform_config["min_level_db"]
ref_level_db = transform_config["ref_level_db"]
preemphasis = transform_config["preemphasis"]
win_length = transform_config["win_length"]
hop_length = transform_config["hop_length"]
synthesis_config = config["synthesis"]
power = synthesis_config["power"]
n_iter = synthesis_config["n_iter"]
synthesis_dir = os.path.join(args.output, "synthesis")
if not os.path.exists(synthesis_dir):
os.makedirs(synthesis_dir)
with open(args.text, "rt", encoding="utf-8") as f:
lines = f.readlines()
for idx, line in enumerate(lines):
text = line[:-1]
dv3.eval()
wav, attn = eval_model(dv3, text, replace_pronounciation_prob,
min_level_db, ref_level_db, power,
n_iter, win_length, hop_length,
preemphasis)
plot_alignment(
attn,
os.path.join(synthesis_dir,
"test_{}_step_{}.png".format(idx, iteration)))
sf.write(
os.path.join(synthesis_dir,
"test_{}_step{}.wav".format(idx, iteration)),
wav, sample_rate)
sentences = [line[:-1] for line in lines]
evaluator = make_evaluator(config, sentences, synthesis_dir)
evaluator(model, iteration)
......@@ -13,57 +13,37 @@
# limitations under the License.
from __future__ import division
import time
import os
import argparse
import ruamel.yaml
import numpy as np
import matplotlib
matplotlib.use("agg")
from matplotlib import cm
import matplotlib.pyplot as plt
import tqdm
import librosa
from librosa import display
import soundfile as sf
from tensorboardX import SummaryWriter
from paddle import fluid
fluid.require_version('1.8.0')
import paddle.fluid.layers as F
import paddle.fluid.dygraph as dg
from parakeet.utils.io import load_parameters, save_parameters
from parakeet.g2p import en
from parakeet.data import FilterDataset, TransformDataset, FilterDataset
from parakeet.data import DataCargo, PartialyRandomizedSimilarTimeLengthSampler, SequentialSampler
from parakeet.models.deepvoice3 import Encoder, Decoder, Converter, DeepVoice3, ConvSpec
from parakeet.models.deepvoice3.loss import TTSLoss
from parakeet.utils.layer_tools import summary
from parakeet.utils import io
from data import LJSpeechMetaData, DataCollector, Transform
from utils import make_model, eval_model, save_state, make_output_tree, plot_alignment
from data import make_data_loader
from model import make_model, make_criterion, make_optimizer
from utils import make_output_tree, add_options, get_place, Evaluator, StateSaver, make_evaluator, make_state_saver
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Train a Deep Voice 3 model with LJSpeech dataset.")
parser.add_argument("--config", type=str, help="experimrnt config")
parser.add_argument(
"--data",
type=str,
default="/workspace/datasets/LJSpeech-1.1/",
help="The path of the LJSpeech dataset.")
parser.add_argument("--device", type=int, default=-1, help="device to use")
g = parser.add_mutually_exclusive_group()
g.add_argument("--checkpoint", type=str, help="checkpoint to resume from.")
g.add_argument(
"--iteration",
type=int,
help="the iteration of the checkpoint to load from output directory")
add_options(parser)
args, _ = parser.parse_known_args()
parser.add_argument(
"output", type=str, default="experiment", help="path to save results")
# only use args.device when training in single process
# when training with distributed.launch, devices are provided by
# `--selected_gpus` for distributed.launch
env = dg.parallel.ParallelEnv()
device_id = env.dev_id if env.nranks > 1 else args.device
place = get_place(device_id)
# start dygraph
dg.enable_dygraph(place)
args, _ = parser.parse_known_args()
with open(args.config, 'rt') as f:
config = ruamel.yaml.safe_load(f)
......@@ -71,267 +51,122 @@ if __name__ == "__main__":
for k, v in vars(args).items():
print("{}: {}".format(k, v))
# =========================dataset=========================
# construct meta data
data_root = args.data
meta = LJSpeechMetaData(data_root)
# filter it!
min_text_length = config["meta_data"]["min_text_length"]
meta = FilterDataset(meta, lambda x: len(x[2]) >= min_text_length)
# transform meta data into training examples
transform_config = config["transform"]
replace_pronounciation_prob = transform_config[
"replace_pronunciation_prob"]
sample_rate = transform_config["sample_rate"]
preemphasis = transform_config["preemphasis"]
n_fft = transform_config["n_fft"]
win_length = transform_config["win_length"]
hop_length = transform_config["hop_length"]
fmin = transform_config["fmin"]
fmax = transform_config["fmax"]
n_mels = transform_config["n_mels"]
min_level_db = transform_config["min_level_db"]
ref_level_db = transform_config["ref_level_db"]
max_norm = transform_config["max_norm"]
clip_norm = transform_config["clip_norm"]
transform = Transform(replace_pronounciation_prob, sample_rate,
preemphasis, n_fft, win_length, hop_length, fmin,
fmax, n_mels, min_level_db, ref_level_db, max_norm,
clip_norm)
ljspeech = TransformDataset(meta, transform)
# =========================dataiterator=========================
# use meta data's text length as a sort key for the sampler
train_config = config["train"]
batch_size = train_config["batch_size"]
text_lengths = [len(example[2]) for example in meta]
sampler = PartialyRandomizedSimilarTimeLengthSampler(text_lengths,
batch_size)
# some hyperparameters affect how we process data, so create a data collector!
model_config = config["model"]
downsample_factor = model_config["downsample_factor"]
r = model_config["outputs_per_step"]
collector = DataCollector(downsample_factor=downsample_factor, r=r)
ljspeech_loader = DataCargo(
ljspeech, batch_fn=collector, batch_size=batch_size, sampler=sampler)
# =========================model=========================
if args.device == -1:
place = fluid.CPUPlace()
else:
place = fluid.CUDAPlace(args.device)
with dg.guard(place):
# =========================model=========================
n_speakers = model_config["n_speakers"]
speaker_dim = model_config["speaker_embed_dim"]
speaker_embed_std = model_config["speaker_embedding_weight_std"]
n_vocab = en.n_vocab
embed_dim = model_config["text_embed_dim"]
linear_dim = 1 + n_fft // 2
use_decoder_states = model_config[
"use_decoder_state_for_postnet_input"]
filter_size = model_config["kernel_size"]
encoder_channels = model_config["encoder_channels"]
decoder_channels = model_config["decoder_channels"]
converter_channels = model_config["converter_channels"]
dropout = model_config["dropout"]
padding_idx = model_config["padding_idx"]
embedding_std = model_config["embedding_weight_std"]
max_positions = model_config["max_positions"]
freeze_embedding = model_config["freeze_embedding"]
trainable_positional_encodings = model_config[
"trainable_positional_encodings"]
use_memory_mask = model_config["use_memory_mask"]
query_position_rate = model_config["query_position_rate"]
key_position_rate = model_config["key_position_rate"]
window_backward = model_config["window_backward"]
window_ahead = model_config["window_ahead"]
key_projection = model_config["key_projection"]
value_projection = model_config["value_projection"]
dv3 = make_model(
n_speakers, speaker_dim, speaker_embed_std, embed_dim, padding_idx,
embedding_std, max_positions, n_vocab, freeze_embedding,
filter_size, encoder_channels, n_mels, decoder_channels, r,
trainable_positional_encodings, use_memory_mask,
query_position_rate, key_position_rate, window_backward,
window_ahead, key_projection, value_projection, downsample_factor,
linear_dim, use_decoder_states, converter_channels, dropout)
summary(dv3)
# =========================loss=========================
loss_config = config["loss"]
masked_weight = loss_config["masked_loss_weight"]
priority_freq = loss_config["priority_freq"] # Hz
priority_bin = int(priority_freq / (0.5 * sample_rate) * linear_dim)
priority_freq_weight = loss_config["priority_freq_weight"]
binary_divergence_weight = loss_config["binary_divergence_weight"]
guided_attention_sigma = loss_config["guided_attention_sigma"]
criterion = TTSLoss(
masked_weight=masked_weight,
priority_bin=priority_bin,
priority_weight=priority_freq_weight,
binary_divergence_weight=binary_divergence_weight,
guided_attention_sigma=guided_attention_sigma,
downsample_factor=downsample_factor,
r=r)
# =========================lr_scheduler=========================
lr_config = config["lr_scheduler"]
warmup_steps = lr_config["warmup_steps"]
peak_learning_rate = lr_config["peak_learning_rate"]
lr_scheduler = dg.NoamDecay(
1 / (warmup_steps * (peak_learning_rate)**2), warmup_steps)
# =========================optimizer=========================
optim_config = config["optimizer"]
beta1 = optim_config["beta1"]
beta2 = optim_config["beta2"]
epsilon = optim_config["epsilon"]
optim = fluid.optimizer.Adam(
lr_scheduler,
beta1,
beta2,
epsilon=epsilon,
parameter_list=dv3.parameters(),
grad_clip=fluid.clip.GradientClipByGlobalNorm(0.1))
data_loader = make_data_loader(args.data, config)
model = make_model(config)
if env.nranks > 1:
strategy = dg.parallel.prepare_context()
model = dg.DataParallel(model, strategy)
criterion = make_criterion(config)
optim = make_optimizer(model, config)
# generation
synthesis_config = config["synthesis"]
power = synthesis_config["power"]
n_iter = synthesis_config["n_iter"]
# =========================link(dataloader, paddle)=========================
loader = fluid.io.DataLoader.from_generator(
capacity=10, return_list=True)
loader.set_batch_generator(ljspeech_loader, places=place)
# tensorboard & checkpoint preparation
output_dir = args.output
ckpt_dir = os.path.join(output_dir, "checkpoints")
log_dir = os.path.join(output_dir, "log")
state_dir = os.path.join(output_dir, "states")
eval_dir = os.path.join(output_dir, "eval")
if env.local_rank == 0:
make_output_tree(output_dir)
writer = SummaryWriter(logdir=log_dir)
else:
writer = None
sentences = [
"Scientists at the CERN laboratory say they have discovered a new particle.",
"There's a way to measure the acute emotional intelligence that has never gone out of style.",
"President Trump met with other leaders at the Group of 20 conference.",
"Generative adversarial network or variational auto-encoder.",
"Please call Stella.",
"Some have accepted this as a miracle without any physical explanation.",
]
evaluator = make_evaluator(config, sentences, eval_dir, writer)
state_saver = make_state_saver(config, state_dir, writer)
# load parameters and optimizer, and update the number of iterations done so far
if args.checkpoint is not None:
iteration = io.load_parameters(
dv3, optim, checkpoint_path=args.checkpoint)
iteration = load_parameters(
model, optim, checkpoint_path=args.checkpoint)
else:
iteration = io.load_parameters(
dv3, optim, checkpoint_dir=ckpt_dir, iteration=args.iteration)
iteration = load_parameters(
model, optim, checkpoint_dir=ckpt_dir, iteration=args.iteration)
# =========================train=========================
train_config = config["train"]
max_iter = train_config["max_iteration"]
snap_interval = train_config["snap_interval"]
save_interval = train_config["save_interval"]
eval_interval = train_config["eval_interval"]
global_step = iteration + 1
iterator = iter(tqdm.tqdm(loader))
iterator = iter(tqdm.tqdm(data_loader))
downsample_factor = config["model"]["downsample_factor"]
while global_step <= max_iter:
try:
batch = next(iterator)
except StopIteration as e:
iterator = iter(tqdm.tqdm(loader))
iterator = iter(tqdm.tqdm(data_loader))
batch = next(iterator)
dv3.train()
(text_sequences, text_lengths, text_positions, mel_specs,
lin_specs, frames, decoder_positions, done_flags) = batch
model.train()
(text_sequences, text_lengths, text_positions, mel_specs, lin_specs,
frames, decoder_positions, done_flags) = batch
downsampled_mel_specs = F.strided_slice(
mel_specs,
axes=[1],
starts=[0],
ends=[mel_specs.shape[1]],
strides=[downsample_factor])
mel_outputs, linear_outputs, alignments, done = dv3(
text_sequences, text_positions, text_lengths, None,
downsampled_mel_specs, decoder_positions)
outputs = model(
text_sequences,
text_positions,
text_lengths,
None,
downsampled_mel_specs,
decoder_positions, )
# mel_outputs, linear_outputs, alignments, done
inputs = (downsampled_mel_specs, lin_specs, done_flags, text_lengths,
frames)
losses = criterion(outputs, inputs)
losses = criterion(mel_outputs, linear_outputs, done, alignments,
downsampled_mel_specs, lin_specs, done_flags,
text_lengths, frames)
l = losses["loss"]
if env.nranks > 1:
l = model.scale_loss(l)
l.backward()
model.apply_collective_grads()
else:
l.backward()
# record learning rate before updating
if env.local_rank == 0:
writer.add_scalar("learning_rate",
optim._learning_rate.step().numpy(), global_step)
optim.minimize(l)
optim.clear_gradients()
# ==================all kinds of tedious things=================
# record step loss into tensorboard
step_loss = {
k: v.numpy()[0]
for k, v in losses.items() if v is not None
}
tqdm.tqdm.write("global_step: {}\tloss: {}".format(
# record step losses
step_loss = {k: v.numpy()[0] for k, v in losses.items()}
if env.local_rank == 0:
tqdm.tqdm.write("[Train] global_step: {}\tloss: {}".format(
global_step, step_loss["loss"]))
for k, v in step_loss.items():
writer.add_scalar(k, v, global_step)
# train state saving, the first sentence in the batch
if global_step % snap_interval == 0:
save_state(
state_dir,
writer,
global_step,
mel_input=downsampled_mel_specs,
mel_output=mel_outputs,
lin_input=lin_specs,
lin_output=linear_outputs,
alignments=alignments,
win_length=win_length,
hop_length=hop_length,
min_level_db=min_level_db,
ref_level_db=ref_level_db,
power=power,
n_iter=n_iter,
preemphasis=preemphasis,
sample_rate=sample_rate)
if env.local_rank == 0 and global_step % snap_interval == 0:
input_specs = (mel_specs, lin_specs)
state_saver(outputs, input_specs, global_step)
# evaluation
if global_step % eval_interval == 0:
sentences = [
"Scientists at the CERN laboratory say they have discovered a new particle.",
"There's a way to measure the acute emotional intelligence that has never gone out of style.",
"President Trump met with other leaders at the Group of 20 conference.",
"Generative adversarial network or variational auto-encoder.",
"Please call Stella.",
"Some have accepted this as a miracle without any physical explanation.",
]
for idx, sent in enumerate(sentences):
wav, attn = eval_model(
dv3, sent, replace_pronounciation_prob, min_level_db,
ref_level_db, power, n_iter, win_length, hop_length,
preemphasis)
wav_path = os.path.join(
state_dir, "waveform",
"eval_sample_{:09d}.wav".format(global_step))
sf.write(wav_path, wav, sample_rate)
writer.add_audio(
"eval_sample_{}".format(idx),
wav,
global_step,
sample_rate=sample_rate)
attn_path = os.path.join(
state_dir, "alignments",
"eval_sample_attn_{:09d}.png".format(global_step))
plot_alignment(attn, attn_path)
writer.add_image(
"eval_sample_attn{}".format(idx),
cm.viridis(attn),
global_step,
dataformats="HWC")
if env.local_rank == 0 and global_step % eval_interval == 0:
evaluator(model, global_step)
# save checkpoint
if global_step % save_interval == 0:
io.save_parameters(ckpt_dir, global_step, dv3, optim)
if env.local_rank == 0 and global_step % save_interval == 0:
save_parameters(ckpt_dir, global_step, model, optim)
global_step += 1
......@@ -15,6 +15,8 @@
from __future__ import division
import os
import numpy as np
import matplotlib
matplotlib.use("agg")
from matplotlib import cm
import matplotlib.pyplot as plt
import librosa
......@@ -24,232 +26,220 @@ import soundfile as sf
from paddle import fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.initializer as I
from parakeet.g2p import en
from parakeet.models.deepvoice3.encoder import ConvSpec
from parakeet.models.deepvoice3 import Encoder, Decoder, Converter, DeepVoice3, WindowRange
from parakeet.utils.layer_tools import freeze
@fluid.framework.dygraph_only
def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
padding_idx, embedding_std, max_positions, n_vocab,
freeze_embedding, filter_size, encoder_channels, mel_dim,
decoder_channels, r, trainable_positional_encodings,
use_memory_mask, query_position_rate, key_position_rate,
window_behind, window_ahead, key_projection, value_projection,
downsample_factor, linear_dim, use_decoder_states,
converter_channels, dropout):
"""just a simple function to create a deepvoice 3 model"""
if n_speakers > 1:
spe = dg.Embedding(
(n_speakers, speaker_dim),
param_attr=I.Normal(scale=speaker_embed_std))
def get_place(device_id):
"""get place from device_id, -1 stands for CPU"""
if device_id == -1:
place = fluid.CPUPlace()
else:
spe = None
h = encoder_channels
k = filter_size
encoder_convolutions = (
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(h, k, 9),
ConvSpec(h, k, 27),
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(h, k, 9),
ConvSpec(h, k, 27),
ConvSpec(h, k, 1),
ConvSpec(h, k, 3), )
enc = Encoder(
n_vocab,
embed_dim,
n_speakers,
speaker_dim,
padding_idx=None,
embedding_weight_std=embedding_std,
convolutions=encoder_convolutions,
dropout=dropout)
if freeze_embedding:
freeze(enc.embed)
h = decoder_channels
prenet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3))
attentive_convolutions = (
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(h, k, 9),
ConvSpec(h, k, 27),
ConvSpec(h, k, 1), )
attention = [True, False, False, False, True]
force_monotonic_attention = [True, False, False, False, True]
dec = Decoder(
n_speakers,
speaker_dim,
embed_dim,
mel_dim,
r=r,
max_positions=max_positions,
preattention=prenet_convolutions,
convolutions=attentive_convolutions,
attention=attention,
dropout=dropout,
use_memory_mask=use_memory_mask,
force_monotonic_attention=force_monotonic_attention,
query_position_rate=query_position_rate,
key_position_rate=key_position_rate,
window_range=WindowRange(window_behind, window_ahead),
key_projection=key_projection,
value_projection=value_projection)
if not trainable_positional_encodings:
freeze(dec.embed_keys_positions)
freeze(dec.embed_query_positions)
h = converter_channels
postnet_convolutions = (
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(2 * h, k, 1),
ConvSpec(2 * h, k, 3), )
cvt = Converter(
n_speakers,
speaker_dim,
dec.state_dim if use_decoder_states else mel_dim,
linear_dim,
time_upsampling=downsample_factor,
convolutions=postnet_convolutions,
dropout=dropout)
dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states)
return dv3
@fluid.framework.dygraph_only
def eval_model(model, text, replace_pronounciation_prob, min_level_db,
ref_level_db, power, n_iter, win_length, hop_length,
preemphasis):
"""generate waveform from text using a deepvoice 3 model"""
place = fluid.CUDAPlace(device_id)
return place
def add_options(parser):
parser.add_argument("--config", type=str, help="experimrnt config")
parser.add_argument(
"--data",
type=str,
default="/workspace/datasets/LJSpeech-1.1/",
help="The path of the LJSpeech dataset.")
parser.add_argument("--device", type=int, default=-1, help="device to use")
g = parser.add_mutually_exclusive_group()
g.add_argument("--checkpoint", type=str, help="checkpoint to resume from.")
g.add_argument(
"--iteration",
type=int,
help="the iteration of the checkpoint to load from output directory")
parser.add_argument(
"output", type=str, default="experiment", help="path to save results")
def make_evaluator(config, text_sequences, output_dir, writer=None):
c = config["transform"]
p_replace = c["replace_pronunciation_prob"]
sample_rate = c["sample_rate"]
preemphasis = c["preemphasis"]
win_length = c["win_length"]
hop_length = c["hop_length"]
min_level_db = c["min_level_db"]
ref_level_db = c["ref_level_db"]
synthesis_config = config["synthesis"]
power = synthesis_config["power"]
n_iter = synthesis_config["n_iter"]
return Evaluator(
text_sequences,
p_replace,
sample_rate,
preemphasis,
win_length,
hop_length,
min_level_db,
ref_level_db,
power,
n_iter,
output_dir=output_dir,
writer=writer)
class Evaluator(object):
def __init__(self,
text_sequences,
p_replace,
sample_rate,
preemphasis,
win_length,
hop_length,
min_level_db,
ref_level_db,
power,
n_iter,
output_dir,
writer=None):
self.text_sequences = text_sequences
self.output_dir = output_dir
self.writer = writer
self.p_replace = p_replace
self.sample_rate = sample_rate
self.preemphasis = preemphasis
self.win_length = win_length
self.hop_length = hop_length
self.min_level_db = min_level_db
self.ref_level_db = ref_level_db
self.power = power
self.n_iter = n_iter
def process_a_sentence(self, model, text):
text = np.array(
en.text_to_sequence(
text, p=replace_pronounciation_prob),
dtype=np.int64)
text, p=self.p_replace), dtype=np.int64)
length = len(text)
print("text sequence's length: {}".format(length))
text_positions = np.arange(1, 1 + length)
text = np.expand_dims(text, 0)
text_positions = np.expand_dims(text_positions, 0)
model.eval()
mel_outputs, linear_outputs, alignments, done = model.transduce(
if isinstance(model, dg.DataParallel):
_model = model._layers
else:
_model = model
mel_outputs, linear_outputs, alignments, done = _model.transduce(
dg.to_variable(text), dg.to_variable(text_positions))
linear_outputs_np = linear_outputs.numpy()[0].T # (C, T)
wav = spec_to_waveform(linear_outputs_np, min_level_db, ref_level_db,
power, n_iter, win_length, hop_length, preemphasis)
wav = spec_to_waveform(linear_outputs_np, self.min_level_db,
self.ref_level_db, self.power, self.n_iter,
self.win_length, self.hop_length,
self.preemphasis)
alignments_np = alignments.numpy()[0] # batch_size = 1
print("linear_outputs's shape: ", linear_outputs_np.shape)
print("alignmnets' shape:", alignments.shape)
return wav, alignments_np
def __call__(self, model, iteration):
writer = self.writer
for i, seq in enumerate(self.text_sequences):
print("[Eval] synthesizing sentence {}".format(i))
wav, alignments_np = self.process_a_sentence(model, seq)
wav_path = os.path.join(
self.output_dir,
"eval_sample_{}_step_{:09d}.wav".format(i, iteration))
sf.write(wav_path, wav, self.sample_rate)
if writer is not None:
writer.add_audio(
"eval_sample_{}".format(i),
wav,
iteration,
sample_rate=self.sample_rate)
attn_path = os.path.join(
self.output_dir,
"eval_sample_{}_step_{:09d}.png".format(i, iteration))
plot_alignment(alignments_np, attn_path)
if writer is not None:
writer.add_image(
"eval_sample_attn_{}".format(i),
cm.viridis(alignments_np),
iteration,
dataformats="HWC")
def spec_to_waveform(spec, min_level_db, ref_level_db, power, n_iter,
win_length, hop_length, preemphasis):
"""Convert output linear spec to waveform using griffin-lim vocoder.
Args:
spec (ndarray): the output linear spectrogram, shape(C, T), where C means n_fft, T means frames.
"""
denormalized = np.clip(spec, 0, 1) * (-min_level_db) + min_level_db
lin_scaled = np.exp((denormalized + ref_level_db) / 20 * np.log(10))
wav = librosa.griffinlim(
lin_scaled**power,
n_iter=n_iter,
hop_length=hop_length,
win_length=win_length)
if preemphasis > 0:
wav = signal.lfilter([1.], [1., -preemphasis], wav)
return wav
def make_output_tree(output_dir):
print("creating output tree: {}".format(output_dir))
ckpt_dir = os.path.join(output_dir, "checkpoints")
state_dir = os.path.join(output_dir, "states")
log_dir = os.path.join(output_dir, "log")
for x in [ckpt_dir, state_dir]:
if not os.path.exists(x):
os.makedirs(x)
for x in ["alignments", "waveform", "lin_spec", "mel_spec"]:
p = os.path.join(state_dir, x)
if not os.path.exists(p):
os.makedirs(p)
def plot_alignment(alignment, path):
"""
Plot an attention layer's alignment for a sentence.
alignment: shape(T_dec, T_enc).
"""
plt.figure()
plt.imshow(alignment)
plt.colorbar()
plt.xlabel('Encoder timestep')
plt.ylabel('Decoder timestep')
plt.savefig(path)
plt.close()
def save_state(save_dir,
writer,
global_step,
mel_input=None,
mel_output=None,
lin_input=None,
lin_output=None,
alignments=None,
win_length=1024,
hop_length=256,
min_level_db=-100,
ref_level_db=20,
power=1.4,
n_iter=32,
preemphasis=0.97,
sample_rate=22050):
"""Save training intermediate results. Save states for the first sentence in the batch, including
mel_spec(predicted, target), lin_spec(predicted, target), attn, waveform.
Args:
save_dir (str): directory to save results.
writer (SummaryWriter): tensorboardX summary writer
global_step (int): global step.
mel_input (Variable, optional): Defaults to None. Shape(B, T_mel, C_mel)
mel_output (Variable, optional): Defaults to None. Shape(B, T_mel, C_mel)
lin_input (Variable, optional): Defaults to None. Shape(B, T_lin, C_lin)
lin_output (Variable, optional): Defaults to None. Shape(B, T_lin, C_lin)
alignments (Variable, optional): Defaults to None. Shape(N, B, T_dec, C_enc)
wav ([type], optional): Defaults to None. [description]
"""
if mel_input is not None and mel_output is not None:
def make_state_saver(config, output_dir, writer=None):
c = config["transform"]
p_replace = c["replace_pronunciation_prob"]
sample_rate = c["sample_rate"]
preemphasis = c["preemphasis"]
win_length = c["win_length"]
hop_length = c["hop_length"]
min_level_db = c["min_level_db"]
ref_level_db = c["ref_level_db"]
synthesis_config = config["synthesis"]
power = synthesis_config["power"]
n_iter = synthesis_config["n_iter"]
return StateSaver(p_replace, sample_rate, preemphasis, win_length,
hop_length, min_level_db, ref_level_db, power, n_iter,
output_dir, writer)
class StateSaver(object):
def __init__(self,
p_replace,
sample_rate,
preemphasis,
win_length,
hop_length,
min_level_db,
ref_level_db,
power,
n_iter,
output_dir,
writer=None):
self.output_dir = output_dir
self.writer = writer
self.p_replace = p_replace
self.sample_rate = sample_rate
self.preemphasis = preemphasis
self.win_length = win_length
self.hop_length = hop_length
self.min_level_db = min_level_db
self.ref_level_db = ref_level_db
self.power = power
self.n_iter = n_iter
def __call__(self, outputs, inputs, iteration):
mel_output, lin_output, alignments, done_output = outputs
mel_input, lin_input = inputs
writer = self.writer
# mel spectrogram
mel_input = mel_input[0].numpy().T
mel_output = mel_output[0].numpy().T
path = os.path.join(save_dir, "mel_spec")
path = os.path.join(self.output_dir, "mel_spec")
plt.figure(figsize=(10, 3))
display.specshow(mel_input)
plt.colorbar()
plt.title("mel_input")
plt.savefig(
os.path.join(path, "target_mel_spec_step{:09d}.png".format(
global_step)))
os.path.join(path, "target_mel_spec_step_{:09d}.png".format(
iteration)))
plt.close()
if writer is not None:
writer.add_image(
"target/mel_spec",
cm.viridis(mel_input),
global_step,
iteration,
dataformats="HWC")
plt.figure(figsize=(10, 3))
......@@ -257,34 +247,36 @@ def save_state(save_dir,
plt.colorbar()
plt.title("mel_output")
plt.savefig(
os.path.join(path, "predicted_mel_spec_step{:09d}.png".format(
global_step)))
os.path.join(path, "predicted_mel_spec_step_{:09d}.png".format(
iteration)))
plt.close()
if writer is not None:
writer.add_image(
"predicted/mel_spec",
cm.viridis(mel_output),
global_step,
iteration,
dataformats="HWC")
if lin_input is not None and lin_output is not None:
# linear spectrogram
lin_input = lin_input[0].numpy().T
lin_output = lin_output[0].numpy().T
path = os.path.join(save_dir, "lin_spec")
path = os.path.join(self.output_dir, "lin_spec")
plt.figure(figsize=(10, 3))
display.specshow(lin_input)
plt.colorbar()
plt.title("mel_input")
plt.savefig(
os.path.join(path, "target_lin_spec_step{:09d}.png".format(
global_step)))
os.path.join(path, "target_lin_spec_step_{:09d}.png".format(
iteration)))
plt.close()
if writer is not None:
writer.add_image(
"target/lin_spec",
cm.viridis(lin_input),
global_step,
iteration,
dataformats="HWC")
plt.figure(figsize=(10, 3))
......@@ -292,37 +284,91 @@ def save_state(save_dir,
plt.colorbar()
plt.title("mel_input")
plt.savefig(
os.path.join(path, "predicted_lin_spec_step{:09d}.png".format(
global_step)))
os.path.join(path, "predicted_lin_spec_step_{:09d}.png".format(
iteration)))
plt.close()
if writer is not None:
writer.add_image(
"predicted/lin_spec",
cm.viridis(lin_output),
global_step,
iteration,
dataformats="HWC")
if alignments is not None and len(alignments.shape) == 4:
path = os.path.join(save_dir, "alignments")
# alignment
path = os.path.join(self.output_dir, "alignments")
alignments = alignments[:, 0, :, :].numpy()
for idx, attn_layer in enumerate(alignments):
save_path = os.path.join(
path,
"train_attn_layer_{}_step_{}.png".format(idx, global_step))
path, "train_attn_layer_{}_step_{}.png".format(idx, iteration))
plot_alignment(attn_layer, save_path)
if writer is not None:
writer.add_image(
"train_attn/layer_{}".format(idx),
cm.viridis(attn_layer),
global_step,
iteration,
dataformats="HWC")
if lin_output is not None:
wav = spec_to_waveform(lin_output, min_level_db, ref_level_db, power,
n_iter, win_length, hop_length, preemphasis)
path = os.path.join(save_dir, "waveform")
# synthesize waveform
wav = spec_to_waveform(
lin_output, self.min_level_db, self.ref_level_db, self.power,
self.n_iter, self.win_length, self.hop_length, self.preemphasis)
path = os.path.join(self.output_dir, "waveform")
save_path = os.path.join(
path, "train_sample_step_{:09d}.wav".format(global_step))
sf.write(save_path, wav, sample_rate)
path, "train_sample_step_{:09d}.wav".format(iteration))
sf.write(save_path, wav, self.sample_rate)
if writer is not None:
writer.add_audio(
"train_sample", wav, global_step, sample_rate=sample_rate)
"train_sample", wav, iteration, sample_rate=self.sample_rate)
def spec_to_waveform(spec, min_level_db, ref_level_db, power, n_iter,
win_length, hop_length, preemphasis):
"""Convert output linear spec to waveform using griffin-lim vocoder.
Args:
spec (ndarray): the output linear spectrogram, shape(C, T), where C means n_fft, T means frames.
"""
denormalized = np.clip(spec, 0, 1) * (-min_level_db) + min_level_db
lin_scaled = np.exp((denormalized + ref_level_db) / 20 * np.log(10))
wav = librosa.griffinlim(
lin_scaled**power,
n_iter=n_iter,
hop_length=hop_length,
win_length=win_length)
if preemphasis > 0:
wav = signal.lfilter([1.], [1., -preemphasis], wav)
wav = np.clip(wav, -1.0, 1.0)
return wav
def make_output_tree(output_dir):
print("creating output tree: {}".format(output_dir))
ckpt_dir = os.path.join(output_dir, "checkpoints")
state_dir = os.path.join(output_dir, "states")
eval_dir = os.path.join(output_dir, "eval")
for x in [ckpt_dir, state_dir, eval_dir]:
if not os.path.exists(x):
os.makedirs(x)
for x in ["alignments", "waveform", "lin_spec", "mel_spec"]:
p = os.path.join(state_dir, x)
if not os.path.exists(p):
os.makedirs(p)
def plot_alignment(alignment, path):
"""
Plot an attention layer's alignment for a sentence.
alignment: shape(T_dec, T_enc).
"""
plt.figure()
plt.imshow(alignment)
plt.colorbar()
plt.xlabel('Encoder timestep')
plt.ylabel('Decoder timestep')
plt.savefig(path)
plt.close()
......@@ -19,6 +19,7 @@ import argparse
from tqdm import tqdm
from tensorboardX import SummaryWriter
from paddle import fluid
fluid.require_version('1.8.0')
import paddle.fluid.dygraph as dg
from parakeet.modules.weight_norm import WeightNormWrapper
......@@ -55,6 +56,13 @@ if __name__ == "__main__":
with open(args.config, 'rt') as f:
config = ruamel.yaml.safe_load(f)
if args.device == -1:
place = fluid.CPUPlace()
else:
place = fluid.CUDAPlace(args.device)
dg.enable_dygraph(place)
ljspeech_meta = LJSpeechMetaData(args.data)
data_config = config["data"]
......@@ -99,12 +107,6 @@ if __name__ == "__main__":
if not os.path.exists(args.output):
os.makedirs(args.output)
if args.device == -1:
place = fluid.CPUPlace()
else:
place = fluid.CUDAPlace(args.device)
with dg.guard(place):
model_config = config["model"]
upsampling_factors = model_config["upsampling_factors"]
encoder = UpsampleNet(upsampling_factors)
......@@ -115,8 +117,8 @@ if __name__ == "__main__":
output_dim = model_config["output_dim"]
loss_type = model_config["loss_type"]
log_scale_min = model_config["log_scale_min"]
decoder = WaveNet(n_loop, n_layer, residual_channels, output_dim,
n_mels, filter_size, loss_type, log_scale_min)
decoder = WaveNet(n_loop, n_layer, residual_channels, output_dim, n_mels,
filter_size, loss_type, log_scale_min)
model = ConditionalWavenet(encoder, decoder)
summary(model)
......@@ -124,8 +126,7 @@ if __name__ == "__main__":
# load model parameters
checkpoint_dir = os.path.join(args.output, "checkpoints")
if args.checkpoint:
iteration = io.load_parameters(
model, checkpoint_path=args.checkpoint)
iteration = io.load_parameters(model, checkpoint_path=args.checkpoint)
else:
iteration = io.load_parameters(
model, checkpoint_dir=checkpoint_dir, iteration=args.iteration)
......
......@@ -19,9 +19,10 @@ import argparse
import tqdm
from tensorboardX import SummaryWriter
from paddle import fluid
fluid.require_version('1.8.0')
import paddle.fluid.dygraph as dg
from parakeet.data import SliceDataset, TransformDataset, DataCargo, SequentialSampler, RandomSampler
from parakeet.data import SliceDataset, TransformDataset, CacheDataset, DataCargo, SequentialSampler, RandomSampler
from parakeet.models.wavenet import UpsampleNet, WaveNet, ConditionalWavenet
from parakeet.utils.layer_tools import summary
from parakeet.utils import io
......@@ -51,6 +52,13 @@ if __name__ == "__main__":
with open(args.config, 'rt') as f:
config = ruamel.yaml.safe_load(f)
if args.device == -1:
place = fluid.CPUPlace()
else:
place = fluid.CUDAPlace(args.device)
dg.enable_dygraph(place)
print("Command Line Args: ")
for k, v in vars(args).items():
print("{}: {}".format(k, v))
......@@ -68,8 +76,9 @@ if __name__ == "__main__":
ljspeech = TransformDataset(ljspeech_meta, transform)
valid_size = data_config["valid_size"]
ljspeech_valid = SliceDataset(ljspeech, 0, valid_size)
ljspeech_train = SliceDataset(ljspeech, valid_size, len(ljspeech))
ljspeech_valid = CacheDataset(SliceDataset(ljspeech, 0, valid_size))
ljspeech_train = CacheDataset(
SliceDataset(ljspeech, valid_size, len(ljspeech)))
model_config = config["model"]
n_loop = model_config["n_loop"]
......@@ -103,7 +112,6 @@ if __name__ == "__main__":
else:
place = fluid.CUDAPlace(args.device)
with dg.guard(place):
model_config = config["model"]
upsampling_factors = model_config["upsampling_factors"]
encoder = UpsampleNet(upsampling_factors)
......@@ -114,8 +122,8 @@ if __name__ == "__main__":
output_dim = model_config["output_dim"]
loss_type = model_config["loss_type"]
log_scale_min = model_config["log_scale_min"]
decoder = WaveNet(n_loop, n_layer, residual_channels, output_dim,
n_mels, filter_size, loss_type, log_scale_min)
decoder = WaveNet(n_loop, n_layer, residual_channels, output_dim, n_mels,
filter_size, loss_type, log_scale_min)
model = ConditionalWavenet(encoder, decoder)
summary(model)
......@@ -178,16 +186,14 @@ if __name__ == "__main__":
writer.add_scalar("loss", loss_np[0], global_step)
writer.add_scalar("learning_rate",
optim._learning_rate.step().numpy()[0],
global_step)
optim._learning_rate.step().numpy()[0], global_step)
optim.minimize(loss_var)
optim.clear_gradients()
print("global_step: {}\tloss: {:<8.6f}".format(global_step,
loss_np[0]))
print("global_step: {}\tloss: {:<8.6f}".format(global_step, loss_np[
0]))
if global_step % snap_interval == 0:
valid_model(model, valid_loader, writer, global_step,
sample_rate)
valid_model(model, valid_loader, writer, global_step, sample_rate)
if global_step % checkpoint_interval == 0:
io.save_parameters(checkpoint_dir, global_step, model, optim)
......
......@@ -176,6 +176,79 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
return len(self.sorted_indices)
class BucketSampler(Sampler):
def __init__(self,
lengths,
batch_size=4,
batch_group_size=None,
permutate=True,
num_trainers=1,
rank=0):
# maybe better to implement length as a sort key
_lengths = np.array(lengths, dtype=np.int64)
self.lengths = np.sort(_lengths)
self.sorted_indices = np.argsort(_lengths)
self.num_trainers = num_trainers
self.rank = rank
self.dataset_size = len(_lengths)
self.num_samples = int(np.ceil(self.dataset_size / num_trainers))
self.total_size = self.num_samples * num_trainers
assert self.total_size >= self.dataset_size
self.batch_size = batch_size
total_batch_size = num_trainers * batch_size
self.total_batch_size = total_batch_size
if batch_group_size is None:
batch_group_size = min(total_batch_size * 32, len(self.lengths))
if batch_group_size % total_batch_size != 0:
batch_group_size -= batch_group_size % total_batch_size
self.batch_group_size = batch_group_size
assert batch_group_size % total_batch_size == 0
self.permutate = permutate
def __iter__(self):
indices = self.sorted_indices
# Append extra samples to make it evenly distributed on all trainers.
num_extras = self.total_size - self.dataset_size
extra_indices = np.random.choice(
indices, size=(num_extras, ), replace=False)
indices = np.concatenate((indices, extra_indices))
assert len(indices) == self.total_size
batch_group_size = self.batch_group_size
s, e = 0, 0
for i in range(len(indices) // batch_group_size):
s = i * batch_group_size
e = s + batch_group_size
random.shuffle(indices[s:e]) # inplace
# Permutate batches
total_batch_size = self.total_batch_size
if self.permutate:
perm = np.arange(len(indices[:e]) // total_batch_size)
random.shuffle(perm)
indices[:e] = indices[:e].reshape(
-1, total_batch_size)[perm, :].reshape(-1)
# Handle last elements
s += batch_group_size
#print(indices)
if s < len(indices):
random.shuffle(indices[s:])
# Subset samples for each trainer.
indices = indices[self.rank:self.total_size:self.num_trainers]
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return len(self.sorted_indices)
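A hypothetical usage sketch of the `BucketSampler` defined above, assuming it is importable from this sampler module: indices are sorted by length, shuffled within groups of `batch_group_size`, and each trainer takes an interleaved slice of the padded index list. The lengths and sampler arguments below are made up.

```python
# e.g. per-utterance frame counts; values are illustrative
lengths = [120, 80, 95, 200, 64, 150, 77, 180]

sampler = BucketSampler(lengths, batch_size=2, num_trainers=2, rank=0)
for idx in sampler:
    # dataset indices seen by trainer 0, roughly grouped by similar length
    print(idx)
```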
class WeightedRandomSampler(Sampler):
"""Samples elements from ``[0,..,len(weights)-1]`` with given probabilities (weights).
Args:
......
......@@ -15,4 +15,5 @@
from parakeet.models.deepvoice3.encoder import Encoder, ConvSpec
from parakeet.models.deepvoice3.decoder import Decoder, WindowRange
from parakeet.models.deepvoice3.converter import Converter
from parakeet.models.deepvoice3.loss import TTSLoss
from parakeet.models.deepvoice3.model import DeepVoice3
......@@ -210,56 +210,43 @@ class TTSLoss(object):
loss = fluid.layers.reduce_mean(predicted_attention * soft_mask_)
return loss
def __call__(self,
mel_hyp,
lin_hyp,
done_hyp,
attn_hyp,
mel_ref,
lin_ref,
done_ref,
input_lengths,
n_frames,
compute_lin_loss=True,
compute_mel_loss=True,
compute_done_loss=True,
compute_attn_loss=True):
def __call__(self, outputs, inputs):
"""Total loss
Args:
outputs is a tuple of (mel_hyp, lin_hyp, attn_hyp, done_hyp).
mel_hyp (Variable): shape(B, T, C_mel), dtype float32, predicted mel spectrogram.
lin_hyp (Variable): shape(B, T, C_lin), dtype float32, predicted linear spectrogram.
done_hyp (Variable): shape(B, T), dtype float32, predicted done probability.
attn_hyp (Variable): shape(N, B, T_dec, T_enc), dtype float32, predicted attention.
inputs is a tuple of (mel_ref, lin_ref, done_ref, input_lengths, n_frames)
mel_ref (Variable): shape(B, T, C_mel), dtype float32, ground truth mel spectrogram.
lin_ref (Variable): shape(B, T, C_lin), dtype float32, ground truth linear spectrogram.
done_ref (Variable): shape(B, T), dtype float32, ground truth done flag.
input_lengths (Variable): shape(B, ), dtype: int, encoder valid lengths.
n_frames (Variable): shape(B, ), dtype: int, decoder valid lengths.
compute_lin_loss (bool, optional): whether to compute linear loss. Defaults to True.
compute_mel_loss (bool, optional): whether to compute mel loss. Defaults to True.
compute_done_loss (bool, optional): whether to compute done loss. Defaults to True.
compute_attn_loss (bool, optional): whether to compute attention loss. Defaults to True.
Returns:
Dict(str, Variable): details of loss.
"""
total_loss = 0.
mel_hyp, lin_hyp, attn_hyp, done_hyp = outputs
mel_ref, lin_ref, done_ref, input_lengths, n_frames = inputs
# n_frames: valid linear-spectrogram frames; mel and decoder lengths are derived from it below
max_frames = lin_hyp.shape[1]
max_mel_steps = max_frames // self.downsample_factor
max_decoder_steps = max_mel_steps // self.r
decoder_mask = F.sequence_mask(
n_frames // self.downsample_factor // self.r,
max_decoder_steps,
dtype="float32")
# max_decoder_steps = max_mel_steps // self.r
# decoder_mask = F.sequence_mask(n_frames // self.downsample_factor //
# self.r,
# max_decoder_steps,
# dtype="float32")
mel_mask = F.sequence_mask(
n_frames // self.downsample_factor, max_mel_steps, dtype="float32")
lin_mask = F.sequence_mask(n_frames, max_frames, dtype="float32")
if compute_lin_loss:
lin_hyp = lin_hyp[:, :-self.time_shift, :]
lin_ref = lin_ref[:, self.time_shift:, :]
lin_mask = lin_mask[:, self.time_shift:]
......@@ -270,7 +257,6 @@ class TTSLoss(object):
+ (1 - self.binary_divergence_weight) * lin_l1_loss
total_loss += lin_loss
if compute_mel_loss:
mel_hyp = mel_hyp[:, :-self.time_shift, :]
mel_ref = mel_ref[:, self.time_shift:, :]
mel_mask = mel_mask[:, self.time_shift:]
......@@ -281,27 +267,25 @@ class TTSLoss(object):
+ (1 - self.binary_divergence_weight) * mel_l1_loss
total_loss += mel_loss
if compute_attn_loss:
attn_loss = self.attention_loss(attn_hyp,
input_lengths.numpy(),
n_frames.numpy() //
(self.downsample_factor * self.r))
total_loss += attn_loss
if compute_done_loss:
done_loss = self.done_loss(done_hyp, done_ref)
total_loss += done_loss
result = {
losses = {
"loss": total_loss,
"mel/mel_loss": mel_loss if compute_mel_loss else None,
"mel/l1_loss": mel_l1_loss if compute_mel_loss else None,
"mel/bce_loss": mel_bce_loss if compute_mel_loss else None,
"lin/lin_loss": lin_loss if compute_lin_loss else None,
"lin/l1_loss": lin_l1_loss if compute_lin_loss else None,
"lin/bce_loss": lin_bce_loss if compute_lin_loss else None,
"done": done_loss if compute_done_loss else None,
"attn": attn_loss if compute_attn_loss else None,
"mel/mel_loss": mel_loss,
"mel/l1_loss": mel_l1_loss,
"mel/bce_loss": mel_bce_loss,
"lin/lin_loss": lin_loss,
"lin/l1_loss": lin_l1_loss,
"lin/bce_loss": lin_bce_loss,
"done": done_loss,
"attn": attn_loss,
}
return result
return losses
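The refactored `__call__` packs model outputs as `(mel_hyp, lin_hyp, attn_hyp, done_hyp)` and references as `(mel_ref, lin_ref, done_ref, input_lengths, n_frames)`, and always reports every term in the returned dict. The masks above are built from `n_frames` by integer division; a small NumPy check of that bookkeeping, with assumed values for `downsample_factor` and `r`:

```python
import numpy as np

downsample_factor = 4      # assumed; set by the real config
r = 4                      # assumed; mel frames emitted per decoder step

n_frames = np.array([512, 384, 256])             # valid linear frames per utterance
mel_steps = n_frames // downsample_factor        # [128, 96, 64] valid mel frames
decoder_steps = mel_steps // r                   # [ 32, 24, 16] valid decoder steps

max_frames = 512                                 # lin_hyp.shape[1] in the loss
max_mel_steps = max_frames // downsample_factor  # 128, mask width for the mel loss
```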
......@@ -19,6 +19,34 @@ import paddle.fluid.layers as F
import paddle.fluid.dygraph as dg
def lookup(weight, indices, padding_idx):
out = fluid.core.ops.lookup_table_v2(
weight, indices, 'is_sparse', False, 'is_distributed', False,
'remote_prefetch', False, 'padding_idx', padding_idx)
return out
def compute_position_embedding_single_speaker(radians, speaker_position_rate):
"""Compute sin/cos interleaved matrix from the radians.
Args:
radians (Variable): shape(n_vocab, embed_dim), dtype float32, the radians matrix.
speaker_position_rate (float or Variable): float or Variable of shape(1, ), speaker positioning rate.
Returns:
Variable: shape(n_vocab, embed_dim), the sin, cos interleaved matrix.
"""
_, embed_dim = radians.shape
scaled_radians = radians * speaker_position_rate
odd_mask = (np.arange(embed_dim) % 2).astype(np.float32)
odd_mask = dg.to_variable(odd_mask)
out = odd_mask * F.cos(scaled_radians) \
+ (1 - odd_mask) * F.sin(scaled_radians)
return out
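A NumPy-only sketch of the interleaving computed above: even embedding columns carry sin of the scaled radians and odd columns carry cos. Shapes and the rate value are made up for illustration.

```python
import numpy as np

n_vocab, embed_dim = 6, 4
radians = np.random.rand(n_vocab, embed_dim).astype(np.float32)
speaker_position_rate = 1.0          # illustrative value

scaled = radians * speaker_position_rate
odd_mask = (np.arange(embed_dim) % 2).astype(np.float32)    # [0., 1., 0., 1.]
table = odd_mask * np.cos(scaled) + (1 - odd_mask) * np.sin(scaled)

assert np.allclose(table[:, 0::2], np.sin(scaled[:, 0::2]))  # even columns: sin
assert np.allclose(table[:, 1::2], np.cos(scaled[:, 1::2]))  # odd columns: cos
```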
def compute_position_embedding(radians, speaker_position_rate):
"""Compute sin/cos interleaved matrix from the radians.
......@@ -106,16 +134,14 @@ class PositionEmbedding(dg.Layer):
"""
batch_size, time_steps = indices.shape
# convert speaker_position_rate to a Variable with shape(B, )
if isinstance(speaker_position_rate, float):
speaker_position_rate = dg.to_variable(
np.array([speaker_position_rate]).astype("float32"))
speaker_position_rate = F.expand(speaker_position_rate,
[batch_size])
elif isinstance(speaker_position_rate, fluid.framework.Variable) \
and list(speaker_position_rate.shape) == [1]:
speaker_position_rate = F.expand(speaker_position_rate,
[batch_size])
if isinstance(speaker_position_rate, float) or \
(isinstance(speaker_position_rate, fluid.framework.Variable)
and list(speaker_position_rate.shape) == [1]):
temp_weight = compute_position_embedding_single_speaker(
self.weight, speaker_position_rate)
out = lookup(temp_weight, indices, 0)
return out
assert len(speaker_position_rate.shape) == 1 and \
list(speaker_position_rate.shape) == [batch_size]
......@@ -128,6 +154,5 @@ class PositionEmbedding(dg.Layer):
0, batch_size, 1, dtype="int64"), [1]), [1, time_steps])
# (B, T, 2)
gather_nd_id = F.stack([batch_id, indices], -1)
out = F.gather_nd(weight, gather_nd_id)
return out
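The rewritten forward now has two lookup paths: a float or shape-(1,) rate yields a single shared position table that is indexed directly by `lookup`, while a per-example rate of shape (B,) keeps the `gather_nd` path that pairs batch ids with positions. A NumPy sketch of the two gathers, with made-up shapes:

```python
import numpy as np

n_vocab, embed_dim = 8, 4
batch_size, time_steps = 2, 5
indices = np.random.randint(0, n_vocab, size=(batch_size, time_steps))

# shared rate: one (n_vocab, embed_dim) table, plain row lookup
shared_table = np.random.rand(n_vocab, embed_dim).astype(np.float32)
out_shared = shared_table[indices]                        # (B, T, embed_dim)

# per-example rates: a (B, n_vocab, embed_dim) table, gathered by (batch, position)
per_example_tables = np.random.rand(batch_size, n_vocab, embed_dim).astype(np.float32)
batch_id = np.repeat(np.arange(batch_size)[:, None], time_steps, axis=1)
out_per_example = per_example_tables[batch_id, indices]   # (B, T, embed_dim)
```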
......@@ -57,8 +57,38 @@ def norm_except(param, dim, power):
return norm_except(transposed_param, dim=0, power=power)
def compute_l2_normalized_weight(v, g, dim):
shape = v.shape
ndim = len(shape)
if dim is None:
v_normalized = v / (F.reduce_sum(F.square(v)) + 1e-12)
elif dim == 0:
param_matrix = F.reshape(v, (shape[0], np.prod(shape[1:])))
v_normalized = F.l2_normalize(param_matrix, axis=1)
elif dim == -1 or dim == ndim - 1:
param_matrix = F.reshape(v, (np.prod(shape[:-1]), shape[-1]))
v_normalized = F.l2_normalize(param_matrix, axis=0)
else:
perm = list(range(ndim))
perm[0] = dim
perm[dim] = 0
transposed_param = F.transpose(v, perm)
param_matrix = F.reshape(
transposed_param,
(transposed_param.shape[0], np.prod(transposed_param.shape[1:])))
v_normalized = F.l2_normalize(param_matrix, axis=1)
v_normalized = F.transpose(v_normalized, perm)
v_normalized = F.reshape(v_normalized, shape)
weight = F.elementwise_mul(v_normalized, g, axis=dim)
return weight
def compute_weight(v, g, dim, power):
assert len(g.shape) == 1, "magnitude should be a vector"
if power == 2:
return compute_l2_normalized_weight(v, g, dim)
else:
v_normalized = F.elementwise_div(
v, (norm_except(v, dim, power) + 1e-12), axis=dim)
weight = F.elementwise_mul(v_normalized, g, axis=dim)
......
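For the common L2 case with `dim=0`, the decomposition above is `weight[i] = g[i] * v[i] / ||v[i]||`, with the norm taken over all remaining axes of each output slice. A NumPy sketch of that case, with made-up shapes:

```python
import numpy as np

out_channels, in_channels, kernel_size = 4, 3, 5
v = np.random.randn(out_channels, in_channels, kernel_size).astype(np.float32)
g = np.random.rand(out_channels).astype(np.float32)        # magnitude per output slice

v_mat = v.reshape(out_channels, -1)                         # flatten all axes but dim 0
v_normalized = v_mat / (np.linalg.norm(v_mat, axis=1, keepdims=True) + 1e-12)
weight = (g[:, None] * v_normalized).reshape(v.shape)

# each output slice of `weight` now has L2 norm g[i] (up to the epsilon)
assert np.allclose(np.linalg.norm(weight.reshape(out_channels, -1), axis=1), g, atol=1e-4)
```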
......@@ -15,6 +15,8 @@
import os
import io
import re
import six
import sys
from setuptools import setup, find_packages
......@@ -63,6 +65,7 @@ setup_info = dict(
'pandas',
'sox',
'soundfile',
'llvmlite==0.31.0' if sys.version_info < (3, 6) else "llvmlite",
],
# Package info
......