Commit 9d796994 authored by lifuchen

add license

Parent f84d6bec
......@@ -25,3 +25,11 @@
files: \.md$
- id: remove-tabs
files: \.md$
- repo: local
hooks:
- id: copyright_checker
name: copyright_checker
entry: python ./tools/copyright.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
# Deepvoice 3
A Paddle implementation of Deep Voice 3 in dynamic graph mode, a convolutional-network-based text-to-speech synthesis model. The implementation is based on [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654).
......@@ -22,7 +22,7 @@ The model consists of an encoder, a decoder and a converter (and a speaker embed
## Project Structure
```text
├── data.py data_processing
├── ljspeech.yaml (example) configuration file
├── sentences.txt sample sentences
├── synthesis.py script to synthesize waveform from text
......@@ -50,7 +50,7 @@ optional arguments:
The directory to save result.
-g DEVICE, --device DEVICE
device to use
```
1. `--config` is the configuration file to use. The provided `ljspeech.yaml` can be used directly, or you can change some of its values to train the model with a different configuration (see the sketch below for how the file is consumed).
2. `--data` is the path of the LJSpeech dataset, i.e. the folder extracted from the downloaded archive (the folder that contains metadata.txt).
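For reference, below is a minimal sketch (not part of this repository) of how a YAML config such as `ljspeech.yaml` is typically loaded before training or synthesis; the file name and the printed value are illustrative assumptions.

```python
import ruamel.yaml

# Hypothetical illustration: read the experiment configuration the same way
# train.py / synthesis.py consume the file passed via --config.
with open("ljspeech.yaml", "rt") as f:
    config = ruamel.yaml.safe_load(f)

# Inspect (or override) a value before building the model.
print(config)
```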
......@@ -61,7 +61,7 @@ optional arguments:
├── checkpoints # checkpoint
├── log # tensorboard log
└── states # train and evaluation results
├── alignments # attention
├── lin_spec # linear spectrogram
├── mel_spec # mel spectrogram
└── waveform # waveform (.wav files)
......@@ -112,4 +112,3 @@ example script:
```bash
python synthesis.py --config=./ljspeech.yaml --device=0 experiment/checkpoints/model_step_005000000 sentences.txt generated
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import csv
from pathlib import Path
......@@ -79,10 +93,11 @@ class Transform(object):
y = signal.lfilter([1., -self.preemphasis], [1.], wav)
# STFT
D = librosa.stft(y=y,
n_fft=self.n_fft,
win_length=self.win_length,
hop_length=self.hop_length)
D = librosa.stft(
y=y,
n_fft=self.n_fft,
win_length=self.win_length,
hop_length=self.hop_length)
S = np.abs(D)
# to db and normalize to 0-1
......@@ -96,11 +111,8 @@ class Transform(object):
# mel scale and to db and normalize to 0-1,
# CAUTION: pass linear scale S, not dbscaled S
S_mel = librosa.feature.melspectrogram(S=S,
n_mels=self.n_mels,
fmin=self.fmin,
fmax=self.fmax,
power=1.)
S_mel = librosa.feature.melspectrogram(
S=S, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, power=1.)
S_mel = 20 * np.log10(np.maximum(amplitude_min,
S_mel)) - self.ref_level_db
S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db)
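# Hedged aside (not part of this file): the two steps above map the mel
# magnitudes to decibels and squash them into [0, 1]. The mapping can be
# undone with
#     S_db  = S_mel_norm * (-min_level_db) + min_level_db
#     S_mel = 10.0 ** ((S_db + ref_level_db) / 20.0)
# which mirrors what spec_to_waveform() in utils.py does for the linear
# spectrogram before running Griffin-Lim.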
......@@ -148,20 +160,18 @@ class DataCollector(object):
(mix_grapheme_phonemes, text_length, speaker_id, S_norm,
S_mel_norm, num_frames) = example
text_sequences.append(
np.pad(mix_grapheme_phonemes,
(0, max_text_length - text_length)))
np.pad(mix_grapheme_phonemes, (0, max_text_length - text_length
)))
lin_specs.append(
np.pad(S_norm,
((0, 0), (self._pad_begin,
max_frames - self._pad_begin - num_frames))))
np.pad(S_norm, ((0, 0), (self._pad_begin, max_frames -
self._pad_begin - num_frames))))
mel_specs.append(
np.pad(S_mel_norm,
((0, 0), (self._pad_begin,
max_frames - self._pad_begin - num_frames))))
np.pad(S_mel_norm, ((0, 0), (self._pad_begin, max_frames -
self._pad_begin - num_frames))))
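# The done flags built below are 0 for real decoder frames and padded with 1
# past each utterance's end (presumably the supervision signal for the
# model's done/stop predictor).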
done_flags.append(
np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )),
(0, max_decoder_length -
int(np.ceil(num_frames // self._factor))),
(0, max_decoder_length - int(
np.ceil(num_frames // self._factor))),
constant_values=1))
text_sequences = np.array(text_sequences).astype(np.int64)
lin_specs = np.transpose(np.array(lin_specs),
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import ruamel.yaml
......@@ -22,11 +36,8 @@ if __name__ == "__main__":
parser.add_argument("checkpoint", type=str, help="checkpoint to load.")
parser.add_argument("text", type=str, help="text file to synthesize")
parser.add_argument("output_path", type=str, help="path to save results")
parser.add_argument("-g",
"--device",
type=int,
default=-1,
help="device to use")
parser.add_argument(
"-g", "--device", type=int, default=-1, help="device to use")
args = parser.parse_args()
with open(args.config, 'rt') as f:
......@@ -76,15 +87,14 @@ if __name__ == "__main__":
window_ahead = model_config["window_ahead"]
key_projection = model_config["key_projection"]
value_projection = model_config["value_projection"]
dv3 = make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
padding_idx, embedding_std, max_positions, n_vocab,
freeze_embedding, filter_size, encoder_channels,
n_mels, decoder_channels, r,
trainable_positional_encodings, use_memory_mask,
query_position_rate, key_position_rate,
window_backward, window_ahead, key_projection,
value_projection, downsample_factor, linear_dim,
use_decoder_states, converter_channels, dropout)
dv3 = make_model(
n_speakers, speaker_dim, speaker_embed_std, embed_dim, padding_idx,
embedding_std, max_positions, n_vocab, freeze_embedding,
filter_size, encoder_channels, n_mels, decoder_channels, r,
trainable_positional_encodings, use_memory_mask,
query_position_rate, key_position_rate, window_backward,
window_ahead, key_projection, value_projection, downsample_factor,
linear_dim, use_decoder_states, converter_channels, dropout)
summary(dv3)
state, _ = dg.load_dygraph(args.checkpoint)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import ruamel.yaml
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
from matplotlib import cm
......@@ -28,8 +42,9 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
converter_channels, dropout):
"""just a simple function to create a deepvoice 3 model"""
if n_speakers > 1:
spe = dg.Embedding((n_speakers, speaker_dim),
param_attr=I.Normal(scale=speaker_embed_std))
spe = dg.Embedding(
(n_speakers, speaker_dim),
param_attr=I.Normal(scale=speaker_embed_std))
else:
spe = None
......@@ -45,17 +60,17 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
ConvSpec(h, k, 9),
ConvSpec(h, k, 27),
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
)
enc = Encoder(n_vocab,
embed_dim,
n_speakers,
speaker_dim,
padding_idx=None,
embedding_weight_std=embedding_std,
convolutions=encoder_convolutions,
max_positions=max_positions,
dropout=dropout)
ConvSpec(h, k, 3), )
enc = Encoder(
n_vocab,
embed_dim,
n_speakers,
speaker_dim,
padding_idx=None,
embedding_weight_std=embedding_std,
convolutions=encoder_convolutions,
max_positions=max_positions,
dropout=dropout)
if freeze_embedding:
freeze(enc.embed)
......@@ -66,28 +81,28 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
ConvSpec(h, k, 3),
ConvSpec(h, k, 9),
ConvSpec(h, k, 27),
ConvSpec(h, k, 1),
)
ConvSpec(h, k, 1), )
attention = [True, False, False, False, True]
force_monotonic_attention = [True, False, False, False, True]
dec = Decoder(n_speakers,
speaker_dim,
embed_dim,
mel_dim,
r=r,
max_positions=max_positions,
padding_idx=padding_idx,
preattention=prenet_convolutions,
convolutions=attentive_convolutions,
attention=attention,
dropout=dropout,
use_memory_mask=use_memory_mask,
force_monotonic_attention=force_monotonic_attention,
query_position_rate=query_position_rate,
key_position_rate=key_position_rate,
window_range=WindowRange(window_behind, window_ahead),
key_projection=key_projection,
value_projection=value_projection)
dec = Decoder(
n_speakers,
speaker_dim,
embed_dim,
mel_dim,
r=r,
max_positions=max_positions,
padding_idx=padding_idx,
preattention=prenet_convolutions,
convolutions=attentive_convolutions,
attention=attention,
dropout=dropout,
use_memory_mask=use_memory_mask,
force_monotonic_attention=force_monotonic_attention,
query_position_rate=query_position_rate,
key_position_rate=key_position_rate,
window_range=WindowRange(window_behind, window_ahead),
key_projection=key_projection,
value_projection=value_projection)
if not trainable_positional_encodings:
freeze(dec.embed_keys_positions)
freeze(dec.embed_query_positions)
......@@ -97,15 +112,15 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
ConvSpec(h, k, 1),
ConvSpec(h, k, 3),
ConvSpec(2 * h, k, 1),
ConvSpec(2 * h, k, 3),
)
cvt = Converter(n_speakers,
speaker_dim,
dec.state_dim if use_decoder_states else mel_dim,
linear_dim,
time_upsampling=downsample_factor,
convolutions=postnet_convolutions,
dropout=dropout)
ConvSpec(2 * h, k, 3), )
cvt = Converter(
n_speakers,
speaker_dim,
dec.state_dim if use_decoder_states else mel_dim,
linear_dim,
time_upsampling=downsample_factor,
convolutions=postnet_convolutions,
dropout=dropout)
dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states)
return dv3
......@@ -115,8 +130,10 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db,
ref_level_db, power, n_iter, win_length, hop_length,
preemphasis):
"""generate waveform from text using a deepvoice 3 model"""
text = np.array(en.text_to_sequence(text, p=replace_pronounciation_prob),
dtype=np.int64)
text = np.array(
en.text_to_sequence(
text, p=replace_pronounciation_prob),
dtype=np.int64)
length = len(text)
print("text sequence's length: {}".format(length))
text_positions = np.arange(1, 1 + length)
......@@ -145,10 +162,11 @@ def spec_to_waveform(spec, min_level_db, ref_level_db, power, n_iter,
"""
denormalized = np.clip(spec, 0, 1) * (-min_level_db) + min_level_db
lin_scaled = np.exp((denormalized + ref_level_db) / 20 * np.log(10))
wav = librosa.griffinlim(lin_scaled**power,
n_iter=n_iter,
hop_length=hop_length,
win_length=win_length)
wav = librosa.griffinlim(
lin_scaled**power,
n_iter=n_iter,
hop_length=hop_length,
win_length=win_length)
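# De-emphasis: the inverse of the pre-emphasis filter applied in data.py's Transform.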
if preemphasis > 0:
wav = signal.lfilter([1.], [1., -preemphasis], wav)
return wav
......@@ -225,28 +243,30 @@ def save_state(save_dir,
plt.colorbar()
plt.title("mel_input")
plt.savefig(
os.path.join(path,
"target_mel_spec_step{:09d}.png".format(global_step)))
os.path.join(path, "target_mel_spec_step{:09d}.png".format(
global_step)))
plt.close()
writer.add_image("target/mel_spec",
cm.viridis(mel_input),
global_step,
dataformats="HWC")
writer.add_image(
"target/mel_spec",
cm.viridis(mel_input),
global_step,
dataformats="HWC")
plt.figure(figsize=(10, 3))
display.specshow(mel_output)
plt.colorbar()
plt.title("mel_output")
plt.savefig(
os.path.join(
path, "predicted_mel_spec_step{:09d}.png".format(global_step)))
os.path.join(path, "predicted_mel_spec_step{:09d}.png".format(
global_step)))
plt.close()
writer.add_image("predicted/mel_spec",
cm.viridis(mel_output),
global_step,
dataformats="HWC")
writer.add_image(
"predicted/mel_spec",
cm.viridis(mel_output),
global_step,
dataformats="HWC")
if lin_input is not None and lin_output is not None:
lin_input = lin_input[0].numpy().T
......@@ -258,28 +278,30 @@ def save_state(save_dir,
plt.colorbar()
plt.title("mel_input")
plt.savefig(
os.path.join(path,
"target_lin_spec_step{:09d}.png".format(global_step)))
os.path.join(path, "target_lin_spec_step{:09d}.png".format(
global_step)))
plt.close()
writer.add_image("target/lin_spec",
cm.viridis(lin_input),
global_step,
dataformats="HWC")
writer.add_image(
"target/lin_spec",
cm.viridis(lin_input),
global_step,
dataformats="HWC")
plt.figure(figsize=(10, 3))
display.specshow(lin_output)
plt.colorbar()
plt.title("mel_input")
plt.savefig(
os.path.join(
path, "predicted_lin_spec_step{:09d}.png".format(global_step)))
os.path.join(path, "predicted_lin_spec_step{:09d}.png".format(
global_step)))
plt.close()
writer.add_image("predicted/lin_spec",
cm.viridis(lin_output),
global_step,
dataformats="HWC")
writer.add_image(
"predicted/lin_spec",
cm.viridis(lin_output),
global_step,
dataformats="HWC")
if alignments is not None and len(alignments.shape) == 4:
path = os.path.join(save_dir, "alignments")
......@@ -290,10 +312,11 @@ def save_state(save_dir,
"train_attn_layer_{}_step_{}.png".format(idx, global_step))
plot_alignment(attn_layer, save_path)
writer.add_image("train_attn/layer_{}".format(idx),
cm.viridis(attn_layer),
global_step,
dataformats="HWC")
writer.add_image(
"train_attn/layer_{}".format(idx),
cm.viridis(attn_layer),
global_step,
dataformats="HWC")
if lin_output is not None:
wav = spec_to_waveform(lin_output, min_level_db, ref_level_db, power,
......@@ -302,7 +325,5 @@ def save_state(save_dir,
save_path = os.path.join(
path, "train_sample_step_{:09d}.wav".format(global_step))
sf.write(save_path, wav, sample_rate)
writer.add_audio("train_sample",
wav,
global_step,
sample_rate=sample_rate)
writer.add_audio(
"train_sample", wav, global_step, sample_rate=sample_rate)
......@@ -57,7 +57,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--fastspeech_step``.
For more help on arguments:
``python train.py --help``.
## Synthesis
......@@ -75,5 +75,5 @@ or you can run the script file directly.
sh synthesis.sh
```
For more help on arguments:
``python synthesis.py --help``.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
def add_config_options_to_parser(parser):
parser.add_argument('--config_path', type=str, default='config/fastspeech.yaml',
parser.add_argument(
'--config_path',
type=str,
default='config/fastspeech.yaml',
help="the yaml config file path.")
parser.add_argument('--batch_size', type=int, default=32,
help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000,
parser.add_argument(
'--batch_size', type=int, default=32, help="batch size for training.")
parser.add_argument(
'--epochs',
type=int,
default=10000,
help="the number of epoch for training.")
parser.add_argument('--lr', type=float, default=0.001,
parser.add_argument(
'--lr',
type=float,
default=0.001,
help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500,
parser.add_argument(
'--save_step',
type=int,
default=500,
help="checkpointing interval during training.")
parser.add_argument('--fastspeech_step', type=int, default=70000,
parser.add_argument(
'--fastspeech_step',
type=int,
default=70000,
help="Global step to restore checkpoint of fastspeech.")
parser.add_argument('--use_gpu', type=int, default=1,
parser.add_argument(
'--use_gpu',
type=int,
default=1,
help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=int, default=0,
parser.add_argument(
'--use_data_parallel',
type=int,
default=0,
help="use data parallel or not during training.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
parser.add_argument(
'--data_path',
type=str,
default='./dataset/LJSpeech-1.1',
help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None,
parser.add_argument(
'--checkpoint_path',
type=str,
default=None,
help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint',
parser.add_argument(
'--save_path',
type=str,
default='./checkpoint',
help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log',
parser.add_argument(
'--log_dir',
type=str,
default='./log',
help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./sample',
parser.add_argument(
'--sample_path',
type=str,
default='./sample',
help="the directory to save audio sample in synthesis.")
parser.add_argument('--transtts_path', type=str, default='./log',
parser.add_argument(
'--transtts_path',
type=str,
default='./log',
help="the directory to load pretrain transformerTTS model.")
parser.add_argument('--transformer_step', type=int, default=160000,
parser.add_argument(
'--transformer_step',
type=int,
default=160000,
help="the step to load transformerTTS model.")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from tensorboardX import SummaryWriter
from collections import OrderedDict
......@@ -12,6 +25,7 @@ from parakeet.g2p.en import text_to_sequence
from parakeet import audio
from parakeet.models.fastspeech.fastspeech import FastSpeech
def load_checkpoint(step, model_path):
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
new_state_dict = OrderedDict()
......@@ -22,13 +36,14 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param]
return new_state_dict
def synthesis(text_input, args):
place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
# tensorboard
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'synthesis')
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir, 'synthesis')
with open(args.config_path) as f:
cfg = yaml.load(f, Loader=yaml.Loader)
......@@ -37,24 +52,28 @@ def synthesis(text_input, args):
with dg.guard(place):
model = FastSpeech(cfg)
model.set_dict(load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech")))
model.set_dict(
load_checkpoint(
str(args.fastspeech_step),
os.path.join(args.checkpoint_path, "fastspeech")))
model.eval()
text = np.asarray(text_to_sequence(text_input))
text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
pos_text = np.arange(1, text.shape[1]+1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
pos_text = np.arange(1, text.shape[1] + 1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
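# alpha rescales the predicted phoneme durations in the length regulator:
# values below 1 speed speech up and values above 1 slow it down
# (assumption based on FastSpeech; not stated in this file).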
mel_output, mel_output_postnet = model(text, pos_text, alpha=args.alpha)
mel_output, mel_output_postnet = model(
text, pos_text, alpha=args.alpha)
_ljspeech_processor = audio.AudioProcessor(
sample_rate=cfg['audio']['sr'],
num_mels=cfg['audio']['num_mels'],
min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg['audio']['n_fft'],
win_length= cfg['audio']['win_length'],
hop_length= cfg['audio']['hop_length'],
sample_rate=cfg['audio']['sr'],
num_mels=cfg['audio']['num_mels'],
min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg['audio']['n_fft'],
win_length=cfg['audio']['win_length'],
hop_length=cfg['audio']['hop_length'],
power=cfg['audio']['power'],
preemphasis=cfg['audio']['preemphasis'],
signal_norm=True,
......@@ -67,14 +86,17 @@ def synthesis(text_input, args):
do_trim_silence=False,
sound_norm=False)
mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0])
wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy())
mel_output_postnet = fluid.layers.transpose(
fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy(
))
writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
print("Synthesis completed !!!")
writer.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Train Fastspeech model")
add_config_options_to_parser(parser)
args = parser.parse_args()
synthesis("Transformer model is so fast!", args)
\ No newline at end of file
synthesis("Transformer model is so fast!", args)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import os
......@@ -20,8 +33,10 @@ import sys
sys.path.append("../transformer_tts")
from data import LJSpeechLoader
def load_checkpoint(step, model_path):
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
model_dict, opti_dict = fluid.dygraph.load_dygraph(
os.path.join(model_path, step))
new_state_dict = OrderedDict()
for param in model_dict:
if param.startswith('_layers.'):
......@@ -30,6 +45,7 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param]
return new_state_dict, opti_dict
def main(args):
local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
......@@ -43,26 +59,33 @@ def main(args):
if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'fastspeech')
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir, 'fastspeech')
writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place):
with fluid.unique_name.guard():
transformerTTS = TransformerTTS(cfg)
model_dict, _ = load_checkpoint(str(args.transformer_step), os.path.join(args.transtts_path, "transformer"))
model_dict, _ = load_checkpoint(
str(args.transformer_step),
os.path.join(args.transtts_path, "transformer"))
transformerTTS.set_dict(model_dict)
transformerTTS.eval()
model = FastSpeech(cfg)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
parameter_list=model.parameters())
reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
optimizer = fluid.optimizer.AdamOptimizer(
learning_rate=dg.NoamDecay(1 / (
cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
parameter_list=model.parameters())
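# Hedged sketch (illustrative only, not part of this file): Paddle's
# NoamDecay(d_model, warmup) is assumed to follow the Noam schedule
#     lr(t) = d_model ** -0.5 * min(t ** -0.5, t * warmup ** -1.5)
# so passing d_model = 1 / (warm_up_step * lr ** 2), as above, ramps the
# learning rate up linearly and peaks at exactly `lr` after `warm_up_step` steps.
def _noam_lr(step, lr=0.001, warmup=4000):
    d_model = 1.0 / (warmup * lr**2)
    return d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)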
reader = LJSpeechLoader(
cfg, args, nranks, local_rank, shuffle=True).reader()
if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech"))
model_dict, opti_dict = load_checkpoint(
str(args.fastspeech_step),
os.path.join(args.checkpoint_path, "fastspeech"))
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
global_step = args.fastspeech_step
......@@ -76,31 +99,42 @@ def main(args):
pbar = tqdm(reader)
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
pbar.set_description('Processing at epoch %d' % epoch)
character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data
_, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg['transformer_head'])).astype(np.float32)
_, _, attn_probs, _, _, _ = transformerTTS(
character, mel_input, pos_text, pos_mel)
alignment = dg.to_variable(
get_alignment(attn_probs, mel_lens, cfg[
'transformer_head'])).astype(np.float32)
global_step += 1
# Forward
result= model(character,
pos_text,
mel_pos=pos_mel,
length_target=alignment)
result = model(
character,
pos_text,
mel_pos=pos_mel,
length_target=alignment)
mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
mel_loss = layers.mse_loss(mel_output, mel)
mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
duration_loss = layers.mean(layers.abs(layers.elementwise_sub(duration_predictor_output, alignment)))
duration_loss = layers.mean(
layers.abs(
layers.elementwise_sub(duration_predictor_output,
alignment)))
total_loss = mel_loss + mel_postnet_loss + duration_loss
if local_rank==0:
writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step)
writer.add_scalar('duration_loss', duration_loss.numpy(), global_step)
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
if local_rank == 0:
writer.add_scalar('mel_loss',
mel_loss.numpy(), global_step)
writer.add_scalar('post_mel_loss',
mel_postnet_loss.numpy(), global_step)
writer.add_scalar('duration_loss',
duration_loss.numpy(), global_step)
writer.add_scalar('learning_rate',
optimizer._learning_rate.step().numpy(),
global_step)
if args.use_data_parallel:
total_loss = model.scale_loss(total_loss)
......@@ -108,21 +142,25 @@ def main(args):
model.apply_collective_grads()
else:
total_loss.backward()
optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
optimizer.minimize(
total_loss,
grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
'grad_clip_thresh']))
model.clear_gradients()
# save checkpoint
if local_rank==0 and global_step % args.save_step == 0:
# save checkpoint
if local_rank == 0 and global_step % args.save_step == 0:
if not os.path.exists(args.save_path):
os.mkdir(args.save_path)
save_path = os.path.join(args.save_path,'fastspeech/%d' % global_step)
save_path = os.path.join(args.save_path,
'fastspeech/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0:
if local_rank == 0:
writer.close()
if __name__ =='__main__':
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Train Fastspeech model")
add_config_options_to_parser(parser)
args = parser.parse_args()
......
......@@ -50,7 +50,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--transformer_step``.
For more help on arguments:
``python train_transformer.py --help``.
## Train Vocoder
......@@ -78,7 +78,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
```
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--vocoder_step``.
For more help on arguments:
``python train_vocoder.py --help``.
## Synthesis
......@@ -101,5 +101,5 @@ sh synthesis.sh
And the audio file will be saved in ``--sample_path``.
For more help on arguments:
``python synthesis.py --help``.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import numpy as np
import pandas as pd
......@@ -12,23 +25,43 @@ from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, SpecBatcher
from parakeet.data.dataset import DatasetMixin, TransformDataset
class LJSpeechLoader:
def __init__(self, config, args, nranks, rank, is_vocoder=False, shuffle=True):
def __init__(self,
config,
args,
nranks,
rank,
is_vocoder=False,
shuffle=True):
place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace()
LJSPEECH_ROOT = Path(args.data_path)
metadata = LJSpeechMetaData(LJSPEECH_ROOT)
transformer = LJSpeech(config)
dataset = TransformDataset(metadata, transformer)
sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle)
sampler = DistributedSampler(
len(metadata), nranks, rank, shuffle=shuffle)
assert args.batch_size % nranks == 0
each_bs = args.batch_size // nranks
if is_vocoder:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True)
dataloader = DataCargo(
dataset,
sampler=sampler,
batch_size=each_bs,
shuffle=shuffle,
batch_fn=batch_examples_vocoder,
drop_last=True)
else:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples, drop_last=True)
dataloader = DataCargo(
dataset,
sampler=sampler,
batch_size=each_bs,
shuffle=shuffle,
batch_fn=batch_examples,
drop_last=True)
self.reader = fluid.io.DataLoader.from_generator(
capacity=32,
iterable=True,
......@@ -63,13 +96,13 @@ class LJSpeech(object):
super(LJSpeech, self).__init__()
self.config = config
self._ljspeech_processor = audio.AudioProcessor(
sample_rate=config['audio']['sr'],
num_mels=config['audio']['num_mels'],
min_level_db=config['audio']['min_level_db'],
ref_level_db=config['audio']['ref_level_db'],
n_fft=config['audio']['n_fft'],
win_length= config['audio']['win_length'],
hop_length= config['audio']['hop_length'],
sample_rate=config['audio']['sr'],
num_mels=config['audio']['num_mels'],
min_level_db=config['audio']['min_level_db'],
ref_level_db=config['audio']['ref_level_db'],
n_fft=config['audio']['n_fft'],
win_length=config['audio']['win_length'],
hop_length=config['audio']['hop_length'],
power=config['audio']['power'],
preemphasis=config['audio']['preemphasis'],
signal_norm=True,
......@@ -81,7 +114,7 @@ class LJSpeech(object):
griffin_lim_iters=60,
do_trim_silence=False,
sound_norm=False)
def __call__(self, metadatum):
"""All the code for generating an Example from a metadatum. If you want a
different preprocessing pipeline, you can override this method.
......@@ -90,13 +123,15 @@ class LJSpeech(object):
method.
"""
fname, raw_text, normalized_text = metadatum
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
wav = self._ljspeech_processor.load_wav(str(fname))
mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32)
mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32)
phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
phonemes = np.array(
g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
return (mag, mel, phonemes
) # maybe we need to implement it as a map in the future
def batch_examples(batch):
......@@ -109,44 +144,71 @@ def batch_examples(batch):
pos_mels = []
for data in batch:
_, mel, text = data
mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1))
mel_inputs.append(
np.concatenate(
[np.zeros([mel.shape[0], 1], np.float32), mel[:, :-1]],
axis=-1))
mel_lens.append(mel.shape[1])
text_lens.append(len(text))
pos_texts.append(np.arange(1, len(text) + 1))
pos_mels.append(np.arange(1, mel.shape[1] + 1))
mels.append(mel)
texts.append(text)
# Sort by text_len in descending order
texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)]
mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)]
mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)]
mel_lens = [i for i,_ in sorted(zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)]
pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)]
pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)]
texts = [
i
for i, _ in sorted(
zip(texts, text_lens), key=lambda x: x[1], reverse=True)
]
mels = [
i
for i, _ in sorted(
zip(mels, text_lens), key=lambda x: x[1], reverse=True)
]
mel_inputs = [
i
for i, _ in sorted(
zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)
]
mel_lens = [
i
for i, _ in sorted(
zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
]
pos_texts = [
i
for i, _ in sorted(
zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)
]
pos_mels = [
i
for i, _ in sorted(
zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)
]
text_lens = sorted(text_lens, reverse=True)
# Pad sequence with largest len of the batch
texts = TextIDBatcher(pad_id=0)(texts) #(B, T)
pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels)
mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels)
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), np.array(mel_lens))
texts = TextIDBatcher(pad_id=0)(texts) #(B, T)
pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
mels = np.transpose(
SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels)
mel_inputs = np.transpose(
SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels)
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens),
np.array(mel_lens))
def batch_examples_vocoder(batch):
mels=[]
mags=[]
mels = []
mags = []
for data in batch:
mag, mel, _ = data
mels.append(mel)
mags.append(mag)
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1))
mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1))
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0, 2, 1))
return (mels, mags)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
def add_config_options_to_parser(parser):
parser.add_argument('--config_path', type=str, default='config/train_transformer.yaml',
parser.add_argument(
'--config_path',
type=str,
default='config/train_transformer.yaml',
help="the yaml config file path.")
parser.add_argument('--batch_size', type=int, default=32,
help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000,
parser.add_argument(
'--batch_size', type=int, default=32, help="batch size for training.")
parser.add_argument(
'--epochs',
type=int,
default=10000,
help="the number of epoch for training.")
parser.add_argument('--lr', type=float, default=0.001,
parser.add_argument(
'--lr',
type=float,
default=0.001,
help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500,
parser.add_argument(
'--save_step',
type=int,
default=500,
help="checkpointing interval during training.")
parser.add_argument('--image_step', type=int, default=2000,
parser.add_argument(
'--image_step',
type=int,
default=2000,
help="attention image interval during training.")
parser.add_argument('--max_len', type=int, default=400,
parser.add_argument(
'--max_len',
type=int,
default=400,
help="The max length of audio when synthsis.")
parser.add_argument('--transformer_step', type=int, default=160000,
parser.add_argument(
'--transformer_step',
type=int,
default=160000,
help="Global step to restore checkpoint of transformer.")
parser.add_argument('--vocoder_step', type=int, default=90000,
parser.add_argument(
'--vocoder_step',
type=int,
default=90000,
help="Global step to restore checkpoint of postnet.")
parser.add_argument('--use_gpu', type=int, default=1,
parser.add_argument(
'--use_gpu',
type=int,
default=1,
help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=int, default=0,
parser.add_argument(
'--use_data_parallel',
type=int,
default=0,
help="use data parallel or not during training.")
parser.add_argument('--stop_token', type=int, default=0,
parser.add_argument(
'--stop_token',
type=int,
default=0,
help="use stop token loss in network or not.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
parser.add_argument(
'--data_path',
type=str,
default='./dataset/LJSpeech-1.1',
help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None,
parser.add_argument(
'--checkpoint_path',
type=str,
default=None,
help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint',
parser.add_argument(
'--save_path',
type=str,
default='./checkpoint',
help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log',
parser.add_argument(
'--log_dir',
type=str,
default='./log',
help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./sample',
parser.add_argument(
'--sample_path',
type=str,
default='./sample',
help="the directory to save audio sample in synthesis.")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence
......@@ -16,6 +29,7 @@ from parakeet import audio
from parakeet.models.transformer_tts.vocoder import Vocoder
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
def load_checkpoint(step, model_path):
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
new_state_dict = OrderedDict()
......@@ -26,6 +40,7 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param]
return new_state_dict
def synthesis(text_input, args):
place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
......@@ -34,46 +49,53 @@ def synthesis(text_input, args):
# tensorboard
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'synthesis')
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir, 'synthesis')
writer = SummaryWriter(path)
with dg.guard(place):
with fluid.unique_name.guard():
model = TransformerTTS(cfg)
model.set_dict(load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer")))
model.set_dict(
load_checkpoint(
str(args.transformer_step),
os.path.join(args.checkpoint_path, "transformer")))
model.eval()
with fluid.unique_name.guard():
model_vocoder = Vocoder(cfg, args.batch_size)
model_vocoder.set_dict(load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder")))
model_vocoder.set_dict(
load_checkpoint(
str(args.vocoder_step),
os.path.join(args.checkpoint_path, "vocoder")))
model_vocoder.eval()
# init input
text = np.asarray(text_to_sequence(text_input))
text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
mel_input = dg.to_variable(np.zeros([1,1,80])).astype(np.float32)
pos_text = np.arange(1, text.shape[1]+1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
pos_text = np.arange(1, text.shape[1] + 1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
pbar = tqdm(range(args.max_len))
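# Autoregressive decoding: each step feeds every frame generated so far back
# through the model and appends the newest postnet frame to mel_input, up to
# --max_len frames.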
for i in pbar:
pos_mel = np.arange(1, mel_input.shape[1]+1)
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel),[0])
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(text, mel_input, pos_text, pos_mel)
mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1)
pos_mel = np.arange(1, mel_input.shape[1] + 1)
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
text, mel_input, pos_text, pos_mel)
mel_input = fluid.layers.concat(
[mel_input, postnet_pred[:, -1:, :]], axis=1)
mag_pred = model_vocoder(postnet_pred)
_ljspeech_processor = audio.AudioProcessor(
sample_rate=cfg['audio']['sr'],
num_mels=cfg['audio']['num_mels'],
min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg['audio']['n_fft'],
win_length= cfg['audio']['win_length'],
hop_length= cfg['audio']['hop_length'],
sample_rate=cfg['audio']['sr'],
num_mels=cfg['audio']['num_mels'],
min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg['audio']['n_fft'],
win_length=cfg['audio']['win_length'],
hop_length=cfg['audio']['hop_length'],
power=cfg['audio']['power'],
preemphasis=cfg['audio']['preemphasis'],
signal_norm=True,
......@@ -86,13 +108,18 @@ def synthesis(text_input, args):
do_trim_silence=False,
sound_norm=False)
wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy())
wav = _ljspeech_processor.inv_spectrogram(
fluid.layers.transpose(
fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
if not os.path.exists(args.sample_path):
os.mkdir(args.sample_path)
write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav)
write(
os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'],
wav)
writer.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Synthesis model")
add_config_options_to_parser(parser)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from tqdm import tqdm
from tensorboardX import SummaryWriter
......@@ -16,8 +29,10 @@ from parakeet.models.transformer_tts.utils import cross_entropy
from data import LJSpeechLoader
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
def load_checkpoint(step, model_path):
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
model_dict, opti_dict = fluid.dygraph.load_dygraph(
os.path.join(model_path, step))
new_state_dict = OrderedDict()
for param in model_dict:
if param.startswith('_layers.'):
......@@ -40,22 +55,27 @@ def main(args):
if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'transformer')
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir, 'transformer')
writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place):
model = TransformerTTS(cfg)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
parameter_list=model.parameters())
reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
optimizer = fluid.optimizer.AdamOptimizer(
learning_rate=dg.NoamDecay(1 / (
cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
parameter_list=model.parameters())
reader = LJSpeechLoader(
cfg, args, nranks, local_rank, shuffle=True).reader()
if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer"))
model_dict, opti_dict = load_checkpoint(
str(args.transformer_step),
os.path.join(args.checkpoint_path, "transformer"))
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
global_step = args.transformer_step
......@@ -64,86 +84,112 @@ def main(args):
if args.use_data_parallel:
strategy = dg.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy)
for epoch in range(args.epochs):
pbar = tqdm(reader)
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
pbar.set_description('Processing at epoch %d' % epoch)
character, mel, mel_input, pos_text, pos_mel, text_length, _ = data
global_step += 1
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
character, mel_input, pos_text, pos_mel)
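# pos_mel is zero-padded, so the label below is 1.0 for padded frames past the
# end of each utterance; presumably it is the stop-token target used by
# cross_entropy(stop_preds, label) when --stop_token is set.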
label = (pos_mel == 0).astype(np.float32)
mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
mel_loss = layers.mean(
layers.abs(layers.elementwise_sub(mel_pred, mel)))
post_mel_loss = layers.mean(
layers.abs(layers.elementwise_sub(postnet_pred, mel)))
loss = mel_loss + post_mel_loss
# Note: when the stop token loss was used, training did not work.
if args.stop_token:
stop_loss = cross_entropy(stop_preds, label)
loss = loss + stop_loss
if local_rank==0:
if local_rank == 0:
writer.add_scalars('training_loss', {
'mel_loss':mel_loss.numpy(),
'post_mel_loss':post_mel_loss.numpy()
'mel_loss': mel_loss.numpy(),
'post_mel_loss': post_mel_loss.numpy()
}, global_step)
if args.stop_token:
writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)
writer.add_scalar('stop_loss',
stop_loss.numpy(), global_step)
if args.use_data_parallel:
writer.add_scalars('alphas', {
'encoder_alpha':model._layers.encoder.alpha.numpy(),
'decoder_alpha':model._layers.decoder.alpha.numpy(),
'encoder_alpha':
model._layers.encoder.alpha.numpy(),
'decoder_alpha':
model._layers.decoder.alpha.numpy(),
}, global_step)
else:
writer.add_scalars('alphas', {
'encoder_alpha':model.encoder.alpha.numpy(),
'decoder_alpha':model.decoder.alpha.numpy(),
'encoder_alpha': model.encoder.alpha.numpy(),
'decoder_alpha': model.decoder.alpha.numpy(),
}, global_step)
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
writer.add_scalar('learning_rate',
optimizer._learning_rate.step().numpy(),
global_step)
if global_step % args.image_step == 1:
for i, prob in enumerate(attn_probs):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_%d_0'%global_step, x, i*4+j, dataformats="HWC")
x = np.uint8(
cm.viridis(prob.numpy()[j * 16]) * 255)
writer.add_image(
'Attention_%d_0' % global_step,
x,
i * 4 + j,
dataformats="HWC")
for i, prob in enumerate(attn_enc):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC")
x = np.uint8(
cm.viridis(prob.numpy()[j * 16]) * 255)
writer.add_image(
'Attention_enc_%d_0' % global_step,
x,
i * 4 + j,
dataformats="HWC")
for i, prob in enumerate(attn_dec):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC")
x = np.uint8(
cm.viridis(prob.numpy()[j * 16]) * 255)
writer.add_image(
'Attention_dec_%d_0' % global_step,
x,
i * 4 + j,
dataformats="HWC")
if args.use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
model.apply_collective_grads()
else:
loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
optimizer.minimize(
loss,
grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
'grad_clip_thresh']))
model.clear_gradients()
# save checkpoint
if local_rank==0 and global_step % args.save_step == 0:
if local_rank == 0 and global_step % args.save_step == 0:
if not os.path.exists(args.save_path):
os.mkdir(args.save_path)
save_path = os.path.join(args.save_path,'transformer/%d' % global_step)
save_path = os.path.join(args.save_path,
'transformer/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0:
if local_rank == 0:
writer.close()
if __name__ =='__main__':
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Train TransformerTTS model")
add_config_options_to_parser(parser)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tensorboardX import SummaryWriter
import os
from tqdm import tqdm
......@@ -13,6 +26,7 @@ import paddle.fluid.layers as layers
from data import LJSpeechLoader
from parakeet.models.transformer_tts.vocoder import Vocoder
def load_checkpoint(step, model_path):
model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
new_state_dict = OrderedDict()
......@@ -23,8 +37,9 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param]
return new_state_dict, opti_dict
def main(args):
local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
......@@ -35,23 +50,26 @@ def main(args):
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if args.use_data_parallel else fluid.CUDAPlace(0)
if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'vocoder')
os.mkdir(args.log_dir)
path = os.path.join(args.log_dir, 'vocoder')
writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place):
model = Vocoder(cfg, args.batch_size)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
parameter_list=model.parameters())
optimizer = fluid.optimizer.AdamOptimizer(
learning_rate=dg.NoamDecay(1 / (
cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
parameter_list=model.parameters())
if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder"))
model_dict, opti_dict = load_checkpoint(
str(args.vocoder_step),
os.path.join(args.checkpoint_path, "vocoder"))
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
global_step = args.vocoder_step
......@@ -61,48 +79,55 @@ def main(args):
strategy = dg.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy)
reader = LJSpeechLoader(cfg, args, nranks, local_rank, is_vocoder=True).reader()
reader = LJSpeechLoader(
cfg, args, nranks, local_rank, is_vocoder=True).reader()
for epoch in range(args.epochs):
pbar = tqdm(reader)
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
pbar.set_description('Processing at epoch %d' % epoch)
mel, mag = data
mag = dg.to_variable(mag.numpy())
mel = dg.to_variable(mel.numpy())
global_step += 1
mag_pred = model(mel)
loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))
loss = layers.mean(
layers.abs(layers.elementwise_sub(mag_pred, mag)))
if args.use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
model.apply_collective_grads()
else:
loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
optimizer.minimize(
loss,
grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
'grad_clip_thresh']))
model.clear_gradients()
if local_rank==0:
writer.add_scalars('training_loss',{
'loss':loss.numpy(),
if local_rank == 0:
writer.add_scalars('training_loss', {
'loss': loss.numpy(),
}, global_step)
if global_step % args.save_step == 0:
if not os.path.exists(args.save_path):
os.mkdir(args.save_path)
save_path = os.path.join(args.save_path,'vocoder/%d' % global_step)
save_path = os.path.join(args.save_path,
'vocoder/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0:
if local_rank == 0:
writer.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Train vocoder model")
add_config_options_to_parser(parser)
args = parser.parse_args()
# Print the whole config setting.
pprint(args)
main(args)
\ No newline at end of file
main(args)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from pprint import pprint
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from pprint import pprint
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import subprocess
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import time
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.0.0"
from . import data, g2p, models, modules
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import AudioProcessor
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import soundfile as sf
import numpy as np
import scipy.io
import scipy.signal
class AudioProcessor(object):
def __init__(self,
sample_rate=None, # int, sampling rate
num_mels=None, # int, bands of mel spectrogram
min_level_db=None, # float, minimum level db
ref_level_db=None, # float, reference level db
n_fft=None, # int: number of samples in a frame for stft
win_length=None, # int: the same meaning with n_fft
hop_length=None, # int: number of samples between neighboring frame
power=None, # float: power to raise before griffin-lim
preemphasis=None, # float: preemphasis coefficient
signal_norm=None, #
symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_norm]
max_norm=None, # float, max norm
mel_fmin=None, # int: mel spectrogram's minimum frequency
mel_fmax=None, # int: mel spectrogram's maximum frequency
clip_norm=True, # bool: clip spectrogram's norm
griffin_lim_iters=None, # int:
do_trim_silence=False, # bool: trim silence
sound_norm=False,
**kwargs):
def __init__(
self,
sample_rate=None, # int, sampling rate
num_mels=None, # int, bands of mel spectrogram
min_level_db=None, # float, minimum level db
ref_level_db=None, # float, reference level db
n_fft=None, # int: number of samples in a frame for stft
win_length=None, # int: the same meaning with n_fft
hop_length=None, # int: number of samples between neighboring frame
power=None, # float: power to raise before griffin-lim
preemphasis=None, # float: preemphasis coefficient
signal_norm=None, #
symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_norm]
max_norm=None, # float, max norm
mel_fmin=None, # int: mel spectrogram's minimum frequency
mel_fmax=None, # int: mel spectrogram's maximum frequency
clip_norm=True, # bool: clip spectrogram's norm
griffin_lim_iters=None, # int:
do_trim_silence=False, # bool: trim silence
sound_norm=False,
**kwargs):
self.sample_rate = sample_rate
self.num_mels = num_mels
self.min_level_db = min_level_db
......@@ -34,8 +50,8 @@ class AudioProcessor(object):
self.n_fft = n_fft
self.win_length = win_length or n_fft
# hop length defaults to 1/4 window_length
self.hop_length = hop_length or 0.25 * self.win_length
self.hop_length = hop_length or 0.25 * self.win_length
self.power = power
self.preemphasis = float(preemphasis)
......@@ -52,7 +68,8 @@ class AudioProcessor(object):
self.do_trim_silence = do_trim_silence
self.sound_norm = sound_norm
self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters()
self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters(
)
def _stft_parameters(self):
"""compute frame length and hop length in ms"""
......@@ -65,44 +82,54 @@ class AudioProcessor(object):
"""object repr"""
cls_name_str = self.__class__.__name__
members = vars(self)
dict_str = "\n".join([" {}: {},".format(k, v) for k, v in members.items()])
dict_str = "\n".join(
[" {}: {},".format(k, v) for k, v in members.items()])
repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
return repr_str
def save_wav(self, path, wav):
"""save audio with scipy.io.wavfile in 16bit integers"""
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
scipy.io.wavfile.write(path, self.sample_rate,
wav_norm.astype(np.int16))
def load_wav(self, path, sr=None):
"""load wav -> trim_silence -> rescale"""
x, sr = librosa.load(path, sr=None)
assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(sr, self.sample_rate)
assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(
sr, self.sample_rate)
if self.do_trim_silence:
try:
x = self.trim_silence(x)
except ValueError:
print(" [!] File cannot be trimmed for silence - {}".format(path))
print(" [!] File cannot be trimmed for silence - {}".format(
path))
if self.sound_norm:
x = x / x.max() * 0.9 # why 0.9 ?
x = x / x.max() * 0.9 # why 0.9 ?
return x
def trim_silence(self, wav):
"""Trim soilent parts with a threshold and 0.01s margin"""
margin = int(self.sample_rate * 0.01)
wav = wav[margin: -margin]
trimed_wav = librosa.effects.trim(wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0]
wav = wav[margin:-margin]
trimed_wav = librosa.effects.trim(
wav,
top_db=60,
frame_length=self.win_length,
hop_length=self.hop_length)[0]
return trimed_wav
def apply_preemphasis(self, x):
if self.preemphasis == 0.:
raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
raise RuntimeError(
" !! Preemphasis coefficient should be positive. ")
return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)
def apply_inv_preemphasis(self, x):
if self.preemphasis == 0.:
raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
raise RuntimeError(
" !! Preemphasis coefficient should be positive. ")
return scipy.signal.lfilter([1.], [1., -self.preemphasis], x)
def _amplitude_to_db(self, x):
......@@ -125,12 +152,11 @@ class AudioProcessor(object):
"""return mel basis for mel scale"""
if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2
return librosa.filters.mel(
self.sample_rate,
self.n_fft,
n_mels=self.num_mels,
fmin=self.mel_fmin,
fmax=self.mel_fmax)
return librosa.filters.mel(self.sample_rate,
self.n_fft,
n_mels=self.num_mels,
fmin=self.mel_fmin,
fmax=self.mel_fmax)
def _normalize(self, S):
"""put values in [0, self.max_norm] or [-self.max_norm, self,max_norm]"""
......@@ -156,25 +182,29 @@ class AudioProcessor(object):
if self.symmetric_norm:
if self.clip_norm:
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
S_denorm = (S_denorm + self.max_norm) * (-self.min_level_db) / (2 * self.max_norm) + self.min_level_db
S_denorm = (S_denorm + self.max_norm) * (
-self.min_level_db) / (2 * self.max_norm
) + self.min_level_db
return S_denorm
else:
if self.clip_norm:
S_denorm = np.clip(S_denorm, 0, self.max_norm)
S_denorm = S_denorm * (-self.min_level_db)/ self.max_norm + self.min_level_db
S_denorm = S_denorm * (-self.min_level_db
) / self.max_norm + self.min_level_db
return S_denorm
else:
return S
def _stft(self, y):
return librosa.stft(
y=y,
y=y,
n_fft=self.n_fft,
win_length=self.win_length,
hop_length=self.hop_length)
def _istft(self, S):
return librosa.istft(S, hop_length=self.hop_length, win_length=self.win_length)
return librosa.istft(
S, hop_length=self.hop_length, win_length=self.win_length)
def spectrogram(self, y):
"""compute linear spectrogram(amplitude)
......@@ -195,7 +225,8 @@ class AudioProcessor(object):
D = self._stft(self.apply_preemphasis(y))
else:
D = self._stft(y)
S = self._amplitude_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
S = self._amplitude_to_db(self._linear_to_mel(np.abs(
D))) - self.ref_level_db
return self._normalize(S)
def inv_spectrogram(self, spectrogram):
......@@ -203,16 +234,16 @@ class AudioProcessor(object):
S = self._denormalize(spectrogram)
S = self._db_to_amplitude(S + self.ref_level_db)
if self.preemphasis:
return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
return self._griffin_lim(S ** self.power)
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
return self._griffin_lim(S**self.power)
def inv_melspectrogram(self, mel_spectrogram):
S = self._denormalize(mel_spectrogram)
S = self._db_to_amplitude(S + self.ref_level_db)
S = self._mel_to_linear(np.abs(S))
if self.preemphasis:
return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
return self._griffin_lim(S ** self.power)
return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
return self._griffin_lim(S**self.power)
def out_linear_to_mel(self, linear_spec):
"""convert output linear spec to mel spec"""
......@@ -222,7 +253,7 @@ class AudioProcessor(object):
S = self._amplitude_to_db(S) - self.ref_level_db
mel = self._normalize(S)
return mel
def _griffin_lim(self, S):
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
......@@ -234,18 +265,18 @@ class AudioProcessor(object):
@staticmethod
def mulaw_encode(wav, qc):
mu = 2 ** qc - 1
mu = 2**qc - 1
# wav_abs = np.minimum(np.abs(wav), 1.0)
signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
# Quantize signal to the specified number of levels.
signal = (signal + 1) / 2 * mu + 0.5
return np.floor(signal,)
return np.floor(signal, )
@staticmethod
def mulaw_decode(wav, qc):
"""Recovers waveform from quantized values."""
mu = 2 ** qc - 1
x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)
mu = 2**qc - 1
x = np.sign(wav) / mu * ((1 + mu)**np.abs(wav) - 1)
return x
@staticmethod
......
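The `mulaw_encode` / `mulaw_decode` pair above implements mu-law companding: the encoder compresses a waveform in [-1, 1] and quantizes it to `2**qc` integer levels, while the decoder expands companded values back. A minimal round-trip sketch with hypothetical values (plain NumPy), rescaling the integer levels to [-1, 1] before expansion:

```python
import numpy as np

qc = 8                      # hypothetical 8-bit quantization
mu = 2**qc - 1
wav = np.array([-0.5, 0.0, 0.25, 0.9])

# companding + quantization, mathematically the same as mulaw_encode
companded = np.sign(wav) * np.log1p(mu * np.abs(wav)) / np.log1p(mu)
quantized = np.floor((companded + 1) / 2 * mu + 0.5)   # integers in [0, mu]

# map the integer levels back to [-1, 1], then expand as in mulaw_decode
rescaled = 2 * quantized / mu - 1
recovered = np.sign(rescaled) / mu * ((1 + mu)**np.abs(rescaled) - 1)
print(np.round(recovered, 3))  # close to wav, up to quantization error
```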
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .dataset import *
from .datacargo import *
from .sampler import *
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Functions for building batches from arrays, padding variable-length examples to the maximum length in the minibatch.
"""
import numpy as np
class TextIDBatcher(object):
"""A wrapper class for a function to build a functor, which holds the configs to pass to the function."""
def __init__(self, pad_id=0, dtype=np.int64):
self.pad_id = pad_id
self.dtype = dtype
def __call__(self, minibatch):
out = batch_text_id(minibatch, pad_id=self.pad_id, dtype=self.dtype)
return out
def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
"""
minibatch: List[Example]
......@@ -20,26 +36,32 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
"""
peek_example = minibatch[0]
assert len(peek_example.shape) == 1, "text example is a 1D tensor"
lengths = [example.shape[0] for example in minibatch] # assume (channel, n_samples) or (n_samples, )
lengths = [example.shape[0] for example in minibatch
] # assume (channel, n_samples) or (n_samples, )
max_len = np.max(lengths)
batch = []
for example in minibatch:
pad_len = max_len - example.shape[0]
batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_id))
batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_id))
return np.array(batch, dtype=dtype)
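A minimal usage sketch of `batch_text_id` with hypothetical inputs (assuming the function above is in scope): shorter sequences are right-padded with `pad_id` so the minibatch stacks into a single array.

```python
import numpy as np

minibatch = [np.array([2, 7, 5]), np.array([4, 9])]
print(batch_text_id(minibatch, pad_id=0))
# [[2 7 5]
#  [4 9 0]]
```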
class WavBatcher(object):
def __init__(self, pad_value=0., dtype=np.float32):
self.pad_value = pad_value
self.dtype = dtype
def __call__(self, minibatch):
out = batch_wav(minibatch, pad_value=self.pad_value, dtype=self.dtype)
return out
def batch_wav(minibatch, pad_value=0., dtype=np.float32):
"""
minibatch: List[Example]
......@@ -51,18 +73,25 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
mono_channel = True
elif len(peek_example.shape) == 2:
mono_channel = False
lengths = [example.shape[-1] for example in minibatch] # assume (channel, n_samples) or (n_samples, )
lengths = [example.shape[-1] for example in minibatch
] # assume (channel, n_samples) or (n_samples, )
max_len = np.max(lengths)
batch = []
for example in minibatch:
pad_len = max_len - example.shape[-1]
if mono_channel:
batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_value))
batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_value))
else:
batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no
batch.append(
np.pad(example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value)) # what about PCM, no
return np.array(batch, dtype=dtype)
......@@ -75,6 +104,7 @@ class SpecBatcher(object):
out = batch_spec(minibatch, pad_value=self.pad_value, dtype=self.dtype)
return out
def batch_spec(minibatch, pad_value=0., dtype=np.float32):
"""
minibatch: List[Example]
......@@ -86,16 +116,23 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32):
mono_channel = True
elif len(peek_example.shape) == 3:
mono_channel = False
lengths = [example.shape[-1] for example in minibatch] # assume (channel, F, n_frame) or (F, n_frame)
max_len = np.max(lengths)
lengths = [example.shape[-1] for example in minibatch
] # assume (channel, F, n_frame) or (F, n_frame)
max_len = np.max(lengths)
batch = []
for example in minibatch:
pad_len = max_len - example.shape[-1]
if mono_channel:
batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value))
batch.append(
np.pad(example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value))
else:
batch.append(np.pad(example, [(0, 0), (0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no
return np.array(batch, dtype=dtype)
\ No newline at end of file
batch.append(
np.pad(example, [(0, 0), (0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value)) # what about PCM, no
return np.array(batch, dtype=dtype)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
from .sampler import SequentialSampler, RandomSampler, BatchSampler
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import numpy as np
......@@ -9,8 +23,7 @@ class DatasetMixin(object):
if isinstance(index, slice):
start, stop, step = index.indices(len(self))
return [
self.get_example(i)
for i in six.moves.range(start, stop, step)
self.get_example(i) for i in six.moves.range(start, stop, step)
]
elif isinstance(index, (list, np.ndarray)):
return [self.get_example(i) for i in index]
......@@ -180,8 +193,7 @@ class ChainDataset(DatasetMixin):
def get_example(self, i):
if i < 0:
raise IndexError(
"ChainDataset doesnot support negative indexing.")
raise IndexError("ChainDataset doesnot support negative indexing.")
for dataset in self._datasets:
if i < len(dataset):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
In most cases, we have a non-stream dataset, which means we can randomly access it with __getitem__ and get its length with __len__.
......@@ -6,10 +19,10 @@ This suffices for a sampler. We implement a sampler as an iterable of valid indices.
So the sampler is only responsible for generating valid indices.
"""
import numpy as np
import random
class Sampler(object):
def __init__(self, data_source):
pass
......@@ -23,7 +36,7 @@ class Sampler(object):
class SequentialSampler(Sampler):
def __init__(self, data_source):
self.data_source = data_source
def __iter__(self):
return iter(range(len(self.data_source)))
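To make the contract above concrete: a sampler is simply an iterable of valid indices into its data source, as `SequentialSampler` shows. A minimal custom sampler sketch (hypothetical, built only on the `Sampler` base class defined here):

```python
class EvenIndexSampler(Sampler):
    """A toy sampler that yields only the even indices, in order."""

    def __init__(self, data_source):
        self.data_source = data_source

    def __iter__(self):
        return iter(range(0, len(self.data_source), 2))

    def __len__(self):
        return (len(self.data_source) + 1) // 2

# list(EvenIndexSampler(list(range(7)))) -> [0, 2, 4, 6]
```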
......@@ -42,12 +55,14 @@ class RandomSampler(Sampler):
"replacement={}".format(self.replacement))
if self._num_samples is not None and not replacement:
raise ValueError("With replacement=False, num_samples should not be specified, "
"since a random permutation will be performed.")
raise ValueError(
"With replacement=False, num_samples should not be specified, "
"since a random permutation will be performed.")
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(self.num_samples))
"value, but got num_samples={}".format(
self.num_samples))
@property
def num_samples(self):
......@@ -59,7 +74,9 @@ class RandomSampler(Sampler):
def __iter__(self):
n = len(self.data_source)
if self.replacement:
return iter(np.random.randint(0, n, size=(self.num_samples,), dtype=np.int64).tolist())
return iter(
np.random.randint(
0, n, size=(self.num_samples, ), dtype=np.int64).tolist())
return iter(np.random.permutation(n).tolist())
def __len__(self):
......@@ -76,7 +93,8 @@ class SubsetRandomSampler(Sampler):
self.indices = indices
def __iter__(self):
return (self.indices[i] for i in np.random.permutation(len(self.indices)))
return (self.indices[i]
for i in np.random.permutation(len(self.indices)))
def __len__(self):
return len(self.indices)
......@@ -89,9 +107,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
3. Permute mini-batches
"""
def __init__(self, lengths, batch_size=4, batch_group_size=None,
def __init__(self,
lengths,
batch_size=4,
batch_group_size=None,
permutate=True):
_lengths = np.array(lengths, dtype=np.int64) # maybe better implement length as a sort key
_lengths = np.array(
lengths,
dtype=np.int64) # maybe better implement length as a sort key
self.lengths = np.sort(_lengths)
self.sorted_indices = np.argsort(_lengths)
......@@ -112,20 +135,21 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
for i in range(len(indices) // batch_group_size):
s = i * batch_group_size
e = s + batch_group_size
random.shuffle(indices[s: e]) # inplace
random.shuffle(indices[s:e]) # inplace
# Permutate batches
if self.permutate:
perm = np.arange(len(indices[:e]) // self.batch_size)
random.shuffle(perm)
indices[:e] = indices[:e].reshape(-1, self.batch_size)[perm, :].reshape(-1)
indices[:e] = indices[:e].reshape(
-1, self.batch_size)[perm, :].reshape(-1)
# Handle last elements
s += batch_group_size
#print(indices)
if s < len(indices):
random.shuffle(indices[s:])
return iter(indices)
def __len__(self):
......@@ -150,14 +174,19 @@ class WeightedRandomSampler(Sampler):
def __init__(self, weights, num_samples, replacement):
if not isinstance(num_samples, int) or num_samples <= 0:
raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(num_samples))
"value, but got num_samples={}".format(
num_samples))
self.weights = np.array(weights, dtype=np.float64)
self.num_samples = num_samples
self.replacement = replacement
def __iter__(self):
return iter(np.random.choice(len(self.weights), size=(self.num_samples, ),
replace=self.replacement, p=self.weights).tolist())
return iter(
np.random.choice(
len(self.weights),
size=(self.num_samples, ),
replace=self.replacement,
p=self.weights).tolist())
def __len__(self):
return self.num_samples
......@@ -184,7 +213,7 @@ class DistributedSampler(Sampler):
# Subset samples for each trainer.
indices = indices[self.rank:self.total_size:self.num_trainers]
assert len(indices) == self.num_samples
assert len(indices) == self.num_samples
return iter(indices)
......@@ -209,8 +238,7 @@ class BatchSampler(Sampler):
def __init__(self, sampler, batch_size, drop_last):
if not isinstance(sampler, Sampler):
raise ValueError("sampler should be an instance of "
"Sampler, but got sampler={}"
.format(sampler))
"Sampler, but got sampler={}".format(sampler))
if not isinstance(batch_size, int) or batch_size <= 0:
raise ValueError("batch_size should be a positive integer value, "
"but got batch_size={}".format(batch_size))
......
......@@ -14,9 +14,4 @@ One of the reasons we choose to load data lazily (only load metadata before hand
In deep learning practice, we typically batch examples, so the dataset should come with a method to batch examples. Assume a record is implemented as a tuple of several items. When an item is a fixed-size array, batching is trivial: `np.stack` suffices. But for arrays with dynamic sizes, padding is needed, so we implement a batching method for each kind of item; batching a whole record is then composed from these methods. For a dataset, a `_batch_examples` method should be implemented, but in most cases you can choose one from `batching.py` (a sketch follows below).
That is it!
That is it!
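For illustration, a minimal `_batch_examples` sketch for a hypothetical dataset whose records are `(text_ids, mel_spec)` tuples, composed from the batchers in `batching.py`; the dataset class and record layout here are assumptions, as is the availability of `SpecBatcher` from the same module that exports `TextIDBatcher`:

```python
from parakeet.data.batch import TextIDBatcher, SpecBatcher  # assumed exports

class MyTTSDataset(object):
    # ... get_example returns (text_ids, mel_spec) ...

    def _batch_examples(self, minibatch):
        texts, mels = zip(*minibatch)
        text_batch = TextIDBatcher(pad_id=0)(list(texts))   # (B, T_text)
        mel_batch = SpecBatcher(pad_value=0.)(list(mels))   # (B, F, T_frame)
        return text_batch, mel_batch
```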
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import numpy as np
import pandas as pd
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import pandas as pd
from ruamel.yaml import YAML
......@@ -11,23 +25,25 @@ from parakeet.data.dataset import Dataset
from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, WavBatcher
class VCTK(Dataset):
def __init__(self, root):
assert isinstance(root, (str, Path)), "root should be a string or Path object"
assert isinstance(root, (
str, Path)), "root should be a string or Path object"
self.root = root if isinstance(root, Path) else Path(root)
self.text_root = self.root.joinpath("txt")
self.wav_root = self.root.joinpath("wav48")
if not (self.root.joinpath("metadata.csv").exists() and
if not (self.root.joinpath("metadata.csv").exists() and
self.root.joinpath("speaker_indices.yaml").exists()):
self._prepare_metadata()
self.speaker_indices, self.metadata = self._load_metadata()
def _load_metadata(self):
yaml=YAML(typ='safe')
yaml = YAML(typ='safe')
speaker_indices = yaml.load(self.root.joinpath("speaker_indices.yaml"))
metadata = pd.read_csv(self.root.joinpath("metadata.csv"),
sep="|", quoting=3, header=1)
metadata = pd.read_csv(
self.root.joinpath("metadata.csv"), sep="|", quoting=3, header=1)
return speaker_indices, metadata
def _prepare_metadata(self):
......@@ -41,15 +57,19 @@ class VCTK(Dataset):
with io.open(str(text_file)) as f:
transcription = f.read().strip()
wav_file = text_file.with_suffix(".wav")
metadata.append((wav_file.name, speaker_folder.name, transcription))
metadata = pd.DataFrame.from_records(metadata,
columns=["wave_file", "speaker", "text"])
metadata.append(
(wav_file.name, speaker_folder.name, transcription))
metadata = pd.DataFrame.from_records(
metadata, columns=["wave_file", "speaker", "text"])
# save them
yaml=YAML(typ='safe')
yaml = YAML(typ='safe')
yaml.dump(speaker_to_index, self.root.joinpath("speaker_indices.yaml"))
metadata.to_csv(self.root.joinpath("metadata.csv"),
sep="|", quoting=3, index=False)
metadata.to_csv(
self.root.joinpath("metadata.csv"),
sep="|",
quoting=3,
index=False)
def _get_example(self, metadatum):
wave_file, speaker, text = metadatum
......@@ -77,5 +97,3 @@ class VCTK(Dataset):
speaker_batch = np.array(speaker_batch)
phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
return wav_batch, speaker_batch, phoneme_batch
\ No newline at end of file
# coding: utf-8
"""Text processing frontend
All frontend modules should have the following functions:
......
......@@ -32,6 +32,3 @@ def text_to_sequence(text, p=0.0):
from ..text import text_to_sequence
text = text_to_sequence(text, ["english_cleaners"])
return text
......@@ -12,6 +12,3 @@ def text_to_sequence(text, p=0.0):
from ..text import text_to_sequence
text = text_to_sequence(text, ["basic_cleaners"])
return text
# coding: utf-8
import MeCab
import jaconv
from random import random
......@@ -30,9 +29,9 @@ def _yomi(mecab_result):
def _mix_pronunciation(tokens, yomis, p):
return "".join(
yomis[idx] if yomis[idx] is not None and random() < p else tokens[idx]
for idx in range(len(tokens)))
return "".join(yomis[idx]
if yomis[idx] is not None and random() < p else tokens[idx]
for idx in range(len(tokens)))
def mix_pronunciation(text, p):
......@@ -59,8 +58,7 @@ def normalize_delimitor(text):
def text_to_sequence(text, p=0.0):
for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】",
"(", ")", "(", ")"]:
for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", "(", ")", "(", ")"]:
text = text.replace(c, "")
text = text.replace("!", "!")
text = text.replace("?", "?")
......
# coding: utf-8
from random import random
n_vocab = 0xffff
......@@ -13,5 +12,6 @@ _tagger = None
def text_to_sequence(text, p=0.0):
return [ord(c) for c in text] + [_eos] # EOS
def sequence_to_text(seq):
return "".join(chr(n) for n in seq)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from . import cleaners
from .symbols import symbols
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
......@@ -32,7 +45,8 @@ def text_to_sequence(text, cleaner_names):
if not m:
sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
break
sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
sequence += _symbols_to_sequence(
_clean_text(m.group(1), cleaner_names))
sequence += _arpabet_to_sequence(m.group(2))
text = m.group(3)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Cleaners are transformations that run over the input text at both training and eval time.
......@@ -14,31 +27,31 @@ import re
from unidecode import unidecode
from .numbers import normalize_numbers
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
('mrs', 'misess'),
('mr', 'mister'),
('dr', 'doctor'),
('st', 'saint'),
('co', 'company'),
('jr', 'junior'),
('maj', 'major'),
('gen', 'general'),
('drs', 'doctors'),
('rev', 'reverend'),
('lt', 'lieutenant'),
('hon', 'honorable'),
('sgt', 'sergeant'),
('capt', 'captain'),
('esq', 'esquire'),
('ltd', 'limited'),
('col', 'colonel'),
('ft', 'fort'),
]]
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
for x in [
('mrs', 'misess'),
('mr', 'mister'),
('dr', 'doctor'),
('st', 'saint'),
('co', 'company'),
('jr', 'junior'),
('maj', 'major'),
('gen', 'general'),
('drs', 'doctors'),
('rev', 'reverend'),
('lt', 'lieutenant'),
('hon', 'honorable'),
('sgt', 'sergeant'),
('capt', 'captain'),
('esq', 'esquire'),
('ltd', 'limited'),
('col', 'colonel'),
('ft', 'fort'),
]]
def expand_abbreviations(text):
......
import re
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
valid_symbols = [
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
'Y', 'Z', 'ZH'
]
_valid_symbol_set = set(valid_symbols)
......@@ -24,7 +38,10 @@ class CMUDict:
else:
entries = _parse_cmudict(file_or_path)
if not keep_ambiguous:
entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
entries = {
word: pron
for word, pron in entries.items() if len(pron) == 1
}
self._entries = entries
def __len__(self):
......
......@@ -3,7 +3,6 @@
import inflect
import re
_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
......@@ -56,7 +55,8 @@ def _expand_number(m):
elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + ' hundred'
else:
return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
return _inflect.number_to_words(
num, andword='', zero='oh', group=2).replace(', ', ' ')
else:
return _inflect.number_to_words(num, andword='')
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Defines the set of symbols used in text input to the model.
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.models.deepvoice3.encoder import Encoder, ConvSpec
from parakeet.models.deepvoice3.decoder import Decoder, WindowRange
from parakeet.models.deepvoice3.converter import Converter
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from collections import namedtuple
from paddle import fluid
......@@ -19,23 +33,19 @@ class Attention(dg.Layer):
value_projection=True):
super(Attention, self).__init__()
std = np.sqrt(1 / query_dim)
self.query_proj = Linear(query_dim,
embed_dim,
param_attr=I.Normal(scale=std))
self.query_proj = Linear(
query_dim, embed_dim, param_attr=I.Normal(scale=std))
if key_projection:
std = np.sqrt(1 / embed_dim)
self.key_proj = Linear(embed_dim,
embed_dim,
param_attr=I.Normal(scale=std))
self.key_proj = Linear(
embed_dim, embed_dim, param_attr=I.Normal(scale=std))
if value_projection:
std = np.sqrt(1 / embed_dim)
self.value_proj = Linear(embed_dim,
embed_dim,
param_attr=I.Normal(scale=std))
self.value_proj = Linear(
embed_dim, embed_dim, param_attr=I.Normal(scale=std))
std = np.sqrt(1 / embed_dim)
self.out_proj = Linear(embed_dim,
query_dim,
param_attr=I.Normal(scale=std))
self.out_proj = Linear(
embed_dim, query_dim, param_attr=I.Normal(scale=std))
self.key_projection = key_projection
self.value_projection = value_projection
......@@ -102,9 +112,8 @@ class Attention(dg.Layer):
x = F.softmax(x)
attn_scores = x
x = F.dropout(x,
self.dropout,
dropout_implementation="upscale_in_train")
x = F.dropout(
x, self.dropout, dropout_implementation="upscale_in_train")
x = F.matmul(x, values)
encoder_length = keys.shape[1]
# CAUTION: is it wrong? let it be now
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid
......@@ -15,6 +29,7 @@ class Conv1DGLU(dg.Layer):
has a residual connection from the input x, and scales the output by
np.sqrt(0.5).
"""
def __init__(self,
n_speakers,
speaker_dim,
......@@ -50,20 +65,20 @@ class Conv1DGLU(dg.Layer):
), "this block uses residual connection"\
"the input_channes should equals num_filters"
std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
self.conv = Conv1DCell(in_channels,
2 * num_filters,
filter_size,
dilation,
causal,
param_attr=I.Normal(scale=std))
self.conv = Conv1DCell(
in_channels,
2 * num_filters,
filter_size,
dilation,
causal,
param_attr=I.Normal(scale=std))
if n_speakers > 1:
assert (speaker_dim is not None
), "speaker embed should not be null in multi-speaker case"
std = np.sqrt(1 / speaker_dim)
self.fc = Linear(speaker_dim,
num_filters,
param_attr=I.Normal(scale=std))
self.fc = Linear(
speaker_dim, num_filters, param_attr=I.Normal(scale=std))
def forward(self, x, speaker_embed=None):
"""
......@@ -82,9 +97,8 @@ class Conv1DGLU(dg.Layer):
C_out means the output channels of Conv1DGLU.
"""
residual = x
x = F.dropout(x,
self.dropout,
dropout_implementation="upscale_in_train")
x = F.dropout(
x, self.dropout, dropout_implementation="upscale_in_train")
x = self.conv(x)
content, gate = F.split(x, num_or_sections=2, dim=1)
......@@ -118,9 +132,8 @@ class Conv1DGLU(dg.Layer):
C_out means the output channels of Conv1DGLU.
"""
residual = x_t
x_t = F.dropout(x_t,
self.dropout,
dropout_implementation="upscale_in_train")
x_t = F.dropout(
x_t, self.dropout, dropout_implementation="upscale_in_train")
x_t = self.conv.add_input(x_t)
content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from itertools import chain
......@@ -19,44 +33,45 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
2,
stride=2,
param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout),
Conv1DTranspose(
Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
2,
stride=2,
param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
3,
dilation=1,
std_mul=1.,
dropout=dropout), Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout), Conv1DTranspose(
target_channels,
target_channels,
2,
stride=2,
param_attr=I.Normal(scale=np.sqrt(
4. / (2 * target_channels)))), Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout), Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
]
return upsampling_convolutions
......@@ -69,36 +84,38 @@ def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
2,
stride=2,
param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels)))),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout), Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
]
return upsampling_convolutions
def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout):
upsampling_convolutions = [
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
]
return upsampling_convolutions
......@@ -108,6 +125,7 @@ class Converter(dg.Layer):
Vocoder that transforms mel spectrogram (or decoder hidden states)
to waveform.
"""
def __init__(self,
n_speakers,
speaker_dim,
......@@ -161,33 +179,36 @@ class Converter(dg.Layer):
std = np.sqrt(std_mul / in_channels)
# CAUTION: relu
self.convolutions.append(
Conv1D(in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
Conv1D(
in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
in_channels = out_channels
std_mul = 2.0
self.convolutions.append(
Conv1DGLU(n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation=dilation,
std_mul=std_mul,
dropout=dropout))
Conv1DGLU(
n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation=dilation,
std_mul=std_mul,
dropout=dropout))
in_channels = out_channels
std_mul = 4.0
# final conv proj, channel transformed to linear dim
std = np.sqrt(std_mul * (1 - dropout) / in_channels)
# CAUTION: sigmoid
self.last_conv_proj = Conv1D(in_channels,
linear_dim,
1,
act="sigmoid",
param_attr=I.Normal(scale=std))
self.last_conv_proj = Conv1D(
in_channels,
linear_dim,
1,
act="sigmoid",
param_attr=I.Normal(scale=std))
def forward(self, x, speaker_embed=None):
"""
......@@ -229,4 +250,4 @@ class Converter(dg.Layer):
out = self.last_conv_proj(x)
out = F.transpose(out, [0, 2, 1])
return out
\ No newline at end of file
return out
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid.layers as F
import paddle.fluid.initializer as I
......@@ -80,25 +94,25 @@ def unfold_adjacent_frames(folded_frames, r):
class Decoder(dg.Layer):
def __init__(
self,
n_speakers,
speaker_dim,
embed_dim,
mel_dim,
r=1,
max_positions=512,
padding_idx=None, # remove it!
preattention=(ConvSpec(128, 5, 1), ) * 4,
convolutions=(ConvSpec(128, 5, 1), ) * 4,
attention=True,
dropout=0.0,
use_memory_mask=False,
force_monotonic_attention=False,
query_position_rate=1.0,
key_position_rate=1.0,
window_range=WindowRange(-1, 3),
key_projection=True,
value_projection=True):
self,
n_speakers,
speaker_dim,
embed_dim,
mel_dim,
r=1,
max_positions=512,
padding_idx=None, # remove it!
preattention=(ConvSpec(128, 5, 1), ) * 4,
convolutions=(ConvSpec(128, 5, 1), ) * 4,
attention=True,
dropout=0.0,
use_memory_mask=False,
force_monotonic_attention=False,
query_position_rate=1.0,
key_position_rate=1.0,
window_range=WindowRange(-1, 3),
key_projection=True,
value_projection=True):
super(Decoder, self).__init__()
self.dropout = dropout
......@@ -111,23 +125,17 @@ class Decoder(dg.Layer):
conv_channels = convolutions[0].out_channels
# only when padding idx is 0 can we easily handle it
self.embed_keys_positions = PositionEmbedding(max_positions,
embed_dim,
padding_idx=0)
self.embed_query_positions = PositionEmbedding(max_positions,
conv_channels,
padding_idx=0)
self.embed_keys_positions = PositionEmbedding(
max_positions, embed_dim, padding_idx=0)
self.embed_query_positions = PositionEmbedding(
max_positions, conv_channels, padding_idx=0)
if n_speakers > 1:
std = np.sqrt((1 - dropout) / speaker_dim)
self.speaker_proj1 = Linear(speaker_dim,
1,
act="sigmoid",
param_attr=I.Normal(scale=std))
self.speaker_proj2 = Linear(speaker_dim,
1,
act="sigmoid",
param_attr=I.Normal(scale=std))
self.speaker_proj1 = Linear(
speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))
self.speaker_proj2 = Linear(
speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))
# prenet
self.prenet = dg.LayerList()
......@@ -138,24 +146,26 @@ class Decoder(dg.Layer):
# conv1d & relu
std = np.sqrt(std_mul / in_channels)
self.prenet.append(
Conv1D(in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
Conv1D(
in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
in_channels = out_channels
std_mul = 2.0
self.prenet.append(
Conv1DGLU(n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=True,
residual=True))
Conv1DGLU(
n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=True,
residual=True))
in_channels = out_channels
std_mul = 4.0
......@@ -184,16 +194,17 @@ class Decoder(dg.Layer):
assert (
in_channels == out_channels
), "the stack of convolution & attention does not change channels"
conv_layer = Conv1DGLU(n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=True,
residual=False)
conv_layer = Conv1DGLU(
n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=True,
residual=False)
attn_layer = Attention(
out_channels,
embed_dim,
......@@ -211,10 +222,8 @@ class Decoder(dg.Layer):
# 1 * 1 conv to transform channels
std = np.sqrt(std_mul * (1 - dropout) / in_channels)
self.last_conv = Conv1D(in_channels,
mel_dim * r,
1,
param_attr=I.Normal(scale=std))
self.last_conv = Conv1D(
in_channels, mel_dim * r, 1, param_attr=I.Normal(scale=std))
# mel (before sigmoid) to done hat
std = np.sqrt(1 / in_channels)
......@@ -308,9 +317,8 @@ class Decoder(dg.Layer):
# (B, C, T)
frames = F.transpose(frames, [0, 2, 1])
x = frames
x = F.dropout(x,
self.dropout,
dropout_implementation="upscale_in_train")
x = F.dropout(
x, self.dropout, dropout_implementation="upscale_in_train")
# Prenet
for layer in self.prenet:
if isinstance(layer, Conv1DGLU):
......@@ -408,14 +416,13 @@ class Decoder(dg.Layer):
test_inputs = fold_adjacent_frames(test_inputs, self.r)
test_inputs = F.transpose(test_inputs, [0, 2, 1])
initial_input = F.zeros((batch_size, self.mel_dim * self.r, 1),
dtype=keys.dtype)
initial_input = F.zeros(
(batch_size, self.mel_dim * self.r, 1), dtype=keys.dtype)
t = 0 # decoder time step
while True:
frame_pos = F.fill_constant((batch_size, 1),
value=t + 1,
dtype="int64")
frame_pos = F.fill_constant(
(batch_size, 1), value=t + 1, dtype="int64")
w = self.query_position_rate
if self.n_speakers > 1:
w = w * F.squeeze(self.speaker_proj2(speaker_embed), [-1])
......@@ -433,9 +440,8 @@ class Decoder(dg.Layer):
current_input = initial_input
x_t = current_input
x_t = F.dropout(x_t,
self.dropout,
dropout_implementation="upscale_in_train")
x_t = F.dropout(
x_t, self.dropout, dropout_implementation="upscale_in_train")
# Prenet
for layer in self.prenet:
......@@ -453,15 +459,15 @@ class Decoder(dg.Layer):
x_t = F.transpose(x_t, [0, 2, 1])
if frame_pos_embed is not None:
x_t += frame_pos_embed
x_t, attn_scores = attn(
x_t, (keys, values), mask,
last_attended[i] if test_inputs is None else None)
x_t, attn_scores = attn(x_t, (keys, values), mask,
last_attended[i]
if test_inputs is None else None)
x_t = F.transpose(x_t, [0, 2, 1])
step_attn_scores.append(attn_scores) #(B, T_dec=1, T_enc)
# update last attended when necessary
if self.force_monotonic_attention[i]:
last_attended[i] = np.argmax(attn_scores.numpy(),
axis=-1)[0][0]
last_attended[i] = np.argmax(
attn_scores.numpy(), axis=-1)[0][0]
x_t = F.scale(residual + x_t, np.sqrt(0.5))
if len(step_attn_scores):
# (B, 1, T_enc) again
......@@ -485,8 +491,8 @@ class Decoder(dg.Layer):
t += 1
if test_inputs is None:
if F.reduce_min(done_t).numpy(
)[0] > 0.5 and t > self.min_decoder_steps:
if F.reduce_min(done_t).numpy()[
0] > 0.5 and t > self.min_decoder_steps:
break
elif t > self.max_decoder_steps:
break
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from collections import namedtuple
......@@ -33,14 +47,16 @@ class Encoder(dg.Layer):
self.dropout = dropout
if n_speakers > 1:
std = np.sqrt((1 - dropout) / speaker_dim)
self.sp_proj1 = Linear(speaker_dim,
embed_dim,
act="softsign",
param_attr=I.Normal(scale=std))
self.sp_proj2 = Linear(speaker_dim,
embed_dim,
act="softsign",
param_attr=I.Normal(scale=std))
self.sp_proj1 = Linear(
speaker_dim,
embed_dim,
act="softsign",
param_attr=I.Normal(scale=std))
self.sp_proj2 = Linear(
speaker_dim,
embed_dim,
act="softsign",
param_attr=I.Normal(scale=std))
self.n_speakers = n_speakers
self.convolutions = dg.LayerList()
......@@ -51,31 +67,34 @@ class Encoder(dg.Layer):
if in_channels != out_channels:
std = np.sqrt(std_mul / in_channels)
self.convolutions.append(
Conv1D(in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
Conv1D(
in_channels,
out_channels,
1,
act="relu",
param_attr=I.Normal(scale=std)))
in_channels = out_channels
std_mul = 2.0
self.convolutions.append(
Conv1DGLU(n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=False,
residual=True))
Conv1DGLU(
n_speakers,
speaker_dim,
in_channels,
out_channels,
filter_size,
dilation,
std_mul,
dropout,
causal=False,
residual=True))
in_channels = out_channels
std_mul = 4.0
std = np.sqrt(std_mul * (1 - dropout) / in_channels)
self.convolutions.append(
Conv1D(in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
Conv1D(
in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
def forward(self, x, speaker_embed=None):
"""
......@@ -96,9 +115,8 @@ class Encoder(dg.Layer):
representation for values.
"""
x = self.embed(x)
x = F.dropout(x,
self.dropout,
dropout_implementation="upscale_in_train")
x = F.dropout(
x, self.dropout, dropout_implementation="upscale_in_train")
x = F.transpose(x, [0, 2, 1])
if self.n_speakers > 1 and speaker_embed is not None:
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from numba import jit
......@@ -31,9 +45,7 @@ def guided_attention(N, max_N, T, max_T, g):
return W
def guided_attentions(encoder_lengths,
decoder_lengths,
max_decoder_len,
def guided_attentions(encoder_lengths, decoder_lengths, max_decoder_len,
g=0.2):
B = len(encoder_lengths)
max_input_len = encoder_lengths.max()
......@@ -93,9 +105,8 @@ class TTSLoss(object):
def binary_divergence(self, prediction, target, mask):
flattened_prediction = F.reshape(prediction, [-1, 1])
flattened_target = F.reshape(target, [-1, 1])
flattened_loss = F.log_loss(flattened_prediction,
flattened_target,
epsilon=1e-8)
flattened_loss = F.log_loss(
flattened_prediction, flattened_target, epsilon=1e-8)
bin_div = fluid.layers.reshape(flattened_loss, prediction.shape)
w = self.masked_weight
......@@ -163,23 +174,20 @@ class TTSLoss(object):
max_mel_steps = max_frames // self.downsample_factor
max_decoder_steps = max_mel_steps // self.r
decoder_mask = F.sequence_mask(n_frames // self.downsample_factor //
self.r,
max_decoder_steps,
dtype="float32")
mel_mask = F.sequence_mask(n_frames // self.downsample_factor,
max_mel_steps,
dtype="float32")
decoder_mask = F.sequence_mask(
n_frames // self.downsample_factor // self.r,
max_decoder_steps,
dtype="float32")
mel_mask = F.sequence_mask(
n_frames // self.downsample_factor, max_mel_steps, dtype="float32")
lin_mask = F.sequence_mask(n_frames, max_frames, dtype="float32")
if compute_lin_loss:
lin_hyp = lin_hyp[:, :-self.time_shift, :]
lin_ref = lin_ref[:, self.time_shift:, :]
lin_mask = lin_mask[:, self.time_shift:, :]
lin_l1_loss = self.l1_loss(lin_hyp,
lin_ref,
lin_mask,
priority_bin=self.priority_bin)
lin_l1_loss = self.l1_loss(
lin_hyp, lin_ref, lin_mask, priority_bin=self.priority_bin)
lin_bce_loss = self.binary_divergence(lin_hyp, lin_ref, lin_mask)
lin_loss = self.binary_divergence_weight * lin_bce_loss \
+ (1 - self.binary_divergence_weight) * lin_l1_loss
......@@ -197,9 +205,10 @@ class TTSLoss(object):
total_loss += mel_loss
if compute_attn_loss:
attn_loss = self.attention_loss(
attn_hyp, input_lengths.numpy(),
n_frames.numpy() // (self.downsample_factor * self.r))
attn_loss = self.attention_loss(attn_hyp,
input_lengths.numpy(),
n_frames.numpy() //
(self.downsample_factor * self.r))
total_loss += attn_loss
if compute_done_loss:
......
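A note on the mask construction in the hunk above: the raw frame count is divided first by `downsample_factor` and then by `r` (frames emitted per decoder step), so the three masks cover different time resolutions. A small worked example with hypothetical values (the real values come from the config, not from this diff):

```python
# hypothetical settings: downsample_factor=4, r=4, an utterance of 800 frames
n_frames = 800
downsample_factor, r = 4, 4

mel_steps = n_frames // downsample_factor   # 200 -> length covered by mel_mask
decoder_steps = mel_steps // r              # 50  -> length covered by decoder_mask
# lin_mask covers all 800 linear-spectrogram frames.
print(mel_steps, decoder_steps)
```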
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid.layers as F
......@@ -29,9 +43,9 @@ class DeepVoice3(dg.Layer):
mel_outputs, alignments, done, decoder_states = self.decoder(
(keys, values), valid_lengths, mel_inputs, text_positions,
frame_positions, speaker_embed)
linear_outputs = self.converter(
decoder_states if self.use_decoder_states else mel_outputs,
speaker_embed)
linear_outputs = self.converter(decoder_states
if self.use_decoder_states else
mel_outputs, speaker_embed)
return mel_outputs, linear_outputs, alignments, done
def transduce(self, text_sequences, text_positions, speaker_indices=None):
......@@ -43,7 +57,7 @@ class DeepVoice3(dg.Layer):
keys, values = self.encoder(text_sequences, speaker_embed)
mel_outputs, alignments, done, decoder_states = self.decoder.decode(
(keys, values), text_positions, speaker_embed)
linear_outputs = self.converter(
decoder_states if self.use_decoder_states else mel_outputs,
speaker_embed)
linear_outputs = self.converter(decoder_states
if self.use_decoder_states else
mel_outputs, speaker_embed)
return mel_outputs, linear_outputs, alignments, done
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid
import paddle.fluid.layers as F
......@@ -95,10 +109,11 @@ class PositionEmbedding(dg.Layer):
speaker_position_rate) # (B, V, C)
# make indices for gather_nd
batch_id = F.expand(
F.unsqueeze(F.range(0, batch_size, 1, dtype="int64"), [1]),
[1, time_steps])
F.unsqueeze(
F.range(
0, batch_size, 1, dtype="int64"), [1]), [1, time_steps])
# (B, T, 2)
gather_nd_id = F.stack([batch_id, indices], -1)
out = F.gather_nd(weight, gather_nd_id)
return out
\ No newline at end of file
return out
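The `batch_id` / `gather_nd` construction above is simply per-batch fancy indexing into the (speaker-rate-scaled) position-embedding table. A minimal NumPy equivalent, with illustrative shapes only:

```python
import numpy as np

B, T, V, C = 2, 3, 5, 4                        # batch, time, positions, channels
weight = np.random.rand(B, V, C)               # per-utterance embedding table, (B, V, C)
indices = np.random.randint(0, V, size=(B, T))  # position index per time step, (B, T)

# same result as F.gather_nd(weight, F.stack([batch_id, indices], -1))
batch_id = np.broadcast_to(np.arange(B)[:, None], (B, T))
out = weight[batch_id, indices]                # (B, T, C)
print(out.shape)
```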
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
from parakeet.models.fastspeech.fft_block import FFTBlock
class Decoder(dg.Layer):
def __init__(self,
len_max_seq,
......@@ -18,16 +32,29 @@ class Decoder(dg.Layer):
super(Decoder, self).__init__()
n_position = len_max_seq + 1
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding(size=[n_position, d_model],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
self.pos_inp = get_sinusoid_encoding_table(
n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding(
size=[n_position, d_model],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
self.pos_inp),
trainable=False))
self.layer_stack = [
FFTBlock(
d_model,
d_inner,
n_head,
d_k,
d_v,
fft_conv1d_kernel,
fft_conv1d_padding,
dropout=dropout) for _ in range(n_layers)
]
for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer)
def forward(self, enc_seq, enc_pos):
"""
Decoder layer of FastSpeech.
......@@ -57,4 +84,4 @@ class Decoder(dg.Layer):
slf_attn_mask=slf_attn_mask)
dec_slf_attn_list += [dec_slf_attn]
return dec_output, dec_slf_attn_list
\ No newline at end of file
return dec_output, dec_slf_attn_list
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
from parakeet.models.fastspeech.fft_block import FFTBlock
class Encoder(dg.Layer):
def __init__(self,
n_src_vocab,
......@@ -19,14 +33,28 @@ class Encoder(dg.Layer):
super(Encoder, self).__init__()
n_position = len_max_seq + 1
self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0)
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding(size=[n_position, d_model],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
self.src_word_emb = dg.Embedding(
size=[n_src_vocab, d_model], padding_idx=0)
self.pos_inp = get_sinusoid_encoding_table(
n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding(
size=[n_position, d_model],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
self.pos_inp),
trainable=False))
self.layer_stack = [
FFTBlock(
d_model,
d_inner,
n_head,
d_k,
d_v,
fft_conv1d_kernel,
fft_conv1d_padding,
dropout=dropout) for _ in range(n_layers)
]
for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer)
......@@ -52,7 +80,8 @@ class Encoder(dg.Layer):
non_pad_mask = get_non_pad_mask(character)
# -- Forward
enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C)
enc_output = self.src_word_emb(character) + self.position_enc(
text_pos) #(N, T, C)
for enc_layer in self.layer_stack:
enc_output, enc_slf_attn = enc_layer(
......@@ -60,5 +89,5 @@ class Encoder(dg.Layer):
non_pad_mask=non_pad_mask,
slf_attn_mask=slf_attn_mask)
enc_slf_attn_list += [enc_slf_attn]
return enc_output, non_pad_mask, enc_slf_attn_list
\ No newline at end of file
return enc_output, non_pad_mask, enc_slf_attn_list
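Both the FastSpeech encoder and decoder load a frozen `position_enc` embedding from `get_sinusoid_encoding_table`, whose body is not part of this diff. Presumably it builds the standard Transformer sinusoid table; a sketch under that assumption (not the repository's exact implementation):

```python
import numpy as np

def sinusoid_table(n_position, d_model, padding_idx=0):
    pos = np.arange(n_position)[:, None]                     # (n_position, 1)
    i = np.arange(d_model)[None, :]                          # (1, d_model)
    angle = pos / np.power(10000, 2 * (i // 2) / d_model)
    table = np.zeros((n_position, d_model))
    table[:, 0::2] = np.sin(angle[:, 0::2])                  # even channels
    table[:, 1::2] = np.cos(angle[:, 1::2])                  # odd channels
    table[padding_idx] = 0.0                                 # padding position stays zero
    return table.astype("float32")
```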
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
......@@ -7,54 +20,67 @@ from parakeet.models.fastspeech.length_regulator import LengthRegulator
from parakeet.models.fastspeech.encoder import Encoder
from parakeet.models.fastspeech.decoder import Decoder
class FastSpeech(dg.Layer):
def __init__(self, cfg):
" FastSpeech"
super(FastSpeech, self).__init__()
self.encoder = Encoder(n_src_vocab=len(symbols)+1,
len_max_seq=cfg['max_seq_len'],
n_layers=cfg['encoder_n_layer'],
n_head=cfg['encoder_head'],
d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
d_model=cfg['fs_hidden_size'],
d_inner=cfg['encoder_conv1d_filter_size'],
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
fft_conv1d_padding=cfg['fft_conv1d_padding'],
dropout=0.1)
self.length_regulator = LengthRegulator(input_size=cfg['fs_hidden_size'],
out_channels=cfg['duration_predictor_output_size'],
filter_size=cfg['duration_predictor_filter_size'],
dropout=cfg['dropout'])
self.decoder = Decoder(len_max_seq=cfg['max_seq_len'],
n_layers=cfg['decoder_n_layer'],
n_head=cfg['decoder_head'],
d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
d_model=cfg['fs_hidden_size'],
d_inner=cfg['decoder_conv1d_filter_size'],
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
fft_conv1d_padding=cfg['fft_conv1d_padding'],
dropout=0.1)
self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
self.encoder = Encoder(
n_src_vocab=len(symbols) + 1,
len_max_seq=cfg['max_seq_len'],
n_layers=cfg['encoder_n_layer'],
n_head=cfg['encoder_head'],
d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
d_model=cfg['fs_hidden_size'],
d_inner=cfg['encoder_conv1d_filter_size'],
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
fft_conv1d_padding=cfg['fft_conv1d_padding'],
dropout=0.1)
self.length_regulator = LengthRegulator(
input_size=cfg['fs_hidden_size'],
out_channels=cfg['duration_predictor_output_size'],
filter_size=cfg['duration_predictor_filter_size'],
dropout=cfg['dropout'])
self.decoder = Decoder(
len_max_seq=cfg['max_seq_len'],
n_layers=cfg['decoder_n_layer'],
n_head=cfg['decoder_head'],
d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
d_model=cfg['fs_hidden_size'],
d_inner=cfg['decoder_conv1d_filter_size'],
fft_conv1d_kernel=cfg['fft_conv1d_filter'],
fft_conv1d_padding=cfg['fft_conv1d_padding'],
dropout=0.1)
self.weight = fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer())
k = math.sqrt(1 / cfg['fs_hidden_size'])
self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
self.mel_linear = dg.Linear(cfg['fs_hidden_size'],
cfg['audio']['num_mels']* cfg['audio']['outputs_per_step'],
param_attr = self.weight,
bias_attr = self.bias,)
self.postnet = PostConvNet(n_mels=cfg['audio']['num_mels'],
num_hidden=512,
filter_size=5,
padding=int(5 / 2),
num_conv=5,
outputs_per_step=cfg['audio']['outputs_per_step'],
use_cudnn=True,
dropout=0.1,
batchnorm_last=True)
self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k))
self.mel_linear = dg.Linear(
cfg['fs_hidden_size'],
cfg['audio']['num_mels'] * cfg['audio']['outputs_per_step'],
param_attr=self.weight,
bias_attr=self.bias, )
self.postnet = PostConvNet(
n_mels=cfg['audio']['num_mels'],
num_hidden=512,
filter_size=5,
padding=int(5 / 2),
num_conv=5,
outputs_per_step=cfg['audio']['outputs_per_step'],
use_cudnn=True,
dropout=0.1,
batchnorm_last=True)
def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0):
def forward(self,
character,
text_pos,
mel_pos=None,
length_target=None,
alpha=1.0):
"""
FastSpeech model.
......@@ -80,22 +106,25 @@ class FastSpeech(dg.Layer):
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
"""
encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos)
encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(
character, text_pos)
if fluid.framework._dygraph_tracer()._train_mode:
length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output,
target=length_target,
alpha=alpha)
decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos)
length_regulator_output, duration_predictor_output = self.length_regulator(
encoder_output, target=length_target, alpha=alpha)
decoder_output, dec_slf_attn_list = self.decoder(
length_regulator_output, mel_pos)
mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output
return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list
else:
length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha)
decoder_output, _ = self.decoder(length_regulator_output, decoder_pos)
length_regulator_output, decoder_pos = self.length_regulator(
encoder_output, alpha=alpha)
decoder_output, _ = self.decoder(length_regulator_output,
decoder_pos)
mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output
return mel_output, mel_output_postnet
\ No newline at end of file
return mel_output, mel_output_postnet
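Based on the `forward` signature above, training feeds teacher durations and mel positions while inference only needs character ids and text positions (durations are predicted internally and rescaled by `alpha`). A hedged sketch of the two call patterns; all names are placeholders, and which branch actually runs is decided by the dygraph tracer's train mode, not by the arguments:

```python
def run_fastspeech(model, character, text_pos,
                   mel_pos=None, length_target=None, alpha=1.0):
    """Illustrative call patterns for the FastSpeech forward above."""
    if mel_pos is not None and length_target is not None:
        # training-style call: returns mel, postnet mel, predicted durations
        # and the encoder/decoder self-attention lists
        return model(character, text_pos,
                     mel_pos=mel_pos, length_target=length_target)
    # inference: alpha scales the predicted durations (speaking rate)
    return model(character, text_pos, alpha=alpha)
```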
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
import paddle.fluid.dygraph as dg
......@@ -6,11 +19,32 @@ import paddle.fluid as fluid
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward
class FFTBlock(dg.Layer):
def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
def __init__(self,
d_model,
d_inner,
n_head,
d_k,
d_v,
filter_size,
padding,
dropout=0.2):
super(FFTBlock, self).__init__()
self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, is_bias=True, dropout=dropout, is_concat=False)
self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout)
self.slf_attn = MultiheadAttention(
d_model,
d_k,
d_v,
num_head=n_head,
is_bias=True,
dropout=dropout,
is_concat=False)
self.pos_ffn = PositionwiseFeedForward(
d_model,
d_inner,
filter_size=filter_size,
padding=padding,
dropout=dropout)
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
"""
......@@ -27,10 +61,11 @@ class FFTBlock(dg.Layer):
output (Variable), Shape(B, T, C), the output after self-attention & ffn.
slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
"""
output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
output, slf_attn = self.slf_attn(
enc_input, enc_input, enc_input, mask=slf_attn_mask)
output *= non_pad_mask
output = self.pos_ffn(output)
output *= non_pad_mask
return output, slf_attn
\ No newline at end of file
return output, slf_attn
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
import parakeet.models.fastspeech.utils
......@@ -6,47 +19,50 @@ import paddle.fluid.layers as layers
import paddle.fluid as fluid
from parakeet.modules.customized import Conv1D
class LengthRegulator(dg.Layer):
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
super(LengthRegulator, self).__init__()
self.duration_predictor = DurationPredictor(input_size=input_size,
out_channels=out_channels,
filter_size=filter_size,
dropout=dropout)
self.duration_predictor = DurationPredictor(
input_size=input_size,
out_channels=out_channels,
filter_size=filter_size,
dropout=dropout)
def LR(self, x, duration_predictor_output, alpha=1.0):
output = []
batch_size = x.shape[0]
for i in range(batch_size):
output.append(self.expand(x[i:i+1], duration_predictor_output[i:i+1], alpha))
output.append(
self.expand(x[i:i + 1], duration_predictor_output[i:i + 1],
alpha))
output = self.pad(output)
return output
def pad(self, input_ele):
max_len = max([input_ele[i].shape[0] for i in range(len(input_ele))])
out_list = []
for i in range(len(input_ele)):
pad_len = max_len - input_ele[i].shape[0]
one_batch_padded = layers.pad(
input_ele[i], [0, pad_len, 0, 0], pad_value=0.0)
one_batch_padded = layers.pad(input_ele[i], [0, pad_len, 0, 0],
pad_value=0.0)
out_list.append(one_batch_padded)
out_padded = layers.stack(out_list)
return out_padded
def expand(self, batch, predicted, alpha):
out = []
time_steps = batch.shape[1]
fertilities = predicted.numpy()
batch = layers.squeeze(batch,[0])
batch = layers.squeeze(batch, [0])
for i in range(time_steps):
if fertilities[0,i]==0:
if fertilities[0, i] == 0:
continue
out.append(layers.expand(batch[i: i + 1, :], [int(fertilities[0,i]), 1]))
out.append(
layers.expand(batch[i:i + 1, :], [int(fertilities[0, i]), 1]))
out = layers.concat(out, axis=0)
return out
def forward(self, x, alpha=1.0, target=None):
"""
......@@ -70,10 +86,11 @@ class LengthRegulator(dg.Layer):
else:
duration_predictor_output = layers.round(duration_predictor_output)
output = self.LR(x, duration_predictor_output, alpha)
mel_pos = dg.to_variable(np.arange(1, output.shape[1]+1))
mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1))
mel_pos = layers.unsqueeze(mel_pos, [0])
return output, mel_pos
class DurationPredictor(dg.Layer):
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
super(DurationPredictor, self).__init__()
......@@ -83,30 +100,38 @@ class DurationPredictor(dg.Layer):
self.dropout = dropout
k = math.sqrt(1 / self.input_size)
self.conv1 = Conv1D(num_channels = self.input_size,
num_filters = self.out_channels,
filter_size = self.filter_size,
padding=1,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
#data_format='NTC')
self.conv1 = Conv1D(
num_channels=self.input_size,
num_filters=self.out_channels,
filter_size=self.filter_size,
padding=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
#data_format='NTC')
k = math.sqrt(1 / self.out_channels)
self.conv2 = Conv1D(num_channels = self.out_channels,
num_filters = self.out_channels,
filter_size = self.filter_size,
padding=1,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
#data_format='NTC')
self.conv2 = Conv1D(
num_channels=self.out_channels,
num_filters=self.out_channels,
filter_size=self.filter_size,
padding=1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
#data_format='NTC')
self.layer_norm1 = dg.LayerNorm(self.out_channels)
self.layer_norm2 = dg.LayerNorm(self.out_channels)
self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
self.weight = fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer())
k = math.sqrt(1 / self.out_channels)
self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k))
self.linear =dg.Linear(self.out_channels, 1, param_attr = self.weight,
bias_attr = self.bias)
self.linear = dg.Linear(
self.out_channels, 1, param_attr=self.weight, bias_attr=self.bias)
def forward(self, encoder_output):
"""
......@@ -118,18 +143,15 @@ class DurationPredictor(dg.Layer):
out (Variable), Shape(B, T, C), the output of duration predictor.
"""
# encoder_output.shape(N, T, C)
out = layers.transpose(encoder_output, [0,2,1])
out = layers.transpose(encoder_output, [0, 2, 1])
out = self.conv1(out)
out = layers.transpose(out, [0,2,1])
out = layers.transpose(out, [0, 2, 1])
out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout)
out = layers.transpose(out, [0,2,1])
out = layers.transpose(out, [0, 2, 1])
out = self.conv2(out)
out = layers.transpose(out, [0,2,1])
out = layers.transpose(out, [0, 2, 1])
out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout)
out = layers.relu(self.linear(out))
out = layers.squeeze(out, axes=[-1])
return out
return out
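`LengthRegulator.expand` repeats each encoder state by its rounded predicted duration and skips zero-duration tokens. A plain-NumPy sketch of that expansion with made-up numbers, assuming durations are already rounded to integers:

```python
import numpy as np

encoder_output = np.array([[0.1, 0.2],
                           [0.3, 0.4],
                           [0.5, 0.6]])   # (T=3, C=2), one utterance
durations = np.array([2, 0, 3])           # rounded duration per input token

expanded = []
for state, d in zip(encoder_output, durations):
    if d == 0:                            # zero-duration tokens are skipped, as in expand()
        continue
    expanded.append(np.tile(state, (int(d), 1)))
expanded = np.concatenate(expanded, axis=0)   # (sum(durations), C) = (5, 2)
print(expanded.shape)
```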
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
def get_alignment(attn_probs, mel_lens, n_head):
max_F = 0
assert attn_probs[0].shape[0] % n_head == 0
......@@ -8,27 +22,27 @@ def get_alignment(attn_probs, mel_lens, n_head):
for i in range(len(attn_probs)):
multi_attn = attn_probs[i].numpy()
for j in range(n_head):
attn = multi_attn[j*batch_size:(j+1)*batch_size]
attn = multi_attn[j * batch_size:(j + 1) * batch_size]
F = score_F(attn)
if max_F < F:
max_F = F
max_attn = attn
alignment = compute_duration(max_attn, mel_lens)
return alignment
def score_F(attn):
max = np.max(attn, axis=-1)
mean = np.mean(max)
return mean
def compute_duration(attn, mel_lens):
alignment = np.zeros([attn.shape[0],attn.shape[2]])
alignment = np.zeros([attn.shape[0], attn.shape[2]])
mel_lens = mel_lens.numpy()
for i in range(attn.shape[0]):
for j in range(mel_lens[i]):
max_index = np.argmax(attn[i,j])
alignment[i,max_index] += 1
max_index = np.argmax(attn[i, j])
alignment[i, max_index] += 1
return alignment
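The utilities above reduce a teacher model's attention maps to integer durations per input token: each mel frame "votes" for its most attended input position. A minimal NumPy sketch of the same idea with toy values (it mirrors `compute_duration` but works on plain arrays rather than dygraph variables):

```python
import numpy as np

# toy attention: batch of 1, 4 decoder (mel) frames, 3 encoder tokens
attn = np.array([[[0.9, 0.1, 0.0],
                  [0.6, 0.3, 0.1],
                  [0.2, 0.7, 0.1],
                  [0.1, 0.2, 0.7]]])      # (B, mel_T, enc_T)
mel_lens = np.array([4])

alignment = np.zeros([attn.shape[0], attn.shape[2]])
for i in range(attn.shape[0]):
    for j in range(mel_lens[i]):
        alignment[i, np.argmax(attn[i, j])] += 1

print(alignment)   # [[2. 1. 1.]] -> durations for the 3 input tokens
```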
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
......@@ -7,9 +20,16 @@ from parakeet.modules.customized import Pool1D, Conv1D
from parakeet.modules.dynamic_gru import DynamicGRU
import numpy as np
class CBHG(dg.Layer):
def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2,
max_pool_kernel_size=2, is_post=False):
def __init__(self,
hidden_size,
batch_size,
K=16,
projection_size=256,
num_gru_layers=2,
max_pool_kernel_size=2,
is_post=False):
super(CBHG, self).__init__()
"""
:param hidden_size: dimension of hidden unit
......@@ -24,28 +44,39 @@ class CBHG(dg.Layer):
self.projection_size = projection_size
self.conv_list = []
k = math.sqrt(1 / projection_size)
self.conv_list.append(Conv1D(num_channels = projection_size,
num_filters = hidden_size,
filter_size = 1,
padding = int(np.floor(1/2)),
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))))
self.conv_list.append(
Conv1D(
num_channels=projection_size,
num_filters=hidden_size,
filter_size=1,
padding=int(np.floor(1 / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k))))
k = math.sqrt(1 / hidden_size)
for i in range(2,K+1):
self.conv_list.append(Conv1D(num_channels = hidden_size,
num_filters = hidden_size,
filter_size = i,
padding = int(np.floor(i/2)),
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))))
for i in range(2, K + 1):
self.conv_list.append(
Conv1D(
num_channels=hidden_size,
num_filters=hidden_size,
filter_size=i,
padding=int(np.floor(i / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k))))
for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batchnorm_list = []
for i in range(K):
self.batchnorm_list.append(dg.BatchNorm(hidden_size,
data_layout='NCHW'))
self.batchnorm_list.append(
dg.BatchNorm(
hidden_size, data_layout='NCHW'))
for i, layer in enumerate(self.batchnorm_list):
self.add_sublayer("batchnorm_list_{}".format(i), layer)
......@@ -53,91 +84,120 @@ class CBHG(dg.Layer):
conv_outdim = hidden_size * K
k = math.sqrt(1 / conv_outdim)
self.conv_projection_1 = Conv1D(num_channels = conv_outdim,
num_filters = hidden_size,
filter_size = 3,
padding = int(np.floor(3/2)),
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
self.conv_projection_1 = Conv1D(
num_channels=conv_outdim,
num_filters=hidden_size,
filter_size=3,
padding=int(np.floor(3 / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
k = math.sqrt(1 / hidden_size)
self.conv_projection_2 = Conv1D(num_channels = hidden_size,
num_filters = projection_size,
filter_size = 3,
padding = int(np.floor(3/2)),
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
data_layout='NCHW')
self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
data_layout='NCHW')
self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
pool_type='max',
pool_stride=1,
pool_padding=1,
data_format = "NCT")
self.conv_projection_2 = Conv1D(
num_channels=hidden_size,
num_filters=projection_size,
filter_size=3,
padding=int(np.floor(3 / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW')
self.batchnorm_proj_2 = dg.BatchNorm(
projection_size, data_layout='NCHW')
self.max_pool = Pool1D(
pool_size=max_pool_kernel_size,
pool_type='max',
pool_stride=1,
pool_padding=1,
data_format="NCT")
self.highway = Highwaynet(self.projection_size)
h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
h_0 = dg.to_variable(h_0)
k = math.sqrt(1 / hidden_size)
self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
is_reverse = False,
origin_mode = True,
h_0 = h_0)
self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
is_reverse=True,
origin_mode=True,
h_0 = h_0)
self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
is_reverse = False,
origin_mode = True,
h_0 = h_0)
self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
is_reverse=True,
origin_mode=True,
h_0 = h_0)
self.fc_forward1 = dg.Linear(
hidden_size,
hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.fc_reverse1 = dg.Linear(
hidden_size,
hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.gru_forward1 = DynamicGRU(
size=self.hidden_size // 2,
is_reverse=False,
origin_mode=True,
h_0=h_0)
self.gru_reverse1 = DynamicGRU(
size=self.hidden_size // 2,
is_reverse=True,
origin_mode=True,
h_0=h_0)
self.fc_forward2 = dg.Linear(
hidden_size,
hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.fc_reverse2 = dg.Linear(
hidden_size,
hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.gru_forward2 = DynamicGRU(
size=self.hidden_size // 2,
is_reverse=False,
origin_mode=True,
h_0=h_0)
self.gru_reverse2 = DynamicGRU(
size=self.hidden_size // 2,
is_reverse=True,
origin_mode=True,
h_0=h_0)
def _conv_fit_dim(self, x, filter_size=3):
if filter_size % 2 == 0:
return x[:,:,:-1]
return x[:, :, :-1]
else:
return x
return x
def forward(self, input_):
# input_.shape = [N, C, T]
conv_list = []
conv_input = input_
for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)):
conv_input = self._conv_fit_dim(conv(conv_input), i+1)
for i, (conv, batchnorm
) in enumerate(zip(self.conv_list, self.batchnorm_list)):
conv_input = self._conv_fit_dim(conv(conv_input), i + 1)
conv_input = layers.relu(batchnorm(conv_input))
conv_list.append(conv_input)
conv_cat = layers.concat(conv_list, axis=1)
conv_pool = self.max_pool(conv_cat)[:,:,:-1]
conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool))))
conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
conv_pool = self.max_pool(conv_cat)[:, :, :-1]
conv_proj = layers.relu(
self.batchnorm_proj_1(
self._conv_fit_dim(self.conv_projection_1(conv_pool))))
conv_proj = self.batchnorm_proj_2(
self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
# conv_proj.shape = [N, C, T]
highway = layers.transpose(conv_proj, [0,2,1])
highway = layers.transpose(conv_proj, [0, 2, 1])
highway = self.highway(highway)
# highway.shape = [N, T, C]
......@@ -151,9 +211,10 @@ class CBHG(dg.Layer):
out_forward = self.gru_forward2(fc_forward)
out_reverse = self.gru_reverse2(fc_reverse)
out = layers.concat([out_forward, out_reverse], axis=-1)
out = layers.transpose(out, [0,2,1])
out = layers.transpose(out, [0, 2, 1])
return out
class Highwaynet(dg.Layer):
def __init__(self, num_units, num_layers=4):
super(Highwaynet, self).__init__()
......@@ -164,14 +225,26 @@ class Highwaynet(dg.Layer):
self.linears = []
k = math.sqrt(1 / num_units)
for i in range(num_layers):
self.linears.append(dg.Linear(num_units, num_units,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))))
self.gates.append(dg.Linear(num_units, num_units,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))))
for i, (linear, gate) in enumerate(zip(self.linears,self.gates)):
self.linears.append(
dg.Linear(
num_units,
num_units,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k))))
self.gates.append(
dg.Linear(
num_units,
num_units,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k))))
for i, (linear, gate) in enumerate(zip(self.linears, self.gates)):
self.add_sublayer("linears_{}".format(i), linear)
self.add_sublayer("gates_{}".format(i), gate)
......@@ -183,12 +256,6 @@ class Highwaynet(dg.Layer):
t_ = fluid.layers.sigmoid(gate(out))
c = 1 - t_
out = h * t_ + out * c
return out
out = h * t_ + out * c
return out
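The `Highwaynet.forward` loop above implements the usual highway gating: a candidate transform `h` and a gate `t` are mixed with the carried input. A NumPy restatement of one layer, assuming the transform uses ReLU (the line computing `h` falls outside this hunk):

```python
import numpy as np

def highway_layer(x, W_h, b_h, W_t, b_t):
    h = np.maximum(x @ W_h + b_h, 0.0)            # candidate transform (assumed ReLU)
    t = 1.0 / (1.0 + np.exp(-(x @ W_t + b_t)))    # gate in (0, 1)
    return h * t + x * (1.0 - t)                  # mix transform and carry paths
```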
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
......@@ -7,70 +20,110 @@ from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.prenet import PreNet
from parakeet.models.transformer_tts.post_convnet import PostConvNet
class Decoder(dg.Layer):
def __init__(self, num_hidden, config, num_head=4):
super(Decoder, self).__init__()
self.num_hidden = num_hidden
param = fluid.ParamAttr()
self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.decoder_prenet = PreNet(input_size = config['audio']['num_mels'],
hidden_size = num_hidden * 2,
output_size = num_hidden,
dropout_rate=0.2)
self.alpha = self.create_parameter(
shape=(1, ),
attr=param,
dtype='float32',
default_initializer=fluid.initializer.ConstantInitializer(
value=1.0))
self.pos_inp = get_sinusoid_encoding_table(
1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(
size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
self.pos_inp),
trainable=False))
self.decoder_prenet = PreNet(
input_size=config['audio']['num_mels'],
hidden_size=num_hidden * 2,
output_size=num_hidden,
dropout_rate=0.2)
k = math.sqrt(1 / num_hidden)
self.linear = dg.Linear(num_hidden, num_hidden,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.linear = dg.Linear(
num_hidden,
num_hidden,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
self.selfattn_layers = [
MultiheadAttention(num_hidden, num_hidden // num_head,
num_hidden // num_head) for _ in range(3)
]
for i, layer in enumerate(self.selfattn_layers):
self.add_sublayer("self_attn_{}".format(i), layer)
self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
self.attn_layers = [
MultiheadAttention(num_hidden, num_hidden // num_head,
num_hidden // num_head) for _ in range(3)
]
for i, layer in enumerate(self.attn_layers):
self.add_sublayer("attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
self.ffns = [
PositionwiseFeedForward(
num_hidden, num_hidden * num_head, filter_size=1)
for _ in range(3)
]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
self.mel_linear = dg.Linear(num_hidden, config['audio']['num_mels'] * config['audio']['outputs_per_step'],
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.stop_linear = dg.Linear(num_hidden, 1,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.postconvnet = PostConvNet(config['audio']['num_mels'], config['hidden_size'],
filter_size = 5, padding = 4, num_conv=5,
outputs_per_step=config['audio']['outputs_per_step'],
use_cudnn = True)
self.mel_linear = dg.Linear(
num_hidden,
config['audio']['num_mels'] * config['audio']['outputs_per_step'],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.stop_linear = dg.Linear(
num_hidden,
1,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.postconvnet = PostConvNet(
config['audio']['num_mels'],
config['hidden_size'],
filter_size=5,
padding=4,
num_conv=5,
outputs_per_step=config['audio']['outputs_per_step'],
use_cudnn=True)
def forward(self, key, value, query, c_mask, positional):
# get decoder mask with triangular matrix
if fluid.framework._dygraph_tracer()._train_mode:
m_mask = get_non_pad_mask(positional)
mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query)
triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32)
mask = get_attn_key_pad_mask((positional == 0).astype(np.float32),
query)
triu_tensor = dg.to_variable(
get_triu_tensor(query.numpy(), query.numpy())).astype(
np.float32)
mask = mask + triu_tensor
mask = fluid.layers.cast(mask == 0, np.float32)
# (batch_size, decoder_len, encoder_len)
zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query)
zero_mask = get_attn_key_pad_mask(
layers.squeeze(c_mask, [-1]), query)
else:
mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
mask = get_triu_tensor(query.numpy(),
query.numpy()).astype(np.float32)
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
m_mask, zero_mask = None, None
# Decoder pre-network
query = self.decoder_prenet(query)
# Centered position
query = self.linear(query)
......@@ -84,10 +137,13 @@ class Decoder(dg.Layer):
# Attention decoder-decoder, encoder-decoder
selfattn_list = list()
attn_list = list()
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask)
query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask)
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers,
self.ffns):
query, attn_dec = selfattn(
query, query, query, mask=mask, query_mask=m_mask)
query, attn_dot = attn(
key, value, query, mask=zero_mask, query_mask=m_mask)
query = ffn(query)
selfattn_list.append(attn_dec)
attn_list.append(attn_dot)
......@@ -96,7 +152,7 @@ class Decoder(dg.Layer):
# Post Mel Network
out = self.postconvnet(mel_out)
out = mel_out + out
# Stop tokens
stop_tokens = self.stop_linear(query)
stop_tokens = layers.squeeze(stop_tokens, [-1])
......
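In the training branch of the decoder above, `get_triu_tensor` (defined in `utils`, not shown in this diff) supplies the causal part of the decoder mask, which is then added to the padding mask and flipped so that allowed positions become 1. A NumPy sketch of the causal piece, assuming the helper returns ones strictly above the diagonal:

```python
import numpy as np

T = 4                                          # decoder length
triu = np.triu(np.ones((T, T)), k=1)           # 1 above the diagonal = future positions
mask = (triu == 0).astype(np.float32)          # 1 where attention is allowed
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]
```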
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
......@@ -5,25 +18,41 @@ from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet
class Encoder(dg.Layer):
def __init__(self, embedding_size, num_hidden, num_head=4):
super(Encoder, self).__init__()
self.num_hidden = num_hidden
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
num_hidden = num_hidden,
use_cudnn=True)
self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=1.0))
self.alpha = self.create_parameter(
shape=(1, ), attr=param, dtype='float32')
self.pos_inp = get_sinusoid_encoding_table(
1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(
size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
self.pos_inp),
trainable=False))
self.encoder_prenet = EncoderPrenet(
embedding_size=embedding_size,
num_hidden=num_hidden,
use_cudnn=True)
self.layers = [
MultiheadAttention(num_hidden, num_hidden // num_head,
num_hidden // num_head) for _ in range(3)
]
for i, layer in enumerate(self.layers):
self.add_sublayer("self_attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = True) for _ in range(3)]
self.ffns = [
PositionwiseFeedForward(
num_hidden,
num_hidden * num_head,
filter_size=1,
use_cudnn=True) for _ in range(3)
]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
......@@ -33,25 +62,23 @@ class Encoder(dg.Layer):
mask = get_attn_key_pad_mask(positional, x)
else:
query_mask, mask = None, None
# Encoder pre_network
x = self.encoder_prenet(x) #(N,T,C)
x = self.encoder_prenet(x) #(N,T,C)
# Get positional encoding
positional = self.pos_emb(positional)
x = positional * self.alpha + x #(N, T, C)
positional = self.pos_emb(positional)
x = positional * self.alpha + x #(N, T, C)
# Positional dropout
x = layers.dropout(x, 0.1)
# Self attention encoder
attentions = list()
for layer, ffn in zip(self.layers, self.ffns):
x, attention = layer(x, x, x, mask = mask, query_mask = query_mask)
x, attention = layer(x, x, x, mask=mask, query_mask=query_mask)
x = ffn(x)
attentions.append(attention)
return x, query_mask, attentions
\ No newline at end of file
return x, query_mask, attentions
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
......@@ -13,47 +26,63 @@ class EncoderPrenet(dg.Layer):
self.embedding_size = embedding_size
self.num_hidden = num_hidden
self.use_cudnn = use_cudnn
self.embedding = dg.Embedding( size = [len(symbols), embedding_size],
padding_idx = None)
self.embedding = dg.Embedding(
size=[len(symbols), embedding_size], padding_idx=None)
self.conv_list = []
k = math.sqrt(1 / embedding_size)
self.conv_list.append(Conv1D(num_channels = embedding_size,
num_filters = num_hidden,
filter_size = 5,
padding = int(np.floor(5/2)),
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
use_cudnn = use_cudnn))
self.conv_list.append(
Conv1D(
num_channels=embedding_size,
num_filters=num_hidden,
filter_size=5,
padding=int(np.floor(5 / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k)),
use_cudnn=use_cudnn))
k = math.sqrt(1 / num_hidden)
for _ in range(2):
self.conv_list.append(Conv1D(num_channels = num_hidden,
num_filters = num_hidden,
filter_size = 5,
padding = int(np.floor(5/2)),
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
use_cudnn = use_cudnn))
self.conv_list.append(
Conv1D(
num_channels=num_hidden,
num_filters=num_hidden,
filter_size=5,
padding=int(np.floor(5 / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k)),
use_cudnn=use_cudnn))
for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batch_norm_list = [dg.BatchNorm(num_hidden,
data_layout='NCHW') for _ in range(3)]
self.batch_norm_list = [
dg.BatchNorm(
num_hidden, data_layout='NCHW') for _ in range(3)
]
for i, layer in enumerate(self.batch_norm_list):
self.add_sublayer("batch_norm_list_{}".format(i), layer)
k = math.sqrt(1 / num_hidden)
self.projection = dg.Linear(num_hidden, num_hidden,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.projection = dg.Linear(
num_hidden,
num_hidden,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
def forward(self, x):
x = self.embedding(x) #(batch_size, seq_len, embedding_size)
x = layers.transpose(x,[0,2,1])
x = self.embedding(x) #(batch_size, seq_len, embedding_size)
x = layers.transpose(x, [0, 2, 1])
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
x = layers.transpose(x,[0,2,1]) #(N,T,C)
x = layers.transpose(x, [0, 2, 1]) #(N,T,C)
x = self.projection(x)
return x
\ No newline at end of file
return x
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
class PreNet(dg.Layer):
def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
"""
......@@ -17,13 +31,21 @@ class PreNet(dg.Layer):
self.dropout_rate = dropout_rate
k = math.sqrt(1 / input_size)
self.linear1 = dg.Linear(input_size, hidden_size,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.linear1 = dg.Linear(
input_size,
hidden_size,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
k = math.sqrt(1 / hidden_size)
self.linear2 = dg.Linear(hidden_size, output_size,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
self.linear2 = dg.Linear(
hidden_size,
output_size,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
def forward(self, x):
"""
......
(17 further file diffs in this commit are collapsed and not shown.)