提交 9d796994 编写于 作者: L lifuchen

add license

上级 f84d6bec
...@@ -25,3 +25,11 @@ ...@@ -25,3 +25,11 @@
files: \.md$ files: \.md$
- id: remove-tabs - id: remove-tabs
files: \.md$ files: \.md$
- repo: local
hooks:
- id: copyright_checker
name: copyright_checker
entry: python ./tools/copyright.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
# Deepvoice 3 # Deepvoice 3
Paddle implementation of Deep Voice 3 in dynamic graph mode, a convolutional-network-based text-to-speech synthesis model. The implementation is based on [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654). Paddle implementation of Deep Voice 3 in dynamic graph mode, a convolutional-network-based text-to-speech synthesis model. The implementation is based on [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654).
...@@ -22,7 +22,7 @@ The model consists of an encoder, a decoder and a converter (and a speaker embed ...@@ -22,7 +22,7 @@ The model consists of an encoder, a decoder and a converter (and a speaker embed
## Project Structure ## Project Structure
```text ```text
├── data.py data_processing ├── data.py data_processing
├── ljspeech.yaml (example) configuration file ├── ljspeech.yaml (example) configuration file
├── sentences.txt sample sentences ├── sentences.txt sample sentences
├── synthesis.py script to synthesize waveform from text ├── synthesis.py script to synthesize waveform from text
...@@ -50,7 +50,7 @@ optional arguments: ...@@ -50,7 +50,7 @@ optional arguments:
The directory to save result. The directory to save result.
-g DEVICE, --device DEVICE -g DEVICE, --device DEVICE
device to use device to use
``` ```
1. `--config` is the configuration file to use. The provided `ljspeech.yaml` can be used directly. You can also change values in the configuration file to train the model with a different config. 1. `--config` is the configuration file to use. The provided `ljspeech.yaml` can be used directly. You can also change values in the configuration file to train the model with a different config.
2. `--data` is the path of the LJSpeech dataset, the extracted folder from the downloaded archive (the folder which contains metadata.txt). 2. `--data` is the path of the LJSpeech dataset, the extracted folder from the downloaded archive (the folder which contains metadata.txt).
...@@ -61,7 +61,7 @@ optional arguments: ...@@ -61,7 +61,7 @@ optional arguments:
├── checkpoints # checkpoint ├── checkpoints # checkpoint
├── log # tensorboard log ├── log # tensorboard log
└── states # train and evaluation results └── states # train and evaluation results
├── alignments # attention ├── alignments # attention
├── lin_spec # linear spectrogram ├── lin_spec # linear spectrogram
├── mel_spec # mel spectrogram ├── mel_spec # mel spectrogram
└── waveform # waveform (.wav files) └── waveform # waveform (.wav files)
...@@ -112,4 +112,3 @@ example script: ...@@ -112,4 +112,3 @@ example script:
```bash ```bash
python synthesis.py --config=./ljspeech.yaml --device=0 experiment/checkpoints/model_step_005000000 sentences.txt generated python synthesis.py --config=./ljspeech.yaml --device=0 experiment/checkpoints/model_step_005000000 sentences.txt generated
``` ```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
import csv import csv
from pathlib import Path from pathlib import Path
...@@ -79,10 +93,11 @@ class Transform(object): ...@@ -79,10 +93,11 @@ class Transform(object):
y = signal.lfilter([1., -self.preemphasis], [1.], wav) y = signal.lfilter([1., -self.preemphasis], [1.], wav)
# STFT # STFT
D = librosa.stft(y=y, D = librosa.stft(
n_fft=self.n_fft, y=y,
win_length=self.win_length, n_fft=self.n_fft,
hop_length=self.hop_length) win_length=self.win_length,
hop_length=self.hop_length)
S = np.abs(D) S = np.abs(D)
# to db and normalize to 0-1 # to db and normalize to 0-1
...@@ -96,11 +111,8 @@ class Transform(object): ...@@ -96,11 +111,8 @@ class Transform(object):
# mel scale and to db and normalize to 0-1, # mel scale and to db and normalize to 0-1,
# CAUTION: pass linear scale S, not dbscaled S # CAUTION: pass linear scale S, not dbscaled S
S_mel = librosa.feature.melspectrogram(S=S, S_mel = librosa.feature.melspectrogram(
n_mels=self.n_mels, S=S, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, power=1.)
fmin=self.fmin,
fmax=self.fmax,
power=1.)
S_mel = 20 * np.log10(np.maximum(amplitude_min, S_mel = 20 * np.log10(np.maximum(amplitude_min,
S_mel)) - self.ref_level_db S_mel)) - self.ref_level_db
S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db) S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db)
...@@ -148,20 +160,18 @@ class DataCollector(object): ...@@ -148,20 +160,18 @@ class DataCollector(object):
(mix_grapheme_phonemes, text_length, speaker_id, S_norm, (mix_grapheme_phonemes, text_length, speaker_id, S_norm,
S_mel_norm, num_frames) = example S_mel_norm, num_frames) = example
text_sequences.append( text_sequences.append(
np.pad(mix_grapheme_phonemes, np.pad(mix_grapheme_phonemes, (0, max_text_length - text_length
(0, max_text_length - text_length))) )))
lin_specs.append( lin_specs.append(
np.pad(S_norm, np.pad(S_norm, ((0, 0), (self._pad_begin, max_frames -
((0, 0), (self._pad_begin, self._pad_begin - num_frames))))
max_frames - self._pad_begin - num_frames))))
mel_specs.append( mel_specs.append(
np.pad(S_mel_norm, np.pad(S_mel_norm, ((0, 0), (self._pad_begin, max_frames -
((0, 0), (self._pad_begin, self._pad_begin - num_frames))))
max_frames - self._pad_begin - num_frames))))
done_flags.append( done_flags.append(
np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )), np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )),
(0, max_decoder_length - (0, max_decoder_length - int(
int(np.ceil(num_frames // self._factor))), np.ceil(num_frames // self._factor))),
constant_values=1)) constant_values=1))
text_sequences = np.array(text_sequences).astype(np.int64) text_sequences = np.array(text_sequences).astype(np.int64)
lin_specs = np.transpose(np.array(lin_specs), lin_specs = np.transpose(np.array(lin_specs),
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
import argparse import argparse
import ruamel.yaml import ruamel.yaml
...@@ -22,11 +36,8 @@ if __name__ == "__main__": ...@@ -22,11 +36,8 @@ if __name__ == "__main__":
parser.add_argument("checkpoint", type=str, help="checkpoint to load.") parser.add_argument("checkpoint", type=str, help="checkpoint to load.")
parser.add_argument("text", type=str, help="text file to synthesize") parser.add_argument("text", type=str, help="text file to synthesize")
parser.add_argument("output_path", type=str, help="path to save results") parser.add_argument("output_path", type=str, help="path to save results")
parser.add_argument("-g", parser.add_argument(
"--device", "-g", "--device", type=int, default=-1, help="device to use")
type=int,
default=-1,
help="device to use")
args = parser.parse_args() args = parser.parse_args()
with open(args.config, 'rt') as f: with open(args.config, 'rt') as f:
...@@ -76,15 +87,14 @@ if __name__ == "__main__": ...@@ -76,15 +87,14 @@ if __name__ == "__main__":
window_ahead = model_config["window_ahead"] window_ahead = model_config["window_ahead"]
key_projection = model_config["key_projection"] key_projection = model_config["key_projection"]
value_projection = model_config["value_projection"] value_projection = model_config["value_projection"]
dv3 = make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, dv3 = make_model(
padding_idx, embedding_std, max_positions, n_vocab, n_speakers, speaker_dim, speaker_embed_std, embed_dim, padding_idx,
freeze_embedding, filter_size, encoder_channels, embedding_std, max_positions, n_vocab, freeze_embedding,
n_mels, decoder_channels, r, filter_size, encoder_channels, n_mels, decoder_channels, r,
trainable_positional_encodings, use_memory_mask, trainable_positional_encodings, use_memory_mask,
query_position_rate, key_position_rate, query_position_rate, key_position_rate, window_backward,
window_backward, window_ahead, key_projection, window_ahead, key_projection, value_projection, downsample_factor,
value_projection, downsample_factor, linear_dim, linear_dim, use_decoder_states, converter_channels, dropout)
use_decoder_states, converter_channels, dropout)
summary(dv3) summary(dv3)
state, _ = dg.load_dygraph(args.checkpoint) state, _ = dg.load_dygraph(args.checkpoint)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
import argparse import argparse
import ruamel.yaml import ruamel.yaml
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
import numpy as np import numpy as np
from matplotlib import cm from matplotlib import cm
...@@ -28,8 +42,9 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, ...@@ -28,8 +42,9 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
converter_channels, dropout): converter_channels, dropout):
"""just a simple function to create a deepvoice 3 model""" """just a simple function to create a deepvoice 3 model"""
if n_speakers > 1: if n_speakers > 1:
spe = dg.Embedding((n_speakers, speaker_dim), spe = dg.Embedding(
param_attr=I.Normal(scale=speaker_embed_std)) (n_speakers, speaker_dim),
param_attr=I.Normal(scale=speaker_embed_std))
else: else:
spe = None spe = None
...@@ -45,17 +60,17 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, ...@@ -45,17 +60,17 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
ConvSpec(h, k, 9), ConvSpec(h, k, 9),
ConvSpec(h, k, 27), ConvSpec(h, k, 27),
ConvSpec(h, k, 1), ConvSpec(h, k, 1),
ConvSpec(h, k, 3), ConvSpec(h, k, 3), )
) enc = Encoder(
enc = Encoder(n_vocab, n_vocab,
embed_dim, embed_dim,
n_speakers, n_speakers,
speaker_dim, speaker_dim,
padding_idx=None, padding_idx=None,
embedding_weight_std=embedding_std, embedding_weight_std=embedding_std,
convolutions=encoder_convolutions, convolutions=encoder_convolutions,
max_positions=max_positions, max_positions=max_positions,
dropout=dropout) dropout=dropout)
if freeze_embedding: if freeze_embedding:
freeze(enc.embed) freeze(enc.embed)
...@@ -66,28 +81,28 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, ...@@ -66,28 +81,28 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
ConvSpec(h, k, 3), ConvSpec(h, k, 3),
ConvSpec(h, k, 9), ConvSpec(h, k, 9),
ConvSpec(h, k, 27), ConvSpec(h, k, 27),
ConvSpec(h, k, 1), ConvSpec(h, k, 1), )
)
attention = [True, False, False, False, True] attention = [True, False, False, False, True]
force_monotonic_attention = [True, False, False, False, True] force_monotonic_attention = [True, False, False, False, True]
dec = Decoder(n_speakers, dec = Decoder(
speaker_dim, n_speakers,
embed_dim, speaker_dim,
mel_dim, embed_dim,
r=r, mel_dim,
max_positions=max_positions, r=r,
padding_idx=padding_idx, max_positions=max_positions,
preattention=prenet_convolutions, padding_idx=padding_idx,
convolutions=attentive_convolutions, preattention=prenet_convolutions,
attention=attention, convolutions=attentive_convolutions,
dropout=dropout, attention=attention,
use_memory_mask=use_memory_mask, dropout=dropout,
force_monotonic_attention=force_monotonic_attention, use_memory_mask=use_memory_mask,
query_position_rate=query_position_rate, force_monotonic_attention=force_monotonic_attention,
key_position_rate=key_position_rate, query_position_rate=query_position_rate,
window_range=WindowRange(window_behind, window_ahead), key_position_rate=key_position_rate,
key_projection=key_projection, window_range=WindowRange(window_behind, window_ahead),
value_projection=value_projection) key_projection=key_projection,
value_projection=value_projection)
if not trainable_positional_encodings: if not trainable_positional_encodings:
freeze(dec.embed_keys_positions) freeze(dec.embed_keys_positions)
freeze(dec.embed_query_positions) freeze(dec.embed_query_positions)
...@@ -97,15 +112,15 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim, ...@@ -97,15 +112,15 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
ConvSpec(h, k, 1), ConvSpec(h, k, 1),
ConvSpec(h, k, 3), ConvSpec(h, k, 3),
ConvSpec(2 * h, k, 1), ConvSpec(2 * h, k, 1),
ConvSpec(2 * h, k, 3), ConvSpec(2 * h, k, 3), )
) cvt = Converter(
cvt = Converter(n_speakers, n_speakers,
speaker_dim, speaker_dim,
dec.state_dim if use_decoder_states else mel_dim, dec.state_dim if use_decoder_states else mel_dim,
linear_dim, linear_dim,
time_upsampling=downsample_factor, time_upsampling=downsample_factor,
convolutions=postnet_convolutions, convolutions=postnet_convolutions,
dropout=dropout) dropout=dropout)
dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states) dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states)
return dv3 return dv3
...@@ -115,8 +130,10 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db, ...@@ -115,8 +130,10 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db,
ref_level_db, power, n_iter, win_length, hop_length, ref_level_db, power, n_iter, win_length, hop_length,
preemphasis): preemphasis):
"""generate waveform from text using a deepvoice 3 model""" """generate waveform from text using a deepvoice 3 model"""
text = np.array(en.text_to_sequence(text, p=replace_pronounciation_prob), text = np.array(
dtype=np.int64) en.text_to_sequence(
text, p=replace_pronounciation_prob),
dtype=np.int64)
length = len(text) length = len(text)
print("text sequence's length: {}".format(length)) print("text sequence's length: {}".format(length))
text_positions = np.arange(1, 1 + length) text_positions = np.arange(1, 1 + length)
...@@ -145,10 +162,11 @@ def spec_to_waveform(spec, min_level_db, ref_level_db, power, n_iter, ...@@ -145,10 +162,11 @@ def spec_to_waveform(spec, min_level_db, ref_level_db, power, n_iter,
""" """
denoramlized = np.clip(spec, 0, 1) * (-min_level_db) + min_level_db denoramlized = np.clip(spec, 0, 1) * (-min_level_db) + min_level_db
lin_scaled = np.exp((denoramlized + ref_level_db) / 20 * np.log(10)) lin_scaled = np.exp((denoramlized + ref_level_db) / 20 * np.log(10))
wav = librosa.griffinlim(lin_scaled**power, wav = librosa.griffinlim(
n_iter=n_iter, lin_scaled**power,
hop_length=hop_length, n_iter=n_iter,
win_length=win_length) hop_length=hop_length,
win_length=win_length)
if preemphasis > 0: if preemphasis > 0:
wav = signal.lfilter([1.], [1., -preemphasis], wav) wav = signal.lfilter([1.], [1., -preemphasis], wav)
return wav return wav
...@@ -225,28 +243,30 @@ def save_state(save_dir, ...@@ -225,28 +243,30 @@ def save_state(save_dir,
plt.colorbar() plt.colorbar()
plt.title("mel_input") plt.title("mel_input")
plt.savefig( plt.savefig(
os.path.join(path, os.path.join(path, "target_mel_spec_step{:09d}.png".format(
"target_mel_spec_step{:09d}.png".format(global_step))) global_step)))
plt.close() plt.close()
writer.add_image("target/mel_spec", writer.add_image(
cm.viridis(mel_input), "target/mel_spec",
global_step, cm.viridis(mel_input),
dataformats="HWC") global_step,
dataformats="HWC")
plt.figure(figsize=(10, 3)) plt.figure(figsize=(10, 3))
display.specshow(mel_output) display.specshow(mel_output)
plt.colorbar() plt.colorbar()
plt.title("mel_output") plt.title("mel_output")
plt.savefig( plt.savefig(
os.path.join( os.path.join(path, "predicted_mel_spec_step{:09d}.png".format(
path, "predicted_mel_spec_step{:09d}.png".format(global_step))) global_step)))
plt.close() plt.close()
writer.add_image("predicted/mel_spec", writer.add_image(
cm.viridis(mel_output), "predicted/mel_spec",
global_step, cm.viridis(mel_output),
dataformats="HWC") global_step,
dataformats="HWC")
if lin_input is not None and lin_output is not None: if lin_input is not None and lin_output is not None:
lin_input = lin_input[0].numpy().T lin_input = lin_input[0].numpy().T
...@@ -258,28 +278,30 @@ def save_state(save_dir, ...@@ -258,28 +278,30 @@ def save_state(save_dir,
plt.colorbar() plt.colorbar()
plt.title("mel_input") plt.title("mel_input")
plt.savefig( plt.savefig(
os.path.join(path, os.path.join(path, "target_lin_spec_step{:09d}.png".format(
"target_lin_spec_step{:09d}.png".format(global_step))) global_step)))
plt.close() plt.close()
writer.add_image("target/lin_spec", writer.add_image(
cm.viridis(lin_input), "target/lin_spec",
global_step, cm.viridis(lin_input),
dataformats="HWC") global_step,
dataformats="HWC")
plt.figure(figsize=(10, 3)) plt.figure(figsize=(10, 3))
display.specshow(lin_output) display.specshow(lin_output)
plt.colorbar() plt.colorbar()
plt.title("mel_input") plt.title("mel_input")
plt.savefig( plt.savefig(
os.path.join( os.path.join(path, "predicted_lin_spec_step{:09d}.png".format(
path, "predicted_lin_spec_step{:09d}.png".format(global_step))) global_step)))
plt.close() plt.close()
writer.add_image("predicted/lin_spec", writer.add_image(
cm.viridis(lin_output), "predicted/lin_spec",
global_step, cm.viridis(lin_output),
dataformats="HWC") global_step,
dataformats="HWC")
if alignments is not None and len(alignments.shape) == 4: if alignments is not None and len(alignments.shape) == 4:
path = os.path.join(save_dir, "alignments") path = os.path.join(save_dir, "alignments")
...@@ -290,10 +312,11 @@ def save_state(save_dir, ...@@ -290,10 +312,11 @@ def save_state(save_dir,
"train_attn_layer_{}_step_{}.png".format(idx, global_step)) "train_attn_layer_{}_step_{}.png".format(idx, global_step))
plot_alignment(attn_layer, save_path) plot_alignment(attn_layer, save_path)
writer.add_image("train_attn/layer_{}".format(idx), writer.add_image(
cm.viridis(attn_layer), "train_attn/layer_{}".format(idx),
global_step, cm.viridis(attn_layer),
dataformats="HWC") global_step,
dataformats="HWC")
if lin_output is not None: if lin_output is not None:
wav = spec_to_waveform(lin_output, min_level_db, ref_level_db, power, wav = spec_to_waveform(lin_output, min_level_db, ref_level_db, power,
...@@ -302,7 +325,5 @@ def save_state(save_dir, ...@@ -302,7 +325,5 @@ def save_state(save_dir,
save_path = os.path.join( save_path = os.path.join(
path, "train_sample_step_{:09d}.wav".format(global_step)) path, "train_sample_step_{:09d}.wav".format(global_step))
sf.write(save_path, wav, sample_rate) sf.write(save_path, wav, sample_rate)
writer.add_audio("train_sample", writer.add_audio(
wav, "train_sample", wav, global_step, sample_rate=sample_rate)
global_step,
sample_rate=sample_rate)
...@@ -57,7 +57,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr ...@@ -57,7 +57,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--fastspeech_step``. If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--fastspeech_step``.
For more help on arguments: For more help on arguments:
``python train.py --help``. ``python train.py --help``.
## Synthesis ## Synthesis
...@@ -75,5 +75,5 @@ or you can run the script file directly. ...@@ -75,5 +75,5 @@ or you can run the script file directly.
sh synthesis.sh sh synthesis.sh
``` ```
For more help on arguments: For more help on arguments:
``python synthesis.py --help``. ``python synthesis.py --help``.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse import argparse
def add_config_options_to_parser(parser): def add_config_options_to_parser(parser):
parser.add_argument('--config_path', type=str, default='config/fastspeech.yaml', parser.add_argument(
'--config_path',
type=str,
default='config/fastspeech.yaml',
help="the yaml config file path.") help="the yaml config file path.")
parser.add_argument('--batch_size', type=int, default=32, parser.add_argument(
help="batch size for training.") '--batch_size', type=int, default=32, help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000, parser.add_argument(
'--epochs',
type=int,
default=10000,
help="the number of epoch for training.") help="the number of epoch for training.")
parser.add_argument('--lr', type=float, default=0.001, parser.add_argument(
'--lr',
type=float,
default=0.001,
help="the learning rate for training.") help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500, parser.add_argument(
'--save_step',
type=int,
default=500,
help="checkpointing interval during training.") help="checkpointing interval during training.")
parser.add_argument('--fastspeech_step', type=int, default=70000, parser.add_argument(
'--fastspeech_step',
type=int,
default=70000,
help="Global step to restore checkpoint of fastspeech.") help="Global step to restore checkpoint of fastspeech.")
parser.add_argument('--use_gpu', type=int, default=1, parser.add_argument(
'--use_gpu',
type=int,
default=1,
help="use gpu or not during training.") help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=int, default=0, parser.add_argument(
'--use_data_parallel',
type=int,
default=0,
help="use data parallel or not during training.") help="use data parallel or not during training.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1', parser.add_argument(
'--data_path',
type=str,
default='./dataset/LJSpeech-1.1',
help="the path of dataset.") help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None, parser.add_argument(
'--checkpoint_path',
type=str,
default=None,
help="the path to load checkpoint or pretrain model.") help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint', parser.add_argument(
'--save_path',
type=str,
default='./checkpoint',
help="the path to save checkpoint.") help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log', parser.add_argument(
'--log_dir',
type=str,
default='./log',
help="the directory to save tensorboard log.") help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./sample', parser.add_argument(
'--sample_path',
type=str,
default='./sample',
help="the directory to save audio sample in synthesis.") help="the directory to save audio sample in synthesis.")
parser.add_argument('--transtts_path', type=str, default='./log', parser.add_argument(
'--transtts_path',
type=str,
default='./log',
help="the directory to load pretrain transformerTTS model.") help="the directory to load pretrain transformerTTS model.")
parser.add_argument('--transformer_step', type=int, default=160000, parser.add_argument(
'--transformer_step',
type=int,
default=160000,
help="the step to load transformerTTS model.") help="the step to load transformerTTS model.")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
from collections import OrderedDict from collections import OrderedDict
...@@ -12,6 +25,7 @@ from parakeet.g2p.en import text_to_sequence ...@@ -12,6 +25,7 @@ from parakeet.g2p.en import text_to_sequence
from parakeet import audio from parakeet import audio
from parakeet.models.fastspeech.fastspeech import FastSpeech from parakeet.models.fastspeech.fastspeech import FastSpeech
def load_checkpoint(step, model_path): def load_checkpoint(step, model_path):
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
new_state_dict = OrderedDict() new_state_dict = OrderedDict()
...@@ -22,13 +36,14 @@ def load_checkpoint(step, model_path): ...@@ -22,13 +36,14 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param] new_state_dict[param] = model_dict[param]
return new_state_dict return new_state_dict
def synthesis(text_input, args): def synthesis(text_input, args):
place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()) place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
# tensorboard # tensorboard
if not os.path.exists(args.log_dir): if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir) os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'synthesis') path = os.path.join(args.log_dir, 'synthesis')
with open(args.config_path) as f: with open(args.config_path) as f:
cfg = yaml.load(f, Loader=yaml.Loader) cfg = yaml.load(f, Loader=yaml.Loader)
...@@ -37,24 +52,28 @@ def synthesis(text_input, args): ...@@ -37,24 +52,28 @@ def synthesis(text_input, args):
with dg.guard(place): with dg.guard(place):
model = FastSpeech(cfg) model = FastSpeech(cfg)
model.set_dict(load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech"))) model.set_dict(
load_checkpoint(
str(args.fastspeech_step),
os.path.join(args.checkpoint_path, "fastspeech")))
model.eval() model.eval()
text = np.asarray(text_to_sequence(text_input)) text = np.asarray(text_to_sequence(text_input))
text = fluid.layers.unsqueeze(dg.to_variable(text),[0]) text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
pos_text = np.arange(1, text.shape[1]+1) pos_text = np.arange(1, text.shape[1] + 1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
mel_output, mel_output_postnet = model(text, pos_text, alpha=args.alpha) mel_output, mel_output_postnet = model(
text, pos_text, alpha=args.alpha)
_ljspeech_processor = audio.AudioProcessor( _ljspeech_processor = audio.AudioProcessor(
sample_rate=cfg['audio']['sr'], sample_rate=cfg['audio']['sr'],
num_mels=cfg['audio']['num_mels'], num_mels=cfg['audio']['num_mels'],
min_level_db=cfg['audio']['min_level_db'], min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg['audio']['ref_level_db'], ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg['audio']['n_fft'], n_fft=cfg['audio']['n_fft'],
win_length= cfg['audio']['win_length'], win_length=cfg['audio']['win_length'],
hop_length= cfg['audio']['hop_length'], hop_length=cfg['audio']['hop_length'],
power=cfg['audio']['power'], power=cfg['audio']['power'],
preemphasis=cfg['audio']['preemphasis'], preemphasis=cfg['audio']['preemphasis'],
signal_norm=True, signal_norm=True,
...@@ -67,14 +86,17 @@ def synthesis(text_input, args): ...@@ -67,14 +86,17 @@ def synthesis(text_input, args):
do_trim_silence=False, do_trim_silence=False,
sound_norm=False) sound_norm=False)
mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0]) mel_output_postnet = fluid.layers.transpose(
wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy()) fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy(
))
writer.add_audio(text_input, wav, 0, cfg['audio']['sr']) writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
print("Synthesis completed !!!") print("Synthesis completed !!!")
writer.close() writer.close()
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Train Fastspeech model") parser = argparse.ArgumentParser(description="Train Fastspeech model")
add_config_options_to_parser(parser) add_config_options_to_parser(parser)
args = parser.parse_args() args = parser.parse_args()
synthesis("Transformer model is so fast!", args) synthesis("Transformer model is so fast!", args)
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np import numpy as np
import argparse import argparse
import os import os
...@@ -20,8 +33,10 @@ import sys ...@@ -20,8 +33,10 @@ import sys
sys.path.append("../transformer_tts") sys.path.append("../transformer_tts")
from data import LJSpeechLoader from data import LJSpeechLoader
def load_checkpoint(step, model_path): def load_checkpoint(step, model_path):
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) model_dict, opti_dict = fluid.dygraph.load_dygraph(
os.path.join(model_path, step))
new_state_dict = OrderedDict() new_state_dict = OrderedDict()
for param in model_dict: for param in model_dict:
if param.startswith('_layers.'): if param.startswith('_layers.'):
...@@ -30,6 +45,7 @@ def load_checkpoint(step, model_path): ...@@ -30,6 +45,7 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param] new_state_dict[param] = model_dict[param]
return new_state_dict, opti_dict return new_state_dict, opti_dict
def main(args): def main(args):
local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0 local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1 nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
...@@ -43,26 +59,33 @@ def main(args): ...@@ -43,26 +59,33 @@ def main(args):
if args.use_gpu else fluid.CPUPlace()) if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(args.log_dir): if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir) os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'fastspeech') path = os.path.join(args.log_dir, 'fastspeech')
writer = SummaryWriter(path) if local_rank == 0 else None writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place): with dg.guard(place):
with fluid.unique_name.guard(): with fluid.unique_name.guard():
transformerTTS = TransformerTTS(cfg) transformerTTS = TransformerTTS(cfg)
model_dict, _ = load_checkpoint(str(args.transformer_step), os.path.join(args.transtts_path, "transformer")) model_dict, _ = load_checkpoint(
str(args.transformer_step),
os.path.join(args.transtts_path, "transformer"))
transformerTTS.set_dict(model_dict) transformerTTS.set_dict(model_dict)
transformerTTS.eval() transformerTTS.eval()
model = FastSpeech(cfg) model = FastSpeech(cfg)
model.train() model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']), optimizer = fluid.optimizer.AdamOptimizer(
parameter_list=model.parameters()) learning_rate=dg.NoamDecay(1 / (
reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader() cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
parameter_list=model.parameters())
reader = LJSpeechLoader(
cfg, args, nranks, local_rank, shuffle=True).reader()
if args.checkpoint_path is not None: if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech")) model_dict, opti_dict = load_checkpoint(
str(args.fastspeech_step),
os.path.join(args.checkpoint_path, "fastspeech"))
model.set_dict(model_dict) model.set_dict(model_dict)
optimizer.set_dict(opti_dict) optimizer.set_dict(opti_dict)
global_step = args.fastspeech_step global_step = args.fastspeech_step
...@@ -76,31 +99,42 @@ def main(args): ...@@ -76,31 +99,42 @@ def main(args):
pbar = tqdm(reader) pbar = tqdm(reader)
for i, data in enumerate(pbar): for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch) pbar.set_description('Processing at epoch %d' % epoch)
character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data
_, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel) _, _, attn_probs, _, _, _ = transformerTTS(
alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg['transformer_head'])).astype(np.float32) character, mel_input, pos_text, pos_mel)
alignment = dg.to_variable(
get_alignment(attn_probs, mel_lens, cfg[
'transformer_head'])).astype(np.float32)
global_step += 1 global_step += 1
#Forward #Forward
result= model(character, result = model(
pos_text, character,
mel_pos=pos_mel, pos_text,
length_target=alignment) mel_pos=pos_mel,
length_target=alignment)
mel_output, mel_output_postnet, duration_predictor_output, _, _ = result mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
mel_loss = layers.mse_loss(mel_output, mel) mel_loss = layers.mse_loss(mel_output, mel)
mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel) mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
duration_loss = layers.mean(layers.abs(layers.elementwise_sub(duration_predictor_output, alignment))) duration_loss = layers.mean(
layers.abs(
layers.elementwise_sub(duration_predictor_output,
alignment)))
total_loss = mel_loss + mel_postnet_loss + duration_loss total_loss = mel_loss + mel_postnet_loss + duration_loss
if local_rank==0: if local_rank == 0:
writer.add_scalar('mel_loss', mel_loss.numpy(), global_step) writer.add_scalar('mel_loss',
writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step) mel_loss.numpy(), global_step)
writer.add_scalar('duration_loss', duration_loss.numpy(), global_step) writer.add_scalar('post_mel_loss',
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) mel_postnet_loss.numpy(), global_step)
writer.add_scalar('duration_loss',
duration_loss.numpy(), global_step)
writer.add_scalar('learning_rate',
optimizer._learning_rate.step().numpy(),
global_step)
if args.use_data_parallel: if args.use_data_parallel:
total_loss = model.scale_loss(total_loss) total_loss = model.scale_loss(total_loss)
...@@ -108,21 +142,25 @@ def main(args): ...@@ -108,21 +142,25 @@ def main(args):
model.apply_collective_grads() model.apply_collective_grads()
else: else:
total_loss.backward() total_loss.backward()
optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh'])) optimizer.minimize(
total_loss,
grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
'grad_clip_thresh']))
model.clear_gradients() model.clear_gradients()
# save checkpoint # save checkpoint
if local_rank==0 and global_step % args.save_step == 0: if local_rank == 0 and global_step % args.save_step == 0:
if not os.path.exists(args.save_path): if not os.path.exists(args.save_path):
os.mkdir(args.save_path) os.mkdir(args.save_path)
save_path = os.path.join(args.save_path,'fastspeech/%d' % global_step) save_path = os.path.join(args.save_path,
'fastspeech/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0: if local_rank == 0:
writer.close() writer.close()
if __name__ =='__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Train Fastspeech model") parser = argparse.ArgumentParser(description="Train Fastspeech model")
add_config_options_to_parser(parser) add_config_options_to_parser(parser)
args = parser.parse_args() args = parser.parse_args()
......
...@@ -50,7 +50,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr ...@@ -50,7 +50,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--transformer_step``. If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--transformer_step``.
For more help on arguments: For more help on arguments:
``python train_transformer.py --help``. ``python train_transformer.py --help``.
## Train Vocoder ## Train Vocoder
...@@ -78,7 +78,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr ...@@ -78,7 +78,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
``` ```
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--vocoder_step``. If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--vocoder_step``.
For more help on arguments: For more help on arguments:
``python train_vocoder.py --help``. ``python train_vocoder.py --help``.
## Synthesis ## Synthesis
...@@ -101,5 +101,5 @@ sh synthesis.sh ...@@ -101,5 +101,5 @@ sh synthesis.sh
And the audio file will be saved in ``--sample_path``. And the audio file will be saved in ``--sample_path``.
For more help on arguments: For more help on arguments:
``python synthesis.py --help``. ``python synthesis.py --help``.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
import pandas as pd import pandas as pd
...@@ -12,23 +25,43 @@ from parakeet.data.datacargo import DataCargo ...@@ -12,23 +25,43 @@ from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, SpecBatcher from parakeet.data.batch import TextIDBatcher, SpecBatcher
from parakeet.data.dataset import DatasetMixin, TransformDataset from parakeet.data.dataset import DatasetMixin, TransformDataset
class LJSpeechLoader: class LJSpeechLoader:
def __init__(self, config, args, nranks, rank, is_vocoder=False, shuffle=True): def __init__(self,
config,
args,
nranks,
rank,
is_vocoder=False,
shuffle=True):
place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace()
LJSPEECH_ROOT = Path(args.data_path) LJSPEECH_ROOT = Path(args.data_path)
metadata = LJSpeechMetaData(LJSPEECH_ROOT) metadata = LJSpeechMetaData(LJSPEECH_ROOT)
transformer = LJSpeech(config) transformer = LJSpeech(config)
dataset = TransformDataset(metadata, transformer) dataset = TransformDataset(metadata, transformer)
sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle) sampler = DistributedSampler(
len(metadata), nranks, rank, shuffle=shuffle)
assert args.batch_size % nranks == 0 assert args.batch_size % nranks == 0
each_bs = args.batch_size // nranks each_bs = args.batch_size // nranks
if is_vocoder: if is_vocoder:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True) dataloader = DataCargo(
dataset,
sampler=sampler,
batch_size=each_bs,
shuffle=shuffle,
batch_fn=batch_examples_vocoder,
drop_last=True)
else: else:
dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples, drop_last=True) dataloader = DataCargo(
dataset,
sampler=sampler,
batch_size=each_bs,
shuffle=shuffle,
batch_fn=batch_examples,
drop_last=True)
self.reader = fluid.io.DataLoader.from_generator( self.reader = fluid.io.DataLoader.from_generator(
capacity=32, capacity=32,
iterable=True, iterable=True,
...@@ -63,13 +96,13 @@ class LJSpeech(object): ...@@ -63,13 +96,13 @@ class LJSpeech(object):
super(LJSpeech, self).__init__() super(LJSpeech, self).__init__()
self.config = config self.config = config
self._ljspeech_processor = audio.AudioProcessor( self._ljspeech_processor = audio.AudioProcessor(
sample_rate=config['audio']['sr'], sample_rate=config['audio']['sr'],
num_mels=config['audio']['num_mels'], num_mels=config['audio']['num_mels'],
min_level_db=config['audio']['min_level_db'], min_level_db=config['audio']['min_level_db'],
ref_level_db=config['audio']['ref_level_db'], ref_level_db=config['audio']['ref_level_db'],
n_fft=config['audio']['n_fft'], n_fft=config['audio']['n_fft'],
win_length= config['audio']['win_length'], win_length=config['audio']['win_length'],
hop_length= config['audio']['hop_length'], hop_length=config['audio']['hop_length'],
power=config['audio']['power'], power=config['audio']['power'],
preemphasis=config['audio']['preemphasis'], preemphasis=config['audio']['preemphasis'],
signal_norm=True, signal_norm=True,
...@@ -81,7 +114,7 @@ class LJSpeech(object): ...@@ -81,7 +114,7 @@ class LJSpeech(object):
griffin_lim_iters=60, griffin_lim_iters=60,
do_trim_silence=False, do_trim_silence=False,
sound_norm=False) sound_norm=False)
def __call__(self, metadatum): def __call__(self, metadatum):
"""All the code for generating an Example from a metadatum. If you want a """All the code for generating an Example from a metadatum. If you want a
different preprocessing pipeline, you can override this method. different preprocessing pipeline, you can override this method.
...@@ -90,13 +123,15 @@ class LJSpeech(object): ...@@ -90,13 +123,15 @@ class LJSpeech(object):
method. method.
""" """
fname, raw_text, normalized_text = metadatum fname, raw_text, normalized_text = metadatum
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
wav = self._ljspeech_processor.load_wav(str(fname)) wav = self._ljspeech_processor.load_wav(str(fname))
mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32) mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32)
mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32) mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32)
phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64) phonemes = np.array(
return (mag, mel, phonemes) # maybe we need to implement it as a map in the future g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
return (mag, mel, phonemes
) # maybe we need to implement it as a map in the future
def batch_examples(batch): def batch_examples(batch):
...@@ -109,44 +144,71 @@ def batch_examples(batch): ...@@ -109,44 +144,71 @@ def batch_examples(batch):
pos_mels = [] pos_mels = []
for data in batch: for data in batch:
_, mel, text = data _, mel, text = data
mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1)) mel_inputs.append(
np.concatenate(
[np.zeros([mel.shape[0], 1], np.float32), mel[:, :-1]],
axis=-1))
mel_lens.append(mel.shape[1]) mel_lens.append(mel.shape[1])
text_lens.append(len(text)) text_lens.append(len(text))
pos_texts.append(np.arange(1, len(text) + 1)) pos_texts.append(np.arange(1, len(text) + 1))
pos_mels.append(np.arange(1, mel.shape[1] + 1)) pos_mels.append(np.arange(1, mel.shape[1] + 1))
mels.append(mel) mels.append(mel)
texts.append(text) texts.append(text)
# Sort by text_len in descending order # Sort by text_len in descending order
texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)] texts = [
mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)] i
mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)] for i, _ in sorted(
mel_lens = [i for i,_ in sorted(zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)] zip(texts, text_lens), key=lambda x: x[1], reverse=True)
pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)] ]
pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)] mels = [
i
for i, _ in sorted(
zip(mels, text_lens), key=lambda x: x[1], reverse=True)
]
mel_inputs = [
i
for i, _ in sorted(
zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)
]
mel_lens = [
i
for i, _ in sorted(
zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
]
pos_texts = [
i
for i, _ in sorted(
zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)
]
pos_mels = [
i
for i, _ in sorted(
zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)
]
text_lens = sorted(text_lens, reverse=True) text_lens = sorted(text_lens, reverse=True)
# Pad sequence with largest len of the batch # Pad sequence with largest len of the batch
texts = TextIDBatcher(pad_id=0)(texts) #(B, T) texts = TextIDBatcher(pad_id=0)(texts) #(B, T)
pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T) pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T) pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels) mels = np.transpose(
mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels) SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels)
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), np.array(mel_lens)) mel_inputs = np.transpose(
SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels)
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens),
np.array(mel_lens))
def batch_examples_vocoder(batch):
    """Collate examples into mel / magnitude batches for vocoder training.

    Args:
        batch: iterable of 3-tuples ``(mag, mel, _)``; the third element is
            ignored here.

    Returns:
        tuple: ``(mels, mags)``, each being the result of
        ``SpecBatcher(pad_value=0.)`` with its last two axes swapped.
    """
    mels = []
    mags = []
    for mag, mel, _ in batch:
        mels.append(mel)
        mags.append(mag)

    # SpecBatcher presumably pads every spectrogram to a common length using
    # 0. as the fill value; the transpose then swaps the last two axes
    # (likely giving (B, T, n_bins), as annotated for the same call in
    # batch_examples -- TODO confirm against SpecBatcher).
    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
    mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0, 2, 1))

    return (mels, mags)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse import argparse
def add_config_options_to_parser(parser):
    """Register the shared training / synthesis options on ``parser``.

    Every option is optional and has a default, so parsing an empty argument
    list succeeds. Flag-like options (``--use_gpu``, ``--use_data_parallel``,
    ``--stop_token``) are parsed as integers, not booleans.

    Args:
        parser: an ``argparse.ArgumentParser`` (or compatible object exposing
            ``add_argument``) that is modified in place.

    Returns:
        None.
    """
    parser.add_argument(
        '--config_path',
        type=str,
        default='config/train_transformer.yaml',
        help="the yaml config file path.")
    parser.add_argument(
        '--batch_size', type=int, default=32, help="batch size for training.")
    parser.add_argument(
        '--epochs',
        type=int,
        default=10000,
        help="the number of epoch for training.")
    parser.add_argument(
        '--lr',
        type=float,
        default=0.001,
        help="the learning rate for training.")
    parser.add_argument(
        '--save_step',
        type=int,
        default=500,
        help="checkpointing interval during training.")
    parser.add_argument(
        '--image_step',
        type=int,
        default=2000,
        help="attention image interval during training.")
    parser.add_argument(
        '--max_len',
        type=int,
        default=400,
        help="The max length of audio when synthsis.")
    parser.add_argument(
        '--transformer_step',
        type=int,
        default=160000,
        help="Global step to restore checkpoint of transformer.")
    parser.add_argument(
        '--vocoder_step',
        type=int,
        default=90000,
        help="Global step to restore checkpoint of postnet.")
    parser.add_argument(
        '--use_gpu',
        type=int,
        default=1,
        help="use gpu or not during training.")
    parser.add_argument(
        '--use_data_parallel',
        type=int,
        default=0,
        help="use data parallel or not during training.")
    parser.add_argument(
        '--stop_token',
        type=int,
        default=0,
        help="use stop token loss in network or not.")

    parser.add_argument(
        '--data_path',
        type=str,
        default='./dataset/LJSpeech-1.1',
        help="the path of dataset.")
    parser.add_argument(
        '--checkpoint_path',
        type=str,
        default=None,
        help="the path to load checkpoint or pretrain model.")
    parser.add_argument(
        '--save_path',
        type=str,
        default='./checkpoint',
        help="the path to save checkpoint.")
    parser.add_argument(
        '--log_dir',
        type=str,
        default='./log',
        help="the directory to save tensorboard log.")
    parser.add_argument(
        '--sample_path',
        type=str,
        default='./sample',
        help="the directory to save audio sample in synthesis.")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
from scipy.io.wavfile import write from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence from parakeet.g2p.en import text_to_sequence
...@@ -16,6 +29,7 @@ from parakeet import audio ...@@ -16,6 +29,7 @@ from parakeet import audio
from parakeet.models.transformer_tts.vocoder import Vocoder from parakeet.models.transformer_tts.vocoder import Vocoder
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
def load_checkpoint(step, model_path): def load_checkpoint(step, model_path):
model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
new_state_dict = OrderedDict() new_state_dict = OrderedDict()
...@@ -26,6 +40,7 @@ def load_checkpoint(step, model_path): ...@@ -26,6 +40,7 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param] new_state_dict[param] = model_dict[param]
return new_state_dict return new_state_dict
def synthesis(text_input, args): def synthesis(text_input, args):
place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()) place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
...@@ -34,46 +49,53 @@ def synthesis(text_input, args): ...@@ -34,46 +49,53 @@ def synthesis(text_input, args):
# tensorboard # tensorboard
if not os.path.exists(args.log_dir): if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir) os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'synthesis') path = os.path.join(args.log_dir, 'synthesis')
writer = SummaryWriter(path) writer = SummaryWriter(path)
with dg.guard(place): with dg.guard(place):
with fluid.unique_name.guard(): with fluid.unique_name.guard():
model = TransformerTTS(cfg) model = TransformerTTS(cfg)
model.set_dict(load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer"))) model.set_dict(
load_checkpoint(
str(args.transformer_step),
os.path.join(args.checkpoint_path, "transformer")))
model.eval() model.eval()
with fluid.unique_name.guard(): with fluid.unique_name.guard():
model_vocoder = Vocoder(cfg, args.batch_size) model_vocoder = Vocoder(cfg, args.batch_size)
model_vocoder.set_dict(load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder"))) model_vocoder.set_dict(
load_checkpoint(
str(args.vocoder_step),
os.path.join(args.checkpoint_path, "vocoder")))
model_vocoder.eval() model_vocoder.eval()
# init input # init input
text = np.asarray(text_to_sequence(text_input)) text = np.asarray(text_to_sequence(text_input))
text = fluid.layers.unsqueeze(dg.to_variable(text),[0]) text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
mel_input = dg.to_variable(np.zeros([1,1,80])).astype(np.float32) mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
pos_text = np.arange(1, text.shape[1]+1) pos_text = np.arange(1, text.shape[1] + 1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
pbar = tqdm(range(args.max_len)) pbar = tqdm(range(args.max_len))
for i in pbar: for i in pbar:
pos_mel = np.arange(1, mel_input.shape[1]+1) pos_mel = np.arange(1, mel_input.shape[1] + 1)
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel),[0]) pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(text, mel_input, pos_text, pos_mel) mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1) text, mel_input, pos_text, pos_mel)
mel_input = fluid.layers.concat(
[mel_input, postnet_pred[:, -1:, :]], axis=1)
mag_pred = model_vocoder(postnet_pred) mag_pred = model_vocoder(postnet_pred)
_ljspeech_processor = audio.AudioProcessor( _ljspeech_processor = audio.AudioProcessor(
sample_rate=cfg['audio']['sr'], sample_rate=cfg['audio']['sr'],
num_mels=cfg['audio']['num_mels'], num_mels=cfg['audio']['num_mels'],
min_level_db=cfg['audio']['min_level_db'], min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg['audio']['ref_level_db'], ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg['audio']['n_fft'], n_fft=cfg['audio']['n_fft'],
win_length= cfg['audio']['win_length'], win_length=cfg['audio']['win_length'],
hop_length= cfg['audio']['hop_length'], hop_length=cfg['audio']['hop_length'],
power=cfg['audio']['power'], power=cfg['audio']['power'],
preemphasis=cfg['audio']['preemphasis'], preemphasis=cfg['audio']['preemphasis'],
signal_norm=True, signal_norm=True,
...@@ -86,13 +108,18 @@ def synthesis(text_input, args): ...@@ -86,13 +108,18 @@ def synthesis(text_input, args):
do_trim_silence=False, do_trim_silence=False,
sound_norm=False) sound_norm=False)
wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy()) wav = _ljspeech_processor.inv_spectrogram(
fluid.layers.transpose(
fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
writer.add_audio(text_input, wav, 0, cfg['audio']['sr']) writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
if not os.path.exists(args.sample_path): if not os.path.exists(args.sample_path):
os.mkdir(args.sample_path) os.mkdir(args.sample_path)
write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav) write(
os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'],
wav)
writer.close() writer.close()
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Synthesis model") parser = argparse.ArgumentParser(description="Synthesis model")
add_config_options_to_parser(parser) add_config_options_to_parser(parser)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
from tqdm import tqdm from tqdm import tqdm
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
...@@ -16,8 +29,10 @@ from parakeet.models.transformer_tts.utils import cross_entropy ...@@ -16,8 +29,10 @@ from parakeet.models.transformer_tts.utils import cross_entropy
from data import LJSpeechLoader from data import LJSpeechLoader
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
def load_checkpoint(step, model_path): def load_checkpoint(step, model_path):
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step)) model_dict, opti_dict = fluid.dygraph.load_dygraph(
os.path.join(model_path, step))
new_state_dict = OrderedDict() new_state_dict = OrderedDict()
for param in model_dict: for param in model_dict:
if param.startswith('_layers.'): if param.startswith('_layers.'):
...@@ -40,22 +55,27 @@ def main(args): ...@@ -40,22 +55,27 @@ def main(args):
if args.use_gpu else fluid.CPUPlace()) if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(args.log_dir): if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir) os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'transformer') path = os.path.join(args.log_dir, 'transformer')
writer = SummaryWriter(path) if local_rank == 0 else None writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place): with dg.guard(place):
model = TransformerTTS(cfg) model = TransformerTTS(cfg)
model.train() model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']), optimizer = fluid.optimizer.AdamOptimizer(
parameter_list=model.parameters()) learning_rate=dg.NoamDecay(1 / (
cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader() parameter_list=model.parameters())
reader = LJSpeechLoader(
cfg, args, nranks, local_rank, shuffle=True).reader()
if args.checkpoint_path is not None: if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer")) model_dict, opti_dict = load_checkpoint(
str(args.transformer_step),
os.path.join(args.checkpoint_path, "transformer"))
model.set_dict(model_dict) model.set_dict(model_dict)
optimizer.set_dict(opti_dict) optimizer.set_dict(opti_dict)
global_step = args.transformer_step global_step = args.transformer_step
...@@ -64,86 +84,112 @@ def main(args): ...@@ -64,86 +84,112 @@ def main(args):
if args.use_data_parallel: if args.use_data_parallel:
strategy = dg.parallel.prepare_context() strategy = dg.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy) model = fluid.dygraph.parallel.DataParallel(model, strategy)
for epoch in range(args.epochs): for epoch in range(args.epochs):
pbar = tqdm(reader) pbar = tqdm(reader)
for i, data in enumerate(pbar): for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch) pbar.set_description('Processing at epoch %d' % epoch)
character, mel, mel_input, pos_text, pos_mel, text_length, _ = data character, mel, mel_input, pos_text, pos_mel, text_length, _ = data
global_step += 1 global_step += 1
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel) mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
character, mel_input, pos_text, pos_mel)
label = (pos_mel == 0).astype(np.float32) label = (pos_mel == 0).astype(np.float32)
mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel))) mel_loss = layers.mean(
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) layers.abs(layers.elementwise_sub(mel_pred, mel)))
post_mel_loss = layers.mean(
layers.abs(layers.elementwise_sub(postnet_pred, mel)))
loss = mel_loss + post_mel_loss loss = mel_loss + post_mel_loss
# Note: When used stop token loss the learning did not work. # Note: When used stop token loss the learning did not work.
if args.stop_token: if args.stop_token:
stop_loss = cross_entropy(stop_preds, label) stop_loss = cross_entropy(stop_preds, label)
loss = loss + stop_loss loss = loss + stop_loss
if local_rank==0: if local_rank == 0:
writer.add_scalars('training_loss', { writer.add_scalars('training_loss', {
'mel_loss':mel_loss.numpy(), 'mel_loss': mel_loss.numpy(),
'post_mel_loss':post_mel_loss.numpy() 'post_mel_loss': post_mel_loss.numpy()
}, global_step) }, global_step)
if args.stop_token: if args.stop_token:
writer.add_scalar('stop_loss', stop_loss.numpy(), global_step) writer.add_scalar('stop_loss',
stop_loss.numpy(), global_step)
if args.use_data_parallel: if args.use_data_parallel:
writer.add_scalars('alphas', { writer.add_scalars('alphas', {
'encoder_alpha':model._layers.encoder.alpha.numpy(), 'encoder_alpha':
'decoder_alpha':model._layers.decoder.alpha.numpy(), model._layers.encoder.alpha.numpy(),
'decoder_alpha':
model._layers.decoder.alpha.numpy(),
}, global_step) }, global_step)
else: else:
writer.add_scalars('alphas', { writer.add_scalars('alphas', {
'encoder_alpha':model.encoder.alpha.numpy(), 'encoder_alpha': model.encoder.alpha.numpy(),
'decoder_alpha':model.decoder.alpha.numpy(), 'decoder_alpha': model.decoder.alpha.numpy(),
}, global_step) }, global_step)
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) writer.add_scalar('learning_rate',
optimizer._learning_rate.step().numpy(),
global_step)
if global_step % args.image_step == 1: if global_step % args.image_step == 1:
for i, prob in enumerate(attn_probs): for i, prob in enumerate(attn_probs):
for j in range(4): for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) x = np.uint8(
writer.add_image('Attention_%d_0'%global_step, x, i*4+j, dataformats="HWC") cm.viridis(prob.numpy()[j * 16]) * 255)
writer.add_image(
'Attention_%d_0' % global_step,
x,
i * 4 + j,
dataformats="HWC")
for i, prob in enumerate(attn_enc): for i, prob in enumerate(attn_enc):
for j in range(4): for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) x = np.uint8(
writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC") cm.viridis(prob.numpy()[j * 16]) * 255)
writer.add_image(
'Attention_enc_%d_0' % global_step,
x,
i * 4 + j,
dataformats="HWC")
for i, prob in enumerate(attn_dec): for i, prob in enumerate(attn_dec):
for j in range(4): for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255) x = np.uint8(
writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC") cm.viridis(prob.numpy()[j * 16]) * 255)
writer.add_image(
'Attention_dec_%d_0' % global_step,
x,
i * 4 + j,
dataformats="HWC")
if args.use_data_parallel: if args.use_data_parallel:
loss = model.scale_loss(loss) loss = model.scale_loss(loss)
loss.backward() loss.backward()
model.apply_collective_grads() model.apply_collective_grads()
else: else:
loss.backward() loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh'])) optimizer.minimize(
loss,
grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
'grad_clip_thresh']))
model.clear_gradients() model.clear_gradients()
# save checkpoint # save checkpoint
if local_rank==0 and global_step % args.save_step == 0: if local_rank == 0 and global_step % args.save_step == 0:
if not os.path.exists(args.save_path): if not os.path.exists(args.save_path):
os.mkdir(args.save_path) os.mkdir(args.save_path)
save_path = os.path.join(args.save_path,'transformer/%d' % global_step) save_path = os.path.join(args.save_path,
'transformer/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0: if local_rank == 0:
writer.close() writer.close()
if __name__ =='__main__':
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Train TransformerTTS model") parser = argparse.ArgumentParser(description="Train TransformerTTS model")
add_config_options_to_parser(parser) add_config_options_to_parser(parser)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
import os import os
from tqdm import tqdm from tqdm import tqdm
...@@ -13,6 +26,7 @@ import paddle.fluid.layers as layers ...@@ -13,6 +26,7 @@ import paddle.fluid.layers as layers
from data import LJSpeechLoader from data import LJSpeechLoader
from parakeet.models.transformer_tts.vocoder import Vocoder from parakeet.models.transformer_tts.vocoder import Vocoder
def load_checkpoint(step, model_path): def load_checkpoint(step, model_path):
model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step)) model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
new_state_dict = OrderedDict() new_state_dict = OrderedDict()
...@@ -23,8 +37,9 @@ def load_checkpoint(step, model_path): ...@@ -23,8 +37,9 @@ def load_checkpoint(step, model_path):
new_state_dict[param] = model_dict[param] new_state_dict[param] = model_dict[param]
return new_state_dict, opti_dict return new_state_dict, opti_dict
def main(args): def main(args):
local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0 local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1 nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
...@@ -35,23 +50,26 @@ def main(args): ...@@ -35,23 +50,26 @@ def main(args):
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id) place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if args.use_data_parallel else fluid.CUDAPlace(0) if args.use_data_parallel else fluid.CUDAPlace(0)
if args.use_gpu else fluid.CPUPlace()) if args.use_gpu else fluid.CPUPlace())
if not os.path.exists(args.log_dir): if not os.path.exists(args.log_dir):
os.mkdir(args.log_dir) os.mkdir(args.log_dir)
path = os.path.join(args.log_dir,'vocoder') path = os.path.join(args.log_dir, 'vocoder')
writer = SummaryWriter(path) if local_rank == 0 else None writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place): with dg.guard(place):
model = Vocoder(cfg, args.batch_size) model = Vocoder(cfg, args.batch_size)
model.train() model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']), optimizer = fluid.optimizer.AdamOptimizer(
parameter_list=model.parameters()) learning_rate=dg.NoamDecay(1 / (
cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
parameter_list=model.parameters())
if args.checkpoint_path is not None: if args.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder")) model_dict, opti_dict = load_checkpoint(
str(args.vocoder_step),
os.path.join(args.checkpoint_path, "vocoder"))
model.set_dict(model_dict) model.set_dict(model_dict)
optimizer.set_dict(opti_dict) optimizer.set_dict(opti_dict)
global_step = args.vocoder_step global_step = args.vocoder_step
...@@ -61,48 +79,55 @@ def main(args): ...@@ -61,48 +79,55 @@ def main(args):
strategy = dg.parallel.prepare_context() strategy = dg.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy) model = fluid.dygraph.parallel.DataParallel(model, strategy)
reader = LJSpeechLoader(cfg, args, nranks, local_rank, is_vocoder=True).reader() reader = LJSpeechLoader(
cfg, args, nranks, local_rank, is_vocoder=True).reader()
for epoch in range(args.epochs): for epoch in range(args.epochs):
pbar = tqdm(reader) pbar = tqdm(reader)
for i, data in enumerate(pbar): for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch) pbar.set_description('Processing at epoch %d' % epoch)
mel, mag = data mel, mag = data
mag = dg.to_variable(mag.numpy()) mag = dg.to_variable(mag.numpy())
mel = dg.to_variable(mel.numpy()) mel = dg.to_variable(mel.numpy())
global_step += 1 global_step += 1
mag_pred = model(mel) mag_pred = model(mel)
loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag))) loss = layers.mean(
layers.abs(layers.elementwise_sub(mag_pred, mag)))
if args.use_data_parallel: if args.use_data_parallel:
loss = model.scale_loss(loss) loss = model.scale_loss(loss)
loss.backward() loss.backward()
model.apply_collective_grads() model.apply_collective_grads()
else: else:
loss.backward() loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh'])) optimizer.minimize(
loss,
grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
'grad_clip_thresh']))
model.clear_gradients() model.clear_gradients()
if local_rank==0: if local_rank == 0:
writer.add_scalars('training_loss',{ writer.add_scalars('training_loss', {
'loss':loss.numpy(), 'loss': loss.numpy(),
}, global_step) }, global_step)
if global_step % args.save_step == 0: if global_step % args.save_step == 0:
if not os.path.exists(args.save_path): if not os.path.exists(args.save_path):
os.mkdir(args.save_path) os.mkdir(args.save_path)
save_path = os.path.join(args.save_path,'vocoder/%d' % global_step) save_path = os.path.join(args.save_path,
'vocoder/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path) dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path) dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0: if local_rank == 0:
writer.close() writer.close()
if __name__ == '__main__':
    # Script entry point: build the command-line parser, let the shared
    # helper register the configuration options on it, then start training.
    parser = argparse.ArgumentParser(description="Train vocoder model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()
    # Print the whole config setting.
    pprint(args)
    main(args)
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
import random import random
from pprint import pprint from pprint import pprint
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
import random import random
from pprint import pprint from pprint import pprint
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os import os
import random import random
import subprocess import subprocess
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools import itertools
import os import os
import time import time
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.0.0" __version__ = "0.0.0"
from . import data, g2p, models, modules from . import data, g2p, models, modules
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import AudioProcessor from .audio import AudioProcessor
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa import librosa
import soundfile as sf import soundfile as sf
import numpy as np import numpy as np
import scipy.io import scipy.io
import scipy.signal import scipy.signal
class AudioProcessor(object): class AudioProcessor(object):
def __init__(self, def __init__(
sample_rate=None, # int, sampling rate self,
num_mels=None, # int, bands of mel spectrogram sample_rate=None, # int, sampling rate
min_level_db=None, # float, minimum level db num_mels=None, # int, bands of mel spectrogram
ref_level_db=None, # float, reference level db min_level_db=None, # float, minimum level db
n_fft=None, # int: number of samples in a frame for stft ref_level_db=None, # float, reference level db
win_length=None, # int: the same meaning with n_fft n_fft=None, # int: number of samples in a frame for stft
hop_length=None, # int: number of samples between neighboring frame win_length=None, # int: the same meaning with n_fft
power=None, # float:power to raise before griffin-lim hop_length=None, # int: number of samples between neighboring frame
preemphasis=None, # float: preemphasis coefficident power=None, # float:power to raise before griffin-lim
signal_norm=None, # preemphasis=None, # float: preemphasis coefficident
symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_form] signal_norm=None, #
max_norm=None, # float, max norm symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_form]
mel_fmin=None, # int: mel spectrogram's minimum frequency max_norm=None, # float, max norm
mel_fmax=None, # int: mel spectrogram's maximum frequency mel_fmin=None, # int: mel spectrogram's minimum frequency
clip_norm=True, # bool: clip spectrogram's norm mel_fmax=None, # int: mel spectrogram's maximum frequency
griffin_lim_iters=None, # int: clip_norm=True, # bool: clip spectrogram's norm
do_trim_silence=False, # bool: trim silence griffin_lim_iters=None, # int:
sound_norm=False, do_trim_silence=False, # bool: trim silence
**kwargs): sound_norm=False,
**kwargs):
self.sample_rate = sample_rate self.sample_rate = sample_rate
self.num_mels = num_mels self.num_mels = num_mels
self.min_level_db = min_level_db self.min_level_db = min_level_db
...@@ -34,8 +50,8 @@ class AudioProcessor(object): ...@@ -34,8 +50,8 @@ class AudioProcessor(object):
self.n_fft = n_fft self.n_fft = n_fft
self.win_length = win_length or n_fft self.win_length = win_length or n_fft
# hop length defaults to 1/4 window_length # hop length defaults to 1/4 window_length
self.hop_length = hop_length or 0.25 * self.win_length self.hop_length = hop_length or 0.25 * self.win_length
self.power = power self.power = power
self.preemphasis = float(preemphasis) self.preemphasis = float(preemphasis)
...@@ -52,7 +68,8 @@ class AudioProcessor(object): ...@@ -52,7 +68,8 @@ class AudioProcessor(object):
self.do_trim_silence = do_trim_silence self.do_trim_silence = do_trim_silence
self.sound_norm = sound_norm self.sound_norm = sound_norm
self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters() self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters(
)
def _stft_parameters(self): def _stft_parameters(self):
"""compute frame length and hop length in ms""" """compute frame length and hop length in ms"""
...@@ -65,44 +82,54 @@ class AudioProcessor(object): ...@@ -65,44 +82,54 @@ class AudioProcessor(object):
"""object repr""" """object repr"""
cls_name_str = self.__class__.__name__ cls_name_str = self.__class__.__name__
members = vars(self) members = vars(self)
dict_str = "\n".join([" {}: {},".format(k, v) for k, v in members.items()]) dict_str = "\n".join(
[" {}: {},".format(k, v) for k, v in members.items()])
repr_str = "{}(\n{})\n".format(cls_name_str, dict_str) repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
return repr_str return repr_str
def save_wav(self, path, wav):
    """Save audio with scipy.io.wavfile as 16-bit integers.

    The signal is rescaled so that its largest absolute sample maps to
    32767. The peak used for scaling is floored at 0.01 so that a
    (near-)silent signal is not amplified without bound.

    Args:
        path: file name or writable binary file object, forwarded to
            ``scipy.io.wavfile.write``.
        wav: array-like of audio samples.
    """
    # Import the submodule explicitly; a bare ``import scipy.io`` is not
    # guaranteed to expose ``scipy.io.wavfile`` on every SciPy release.
    from scipy.io import wavfile

    wav = np.asarray(wav)
    # np.max raises on an empty array, so fall back to a zero peak there.
    peak = np.max(np.abs(wav)) if wav.size else 0.0
    wav_norm = wav * (32767 / max(0.01, peak))
    # ndarray has no ``as_type`` method (the previous spelling raised
    # AttributeError on every call); ``astype`` performs the conversion.
    wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
def load_wav(self, path, sr=None):
    """Load wav -> trim silence -> rescale.

    Args:
        path: audio file path passed to ``librosa.load``.
        sr: kept for interface compatibility; it is not used. The file is
            always read at its native sampling rate, which must equal
            ``self.sample_rate``.

    Returns:
        The loaded samples, silence-trimmed when ``self.do_trim_silence``
        is set and rescaled when ``self.sound_norm`` is set.

    Raises:
        AssertionError: if the file's sampling rate differs from
            ``self.sample_rate``.
    """
    x, sr = librosa.load(path, sr=None)
    assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(
        sr, self.sample_rate)
    if self.do_trim_silence:
        try:
            x = self.trim_silence(x)
        except ValueError:
            # Best effort: keep the untrimmed signal and report the file.
            print(" [!] File cannot be trimmed for silence - {}".format(
                path))
    if self.sound_norm:
        # Scale so the maximum sample becomes 0.9 (the original author
        # left the choice of 0.9 as an open question). A zero peak would
        # turn the whole signal into NaN/inf and an empty signal makes
        # ``max`` raise, so both are returned unscaled.
        peak = x.max() if x.size else 0
        if peak != 0:
            x = x / peak * 0.9
    return x
def trim_silence(self, wav):
    """Trim silent parts with a 60 dB threshold and a 0.01 s margin.

    Args:
        wav: 1-D array of audio samples.

    Returns:
        The trimmed signal (first element of the tuple returned by
        ``librosa.effects.trim``).
    """
    margin = int(self.sample_rate * 0.01)
    # ``wav[0:-0]`` is an empty slice, so only cut the margin when it is
    # at least one sample long.
    if margin > 0:
        wav = wav[margin:-margin]
    # hop_length may be a float when it was derived as 0.25 * win_length;
    # librosa expects an integer number of samples.
    trimmed_wav, _ = librosa.effects.trim(
        wav,
        top_db=60,
        frame_length=self.win_length,
        hop_length=int(self.hop_length))
    return trimmed_wav
def apply_preemphasis(self, x):
    """Filter ``x`` with the pre-emphasis FIR filter.

    The filter has numerator ``[1, -self.preemphasis]`` and denominator
    ``[1]``, i.e. ``y[n] = x[n] - preemphasis * x[n-1]``.

    Raises:
        RuntimeError: if ``self.preemphasis`` equals zero.
    """
    coef = self.preemphasis
    if coef == 0.:
        raise RuntimeError(
            " !! Preemphasis coefficient should be positive. ")
    numerator = [1., -coef]
    denominator = [1.]
    return scipy.signal.lfilter(numerator, denominator, x)
def apply_inv_preemphasis(self, x):
    """Undo pre-emphasis by running the inverse (recursive) filter.

    The filter has numerator ``[1]`` and denominator
    ``[1, -self.preemphasis]``, i.e.
    ``y[n] = x[n] + preemphasis * y[n-1]``.

    Raises:
        RuntimeError: if ``self.preemphasis`` equals zero.
    """
    coef = self.preemphasis
    if coef == 0.:
        raise RuntimeError(
            " !! Preemphasis coefficient should be positive. ")
    numerator = [1.]
    denominator = [1., -coef]
    return scipy.signal.lfilter(numerator, denominator, x)
def _amplitude_to_db(self, x): def _amplitude_to_db(self, x):
...@@ -125,12 +152,11 @@ class AudioProcessor(object): ...@@ -125,12 +152,11 @@ class AudioProcessor(object):
"""return mel basis for mel scale""" """return mel basis for mel scale"""
if self.mel_fmax is not None: if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2 assert self.mel_fmax <= self.sample_rate // 2
return librosa.filters.mel( return librosa.filters.mel(self.sample_rate,
self.sample_rate, self.n_fft,
self.n_fft, n_mels=self.num_mels,
n_mels=self.num_mels, fmin=self.mel_fmin,
fmin=self.mel_fmin, fmax=self.mel_fmax)
fmax=self.mel_fmax)
def _normalize(self, S): def _normalize(self, S):
"""put values in [0, self.max_norm] or [-self.max_norm, self,max_norm]""" """put values in [0, self.max_norm] or [-self.max_norm, self,max_norm]"""
...@@ -156,25 +182,29 @@ class AudioProcessor(object): ...@@ -156,25 +182,29 @@ class AudioProcessor(object):
if self.symmetric_norm: if self.symmetric_norm:
if self.clip_norm: if self.clip_norm:
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
S_denorm = (S_denorm + self.max_norm) * (-self.min_level_db) / (2 * self.max_norm) + self.min_level_db S_denorm = (S_denorm + self.max_norm) * (
-self.min_level_db) / (2 * self.max_norm
) + self.min_level_db
return S_denorm return S_denorm
else: else:
if self.clip_norm: if self.clip_norm:
S_denorm = np.clip(S_denorm, 0, self.max_norm) S_denorm = np.clip(S_denorm, 0, self.max_norm)
S_denorm = S_denorm * (-self.min_level_db)/ self.max_norm + self.min_level_db S_denorm = S_denorm * (-self.min_level_db
) / self.max_norm + self.min_level_db
return S_denorm return S_denorm
else: else:
return S return S
def _stft(self, y):
    """Return ``librosa.stft`` of ``y`` using this processor's settings.

    Uses ``self.n_fft``, ``self.win_length`` and ``self.hop_length``.
    """
    return librosa.stft(
        y=y,
        n_fft=self.n_fft,
        win_length=self.win_length,
        # hop_length may be a float when it was derived as
        # 0.25 * win_length; librosa requires an integer hop.
        hop_length=int(self.hop_length))
def _istft(self, S):
    """Return ``librosa.istft`` of ``S`` using this processor's settings.

    Uses ``self.hop_length`` and ``self.win_length``.
    """
    return librosa.istft(
        S,
        # hop_length may be a float when it was derived as
        # 0.25 * win_length; librosa requires an integer hop.
        hop_length=int(self.hop_length),
        win_length=self.win_length)
def spectrogram(self, y): def spectrogram(self, y):
"""compute linear spectrogram(amplitude) """compute linear spectrogram(amplitude)
...@@ -195,7 +225,8 @@ class AudioProcessor(object): ...@@ -195,7 +225,8 @@ class AudioProcessor(object):
D = self._stft(self.apply_preemphasis(y)) D = self._stft(self.apply_preemphasis(y))
else: else:
D = self._stft(y) D = self._stft(y)
S = self._amplitude_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db S = self._amplitude_to_db(self._linear_to_mel(np.abs(
D))) - self.ref_level_db
return self._normalize(S) return self._normalize(S)
def inv_spectrogram(self, spectrogram): def inv_spectrogram(self, spectrogram):
...@@ -203,16 +234,16 @@ class AudioProcessor(object): ...@@ -203,16 +234,16 @@ class AudioProcessor(object):
S = self._denormalize(spectrogram) S = self._denormalize(spectrogram)
S = self._db_to_amplitude(S + self.ref_level_db) S = self._db_to_amplitude(S + self.ref_level_db)
if self.preemphasis: if self.preemphasis:
return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
return self._griffin_lim(S ** self.power) return self._griffin_lim(S**self.power)
def inv_melspectrogram(self, mel_spectrogram):
    """Reconstruct a waveform from a normalized mel spectrogram.

    Steps: denormalize, add ``self.ref_level_db`` and convert dB to
    amplitude, map mel to linear, raise to ``self.power`` and run
    Griffin-Lim. When ``self.preemphasis`` is truthy the inverse
    pre-emphasis filter is applied to the result.
    """
    mel_db = self._denormalize(mel_spectrogram)
    mel_amp = self._db_to_amplitude(mel_db + self.ref_level_db)
    linear = self._mel_to_linear(np.abs(mel_amp))
    wav = self._griffin_lim(linear**self.power)
    if self.preemphasis:
        wav = self.apply_inv_preemphasis(wav)
    return wav
def out_linear_to_mel(self, linear_spec): def out_linear_to_mel(self, linear_spec):
"""convert output linear spec to mel spec""" """convert output linear spec to mel spec"""
...@@ -222,7 +253,7 @@ class AudioProcessor(object): ...@@ -222,7 +253,7 @@ class AudioProcessor(object):
S = self._amplitude_to_db(S) - self.ref_level_db S = self._amplitude_to_db(S) - self.ref_level_db
mel = self._normalize(S) mel = self._normalize(S)
return mel return mel
def _griffin_lim(self, S): def _griffin_lim(self, S):
angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex) S_complex = np.abs(S).astype(np.complex)
...@@ -234,18 +265,18 @@ class AudioProcessor(object): ...@@ -234,18 +265,18 @@ class AudioProcessor(object):
@staticmethod @staticmethod
def mulaw_encode(wav, qc):
    """Mu-law compand ``wav`` and quantize it with ``mu = 2**qc - 1``.

    The input is not clipped here; presumably callers pass samples in
    [-1, 1], for which the returned levels lie in [0, mu]. The result is
    a floating-point array because it comes from ``np.floor``.
    """
    mu = 2**qc - 1
    magnitude = np.abs(wav)
    companded = np.sign(wav) * np.log(1 + mu * magnitude) / np.log(1. + mu)
    # Quantize signal to the specified number of levels.
    levels = (companded + 1) / 2 * mu + 0.5
    return np.floor(levels)
@staticmethod @staticmethod
def mulaw_decode(wav, qc):
    """Apply the mu-law expansion formula with ``mu = 2**qc - 1``.

    Computes ``sign(wav) / mu * ((1 + mu)**|wav| - 1)`` elementwise.
    NOTE(review): the formula treats ``wav`` as a signed companded value,
    presumably in [-1, 1], rather than as integer quantization levels;
    confirm against callers.
    """
    mu = 2**qc - 1
    signed_scale = np.sign(wav) / mu
    expanded = (1 + mu)**np.abs(wav) - 1
    return signed_scale * expanded
@staticmethod @staticmethod
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .dataset import * from .dataset import *
from .datacargo import * from .datacargo import *
from .sampler import * from .sampler import *
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" """
functions to make batch for arrays which satisfy some conditions. functions to make batch for arrays which satisfy some conditions.
""" """
import numpy as np import numpy as np
class TextIDBatcher(object):
    """Callable that stores batching options and forwards to ``batch_text_id``.

    Attributes are set once in the constructor and passed unchanged on every
    call, so an instance can be used wherever a one-argument batching
    function is expected.
    """

    def __init__(self, pad_id=0, dtype=np.int64):
        # pad_id: value passed to batch_text_id as the padding id.
        self.pad_id = pad_id
        # dtype: dtype passed to batch_text_id for the batched array.
        self.dtype = dtype

    def __call__(self, minibatch):
        """Batch ``minibatch`` with the stored ``pad_id`` and ``dtype``."""
        return batch_text_id(minibatch, pad_id=self.pad_id, dtype=self.dtype)
def batch_text_id(minibatch, pad_id=0, dtype=np.int64): def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
""" """
minibatch: List[Example] minibatch: List[Example]
...@@ -20,26 +36,32 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64): ...@@ -20,26 +36,32 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
""" """
peek_example = minibatch[0] peek_example = minibatch[0]
assert len(peek_example.shape) == 1, "text example is an 1D tensor" assert len(peek_example.shape) == 1, "text example is an 1D tensor"
lengths = [example.shape[0] for example in minibatch] # assume (channel, n_samples) or (n_samples, ) lengths = [example.shape[0] for example in minibatch
] # assume (channel, n_samples) or (n_samples, )
max_len = np.max(lengths) max_len = np.max(lengths)
batch = [] batch = []
for example in minibatch: for example in minibatch:
pad_len = max_len - example.shape[0] pad_len = max_len - example.shape[0]
batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_id)) batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_id))
return np.array(batch, dtype=dtype) return np.array(batch, dtype=dtype)
class WavBatcher(object):
    """Callable that stores batching options and forwards to ``batch_wav``."""

    def __init__(self, pad_value=0., dtype=np.float32):
        # pad_value: value passed to batch_wav for padding.
        self.pad_value = pad_value
        # dtype: dtype passed to batch_wav for the batched array.
        self.dtype = dtype

    def __call__(self, minibatch):
        """Batch ``minibatch`` with the stored ``pad_value`` and ``dtype``."""
        return batch_wav(minibatch, pad_value=self.pad_value, dtype=self.dtype)
def batch_wav(minibatch, pad_value=0., dtype=np.float32): def batch_wav(minibatch, pad_value=0., dtype=np.float32):
""" """
minibatch: List[Example] minibatch: List[Example]
...@@ -51,18 +73,25 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32): ...@@ -51,18 +73,25 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
mono_channel = True mono_channel = True
elif len(peek_example.shape) == 2: elif len(peek_example.shape) == 2:
mono_channel = False mono_channel = False
lengths = [example.shape[-1] for example in minibatch] # assume (channel, n_samples) or (n_samples, ) lengths = [example.shape[-1] for example in minibatch
] # assume (channel, n_samples) or (n_samples, )
max_len = np.max(lengths) max_len = np.max(lengths)
batch = [] batch = []
for example in minibatch: for example in minibatch:
pad_len = max_len - example.shape[-1] pad_len = max_len - example.shape[-1]
if mono_channel: if mono_channel:
batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_value)) batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_value))
else: else:
batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no batch.append(
np.pad(example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value)) # what about PCM, no
return np.array(batch, dtype=dtype) return np.array(batch, dtype=dtype)
...@@ -75,6 +104,7 @@ class SpecBatcher(object): ...@@ -75,6 +104,7 @@ class SpecBatcher(object):
out = batch_spec(minibatch, pad_value=self.pad_value, dtype=self.dtype) out = batch_spec(minibatch, pad_value=self.pad_value, dtype=self.dtype)
return out return out
def batch_spec(minibatch, pad_value=0., dtype=np.float32): def batch_spec(minibatch, pad_value=0., dtype=np.float32):
""" """
minibatch: List[Example] minibatch: List[Example]
...@@ -86,16 +116,23 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32): ...@@ -86,16 +116,23 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32):
mono_channel = True mono_channel = True
elif len(peek_example.shape) == 3: elif len(peek_example.shape) == 3:
mono_channel = False mono_channel = False
lengths = [example.shape[-1] for example in minibatch] # assume (channel, F, n_frame) or (F, n_frame) lengths = [example.shape[-1] for example in minibatch
max_len = np.max(lengths) ] # assume (channel, F, n_frame) or (F, n_frame)
max_len = np.max(lengths)
batch = [] batch = []
for example in minibatch: for example in minibatch:
pad_len = max_len - example.shape[-1] pad_len = max_len - example.shape[-1]
if mono_channel: if mono_channel:
batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) batch.append(
np.pad(example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value))
else: else:
batch.append(np.pad(example, [(0, 0), (0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no batch.append(
np.pad(example, [(0, 0), (0, 0), (0, pad_len)],
return np.array(batch, dtype=dtype) mode='constant',
\ No newline at end of file constant_values=pad_value)) # what about PCM, no
return np.array(batch, dtype=dtype)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six import six
from .sampler import SequentialSampler, RandomSampler, BatchSampler from .sampler import SequentialSampler, RandomSampler, BatchSampler
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six import six
import numpy as np import numpy as np
...@@ -9,8 +23,7 @@ class DatasetMixin(object): ...@@ -9,8 +23,7 @@ class DatasetMixin(object):
if isinstance(index, slice): if isinstance(index, slice):
start, stop, step = index.indices(len(self)) start, stop, step = index.indices(len(self))
return [ return [
self.get_example(i) self.get_example(i) for i in six.moves.range(start, stop, step)
for i in six.moves.range(start, stop, step)
] ]
elif isinstance(index, (list, np.ndarray)): elif isinstance(index, (list, np.ndarray)):
return [self.get_example(i) for i in index] return [self.get_example(i) for i in index]
...@@ -180,8 +193,7 @@ class ChainDataset(DatasetMixin): ...@@ -180,8 +193,7 @@ class ChainDataset(DatasetMixin):
def get_example(self, i): def get_example(self, i):
if i < 0: if i < 0:
raise IndexError( raise IndexError("ChainDataset doesnot support negative indexing.")
"ChainDataset doesnot support negative indexing.")
for dataset in self._datasets: for dataset in self._datasets:
if i < len(dataset): if i < len(dataset):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" """
In most cases, we have a non-stream dataset, which means we can randomly access it with __getitem__, and we can get the length of the dataset with __len__.
...@@ -6,10 +19,10 @@ This suffices for a sampler. We implemente sampler as iterable of valid indices. ...@@ -6,10 +19,10 @@ This suffices for a sampler. We implemente sampler as iterable of valid indices.
So the sampler is only responsible for generating valid indices. So the sampler is only responsible for generating valid indices.
""" """
import numpy as np import numpy as np
import random import random
class Sampler(object): class Sampler(object):
def __init__(self, data_source): def __init__(self, data_source):
pass pass
...@@ -23,7 +36,7 @@ class Sampler(object): ...@@ -23,7 +36,7 @@ class Sampler(object):
class SequentialSampler(Sampler): class SequentialSampler(Sampler):
def __init__(self, data_source): def __init__(self, data_source):
self.data_source = data_source self.data_source = data_source
def __iter__(self): def __iter__(self):
return iter(range(len(self.data_source))) return iter(range(len(self.data_source)))
...@@ -42,12 +55,14 @@ class RandomSampler(Sampler): ...@@ -42,12 +55,14 @@ class RandomSampler(Sampler):
"replacement={}".format(self.replacement)) "replacement={}".format(self.replacement))
if self._num_samples is not None and not replacement: if self._num_samples is not None and not replacement:
raise ValueError("With replacement=False, num_samples should not be specified, " raise ValueError(
"since a random permutation will be performed.") "With replacement=False, num_samples should not be specified, "
"since a random permutation will be performed.")
if not isinstance(self.num_samples, int) or self.num_samples <= 0: if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError("num_samples should be a positive integer " raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(self.num_samples)) "value, but got num_samples={}".format(
self.num_samples))
@property @property
def num_samples(self): def num_samples(self):
...@@ -59,7 +74,9 @@ class RandomSampler(Sampler): ...@@ -59,7 +74,9 @@ class RandomSampler(Sampler):
def __iter__(self): def __iter__(self):
n = len(self.data_source) n = len(self.data_source)
if self.replacement: if self.replacement:
return iter(np.random.randint(0, n, size=(self.num_samples,), dtype=np.int64).tolist()) return iter(
np.random.randint(
0, n, size=(self.num_samples, ), dtype=np.int64).tolist())
return iter(np.random.permutation(n).tolist()) return iter(np.random.permutation(n).tolist())
def __len__(self): def __len__(self):
...@@ -76,7 +93,8 @@ class SubsetRandomSampler(Sampler): ...@@ -76,7 +93,8 @@ class SubsetRandomSampler(Sampler):
self.indices = indices self.indices = indices
def __iter__(self): def __iter__(self):
return (self.indices[i] for i in np.random.permutation(len(self.indices))) return (self.indices[i]
for i in np.random.permutation(len(self.indices)))
def __len__(self): def __len__(self):
return len(self.indices) return len(self.indices)
...@@ -89,9 +107,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler): ...@@ -89,9 +107,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
3. Permute mini-batches
""" """
def __init__(self, lengths, batch_size=4, batch_group_size=None, def __init__(self,
lengths,
batch_size=4,
batch_group_size=None,
permutate=True): permutate=True):
_lengths = np.array(lengths, dtype=np.int64) # maybe better implement length as a sort key _lengths = np.array(
lengths,
dtype=np.int64) # maybe better implement length as a sort key
self.lengths = np.sort(_lengths) self.lengths = np.sort(_lengths)
self.sorted_indices = np.argsort(_lengths) self.sorted_indices = np.argsort(_lengths)
...@@ -112,20 +135,21 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler): ...@@ -112,20 +135,21 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
for i in range(len(indices) // batch_group_size): for i in range(len(indices) // batch_group_size):
s = i * batch_group_size s = i * batch_group_size
e = s + batch_group_size e = s + batch_group_size
random.shuffle(indices[s: e]) # inplace random.shuffle(indices[s:e]) # inplace
# Permutate batches # Permutate batches
if self.permutate: if self.permutate:
perm = np.arange(len(indices[:e]) // self.batch_size) perm = np.arange(len(indices[:e]) // self.batch_size)
random.shuffle(perm) random.shuffle(perm)
indices[:e] = indices[:e].reshape(-1, self.batch_size)[perm, :].reshape(-1) indices[:e] = indices[:e].reshape(
-1, self.batch_size)[perm, :].reshape(-1)
# Handle last elements # Handle last elements
s += batch_group_size s += batch_group_size
#print(indices) #print(indices)
if s < len(indices): if s < len(indices):
random.shuffle(indices[s:]) random.shuffle(indices[s:])
return iter(indices) return iter(indices)
def __len__(self): def __len__(self):
...@@ -150,14 +174,19 @@ class WeightedRandomSampler(Sampler): ...@@ -150,14 +174,19 @@ class WeightedRandomSampler(Sampler):
def __init__(self, weights, num_samples, replacement): def __init__(self, weights, num_samples, replacement):
if not isinstance(num_samples, int) or num_samples <= 0: if not isinstance(num_samples, int) or num_samples <= 0:
raise ValueError("num_samples should be a positive integer " raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(num_samples)) "value, but got num_samples={}".format(
num_samples))
self.weights = np.array(weights, dtype=np.float64) self.weights = np.array(weights, dtype=np.float64)
self.num_samples = num_samples self.num_samples = num_samples
self.replacement = replacement self.replacement = replacement
def __iter__(self):
    """Return an iterator over ``num_samples`` indices drawn by weight.

    ``np.random.choice`` requires ``p`` to sum to 1 and raises ``ValueError``
    otherwise, so the stored weights are normalized before sampling. This
    lets callers pass unnormalized weights.

    Raises:
        ValueError: if the weights do not sum to a positive value.
    """
    weights = np.asarray(self.weights, dtype=np.float64)
    total = weights.sum()
    # `not total > 0` also rejects a NaN sum.
    if not total > 0:
        raise ValueError("weights should sum to a positive value, "
                         "but got sum={}".format(total))
    drawn = np.random.choice(
        len(weights),
        size=(self.num_samples, ),
        replace=self.replacement,
        p=weights / total)
    return iter(drawn.tolist())
def __len__(self): def __len__(self):
return self.num_samples return self.num_samples
...@@ -184,7 +213,7 @@ class DistributedSampler(Sampler): ...@@ -184,7 +213,7 @@ class DistributedSampler(Sampler):
# Subset samples for each trainer. # Subset samples for each trainer.
indices = indices[self.rank:self.total_size:self.num_trainers] indices = indices[self.rank:self.total_size:self.num_trainers]
assert len(indices) == self.num_samples assert len(indices) == self.num_samples
return iter(indices) return iter(indices)
...@@ -209,8 +238,7 @@ class BatchSampler(Sampler): ...@@ -209,8 +238,7 @@ class BatchSampler(Sampler):
def __init__(self, sampler, batch_size, drop_last): def __init__(self, sampler, batch_size, drop_last):
if not isinstance(sampler, Sampler): if not isinstance(sampler, Sampler):
raise ValueError("sampler should be an instance of " raise ValueError("sampler should be an instance of "
"Sampler, but got sampler={}" "Sampler, but got sampler={}".format(sampler))
.format(sampler))
if not isinstance(batch_size, int) or batch_size <= 0: if not isinstance(batch_size, int) or batch_size <= 0:
raise ValueError("batch_size should be a positive integer value, " raise ValueError("batch_size should be a positive integer value, "
"but got batch_size={}".format(batch_size)) "but got batch_size={}".format(batch_size))
......
...@@ -14,9 +14,4 @@ One of the reasons we choose to load data lazily (only load metadata before hand ...@@ -14,9 +14,4 @@ One of the reasons we choose to load data lazily (only load metadata before hand
For deep learning practice, we typically batch examples, so the dataset should come with a method to batch them. Assume a record is implemented as a tuple with several items. When an item is represented as a fixed-size array, batching is trivial: `np.stack` suffices. But for arrays with dynamic size, padding is needed. We decided to implement a batching method for each item; batching a record can then be implemented with these methods. For a dataset, a `_batch_examples` method should be implemented, but in most cases you can choose one from `batching.py`.
That is it! That is it!
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
import pandas as pd import pandas as pd
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path from pathlib import Path
import pandas as pd import pandas as pd
from ruamel.yaml import YAML from ruamel.yaml import YAML
...@@ -11,23 +25,25 @@ from parakeet.data.dataset import Dataset ...@@ -11,23 +25,25 @@ from parakeet.data.dataset import Dataset
from parakeet.data.datacargo import DataCargo from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, WavBatcher from parakeet.data.batch import TextIDBatcher, WavBatcher
class VCTK(Dataset): class VCTK(Dataset):
def __init__(self, root):
    """Record the dataset paths under ``root`` and load its metadata.

    Args:
        root: ``str`` or ``pathlib.Path`` of the dataset root. The ``txt``
            and ``wav48`` sub-directories are derived from it.
    """
    assert isinstance(root, (
        str, Path)), "root should be a string or Path object"
    self.root = root if isinstance(root, Path) else Path(root)
    self.text_root = self.root.joinpath("txt")
    self.wav_root = self.root.joinpath("wav48")

    # Build the metadata cache only when either cache file is missing.
    if not (self.root.joinpath("metadata.csv").exists() and
            self.root.joinpath("speaker_indices.yaml").exists()):
        self._prepare_metadata()
    self.speaker_indices, self.metadata = self._load_metadata()
def _load_metadata(self):
    """Load the cached speaker mapping and metadata table from ``self.root``.

    Returns:
        A ``(speaker_indices, metadata)`` tuple: the object parsed from
        ``speaker_indices.yaml`` and the DataFrame read from ``metadata.csv``.
    """
    yaml = YAML(typ='safe')
    speaker_indices = yaml.load(self.root.joinpath("speaker_indices.yaml"))
    # _prepare_metadata writes metadata.csv with index=False, so the column
    # names are on the first line. header=1 would drop that line and consume
    # the first record as the header; header=0 reads it correctly.
    metadata = pd.read_csv(
        self.root.joinpath("metadata.csv"), sep="|", quoting=3, header=0)
    return speaker_indices, metadata
def _prepare_metadata(self): def _prepare_metadata(self):
...@@ -41,15 +57,19 @@ class VCTK(Dataset): ...@@ -41,15 +57,19 @@ class VCTK(Dataset):
with io.open(str(text_file)) as f: with io.open(str(text_file)) as f:
transcription = f.read().strip() transcription = f.read().strip()
wav_file = text_file.with_suffix(".wav") wav_file = text_file.with_suffix(".wav")
metadata.append((wav_file.name, speaker_folder.name, transcription)) metadata.append(
metadata = pd.DataFrame.from_records(metadata, (wav_file.name, speaker_folder.name, transcription))
columns=["wave_file", "speaker", "text"]) metadata = pd.DataFrame.from_records(
metadata, columns=["wave_file", "speaker", "text"])
# save them # save them
yaml=YAML(typ='safe') yaml = YAML(typ='safe')
yaml.dump(speaker_to_index, self.root.joinpath("speaker_indices.yaml")) yaml.dump(speaker_to_index, self.root.joinpath("speaker_indices.yaml"))
metadata.to_csv(self.root.joinpath("metadata.csv"), metadata.to_csv(
sep="|", quoting=3, index=False) self.root.joinpath("metadata.csv"),
sep="|",
quoting=3,
index=False)
def _get_example(self, metadatum): def _get_example(self, metadatum):
wave_file, speaker, text = metadatum wave_file, speaker, text = metadatum
...@@ -77,5 +97,3 @@ class VCTK(Dataset): ...@@ -77,5 +97,3 @@ class VCTK(Dataset):
speaker_batch = np.array(speaker_batch) speaker_batch = np.array(speaker_batch)
phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch) phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
return wav_batch, speaker_batch, phoneme_batch return wav_batch, speaker_batch, phoneme_batch
\ No newline at end of file
# coding: utf-8 # coding: utf-8
"""Text processing frontend """Text processing frontend
All frontend module should have the following functions: All frontend module should have the following functions:
......
...@@ -32,6 +32,3 @@ def text_to_sequence(text, p=0.0): ...@@ -32,6 +32,3 @@ def text_to_sequence(text, p=0.0):
from ..text import text_to_sequence from ..text import text_to_sequence
text = text_to_sequence(text, ["english_cleaners"]) text = text_to_sequence(text, ["english_cleaners"])
return text return text
...@@ -12,6 +12,3 @@ def text_to_sequence(text, p=0.0): ...@@ -12,6 +12,3 @@ def text_to_sequence(text, p=0.0):
from ..text import text_to_sequence from ..text import text_to_sequence
text = text_to_sequence(text, ["basic_cleaners"]) text = text_to_sequence(text, ["basic_cleaners"])
return text return text
# coding: utf-8 # coding: utf-8
import MeCab import MeCab
import jaconv import jaconv
from random import random from random import random
...@@ -30,9 +29,9 @@ def _yomi(mecab_result): ...@@ -30,9 +29,9 @@ def _yomi(mecab_result):
def _mix_pronunciation(tokens, yomis, p):
    """Join ``tokens``, swapping each for its reading with probability ``p``.

    Args:
        tokens: sequence of strings.
        yomis: sequence indexed in parallel with ``tokens``; an entry of
            ``None`` means the token is always kept.
        p: threshold compared against ``random()``; the reading is used when
            ``random() < p``.

    Returns:
        The concatenated string.
    """
    # random() is only drawn when a reading exists (short-circuit `and`).
    return "".join(
        yomis[idx] if yomis[idx] is not None and random() < p else token
        for idx, token in enumerate(tokens))
def mix_pronunciation(text, p): def mix_pronunciation(text, p):
...@@ -59,8 +58,7 @@ def normalize_delimitor(text): ...@@ -59,8 +58,7 @@ def normalize_delimitor(text):
def text_to_sequence(text, p=0.0): def text_to_sequence(text, p=0.0):
for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", "(", ")", "(", ")"]:
"(", ")", "(", ")"]:
text = text.replace(c, "") text = text.replace(c, "")
text = text.replace("!", "!") text = text.replace("!", "!")
text = text.replace("?", "?") text = text.replace("?", "?")
......
# coding: utf-8 # coding: utf-8
from random import random from random import random
n_vocab = 0xffff n_vocab = 0xffff
...@@ -13,5 +12,6 @@ _tagger = None ...@@ -13,5 +12,6 @@ _tagger = None
def text_to_sequence(text, p=0.0):
    """Encode ``text`` as a list of code points followed by the EOS id.

    Args:
        text: string to encode; each character is mapped with ``ord``.
        p: not used by this function (presumably kept for signature parity
            with the other frontends -- confirm).

    Returns:
        A list of ints ending with ``_eos``.
    """
    return [ord(c) for c in text] + [_eos]  # EOS
def sequence_to_text(seq):
    """Decode an iterable of integer code points into a string via ``chr``."""
    return "".join(chr(n) for n in seq)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re import re
from . import cleaners from . import cleaners
from .symbols import symbols from .symbols import symbols
# Mappings from symbol to numeric ID and vice versa: # Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)} _symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)} _id_to_symbol = {i: s for i, s in enumerate(symbols)}
...@@ -32,7 +45,8 @@ def text_to_sequence(text, cleaner_names): ...@@ -32,7 +45,8 @@ def text_to_sequence(text, cleaner_names):
if not m: if not m:
sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
break break
sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) sequence += _symbols_to_sequence(
_clean_text(m.group(1), cleaner_names))
sequence += _arpabet_to_sequence(m.group(2)) sequence += _arpabet_to_sequence(m.group(2))
text = m.group(3) text = m.group(3)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
''' '''
Cleaners are transformations that run over the input text at both training and eval time. Cleaners are transformations that run over the input text at both training and eval time.
...@@ -14,31 +27,31 @@ import re ...@@ -14,31 +27,31 @@ import re
from unidecode import unidecode from unidecode import unidecode
from .numbers import normalize_numbers from .numbers import normalize_numbers
# Regular expression matching whitespace: # Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+') _whitespace_re = re.compile(r'\s+')
# List of (regular expression, replacement) pairs for abbreviations: # List of (regular expression, replacement) pairs for abbreviations:
# Each entry pairs a compiled, case-insensitive pattern matching the
# abbreviation followed by a period (at a word boundary) with its expansion.
# NOTE(review): the expansion strings are kept exactly as they were,
# including the spelling 'misess'.
_abbreviations = [
    (re.compile('\\b%s\\.' % abbr, re.IGNORECASE), expansion)
    for abbr, expansion in [
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    ]
]
def expand_abbreviations(text): def expand_abbreviations(text):
......
import re # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
# Symbol inventory: vowels appear bare and with the digit suffixes 0, 1, 2;
# consonants appear bare only.
valid_symbols = [
    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
    'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
    'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
    'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
    'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
    'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
    'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
    'Y', 'Z', 'ZH'
]
_valid_symbol_set = set(valid_symbols) _valid_symbol_set = set(valid_symbols)
...@@ -24,7 +38,10 @@ class CMUDict: ...@@ -24,7 +38,10 @@ class CMUDict:
else: else:
entries = _parse_cmudict(file_or_path) entries = _parse_cmudict(file_or_path)
if not keep_ambiguous: if not keep_ambiguous:
entries = {word: pron for word, pron in entries.items() if len(pron) == 1} entries = {
word: pron
for word, pron in entries.items() if len(pron) == 1
}
self._entries = entries self._entries = entries
def __len__(self): def __len__(self):
......
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
import inflect import inflect
import re import re
_inflect = inflect.engine() _inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
...@@ -56,7 +55,8 @@ def _expand_number(m): ...@@ -56,7 +55,8 @@ def _expand_number(m):
elif num % 100 == 0: elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + ' hundred' return _inflect.number_to_words(num // 100) + ' hundred'
else: else:
return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') return _inflect.number_to_words(
num, andword='', zero='oh', group=2).replace(', ', ' ')
else: else:
return _inflect.number_to_words(num, andword='') return _inflect.number_to_words(num, andword='')
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
''' '''
Defines the set of symbols used in text input to the model. Defines the set of symbols used in text input to the model.
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.models.deepvoice3.encoder import Encoder, ConvSpec from parakeet.models.deepvoice3.encoder import Encoder, ConvSpec
from parakeet.models.deepvoice3.decoder import Decoder, WindowRange from parakeet.models.deepvoice3.decoder import Decoder, WindowRange
from parakeet.models.deepvoice3.converter import Converter from parakeet.models.deepvoice3.converter import Converter
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np import numpy as np
from collections import namedtuple from collections import namedtuple
from paddle import fluid from paddle import fluid
...@@ -19,23 +33,19 @@ class Attention(dg.Layer): ...@@ -19,23 +33,19 @@ class Attention(dg.Layer):
value_projection=True): value_projection=True):
super(Attention, self).__init__() super(Attention, self).__init__()
std = np.sqrt(1 / query_dim) std = np.sqrt(1 / query_dim)
self.query_proj = Linear(query_dim, self.query_proj = Linear(
embed_dim, query_dim, embed_dim, param_attr=I.Normal(scale=std))
param_attr=I.Normal(scale=std))
if key_projection: if key_projection:
std = np.sqrt(1 / embed_dim) std = np.sqrt(1 / embed_dim)
self.key_proj = Linear(embed_dim, self.key_proj = Linear(
embed_dim, embed_dim, embed_dim, param_attr=I.Normal(scale=std))
param_attr=I.Normal(scale=std))
if value_projection: if value_projection:
std = np.sqrt(1 / embed_dim) std = np.sqrt(1 / embed_dim)
self.value_proj = Linear(embed_dim, self.value_proj = Linear(
embed_dim, embed_dim, embed_dim, param_attr=I.Normal(scale=std))
param_attr=I.Normal(scale=std))
std = np.sqrt(1 / embed_dim) std = np.sqrt(1 / embed_dim)
self.out_proj = Linear(embed_dim, self.out_proj = Linear(
query_dim, embed_dim, query_dim, param_attr=I.Normal(scale=std))
param_attr=I.Normal(scale=std))
self.key_projection = key_projection self.key_projection = key_projection
self.value_projection = value_projection self.value_projection = value_projection
...@@ -102,9 +112,8 @@ class Attention(dg.Layer): ...@@ -102,9 +112,8 @@ class Attention(dg.Layer):
x = F.softmax(x) x = F.softmax(x)
attn_scores = x attn_scores = x
x = F.dropout(x, x = F.dropout(
self.dropout, x, self.dropout, dropout_implementation="upscale_in_train")
dropout_implementation="upscale_in_train")
x = F.matmul(x, values) x = F.matmul(x, values)
encoder_length = keys.shape[1] encoder_length = keys.shape[1]
# CAUTION: is it wrong? let it be now # CAUTION: is it wrong? let it be now
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np import numpy as np
from paddle import fluid from paddle import fluid
...@@ -15,6 +29,7 @@ class Conv1DGLU(dg.Layer): ...@@ -15,6 +29,7 @@ class Conv1DGLU(dg.Layer):
has residual connection from the input x, and scale the output by has residual connection from the input x, and scale the output by
np.sqrt(0.5). np.sqrt(0.5).
""" """
def __init__(self, def __init__(self,
n_speakers, n_speakers,
speaker_dim, speaker_dim,
...@@ -50,20 +65,20 @@ class Conv1DGLU(dg.Layer): ...@@ -50,20 +65,20 @@ class Conv1DGLU(dg.Layer):
), "this block uses residual connection"\ ), "this block uses residual connection"\
"the input_channes should equals num_filters" "the input_channes should equals num_filters"
std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels)) std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
self.conv = Conv1DCell(in_channels, self.conv = Conv1DCell(
2 * num_filters, in_channels,
filter_size, 2 * num_filters,
dilation, filter_size,
causal, dilation,
param_attr=I.Normal(scale=std)) causal,
param_attr=I.Normal(scale=std))
if n_speakers > 1: if n_speakers > 1:
assert (speaker_dim is not None assert (speaker_dim is not None
), "speaker embed should not be null in multi-speaker case" ), "speaker embed should not be null in multi-speaker case"
std = np.sqrt(1 / speaker_dim) std = np.sqrt(1 / speaker_dim)
self.fc = Linear(speaker_dim, self.fc = Linear(
num_filters, speaker_dim, num_filters, param_attr=I.Normal(scale=std))
param_attr=I.Normal(scale=std))
def forward(self, x, speaker_embed=None): def forward(self, x, speaker_embed=None):
""" """
...@@ -82,9 +97,8 @@ class Conv1DGLU(dg.Layer): ...@@ -82,9 +97,8 @@ class Conv1DGLU(dg.Layer):
C_out means the output channels of Conv1DGLU. C_out means the output channels of Conv1DGLU.
""" """
residual = x residual = x
x = F.dropout(x, x = F.dropout(
self.dropout, x, self.dropout, dropout_implementation="upscale_in_train")
dropout_implementation="upscale_in_train")
x = self.conv(x) x = self.conv(x)
content, gate = F.split(x, num_or_sections=2, dim=1) content, gate = F.split(x, num_or_sections=2, dim=1)
...@@ -118,9 +132,8 @@ class Conv1DGLU(dg.Layer): ...@@ -118,9 +132,8 @@ class Conv1DGLU(dg.Layer):
C_out means the output channels of Conv1DGLU. C_out means the output channels of Conv1DGLU.
""" """
residual = x_t residual = x_t
x_t = F.dropout(x_t, x_t = F.dropout(
self.dropout, x_t, self.dropout, dropout_implementation="upscale_in_train")
dropout_implementation="upscale_in_train")
x_t = self.conv.add_input(x_t) x_t = self.conv.add_input(x_t)
content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1) content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np import numpy as np
from itertools import chain from itertools import chain
...@@ -19,44 +33,45 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout): ...@@ -19,44 +33,45 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
2, 2,
stride=2, stride=2,
param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))), param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))),
Conv1DGLU(n_speakers, Conv1DGLU(
speaker_dim, n_speakers,
target_channels, speaker_dim,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout),
Conv1DGLU(n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout),
Conv1DTranspose(
target_channels, target_channels,
target_channels, target_channels,
2, 3,
stride=2, dilation=1,
param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))), std_mul=1.,
Conv1DGLU(n_speakers, dropout=dropout), Conv1DGLU(
speaker_dim, n_speakers,
target_channels, speaker_dim,
target_channels, target_channels,
3, target_channels,
dilation=1, 3,
std_mul=1., dilation=3,
dropout=dropout), std_mul=4.,
Conv1DGLU(n_speakers, dropout=dropout), Conv1DTranspose(
speaker_dim, target_channels,
target_channels, target_channels,
target_channels, 2,
3, stride=2,
dilation=3, param_attr=I.Normal(scale=np.sqrt(
std_mul=4., 4. / (2 * target_channels)))), Conv1DGLU(
dropout=dropout) n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=1,
std_mul=1.,
dropout=dropout), Conv1DGLU(
n_speakers,
speaker_dim,
target_channels,
target_channels,
3,
dilation=3,
std_mul=4.,
dropout=dropout)
] ]
return upsampling_convolutions return upsampling_convolutions
...@@ -69,36 +84,38 @@ def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout): ...@@ -69,36 +84,38 @@ def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
2, 2,
stride=2, stride=2,
param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels)))), param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels)))),
Conv1DGLU(n_speakers, Conv1DGLU(
speaker_dim, n_speakers,
target_channels, speaker_dim,
target_channels, target_channels,
3, target_channels,
dilation=1, 3,
std_mul=1., dilation=1,
dropout=dropout), std_mul=1.,
Conv1DGLU(n_speakers, dropout=dropout), Conv1DGLU(
speaker_dim, n_speakers,
target_channels, speaker_dim,
target_channels, target_channels,
3, target_channels,
dilation=3, 3,
std_mul=4., dilation=3,
dropout=dropout) std_mul=4.,
dropout=dropout)
] ]
return upsampling_convolutions return upsampling_convolutions
def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout): def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout):
upsampling_convolutions = [ upsampling_convolutions = [
Conv1DGLU(n_speakers, Conv1DGLU(
speaker_dim, n_speakers,
target_channels, speaker_dim,
target_channels, target_channels,
3, target_channels,
dilation=3, 3,
std_mul=4., dilation=3,
dropout=dropout) std_mul=4.,
dropout=dropout)
] ]
return upsampling_convolutions return upsampling_convolutions
...@@ -108,6 +125,7 @@ class Converter(dg.Layer): ...@@ -108,6 +125,7 @@ class Converter(dg.Layer):
Vocoder that transforms mel spectrogram (or decoder hidden states) Vocoder that transforms mel spectrogram (or decoder hidden states)
to waveform. to waveform.
""" """
def __init__(self, def __init__(self,
n_speakers, n_speakers,
speaker_dim, speaker_dim,
...@@ -161,33 +179,36 @@ class Converter(dg.Layer): ...@@ -161,33 +179,36 @@ class Converter(dg.Layer):
std = np.sqrt(std_mul / in_channels) std = np.sqrt(std_mul / in_channels)
# CAUTION: relu # CAUTION: relu
self.convolutions.append( self.convolutions.append(
Conv1D(in_channels, Conv1D(
out_channels, in_channels,
1, out_channels,
act="relu", 1,
param_attr=I.Normal(scale=std))) act="relu",
param_attr=I.Normal(scale=std)))
in_channels = out_channels in_channels = out_channels
std_mul = 2.0 std_mul = 2.0
self.convolutions.append( self.convolutions.append(
Conv1DGLU(n_speakers, Conv1DGLU(
speaker_dim, n_speakers,
in_channels, speaker_dim,
out_channels, in_channels,
filter_size, out_channels,
dilation=dilation, filter_size,
std_mul=std_mul, dilation=dilation,
dropout=dropout)) std_mul=std_mul,
dropout=dropout))
in_channels = out_channels in_channels = out_channels
std_mul = 4.0 std_mul = 4.0
# final conv proj, channel transformed to linear dim # final conv proj, channel transformed to linear dim
std = np.sqrt(std_mul * (1 - dropout) / in_channels) std = np.sqrt(std_mul * (1 - dropout) / in_channels)
# CAUTION: sigmoid # CAUTION: sigmoid
self.last_conv_proj = Conv1D(in_channels, self.last_conv_proj = Conv1D(
linear_dim, in_channels,
1, linear_dim,
act="sigmoid", 1,
param_attr=I.Normal(scale=std)) act="sigmoid",
param_attr=I.Normal(scale=std))
def forward(self, x, speaker_embed=None): def forward(self, x, speaker_embed=None):
""" """
...@@ -229,4 +250,4 @@ class Converter(dg.Layer): ...@@ -229,4 +250,4 @@ class Converter(dg.Layer):
out = self.last_conv_proj(x) out = self.last_conv_proj(x)
out = F.transpose(out, [0, 2, 1]) out = F.transpose(out, [0, 2, 1])
return out return out
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np import numpy as np
import paddle.fluid.layers as F import paddle.fluid.layers as F
import paddle.fluid.initializer as I import paddle.fluid.initializer as I
...@@ -80,25 +94,25 @@ def unfold_adjacent_frames(folded_frames, r): ...@@ -80,25 +94,25 @@ def unfold_adjacent_frames(folded_frames, r):
class Decoder(dg.Layer): class Decoder(dg.Layer):
def __init__( def __init__(
self, self,
n_speakers, n_speakers,
speaker_dim, speaker_dim,
embed_dim, embed_dim,
mel_dim, mel_dim,
r=1, r=1,
max_positions=512, max_positions=512,
padding_idx=None, # remove it! padding_idx=None, # remove it!
preattention=(ConvSpec(128, 5, 1), ) * 4, preattention=(ConvSpec(128, 5, 1), ) * 4,
convolutions=(ConvSpec(128, 5, 1), ) * 4, convolutions=(ConvSpec(128, 5, 1), ) * 4,
attention=True, attention=True,
dropout=0.0, dropout=0.0,
use_memory_mask=False, use_memory_mask=False,
force_monotonic_attention=False, force_monotonic_attention=False,
query_position_rate=1.0, query_position_rate=1.0,
key_position_rate=1.0, key_position_rate=1.0,
window_range=WindowRange(-1, 3), window_range=WindowRange(-1, 3),
key_projection=True, key_projection=True,
value_projection=True): value_projection=True):
super(Decoder, self).__init__() super(Decoder, self).__init__()
self.dropout = dropout self.dropout = dropout
...@@ -111,23 +125,17 @@ class Decoder(dg.Layer): ...@@ -111,23 +125,17 @@ class Decoder(dg.Layer):
conv_channels = convolutions[0].out_channels conv_channels = convolutions[0].out_channels
# only when padding idx is 0 can we easily handle it # only when padding idx is 0 can we easily handle it
self.embed_keys_positions = PositionEmbedding(max_positions, self.embed_keys_positions = PositionEmbedding(
embed_dim, max_positions, embed_dim, padding_idx=0)
padding_idx=0) self.embed_query_positions = PositionEmbedding(
self.embed_query_positions = PositionEmbedding(max_positions, max_positions, conv_channels, padding_idx=0)
conv_channels,
padding_idx=0)
if n_speakers > 1: if n_speakers > 1:
std = np.sqrt((1 - dropout) / speaker_dim) std = np.sqrt((1 - dropout) / speaker_dim)
self.speaker_proj1 = Linear(speaker_dim, self.speaker_proj1 = Linear(
1, speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))
act="sigmoid", self.speaker_proj2 = Linear(
param_attr=I.Normal(scale=std)) speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))
self.speaker_proj2 = Linear(speaker_dim,
1,
act="sigmoid",
param_attr=I.Normal(scale=std))
# prenet # prenet
self.prenet = dg.LayerList() self.prenet = dg.LayerList()
...@@ -138,24 +146,26 @@ class Decoder(dg.Layer): ...@@ -138,24 +146,26 @@ class Decoder(dg.Layer):
# conv1d & relu # conv1d & relu
std = np.sqrt(std_mul / in_channels) std = np.sqrt(std_mul / in_channels)
self.prenet.append( self.prenet.append(
Conv1D(in_channels, Conv1D(
out_channels, in_channels,
1, out_channels,
act="relu", 1,
param_attr=I.Normal(scale=std))) act="relu",
param_attr=I.Normal(scale=std)))
in_channels = out_channels in_channels = out_channels
std_mul = 2.0 std_mul = 2.0
self.prenet.append( self.prenet.append(
Conv1DGLU(n_speakers, Conv1DGLU(
speaker_dim, n_speakers,
in_channels, speaker_dim,
out_channels, in_channels,
filter_size, out_channels,
dilation, filter_size,
std_mul, dilation,
dropout, std_mul,
causal=True, dropout,
residual=True)) causal=True,
residual=True))
in_channels = out_channels in_channels = out_channels
std_mul = 4.0 std_mul = 4.0
...@@ -184,16 +194,17 @@ class Decoder(dg.Layer): ...@@ -184,16 +194,17 @@ class Decoder(dg.Layer):
assert ( assert (
in_channels == out_channels in_channels == out_channels
), "the stack of convolution & attention does not change channels" ), "the stack of convolution & attention does not change channels"
conv_layer = Conv1DGLU(n_speakers, conv_layer = Conv1DGLU(
speaker_dim, n_speakers,
in_channels, speaker_dim,
out_channels, in_channels,
filter_size, out_channels,
dilation, filter_size,
std_mul, dilation,
dropout, std_mul,
causal=True, dropout,
residual=False) causal=True,
residual=False)
attn_layer = Attention( attn_layer = Attention(
out_channels, out_channels,
embed_dim, embed_dim,
...@@ -211,10 +222,8 @@ class Decoder(dg.Layer): ...@@ -211,10 +222,8 @@ class Decoder(dg.Layer):
# 1 * 1 conv to transform channels # 1 * 1 conv to transform channels
std = np.sqrt(std_mul * (1 - dropout) / in_channels) std = np.sqrt(std_mul * (1 - dropout) / in_channels)
self.last_conv = Conv1D(in_channels, self.last_conv = Conv1D(
mel_dim * r, in_channels, mel_dim * r, 1, param_attr=I.Normal(scale=std))
1,
param_attr=I.Normal(scale=std))
# mel (before sigmoid) to done hat # mel (before sigmoid) to done hat
std = np.sqrt(1 / in_channels) std = np.sqrt(1 / in_channels)
...@@ -308,9 +317,8 @@ class Decoder(dg.Layer): ...@@ -308,9 +317,8 @@ class Decoder(dg.Layer):
# (B, C, T) # (B, C, T)
frames = F.transpose(frames, [0, 2, 1]) frames = F.transpose(frames, [0, 2, 1])
x = frames x = frames
x = F.dropout(x, x = F.dropout(
self.dropout, x, self.dropout, dropout_implementation="upscale_in_train")
dropout_implementation="upscale_in_train")
# Prenet # Prenet
for layer in self.prenet: for layer in self.prenet:
if isinstance(layer, Conv1DGLU): if isinstance(layer, Conv1DGLU):
...@@ -408,14 +416,13 @@ class Decoder(dg.Layer): ...@@ -408,14 +416,13 @@ class Decoder(dg.Layer):
test_inputs = fold_adjacent_frames(test_inputs, self.r) test_inputs = fold_adjacent_frames(test_inputs, self.r)
test_inputs = F.transpose(test_inputs, [0, 2, 1]) test_inputs = F.transpose(test_inputs, [0, 2, 1])
initial_input = F.zeros((batch_size, self.mel_dim * self.r, 1), initial_input = F.zeros(
dtype=keys.dtype) (batch_size, self.mel_dim * self.r, 1), dtype=keys.dtype)
t = 0 # decoder time step t = 0 # decoder time step
while True: while True:
frame_pos = F.fill_constant((batch_size, 1), frame_pos = F.fill_constant(
value=t + 1, (batch_size, 1), value=t + 1, dtype="int64")
dtype="int64")
w = self.query_position_rate w = self.query_position_rate
if self.n_speakers > 1: if self.n_speakers > 1:
w = w * F.squeeze(self.speaker_proj2(speaker_embed), [-1]) w = w * F.squeeze(self.speaker_proj2(speaker_embed), [-1])
...@@ -433,9 +440,8 @@ class Decoder(dg.Layer): ...@@ -433,9 +440,8 @@ class Decoder(dg.Layer):
current_input = initial_input current_input = initial_input
x_t = current_input x_t = current_input
x_t = F.dropout(x_t, x_t = F.dropout(
self.dropout, x_t, self.dropout, dropout_implementation="upscale_in_train")
dropout_implementation="upscale_in_train")
# Prenet # Prenet
for layer in self.prenet: for layer in self.prenet:
...@@ -453,15 +459,15 @@ class Decoder(dg.Layer): ...@@ -453,15 +459,15 @@ class Decoder(dg.Layer):
x_t = F.transpose(x_t, [0, 2, 1]) x_t = F.transpose(x_t, [0, 2, 1])
if frame_pos_embed is not None: if frame_pos_embed is not None:
x_t += frame_pos_embed x_t += frame_pos_embed
x_t, attn_scores = attn( x_t, attn_scores = attn(x_t, (keys, values), mask,
x_t, (keys, values), mask, last_attended[i]
last_attended[i] if test_inputs is None else None) if test_inputs is None else None)
x_t = F.transpose(x_t, [0, 2, 1]) x_t = F.transpose(x_t, [0, 2, 1])
step_attn_scores.append(attn_scores) #(B, T_dec=1, T_enc) step_attn_scores.append(attn_scores) #(B, T_dec=1, T_enc)
# update last attended when necessary # update last attended when necessary
if self.force_monotonic_attention[i]: if self.force_monotonic_attention[i]:
last_attended[i] = np.argmax(attn_scores.numpy(), last_attended[i] = np.argmax(
axis=-1)[0][0] attn_scores.numpy(), axis=-1)[0][0]
x_t = F.scale(residual + x_t, np.sqrt(0.5)) x_t = F.scale(residual + x_t, np.sqrt(0.5))
if len(step_attn_scores): if len(step_attn_scores):
# (B, 1, T_enc) again # (B, 1, T_enc) again
...@@ -485,8 +491,8 @@ class Decoder(dg.Layer): ...@@ -485,8 +491,8 @@ class Decoder(dg.Layer):
t += 1 t += 1
if test_inputs is None: if test_inputs is None:
if F.reduce_min(done_t).numpy( if F.reduce_min(done_t).numpy()[
)[0] > 0.5 and t > self.min_decoder_steps: 0] > 0.5 and t > self.min_decoder_steps:
break break
elif t > self.max_decoder_steps: elif t > self.max_decoder_steps:
break break
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np import numpy as np
from collections import namedtuple from collections import namedtuple
...@@ -33,14 +47,16 @@ class Encoder(dg.Layer): ...@@ -33,14 +47,16 @@ class Encoder(dg.Layer):
self.dropout = dropout self.dropout = dropout
if n_speakers > 1: if n_speakers > 1:
std = np.sqrt((1 - dropout) / speaker_dim) std = np.sqrt((1 - dropout) / speaker_dim)
self.sp_proj1 = Linear(speaker_dim, self.sp_proj1 = Linear(
embed_dim, speaker_dim,
act="softsign", embed_dim,
param_attr=I.Normal(scale=std)) act="softsign",
self.sp_proj2 = Linear(speaker_dim, param_attr=I.Normal(scale=std))
embed_dim, self.sp_proj2 = Linear(
act="softsign", speaker_dim,
param_attr=I.Normal(scale=std)) embed_dim,
act="softsign",
param_attr=I.Normal(scale=std))
self.n_speakers = n_speakers self.n_speakers = n_speakers
self.convolutions = dg.LayerList() self.convolutions = dg.LayerList()
...@@ -51,31 +67,34 @@ class Encoder(dg.Layer): ...@@ -51,31 +67,34 @@ class Encoder(dg.Layer):
if in_channels != out_channels: if in_channels != out_channels:
std = np.sqrt(std_mul / in_channels) std = np.sqrt(std_mul / in_channels)
self.convolutions.append( self.convolutions.append(
Conv1D(in_channels, Conv1D(
out_channels, in_channels,
1, out_channels,
act="relu", 1,
param_attr=I.Normal(scale=std))) act="relu",
param_attr=I.Normal(scale=std)))
in_channels = out_channels in_channels = out_channels
std_mul = 2.0 std_mul = 2.0
self.convolutions.append( self.convolutions.append(
Conv1DGLU(n_speakers, Conv1DGLU(
speaker_dim, n_speakers,
in_channels, speaker_dim,
out_channels, in_channels,
filter_size, out_channels,
dilation, filter_size,
std_mul, dilation,
dropout, std_mul,
causal=False, dropout,
residual=True)) causal=False,
residual=True))
in_channels = out_channels in_channels = out_channels
std_mul = 4.0 std_mul = 4.0
std = np.sqrt(std_mul * (1 - dropout) / in_channels) std = np.sqrt(std_mul * (1 - dropout) / in_channels)
self.convolutions.append( self.convolutions.append(
Conv1D(in_channels, embed_dim, 1, param_attr=I.Normal(scale=std))) Conv1D(
in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
def forward(self, x, speaker_embed=None): def forward(self, x, speaker_embed=None):
""" """
...@@ -96,9 +115,8 @@ class Encoder(dg.Layer): ...@@ -96,9 +115,8 @@ class Encoder(dg.Layer):
representation for values. representation for values.
""" """
x = self.embed(x) x = self.embed(x)
x = F.dropout(x, x = F.dropout(
self.dropout, x, self.dropout, dropout_implementation="upscale_in_train")
dropout_implementation="upscale_in_train")
x = F.transpose(x, [0, 2, 1]) x = F.transpose(x, [0, 2, 1])
if self.n_speakers > 1 and speaker_embed is not None: if self.n_speakers > 1 and speaker_embed is not None:
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np import numpy as np
from numba import jit from numba import jit
...@@ -31,9 +45,7 @@ def guided_attention(N, max_N, T, max_T, g): ...@@ -31,9 +45,7 @@ def guided_attention(N, max_N, T, max_T, g):
return W return W
def guided_attentions(encoder_lengths, def guided_attentions(encoder_lengths, decoder_lengths, max_decoder_len,
decoder_lengths,
max_decoder_len,
g=0.2): g=0.2):
B = len(encoder_lengths) B = len(encoder_lengths)
max_input_len = encoder_lengths.max() max_input_len = encoder_lengths.max()
...@@ -93,9 +105,8 @@ class TTSLoss(object): ...@@ -93,9 +105,8 @@ class TTSLoss(object):
def binary_divergence(self, prediction, target, mask): def binary_divergence(self, prediction, target, mask):
flattened_prediction = F.reshape(prediction, [-1, 1]) flattened_prediction = F.reshape(prediction, [-1, 1])
flattened_target = F.reshape(target, [-1, 1]) flattened_target = F.reshape(target, [-1, 1])
flattened_loss = F.log_loss(flattened_prediction, flattened_loss = F.log_loss(
flattened_target, flattened_prediction, flattened_target, epsilon=1e-8)
epsilon=1e-8)
bin_div = fluid.layers.reshape(flattened_loss, prediction.shape) bin_div = fluid.layers.reshape(flattened_loss, prediction.shape)
w = self.masked_weight w = self.masked_weight
...@@ -163,23 +174,20 @@ class TTSLoss(object): ...@@ -163,23 +174,20 @@ class TTSLoss(object):
max_mel_steps = max_frames // self.downsample_factor max_mel_steps = max_frames // self.downsample_factor
max_decoder_steps = max_mel_steps // self.r max_decoder_steps = max_mel_steps // self.r
decoder_mask = F.sequence_mask(n_frames // self.downsample_factor // decoder_mask = F.sequence_mask(
self.r, n_frames // self.downsample_factor // self.r,
max_decoder_steps, max_decoder_steps,
dtype="float32") dtype="float32")
mel_mask = F.sequence_mask(n_frames // self.downsample_factor, mel_mask = F.sequence_mask(
max_mel_steps, n_frames // self.downsample_factor, max_mel_steps, dtype="float32")
dtype="float32")
lin_mask = F.sequence_mask(n_frames, max_frames, dtype="float32") lin_mask = F.sequence_mask(n_frames, max_frames, dtype="float32")
if compute_lin_loss: if compute_lin_loss:
lin_hyp = lin_hyp[:, :-self.time_shift, :] lin_hyp = lin_hyp[:, :-self.time_shift, :]
lin_ref = lin_ref[:, self.time_shift:, :] lin_ref = lin_ref[:, self.time_shift:, :]
lin_mask = lin_mask[:, self.time_shift:, :] lin_mask = lin_mask[:, self.time_shift:, :]
lin_l1_loss = self.l1_loss(lin_hyp, lin_l1_loss = self.l1_loss(
lin_ref, lin_hyp, lin_ref, lin_mask, priority_bin=self.priority_bin)
lin_mask,
priority_bin=self.priority_bin)
lin_bce_loss = self.binary_divergence(lin_hyp, lin_ref, lin_mask) lin_bce_loss = self.binary_divergence(lin_hyp, lin_ref, lin_mask)
lin_loss = self.binary_divergence_weight * lin_bce_loss \ lin_loss = self.binary_divergence_weight * lin_bce_loss \
+ (1 - self.binary_divergence_weight) * lin_l1_loss + (1 - self.binary_divergence_weight) * lin_l1_loss
...@@ -197,9 +205,10 @@ class TTSLoss(object): ...@@ -197,9 +205,10 @@ class TTSLoss(object):
total_loss += mel_loss total_loss += mel_loss
if compute_attn_loss: if compute_attn_loss:
attn_loss = self.attention_loss( attn_loss = self.attention_loss(attn_hyp,
attn_hyp, input_lengths.numpy(), input_lengths.numpy(),
n_frames.numpy() // (self.downsample_factor * self.r)) n_frames.numpy() //
(self.downsample_factor * self.r))
total_loss += attn_loss total_loss += attn_loss
if compute_done_loss: if compute_done_loss:
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np import numpy as np
import paddle.fluid.layers as F import paddle.fluid.layers as F
...@@ -29,9 +43,9 @@ class DeepVoice3(dg.Layer): ...@@ -29,9 +43,9 @@ class DeepVoice3(dg.Layer):
mel_outputs, alignments, done, decoder_states = self.decoder( mel_outputs, alignments, done, decoder_states = self.decoder(
(keys, values), valid_lengths, mel_inputs, text_positions, (keys, values), valid_lengths, mel_inputs, text_positions,
frame_positions, speaker_embed) frame_positions, speaker_embed)
linear_outputs = self.converter( linear_outputs = self.converter(decoder_states
decoder_states if self.use_decoder_states else mel_outputs, if self.use_decoder_states else
speaker_embed) mel_outputs, speaker_embed)
return mel_outputs, linear_outputs, alignments, done return mel_outputs, linear_outputs, alignments, done
def transduce(self, text_sequences, text_positions, speaker_indices=None): def transduce(self, text_sequences, text_positions, speaker_indices=None):
...@@ -43,7 +57,7 @@ class DeepVoice3(dg.Layer): ...@@ -43,7 +57,7 @@ class DeepVoice3(dg.Layer):
keys, values = self.encoder(text_sequences, speaker_embed) keys, values = self.encoder(text_sequences, speaker_embed)
mel_outputs, alignments, done, decoder_states = self.decoder.decode( mel_outputs, alignments, done, decoder_states = self.decoder.decode(
(keys, values), text_positions, speaker_embed) (keys, values), text_positions, speaker_embed)
linear_outputs = self.converter( linear_outputs = self.converter(decoder_states
decoder_states if self.use_decoder_states else mel_outputs, if self.use_decoder_states else
speaker_embed) mel_outputs, speaker_embed)
return mel_outputs, linear_outputs, alignments, done return mel_outputs, linear_outputs, alignments, done
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np import numpy as np
from paddle import fluid from paddle import fluid
import paddle.fluid.layers as F import paddle.fluid.layers as F
...@@ -95,10 +109,11 @@ class PositionEmbedding(dg.Layer): ...@@ -95,10 +109,11 @@ class PositionEmbedding(dg.Layer):
speaker_position_rate) # (B, V, C) speaker_position_rate) # (B, V, C)
# make indices for gather_nd # make indices for gather_nd
batch_id = F.expand( batch_id = F.expand(
F.unsqueeze(F.range(0, batch_size, 1, dtype="int64"), [1]), F.unsqueeze(
[1, time_steps]) F.range(
0, batch_size, 1, dtype="int64"), [1]), [1, time_steps])
# (B, T, 2) # (B, T, 2)
gather_nd_id = F.stack([batch_id, indices], -1) gather_nd_id = F.stack([batch_id, indices], -1)
out = F.gather_nd(weight, gather_nd_id) out = F.gather_nd(weight, gather_nd_id)
return out return out
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import * from parakeet.models.transformer_tts.utils import *
from parakeet.models.fastspeech.fft_block import FFTBlock from parakeet.models.fastspeech.fft_block import FFTBlock
class Decoder(dg.Layer): class Decoder(dg.Layer):
def __init__(self, def __init__(self,
len_max_seq, len_max_seq,
...@@ -18,16 +32,29 @@ class Decoder(dg.Layer): ...@@ -18,16 +32,29 @@ class Decoder(dg.Layer):
super(Decoder, self).__init__() super(Decoder, self).__init__()
n_position = len_max_seq + 1 n_position = len_max_seq + 1
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0) self.pos_inp = get_sinusoid_encoding_table(
self.position_enc = dg.Embedding(size=[n_position, d_model], n_position, d_model, padding_idx=0)
padding_idx=0, self.position_enc = dg.Embedding(
param_attr=fluid.ParamAttr( size=[n_position, d_model],
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), padding_idx=0,
trainable=False)) param_attr=fluid.ParamAttr(
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] initializer=fluid.initializer.NumpyArrayInitializer(
self.pos_inp),
trainable=False))
self.layer_stack = [
FFTBlock(
d_model,
d_inner,
n_head,
d_k,
d_v,
fft_conv1d_kernel,
fft_conv1d_padding,
dropout=dropout) for _ in range(n_layers)
]
for i, layer in enumerate(self.layer_stack): for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer) self.add_sublayer('fft_{}'.format(i), layer)
def forward(self, enc_seq, enc_pos): def forward(self, enc_seq, enc_pos):
""" """
Decoder layer of FastSpeech. Decoder layer of FastSpeech.
...@@ -57,4 +84,4 @@ class Decoder(dg.Layer): ...@@ -57,4 +84,4 @@ class Decoder(dg.Layer):
slf_attn_mask=slf_attn_mask) slf_attn_mask=slf_attn_mask)
dec_slf_attn_list += [dec_slf_attn] dec_slf_attn_list += [dec_slf_attn]
return dec_output, dec_slf_attn_list return dec_output, dec_slf_attn_list
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import * from parakeet.models.transformer_tts.utils import *
from parakeet.models.fastspeech.fft_block import FFTBlock from parakeet.models.fastspeech.fft_block import FFTBlock
class Encoder(dg.Layer): class Encoder(dg.Layer):
def __init__(self, def __init__(self,
n_src_vocab, n_src_vocab,
...@@ -19,14 +33,28 @@ class Encoder(dg.Layer): ...@@ -19,14 +33,28 @@ class Encoder(dg.Layer):
super(Encoder, self).__init__() super(Encoder, self).__init__()
n_position = len_max_seq + 1 n_position = len_max_seq + 1
self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0) self.src_word_emb = dg.Embedding(
self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0) size=[n_src_vocab, d_model], padding_idx=0)
self.position_enc = dg.Embedding(size=[n_position, d_model], self.pos_inp = get_sinusoid_encoding_table(
padding_idx=0, n_position, d_model, padding_idx=0)
param_attr=fluid.ParamAttr( self.position_enc = dg.Embedding(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), size=[n_position, d_model],
trainable=False)) padding_idx=0,
self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
self.pos_inp),
trainable=False))
self.layer_stack = [
FFTBlock(
d_model,
d_inner,
n_head,
d_k,
d_v,
fft_conv1d_kernel,
fft_conv1d_padding,
dropout=dropout) for _ in range(n_layers)
]
for i, layer in enumerate(self.layer_stack): for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer) self.add_sublayer('fft_{}'.format(i), layer)
...@@ -52,7 +80,8 @@ class Encoder(dg.Layer): ...@@ -52,7 +80,8 @@ class Encoder(dg.Layer):
non_pad_mask = get_non_pad_mask(character) non_pad_mask = get_non_pad_mask(character)
# -- Forward # -- Forward
enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C) enc_output = self.src_word_emb(character) + self.position_enc(
text_pos) #(N, T, C)
for enc_layer in self.layer_stack: for enc_layer in self.layer_stack:
enc_output, enc_slf_attn = enc_layer( enc_output, enc_slf_attn = enc_layer(
...@@ -60,5 +89,5 @@ class Encoder(dg.Layer): ...@@ -60,5 +89,5 @@ class Encoder(dg.Layer):
non_pad_mask=non_pad_mask, non_pad_mask=non_pad_mask,
slf_attn_mask=slf_attn_mask) slf_attn_mask=slf_attn_mask)
enc_slf_attn_list += [enc_slf_attn] enc_slf_attn_list += [enc_slf_attn]
return enc_output, non_pad_mask, enc_slf_attn_list return enc_output, non_pad_mask, enc_slf_attn_list
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math import math
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
...@@ -7,54 +20,67 @@ from parakeet.models.fastspeech.length_regulator import LengthRegulator ...@@ -7,54 +20,67 @@ from parakeet.models.fastspeech.length_regulator import LengthRegulator
from parakeet.models.fastspeech.encoder import Encoder from parakeet.models.fastspeech.encoder import Encoder
from parakeet.models.fastspeech.decoder import Decoder from parakeet.models.fastspeech.decoder import Decoder
class FastSpeech(dg.Layer): class FastSpeech(dg.Layer):
def __init__(self, cfg):
    """Build the FastSpeech sub-networks from a configuration mapping.

    Args:
        cfg (dict-like): model configuration. The keys read here are
            ``max_seq_len``, ``fs_hidden_size``, ``encoder_n_layer``,
            ``encoder_head``, ``encoder_conv1d_filter_size``,
            ``decoder_n_layer``, ``decoder_head``,
            ``decoder_conv1d_filter_size``, ``fft_conv1d_filter``,
            ``fft_conv1d_padding``, ``duration_predictor_output_size``,
            ``duration_predictor_filter_size``, ``dropout`` and, under
            ``audio``, ``num_mels`` and ``outputs_per_step``.

    Note:
        ``cfg['dropout']`` is only forwarded to the length regulator; the
        encoder, the decoder and the post-net are built with a fixed
        dropout of 0.1.
    """
    super(FastSpeech, self).__init__()

    hidden_size = cfg['fs_hidden_size']

    # Text encoder: the vocabulary gets one extra slot on top of `symbols`.
    encoder_heads = cfg['encoder_head']
    self.encoder = Encoder(
        n_src_vocab=len(symbols) + 1,
        len_max_seq=cfg['max_seq_len'],
        n_layers=cfg['encoder_n_layer'],
        n_head=encoder_heads,
        d_k=hidden_size // encoder_heads,
        d_v=hidden_size // encoder_heads,
        d_model=hidden_size,
        d_inner=cfg['encoder_conv1d_filter_size'],
        fft_conv1d_kernel=cfg['fft_conv1d_filter'],
        fft_conv1d_padding=cfg['fft_conv1d_padding'],
        dropout=0.1)

    # Length regulator (wraps the duration predictor).
    self.length_regulator = LengthRegulator(
        input_size=hidden_size,
        out_channels=cfg['duration_predictor_output_size'],
        filter_size=cfg['duration_predictor_filter_size'],
        dropout=cfg['dropout'])

    # Mel-side decoder.
    decoder_heads = cfg['decoder_head']
    self.decoder = Decoder(
        len_max_seq=cfg['max_seq_len'],
        n_layers=cfg['decoder_n_layer'],
        n_head=decoder_heads,
        d_k=hidden_size // decoder_heads,
        d_v=hidden_size // decoder_heads,
        d_model=hidden_size,
        d_inner=cfg['decoder_conv1d_filter_size'],
        fft_conv1d_kernel=cfg['fft_conv1d_filter'],
        fft_conv1d_padding=cfg['fft_conv1d_padding'],
        dropout=0.1)

    # Projection to mel frames: Xavier weights, uniform bias in
    # [-1/sqrt(hidden_size), 1/sqrt(hidden_size)].
    self.weight = fluid.ParamAttr(
        initializer=fluid.initializer.XavierInitializer())
    k = math.sqrt(1 / hidden_size)
    self.bias = fluid.ParamAttr(
        initializer=fluid.initializer.Uniform(low=-k, high=k))
    audio_cfg = cfg['audio']
    self.mel_linear = dg.Linear(
        hidden_size,
        audio_cfg['num_mels'] * audio_cfg['outputs_per_step'],
        param_attr=self.weight,
        bias_attr=self.bias)

    # Post-net; forward() adds its output back onto the mel projection.
    postnet_filter_size = 5
    self.postnet = PostConvNet(
        n_mels=audio_cfg['num_mels'],
        num_hidden=512,
        filter_size=postnet_filter_size,
        padding=int(postnet_filter_size / 2),
        num_conv=5,
        outputs_per_step=audio_cfg['outputs_per_step'],
        use_cudnn=True,
        dropout=0.1,
        batchnorm_last=True)
def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0): def forward(self,
character,
text_pos,
mel_pos=None,
length_target=None,
alpha=1.0):
""" """
FastSpeech model. FastSpeech model.
...@@ -80,22 +106,25 @@ class FastSpeech(dg.Layer): ...@@ -80,22 +106,25 @@ class FastSpeech(dg.Layer):
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
""" """
encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos) encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(
character, text_pos)
if fluid.framework._dygraph_tracer()._train_mode: if fluid.framework._dygraph_tracer()._train_mode:
length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output, length_regulator_output, duration_predictor_output = self.length_regulator(
target=length_target, encoder_output, target=length_target, alpha=alpha)
alpha=alpha) decoder_output, dec_slf_attn_list = self.decoder(
decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos) length_regulator_output, mel_pos)
mel_output = self.mel_linear(decoder_output) mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output mel_output_postnet = self.postnet(mel_output) + mel_output
return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list
else: else:
length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha) length_regulator_output, decoder_pos = self.length_regulator(
decoder_output, _ = self.decoder(length_regulator_output, decoder_pos) encoder_output, alpha=alpha)
decoder_output, _ = self.decoder(length_regulator_output,
decoder_pos)
mel_output = self.mel_linear(decoder_output) mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output mel_output_postnet = self.postnet(mel_output) + mel_output
return mel_output, mel_output_postnet return mel_output, mel_output_postnet
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np import numpy as np
import math import math
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
...@@ -6,11 +19,32 @@ import paddle.fluid as fluid ...@@ -6,11 +19,32 @@ import paddle.fluid as fluid
from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward from parakeet.modules.ffn import PositionwiseFeedForward
class FFTBlock(dg.Layer): class FFTBlock(dg.Layer):
def __init__(self,
             d_model,
             d_inner,
             n_head,
             d_k,
             d_v,
             filter_size,
             padding,
             dropout=0.2):
    """Create the self-attention and position-wise feed-forward sub-layers.

    Args:
        d_model: first positional argument of both sub-layers.
        d_inner: second positional argument of the feed-forward sub-layer.
        n_head: forwarded to the attention sub-layer as ``num_head``.
        d_k: second positional argument of the attention sub-layer.
        d_v: third positional argument of the attention sub-layer.
        filter_size: ``filter_size`` of the feed-forward sub-layer.
        padding: ``padding`` of the feed-forward sub-layer.
        dropout: dropout value shared by both sub-layers.
    """
    super(FFTBlock, self).__init__()

    # The attention sub-layer is created first, then the feed-forward one.
    attention_options = dict(
        num_head=n_head, is_bias=True, dropout=dropout, is_concat=False)
    self.slf_attn = MultiheadAttention(d_model, d_k, d_v,
                                       **attention_options)

    ffn_options = dict(
        filter_size=filter_size, padding=padding, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, **ffn_options)
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
""" """
...@@ -27,10 +61,11 @@ class FFTBlock(dg.Layer): ...@@ -27,10 +61,11 @@ class FFTBlock(dg.Layer):
output (Variable), Shape(B, T, C), the output after self-attention & ffn. output (Variable), Shape(B, T, C), the output after self-attention & ffn.
slf_attn (Variable), Shape(B * n_head, T, T), the self attention. slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
""" """
output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask) output, slf_attn = self.slf_attn(
enc_input, enc_input, enc_input, mask=slf_attn_mask)
output *= non_pad_mask output *= non_pad_mask
output = self.pos_ffn(output) output = self.pos_ffn(output)
output *= non_pad_mask output *= non_pad_mask
return output, slf_attn return output, slf_attn
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np import numpy as np
import math import math
import parakeet.models.fastspeech.utils import parakeet.models.fastspeech.utils
...@@ -6,47 +19,50 @@ import paddle.fluid.layers as layers ...@@ -6,47 +19,50 @@ import paddle.fluid.layers as layers
import paddle.fluid as fluid import paddle.fluid as fluid
from parakeet.modules.customized import Conv1D from parakeet.modules.customized import Conv1D
class LengthRegulator(dg.Layer): class LengthRegulator(dg.Layer):
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
    """Create the length regulator and the duration predictor it owns.

    All four arguments are forwarded unchanged, by keyword, to
    ``DurationPredictor``.
    """
    super(LengthRegulator, self).__init__()
    predictor_options = dict(
        input_size=input_size,
        out_channels=out_channels,
        filter_size=filter_size,
        dropout=dropout)
    self.duration_predictor = DurationPredictor(**predictor_options)
def LR(self, x, duration_predictor_output, alpha=1.0):
    """Expand every item of the batch by its durations and pad the results.

    Args:
        x: batch to expand; it is sliced one item at a time along axis 0.
        duration_predictor_output: per-item durations, sliced along axis 0
            in step with ``x``.
        alpha: passed through to ``self.expand`` unchanged.

    Returns:
        The value returned by ``self.pad`` for the list of expanded items.
    """
    # Slices of width one keep the leading axis on each item.
    expanded = [
        self.expand(x[b:b + 1], duration_predictor_output[b:b + 1], alpha)
        for b in range(x.shape[0])
    ]
    return self.pad(expanded)
def pad(self, input_ele):
    """Zero-pad each element along axis 0 to a common length and stack them.

    Args:
        input_ele: sequence of 2-D variables (the padding spec has four
            entries) whose first dimension may differ.

    Returns:
        ``layers.stack`` of the padded elements.
    """
    lengths = [element.shape[0] for element in input_ele]
    longest = max(lengths)
    # Padding spec is [before_0, after_0, before_1, after_1]; only the end
    # of axis 0 is extended.
    padded = [
        layers.pad(element, [0, longest - length, 0, 0], pad_value=0.0)
        for element, length in zip(input_ele, lengths)
    ]
    return layers.stack(padded)
def expand(self, batch, predicted, alpha):
    """Repeat each row of a single item according to its duration.

    Args:
        batch: one item with a leading axis of size 1 (``LR`` passes
            ``x[i:i + 1]``); that axis is squeezed away here.
        predicted: durations for this item; converted with ``.numpy()`` and
            read as ``[0, t]``.
        alpha: accepted for interface compatibility; not used in this method.

    Returns:
        The rows concatenated along axis 0, row ``t`` repeated
        ``int(predicted[0, t])`` times. Rows whose duration equals 0 are
        skipped.
    """
    time_steps = batch.shape[1]
    repeats = predicted.numpy()
    frames = layers.squeeze(batch, [0])
    pieces = [
        layers.expand(frames[t:t + 1, :], [int(repeats[0, t]), 1])
        for t in range(time_steps)
        if repeats[0, t] != 0
    ]
    return layers.concat(pieces, axis=0)
def forward(self, x, alpha=1.0, target=None): def forward(self, x, alpha=1.0, target=None):
""" """
...@@ -70,10 +86,11 @@ class LengthRegulator(dg.Layer): ...@@ -70,10 +86,11 @@ class LengthRegulator(dg.Layer):
else: else:
duration_predictor_output = layers.round(duration_predictor_output) duration_predictor_output = layers.round(duration_predictor_output)
output = self.LR(x, duration_predictor_output, alpha) output = self.LR(x, duration_predictor_output, alpha)
mel_pos = dg.to_variable(np.arange(1, output.shape[1]+1)) mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1))
mel_pos = layers.unsqueeze(mel_pos, [0]) mel_pos = layers.unsqueeze(mel_pos, [0])
return output, mel_pos return output, mel_pos
class DurationPredictor(dg.Layer): class DurationPredictor(dg.Layer):
def __init__(self, input_size, out_channels, filter_size, dropout=0.1): def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
super(DurationPredictor, self).__init__() super(DurationPredictor, self).__init__()
...@@ -83,30 +100,38 @@ class DurationPredictor(dg.Layer): ...@@ -83,30 +100,38 @@ class DurationPredictor(dg.Layer):
self.dropout = dropout self.dropout = dropout
k = math.sqrt(1 / self.input_size) k = math.sqrt(1 / self.input_size)
self.conv1 = Conv1D(num_channels = self.input_size, self.conv1 = Conv1D(
num_filters = self.out_channels, num_channels=self.input_size,
filter_size = self.filter_size, num_filters=self.out_channels,
padding=1, filter_size=self.filter_size,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), padding=1,
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) param_attr=fluid.ParamAttr(
#data_format='NTC') initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
#data_format='NTC')
k = math.sqrt(1 / self.out_channels) k = math.sqrt(1 / self.out_channels)
self.conv2 = Conv1D(num_channels = self.out_channels, self.conv2 = Conv1D(
num_filters = self.out_channels, num_channels=self.out_channels,
filter_size = self.filter_size, num_filters=self.out_channels,
padding=1, filter_size=self.filter_size,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), padding=1,
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) param_attr=fluid.ParamAttr(
#data_format='NTC') initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
#data_format='NTC')
self.layer_norm1 = dg.LayerNorm(self.out_channels) self.layer_norm1 = dg.LayerNorm(self.out_channels)
self.layer_norm2 = dg.LayerNorm(self.out_channels) self.layer_norm2 = dg.LayerNorm(self.out_channels)
self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()) self.weight = fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer())
k = math.sqrt(1 / self.out_channels) k = math.sqrt(1 / self.out_channels)
self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)) self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k))
self.linear =dg.Linear(self.out_channels, 1, param_attr = self.weight, self.linear = dg.Linear(
bias_attr = self.bias) self.out_channels, 1, param_attr=self.weight, bias_attr=self.bias)
def forward(self, encoder_output): def forward(self, encoder_output):
""" """
...@@ -118,18 +143,15 @@ class DurationPredictor(dg.Layer): ...@@ -118,18 +143,15 @@ class DurationPredictor(dg.Layer):
out (Variable), Shape(B, T, C), the output of duration predictor. out (Variable), Shape(B, T, C), the output of duration predictor.
""" """
# encoder_output.shape(N, T, C) # encoder_output.shape(N, T, C)
out = layers.transpose(encoder_output, [0,2,1]) out = layers.transpose(encoder_output, [0, 2, 1])
out = self.conv1(out) out = self.conv1(out)
out = layers.transpose(out, [0,2,1]) out = layers.transpose(out, [0, 2, 1])
out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout) out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout)
out = layers.transpose(out, [0,2,1]) out = layers.transpose(out, [0, 2, 1])
out = self.conv2(out) out = self.conv2(out)
out = layers.transpose(out, [0,2,1]) out = layers.transpose(out, [0, 2, 1])
out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout) out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout)
out = layers.relu(self.linear(out)) out = layers.relu(self.linear(out))
out = layers.squeeze(out, axes=[-1]) out = layers.squeeze(out, axes=[-1])
return out
return out
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np import numpy as np
def get_alignment(attn_probs, mel_lens, n_head): def get_alignment(attn_probs, mel_lens, n_head):
max_F = 0 max_F = 0
assert attn_probs[0].shape[0] % n_head == 0 assert attn_probs[0].shape[0] % n_head == 0
...@@ -8,27 +22,27 @@ def get_alignment(attn_probs, mel_lens, n_head): ...@@ -8,27 +22,27 @@ def get_alignment(attn_probs, mel_lens, n_head):
for i in range(len(attn_probs)): for i in range(len(attn_probs)):
multi_attn = attn_probs[i].numpy() multi_attn = attn_probs[i].numpy()
for j in range(n_head): for j in range(n_head):
attn = multi_attn[j*batch_size:(j+1)*batch_size] attn = multi_attn[j * batch_size:(j + 1) * batch_size]
F = score_F(attn) F = score_F(attn)
if max_F < F: if max_F < F:
max_F = F max_F = F
max_attn = attn max_attn = attn
alignment = compute_duration(max_attn, mel_lens) alignment = compute_duration(max_attn, mel_lens)
return alignment return alignment
def score_F(attn):
    """Score an attention array by its average peak value.

    Takes the maximum of ``attn`` along its last axis and returns the mean
    of those maxima.

    Args:
        attn (numpy.ndarray): attention values.

    Returns:
        numpy.floating: mean over all positions of the last-axis maximum.
    """
    # Named `peaks` rather than `max` so the builtin is not shadowed.
    peaks = np.max(attn, axis=-1)
    return np.mean(peaks)
def compute_duration(attn, mel_lens):
    """Count, per item, how often each last-axis position is the arg-max.

    For item ``i`` the first ``mel_lens[i]`` rows along axis 1 are visited;
    each row adds one to the count at the index of its largest value.

    Args:
        attn (numpy.ndarray): 3-D array indexed as ``attn[i, j]``
            (presumably batch x mel frames x text positions; confirm
            against the caller).
        mel_lens: object exposing ``.numpy()`` that yields one integer
            length per item.

    Returns:
        numpy.ndarray: float array of shape
        ``(attn.shape[0], attn.shape[2])`` holding the counts.
    """
    durations = np.zeros([attn.shape[0], attn.shape[2]])
    frame_counts = mel_lens.numpy()
    for item, scores in enumerate(attn):
        for frame in range(frame_counts[item]):
            durations[item, np.argmax(scores[frame])] += 1
    return durations
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math import math
from parakeet.g2p.text.symbols import symbols from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
...@@ -7,9 +20,16 @@ from parakeet.modules.customized import Pool1D, Conv1D ...@@ -7,9 +20,16 @@ from parakeet.modules.customized import Pool1D, Conv1D
from parakeet.modules.dynamic_gru import DynamicGRU from parakeet.modules.dynamic_gru import DynamicGRU
import numpy as np import numpy as np
class CBHG(dg.Layer): class CBHG(dg.Layer):
def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2, def __init__(self,
max_pool_kernel_size=2, is_post=False): hidden_size,
batch_size,
K=16,
projection_size=256,
num_gru_layers=2,
max_pool_kernel_size=2,
is_post=False):
super(CBHG, self).__init__() super(CBHG, self).__init__()
""" """
:param hidden_size: dimension of hidden unit :param hidden_size: dimension of hidden unit
...@@ -24,28 +44,39 @@ class CBHG(dg.Layer): ...@@ -24,28 +44,39 @@ class CBHG(dg.Layer):
self.projection_size = projection_size self.projection_size = projection_size
self.conv_list = [] self.conv_list = []
k = math.sqrt(1 / projection_size) k = math.sqrt(1 / projection_size)
self.conv_list.append(Conv1D(num_channels = projection_size, self.conv_list.append(
num_filters = hidden_size, Conv1D(
filter_size = 1, num_channels=projection_size,
padding = int(np.floor(1/2)), num_filters=hidden_size,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), filter_size=1,
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))) padding=int(np.floor(1 / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k))))
k = math.sqrt(1 / hidden_size) k = math.sqrt(1 / hidden_size)
for i in range(2,K+1): for i in range(2, K + 1):
self.conv_list.append(Conv1D(num_channels = hidden_size, self.conv_list.append(
num_filters = hidden_size, Conv1D(
filter_size = i, num_channels=hidden_size,
padding = int(np.floor(i/2)), num_filters=hidden_size,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), filter_size=i,
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))) padding=int(np.floor(i / 2)),
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k))))
for i, layer in enumerate(self.conv_list): for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer) self.add_sublayer("conv_list_{}".format(i), layer)
self.batchnorm_list = [] self.batchnorm_list = []
for i in range(K): for i in range(K):
self.batchnorm_list.append(dg.BatchNorm(hidden_size, self.batchnorm_list.append(
data_layout='NCHW')) dg.BatchNorm(
hidden_size, data_layout='NCHW'))
for i, layer in enumerate(self.batchnorm_list): for i, layer in enumerate(self.batchnorm_list):
self.add_sublayer("batchnorm_list_{}".format(i), layer) self.add_sublayer("batchnorm_list_{}".format(i), layer)
...@@ -53,91 +84,120 @@ class CBHG(dg.Layer): ...@@ -53,91 +84,120 @@ class CBHG(dg.Layer):
conv_outdim = hidden_size * K conv_outdim = hidden_size * K
k = math.sqrt(1 / conv_outdim) k = math.sqrt(1 / conv_outdim)
self.conv_projection_1 = Conv1D(num_channels = conv_outdim, self.conv_projection_1 = Conv1D(
num_filters = hidden_size, num_channels=conv_outdim,
filter_size = 3, num_filters=hidden_size,
padding = int(np.floor(3/2)), filter_size=3,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), padding=int(np.floor(3 / 2)),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
k = math.sqrt(1 / hidden_size) k = math.sqrt(1 / hidden_size)
self.conv_projection_2 = Conv1D(num_channels = hidden_size, self.conv_projection_2 = Conv1D(
num_filters = projection_size, num_channels=hidden_size,
filter_size = 3, num_filters=projection_size,
padding = int(np.floor(3/2)), filter_size=3,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), padding=int(np.floor(3 / 2)),
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
data_layout='NCHW') low=-k, high=k)))
self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
data_layout='NCHW') self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW')
self.max_pool = Pool1D(pool_size = max_pool_kernel_size, self.batchnorm_proj_2 = dg.BatchNorm(
pool_type='max', projection_size, data_layout='NCHW')
pool_stride=1, self.max_pool = Pool1D(
pool_padding=1, pool_size=max_pool_kernel_size,
data_format = "NCT") pool_type='max',
pool_stride=1,
pool_padding=1,
data_format="NCT")
self.highway = Highwaynet(self.projection_size) self.highway = Highwaynet(self.projection_size)
h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32") h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
h_0 = dg.to_variable(h_0) h_0 = dg.to_variable(h_0)
k = math.sqrt(1 / hidden_size) k = math.sqrt(1 / hidden_size)
self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3, self.fc_forward1 = dg.Linear(
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), hidden_size,
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) hidden_size // 2 * 3,
self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3, param_attr=fluid.ParamAttr(
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2, low=-k, high=k)))
is_reverse = False, self.fc_reverse1 = dg.Linear(
origin_mode = True, hidden_size,
h_0 = h_0) hidden_size // 2 * 3,
self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2, param_attr=fluid.ParamAttr(
is_reverse=True, initializer=fluid.initializer.XavierInitializer()),
origin_mode=True, bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
h_0 = h_0) low=-k, high=k)))
self.gru_forward1 = DynamicGRU(
self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3, size=self.hidden_size // 2,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), is_reverse=False,
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) origin_mode=True,
self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3, h_0=h_0)
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), self.gru_reverse1 = DynamicGRU(
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) size=self.hidden_size // 2,
self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2, is_reverse=True,
is_reverse = False, origin_mode=True,
origin_mode = True, h_0=h_0)
h_0 = h_0)
self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2, self.fc_forward2 = dg.Linear(
is_reverse=True, hidden_size,
origin_mode=True, hidden_size // 2 * 3,
h_0 = h_0) param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.fc_reverse2 = dg.Linear(
hidden_size,
hidden_size // 2 * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.gru_forward2 = DynamicGRU(
size=self.hidden_size // 2,
is_reverse=False,
origin_mode=True,
h_0=h_0)
self.gru_reverse2 = DynamicGRU(
size=self.hidden_size // 2,
is_reverse=True,
origin_mode=True,
h_0=h_0)
def _conv_fit_dim(self, x, filter_size=3): def _conv_fit_dim(self, x, filter_size=3):
if filter_size % 2 == 0: if filter_size % 2 == 0:
return x[:,:,:-1] return x[:, :, :-1]
else: else:
return x return x
def forward(self, input_): def forward(self, input_):
# input_.shape = [N, C, T] # input_.shape = [N, C, T]
conv_list = [] conv_list = []
conv_input = input_ conv_input = input_
for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)): for i, (conv, batchnorm
conv_input = self._conv_fit_dim(conv(conv_input), i+1) ) in enumerate(zip(self.conv_list, self.batchnorm_list)):
conv_input = self._conv_fit_dim(conv(conv_input), i + 1)
conv_input = layers.relu(batchnorm(conv_input)) conv_input = layers.relu(batchnorm(conv_input))
conv_list.append(conv_input) conv_list.append(conv_input)
conv_cat = layers.concat(conv_list, axis=1) conv_cat = layers.concat(conv_list, axis=1)
conv_pool = self.max_pool(conv_cat)[:,:,:-1] conv_pool = self.max_pool(conv_cat)[:, :, :-1]
conv_proj = layers.relu(
conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool)))) self.batchnorm_proj_1(
conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_ self._conv_fit_dim(self.conv_projection_1(conv_pool))))
conv_proj = self.batchnorm_proj_2(
self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
# conv_proj.shape = [N, C, T] # conv_proj.shape = [N, C, T]
highway = layers.transpose(conv_proj, [0,2,1]) highway = layers.transpose(conv_proj, [0, 2, 1])
highway = self.highway(highway) highway = self.highway(highway)
# highway.shape = [N, T, C] # highway.shape = [N, T, C]
...@@ -151,9 +211,10 @@ class CBHG(dg.Layer): ...@@ -151,9 +211,10 @@ class CBHG(dg.Layer):
out_forward = self.gru_forward2(fc_forward) out_forward = self.gru_forward2(fc_forward)
out_reverse = self.gru_reverse2(fc_reverse) out_reverse = self.gru_reverse2(fc_reverse)
out = layers.concat([out_forward, out_reverse], axis=-1) out = layers.concat([out_forward, out_reverse], axis=-1)
out = layers.transpose(out, [0,2,1]) out = layers.transpose(out, [0, 2, 1])
return out return out
class Highwaynet(dg.Layer): class Highwaynet(dg.Layer):
def __init__(self, num_units, num_layers=4): def __init__(self, num_units, num_layers=4):
super(Highwaynet, self).__init__() super(Highwaynet, self).__init__()
...@@ -164,14 +225,26 @@ class Highwaynet(dg.Layer): ...@@ -164,14 +225,26 @@ class Highwaynet(dg.Layer):
self.linears = [] self.linears = []
k = math.sqrt(1 / num_units) k = math.sqrt(1 / num_units)
for i in range(num_layers): for i in range(num_layers):
self.linears.append(dg.Linear(num_units, num_units, self.linears.append(
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), dg.Linear(
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))) num_units,
self.gates.append(dg.Linear(num_units, num_units, num_units,
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), param_attr=fluid.ParamAttr(
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))) initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
for i, (linear, gate) in enumerate(zip(self.linears,self.gates)): initializer=fluid.initializer.Uniform(
low=-k, high=k))))
self.gates.append(
dg.Linear(
num_units,
num_units,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k))))
for i, (linear, gate) in enumerate(zip(self.linears, self.gates)):
self.add_sublayer("linears_{}".format(i), linear) self.add_sublayer("linears_{}".format(i), linear)
self.add_sublayer("gates_{}".format(i), gate) self.add_sublayer("gates_{}".format(i), gate)
...@@ -183,12 +256,6 @@ class Highwaynet(dg.Layer): ...@@ -183,12 +256,6 @@ class Highwaynet(dg.Layer):
t_ = fluid.layers.sigmoid(gate(out)) t_ = fluid.layers.sigmoid(gate(out))
c = 1 - t_ c = 1 - t_
out = h * t_ + out * c out = h * t_ + out * c
return out
return out
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math import math
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
...@@ -7,70 +20,110 @@ from parakeet.modules.ffn import PositionwiseFeedForward ...@@ -7,70 +20,110 @@ from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.prenet import PreNet from parakeet.models.transformer_tts.prenet import PreNet
from parakeet.models.transformer_tts.post_convnet import PostConvNet from parakeet.models.transformer_tts.post_convnet import PostConvNet
class Decoder(dg.Layer): class Decoder(dg.Layer):
def __init__(self, num_hidden, config, num_head=4): def __init__(self, num_hidden, config, num_head=4):
super(Decoder, self).__init__() super(Decoder, self).__init__()
self.num_hidden = num_hidden self.num_hidden = num_hidden
param = fluid.ParamAttr() param = fluid.ParamAttr()
self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32', self.alpha = self.create_parameter(
default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) shape=(1, ),
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) attr=param,
self.pos_emb = dg.Embedding(size=[1024, num_hidden], dtype='float32',
padding_idx=0, default_initializer=fluid.initializer.ConstantInitializer(
param_attr=fluid.ParamAttr( value=1.0))
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), self.pos_inp = get_sinusoid_encoding_table(
trainable=False)) 1024, self.num_hidden, padding_idx=0)
self.decoder_prenet = PreNet(input_size = config['audio']['num_mels'], self.pos_emb = dg.Embedding(
hidden_size = num_hidden * 2, size=[1024, num_hidden],
output_size = num_hidden, padding_idx=0,
dropout_rate=0.2) param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(
self.pos_inp),
trainable=False))
self.decoder_prenet = PreNet(
input_size=config['audio']['num_mels'],
hidden_size=num_hidden * 2,
output_size=num_hidden,
dropout_rate=0.2)
k = math.sqrt(1 / num_hidden) k = math.sqrt(1 / num_hidden)
self.linear = dg.Linear(num_hidden, num_hidden, self.linear = dg.Linear(
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), num_hidden,
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) num_hidden,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] self.selfattn_layers = [
MultiheadAttention(num_hidden, num_hidden // num_head,
num_hidden // num_head) for _ in range(3)
]
for i, layer in enumerate(self.selfattn_layers): for i, layer in enumerate(self.selfattn_layers):
self.add_sublayer("self_attn_{}".format(i), layer) self.add_sublayer("self_attn_{}".format(i), layer)
self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] self.attn_layers = [
MultiheadAttention(num_hidden, num_hidden // num_head,
num_hidden // num_head) for _ in range(3)
]
for i, layer in enumerate(self.attn_layers): for i, layer in enumerate(self.attn_layers):
self.add_sublayer("attn_{}".format(i), layer) self.add_sublayer("attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)] self.ffns = [
PositionwiseFeedForward(
num_hidden, num_hidden * num_head, filter_size=1)
for _ in range(3)
]
for i, layer in enumerate(self.ffns): for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer) self.add_sublayer("ffns_{}".format(i), layer)
self.mel_linear = dg.Linear(num_hidden, config['audio']['num_mels'] * config['audio']['outputs_per_step'], self.mel_linear = dg.Linear(
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), num_hidden,
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) config['audio']['num_mels'] * config['audio']['outputs_per_step'],
self.stop_linear = dg.Linear(num_hidden, 1, param_attr=fluid.ParamAttr(
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.postconvnet = PostConvNet(config['audio']['num_mels'], config['hidden_size'], self.stop_linear = dg.Linear(
filter_size = 5, padding = 4, num_conv=5, num_hidden,
outputs_per_step=config['audio']['outputs_per_step'], 1,
use_cudnn = True) param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
self.postconvnet = PostConvNet(
config['audio']['num_mels'],
config['hidden_size'],
filter_size=5,
padding=4,
num_conv=5,
outputs_per_step=config['audio']['outputs_per_step'],
use_cudnn=True)
def forward(self, key, value, query, c_mask, positional): def forward(self, key, value, query, c_mask, positional):
# get decoder mask with triangular matrix # get decoder mask with triangular matrix
if fluid.framework._dygraph_tracer()._train_mode: if fluid.framework._dygraph_tracer()._train_mode:
m_mask = get_non_pad_mask(positional) m_mask = get_non_pad_mask(positional)
mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query) mask = get_attn_key_pad_mask((positional == 0).astype(np.float32),
triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32) query)
triu_tensor = dg.to_variable(
get_triu_tensor(query.numpy(), query.numpy())).astype(
np.float32)
mask = mask + triu_tensor mask = mask + triu_tensor
mask = fluid.layers.cast(mask == 0, np.float32) mask = fluid.layers.cast(mask == 0, np.float32)
# (batch_size, decoder_len, encoder_len) # (batch_size, decoder_len, encoder_len)
zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query) zero_mask = get_attn_key_pad_mask(
layers.squeeze(c_mask, [-1]), query)
else: else:
mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32) mask = get_triu_tensor(query.numpy(),
query.numpy()).astype(np.float32)
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
m_mask, zero_mask = None, None m_mask, zero_mask = None, None
# Decoder pre-network # Decoder pre-network
query = self.decoder_prenet(query) query = self.decoder_prenet(query)
# Centered position # Centered position
query = self.linear(query) query = self.linear(query)
...@@ -84,10 +137,13 @@ class Decoder(dg.Layer): ...@@ -84,10 +137,13 @@ class Decoder(dg.Layer):
# Attention decoder-decoder, encoder-decoder # Attention decoder-decoder, encoder-decoder
selfattn_list = list() selfattn_list = list()
attn_list = list() attn_list = list()
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns): for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers,
query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask) self.ffns):
query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask) query, attn_dec = selfattn(
query, query, query, mask=mask, query_mask=m_mask)
query, attn_dot = attn(
key, value, query, mask=zero_mask, query_mask=m_mask)
query = ffn(query) query = ffn(query)
selfattn_list.append(attn_dec) selfattn_list.append(attn_dec)
attn_list.append(attn_dot) attn_list.append(attn_dot)
...@@ -96,7 +152,7 @@ class Decoder(dg.Layer): ...@@ -96,7 +152,7 @@ class Decoder(dg.Layer):
# Post Mel Network # Post Mel Network
out = self.postconvnet(mel_out) out = self.postconvnet(mel_out)
out = mel_out + out out = mel_out + out
# Stop tokens # Stop tokens
stop_tokens = self.stop_linear(query) stop_tokens = self.stop_linear(query)
stop_tokens = layers.squeeze(stop_tokens, [-1]) stop_tokens = layers.squeeze(stop_tokens, [-1])
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import * from parakeet.models.transformer_tts.utils import *
...@@ -5,25 +18,41 @@ from parakeet.modules.multihead_attention import MultiheadAttention ...@@ -5,25 +18,41 @@ from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet
class Encoder(dg.Layer): class Encoder(dg.Layer):
def __init__(self, embedding_size, num_hidden, num_head=4): def __init__(self, embedding_size, num_hidden, num_head=4):
super(Encoder, self).__init__() super(Encoder, self).__init__()
self.num_hidden = num_hidden self.num_hidden = num_hidden
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0)) param = fluid.ParamAttr(initializer=fluid.initializer.Constant(
self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32') value=1.0))
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) self.alpha = self.create_parameter(
self.pos_emb = dg.Embedding(size=[1024, num_hidden], shape=(1, ), attr=param, dtype='float32')
padding_idx=0, self.pos_inp = get_sinusoid_encoding_table(
param_attr=fluid.ParamAttr( 1024, self.num_hidden, padding_idx=0)
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), self.pos_emb = dg.Embedding(
trainable=False)) size=[1024, num_hidden],
self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size, padding_idx=0,
num_hidden = num_hidden, param_attr=fluid.ParamAttr(
use_cudnn=True) initializer=fluid.initializer.NumpyArrayInitializer(
self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)] self.pos_inp),
trainable=False))
self.encoder_prenet = EncoderPrenet(
embedding_size=embedding_size,
num_hidden=num_hidden,
use_cudnn=True)
self.layers = [
MultiheadAttention(num_hidden, num_hidden // num_head,
num_hidden // num_head) for _ in range(3)
]
for i, layer in enumerate(self.layers): for i, layer in enumerate(self.layers):
self.add_sublayer("self_attn_{}".format(i), layer) self.add_sublayer("self_attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = True) for _ in range(3)] self.ffns = [
PositionwiseFeedForward(
num_hidden,
num_hidden * num_head,
filter_size=1,
use_cudnn=True) for _ in range(3)
]
for i, layer in enumerate(self.ffns): for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer) self.add_sublayer("ffns_{}".format(i), layer)
...@@ -33,25 +62,23 @@ class Encoder(dg.Layer): ...@@ -33,25 +62,23 @@ class Encoder(dg.Layer):
mask = get_attn_key_pad_mask(positional, x) mask = get_attn_key_pad_mask(positional, x)
else: else:
query_mask, mask = None, None query_mask, mask = None, None
# Encoder pre_network # Encoder pre_network
x = self.encoder_prenet(x) #(N,T,C) x = self.encoder_prenet(x) #(N,T,C)
# Get positional encoding # Get positional encoding
positional = self.pos_emb(positional) positional = self.pos_emb(positional)
x = positional * self.alpha + x #(N, T, C) x = positional * self.alpha + x #(N, T, C)
# Positional dropout # Positional dropout
x = layers.dropout(x, 0.1) x = layers.dropout(x, 0.1)
# Self attention encoder # Self attention encoder
attentions = list() attentions = list()
for layer, ffn in zip(self.layers, self.ffns): for layer, ffn in zip(self.layers, self.ffns):
x, attention = layer(x, x, x, mask = mask, query_mask = query_mask) x, attention = layer(x, x, x, mask=mask, query_mask=query_mask)
x = ffn(x) x = ffn(x)
attentions.append(attention) attentions.append(attention)
return x, query_mask, attentions return x, query_mask, attentions
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math import math
from parakeet.g2p.text.symbols import symbols from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
...@@ -13,47 +26,63 @@ class EncoderPrenet(dg.Layer): ...@@ -13,47 +26,63 @@ class EncoderPrenet(dg.Layer):
self.embedding_size = embedding_size self.embedding_size = embedding_size
self.num_hidden = num_hidden self.num_hidden = num_hidden
self.use_cudnn = use_cudnn self.use_cudnn = use_cudnn
self.embedding = dg.Embedding( size = [len(symbols), embedding_size], self.embedding = dg.Embedding(
padding_idx = None) size=[len(symbols), embedding_size], padding_idx=None)
self.conv_list = [] self.conv_list = []
k = math.sqrt(1 / embedding_size) k = math.sqrt(1 / embedding_size)
self.conv_list.append(Conv1D(num_channels = embedding_size, self.conv_list.append(
num_filters = num_hidden, Conv1D(
filter_size = 5, num_channels=embedding_size,
padding = int(np.floor(5/2)), num_filters=num_hidden,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), filter_size=5,
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), padding=int(np.floor(5 / 2)),
use_cudnn = use_cudnn)) param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k)),
use_cudnn=use_cudnn))
k = math.sqrt(1 / num_hidden) k = math.sqrt(1 / num_hidden)
for _ in range(2): for _ in range(2):
self.conv_list.append(Conv1D(num_channels = num_hidden, self.conv_list.append(
num_filters = num_hidden, Conv1D(
filter_size = 5, num_channels=num_hidden,
padding = int(np.floor(5/2)), num_filters=num_hidden,
param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), filter_size=5,
bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), padding=int(np.floor(5 / 2)),
use_cudnn = use_cudnn)) param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-k, high=k)),
use_cudnn=use_cudnn))
for i, layer in enumerate(self.conv_list): for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer) self.add_sublayer("conv_list_{}".format(i), layer)
self.batch_norm_list = [dg.BatchNorm(num_hidden, self.batch_norm_list = [
data_layout='NCHW') for _ in range(3)] dg.BatchNorm(
num_hidden, data_layout='NCHW') for _ in range(3)
]
for i, layer in enumerate(self.batch_norm_list): for i, layer in enumerate(self.batch_norm_list):
self.add_sublayer("batch_norm_list_{}".format(i), layer) self.add_sublayer("batch_norm_list_{}".format(i), layer)
k = math.sqrt(1 / num_hidden) k = math.sqrt(1 / num_hidden)
self.projection = dg.Linear(num_hidden, num_hidden, self.projection = dg.Linear(
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), num_hidden,
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) num_hidden,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
def forward(self, x): def forward(self, x):
x = self.embedding(x) #(batch_size, seq_len, embending_size) x = self.embedding(x) #(batch_size, seq_len, embending_size)
x = layers.transpose(x,[0,2,1]) x = layers.transpose(x, [0, 2, 1])
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2) x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
x = layers.transpose(x,[0,2,1]) #(N,T,C) x = layers.transpose(x, [0, 2, 1]) #(N,T,C)
x = self.projection(x) x = self.projection(x)
return x return x
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math import math
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
class PreNet(dg.Layer): class PreNet(dg.Layer):
def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2): def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
""" """
...@@ -17,13 +31,21 @@ class PreNet(dg.Layer): ...@@ -17,13 +31,21 @@ class PreNet(dg.Layer):
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
k = math.sqrt(1 / input_size) k = math.sqrt(1 / input_size)
self.linear1 = dg.Linear(input_size, hidden_size, self.linear1 = dg.Linear(
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), input_size,
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) hidden_size,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
k = math.sqrt(1 / hidden_size) k = math.sqrt(1 / hidden_size)
self.linear2 = dg.Linear(hidden_size, output_size, self.linear2 = dg.Linear(
param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()), hidden_size,
bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))) output_size,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.XavierInitializer()),
bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
low=-k, high=k)))
def forward(self, x): def forward(self, x):
""" """
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册