Commit faa725ba, authored by liuyibing01

Merge branch 'add_license' into 'master'

add license

See merge request !24
@@ -25,3 +25,11 @@
        files: \.md$
    -   id: remove-tabs
        files: \.md$
-   repo: local
    hooks:
    -   id: copyright_checker
        name: copyright_checker
        entry: python ./tools/copyright.hook
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
        exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
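With this hook registered, the license check runs on every commit; it can also be invoked by hand. A usage sketch, assuming a standard `pre-commit` installation:

```bash
pre-commit run copyright_checker --all-files
```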
# Deepvoice 3

Paddle implementation of deepvoice 3 in dynamic graph, a convolutional network based text-to-speech synthesis model. The implementation is based on [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654).
@@ -22,7 +22,7 @@ The model consists of an encoder, a decoder and a converter (and a speaker embed
## Project Structure

```text
├── data.py          data processing
├── ljspeech.yaml    (example) configuration file
├── sentences.txt    sample sentences
├── synthesis.py     script to synthesize waveform from text
@@ -50,7 +50,7 @@ optional arguments:
                        The directory to save result.
  -g DEVICE, --device DEVICE
                        device to use
```
1. `--config` is the configuration file to use. The provided `ljspeech.yaml` can be used directly, or you can change some values in it and train the model with a different config.
2. `--data` is the path of the LJSpeech dataset, i.e. the folder extracted from the downloaded archive (the one that contains `metadata.csv`). An example training invocation is sketched below.
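For reference, a typical training command might look like the following (a sketch only: the positional output directory `experiment` mirrors the tree below, and the flags are the ones documented above):

```bash
python train.py --config=./ljspeech.yaml --data=./LJSpeech-1.1 --device=0 experiment
```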
@@ -61,7 +61,7 @@ optional arguments:
├── checkpoints      # checkpoint
├── log              # tensorboard log
└── states           # train and evaluation results
    ├── alignments   # attention
    ├── lin_spec     # linear spectrogram
    ├── mel_spec     # mel spectrogram
    └── waveform     # waveform (.wav files)
@@ -112,4 +112,3 @@ example script:
```bash
python synthesis.py --config=./ljspeech.yaml --device=0 experiment/checkpoints/model_step_005000000 sentences.txt generated
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import csv
from pathlib import Path
@@ -79,10 +93,11 @@ class Transform(object):
        y = signal.lfilter([1., -self.preemphasis], [1.], wav)

        # STFT
        D = librosa.stft(
            y=y,
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length)
        S = np.abs(D)

        # to db and normalize to 0-1
@@ -96,11 +111,8 @@ class Transform(object):
        # mel scale and to db and normalize to 0-1,
        # CAUTION: pass linear scale S, not dbscaled S
        S_mel = librosa.feature.melspectrogram(
            S=S, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, power=1.)
        S_mel = 20 * np.log10(np.maximum(amplitude_min,
                                         S_mel)) - self.ref_level_db
        S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db)
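The dB conversion and 0-1 normalization above follow the usual recipe; a self-contained sketch of the same arithmetic (assuming typical values `ref_level_db = 20` and `min_level_db = -100`, which come from the config rather than this hunk):

```python
import numpy as np

amplitude_min = 1e-5                      # floor to avoid log(0)
ref_level_db, min_level_db = 20., -100.   # assumed typical config values

S_mel = np.abs(np.random.randn(80, 100))  # stand-in mel magnitudes
S_mel_db = 20 * np.log10(np.maximum(amplitude_min, S_mel)) - ref_level_db
# map [min_level_db, 0] dB onto [0, 1]; the clip is for safety in this sketch
S_mel_norm = np.clip((S_mel_db - min_level_db) / (-min_level_db), 0, 1)
```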
@@ -148,20 +160,18 @@ class DataCollector(object):
            (mix_grapheme_phonemes, text_length, speaker_id, S_norm,
             S_mel_norm, num_frames) = example
            text_sequences.append(
                np.pad(mix_grapheme_phonemes,
                       (0, max_text_length - text_length)))
            lin_specs.append(
                np.pad(S_norm,
                       ((0, 0), (self._pad_begin,
                                 max_frames - self._pad_begin - num_frames))))
            mel_specs.append(
                np.pad(S_mel_norm,
                       ((0, 0), (self._pad_begin,
                                 max_frames - self._pad_begin - num_frames))))
            done_flags.append(
                np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )),
                       (0, max_decoder_length -
                        int(np.ceil(num_frames // self._factor))),
                       constant_values=1))
        text_sequences = np.array(text_sequences).astype(np.int64)
        lin_specs = np.transpose(np.array(lin_specs),
...
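To make the `DataCollector` padding concrete, here is a toy run of the `done_flags` logic above (hypothetical numbers; `factor` stands in for `self._factor`, the decoder downsampling factor):

```python
import numpy as np

num_frames, max_decoder_length, factor = 50, 20, 4
n_steps = int(np.ceil(num_frames // factor))  # 12 decoder steps of real audio
done = np.pad(np.zeros((n_steps, )), (0, max_decoder_length - n_steps),
              constant_values=1)
print(done.astype(int))  # 12 zeros, then 8 ones marking padded steps
```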
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import ruamel.yaml
@@ -22,11 +36,8 @@ if __name__ == "__main__":
parser.add_argument("checkpoint", type=str, help="checkpoint to load.") parser.add_argument("checkpoint", type=str, help="checkpoint to load.")
parser.add_argument("text", type=str, help="text file to synthesize") parser.add_argument("text", type=str, help="text file to synthesize")
parser.add_argument("output_path", type=str, help="path to save results") parser.add_argument("output_path", type=str, help="path to save results")
parser.add_argument("-g", parser.add_argument(
"--device", "-g", "--device", type=int, default=-1, help="device to use")
type=int,
default=-1,
help="device to use")
    args = parser.parse_args()

    with open(args.config, 'rt') as f:
@@ -76,15 +87,14 @@ if __name__ == "__main__":
    window_ahead = model_config["window_ahead"]
    key_projection = model_config["key_projection"]
    value_projection = model_config["value_projection"]
    dv3 = make_model(
        n_speakers, speaker_dim, speaker_embed_std, embed_dim, padding_idx,
        embedding_std, max_positions, n_vocab, freeze_embedding,
        filter_size, encoder_channels, n_mels, decoder_channels, r,
        trainable_positional_encodings, use_memory_mask,
        query_position_rate, key_position_rate, window_backward,
        window_ahead, key_projection, value_projection, downsample_factor,
        linear_dim, use_decoder_states, converter_channels, dropout)
    summary(dv3)
    state, _ = dg.load_dygraph(args.checkpoint)
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import ruamel.yaml
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
from matplotlib import cm
@@ -28,8 +42,9 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
               converter_channels, dropout):
    """just a simple function to create a deepvoice 3 model"""
    if n_speakers > 1:
        spe = dg.Embedding(
            (n_speakers, speaker_dim),
            param_attr=I.Normal(scale=speaker_embed_std))
    else:
        spe = None
@@ -45,17 +60,17 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3), )
    enc = Encoder(
        n_vocab,
        embed_dim,
        n_speakers,
        speaker_dim,
        padding_idx=None,
        embedding_weight_std=embedding_std,
        convolutions=encoder_convolutions,
        max_positions=max_positions,
        dropout=dropout)
    if freeze_embedding:
        freeze(enc.embed)
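The `ConvSpec(channels, kernel_size, dilation)` stacks above grow the dilation geometrically (1, 3, 9, 27, ...), which is what gives the encoder a wide receptive field with few layers. A back-of-the-envelope helper (the dilation list here is illustrative, not the model's exact configuration):

```python
def receptive_field(kernel_size, dilations):
    """Receptive field of a stack of dilated convolutions."""
    return 1 + sum((kernel_size - 1) * d for d in dilations)

# e.g. kernel size 5 with two rounds of dilations 1, 3, 9, 27
print(receptive_field(5, [1, 3, 9, 27, 1, 3, 9, 27]))  # 321 positions
```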
@@ -66,28 +81,28 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1), )
    attention = [True, False, False, False, True]
    force_monotonic_attention = [True, False, False, False, True]
    dec = Decoder(
        n_speakers,
        speaker_dim,
        embed_dim,
        mel_dim,
        r=r,
        max_positions=max_positions,
        padding_idx=padding_idx,
        preattention=prenet_convolutions,
        convolutions=attentive_convolutions,
        attention=attention,
        dropout=dropout,
        use_memory_mask=use_memory_mask,
        force_monotonic_attention=force_monotonic_attention,
        query_position_rate=query_position_rate,
        key_position_rate=key_position_rate,
        window_range=WindowRange(window_behind, window_ahead),
        key_projection=key_projection,
        value_projection=value_projection)
    if not trainable_positional_encodings:
        freeze(dec.embed_keys_positions)
        freeze(dec.embed_query_positions)
@@ -97,15 +112,15 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(2 * h, k, 1),
        ConvSpec(2 * h, k, 3), )
    cvt = Converter(
        n_speakers,
        speaker_dim,
        dec.state_dim if use_decoder_states else mel_dim,
        linear_dim,
        time_upsampling=downsample_factor,
        convolutions=postnet_convolutions,
        dropout=dropout)
    dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states)
    return dv3
@@ -115,8 +130,10 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db,
               ref_level_db, power, n_iter, win_length, hop_length,
               preemphasis):
    """generate waveform from text using a deepvoice 3 model"""
    text = np.array(
        en.text_to_sequence(
            text, p=replace_pronounciation_prob),
        dtype=np.int64)
    length = len(text)
    print("text sequence's length: {}".format(length))
    text_positions = np.arange(1, 1 + length)
@@ -145,10 +162,11 @@ def spec_to_waveform(spec, min_level_db, ref_level_db, power, n_iter,
    """
    denormalized = np.clip(spec, 0, 1) * (-min_level_db) + min_level_db
    lin_scaled = np.exp((denormalized + ref_level_db) / 20 * np.log(10))
    wav = librosa.griffinlim(
        lin_scaled**power,
        n_iter=n_iter,
        hop_length=hop_length,
        win_length=win_length)
    if preemphasis > 0:
        wav = signal.lfilter([1.], [1., -preemphasis], wav)
    return wav
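The first two lines of `spec_to_waveform` undo the dB normalization applied during preprocessing, and `np.exp(x / 20 * np.log(10))` is simply `10 ** (x / 20)`. A minimal round-trip check (assuming the typical config values `min_level_db = -100` and `ref_level_db = 20`):

```python
import numpy as np

min_level_db, ref_level_db = -100., 20.
S = np.array([0.1, 1.0, 10.0])                    # linear magnitudes
S_db = 20 * np.log10(S) - ref_level_db            # forward: to dB
S_norm = (S_db - min_level_db) / (-min_level_db)  # forward: to [0, 1]

denormalized = np.clip(S_norm, 0, 1) * (-min_level_db) + min_level_db
S_back = np.exp((denormalized + ref_level_db) / 20 * np.log(10))
assert np.allclose(S, S_back)
```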
@@ -225,28 +243,30 @@ def save_state(save_dir,
        plt.colorbar()
        plt.title("mel_input")
        plt.savefig(
            os.path.join(path, "target_mel_spec_step{:09d}.png".format(
                global_step)))
        plt.close()
        writer.add_image(
            "target/mel_spec",
            cm.viridis(mel_input),
            global_step,
            dataformats="HWC")

        plt.figure(figsize=(10, 3))
        display.specshow(mel_output)
        plt.colorbar()
        plt.title("mel_output")
        plt.savefig(
            os.path.join(path, "predicted_mel_spec_step{:09d}.png".format(
                global_step)))
        plt.close()
        writer.add_image(
            "predicted/mel_spec",
            cm.viridis(mel_output),
            global_step,
            dataformats="HWC")
    if lin_input is not None and lin_output is not None:
        lin_input = lin_input[0].numpy().T
@@ -258,28 +278,30 @@ def save_state(save_dir,
        plt.colorbar()
        plt.title("lin_input")
        plt.savefig(
            os.path.join(path, "target_lin_spec_step{:09d}.png".format(
                global_step)))
        plt.close()
        writer.add_image(
            "target/lin_spec",
            cm.viridis(lin_input),
            global_step,
            dataformats="HWC")
        plt.figure(figsize=(10, 3))
        display.specshow(lin_output)
        plt.colorbar()
        plt.title("lin_output")
        plt.savefig(
            os.path.join(path, "predicted_lin_spec_step{:09d}.png".format(
                global_step)))
        plt.close()
        writer.add_image(
            "predicted/lin_spec",
            cm.viridis(lin_output),
            global_step,
            dataformats="HWC")
    if alignments is not None and len(alignments.shape) == 4:
        path = os.path.join(save_dir, "alignments")
@@ -290,10 +312,11 @@ def save_state(save_dir,
                "train_attn_layer_{}_step_{}.png".format(idx, global_step))
            plot_alignment(attn_layer, save_path)
writer.add_image("train_attn/layer_{}".format(idx), writer.add_image(
cm.viridis(attn_layer), "train_attn/layer_{}".format(idx),
global_step, cm.viridis(attn_layer),
dataformats="HWC") global_step,
dataformats="HWC")
if lin_output is not None: if lin_output is not None:
wav = spec_to_waveform(lin_output, min_level_db, ref_level_db, power, wav = spec_to_waveform(lin_output, min_level_db, ref_level_db, power,
@@ -302,7 +325,5 @@ def save_state(save_dir,
        save_path = os.path.join(
            path, "train_sample_step_{:09d}.wav".format(global_step))
        sf.write(save_path, wav, sample_rate)
        writer.add_audio(
            "train_sample", wav, global_step, sample_rate=sample_rate)
@@ -57,7 +57,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--fastspeech_step``.
For more help on arguments:
``python train.py --help``.

## Synthesis
@@ -75,5 +75,5 @@ or you can run the script file directly.
sh synthesis.sh
```
For more help on arguments:
``python synthesis.py --help``.
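Combining the flags from the argument parser (`parse.py`, shown below in this commit), a concrete synthesis call might look like this sketch (the checkpoint step number is illustrative):

```bash
python synthesis.py --use_gpu=1 --checkpoint_path=./checkpoint --fastspeech_step=70000 --sample_path=./sample
```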
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse


def add_config_options_to_parser(parser):
    parser.add_argument(
        '--config_path',
        type=str,
        default='config/fastspeech.yaml',
        help="the yaml config file path.")
    parser.add_argument(
        '--batch_size', type=int, default=32, help="batch size for training.")
    parser.add_argument(
        '--epochs',
        type=int,
        default=10000,
        help="the number of epochs for training.")
    parser.add_argument(
        '--lr',
        type=float,
        default=0.001,
        help="the learning rate for training.")
    parser.add_argument(
        '--save_step',
        type=int,
        default=500,
        help="checkpointing interval during training.")
    parser.add_argument(
        '--fastspeech_step',
        type=int,
        default=70000,
        help="Global step to restore checkpoint of fastspeech.")
    parser.add_argument(
        '--use_gpu',
        type=int,
        default=1,
        help="use gpu or not during training.")
    parser.add_argument(
        '--use_data_parallel',
        type=int,
        default=0,
        help="use data parallel or not during training.")
    parser.add_argument(
        '--data_path',
        type=str,
        default='./dataset/LJSpeech-1.1',
        help="the path of dataset.")
    parser.add_argument(
        '--checkpoint_path',
        type=str,
        default=None,
        help="the path to load checkpoint or pretrained model.")
    parser.add_argument(
        '--save_path',
        type=str,
        default='./checkpoint',
        help="the path to save checkpoint.")
    parser.add_argument(
        '--log_dir',
        type=str,
        default='./log',
        help="the directory to save tensorboard log.")
    parser.add_argument(
        '--sample_path',
        type=str,
        default='./sample',
        help="the directory to save audio sample in synthesis.")
    parser.add_argument(
        '--transtts_path',
        type=str,
        default='./log',
        help="the directory to load pretrained transformerTTS model.")
    parser.add_argument(
        '--transformer_step',
        type=int,
        default=160000,
        help="the step to load transformerTTS model.")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from tensorboardX import SummaryWriter
from collections import OrderedDict
@@ -12,6 +25,7 @@ from parakeet.g2p.en import text_to_sequence
from parakeet import audio
from parakeet.models.fastspeech.fastspeech import FastSpeech


def load_checkpoint(step, model_path):
    model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
    new_state_dict = OrderedDict()
@@ -22,13 +36,14 @@ def load_checkpoint(step, model_path):
            new_state_dict[param] = model_dict[param]
    return new_state_dict


def synthesis(text_input, args):
    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())

    # tensorboard
    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'synthesis')

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)
@@ -37,24 +52,28 @@ def synthesis(text_input, args):
    with dg.guard(place):
        model = FastSpeech(cfg)
        model.set_dict(
            load_checkpoint(
                str(args.fastspeech_step),
                os.path.join(args.checkpoint_path, "fastspeech")))
        model.eval()
        text = np.asarray(text_to_sequence(text_input))
        text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
        pos_text = np.arange(1, text.shape[1] + 1)
        pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

        mel_output, mel_output_postnet = model(
            text, pos_text, alpha=args.alpha)
        _ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
@@ -67,14 +86,17 @@ def synthesis(text_input, args):
            do_trim_silence=False,
            sound_norm=False)

        mel_output_postnet = fluid.layers.transpose(
            fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
        wav = _ljspeech_processor.inv_melspectrogram(
            mel_output_postnet.numpy())
        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
        print("Synthesis completed !!!")
        writer.close()
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Synthesize with fastspeech model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()
    synthesis("Transformer model is so fast!", args)
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import os
@@ -20,8 +33,10 @@ import sys
sys.path.append("../transformer_tts")
from data import LJSpeechLoader
def load_checkpoint(step, model_path):
    model_dict, opti_dict = fluid.dygraph.load_dygraph(
        os.path.join(model_path, step))
    new_state_dict = OrderedDict()
    for param in model_dict:
        if param.startswith('_layers.'):
@@ -30,6 +45,7 @@ def load_checkpoint(step, model_path):
            new_state_dict[param] = model_dict[param]
    return new_state_dict, opti_dict
def main(args):
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
@@ -43,26 +59,33 @@ def main(args):
             if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'fastspeech')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        with fluid.unique_name.guard():
            transformerTTS = TransformerTTS(cfg)
            model_dict, _ = load_checkpoint(
                str(args.transformer_step),
                os.path.join(args.transtts_path, "transformer"))
            transformerTTS.set_dict(model_dict)
            transformerTTS.eval()

        model = FastSpeech(cfg)
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(1 / (
                cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
            parameter_list=model.parameters())
        reader = LJSpeechLoader(
            cfg, args, nranks, local_rank, shuffle=True).reader()

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.fastspeech_step),
                os.path.join(args.checkpoint_path, "fastspeech"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.fastspeech_step
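The expression `1 / (cfg['warm_up_step'] * (args.lr**2))` passed to `dg.NoamDecay` is its `d_model` argument, chosen so that the warm-up schedule peaks at exactly `args.lr`. A quick sketch of the arithmetic (assuming NoamDecay computes the standard Noam formula `d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)`):

```python
def noam_lr(step, lr=0.001, warmup=4000):
    d_model = 1 / (warmup * lr**2)  # mirrors the value passed above
    return d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)

print(noam_lr(4000))  # ~0.001: the peak equals lr at step == warmup
```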
@@ -76,31 +99,42 @@ def main(args):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data
                _, _, attn_probs, _, _, _ = transformerTTS(
                    character, mel_input, pos_text, pos_mel)
                alignment = dg.to_variable(
                    get_alignment(attn_probs, mel_lens,
                                  cfg['transformer_head'])).astype(np.float32)

                global_step += 1

                # Forward
                result = model(
                    character,
                    pos_text,
                    mel_pos=pos_mel,
                    length_target=alignment)
                mel_output, mel_output_postnet, duration_predictor_output, _, _ = result

                mel_loss = layers.mse_loss(mel_output, mel)
                mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
                duration_loss = layers.mean(
                    layers.abs(
                        layers.elementwise_sub(duration_predictor_output,
                                               alignment)))
                total_loss = mel_loss + mel_postnet_loss + duration_loss

                if local_rank == 0:
                    writer.add_scalar('mel_loss',
                                      mel_loss.numpy(), global_step)
                    writer.add_scalar('post_mel_loss',
                                      mel_postnet_loss.numpy(), global_step)
                    writer.add_scalar('duration_loss',
                                      duration_loss.numpy(), global_step)
                    writer.add_scalar('learning_rate',
                                      optimizer._learning_rate.step().numpy(),
                                      global_step)

                if args.use_data_parallel:
                    total_loss = model.scale_loss(total_loss)
@@ -108,21 +142,25 @@ def main(args):
                    model.apply_collective_grads()
                else:
                    total_loss.backward()
                optimizer.minimize(
                    total_loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                        cfg['grad_clip_thresh']))
                model.clear_gradients()
                # save checkpoint
                if local_rank == 0 and global_step % args.save_step == 0:
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    save_path = os.path.join(args.save_path,
                                             'fastspeech/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)
        if local_rank == 0:
            writer.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train Fastspeech model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()
...
@@ -50,7 +50,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--transformer_step``.
For more help on arguments:
``python train_transformer.py --help``.

## Train Vocoder
@@ -78,7 +78,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
```
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--vocoder_step``.
For more help on arguments:
``python train_vocoder.py --help``.

## Synthesis
@@ -101,5 +101,5 @@ sh synthesis.sh
And the audio file will be saved in ``--sample_path``.
For more help on arguments:
``python synthesis.py --help``.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import numpy as np
import pandas as pd
@@ -12,23 +25,43 @@ from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, SpecBatcher
from parakeet.data.dataset import DatasetMixin, TransformDataset
class LJSpeechLoader:
    def __init__(self,
                 config,
                 args,
                 nranks,
                 rank,
                 is_vocoder=False,
                 shuffle=True):
        place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace()

        LJSPEECH_ROOT = Path(args.data_path)
        metadata = LJSpeechMetaData(LJSPEECH_ROOT)
        transformer = LJSpeech(config)
        dataset = TransformDataset(metadata, transformer)
        sampler = DistributedSampler(
            len(metadata), nranks, rank, shuffle=shuffle)

        assert args.batch_size % nranks == 0
        each_bs = args.batch_size // nranks
        if is_vocoder:
            dataloader = DataCargo(
                dataset,
                sampler=sampler,
                batch_size=each_bs,
                shuffle=shuffle,
                batch_fn=batch_examples_vocoder,
                drop_last=True)
        else:
            dataloader = DataCargo(
                dataset,
                sampler=sampler,
                batch_size=each_bs,
                shuffle=shuffle,
                batch_fn=batch_examples,
                drop_last=True)

        self.reader = fluid.io.DataLoader.from_generator(
            capacity=32,
            iterable=True,
@@ -63,13 +96,13 @@ class LJSpeech(object):
        super(LJSpeech, self).__init__()
        self.config = config
        self._ljspeech_processor = audio.AudioProcessor(
            sample_rate=config['audio']['sr'],
            num_mels=config['audio']['num_mels'],
            min_level_db=config['audio']['min_level_db'],
            ref_level_db=config['audio']['ref_level_db'],
            n_fft=config['audio']['n_fft'],
            win_length=config['audio']['win_length'],
            hop_length=config['audio']['hop_length'],
            power=config['audio']['power'],
            preemphasis=config['audio']['preemphasis'],
            signal_norm=True,
@@ -81,7 +114,7 @@ class LJSpeech(object):
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

    def __call__(self, metadatum):
        """All the code for generating an Example from a metadatum. If you want a
        different preprocessing pipeline, you can override this method.
@@ -90,13 +123,15 @@ class LJSpeech(object):
        method.
        """
        fname, raw_text, normalized_text = metadatum

        # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
        wav = self._ljspeech_processor.load_wav(str(fname))
        mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32)
        mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32)
        phonemes = np.array(
            g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
        # maybe we need to implement it as a map in the future
        return (mag, mel, phonemes)
def batch_examples(batch):
@@ -109,44 +144,71 @@ def batch_examples(batch):
    pos_mels = []
    for data in batch:
        _, mel, text = data
        mel_inputs.append(
            np.concatenate(
                [np.zeros([mel.shape[0], 1], np.float32), mel[:, :-1]],
                axis=-1))
        mel_lens.append(mel.shape[1])
        text_lens.append(len(text))
        pos_texts.append(np.arange(1, len(text) + 1))
        pos_mels.append(np.arange(1, mel.shape[1] + 1))
        mels.append(mel)
        texts.append(text)
    # Sort by text_len in descending order
    texts = [
        i
        for i, _ in sorted(
            zip(texts, text_lens), key=lambda x: x[1], reverse=True)
    ]
    mels = [
        i
        for i, _ in sorted(
            zip(mels, text_lens), key=lambda x: x[1], reverse=True)
    ]
    mel_inputs = [
        i
        for i, _ in sorted(
            zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)
    ]
    mel_lens = [
        i
        for i, _ in sorted(
            zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
    ]
    pos_texts = [
        i
        for i, _ in sorted(
            zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)
    ]
    pos_mels = [
        i
        for i, _ in sorted(
            zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)
    ]
    text_lens = sorted(text_lens, reverse=True)
    # Pad sequence with largest len of the batch
    texts = TextIDBatcher(pad_id=0)(texts)  #(B, T)
    pos_texts = TextIDBatcher(pad_id=0)(pos_texts)  #(B, T)
    pos_mels = TextIDBatcher(pad_id=0)(pos_mels)  #(B, T)
    mels = np.transpose(
        SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))  #(B, T, num_mels)
    mel_inputs = np.transpose(
        SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1))  #(B, T, num_mels)
    return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens),
            np.array(mel_lens))
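The repeated sort-by-`text_lens` pattern above keeps every per-example list aligned while reordering the whole batch by descending text length; a toy illustration:

```python
texts = ['ab', 'defgh', 'c']
text_lens = [2, 5, 1]
texts = [
    i for i, _ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)
]
print(texts)  # ['defgh', 'ab', 'c']
```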
def batch_examples_vocoder(batch):
    mels = []
    mags = []
    for data in batch:
        mag, mel, _ = data
        mels.append(mel)
        mags.append(mag)
    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
    mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0, 2, 1))
    return (mels, mags)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse


def add_config_options_to_parser(parser):
    parser.add_argument(
        '--config_path',
        type=str,
        default='config/train_transformer.yaml',
        help="the yaml config file path.")
    parser.add_argument(
        '--batch_size', type=int, default=32, help="batch size for training.")
    parser.add_argument(
        '--epochs',
        type=int,
        default=10000,
        help="the number of epochs for training.")
    parser.add_argument(
        '--lr',
        type=float,
        default=0.001,
        help="the learning rate for training.")
    parser.add_argument(
        '--save_step',
        type=int,
        default=500,
        help="checkpointing interval during training.")
    parser.add_argument(
        '--image_step',
        type=int,
        default=2000,
        help="attention image interval during training.")
    parser.add_argument(
        '--max_len',
        type=int,
        default=400,
        help="The max length of audio when synthesis.")
    parser.add_argument(
        '--transformer_step',
        type=int,
        default=160000,
        help="Global step to restore checkpoint of transformer.")
    parser.add_argument(
        '--vocoder_step',
        type=int,
        default=90000,
        help="Global step to restore checkpoint of postnet.")
    parser.add_argument(
        '--use_gpu',
        type=int,
        default=1,
        help="use gpu or not during training.")
    parser.add_argument(
        '--use_data_parallel',
        type=int,
        default=0,
        help="use data parallel or not during training.")
    parser.add_argument(
        '--stop_token',
        type=int,
        default=0,
        help="use stop token loss in network or not.")
    parser.add_argument(
        '--data_path',
        type=str,
        default='./dataset/LJSpeech-1.1',
        help="the path of dataset.")
    parser.add_argument(
        '--checkpoint_path',
        type=str,
        default=None,
        help="the path to load checkpoint or pretrained model.")
    parser.add_argument(
        '--save_path',
        type=str,
        default='./checkpoint',
        help="the path to save checkpoint.")
    parser.add_argument(
        '--log_dir',
        type=str,
        default='./log',
        help="the directory to save tensorboard log.")
    parser.add_argument(
        '--sample_path',
        type=str,
        default='./sample',
        help="the directory to save audio sample in synthesis.")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence
@@ -16,6 +29,7 @@ from parakeet import audio
from parakeet.models.transformer_tts.vocoder import Vocoder
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS


def load_checkpoint(step, model_path):
    model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
    new_state_dict = OrderedDict()
@@ -26,6 +40,7 @@ def load_checkpoint(step, model_path):
            new_state_dict[param] = model_dict[param]
    return new_state_dict


def synthesis(text_input, args):
    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
@@ -34,46 +49,53 @@ def synthesis(text_input, args):
    # tensorboard
    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'synthesis')

    writer = SummaryWriter(path)

    with dg.guard(place):
        with fluid.unique_name.guard():
            model = TransformerTTS(cfg)
            model.set_dict(
                load_checkpoint(
                    str(args.transformer_step),
                    os.path.join(args.checkpoint_path, "transformer")))
            model.eval()
        with fluid.unique_name.guard():
            model_vocoder = Vocoder(cfg, args.batch_size)
            model_vocoder.set_dict(
                load_checkpoint(
                    str(args.vocoder_step),
                    os.path.join(args.checkpoint_path, "vocoder")))
            model_vocoder.eval()
# init input # init input
text = np.asarray(text_to_sequence(text_input)) text = np.asarray(text_to_sequence(text_input))
text = fluid.layers.unsqueeze(dg.to_variable(text),[0]) text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
mel_input = dg.to_variable(np.zeros([1,1,80])).astype(np.float32) mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
pos_text = np.arange(1, text.shape[1]+1) pos_text = np.arange(1, text.shape[1] + 1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
pbar = tqdm(range(args.max_len)) pbar = tqdm(range(args.max_len))
for i in pbar: for i in pbar:
pos_mel = np.arange(1, mel_input.shape[1]+1) pos_mel = np.arange(1, mel_input.shape[1] + 1)
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel),[0]) pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(text, mel_input, pos_text, pos_mel) mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1) text, mel_input, pos_text, pos_mel)
mel_input = fluid.layers.concat(
[mel_input, postnet_pred[:, -1:, :]], axis=1)
mag_pred = model_vocoder(postnet_pred) mag_pred = model_vocoder(postnet_pred)
_ljspeech_processor = audio.AudioProcessor( _ljspeech_processor = audio.AudioProcessor(
sample_rate=cfg['audio']['sr'], sample_rate=cfg['audio']['sr'],
num_mels=cfg['audio']['num_mels'], num_mels=cfg['audio']['num_mels'],
min_level_db=cfg['audio']['min_level_db'], min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg['audio']['ref_level_db'], ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg['audio']['n_fft'], n_fft=cfg['audio']['n_fft'],
win_length= cfg['audio']['win_length'], win_length=cfg['audio']['win_length'],
hop_length= cfg['audio']['hop_length'], hop_length=cfg['audio']['hop_length'],
power=cfg['audio']['power'], power=cfg['audio']['power'],
preemphasis=cfg['audio']['preemphasis'], preemphasis=cfg['audio']['preemphasis'],
signal_norm=True, signal_norm=True,
...@@ -86,13 +108,18 @@ def synthesis(text_input, args): ...@@ -86,13 +108,18 @@ def synthesis(text_input, args):
do_trim_silence=False, do_trim_silence=False,
sound_norm=False) sound_norm=False)
wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy()) wav = _ljspeech_processor.inv_spectrogram(
fluid.layers.transpose(
fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
writer.add_audio(text_input, wav, 0, cfg['audio']['sr']) writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
if not os.path.exists(args.sample_path): if not os.path.exists(args.sample_path):
os.mkdir(args.sample_path) os.mkdir(args.sample_path)
write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav) write(
os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'],
wav)
writer.close() writer.close()
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Synthesis model") parser = argparse.ArgumentParser(description="Synthesis model")
add_config_options_to_parser(parser) add_config_options_to_parser(parser)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from tqdm import tqdm
from tensorboardX import SummaryWriter
...@@ -16,8 +29,10 @@ from parakeet.models.transformer_tts.utils import cross_entropy
from data import LJSpeechLoader
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS


def load_checkpoint(step, model_path):
    model_dict, opti_dict = fluid.dygraph.load_dygraph(
        os.path.join(model_path, step))
    new_state_dict = OrderedDict()
    for param in model_dict:
        if param.startswith('_layers.'):
...@@ -40,22 +55,27 @@ def main(args):
             if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'transformer')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        model = TransformerTTS(cfg)
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(1 / (
                cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
            parameter_list=model.parameters())

        reader = LJSpeechLoader(
            cfg, args, nranks, local_rank, shuffle=True).reader()

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.transformer_step),
                os.path.join(args.checkpoint_path, "transformer"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.transformer_step
...@@ -64,86 +84,112 @@ def main(args):
        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        for epoch in range(args.epochs):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                character, mel, mel_input, pos_text, pos_mel, text_length, _ = data

                global_step += 1

                mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                    character, mel_input, pos_text, pos_mel)

                label = (pos_mel == 0).astype(np.float32)

                mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(mel_pred, mel)))
                post_mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                loss = mel_loss + post_mel_loss
                # Note: training did not converge when the stop token loss was used.
                if args.stop_token:
                    stop_loss = cross_entropy(stop_preds, label)
                    loss = loss + stop_loss

                if local_rank == 0:
                    writer.add_scalars('training_loss', {
                        'mel_loss': mel_loss.numpy(),
                        'post_mel_loss': post_mel_loss.numpy()
                    }, global_step)

                    if args.stop_token:
                        writer.add_scalar('stop_loss',
                                          stop_loss.numpy(), global_step)

                    if args.use_data_parallel:
                        writer.add_scalars('alphas', {
                            'encoder_alpha':
                            model._layers.encoder.alpha.numpy(),
                            'decoder_alpha':
                            model._layers.decoder.alpha.numpy(),
                        }, global_step)
                    else:
                        writer.add_scalars('alphas', {
                            'encoder_alpha': model.encoder.alpha.numpy(),
                            'decoder_alpha': model.decoder.alpha.numpy(),
                        }, global_step)

                    writer.add_scalar('learning_rate',
                                      optimizer._learning_rate.step().numpy(),
                                      global_step)

                    if global_step % args.image_step == 1:
                        for i, prob in enumerate(attn_probs):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[j * 16]) * 255)
                                writer.add_image(
                                    'Attention_%d_0' % global_step,
                                    x,
                                    i * 4 + j,
                                    dataformats="HWC")

                        for i, prob in enumerate(attn_enc):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[j * 16]) * 255)
                                writer.add_image(
                                    'Attention_enc_%d_0' % global_step,
                                    x,
                                    i * 4 + j,
                                    dataformats="HWC")

                        for i, prob in enumerate(attn_dec):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[j * 16]) * 255)
                                writer.add_image(
                                    'Attention_dec_%d_0' % global_step,
                                    x,
                                    i * 4 + j,
                                    dataformats="HWC")

                if args.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
                optimizer.minimize(
                    loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
                        'grad_clip_thresh']))
                model.clear_gradients()

                # save checkpoint
                if local_rank == 0 and global_step % args.save_step == 0:
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    save_path = os.path.join(args.save_path,
                                             'transformer/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train TransformerTTS model")
    add_config_options_to_parser(parser)
...
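Both training scripts build their optimizer with `dg.NoamDecay(1 / (cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step'])`. Assuming Paddle's `NoamDecay(d_model, warmup_steps)` follows the usual Noam schedule, `d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)`, this particular choice of `d_model` makes the learning rate ramp up linearly and peak at exactly `args.lr` when `step == warm_up_step`. A minimal sketch of that arithmetic (the default values below are made up for illustration):

```python
def noam_lr(step, lr=0.001, warmup=4000):
    """Noam schedule with d_model chosen as 1 / (warmup * lr**2)."""
    d_model = 1 / (warmup * lr**2)
    return d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)


for step in [1, 2000, 4000, 16000]:
    # rises linearly to lr at step 4000, then decays as step**-0.5
    print(step, noam_lr(step))
```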
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tensorboardX import SummaryWriter
import os
from tqdm import tqdm
...@@ -13,6 +26,7 @@ import paddle.fluid.layers as layers
from data import LJSpeechLoader
from parakeet.models.transformer_tts.vocoder import Vocoder


def load_checkpoint(step, model_path):
    model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
    new_state_dict = OrderedDict()
...@@ -23,8 +37,9 @@ def load_checkpoint(step, model_path):
            new_state_dict[param] = model_dict[param]
    return new_state_dict, opti_dict


def main(args):
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
...@@ -35,23 +50,26 @@ def main(args):
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else fluid.CUDAPlace(0)
             if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'vocoder')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        model = Vocoder(cfg, args.batch_size)
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(1 / (
                cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
            parameter_list=model.parameters())

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.vocoder_step),
                os.path.join(args.checkpoint_path, "vocoder"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.vocoder_step
...@@ -61,48 +79,55 @@ def main(args):
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        reader = LJSpeechLoader(
            cfg, args, nranks, local_rank, is_vocoder=True).reader()

        for epoch in range(args.epochs):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                mel, mag = data
                mag = dg.to_variable(mag.numpy())
                mel = dg.to_variable(mel.numpy())
                global_step += 1

                mag_pred = model(mel)
                loss = layers.mean(
                    layers.abs(layers.elementwise_sub(mag_pred, mag)))

                if args.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
                optimizer.minimize(
                    loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
                        'grad_clip_thresh']))
                model.clear_gradients()

                if local_rank == 0:
                    writer.add_scalars('training_loss', {
                        'loss': loss.numpy(),
                    }, global_step)

                    if global_step % args.save_step == 0:
                        if not os.path.exists(args.save_path):
                            os.mkdir(args.save_path)
                        save_path = os.path.join(args.save_path,
                                                 'vocoder/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train vocoder model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()
    # Print the whole config setting.
    pprint(args)
    main(args)
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from pprint import pprint
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from pprint import pprint
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import subprocess
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import time
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.0.0" __version__ = "0.0.0"
from . import data, g2p, models, modules from . import data, g2p, models, modules
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import AudioProcessor
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import soundfile as sf
import numpy as np
import scipy.io
import scipy.signal


class AudioProcessor(object):
    def __init__(
            self,
            sample_rate=None,  # int, sampling rate
            num_mels=None,  # int, bands of mel spectrogram
            min_level_db=None,  # float, minimum level db
            ref_level_db=None,  # float, reference level db
            n_fft=None,  # int: number of samples in a frame for stft
            win_length=None,  # int: the same meaning with n_fft
            hop_length=None,  # int: number of samples between neighboring frames
            power=None,  # float: power to raise before griffin-lim
            preemphasis=None,  # float: preemphasis coefficient
            signal_norm=None,  #
            symmetric_norm=False,  # bool, apply clip norm in [-max_norm, max_norm]
            max_norm=None,  # float, max norm
            mel_fmin=None,  # int: mel spectrogram's minimum frequency
            mel_fmax=None,  # int: mel spectrogram's maximum frequency
            clip_norm=True,  # bool: clip spectrogram's norm
            griffin_lim_iters=None,  # int:
            do_trim_silence=False,  # bool: trim silence
            sound_norm=False,
            **kwargs):
        self.sample_rate = sample_rate
        self.num_mels = num_mels
        self.min_level_db = min_level_db
...@@ -34,8 +50,8 @@ class AudioProcessor(object):
        self.n_fft = n_fft
        self.win_length = win_length or n_fft
        # hop length defaults to 1/4 window_length
        self.hop_length = hop_length or 0.25 * self.win_length

        self.power = power
        self.preemphasis = float(preemphasis)
...@@ -52,7 +68,8 @@ class AudioProcessor(object):
        self.do_trim_silence = do_trim_silence

        self.sound_norm = sound_norm
        self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters(
        )

    def _stft_parameters(self):
        """compute frame length and hop length in ms"""
...@@ -65,44 +82,54 @@ class AudioProcessor(object):
        """object repr"""
        cls_name_str = self.__class__.__name__
        members = vars(self)
        dict_str = "\n".join(
            [" {}: {},".format(k, v) for k, v in members.items()])
        repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
        return repr_str

    def save_wav(self, path, wav):
        """save audio with scipy.io.wavfile in 16bit integers"""
        wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
        scipy.io.wavfile.write(path, self.sample_rate,
                               wav_norm.astype(np.int16))

    def load_wav(self, path, sr=None):
        """load wav -> trim_silence -> rescale"""
        x, sr = librosa.load(path, sr=None)
        assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(
            sr, self.sample_rate)
        if self.do_trim_silence:
            try:
                x = self.trim_silence(x)
            except ValueError:
                print(" [!] File cannot be trimmed for silence - {}".format(
                    path))
        if self.sound_norm:
            x = x / x.max() * 0.9  # why 0.9 ?
        return x

    def trim_silence(self, wav):
        """Trim silent parts with a threshold and a 0.01s margin"""
        margin = int(self.sample_rate * 0.01)
        wav = wav[margin:-margin]
        trimed_wav = librosa.effects.trim(
            wav,
            top_db=60,
            frame_length=self.win_length,
            hop_length=self.hop_length)[0]
        return trimed_wav

    def apply_preemphasis(self, x):
        if self.preemphasis == 0.:
            raise RuntimeError(
                " !! Preemphasis coefficient should be positive. ")
        return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)

    def apply_inv_preemphasis(self, x):
        if self.preemphasis == 0.:
            raise RuntimeError(
                " !! Preemphasis coefficient should be positive. ")
        return scipy.signal.lfilter([1.], [1., -self.preemphasis], x)

    def _amplitude_to_db(self, x):
...@@ -125,12 +152,11 @@ class AudioProcessor(object):
        """return mel basis for mel scale"""
        if self.mel_fmax is not None:
            assert self.mel_fmax <= self.sample_rate // 2
        return librosa.filters.mel(self.sample_rate,
                                   self.n_fft,
                                   n_mels=self.num_mels,
                                   fmin=self.mel_fmin,
                                   fmax=self.mel_fmax)

    def _normalize(self, S):
        """put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
...@@ -156,25 +182,29 @@ class AudioProcessor(object):
            if self.symmetric_norm:
                if self.clip_norm:
                    S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
                S_denorm = (S_denorm + self.max_norm) * (
                    -self.min_level_db) / (2 * self.max_norm
                                           ) + self.min_level_db
                return S_denorm
            else:
                if self.clip_norm:
                    S_denorm = np.clip(S_denorm, 0, self.max_norm)
                S_denorm = S_denorm * (-self.min_level_db
                                       ) / self.max_norm + self.min_level_db
                return S_denorm
        else:
            return S

    def _stft(self, y):
        return librosa.stft(
            y=y,
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length)

    def _istft(self, S):
        return librosa.istft(
            S, hop_length=self.hop_length, win_length=self.win_length)

    def spectrogram(self, y):
        """compute linear spectrogram (amplitude)
...@@ -195,7 +225,8 @@ class AudioProcessor(object):
            D = self._stft(self.apply_preemphasis(y))
        else:
            D = self._stft(y)
        S = self._amplitude_to_db(self._linear_to_mel(np.abs(
            D))) - self.ref_level_db
        return self._normalize(S)

    def inv_spectrogram(self, spectrogram):
...@@ -203,16 +234,16 @@ class AudioProcessor(object):
        S = self._denormalize(spectrogram)
        S = self._db_to_amplitude(S + self.ref_level_db)
        if self.preemphasis:
            return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
        return self._griffin_lim(S**self.power)

    def inv_melspectrogram(self, mel_spectrogram):
        S = self._denormalize(mel_spectrogram)
        S = self._db_to_amplitude(S + self.ref_level_db)
        S = self._mel_to_linear(np.abs(S))
        if self.preemphasis:
            return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
        return self._griffin_lim(S**self.power)

    def out_linear_to_mel(self, linear_spec):
        """convert output linear spec to mel spec"""
...@@ -222,7 +253,7 @@ class AudioProcessor(object):
        S = self._amplitude_to_db(S) - self.ref_level_db
        mel = self._normalize(S)
        return mel

    def _griffin_lim(self, S):
        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
        S_complex = np.abs(S).astype(np.complex)
...@@ -234,18 +265,18 @@ class AudioProcessor(object):
    @staticmethod
    def mulaw_encode(wav, qc):
        mu = 2**qc - 1
        # wav_abs = np.minimum(np.abs(wav), 1.0)
        signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
        # Quantize signal to the specified number of levels.
        signal = (signal + 1) / 2 * mu + 0.5
        return np.floor(signal)

    @staticmethod
    def mulaw_decode(wav, qc):
        """Recovers waveform from quantized values."""
        mu = 2**qc - 1
        x = np.sign(wav) / mu * ((1 + mu)**np.abs(wav) - 1)
        return x

    @staticmethod
...
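The static `mulaw_encode`/`mulaw_decode` pair above implements standard mu-law companding. A standalone sketch of the round trip with `qc=8` (the rescaling from quantization levels back to [-1, 1] before expansion is an assumption about how the decoder is meant to be fed; it is not shown in this diff):

```python
import numpy as np

qc = 8
mu = 2**qc - 1  # 255 quantization levels

x = np.array([-0.5, -0.1, 0.0, 0.1, 0.5])  # made-up samples in [-1, 1]

# compress: log companding, then map [-1, 1] onto integer levels {0, ..., mu}
signal = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1. + mu)
quantized = np.floor((signal + 1) / 2 * mu + 0.5)

# expand: map levels back to [-1, 1], then invert the companding
y = 2 * quantized / mu - 1
recovered = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)

print(np.max(np.abs(recovered - x)))  # quantization error on the order of 1e-2
```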
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .dataset import *
from .datacargo import *
from .sampler import *
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" """
functions to make batch for arrays which satisfy some conditions. functions to make batch for arrays which satisfy some conditions.
""" """
import numpy as np import numpy as np
class TextIDBatcher(object): class TextIDBatcher(object):
"""A wrapper class for a function to build a functor, which holds the configs to pass to the function.""" """A wrapper class for a function to build a functor, which holds the configs to pass to the function."""
def __init__(self, pad_id=0, dtype=np.int64): def __init__(self, pad_id=0, dtype=np.int64):
self.pad_id = pad_id self.pad_id = pad_id
self.dtype = dtype self.dtype = dtype
def __call__(self, minibatch): def __call__(self, minibatch):
out = batch_text_id(minibatch, pad_id=self.pad_id, dtype=self.dtype) out = batch_text_id(minibatch, pad_id=self.pad_id, dtype=self.dtype)
return out return out
def batch_text_id(minibatch, pad_id=0, dtype=np.int64): def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
""" """
minibatch: List[Example] minibatch: List[Example]
...@@ -20,26 +36,32 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64): ...@@ -20,26 +36,32 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
""" """
peek_example = minibatch[0] peek_example = minibatch[0]
assert len(peek_example.shape) == 1, "text example is an 1D tensor" assert len(peek_example.shape) == 1, "text example is an 1D tensor"
lengths = [example.shape[0] for example in minibatch] # assume (channel, n_samples) or (n_samples, ) lengths = [example.shape[0] for example in minibatch
] # assume (channel, n_samples) or (n_samples, )
max_len = np.max(lengths) max_len = np.max(lengths)
batch = [] batch = []
for example in minibatch: for example in minibatch:
pad_len = max_len - example.shape[0] pad_len = max_len - example.shape[0]
batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_id)) batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_id))
return np.array(batch, dtype=dtype) return np.array(batch, dtype=dtype)
class WavBatcher(object): class WavBatcher(object):
def __init__(self, pad_value=0., dtype=np.float32): def __init__(self, pad_value=0., dtype=np.float32):
self.pad_value = pad_value self.pad_value = pad_value
self.dtype = dtype self.dtype = dtype
def __call__(self, minibatch): def __call__(self, minibatch):
out = batch_wav(minibatch, pad_value=self.pad_value, dtype=self.dtype) out = batch_wav(minibatch, pad_value=self.pad_value, dtype=self.dtype)
return out return out
def batch_wav(minibatch, pad_value=0., dtype=np.float32): def batch_wav(minibatch, pad_value=0., dtype=np.float32):
""" """
minibatch: List[Example] minibatch: List[Example]
...@@ -51,18 +73,25 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32): ...@@ -51,18 +73,25 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
mono_channel = True mono_channel = True
elif len(peek_example.shape) == 2: elif len(peek_example.shape) == 2:
mono_channel = False mono_channel = False
lengths = [example.shape[-1] for example in minibatch] # assume (channel, n_samples) or (n_samples, ) lengths = [example.shape[-1] for example in minibatch
] # assume (channel, n_samples) or (n_samples, )
max_len = np.max(lengths) max_len = np.max(lengths)
batch = [] batch = []
for example in minibatch: for example in minibatch:
pad_len = max_len - example.shape[-1] pad_len = max_len - example.shape[-1]
if mono_channel: if mono_channel:
batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_value)) batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_value))
else: else:
batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no batch.append(
np.pad(example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value)) # what about PCM, no
return np.array(batch, dtype=dtype) return np.array(batch, dtype=dtype)
...@@ -75,6 +104,7 @@ class SpecBatcher(object): ...@@ -75,6 +104,7 @@ class SpecBatcher(object):
out = batch_spec(minibatch, pad_value=self.pad_value, dtype=self.dtype) out = batch_spec(minibatch, pad_value=self.pad_value, dtype=self.dtype)
return out return out
def batch_spec(minibatch, pad_value=0., dtype=np.float32): def batch_spec(minibatch, pad_value=0., dtype=np.float32):
""" """
minibatch: List[Example] minibatch: List[Example]
...@@ -86,16 +116,23 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32): ...@@ -86,16 +116,23 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32):
mono_channel = True mono_channel = True
elif len(peek_example.shape) == 3: elif len(peek_example.shape) == 3:
mono_channel = False mono_channel = False
lengths = [example.shape[-1] for example in minibatch] # assume (channel, F, n_frame) or (F, n_frame) lengths = [example.shape[-1] for example in minibatch
max_len = np.max(lengths) ] # assume (channel, F, n_frame) or (F, n_frame)
max_len = np.max(lengths)
batch = [] batch = []
for example in minibatch: for example in minibatch:
pad_len = max_len - example.shape[-1] pad_len = max_len - example.shape[-1]
if mono_channel: if mono_channel:
batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) batch.append(
np.pad(example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value))
else: else:
batch.append(np.pad(example, [(0, 0), (0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no batch.append(
np.pad(example, [(0, 0), (0, 0), (0, pad_len)],
return np.array(batch, dtype=dtype) mode='constant',
\ No newline at end of file constant_values=pad_value)) # what about PCM, no
return np.array(batch, dtype=dtype)
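All three batchers share one rule: right-pad every example along its last axis to the length of the longest example in the minibatch. A minimal standalone sketch of that rule for spectrogram-shaped inputs (the array shapes are made up for illustration):

```python
import numpy as np

# Two fake (F, n_frame) spectrograms with different numbers of frames.
specs = [np.ones((4, 3)), np.ones((4, 5))]

max_len = max(s.shape[-1] for s in specs)
batch = np.array([
    np.pad(s, [(0, 0), (0, max_len - s.shape[-1])],
           mode='constant',
           constant_values=0.) for s in specs
])
print(batch.shape)  # (2, 4, 5): the shorter example was padded by two frames
```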
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
from .sampler import SequentialSampler, RandomSampler, BatchSampler
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import numpy as np
...@@ -9,8 +23,7 @@ class DatasetMixin(object):
        if isinstance(index, slice):
            start, stop, step = index.indices(len(self))
            return [
                self.get_example(i) for i in six.moves.range(start, stop, step)
            ]
        elif isinstance(index, (list, np.ndarray)):
            return [self.get_example(i) for i in index]
...@@ -180,8 +193,7 @@ class ChainDataset(DatasetMixin):
    def get_example(self, i):
        if i < 0:
            raise IndexError("ChainDataset does not support negative indexing.")
        for dataset in self._datasets:
            if i < len(dataset):
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" """
At most cases, we have non-stream dataset, which means we can random access it with __getitem__, and we can get the length of the dataset with __len__. At most cases, we have non-stream dataset, which means we can random access it with __getitem__, and we can get the length of the dataset with __len__.
...@@ -6,10 +19,10 @@ This suffices for a sampler. We implemente sampler as iterable of valid indices. ...@@ -6,10 +19,10 @@ This suffices for a sampler. We implemente sampler as iterable of valid indices.
So the sampler is only responsible for generating valid indices. So the sampler is only responsible for generating valid indices.
""" """
import numpy as np import numpy as np
import random import random
class Sampler(object): class Sampler(object):
def __init__(self, data_source): def __init__(self, data_source):
pass pass
...@@ -23,7 +36,7 @@ class Sampler(object): ...@@ -23,7 +36,7 @@ class Sampler(object):
class SequentialSampler(Sampler): class SequentialSampler(Sampler):
def __init__(self, data_source): def __init__(self, data_source):
self.data_source = data_source self.data_source = data_source
def __iter__(self): def __iter__(self):
return iter(range(len(self.data_source))) return iter(range(len(self.data_source)))
...@@ -42,12 +55,14 @@ class RandomSampler(Sampler): ...@@ -42,12 +55,14 @@ class RandomSampler(Sampler):
"replacement={}".format(self.replacement)) "replacement={}".format(self.replacement))
if self._num_samples is not None and not replacement: if self._num_samples is not None and not replacement:
raise ValueError("With replacement=False, num_samples should not be specified, " raise ValueError(
"since a random permutation will be performed.") "With replacement=False, num_samples should not be specified, "
"since a random permutation will be performed.")
if not isinstance(self.num_samples, int) or self.num_samples <= 0: if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError("num_samples should be a positive integer " raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(self.num_samples)) "value, but got num_samples={}".format(
self.num_samples))
@property @property
def num_samples(self): def num_samples(self):
...@@ -59,7 +74,9 @@ class RandomSampler(Sampler): ...@@ -59,7 +74,9 @@ class RandomSampler(Sampler):
def __iter__(self): def __iter__(self):
n = len(self.data_source) n = len(self.data_source)
if self.replacement: if self.replacement:
return iter(np.random.randint(0, n, size=(self.num_samples,), dtype=np.int64).tolist()) return iter(
np.random.randint(
0, n, size=(self.num_samples, ), dtype=np.int64).tolist())
return iter(np.random.permutation(n).tolist()) return iter(np.random.permutation(n).tolist())
def __len__(self): def __len__(self):
...@@ -76,7 +93,8 @@ class SubsetRandomSampler(Sampler): ...@@ -76,7 +93,8 @@ class SubsetRandomSampler(Sampler):
self.indices = indices self.indices = indices
def __iter__(self): def __iter__(self):
return (self.indices[i] for i in np.random.permutation(len(self.indices))) return (self.indices[i]
for i in np.random.permutation(len(self.indices)))
def __len__(self): def __len__(self):
return len(self.indices) return len(self.indices)
...@@ -89,9 +107,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler): ...@@ -89,9 +107,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
3. Permutate mini-batchs 3. Permutate mini-batchs
""" """
def __init__(self, lengths, batch_size=4, batch_group_size=None, def __init__(self,
lengths,
batch_size=4,
batch_group_size=None,
permutate=True): permutate=True):
_lengths = np.array(lengths, dtype=np.int64) # maybe better implement length as a sort key _lengths = np.array(
lengths,
dtype=np.int64) # maybe better implement length as a sort key
self.lengths = np.sort(_lengths) self.lengths = np.sort(_lengths)
self.sorted_indices = np.argsort(_lengths) self.sorted_indices = np.argsort(_lengths)
...@@ -112,20 +135,21 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler): ...@@ -112,20 +135,21 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
for i in range(len(indices) // batch_group_size): for i in range(len(indices) // batch_group_size):
s = i * batch_group_size s = i * batch_group_size
e = s + batch_group_size e = s + batch_group_size
random.shuffle(indices[s: e]) # inplace random.shuffle(indices[s:e]) # inplace
# Permutate batches # Permutate batches
if self.permutate: if self.permutate:
perm = np.arange(len(indices[:e]) // self.batch_size) perm = np.arange(len(indices[:e]) // self.batch_size)
random.shuffle(perm) random.shuffle(perm)
indices[:e] = indices[:e].reshape(-1, self.batch_size)[perm, :].reshape(-1) indices[:e] = indices[:e].reshape(
-1, self.batch_size)[perm, :].reshape(-1)
# Handle last elements # Handle last elements
s += batch_group_size s += batch_group_size
#print(indices) #print(indices)
if s < len(indices): if s < len(indices):
random.shuffle(indices[s:]) random.shuffle(indices[s:])
return iter(indices) return iter(indices)
def __len__(self): def __len__(self):
...@@ -150,14 +174,19 @@ class WeightedRandomSampler(Sampler): ...@@ -150,14 +174,19 @@ class WeightedRandomSampler(Sampler):
def __init__(self, weights, num_samples, replacement): def __init__(self, weights, num_samples, replacement):
if not isinstance(num_samples, int) or num_samples <= 0: if not isinstance(num_samples, int) or num_samples <= 0:
raise ValueError("num_samples should be a positive integer " raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(num_samples)) "value, but got num_samples={}".format(
num_samples))
self.weights = np.array(weights, dtype=np.float64) self.weights = np.array(weights, dtype=np.float64)
self.num_samples = num_samples self.num_samples = num_samples
self.replacement = replacement self.replacement = replacement
def __iter__(self): def __iter__(self):
return iter(np.random.choice(len(self.weights), size=(self.num_samples, ), return iter(
replace=self.replacement, p=self.weights).tolist()) np.random.choice(
len(self.weights),
size=(self.num_samples, ),
replace=self.replacement,
p=self.weights).tolist())
def __len__(self): def __len__(self):
return self.num_samples return self.num_samples
...@@ -184,7 +213,7 @@ class DistributedSampler(Sampler): ...@@ -184,7 +213,7 @@ class DistributedSampler(Sampler):
# Subset samples for each trainer. # Subset samples for each trainer.
indices = indices[self.rank:self.total_size:self.num_trainers] indices = indices[self.rank:self.total_size:self.num_trainers]
assert len(indices) == self.num_samples assert len(indices) == self.num_samples
return iter(indices) return iter(indices)
...@@ -209,8 +238,7 @@ class BatchSampler(Sampler): ...@@ -209,8 +238,7 @@ class BatchSampler(Sampler):
def __init__(self, sampler, batch_size, drop_last): def __init__(self, sampler, batch_size, drop_last):
if not isinstance(sampler, Sampler): if not isinstance(sampler, Sampler):
raise ValueError("sampler should be an instance of " raise ValueError("sampler should be an instance of "
"Sampler, but got sampler={}" "Sampler, but got sampler={}".format(sampler))
.format(sampler))
if not isinstance(batch_size, int) or batch_size <= 0: if not isinstance(batch_size, int) or batch_size <= 0:
raise ValueError("batch_size should be a positive integer value, " raise ValueError("batch_size should be a positive integer value, "
"but got batch_size={}".format(batch_size)) "but got batch_size={}".format(batch_size))
......
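The module docstring above defines the sampler contract loosely: a sampler is just an iterable of valid indices into a random-access dataset. Under that contract, a custom sampler needs little more than `__iter__` (and usually `__len__`). A minimal sketch using the `Sampler` base class defined above, with an invented selection rule (every other example):

```python
class EvenIndexSampler(Sampler):
    """Hypothetical sampler: yields only the even indices of the dataset."""

    def __init__(self, data_source):
        self.data_source = data_source

    def __iter__(self):
        # generate valid indices 0, 2, 4, ... into the data source
        return iter(range(0, len(self.data_source), 2))

    def __len__(self):
        return (len(self.data_source) + 1) // 2


# list(iter(EvenIndexSampler(list(range(7)))))  -> [0, 2, 4, 6]
```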
...@@ -14,9 +14,4 @@ One of the reasons we choose to load data lazily (only load metadata beforehand
For deep learning practice, we typically batch examples. So the dataset should come with a method to batch examples. Assume each record is implemented as a tuple with several items. When an item is represented as a fixed-size array, batching it is trivial: `np.stack` suffices. But for arrays with dynamic sizes, padding is needed, so we implement a batching method for each kind of item; batching a whole record is then composed from these methods. A dataset should implement a `_batch_examples` method, but in most cases you can simply choose one from `batching.py`, as the sketch below illustrates.

That is it!
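To make the `np.stack`-versus-padding distinction concrete, here is a hypothetical `_batch_examples` for a dataset whose records are `(speaker_id, phoneme_ids)` tuples (the record layout and names are invented for illustration):

```python
import numpy as np


def _batch_examples(minibatch, pad_id=0):
    # fixed-size item: one scalar speaker id per record, so np.stack suffices
    speakers = np.stack([example[0] for example in minibatch])

    # dynamic-size item: variable-length phoneme id sequences need padding
    phonemes = [example[1] for example in minibatch]
    max_len = max(p.shape[0] for p in phonemes)
    phonemes = np.array([
        np.pad(p, [(0, max_len - p.shape[0])],
               mode='constant',
               constant_values=pad_id) for p in phonemes
    ])
    return speakers, phonemes
```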
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import numpy as np
import pandas as pd
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import pandas as pd
from ruamel.yaml import YAML
...@@ -11,23 +25,25 @@ from parakeet.data.dataset import Dataset
from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, WavBatcher
class VCTK(Dataset):
    def __init__(self, root):
        assert isinstance(root, (
            str, Path)), "root should be a string or Path object"
        self.root = root if isinstance(root, Path) else Path(root)
        self.text_root = self.root.joinpath("txt")
        self.wav_root = self.root.joinpath("wav48")

        if not (self.root.joinpath("metadata.csv").exists() and
                self.root.joinpath("speaker_indices.yaml").exists()):
            self._prepare_metadata()
        self.speaker_indices, self.metadata = self._load_metadata()

    def _load_metadata(self):
        yaml = YAML(typ='safe')
        speaker_indices = yaml.load(self.root.joinpath("speaker_indices.yaml"))
        metadata = pd.read_csv(
            self.root.joinpath("metadata.csv"), sep="|", quoting=3, header=1)
        return speaker_indices, metadata

    def _prepare_metadata(self):
...@@ -41,15 +57,19 @@ class VCTK(Dataset):
            with io.open(str(text_file)) as f:
                transcription = f.read().strip()
            wav_file = text_file.with_suffix(".wav")
            metadata.append(
                (wav_file.name, speaker_folder.name, transcription))
        metadata = pd.DataFrame.from_records(
            metadata, columns=["wave_file", "speaker", "text"])

        # save them
        yaml = YAML(typ='safe')
        yaml.dump(speaker_to_index, self.root.joinpath("speaker_indices.yaml"))
        metadata.to_csv(
            self.root.joinpath("metadata.csv"),
            sep="|",
            quoting=3,
            index=False)

    def _get_example(self, metadatum):
        wave_file, speaker, text = metadatum
...@@ -77,5 +97,3 @@ class VCTK(Dataset):
        speaker_batch = np.array(speaker_batch)
        phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
        return wav_batch, speaker_batch, phoneme_batch
\ No newline at end of file
# coding: utf-8
"""Text processing frontend
All frontend modules should have the following functions:
......
...@@ -32,6 +32,3 @@ def text_to_sequence(text, p=0.0):
    from ..text import text_to_sequence
    text = text_to_sequence(text, ["english_cleaners"])
    return text
...@@ -12,6 +12,3 @@ def text_to_sequence(text, p=0.0):
    from ..text import text_to_sequence
    text = text_to_sequence(text, ["basic_cleaners"])
    return text
# coding: utf-8
import MeCab
import jaconv
from random import random
...@@ -30,9 +29,9 @@ def _yomi(mecab_result):
def _mix_pronunciation(tokens, yomis, p):
    return "".join(yomis[idx]
                   if yomis[idx] is not None and random() < p else tokens[idx]
                   for idx in range(len(tokens)))


def mix_pronunciation(text, p):
...@@ -59,8 +58,7 @@ def normalize_delimitor(text):
def text_to_sequence(text, p=0.0):
    for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", "(", ")", "(", ")"]:
        text = text.replace(c, "")
    text = text.replace("!", "!")
    text = text.replace("?", "?")
......
# coding: utf-8
from random import random

n_vocab = 0xffff
...@@ -13,5 +12,6 @@ _tagger = None
def text_to_sequence(text, p=0.0):
    return [ord(c) for c in text] + [_eos]  # EOS


def sequence_to_text(seq):
    return "".join(chr(n) for n in seq)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from . import cleaners
from .symbols import symbols

# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
...@@ -32,7 +45,8 @@ def text_to_sequence(text, cleaner_names):
        if not m:
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
        sequence += _symbols_to_sequence(
            _clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Cleaners are transformations that run over the input text at both training and eval time.
...@@ -14,31 +27,31 @@ import re
from unidecode import unidecode
from .numbers import normalize_numbers
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
                  for x in [
                      ('mrs', 'misess'),
                      ('mr', 'mister'),
                      ('dr', 'doctor'),
                      ('st', 'saint'),
                      ('co', 'company'),
                      ('jr', 'junior'),
                      ('maj', 'major'),
                      ('gen', 'general'),
                      ('drs', 'doctors'),
                      ('rev', 'reverend'),
                      ('lt', 'lieutenant'),
                      ('hon', 'honorable'),
                      ('sgt', 'sergeant'),
                      ('capt', 'captain'),
                      ('esq', 'esquire'),
                      ('ltd', 'limited'),
                      ('col', 'colonel'),
                      ('ft', 'fort'),
                  ]]


def expand_abbreviations(text):
......
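To see what the cleaner pipeline does with this table, here is a standalone sketch of abbreviation expansion in the same style (a trimmed re-implementation for illustration; the real `english_cleaners` also applies `unidecode`, `normalize_numbers`, lowercasing, and whitespace collapsing):

```python
import re

_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
                  for x in [('mrs', 'misess'), ('dr', 'doctor')]]


def expand_abbreviations(text):
    # Replace each abbreviated form with its spelled-out replacement.
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


print(expand_abbreviations("Dr. Smith met Mrs. Jones."))
# -> doctor Smith met misess Jones.
```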
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
valid_symbols = [
    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
    'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
    'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
    'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
    'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
    'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
    'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
    'Y', 'Z', 'ZH'
]

_valid_symbol_set = set(valid_symbols)
...@@ -24,7 +38,10 @@ class CMUDict:
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {
                word: pron
                for word, pron in entries.items() if len(pron) == 1
            }
        self._entries = entries

    def __len__(self):
......
...@@ -3,7 +3,6 @@
import inflect
import re

_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
...@@ -56,7 +55,8 @@ def _expand_number(m):
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + ' hundred'
        else:
            return _inflect.number_to_words(
                num, andword='', zero='oh', group=2).replace(', ', ' ')
    else:
        return _inflect.number_to_words(num, andword='')
......
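For intuition about the year-style branch above, a quick sketch of what `inflect` produces with these options (outputs may vary slightly across inflect versions):

```python
import inflect

_inflect = inflect.engine()

# The num % 100 == 0 branch: 1900 -> "nineteen hundred"
print(_inflect.number_to_words(1900 // 100) + ' hundred')

# The general branch: read two digits at a time, with "0" spoken as "oh"
print(_inflect.number_to_words(
    1984, andword='', zero='oh', group=2).replace(', ', ' '))
# -> nineteen eighty-four
```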
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Defines the set of symbols used in text input to the model.
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.models.deepvoice3.encoder import Encoder, ConvSpec
from parakeet.models.deepvoice3.decoder import Decoder, WindowRange
from parakeet.models.deepvoice3.converter import Converter
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from collections import namedtuple
from paddle import fluid
...@@ -19,23 +33,19 @@ class Attention(dg.Layer):
                 value_projection=True):
        super(Attention, self).__init__()
        std = np.sqrt(1 / query_dim)
        self.query_proj = Linear(
            query_dim, embed_dim, param_attr=I.Normal(scale=std))
        if key_projection:
            std = np.sqrt(1 / embed_dim)
            self.key_proj = Linear(
                embed_dim, embed_dim, param_attr=I.Normal(scale=std))
        if value_projection:
            std = np.sqrt(1 / embed_dim)
            self.value_proj = Linear(
                embed_dim, embed_dim, param_attr=I.Normal(scale=std))
        std = np.sqrt(1 / embed_dim)
        self.out_proj = Linear(
            embed_dim, query_dim, param_attr=I.Normal(scale=std))

        self.key_projection = key_projection
        self.value_projection = value_projection
...@@ -102,9 +112,8 @@ class Attention(dg.Layer):
        x = F.softmax(x)
        attn_scores = x

        x = F.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        x = F.matmul(x, values)
        encoder_length = keys.shape[1]
        # CAUTION: is it wrong? let it be now
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid
...@@ -15,6 +29,7 @@ class Conv1DGLU(dg.Layer):
    has residual connection from the input x, and scale the output by
    np.sqrt(0.5).
    """

    def __init__(self,
                 n_speakers,
                 speaker_dim,
...@@ -50,20 +65,20 @@ class Conv1DGLU(dg.Layer):
        ), "this block uses residual connection"\
            "the input_channels should equal num_filters"
        std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
        self.conv = Conv1DCell(
            in_channels,
            2 * num_filters,
            filter_size,
            dilation,
            causal,
            param_attr=I.Normal(scale=std))

        if n_speakers > 1:
            assert (speaker_dim is not None
                    ), "speaker embed should not be null in multi-speaker case"
            std = np.sqrt(1 / speaker_dim)
            self.fc = Linear(
                speaker_dim, num_filters, param_attr=I.Normal(scale=std))
    def forward(self, x, speaker_embed=None):
        """
...@@ -82,9 +97,8 @@ class Conv1DGLU(dg.Layer):
            C_out means the output channels of Conv1DGLU.
        """
        residual = x
        x = F.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        x = self.conv(x)
        content, gate = F.split(x, num_or_sections=2, dim=1)
...@@ -118,9 +132,8 @@ class Conv1DGLU(dg.Layer):
            C_out means the output channels of Conv1DGLU.
        """
        residual = x_t
        x_t = F.dropout(
            x_t, self.dropout, dropout_implementation="upscale_in_train")
        x_t = self.conv.add_input(x_t)
        content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1)
......
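The gating in `Conv1DGLU` is easier to see outside the Paddle API; a minimal NumPy sketch of one GLU step with the scaled residual (names and shapes are illustrative, not the module's API):

```python
import numpy as np


def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))


def conv1dglu_step(conv_out, residual):
    """conv_out: (B, 2*C, T) output of the convolution;
    residual: (B, C, T) input of the block."""
    content, gate = np.split(conv_out, 2, axis=1)  # halve the channels
    x = content * sigmoid(gate)                    # gated linear unit
    return np.sqrt(0.5) * (residual + x)           # scaled residual connection
```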
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from itertools import chain
...@@ -19,44 +33,45 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
            2,
            stride=2,
            param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))),
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=1,
            std_mul=1.,
            dropout=dropout),
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=3,
            std_mul=4.,
            dropout=dropout),
        Conv1DTranspose(
            target_channels,
            target_channels,
            2,
            stride=2,
            param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))),
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=1,
            std_mul=1.,
            dropout=dropout),
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=3,
            std_mul=4.,
            dropout=dropout)
    ]
    return upsampling_convolutions
...@@ -69,36 +84,38 @@ def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
            2,
            stride=2,
            param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels)))),
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=1,
            std_mul=1.,
            dropout=dropout),
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=3,
            std_mul=4.,
            dropout=dropout)
    ]
    return upsampling_convolutions
def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout):
    upsampling_convolutions = [
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=3,
            std_mul=4.,
            dropout=dropout)
    ]
    return upsampling_convolutions
...@@ -108,6 +125,7 @@ class Converter(dg.Layer):
    Vocoder that transforms mel spectrogram (or encoder hidden states)
    to waveform.
    """

    def __init__(self,
                 n_speakers,
                 speaker_dim,
...@@ -161,33 +179,36 @@ class Converter(dg.Layer):
                std = np.sqrt(std_mul / in_channels)
                # CAUTION: relu
                self.convolutions.append(
                    Conv1D(
                        in_channels,
                        out_channels,
                        1,
                        act="relu",
                        param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0
            self.convolutions.append(
                Conv1DGLU(
                    n_speakers,
                    speaker_dim,
                    in_channels,
                    out_channels,
                    filter_size,
                    dilation=dilation,
                    std_mul=std_mul,
                    dropout=dropout))
            in_channels = out_channels
            std_mul = 4.0

        # final conv proj, channel transformed to linear dim
        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        # CAUTION: sigmoid
        self.last_conv_proj = Conv1D(
            in_channels,
            linear_dim,
            1,
            act="sigmoid",
            param_attr=I.Normal(scale=std))
    def forward(self, x, speaker_embed=None):
        """
...@@ -229,4 +250,4 @@ class Converter(dg.Layer):
        out = self.last_conv_proj(x)
        out = F.transpose(out, [0, 2, 1])
        return out
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid.layers as F
import paddle.fluid.initializer as I
...@@ -80,25 +94,25 @@ def unfold_adjacent_frames(folded_frames, r):
class Decoder(dg.Layer):
    def __init__(
            self,
            n_speakers,
            speaker_dim,
            embed_dim,
            mel_dim,
            r=1,
            max_positions=512,
            padding_idx=None,  # remove it!
            preattention=(ConvSpec(128, 5, 1), ) * 4,
            convolutions=(ConvSpec(128, 5, 1), ) * 4,
            attention=True,
            dropout=0.0,
            use_memory_mask=False,
            force_monotonic_attention=False,
            query_position_rate=1.0,
            key_position_rate=1.0,
            window_range=WindowRange(-1, 3),
            key_projection=True,
            value_projection=True):
        super(Decoder, self).__init__()
        self.dropout = dropout
...@@ -111,23 +125,17 @@ class Decoder(dg.Layer):
        conv_channels = convolutions[0].out_channels
        # only when padding idx is 0 can we easily handle it
        self.embed_keys_positions = PositionEmbedding(
            max_positions, embed_dim, padding_idx=0)
        self.embed_query_positions = PositionEmbedding(
            max_positions, conv_channels, padding_idx=0)

        if n_speakers > 1:
            std = np.sqrt((1 - dropout) / speaker_dim)
            self.speaker_proj1 = Linear(
                speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))
            self.speaker_proj2 = Linear(
                speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))

        # prenet
        self.prenet = dg.LayerList()
...@@ -138,24 +146,26 @@ class Decoder(dg.Layer):
                # conv1d & relu
                std = np.sqrt(std_mul / in_channels)
                self.prenet.append(
                    Conv1D(
                        in_channels,
                        out_channels,
                        1,
                        act="relu",
                        param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0
            self.prenet.append(
                Conv1DGLU(
                    n_speakers,
                    speaker_dim,
                    in_channels,
                    out_channels,
                    filter_size,
                    dilation,
                    std_mul,
                    dropout,
                    causal=True,
                    residual=True))
            in_channels = out_channels
            std_mul = 4.0
...@@ -184,16 +194,17 @@ class Decoder(dg.Layer):
            assert (
                in_channels == out_channels
            ), "the stack of convolution & attention does not change channels"
            conv_layer = Conv1DGLU(
                n_speakers,
                speaker_dim,
                in_channels,
                out_channels,
                filter_size,
                dilation,
                std_mul,
                dropout,
                causal=True,
                residual=False)
            attn_layer = Attention(
                out_channels,
                embed_dim,
...@@ -211,10 +222,8 @@ class Decoder(dg.Layer):
        # 1 * 1 conv to transform channels
        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        self.last_conv = Conv1D(
            in_channels, mel_dim * r, 1, param_attr=I.Normal(scale=std))

        # mel (before sigmoid) to done hat
        std = np.sqrt(1 / in_channels)
...@@ -308,9 +317,8 @@ class Decoder(dg.Layer):
        # (B, C, T)
        frames = F.transpose(frames, [0, 2, 1])
        x = frames
        x = F.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")

        # Prenet
        for layer in self.prenet:
            if isinstance(layer, Conv1DGLU):
...@@ -408,14 +416,13 @@ class Decoder(dg.Layer):
            test_inputs = fold_adjacent_frames(test_inputs, self.r)
            test_inputs = F.transpose(test_inputs, [0, 2, 1])

        initial_input = F.zeros(
            (batch_size, self.mel_dim * self.r, 1), dtype=keys.dtype)

        t = 0  # decoder time step
        while True:
            frame_pos = F.fill_constant(
                (batch_size, 1), value=t + 1, dtype="int64")
            w = self.query_position_rate
            if self.n_speakers > 1:
                w = w * F.squeeze(self.speaker_proj2(speaker_embed), [-1])
...@@ -433,9 +440,8 @@ class Decoder(dg.Layer):
            current_input = initial_input
            x_t = current_input
            x_t = F.dropout(
                x_t, self.dropout, dropout_implementation="upscale_in_train")

            # Prenet
            for layer in self.prenet:
...@@ -453,15 +459,15 @@ class Decoder(dg.Layer):
                x_t = F.transpose(x_t, [0, 2, 1])
                if frame_pos_embed is not None:
                    x_t += frame_pos_embed
                x_t, attn_scores = attn(x_t, (keys, values), mask,
                                        last_attended[i]
                                        if test_inputs is None else None)
                x_t = F.transpose(x_t, [0, 2, 1])
                step_attn_scores.append(attn_scores)  #(B, T_dec=1, T_enc)
                # update last attended when necessary
                if self.force_monotonic_attention[i]:
                    last_attended[i] = np.argmax(
                        attn_scores.numpy(), axis=-1)[0][0]
            x_t = F.scale(residual + x_t, np.sqrt(0.5))
            if len(step_attn_scores):
                # (B, 1, T_enc) again
...@@ -485,8 +491,8 @@ class Decoder(dg.Layer):
            t += 1

            if test_inputs is None:
                if F.reduce_min(done_t).numpy()[
                        0] > 0.5 and t > self.min_decoder_steps:
                    break
                elif t > self.max_decoder_steps:
                    break
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from collections import namedtuple
...@@ -33,14 +47,16 @@ class Encoder(dg.Layer):
        self.dropout = dropout
        if n_speakers > 1:
            std = np.sqrt((1 - dropout) / speaker_dim)
            self.sp_proj1 = Linear(
                speaker_dim,
                embed_dim,
                act="softsign",
                param_attr=I.Normal(scale=std))
            self.sp_proj2 = Linear(
                speaker_dim,
                embed_dim,
                act="softsign",
                param_attr=I.Normal(scale=std))
        self.n_speakers = n_speakers

        self.convolutions = dg.LayerList()
...@@ -51,31 +67,34 @@ class Encoder(dg.Layer):
            if in_channels != out_channels:
                std = np.sqrt(std_mul / in_channels)
                self.convolutions.append(
                    Conv1D(
                        in_channels,
                        out_channels,
                        1,
                        act="relu",
                        param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0
            self.convolutions.append(
                Conv1DGLU(
                    n_speakers,
                    speaker_dim,
                    in_channels,
                    out_channels,
                    filter_size,
                    dilation,
                    std_mul,
                    dropout,
                    causal=False,
                    residual=True))
            in_channels = out_channels
            std_mul = 4.0

        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        self.convolutions.append(
            Conv1D(
                in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
    def forward(self, x, speaker_embed=None):
        """
...@@ -96,9 +115,8 @@ class Encoder(dg.Layer):
            representation for values.
        """
        x = self.embed(x)
        x = F.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        x = F.transpose(x, [0, 2, 1])

        if self.n_speakers > 1 and speaker_embed is not None:
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from numba import jit
...@@ -31,9 +45,7 @@ def guided_attention(N, max_N, T, max_T, g):
    return W


def guided_attentions(encoder_lengths, decoder_lengths, max_decoder_len,
                      g=0.2):
    B = len(encoder_lengths)
    max_input_len = encoder_lengths.max()
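For readers new to guided attention: the weight matrix W penalizes attention that wanders off the text/speech diagonal. A sketch of the usual formulation (the body of `guided_attention` is elided above; this follows the common guided-attention recipe, not necessarily this file verbatim):

```python
import numpy as np


def guided_attention_sketch(N, T, g=0.2):
    """W[n, t] = 1 - exp(-((n/N - t/T)^2) / (2 g^2)).
    Near-diagonal entries (n/N close to t/T) are ~0; distant ones approach 1."""
    W = np.zeros((N, T), dtype=np.float32)
    for n in range(N):
        for t in range(T):
            W[n, t] = 1.0 - np.exp(-(n / N - t / T)**2 / (2 * g * g))
    return W
```

Multiplying W elementwise with the attention matrix and averaging then yields a loss that is small only when attention stays near the diagonal.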
...@@ -93,9 +105,8 @@ class TTSLoss(object):
    def binary_divergence(self, prediction, target, mask):
        flattened_prediction = F.reshape(prediction, [-1, 1])
        flattened_target = F.reshape(target, [-1, 1])
        flattened_loss = F.log_loss(
            flattened_prediction, flattened_target, epsilon=1e-8)
        bin_div = fluid.layers.reshape(flattened_loss, prediction.shape)

        w = self.masked_weight
...@@ -163,23 +174,20 @@ class TTSLoss(object):
        max_mel_steps = max_frames // self.downsample_factor
        max_decoder_steps = max_mel_steps // self.r

        decoder_mask = F.sequence_mask(
            n_frames // self.downsample_factor // self.r,
            max_decoder_steps,
            dtype="float32")
        mel_mask = F.sequence_mask(
            n_frames // self.downsample_factor, max_mel_steps, dtype="float32")
        lin_mask = F.sequence_mask(n_frames, max_frames, dtype="float32")

        if compute_lin_loss:
            lin_hyp = lin_hyp[:, :-self.time_shift, :]
            lin_ref = lin_ref[:, self.time_shift:, :]
            lin_mask = lin_mask[:, self.time_shift:, :]
            lin_l1_loss = self.l1_loss(
                lin_hyp, lin_ref, lin_mask, priority_bin=self.priority_bin)
            lin_bce_loss = self.binary_divergence(lin_hyp, lin_ref, lin_mask)
            lin_loss = self.binary_divergence_weight * lin_bce_loss \
                + (1 - self.binary_divergence_weight) * lin_l1_loss
...@@ -197,9 +205,10 @@ class TTSLoss(object):
            total_loss += mel_loss

        if compute_attn_loss:
            attn_loss = self.attention_loss(attn_hyp,
                                            input_lengths.numpy(),
                                            n_frames.numpy() //
                                            (self.downsample_factor * self.r))
            total_loss += attn_loss

        if compute_done_loss:
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid.layers as F
...@@ -29,9 +43,9 @@ class DeepVoice3(dg.Layer):
        mel_outputs, alignments, done, decoder_states = self.decoder(
            (keys, values), valid_lengths, mel_inputs, text_positions,
            frame_positions, speaker_embed)
        linear_outputs = self.converter(decoder_states
                                        if self.use_decoder_states else
                                        mel_outputs, speaker_embed)
        return mel_outputs, linear_outputs, alignments, done
    def transduce(self, text_sequences, text_positions, speaker_indices=None):
...@@ -43,7 +57,7 @@ class DeepVoice3(dg.Layer):
        keys, values = self.encoder(text_sequences, speaker_embed)
        mel_outputs, alignments, done, decoder_states = self.decoder.decode(
            (keys, values), text_positions, speaker_embed)
        linear_outputs = self.converter(decoder_states
                                        if self.use_decoder_states else
                                        mel_outputs, speaker_embed)
        return mel_outputs, linear_outputs, alignments, done
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid
import paddle.fluid.layers as F
...@@ -95,10 +109,11 @@ class PositionEmbedding(dg.Layer):
            speaker_position_rate)  # (B, V, C)
        # make indices for gather_nd
        batch_id = F.expand(
            F.unsqueeze(
                F.range(
                    0, batch_size, 1, dtype="int64"), [1]), [1, time_steps])
        # (B, T, 2)
        gather_nd_id = F.stack([batch_id, indices], -1)
        out = F.gather_nd(weight, gather_nd_id)
        return out
\ No newline at end of file
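The batched gather above is easier to follow in plain NumPy; a small sketch of the same indexing trick (illustrative shapes, not the Paddle API):

```python
import numpy as np

B, T, V, C = 2, 3, 5, 4                      # batch, time, positions, channels
weight = np.random.randn(B, V, C)            # per-example position tables
indices = np.random.randint(0, V, (B, T))    # scaled positions, (B, T)

# batch_id[b, t] == b, so the (batch_id, indices) pairs pick
# weight[b, indices[b, t]] for every (b, t), just like gather_nd.
batch_id = np.broadcast_to(np.arange(B)[:, None], (B, T))
out = weight[batch_id, indices]              # (B, T, C)
```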
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
from parakeet.models.fastspeech.fft_block import FFTBlock


class Decoder(dg.Layer):
    def __init__(self,
                 len_max_seq,
...@@ -18,16 +32,29 @@ class Decoder(dg.Layer):
        super(Decoder, self).__init__()

        n_position = len_max_seq + 1

        self.pos_inp = get_sinusoid_encoding_table(
            n_position, d_model, padding_idx=0)
        self.position_enc = dg.Embedding(
            size=[n_position, d_model],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.layer_stack = [
            FFTBlock(
                d_model,
                d_inner,
                n_head,
                d_k,
                d_v,
                fft_conv1d_kernel,
                fft_conv1d_padding,
                dropout=dropout) for _ in range(n_layers)
        ]
        for i, layer in enumerate(self.layer_stack):
            self.add_sublayer('fft_{}'.format(i), layer)
    def forward(self, enc_seq, enc_pos):
        """
        Decoder layer of FastSpeech.
...@@ -57,4 +84,4 @@ class Decoder(dg.Layer):
                slf_attn_mask=slf_attn_mask)
            dec_slf_attn_list += [dec_slf_attn]

        return dec_output, dec_slf_attn_list
\ No newline at end of file
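`get_sinusoid_encoding_table` comes from the transformer_tts utils; a hedged sketch of the standard sinusoid table it presumably builds (sin on even channels, cos on odd, zeros at the padding index):

```python
import numpy as np


def sinusoid_table_sketch(n_position, d_model, padding_idx=None):
    pos = np.arange(n_position)[:, None]     # (n_position, 1)
    i = np.arange(d_model)[None, :]          # (1, d_model)
    angle = pos / np.power(10000.0, 2 * (i // 2) / d_model)
    table = np.zeros((n_position, d_model), dtype=np.float32)
    table[:, 0::2] = np.sin(angle[:, 0::2])  # even channels
    table[:, 1::2] = np.cos(angle[:, 1::2])  # odd channels
    if padding_idx is not None:
        table[padding_idx] = 0.0             # padding position embeds to zeros
    return table
```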
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
from parakeet.models.fastspeech.fft_block import FFTBlock


class Encoder(dg.Layer):
    def __init__(self,
                 n_src_vocab,
...@@ -19,14 +33,28 @@ class Encoder(dg.Layer):
        super(Encoder, self).__init__()
        n_position = len_max_seq + 1

        self.src_word_emb = dg.Embedding(
            size=[n_src_vocab, d_model], padding_idx=0)
        self.pos_inp = get_sinusoid_encoding_table(
            n_position, d_model, padding_idx=0)
        self.position_enc = dg.Embedding(
            size=[n_position, d_model],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.layer_stack = [
            FFTBlock(
                d_model,
                d_inner,
                n_head,
                d_k,
                d_v,
                fft_conv1d_kernel,
                fft_conv1d_padding,
                dropout=dropout) for _ in range(n_layers)
        ]
        for i, layer in enumerate(self.layer_stack):
            self.add_sublayer('fft_{}'.format(i), layer)
...@@ -52,7 +80,8 @@ class Encoder(dg.Layer):
        non_pad_mask = get_non_pad_mask(character)

        # -- Forward
        enc_output = self.src_word_emb(character) + self.position_enc(
            text_pos)  #(N, T, C)

        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(
...@@ -60,5 +89,5 @@ class Encoder(dg.Layer):
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask)
            enc_slf_attn_list += [enc_slf_attn]

        return enc_output, non_pad_mask, enc_slf_attn_list
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
...@@ -7,54 +20,67 @@ from parakeet.models.fastspeech.length_regulator import LengthRegulator
from parakeet.models.fastspeech.encoder import Encoder
from parakeet.models.fastspeech.decoder import Decoder
class FastSpeech(dg.Layer):
    def __init__(self, cfg):
        " FastSpeech"
        super(FastSpeech, self).__init__()

        self.encoder = Encoder(
            n_src_vocab=len(symbols) + 1,
            len_max_seq=cfg['max_seq_len'],
            n_layers=cfg['encoder_n_layer'],
            n_head=cfg['encoder_head'],
            d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
            d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
            d_model=cfg['fs_hidden_size'],
            d_inner=cfg['encoder_conv1d_filter_size'],
            fft_conv1d_kernel=cfg['fft_conv1d_filter'],
            fft_conv1d_padding=cfg['fft_conv1d_padding'],
            dropout=0.1)
        self.length_regulator = LengthRegulator(
            input_size=cfg['fs_hidden_size'],
            out_channels=cfg['duration_predictor_output_size'],
            filter_size=cfg['duration_predictor_filter_size'],
            dropout=cfg['dropout'])
        self.decoder = Decoder(
            len_max_seq=cfg['max_seq_len'],
            n_layers=cfg['decoder_n_layer'],
            n_head=cfg['decoder_head'],
            d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
            d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
            d_model=cfg['fs_hidden_size'],
            d_inner=cfg['decoder_conv1d_filter_size'],
            fft_conv1d_kernel=cfg['fft_conv1d_filter'],
            fft_conv1d_padding=cfg['fft_conv1d_padding'],
            dropout=0.1)
        self.weight = fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer())
        k = math.sqrt(1 / cfg['fs_hidden_size'])
        self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-k, high=k))
        self.mel_linear = dg.Linear(
            cfg['fs_hidden_size'],
            cfg['audio']['num_mels'] * cfg['audio']['outputs_per_step'],
            param_attr=self.weight,
            bias_attr=self.bias, )
        self.postnet = PostConvNet(
            n_mels=cfg['audio']['num_mels'],
            num_hidden=512,
            filter_size=5,
            padding=int(5 / 2),
            num_conv=5,
            outputs_per_step=cfg['audio']['outputs_per_step'],
            use_cudnn=True,
            dropout=0.1,
            batchnorm_last=True)

    def forward(self,
                character,
                text_pos,
                mel_pos=None,
                length_target=None,
                alpha=1.0):
""" """
FastSpeech model. FastSpeech model.
...@@ -80,22 +106,25 @@ class FastSpeech(dg.Layer): ...@@ -80,22 +106,25 @@ class FastSpeech(dg.Layer):
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
""" """
encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos) encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(
character, text_pos)
if fluid.framework._dygraph_tracer()._train_mode: if fluid.framework._dygraph_tracer()._train_mode:
length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output, length_regulator_output, duration_predictor_output = self.length_regulator(
target=length_target, encoder_output, target=length_target, alpha=alpha)
alpha=alpha) decoder_output, dec_slf_attn_list = self.decoder(
decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos) length_regulator_output, mel_pos)
mel_output = self.mel_linear(decoder_output) mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output mel_output_postnet = self.postnet(mel_output) + mel_output
return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list
else: else:
length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha) length_regulator_output, decoder_pos = self.length_regulator(
decoder_output, _ = self.decoder(length_regulator_output, decoder_pos) encoder_output, alpha=alpha)
decoder_output, _ = self.decoder(length_regulator_output,
decoder_pos)
mel_output = self.mel_linear(decoder_output) mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output mel_output_postnet = self.postnet(mel_output) + mel_output
return mel_output, mel_output_postnet return mel_output, mel_output_postnet
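A note on the `mel_linear` width above: the decoder emits `num_mels * outputs_per_step` values per step (the reduction factor r), which unfold into r consecutive mel frames. A toy NumPy sketch of that unfolding, with made-up sizes:

```python
import numpy as np

# Toy sketch of the reduction factor ("outputs_per_step"): sizes here
# are made up, and the unfolding is shown as a plain reshape.
num_mels, r, T = 4, 2, 3
decoder_out = np.random.rand(1, T, num_mels * r)      # (B, T, num_mels * r)
mel_frames = decoder_out.reshape(1, T * r, num_mels)  # (B, T * r, num_mels)
print(mel_frames.shape)                               # (1, 6, 4)
```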
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
import paddle.fluid.dygraph as dg
@@ -6,11 +19,32 @@
import paddle.fluid as fluid
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward


class FFTBlock(dg.Layer):
    def __init__(self,
                 d_model,
                 d_inner,
                 n_head,
                 d_k,
                 d_v,
                 filter_size,
                 padding,
                 dropout=0.2):
        super(FFTBlock, self).__init__()
        self.slf_attn = MultiheadAttention(
            d_model,
            d_k,
            d_v,
            num_head=n_head,
            is_bias=True,
            dropout=dropout,
            is_concat=False)
        self.pos_ffn = PositionwiseFeedForward(
            d_model,
            d_inner,
            filter_size=filter_size,
            padding=padding,
            dropout=dropout)

    def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
        """
@@ -27,10 +61,11 @@ class FFTBlock(dg.Layer):
            output (Variable), Shape(B, T, C), the output after self-attention & ffn.
            slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
        """
        output, slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        output *= non_pad_mask

        output = self.pos_ffn(output)
        output *= non_pad_mask

        return output, slf_attn
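The two `output *= non_pad_mask` lines keep padded time steps at exactly zero after each sub-layer. A standalone NumPy sketch of the same masking (toy shapes):

```python
import numpy as np

# Toy sketch of how non_pad_mask is applied in FFTBlock.forward:
# positions beyond each sequence's true length are zeroed after both
# the self-attention and the position-wise FFN.
B, T, C = 2, 5, 3
lengths = [5, 3]
non_pad_mask = np.array(
    [[1.0] * l + [0.0] * (T - l) for l in lengths])[:, :, None]  # (B, T, 1)
output = np.random.rand(B, T, C)
output *= non_pad_mask  # padded time steps become exactly zero
print(output[1, 3:])    # all zeros for the padded tail of sequence 2
```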
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
import parakeet.models.fastspeech.utils
@@ -6,47 +19,50 @@
import paddle.fluid.layers as layers
import paddle.fluid as fluid
from parakeet.modules.customized import Conv1D


class LengthRegulator(dg.Layer):
    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        super(LengthRegulator, self).__init__()
        self.duration_predictor = DurationPredictor(
            input_size=input_size,
            out_channels=out_channels,
            filter_size=filter_size,
            dropout=dropout)

    def LR(self, x, duration_predictor_output, alpha=1.0):
        output = []
        batch_size = x.shape[0]
        for i in range(batch_size):
            output.append(
                self.expand(x[i:i + 1], duration_predictor_output[i:i + 1],
                            alpha))
        output = self.pad(output)
        return output

    def pad(self, input_ele):
        max_len = max([input_ele[i].shape[0] for i in range(len(input_ele))])
        out_list = []
        for i in range(len(input_ele)):
            pad_len = max_len - input_ele[i].shape[0]
            one_batch_padded = layers.pad(input_ele[i], [0, pad_len, 0, 0],
                                          pad_value=0.0)
            out_list.append(one_batch_padded)
        out_padded = layers.stack(out_list)
        return out_padded

    def expand(self, batch, predicted, alpha):
        out = []
        time_steps = batch.shape[1]
        fertilities = predicted.numpy()
        batch = layers.squeeze(batch, [0])

        for i in range(time_steps):
            if fertilities[0, i] == 0:
                continue
            out.append(
                layers.expand(batch[i:i + 1, :], [int(fertilities[0, i]), 1]))
        out = layers.concat(out, axis=0)
        return out

    def forward(self, x, alpha=1.0, target=None):
        """
@@ -70,10 +86,11 @@ class LengthRegulator(dg.Layer):
        else:
            duration_predictor_output = layers.round(duration_predictor_output)
            output = self.LR(x, duration_predictor_output, alpha)
            mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1))
            mel_pos = layers.unsqueeze(mel_pos, [0])
            return output, mel_pos


class DurationPredictor(dg.Layer):
    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        super(DurationPredictor, self).__init__()
@@ -83,30 +100,38 @@ class DurationPredictor(dg.Layer):
        self.dropout = dropout
        k = math.sqrt(1 / self.input_size)
        self.conv1 = Conv1D(
            num_channels=self.input_size,
            num_filters=self.out_channels,
            filter_size=self.filter_size,
            padding=1,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        #data_format='NTC')
        k = math.sqrt(1 / self.out_channels)
        self.conv2 = Conv1D(
            num_channels=self.out_channels,
            num_filters=self.out_channels,
            filter_size=self.filter_size,
            padding=1,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        #data_format='NTC')
        self.layer_norm1 = dg.LayerNorm(self.out_channels)
        self.layer_norm2 = dg.LayerNorm(self.out_channels)

        self.weight = fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer())
        k = math.sqrt(1 / self.out_channels)
        self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-k, high=k))

        self.linear = dg.Linear(
            self.out_channels, 1, param_attr=self.weight, bias_attr=self.bias)

    def forward(self, encoder_output):
        """
@@ -118,18 +143,15 @@ class DurationPredictor(dg.Layer):
            out (Variable), Shape(B, T, C), the output of duration predictor.
        """
        # encoder_output.shape(N, T, C)
        out = layers.transpose(encoder_output, [0, 2, 1])
        out = self.conv1(out)
        out = layers.transpose(out, [0, 2, 1])
        out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout)
        out = layers.transpose(out, [0, 2, 1])
        out = self.conv2(out)
        out = layers.transpose(out, [0, 2, 1])
        out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout)
        out = layers.relu(self.linear(out))
        out = layers.squeeze(out, axes=[-1])

        return out
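`expand` repeats each encoder step by its (rounded) predicted duration and skips zero fertilities. This NumPy sketch reproduces the logic with made-up numbers:

```python
import numpy as np

# NumPy sketch of LengthRegulator.expand: each encoder step i is
# repeated fertilities[i] times (zeros are skipped), which is what
# layers.expand + layers.concat implement above.
batch = np.array([[0.1, 0.2],     # encoder step 0, C = 2
                  [0.3, 0.4],     # encoder step 1
                  [0.5, 0.6]])    # encoder step 2
fertilities = np.array([2, 0, 3])
out = np.concatenate(
    [np.tile(batch[i:i + 1], (f, 1))
     for i, f in enumerate(fertilities) if f > 0],
    axis=0)
print(out.shape)  # (5, 2): 2 + 0 + 3 expanded frames
```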
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np


def get_alignment(attn_probs, mel_lens, n_head):
    max_F = 0
    assert attn_probs[0].shape[0] % n_head == 0
@@ -8,27 +22,27 @@ def get_alignment(attn_probs, mel_lens, n_head):
    for i in range(len(attn_probs)):
        multi_attn = attn_probs[i].numpy()
        for j in range(n_head):
            attn = multi_attn[j * batch_size:(j + 1) * batch_size]
            F = score_F(attn)
            if max_F < F:
                max_F = F
                max_attn = attn
    alignment = compute_duration(max_attn, mel_lens)
    return alignment


def score_F(attn):
    max = np.max(attn, axis=-1)
    mean = np.mean(max)
    return mean


def compute_duration(attn, mel_lens):
    alignment = np.zeros([attn.shape[0], attn.shape[2]])
    mel_lens = mel_lens.numpy()
    for i in range(attn.shape[0]):
        for j in range(mel_lens[i]):
            max_index = np.argmax(attn[i, j])
            alignment[i, max_index] += 1
    return alignment
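A toy run of `compute_duration`: for every decoder frame j, the most attended encoder position gets one count, so `alignment[i, k]` ends up as the number of mel frames assigned to phoneme k, i.e. a duration target.

```python
import numpy as np

# Toy attention matrix (B=1, mel frames=3, encoder positions=2).
attn = np.array([[[0.9, 0.1],     # frame 0 -> encoder pos 0
                  [0.8, 0.2],     # frame 1 -> encoder pos 0
                  [0.3, 0.7]]])   # frame 2 -> encoder pos 1
mel_lens = np.array([3])
alignment = np.zeros([attn.shape[0], attn.shape[2]])
for i in range(attn.shape[0]):
    for j in range(mel_lens[i]):
        alignment[i, np.argmax(attn[i, j])] += 1
print(alignment)  # [[2. 1.]]: phoneme 0 lasts 2 frames, phoneme 1 lasts 1
```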
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
@@ -7,9 +20,16 @@
from parakeet.modules.customized import Pool1D, Conv1D
from parakeet.modules.dynamic_gru import DynamicGRU
import numpy as np


class CBHG(dg.Layer):
    def __init__(self,
                 hidden_size,
                 batch_size,
                 K=16,
                 projection_size=256,
                 num_gru_layers=2,
                 max_pool_kernel_size=2,
                 is_post=False):
        super(CBHG, self).__init__()
        """
        :param hidden_size: dimension of hidden unit
@@ -24,28 +44,39 @@ class CBHG(dg.Layer):
        self.projection_size = projection_size
        self.conv_list = []
        k = math.sqrt(1 / projection_size)
        self.conv_list.append(
            Conv1D(
                num_channels=projection_size,
                num_filters=hidden_size,
                filter_size=1,
                padding=int(np.floor(1 / 2)),
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-k, high=k))))
        k = math.sqrt(1 / hidden_size)
        for i in range(2, K + 1):
            self.conv_list.append(
                Conv1D(
                    num_channels=hidden_size,
                    num_filters=hidden_size,
                    filter_size=i,
                    padding=int(np.floor(i / 2)),
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.XavierInitializer()),
                    bias_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Uniform(
                            low=-k, high=k))))

        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        self.batchnorm_list = []
        for i in range(K):
            self.batchnorm_list.append(
                dg.BatchNorm(
                    hidden_size, data_layout='NCHW'))

        for i, layer in enumerate(self.batchnorm_list):
            self.add_sublayer("batchnorm_list_{}".format(i), layer)
@@ -53,91 +84,120 @@ class CBHG(dg.Layer):
        conv_outdim = hidden_size * K

        k = math.sqrt(1 / conv_outdim)
        self.conv_projection_1 = Conv1D(
            num_channels=conv_outdim,
            num_filters=hidden_size,
            filter_size=3,
            padding=int(np.floor(3 / 2)),
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

        k = math.sqrt(1 / hidden_size)
        self.conv_projection_2 = Conv1D(
            num_channels=hidden_size,
            num_filters=projection_size,
            filter_size=3,
            padding=int(np.floor(3 / 2)),
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

        self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW')
        self.batchnorm_proj_2 = dg.BatchNorm(
            projection_size, data_layout='NCHW')
        self.max_pool = Pool1D(
            pool_size=max_pool_kernel_size,
            pool_type='max',
            pool_stride=1,
            pool_padding=1,
            data_format="NCT")
        self.highway = Highwaynet(self.projection_size)

        h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
        h_0 = dg.to_variable(h_0)
        k = math.sqrt(1 / hidden_size)
        self.fc_forward1 = dg.Linear(
            hidden_size,
            hidden_size // 2 * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        self.fc_reverse1 = dg.Linear(
            hidden_size,
            hidden_size // 2 * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        self.gru_forward1 = DynamicGRU(
            size=self.hidden_size // 2,
            is_reverse=False,
            origin_mode=True,
            h_0=h_0)
        self.gru_reverse1 = DynamicGRU(
            size=self.hidden_size // 2,
            is_reverse=True,
            origin_mode=True,
            h_0=h_0)

        self.fc_forward2 = dg.Linear(
            hidden_size,
            hidden_size // 2 * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        self.fc_reverse2 = dg.Linear(
            hidden_size,
            hidden_size // 2 * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        self.gru_forward2 = DynamicGRU(
            size=self.hidden_size // 2,
            is_reverse=False,
            origin_mode=True,
            h_0=h_0)
        self.gru_reverse2 = DynamicGRU(
            size=self.hidden_size // 2,
            is_reverse=True,
            origin_mode=True,
            h_0=h_0)

    def _conv_fit_dim(self, x, filter_size=3):
        if filter_size % 2 == 0:
            return x[:, :, :-1]
        else:
            return x

    def forward(self, input_):
        # input_.shape = [N, C, T]

        conv_list = []
        conv_input = input_

        for i, (conv, batchnorm
                ) in enumerate(zip(self.conv_list, self.batchnorm_list)):
            conv_input = self._conv_fit_dim(conv(conv_input), i + 1)
            conv_input = layers.relu(batchnorm(conv_input))
            conv_list.append(conv_input)

        conv_cat = layers.concat(conv_list, axis=1)
        conv_pool = self.max_pool(conv_cat)[:, :, :-1]

        conv_proj = layers.relu(
            self.batchnorm_proj_1(
                self._conv_fit_dim(self.conv_projection_1(conv_pool))))
        conv_proj = self.batchnorm_proj_2(
            self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_

        # conv_proj.shape = [N, C, T]
        highway = layers.transpose(conv_proj, [0, 2, 1])
        highway = self.highway(highway)

        # highway.shape = [N, T, C]
@@ -151,9 +211,10 @@ class CBHG(dg.Layer):
        out_forward = self.gru_forward2(fc_forward)
        out_reverse = self.gru_reverse2(fc_reverse)
        out = layers.concat([out_forward, out_reverse], axis=-1)
        out = layers.transpose(out, [0, 2, 1])
        return out


class Highwaynet(dg.Layer):
    def __init__(self, num_units, num_layers=4):
        super(Highwaynet, self).__init__()
@@ -164,14 +225,26 @@ class Highwaynet(dg.Layer):
        self.linears = []
        k = math.sqrt(1 / num_units)
        for i in range(num_layers):
            self.linears.append(
                dg.Linear(
                    num_units,
                    num_units,
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.XavierInitializer()),
                    bias_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Uniform(
                            low=-k, high=k))))
            self.gates.append(
                dg.Linear(
                    num_units,
                    num_units,
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.XavierInitializer()),
                    bias_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Uniform(
                            low=-k, high=k))))

        for i, (linear, gate) in enumerate(zip(self.linears, self.gates)):
            self.add_sublayer("linears_{}".format(i), linear)
            self.add_sublayer("gates_{}".format(i), gate)
@@ -183,12 +256,6 @@ class Highwaynet(dg.Layer):
            t_ = fluid.layers.sigmoid(gate(out))

            c = 1 - t_
            out = h * t_ + out * c

        return out
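The loop above is the highway recurrence out = H(x) * T(x) + x * (1 - T(x)). Here is one such step in plain NumPy, with random stand-in weights and H assumed to be the relu branch computed in the collapsed lines:

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
num_units = 4
x = rng.standard_normal(num_units)
W_h = rng.standard_normal((num_units, num_units))
W_t = rng.standard_normal((num_units, num_units))

h = np.maximum(W_h @ x, 0.0)     # H(x): relu(linear), per the hidden lines
t = sigmoid(W_t @ x)             # T(x): the transform gate
out = h * t + x * (1.0 - t)      # carry gate c = 1 - T(x)
print(out.shape)                 # (4,)
```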
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
@@ -7,70 +20,110 @@
from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.prenet import PreNet
from parakeet.models.transformer_tts.post_convnet import PostConvNet


class Decoder(dg.Layer):
    def __init__(self, num_hidden, config, num_head=4):
        super(Decoder, self).__init__()
        self.num_hidden = num_hidden
        param = fluid.ParamAttr()
        self.alpha = self.create_parameter(
            shape=(1, ),
            attr=param,
            dtype='float32',
            default_initializer=fluid.initializer.ConstantInitializer(
                value=1.0))
        self.pos_inp = get_sinusoid_encoding_table(
            1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(
            size=[1024, num_hidden],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.decoder_prenet = PreNet(
            input_size=config['audio']['num_mels'],
            hidden_size=num_hidden * 2,
            output_size=num_hidden,
            dropout_rate=0.2)
        k = math.sqrt(1 / num_hidden)
        self.linear = dg.Linear(
            num_hidden,
            num_hidden,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

        self.selfattn_layers = [
            MultiheadAttention(num_hidden, num_hidden // num_head,
                               num_hidden // num_head) for _ in range(3)
        ]
        for i, layer in enumerate(self.selfattn_layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
        self.attn_layers = [
            MultiheadAttention(num_hidden, num_hidden // num_head,
                               num_hidden // num_head) for _ in range(3)
        ]
        for i, layer in enumerate(self.attn_layers):
            self.add_sublayer("attn_{}".format(i), layer)
        self.ffns = [
            PositionwiseFeedForward(
                num_hidden, num_hidden * num_head, filter_size=1)
            for _ in range(3)
        ]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)
        self.mel_linear = dg.Linear(
            num_hidden,
            config['audio']['num_mels'] * config['audio']['outputs_per_step'],
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        self.stop_linear = dg.Linear(
            num_hidden,
            1,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

        self.postconvnet = PostConvNet(
            config['audio']['num_mels'],
            config['hidden_size'],
            filter_size=5,
            padding=4,
            num_conv=5,
            outputs_per_step=config['audio']['outputs_per_step'],
            use_cudnn=True)

    def forward(self, key, value, query, c_mask, positional):
        # get decoder mask with triangular matrix
        if fluid.framework._dygraph_tracer()._train_mode:
            m_mask = get_non_pad_mask(positional)
            mask = get_attn_key_pad_mask((positional == 0).astype(np.float32),
                                         query)
            triu_tensor = dg.to_variable(
                get_triu_tensor(query.numpy(), query.numpy())).astype(
                    np.float32)
            mask = mask + triu_tensor
            mask = fluid.layers.cast(mask == 0, np.float32)

            # (batch_size, decoder_len, encoder_len)
            zero_mask = get_attn_key_pad_mask(
                layers.squeeze(c_mask, [-1]), query)
        else:
            mask = get_triu_tensor(query.numpy(),
                                   query.numpy()).astype(np.float32)
            mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
            m_mask, zero_mask = None, None

        # Decoder pre-network
        query = self.decoder_prenet(query)

        # Centered position
        query = self.linear(query)
@@ -84,10 +137,13 @@ class Decoder(dg.Layer):
        # Attention decoder-decoder, encoder-decoder
        selfattn_list = list()
        attn_list = list()

        for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers,
                                       self.ffns):
            query, attn_dec = selfattn(
                query, query, query, mask=mask, query_mask=m_mask)
            query, attn_dot = attn(
                key, value, query, mask=zero_mask, query_mask=m_mask)
            query = ffn(query)
            selfattn_list.append(attn_dec)
            attn_list.append(attn_dot)
@@ -96,7 +152,7 @@ class Decoder(dg.Layer):
        # Post Mel Network
        out = self.postconvnet(mel_out)
        out = mel_out + out

        # Stop tokens
        stop_tokens = self.stop_linear(query)
        stop_tokens = layers.squeeze(stop_tokens, [-1])
...
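The decoder mask above combines a padding mask with an upper-triangular tensor from `get_triu_tensor`, then casts `mask == 0` into a keep-mask so each frame can only attend to itself and the past. A standalone NumPy sketch of the triangular part:

```python
import numpy as np

# Causal keep-mask: 1 above the diagonal marks future positions, and
# (triu == 0) turns that into "1 where attention is allowed".
T = 4
triu = np.triu(np.ones([T, T]), k=1)
keep = (triu == 0).astype(np.float32)
print(keep)
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]
```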
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
@@ -5,25 +18,41 @@
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet


class Encoder(dg.Layer):
    def __init__(self, embedding_size, num_hidden, num_head=4):
        super(Encoder, self).__init__()
        self.num_hidden = num_hidden
        param = fluid.ParamAttr(initializer=fluid.initializer.Constant(
            value=1.0))
        self.alpha = self.create_parameter(
            shape=(1, ), attr=param, dtype='float32')
        self.pos_inp = get_sinusoid_encoding_table(
            1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(
            size=[1024, num_hidden],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.encoder_prenet = EncoderPrenet(
            embedding_size=embedding_size,
            num_hidden=num_hidden,
            use_cudnn=True)
        self.layers = [
            MultiheadAttention(num_hidden, num_hidden // num_head,
                               num_hidden // num_head) for _ in range(3)
        ]
        for i, layer in enumerate(self.layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
        self.ffns = [
            PositionwiseFeedForward(
                num_hidden,
                num_hidden * num_head,
                filter_size=1,
                use_cudnn=True) for _ in range(3)
        ]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)
@@ -33,25 +62,23 @@ class Encoder(dg.Layer):
            mask = get_attn_key_pad_mask(positional, x)
        else:
            query_mask, mask = None, None

        # Encoder pre_network
        x = self.encoder_prenet(x)  #(N,T,C)

        # Get positional encoding
        positional = self.pos_emb(positional)

        x = positional * self.alpha + x  #(N, T, C)

        # Positional dropout
        x = layers.dropout(x, 0.1)

        # Self attention encoder
        attentions = list()
        for layer, ffn in zip(self.layers, self.ffns):
            x, attention = layer(x, x, x, mask=mask, query_mask=query_mask)
            x = ffn(x)
            attentions.append(attention)

        return x, query_mask, attentions
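The encoder scales its sinusoid embedding by the learnable scalar `alpha` (initialized to 1.0) before adding it to the prenet output. A toy NumPy sketch of that step:

```python
import numpy as np

# Sketch of the scaled positional encoding in Encoder.forward;
# arrays are random stand-ins for the real prenet output and the
# looked-up position embedding.
T, C = 5, 8
x = np.random.rand(1, T, C)           # prenet output, (N, T, C)
positional = np.random.rand(1, T, C)  # position embedding lookup
alpha = 1.0                           # trainable scalar in the model
x = positional * alpha + x            # layers.dropout(x, 0.1) follows
print(x.shape)                        # (1, 5, 8)
```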
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
@@ -13,47 +26,63 @@ class EncoderPrenet(dg.Layer):
        self.embedding_size = embedding_size
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn
        self.embedding = dg.Embedding(
            size=[len(symbols), embedding_size], padding_idx=None)
        self.conv_list = []
        k = math.sqrt(1 / embedding_size)
        self.conv_list.append(
            Conv1D(
                num_channels=embedding_size,
                num_filters=num_hidden,
                filter_size=5,
                padding=int(np.floor(5 / 2)),
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-k, high=k)),
                use_cudnn=use_cudnn))
        k = math.sqrt(1 / num_hidden)
        for _ in range(2):
            self.conv_list.append(
                Conv1D(
                    num_channels=num_hidden,
                    num_filters=num_hidden,
                    filter_size=5,
                    padding=int(np.floor(5 / 2)),
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.XavierInitializer()),
                    bias_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Uniform(
                            low=-k, high=k)),
                    use_cudnn=use_cudnn))

        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        self.batch_norm_list = [
            dg.BatchNorm(
                num_hidden, data_layout='NCHW') for _ in range(3)
        ]

        for i, layer in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(i), layer)

        k = math.sqrt(1 / num_hidden)
        self.projection = dg.Linear(
            num_hidden,
            num_hidden,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

    def forward(self, x):
        x = self.embedding(x)  #(batch_size, seq_len, embedding_size)
        x = layers.transpose(x, [0, 2, 1])
        for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
            x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
        x = layers.transpose(x, [0, 2, 1])  #(N,T,C)
        x = self.projection(x)
        return x
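A shape walk-through of `EncoderPrenet.forward` with toy sizes: text embeddings are moved to channels-first for the Conv1D stack, then moved back before the linear projection. The conv stack itself is stubbed out with a random array of the right shape.

```python
import numpy as np

B, T, E, H = 2, 7, 16, 32
x = np.random.rand(B, T, E)   # embedding output, (N, T, C)
x = x.transpose(0, 2, 1)      # (N, C, T) for the convolutions
x = np.random.rand(B, H, T)   # stand-in for conv + bn + relu + dropout x3
x = x.transpose(0, 2, 1)      # back to (N, T, C)
print(x.shape)                # (2, 7, 32), input to the projection
```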
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.customized import Conv1D


class PostConvNet(dg.Layer):
    def __init__(self,
                 n_mels=80,
                 num_hidden=512,
                 filter_size=5,
@@ -16,49 +30,66 @@ class PostConvNet(dg.Layer):
                 dropout=0.1,
                 batchnorm_last=False):
        super(PostConvNet, self).__init__()

        self.dropout = dropout
        self.num_conv = num_conv
        self.batchnorm_last = batchnorm_last
        self.conv_list = []
        k = math.sqrt(1 / (n_mels * outputs_per_step))
        self.conv_list.append(
            Conv1D(
                num_channels=n_mels * outputs_per_step,
                num_filters=num_hidden,
                filter_size=filter_size,
                padding=padding,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-k, high=k)),
                use_cudnn=use_cudnn))

        k = math.sqrt(1 / num_hidden)
        for _ in range(1, num_conv - 1):
            self.conv_list.append(
                Conv1D(
                    num_channels=num_hidden,
                    num_filters=num_hidden,
                    filter_size=filter_size,
                    padding=padding,
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.XavierInitializer()),
                    bias_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Uniform(
                            low=-k, high=k)),
                    use_cudnn=use_cudnn))

        self.conv_list.append(
            Conv1D(
                num_channels=num_hidden,
                num_filters=n_mels * outputs_per_step,
                filter_size=filter_size,
                padding=padding,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-k, high=k)),
                use_cudnn=use_cudnn))

        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        self.batch_norm_list = [
            dg.BatchNorm(
                num_hidden, data_layout='NCHW') for _ in range(num_conv - 1)
        ]
        if self.batchnorm_last:
            self.batch_norm_list.append(
                dg.BatchNorm(
                    n_mels * outputs_per_step, data_layout='NCHW'))
        for i, layer in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(i), layer)

    def forward(self, input):
        """
@@ -69,18 +100,19 @@ class PostConvNet(dg.Layer):
        Returns:
            output (Variable), Shape(B, T, C), the result after postconvnet.
        """

        input = layers.transpose(input, [0, 2, 1])
        len = input.shape[-1]
        for i in range(self.num_conv - 1):
            batch_norm = self.batch_norm_list[i]
            conv = self.conv_list[i]

            input = layers.dropout(
                layers.tanh(batch_norm(conv(input)[:, :, :len])), self.dropout)
        conv = self.conv_list[self.num_conv - 1]
        input = conv(input)[:, :, :len]
        if self.batchnorm_last:
            batch_norm = self.batch_norm_list[self.num_conv - 1]
            input = layers.dropout(batch_norm(input), self.dropout)
        output = layers.transpose(input, [0, 2, 1])
        return output
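Why `PostConvNet` slices `conv(input)[:, :, :len]`: assuming symmetric padding p on both sides (as the Conv1D wrapper appears to use), a kernel of size k makes the output longer than the input whenever 2p > k - 1, and keeping only the first `len` steps trims the surplus, leaving a causal left context. Toy arithmetic with the decoder's settings:

```python
# Standard 1-D convolution output length with symmetric padding.
T, k, p = 10, 5, 4
out_len = T + 2 * p - (k - 1)
print(out_len)       # 14: four extra steps produced
print(out_len - T)   # 4 == k - 1, trimmed away by the [:, :, :len] slice
```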
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers


class PreNet(dg.Layer):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
        """
@@ -17,13 +31,21 @@ class PreNet(dg.Layer):
        self.dropout_rate = dropout_rate

        k = math.sqrt(1 / input_size)
        self.linear1 = dg.Linear(
            input_size,
            hidden_size,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        k = math.sqrt(1 / hidden_size)
        self.linear2 = dg.Linear(
            hidden_size,
            output_size,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

    def forward(self, x):
        """
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.encoder import Encoder
from parakeet.models.transformer_tts.decoder import Decoder


class TransformerTTS(dg.Layer):
    def __init__(self, config):
        super(TransformerTTS, self).__init__()
...@@ -11,16 +25,10 @@ class TransformerTTS(dg.Layer):
        self.config = config

    def forward(self, characters, mel_input, pos_text, pos_mel):
        key, c_mask, attns_enc = self.encoder(characters, pos_text)
        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(
            key, key, mel_input, c_mask, pos_mel)
        return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import librosa
import os, copy
...@@ -6,14 +19,15 @@ import paddle.fluid.layers as layers


def get_positional_table(d_pos_vec, n_position=1024):
    position_enc = np.array(
        [[pos / np.power(10000, 2 * i / d_pos_vec) for i in range(d_pos_vec)]
         if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])

    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
    return position_enc
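# Added note: the table maps each of n_position positions to a d_pos_vec-dim
# sinusoid vector, e.g. get_positional_table(d_pos_vec=8, n_position=16) has
# shape (16, 8), with row 0 kept all-zero for the padding position.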
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    ''' Sinusoid position encoding table '''
...@@ -23,7 +37,8 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

    sinusoid_table = np.array(
        [get_posi_angle_vec(pos_i) for pos_i in range(n_position)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
...@@ -34,8 +49,10 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    return sinusoid_table


def get_non_pad_mask(seq):
    return layers.unsqueeze((seq != 0).astype(np.float32), [-1])


def get_attn_key_pad_mask(seq_k, seq_q):
    ''' For masking out the padding part of key sequence. '''
...@@ -43,32 +60,37 @@ def get_attn_key_pad_mask(seq_k, seq_q):
    # Expand to fit the shape of key query attention matrix.
    len_q = seq_q.shape[1]
    padding_mask = (seq_k != 0).astype(np.float32)
    padding_mask = layers.expand(
        layers.unsqueeze(padding_mask, [1]), [1, len_q, 1])
    return padding_mask
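# Added shape note: (seq_k != 0) is [batch, len_k]; unsqueeze makes it
# [batch, 1, len_k]; expand tiles it to [batch, len_q, len_k], i.e. one row
# of key-padding flags per query position.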
def get_triu_tensor(seq_k, seq_q):
    ''' Make an upper-triangular (triu) tensor. '''
    len_k = seq_k.shape[1]
    len_q = seq_q.shape[1]
    batch_size = seq_k.shape[0]
    triu_tensor = np.triu(np.ones([len_k, len_q]), 1)
    triu_tensor = np.repeat(
        np.expand_dims(
            triu_tensor, axis=0), batch_size, axis=0)
    return triu_tensor
def guided_attention(N, T, g=0.2):
    '''Guided attention. Refer to page 3 of the paper.'''
    W = np.zeros((N, T), dtype=np.float32)
    for n_pos in range(W.shape[0]):
        for t_pos in range(W.shape[1]):
            W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N))
                                         **2 / (2 * g * g))
    return W
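# Worked example (illustration, not in the source): with g = 0.2 the diagonal
# of a square W is exactly 0 (t_pos / T == n_pos / N there), while far
# off-diagonal entries approach 1, e.g. guided_attention(4, 4)[0, 3]
# == 1 - np.exp(-(0.75 ** 2) / 0.08) ≈ 0.9991, penalizing non-monotonic
# attention.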
def cross_entropy(input, label, position_weight=1.0, epsilon=1e-30):
    output = -1 * label * layers.log(input + epsilon) - (
        1 - label) * layers.log(1 - input + epsilon)
    output = output * (label * (position_weight - 1) + 1)
    return layers.reduce_sum(output, dim=[0, 1])
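# Numeric sanity check (added, illustrative): for a positive label with
# input = 0.9 and position_weight = 5.0, the element loss is
# -log(0.9) * 5 ≈ 0.527; a negative label with input = 0.1 contributes
# -log(0.9) ≈ 0.105, so positive (stop) positions are up-weighted 5x.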
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.modules.customized import Conv1D
from parakeet.models.transformer_tts.utils import *
from parakeet.models.transformer_tts.cbhg import CBHG


class Vocoder(dg.Layer):
    """
    CBHG Network (mel -> linear)
    """

    def __init__(self, config, batch_size):
        super(Vocoder, self).__init__()
        self.pre_proj = Conv1D(
            num_channels=config['audio']['num_mels'],
            num_filters=config['hidden_size'],
            filter_size=1)
        self.cbhg = CBHG(config['hidden_size'], batch_size)
        self.post_proj = Conv1D(
            num_channels=config['hidden_size'],
            num_filters=(config['audio']['n_fft'] // 2) + 1,
            filter_size=1)
    def forward(self, mel):
        mel = layers.transpose(mel, [0, 2, 1])
        mel = self.pre_proj(mel)
        mel = self.cbhg(mel)
        mag_pred = self.post_proj(mel)
        mag_pred = layers.transpose(mag_pred, [0, 2, 1])
        return mag_pred
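# Usage sketch (added for illustration; the config dict shape follows
# ljspeech.yaml and is an assumption, not verbatim source):
#
#     config = {'hidden_size': 256, 'audio': {'num_mels': 80, 'n_fft': 1024}}
#     with dg.guard():
#         vocoder = Vocoder(config, batch_size=1)
#         mel = dg.to_variable(np.zeros((1, 100, 80), dtype=np.float32))
#         mag = vocoder(mel)  # [1, 100, 513]: one linear-magnitude frame per
#                             # mel frame, with n_fft // 2 + 1 = 513 bins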
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.models.waveflow.waveflow import WaveFlow
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import librosa
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import time
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import numpy as np
import paddle.fluid.dygraph as dg
...
...@@ -2,7 +2,7 @@
Paddle fluid implementation of WaveNet, a deep generative model of raw audio waveforms.
The WaveNet model was originally proposed in [WaveNet: A Generative Model for Raw Audio](https://arxiv.org/abs/1609.03499).
Our implementation is based on the WaveNet architecture described in [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281) and can provide various output distributions, including single Gaussian, mixture of Gaussian, and softmax with linearly quantized channels.
We implement the WaveNet model in Paddle fluid with dynamic graph, which is convenient for flexible network architectures.
...@@ -51,10 +51,10 @@ python -u train.py --config=${yaml} \
#### Save and Load checkpoints

Our model saves model parameters as checkpoints in `./runs/wavenet/${ModelName}/checkpoint/` every 10000 iterations by default.
Saved checkpoints have the format `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.

There are three ways to load a checkpoint and resume training (suppose you want to load a checkpoint from iteration 500000); see the sketch after this list:

1. Use `--checkpoint=./runs/wavenet/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`; no extension name `.pdparams` or `.pdopt` is needed.
2. Use `--iteration=500000`.
3. If you specify neither `--checkpoint` nor `--iteration`, the model will automatically load the latest checkpoint in `./runs/wavenet/${ModelName}/checkpoint`.
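A minimal sketch of what these three options resolve to, based on the `load_parameters` helper shown later in this dump (`model` and the run directory are illustrative assumptions, not verbatim source):

```python
import utils  # the WaveNet utils module shown below

checkpoint_dir = "./runs/wavenet/my_model/checkpoint"  # hypothetical run dir
model = ...  # placeholder for the dygraph WaveNet module being trained

# 1. Explicit path: pass the base name, without .pdparams/.pdopt extension.
utils.load_parameters(checkpoint_dir, rank=0, model=model,
                      file_path=checkpoint_dir + "/step-500000")
# 2. Explicit iteration: the helper derives the step-500000 base name itself.
utils.load_parameters(checkpoint_dir, rank=0, model=model, iteration=500000)
# 3. Neither given: the helper falls back to the latest recorded checkpoint.
utils.load_parameters(checkpoint_dir, rank=0, model=model)
```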
...@@ -91,7 +91,7 @@ python -u synthesis.py --config=${yaml} \
    --root=./data/LJSpeech-1.1 \
    --name=${ModelName} --use_gpu=true \
    --output=./syn_audios \
    --sample=${SAMPLE}
```

In this example, `--output` specifies where to save the synthesized audios, and `--sample` specifies which sample in the valid dataset (a split from the whole LJSpeech dataset, which by default contains the first 16 audio samples) to synthesize, based on the mel-spectrograms computed from the ground-truth sample audio; e.g., `--sample=0` means synthesizing the first audio in the valid dataset.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import librosa
...@@ -18,8 +32,8 @@ class Dataset(ljspeech.LJSpeech):
        self.fft_window_shift = config.fft_window_shift
        # Calculate context frames.
        frames_per_second = config.sample_rate // self.fft_window_shift
        train_clip_frames = int(
            np.ceil(config.train_clip_second * frames_per_second))
        context_frames = config.context_size // self.fft_window_shift
        self.num_frames = train_clip_frames + context_frames
...@@ -32,7 +46,7 @@ class Dataset(ljspeech.LJSpeech):
        fft_window_shift = config.fft_window_shift
        fft_window_size = config.fft_window_size
        fft_size = config.fft_size

        audio, loaded_sr = librosa.load(wav_path, sr=None)
        assert loaded_sr == sr
...@@ -41,42 +55,46 @@ class Dataset(ljspeech.LJSpeech):
        fft_padding = (fft_size - fft_window_shift) // 2
        desired_length = frames * fft_window_shift + fft_padding * 2
        pad_amount = (desired_length - audio.size) // 2

        if audio.size % 2 == 0:
            audio = np.pad(audio, (pad_amount, pad_amount), mode='reflect')
        else:
            audio = np.pad(audio, (pad_amount, pad_amount + 1), mode='reflect')

        # Normalize audio.
        audio = audio / np.abs(audio).max() * 0.999

        # Compute mel-spectrogram.
        # Turn center to False to prevent internal padding.
        spectrogram = librosa.core.stft(
            audio,
            hop_length=fft_window_shift,
            win_length=fft_window_size,
            n_fft=fft_size,
            center=False)
        spectrogram_magnitude = np.abs(spectrogram)

        # Compute mel-spectrograms.
        mel_filter_bank = librosa.filters.mel(sr=sr,
                                              n_fft=fft_size,
                                              n_mels=config.mel_bands)
        mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude)
        mel_spectrogram = mel_spectrogram.T

        # Rescale mel_spectrogram.
        min_level, ref_level = 1e-5, 20
        mel_spectrogram = 20 * np.log10(np.maximum(min_level, mel_spectrogram))
        mel_spectrogram = mel_spectrogram - ref_level
        mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1)

        # Extract the center of audio that corresponds to mel spectrograms.
        audio = audio[fft_padding:-fft_padding]
        assert mel_spectrogram.shape[0] * fft_window_shift == audio.size

        return audio, mel_spectrogram
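# Rescale arithmetic, worked through (added note): a magnitude of 1.0 maps to
# 20 * log10(1.0) - 20 = -20 dB and then to (-20 + 100) / 100 = 0.8, while
# the floor 1e-5 maps to -100 - 20 = -120 dB and clips to 0.0, so mel values
# always land in [0, 1].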
class Subset(dataset.Dataset):
    def __init__(self, dataset, indices, valid):
        self.dataset = dataset
        self.indices = indices
...@@ -100,23 +118,23 @@ class Subset(dataset.Dataset):
        audio_start = frame_start * fft_window_shift
        audio_end = frame_end * fft_window_shift
        audio = audio[audio_start:audio_end]

        return audio, mel, audio_start
    def _batch_examples(self, batch):
        audios = [sample[0] for sample in batch]
        audio_starts = [sample[2] for sample in batch]

        # mels shape [num_frames, mel_bands]
        max_frames = max(sample[1].shape[0] for sample in batch)
        mels = [utils.pad_to_size(sample[1], max_frames) for sample in batch]

        audios = np.array(audios, dtype=np.float32)
        mels = np.array(mels, dtype=np.float32)
        audio_starts = np.array(audio_starts, dtype=np.int32)

        return audios, mels, audio_starts
    def __len__(self):
...@@ -138,17 +156,17 @@ class LJSpeech:
        # Train dataset.
        trainset = Subset(ds, train_indices, valid=False)
        sampler = DistributedSampler(len(trainset), nranks, rank)

        total_bs = config.batch_size
        assert total_bs % nranks == 0
        train_sampler = BatchSampler(
            sampler, total_bs // nranks, drop_last=True)

        trainloader = DataCargo(trainset, batch_sampler=train_sampler)
        trainreader = fluid.io.PyReader(capacity=50, return_list=True)
        trainreader.decorate_batch_generator(trainloader, place)
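        # Added note: iter(int, 1) calls int() forever (it returns 0, never
        # the sentinel 1), so the generator below re-opens the reader
        # endlessly, yielding batches for as many epochs as training needs.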
        self.trainloader = (data for _ in iter(int, 1)
                            for data in trainreader())
        # Valid dataset.
        validset = Subset(ds, valid_indices, valid=True)
...@@ -156,5 +174,5 @@ class LJSpeech:
        validloader = DataCargo(validset, batch_size=1, shuffle=False)
        validreader = fluid.io.PyReader(capacity=20, return_list=True)
        validreader.decorate_batch_generator(validloader, place)
        self.validloader = validreader
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" """
Utility module for restarting training when using SLURM. Utility module for restarting training when using SLURM.
""" """
...@@ -45,8 +58,8 @@ def parse_time(text): ...@@ -45,8 +58,8 @@ def parse_time(text):
try: try:
return parse_hours(hours) * 3600 + int(minutes) * 60 + int(seconds) return parse_hours(hours) * 3600 + int(minutes) * 60 + int(seconds)
except ValueError as e: except ValueError as e:
raise ValueError("Error parsing time {}. Got error {}.".format( raise ValueError("Error parsing time {}. Got error {}.".format(text,
text, str(e))) str(e)))
def restart_command():
...@@ -76,8 +89,10 @@ def restart_command():
    gres, partition = info.get("Gres"), info.get("Partition")
    stderr, stdout = info.get("StdErr"), info.get("StdOut")
    job_name = info.get("JobName")
    command = [
        "sbatch", "--job-name={}".format(job_name),
        "--ntasks={}".format(num_tasks)
    ]

    if partition:
        command.extend(["--partition", partition])
...@@ -98,12 +113,13 @@ def restart_command():
    dist_setting = ['-m', 'paddle.distributed.launch']
    wrap_cmd = ["srun", python, '-u'] + dist_setting + sys.argv

    command.append("--wrap={}".format(" ".join(
        shlex.quote(arg) for arg in wrap_cmd)))
    time_limit_string = info["TimeLimit"]
    if time_limit_string.lower() == "unlimited":
        print(
            "UNLIMITED detected: restart OFF, infinite learning ON.",
            flush=True)
        return command, None
    time_limit = parse_time(time_limit_string)
    runtime = parse_time(info["RunTime"])
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from pprint import pprint
...@@ -12,25 +26,42 @@ from wavenet import WaveNet


def add_options_to_parser(parser):
    parser.add_argument(
        '--model',
        type=str,
        default='wavenet',
        help="general name of the model")
    parser.add_argument(
        '--name', type=str, help="specific name of the training model")
    parser.add_argument(
        '--root', type=str, help="root path of the LJSpeech dataset")

    parser.add_argument(
        '--use_gpu',
        type=bool,
        default=True,
        help="option to use gpu training")

    parser.add_argument(
        '--iteration',
        type=int,
        default=None,
        help=("which iteration of checkpoint to load, "
              "default to load the latest checkpoint"))
    parser.add_argument(
        '--checkpoint',
        type=str,
        default=None,
        help="path of the checkpoint to load")

    parser.add_argument(
        '--output',
        type=str,
        default="./syn_audios",
        help="path to write synthesized audio files")
    parser.add_argument(
        '--sample',
        type=int,
        help="which of the valid samples to synthesize audio")
...@@ -52,7 +83,7 @@ def synthesize(config):
    fluid.default_startup_program().random_seed = seed
    fluid.default_main_program().random_seed = seed
    print("Random Seed: ", seed)

    # Build model.
    model = WaveNet(config, checkpoint_dir)
    model.build(training=False)
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import subprocess
...@@ -18,24 +32,42 @@ MAXIMUM_SAVE_TIME = 10 * 60


def add_options_to_parser(parser):
    parser.add_argument(
        '--model',
        type=str,
        default='wavenet',
        help="general name of the model")
    parser.add_argument(
        '--name', type=str, help="specific name of the training model")
    parser.add_argument(
        '--root', type=str, help="root path of the LJSpeech dataset")

    parser.add_argument(
        '--parallel',
        type=bool,
        default=True,
        help="option to use data parallel training")
    parser.add_argument(
        '--use_gpu',
        type=bool,
        default=True,
        help="option to use gpu training")

    parser.add_argument(
        '--iteration',
        type=int,
        default=None,
        help=("which iteration of checkpoint to load, "
              "default to load the latest checkpoint"))
    parser.add_argument(
        '--checkpoint',
        type=str,
        default=None,
        help="path of the checkpoint to load")

    parser.add_argument(
        '--slurm',
        type=bool,
        default=False,
        help="whether you are using slurm to submit training jobs")
...@@ -104,8 +136,8 @@ def train(config):
            # Check whether reaching the time limit.
            if config.slurm:
                done = (death_time is not None and
                        death_time - time.time() < MAXIMUM_SAVE_TIME)

                if rank == 0 and done:
                    print("Saving progress before exiting.")
...@@ -127,8 +159,8 @@ def train(config):
if __name__ == "__main__":
    # Create parser.
    parser = jsonargparse.ArgumentParser(
        description="Train WaveNet model", formatter_class='default_argparse')
    add_options_to_parser(parser)
    utils.add_config_options_to_parser(parser)
...@@ -136,4 +168,4 @@ if __name__ == "__main__":
    # For conflicting updates to the same field,
    # the preceding update will be overwritten by the following one.
    config = parser.parse_args()
    train(config)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import time
...@@ -8,57 +22,82 @@ import paddle.fluid.dygraph as dg


def add_config_options_to_parser(parser):
    parser.add_argument(
        '--valid_size', type=int, help="size of the valid dataset")
    parser.add_argument(
        '--train_clip_second',
        type=float,
        help="the length of audio clip for training")
    parser.add_argument(
        '--sample_rate', type=int, help="sampling rate of audio data file")
    parser.add_argument(
        '--fft_window_shift',
        type=int,
        help="the shift of fft window for each frame")
    parser.add_argument(
        '--fft_window_size',
        type=int,
        help="the size of fft window for each frame")
    parser.add_argument(
        '--fft_size', type=int, help="the size of fft filter on each frame")
    parser.add_argument(
        '--mel_bands',
        type=int,
        help="the number of mel bands when calculating mel spectrograms")

    parser.add_argument(
        '--seed', type=int, help="seed of random initialization for the model")
    parser.add_argument(
        '--batch_size', type=int, help="batch size for training")
    parser.add_argument(
        '--test_every', type=int, help="test interval during training")
    parser.add_argument(
        '--save_every',
        type=int,
        help="checkpointing interval during training")
    parser.add_argument(
        '--max_iterations', type=int, help="maximum training iterations")

    parser.add_argument(
        '--layers', type=int, help="number of dilated convolution layers")
    parser.add_argument(
        '--kernel_width', type=int, help="dilated convolution kernel width")
    parser.add_argument(
        '--dilation_block',
        type=list,
        help="the block of dilation factors cycled through to build the "
             "dilated convolution layers")
    parser.add_argument('--residual_channels', type=int)
    parser.add_argument('--skip_channels', type=int)
    parser.add_argument(
        '--loss_type', type=str, help="mix-gaussian-pdf or softmax")
    parser.add_argument(
        '--num_channels',
        type=int,
        default=None,
        help="number of channels for softmax output")
    parser.add_argument(
        '--num_mixtures',
        type=int,
        default=None,
        help="number of gaussian mixtures for gaussian output")
    parser.add_argument(
        '--log_scale_min',
        type=float,
        default=None,
        help="minimum clip value of log variance of gaussian output")

    parser.add_argument(
        '--conditioner.filter_sizes',
        type=list,
        help="conv2d transpose op filter sizes for building conditioner")
    parser.add_argument(
        '--conditioner.upsample_factors',
        type=list,
        help="list of upsample factors for building conditioner")

    parser.add_argument('--learning_rate', type=float)
    parser.add_argument('--gradient_max_norm', type=float)
    parser.add_argument(
        '--anneal.every',
        type=int,
        help="step interval for annealing learning rate")
    parser.add_argument('--anneal.rate', type=float)
...@@ -113,8 +152,12 @@ def save_latest_checkpoint(checkpoint_dir, iteration):
        handle.write("model_checkpoint_path: step-{}".format(iteration))
def load_parameters(checkpoint_dir,
                    rank,
                    model,
                    optimizer=None,
                    iteration=None,
                    file_path=None):
    if file_path is None:
        if iteration is None:
            iteration = load_latest_checkpoint(checkpoint_dir, rank)
...@@ -128,7 +171,7 @@ def load_parameters(checkpoint_dir, rank, model, optimizer=None,
    if optimizer and optimizer_dict:
        optimizer.set_dict(optimizer_dict)
        print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
            rank, file_path))


def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None):
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import time
...@@ -13,8 +27,13 @@ from wavenet_modules import WaveNetModule


class WaveNet():
    def __init__(self,
                 config,
                 checkpoint_dir,
                 parallel=False,
                 rank=0,
                 nranks=1,
                 tb_logger=None):
        # Process config to calculate the context size
        dilations = list(
            itertools.islice(
...@@ -29,12 +48,12 @@ class WaveNet():
    def build(self, training=True):
        config = self.config
        dataset = LJSpeech(config, self.nranks, self.rank)
        self.trainloader = dataset.trainloader
        self.validloader = dataset.validloader

        wavenet = WaveNetModule("wavenet", config, self.rank)

        # Dry run once to create and initialize all necessary parameters.
        audio = dg.to_variable(np.random.randn(1, 20000).astype(np.float32))
        mel = dg.to_variable(
...@@ -45,38 +64,44 @@ class WaveNet():
        if training:
            # Create Learning rate scheduler.
            lr_scheduler = dg.ExponentialDecay(
                learning_rate=config.learning_rate,
                decay_steps=config.anneal.every,
                decay_rate=config.anneal.rate,
                staircase=True)

            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=lr_scheduler)

            clipper = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                config.gradient_max_norm)

            # Load parameters.
            utils.load_parameters(
                self.checkpoint_dir,
                self.rank,
                wavenet,
                optimizer,
                iteration=config.iteration,
                file_path=config.checkpoint)
            print("Rank {}: checkpoint loaded.".format(self.rank))

            # Data parallelism.
            if self.parallel:
                strategy = dg.parallel.prepare_context()
                wavenet = dg.parallel.DataParallel(wavenet, strategy)

            self.wavenet = wavenet
            self.optimizer = optimizer
            self.clipper = clipper
        else:
            # Load parameters.
            utils.load_parameters(
                self.checkpoint_dir,
                self.rank,
                wavenet,
                iteration=config.iteration,
                file_path=config.checkpoint)
            print("Rank {}: checkpoint loaded.".format(self.rank))

            self.wavenet = wavenet
...@@ -104,7 +129,9 @@ class WaveNet():
        else:
            current_lr = self.optimizer._learning_rate

        self.optimizer.minimize(
            loss,
            grad_clip=self.clipper,
            parameter_list=self.wavenet.parameters())
        self.wavenet.clear_gradients()
...@@ -143,10 +170,16 @@ class WaveNet():
        tb = self.tb_logger
        tb.add_scalar("Valid-Avg-Loss", loss_val, iteration)
        tb.add_audio(
            "Teacher-Forced-Audio-0",
            sample_audios[0].numpy(),
            iteration,
            sample_rate=self.config.sample_rate)
        tb.add_audio(
            "Teacher-Forced-Audio-1",
            sample_audios[1].numpy(),
            iteration,
            sample_rate=self.config.sample_rate)
    @dg.no_grad
    def infer(self, iteration):
...@@ -165,10 +198,9 @@ class WaveNet():
        start_time = time.time()
        syn_audio = self.wavenet.synthesize(mels_list[sample])
        syn_time = time.time() - start_time
        print("audio shape {}, synthesis time {}".format(syn_audio.shape,
                                                         syn_time))
        librosa.output.write_wav(filename, syn_audio, sr=config.sample_rate)

    def save(self, iteration):
        utils.save_latest_parameters(self.checkpoint_dir, iteration,
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import numpy as np
...@@ -16,11 +30,11 @@ def get_padding(filter_size, stride, padding_type='same'):
def extract_slices(x, audio_starts, audio_length, rank):
    slices = []
    for i in range(x.shape[0]):
        start = audio_starts.numpy()[i]
        end = start + audio_length
        slice = fluid.layers.slice(
            x, axes=[0, 1], starts=[i, start], ends=[i + 1, end])
        slices.append(fluid.layers.squeeze(slice, [0]))

    x = fluid.layers.stack(slices, axis=0)
...@@ -50,7 +64,7 @@ class Conditioner(dg.Layer):
        # Register python list as parameters.
        for i, layer in enumerate(self.deconvs):
            self.add_sublayer("conv_transpose_{}".format(i), layer)

    def forward(self, x):
        x = fluid.layers.unsqueeze(x, 1)
        for layer in self.deconvs:
...@@ -62,7 +76,7 @@ class Conditioner(dg.Layer):
class WaveNetModule(dg.Layer):
    def __init__(self, name_scope, config, rank):
        super(WaveNetModule, self).__init__(name_scope)

        self.rank = rank
        self.conditioner = Conditioner(self.full_name(), config)
        self.dilations = list(
...@@ -82,15 +96,13 @@ class WaveNetModule(dg.Layer):
                embed_dim=config.residual_channels,
                std=0.1)
        elif config.loss_type == "mix-gaussian-pdf":
            self.embedding_fc = modules.FC(self.full_name(),
                                           in_features=1,
                                           size=config.residual_channels,
                                           num_flatten_dims=2,
                                           relu=False)
        else:
            raise ValueError(
                "loss_type {} is unsupported!".format(config.loss_type))
        self.dilated_causal_convs = []
        for dilation in self.dilations:
...@@ -102,56 +114,49 @@ class WaveNetModule(dg.Layer):
                    num_filters=config.residual_channels,
                    filter_size=config.kernel_width,
                    dilation=dilation,
                    causal=True))

        for i, layer in enumerate(self.dilated_causal_convs):
            self.add_sublayer("dilated_causal_conv_{}".format(i), layer)
        self.fc1 = modules.FC(self.full_name(),
                              in_features=config.residual_channels,
                              size=config.skip_channels,
                              num_flatten_dims=2,
                              relu=True,
                              act="relu")

        self.fc2 = modules.FC(self.full_name(),
                              in_features=config.skip_channels,
                              size=config.skip_channels,
                              num_flatten_dims=2,
                              relu=True,
                              act="relu")

        if config.loss_type == "softmax":
            self.fc3 = modules.FC(self.full_name(),
                                  in_features=config.skip_channels,
                                  size=config.num_channels,
                                  num_flatten_dims=2,
                                  relu=False)
        elif config.loss_type == "mix-gaussian-pdf":
            self.fc3 = modules.FC(self.full_name(),
                                  in_features=config.skip_channels,
                                  size=3 * config.num_mixtures,
                                  num_flatten_dims=2,
                                  relu=False)
        else:
            raise ValueError(
                "loss_type {} is unsupported!".format(config.loss_type))
    def sample_softmax(self, mix_parameters):
        batch, length, hidden = mix_parameters.shape
        mix_param_2d = fluid.layers.reshape(mix_parameters,
                                            [batch * length, hidden])
        mix_param_2d = fluid.layers.softmax(mix_param_2d, axis=-1)

        # quantized: [batch * length]
        quantized = fluid.layers.cast(
            fluid.layers.sampling_id(mix_param_2d), dtype="float32")
        samples = (quantized + 0.5) * (2.0 / self.config.num_channels) - 1.0

        # samples: [batch * length]
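        # Dequantization, worked through (added note): with num_channels=256,
        # bin 0 maps to 0.5 * (2 / 256) - 1 ≈ -0.996 and bin 255 maps to
        # 255.5 * (2 / 256) - 1 ≈ 0.996, i.e. evenly spaced bin centers
        # inside (-1, 1).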
...@@ -162,23 +167,23 @@ class WaveNetModule(dg.Layer):
        # to [bs * len, 3 * num_mixtures].
        batch, length, hidden = mix_parameters.shape
        mix_param_2d = fluid.layers.reshape(mix_parameters,
                                            [batch * length, hidden])
        K = hidden // 3

        # Unpack the parameters of the mixture of gaussian.
        logits_pi = mix_param_2d[:, 0:K]
        mu = mix_param_2d[:, K:2 * K]
        log_s = mix_param_2d[:, 2 * K:3 * K]
        s = fluid.layers.exp(log_s)

        pi = fluid.layers.softmax(logits_pi, axis=-1)
        comp_samples = fluid.layers.sampling_id(pi)

        row_idx = dg.to_variable(np.arange(batch * length))
        comp_samples = fluid.layers.stack([row_idx, comp_samples], axis=-1)

        mu_comp = fluid.layers.gather_nd(mu, comp_samples)
        s_comp = fluid.layers.gather_nd(s, comp_samples)

        # N(0, 1) normal sample.
        u = fluid.layers.gaussian_random(shape=[batch * length])
...@@ -220,8 +225,9 @@ class WaveNetModule(dg.Layer):
        # Calculate gaussian loss.
        targets = fluid.layers.unsqueeze(targets, -1)
        targets = fluid.layers.expand(targets,
                                      [1, 1, self.config.num_mixtures])
        x_std = inv_s * (targets - mu)
        exponent = fluid.layers.exp(-0.5 * x_std * x_std)
        pdf_x = 1.0 / np.sqrt(2.0 * np.pi) * inv_s * exponent
        pdf_x = pi * pdf_x
...@@ -239,9 +245,9 @@ class WaveNetModule(dg.Layer):
        # Slice conditioners.
        audio_length = audios.shape[1]
        conditioner = extract_slices(full_conditioner, audio_starts,
                                     audio_length, self.rank)

        # input_audio, target_audio: [bs, len]
        input_audios = audios[:, :-1]
        target_audios = audios[:, 1:]
...@@ -263,15 +269,16 @@ class WaveNetModule(dg.Layer):
            layer_input = self.embedding_fc(
                fluid.layers.unsqueeze(input_audios, 2))
        else:
            raise ValueError("loss_type {} is unsupported!".format(loss_type))

        # layer_input: [bs, res_channel, 1, len]
        layer_input = fluid.layers.unsqueeze(
            fluid.layers.transpose(
                layer_input, perm=[0, 2, 1]), 2)
        # conditioner: [bs, mel_bands, 1, len]
        conditioner = fluid.layers.unsqueeze(
            fluid.layers.transpose(
                conditioner, perm=[0, 2, 1]), 2)

        skip = None
        for i, layer in enumerate(self.dilated_causal_convs):
@@ -292,23 +299,22 @@ class WaveNetModule(dg.Layer):
        elif loss_type == "mix-gaussian-pdf":
            sample_audios = self.sample_mix_gaussian(mix_parameters)
        else:
            raise ValueError("loss_type {} is unsupported!".format(
                loss_type))

        if loss_type == "softmax":
            loss = self.softmax_loss(target_audios, mix_parameters)
        elif loss_type == "mix-gaussian-pdf":
            loss = self.mixture_density_loss(target_audios, mix_parameters,
                                             self.log_scale_min)
        else:
            raise ValueError("loss_type {} is unsupported!".format(loss_type))

        return loss, sample_audios
    def synthesize(self, mels):
        self.start_new_sequence()

        bs, n_frames, mel_bands = mels.shape
        conditioner = self.conditioner(mels)
        time_steps = conditioner.shape[1]
@@ -335,23 +341,24 @@ class WaveNetModule(dg.Layer):
            elif loss_type == "mix-gaussian-pdf":
                audio_input = self.embedding_fc(current_sample)
            else:
                raise ValueError("loss_type {} is unsupported!".format(
                    loss_type))

            # [bs, channel, 1, 1]
            audio_input = fluid.layers.unsqueeze(
                fluid.layers.transpose(
                    audio_input, perm=[0, 2, 1]), 2)
            # [bs, mel_bands]
            cond_input = conditioner[:, i, :]
            # [bs, mel_bands, 1, 1]
            cond_input = fluid.layers.reshape(cond_input,
                                              cond_input.shape + [1, 1])

            skip = None
            for layer in self.dilated_causal_convs:
                audio_input, skip = layer.add_input(audio_input, skip,
                                                    cond_input)

            # [bs, 1, channel]
            skip = fluid.layers.transpose(
                fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
@@ -361,19 +368,19 @@ class WaveNetModule(dg.Layer):
            elif loss_type == "mix-gaussian-pdf":
                sample = self.sample_mix_gaussian(mix_parameters)
            else:
                raise ValueError("loss_type {} is unsupported!".format(
                    loss_type))

            audio_samples.append(sample)
            # [bs]
            current_sample = audio_samples[-1]
            # [bs, 1, 1]
            current_sample = fluid.layers.reshape(
                current_sample, current_sample.shape + [1, 1])

        # syn_audio: [num_samples]
        syn_audio = fluid.layers.concat(audio_samples, axis=0).numpy()
        return syn_audio
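Structurally, synthesize is a per-sample autoregressive loop: condition on frame t, run one forward step, sample, and feed the sample back. A minimal schematic with hypothetical step/sample helpers (not the actual API of this module):

```python
import numpy as np

def synthesize_sketch(step_fn, sample_fn, conditioner, time_steps, batch_size):
    # step_fn and sample_fn are hypothetical stand-ins for one forward
    # step of the network and for drawing from its output distribution.
    audio_samples = []
    current = np.zeros(batch_size, dtype="float32")   # start from silence
    for t in range(time_steps):
        params = step_fn(current, conditioner[:, t])  # one-step forward
        current = sample_fn(params)                   # next sample, fed back
        audio_samples.append(current)
    return np.stack(audio_samples, axis=-1)           # [batch, time_steps]
```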
    def start_new_sequence(self):
        for layer in self.sublayers():
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import weight_norm
from .customized import *
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import fluid
import paddle.fluid.layers as F
import paddle.fluid.dygraph as dg
@@ -7,14 +21,15 @@ class Pool1D(dg.Layer):
""" """
A Pool 1D block implemented with Pool2D. A Pool 1D block implemented with Pool2D.
""" """
def __init__(self, def __init__(self,
pool_size=-1, pool_size=-1,
pool_type='max', pool_type='max',
pool_stride=1, pool_stride=1,
pool_padding=0, pool_padding=0,
global_pooling=False, global_pooling=False,
use_cudnn=True, use_cudnn=True,
ceil_mode=False, ceil_mode=False,
exclusive=True, exclusive=True,
data_format='NCT'): data_format='NCT'):
super(Pool1D, self).__init__() super(Pool1D, self).__init__()
@@ -28,13 +43,16 @@ class Pool1D(dg.Layer):
        self.exclusive = exclusive
        self.data_format = data_format

        self.pool2d = dg.Pool2D(
            [1, pool_size],
            pool_type=pool_type,
            pool_stride=[1, pool_stride],
            pool_padding=[0, pool_padding],
            global_pooling=global_pooling,
            use_cudnn=use_cudnn,
            ceil_mode=ceil_mode,
            exclusive=exclusive)
    def forward(self, x):
        """
        Args:
@@ -53,12 +71,14 @@ class Pool1D(dg.Layer):
        x = fluid.layers.transpose(x, [0, 2, 1])
        return x
class Conv1D(dg.Conv2D):
    """A standard Conv1D layer that uses the (B, C, T) data layout. It inherits
    from Conv2D and uses the (B, C, 1, T) layout internally to compute the 1D
    convolution. Nothing more.
    NOTE: we inherit from Conv2D instead of wrapping a Conv2D layer so that this
    stays a simple layer and weight norm can easily be applied to it.
    """
    def __init__(self,
                 num_channels,
                 num_filters,
@@ -72,17 +92,18 @@ class Conv1D(dg.Conv2D):
                 use_cudnn=True,
                 act=None,
                 dtype='float32'):
        super(Conv1D, self).__init__(
            num_channels,
            num_filters, (1, filter_size),
            stride=(1, stride),
            padding=(0, padding),
            dilation=(1, dilation),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)
    def forward(self, x):
        x = F.unsqueeze(x, [2])
@@ -105,18 +126,19 @@ class Conv1DTranspose(dg.Conv2DTranspose):
                 use_cudnn=True,
                 act=None,
                 dtype='float32'):
        super(Conv1DTranspose, self).__init__(
            num_channels,
            num_filters, (1, filter_size),
            output_size=None,
            padding=(0, padding),
            stride=(1, stride),
            dilation=(1, dilation),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)
    def forward(self, x):
        x = F.unsqueeze(x, [2])
@@ -134,6 +156,7 @@ class Conv1DCell(Conv1D):
    It is a cell that acts like an RNN cell. It does not support stride > 1, and it
    ensures a 1-to-1 mapping from input time steps to output time steps.
    """
    def __init__(self,
                 num_channels,
                 num_filters,
@@ -150,18 +173,19 @@ class Conv1DCell(Conv1D):
        padding = receptive_field - 1 if causal else receptive_field // 2
        self._receptive_field = receptive_field
        self.causal = causal
        super(Conv1DCell, self).__init__(
            num_channels,
            num_filters,
            filter_size,
            stride=1,
            padding=padding,
            dilation=dilation,
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)
    def forward(self, x):
        # ensures that output time steps == input time steps
@@ -189,15 +213,16 @@ class Conv1DCell(Conv1D):
    def add_input(self, x_t):
        batch_size, c_in, _ = x_t.shape
        if self._buffer is None:
            self._buffer = F.zeros(
                (batch_size, c_in, self.receptive_field), dtype=x_t.dtype)
        self._buffer = F.concat([self._buffer[:, :, 1:], x_t], -1)
        if self._dilation[1] > 1:
            input = F.strided_slice(
                self._buffer,
                axes=[2],
                starts=[0],
                ends=[self.receptive_field],
                strides=[self._dilation[1]])
        else:
            input = self._buffer
        input = F.reshape(input, (batch_size, -1))
...
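A usage sketch for the wrappers above; the (B, C, T) layout and the shapes are assumptions consistent with the docstrings:

```python
import numpy as np
import paddle.fluid.dygraph as dg

with dg.guard():
    conv = Conv1D(num_channels=8, num_filters=16, filter_size=3, padding=1)
    x = dg.to_variable(np.random.randn(2, 8, 100).astype("float32"))
    y = conv(x)  # [2, 16, 100]: padding=1 keeps the time dimension

    # Conv1DCell additionally supports incremental decoding: add_input()
    # consumes one (B, C, 1) frame at a time while the cell maintains a
    # buffer of the last `receptive_field` inputs, as shown in add_input.
```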
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers


class DynamicGRU(dg.Layer):
    def __init__(self,
                 size,
@@ -49,4 +63,3 @@ class DynamicGRU(dg.Layer):
        res = res[::-1]
        res = layers.concat(res, axis=1)
        return res
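The tail of DynamicGRU above handles the backward direction: per-step outputs collected in reversed order are flipped back and concatenated along time. In NumPy terms, a sketch of the bookkeeping only:

```python
import numpy as np

res = [np.full((2, 1, 8), t) for t in range(5)]  # one [B, 1, C] slice per step
res = res[::-1]                                  # undo the reversed iteration
out = np.concatenate(res, axis=1)                # [B, T, C], time order restored
```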
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
@@ -7,28 +20,41 @@ from parakeet.modules.customized import Conv1D
class PositionwiseFeedForward(dg.Layer):
    ''' A two-feed-forward-layer module '''

    def __init__(self,
                 d_in,
                 num_hidden,
                 filter_size,
                 padding=0,
                 use_cudnn=True,
                 dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn
        self.dropout = dropout

        k = math.sqrt(1 / d_in)
        self.w_1 = Conv1D(
            num_channels=d_in,
            num_filters=num_hidden,
            filter_size=filter_size,
            padding=padding,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)),
            use_cudnn=use_cudnn)
        k = math.sqrt(1 / num_hidden)
        self.w_2 = Conv1D(
            num_channels=num_hidden,
            num_filters=d_in,
            filter_size=filter_size,
            padding=padding,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)),
            use_cudnn=use_cudnn)
        self.layer_norm = dg.LayerNorm(d_in)

    def forward(self, input):
@@ -40,18 +66,18 @@ class PositionwiseFeedForward(dg.Layer):
        Returns:
            output (Variable), Shape(B, T, C), the result after FFN.
        """
        x = layers.transpose(input, [0, 2, 1])

        # FFN network
        x = self.w_2(layers.relu(self.w_1(x)))

        # dropout
        x = layers.dropout(x, self.dropout)

        x = layers.transpose(x, [0, 2, 1])
        # residual connection
        x = x + input

        # layer normalization
        output = self.layer_norm(x)
        return output
\ No newline at end of file
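A usage sketch: the block maps (B, T, C) to (B, T, C), so it can sit between attention layers; the sizes here are hypothetical:

```python
import numpy as np
import paddle.fluid.dygraph as dg

with dg.guard():
    ffn = PositionwiseFeedForward(d_in=256, num_hidden=1024, filter_size=1)
    x = dg.to_variable(np.random.randn(2, 50, 256).astype("float32"))
    y = ffn(x)  # same shape as x: conv -> relu -> conv, residual, layer norm
```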
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np

import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers


class Linear(dg.Layer):
    def __init__(self,
                 in_features,
                 out_features,
                 is_bias=True,
                 dtype="float32"):
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.dtype = dtype
        self.weight = fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer())
        self.bias = is_bias

        if is_bias is not False:
            k = math.sqrt(1 / in_features)
            self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k))

        self.linear = dg.Linear(
            in_features,
            out_features,
            param_attr=self.weight,
            bias_attr=self.bias, )

    def forward(self, x):
        x = self.linear(x)
        return x
class ScaledDotProductAttention(dg.Layer):
    def __init__(self, d_key):
        super(ScaledDotProductAttention, self).__init__()

        self.d_key = d_key

    # note: this mask differs from the one in the PyTorch implementation
    def forward(self,
                key,
                value,
                query,
                mask=None,
                query_mask=None,
                dropout=0.1):
""" """
Scaled Dot Product Attention. Scaled Dot Product Attention.
@@ -47,27 +77,36 @@ class ScaledDotProductAttention(dg.Layer):
            attention (Variable), Shape(n_head * B, T, C), the attention of key.
        """
        # Compute attention score
        attention = layers.matmul(
            query, key, transpose_y=True)  # transpose the last dim in y
        attention = attention / math.sqrt(self.d_key)

        # Mask key to ignore padding
        if mask is not None:
            attention = attention * mask
            mask = (mask == 0).astype(np.float32) * (-2**32 + 1)
            attention = attention + mask

        attention = layers.softmax(attention)
        attention = layers.dropout(attention, dropout)

        # Mask query to ignore padding
        if query_mask is not None:
            attention = attention * query_mask

        result = layers.matmul(attention, value)
        return result, attention
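The additive masking above relies on softmax sending very negative scores to (near) zero weight. The trick in isolation, as a NumPy sketch:

```python
import numpy as np

scores = np.array([[2.0, 1.0, 0.5]])
mask = np.array([[1.0, 1.0, 0.0]])  # last key position is padding
masked = scores * mask + (mask == 0).astype(np.float32) * (-2**32 + 1)
weights = np.exp(masked) / np.exp(masked).sum(axis=-1, keepdims=True)
# weights[0, 2] underflows to 0.0: the padded key gets no attention
```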
class MultiheadAttention(dg.Layer):
    def __init__(self,
                 num_hidden,
                 d_k,
                 d_q,
                 num_head=4,
                 is_bias=False,
                 dropout=0.1,
                 is_concat=True):
        super(MultiheadAttention, self).__init__()
        self.num_hidden = num_hidden
        self.num_head = num_head
@@ -109,30 +148,44 @@ class MultiheadAttention(dg.Layer):
        # repeat masks h times
        if query_mask is not None:
            query_mask = layers.expand(query_mask,
                                       [self.num_head, 1, seq_len_key])
        if mask is not None:
            mask = layers.expand(mask, (self.num_head, 1, 1))

        # Make multihead attention
        # key & value.shape = (batch_size, seq_len, feature)
        # (feature = num_head * num_hidden_per_attn)
        key = layers.reshape(
            self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k])
        value = layers.reshape(
            self.value(value),
            [batch_size, seq_len_key, self.num_head, self.d_k])
        query = layers.reshape(
            self.query(query_input),
            [batch_size, seq_len_query, self.num_head, self.d_q])

        key = layers.reshape(
            layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
        value = layers.reshape(
            layers.transpose(value, [2, 0, 1, 3]),
            [-1, seq_len_key, self.d_k])
        query = layers.reshape(
            layers.transpose(query, [2, 0, 1, 3]),
            [-1, seq_len_query, self.d_q])

        result, attention = self.scal_attn(
            key, value, query, mask=mask, query_mask=query_mask)

        # concat all multihead result
        result = layers.reshape(
            result, [self.num_head, batch_size, seq_len_query, self.d_q])
        result = layers.reshape(
            layers.transpose(result, [1, 2, 0, 3]),
            [batch_size, seq_len_query, -1])
        if self.is_concat:
            result = layers.concat([query_input, result], axis=-1)
        result = layers.dropout(self.fc(result), self.dropout)
        result = result + query_input

        result = self.layer_norm(result)
        return result, attention
\ No newline at end of file
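A self-attention usage sketch; the (key, value, query) argument order follows the scaled-dot module above, and num_hidden = num_head * d_k is assumed so the reshapes line up (both are assumptions, since the forward signature is elided in this diff):

```python
import numpy as np
import paddle.fluid.dygraph as dg

with dg.guard():
    attn = MultiheadAttention(num_hidden=256, d_k=64, d_q=64, num_head=4)
    x = dg.to_variable(np.random.randn(2, 50, 256).astype("float32"))
    result, attention = attn(x, x, x)  # self-attention: key = value = query
```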
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from torch import nn
import paddle.fluid.dygraph as dg
@@ -10,8 +24,8 @@ def summary(layer):
print("{}|{}|{}".format(name, param.shape, np.prod(param.shape))) print("{}|{}|{}".format(name, param.shape, np.prod(param.shape)))
num_elements += np.prod(param.shape) num_elements += np.prod(param.shape)
num_params += 1 num_params += 1
print("layer has {} parameters, {} elements.".format( print("layer has {} parameters, {} elements.".format(num_params,
num_params, num_elements)) num_elements))
def freeze(layer): def freeze(layer):
@@ -31,5 +45,5 @@ def torch_summary(layer):
print("{}|{}|{}".format(name, param.shape, np.prod(param.shape))) print("{}|{}|{}".format(name, param.shape, np.prod(param.shape)))
num_elements += np.prod(param.shape) num_elements += np.prod(param.shape)
num_params += 1 num_params += 1
print("layer has {} parameters, {} elements.".format( print("layer has {} parameters, {} elements.".format(num_params,
num_params, num_elements)) num_elements))
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import io
import re
from setuptools import setup, find_packages
def read(*names, **kwargs):
    with io.open(
            os.path.join(os.path.dirname(__file__), *names),
            encoding=kwargs.get("encoding", "utf8")) as fp:
        return fp.read()
@@ -19,6 +33,7 @@ def find_version(*file_paths):
        return version_match.group(1)
    raise RuntimeError("Unable to find version string.")


VERSION = find_version('parakeet', '__init__.py')
long_description = read('README.md')
@@ -32,17 +47,26 @@ setup_info = dict(
    description='Speech synthesis tools and models based on Paddlepaddle',
    long_description=long_description,
    license='Apache 2',
    install_requires=[
        'numpy',
        'nltk',
        'inflect',
        'librosa',
        'unidecode',
        'numba',
        'tqdm',
        'matplotlib',
        'tensorboardX',
        'tensorboard',
        'scipy',
        'ruamel.yaml',
        'pandas',
        'sox',
        'soundfile',
    ],

    # Package info
    packages=find_packages(exclude=('tests', 'tests.*')),
    zip_safe=True, )

setup(**setup_info)
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.datasets.ljspeech import LJSpeech
from parakeet.data.datacargo import DataCargo
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.datasets import vctk
from pathlib import Path
from parakeet.data.datacargo import DataCargo

root = Path("/workspace/datasets/VCTK-Corpus")
vctk_dataset = vctk.VCTK(root)
vctk_cargo = DataCargo(
    vctk_dataset, batch_size=16, shuffle=True, drop_last=True)

for i, batch in enumerate(vctk_cargo):
    print(i)
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import io, re
import sys, os
import subprocess
import platform

COPYRIGHT = '''
Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

LANG_COMMENT_MARK = None

NEW_LINE_MARK = None

COPYRIGHT_HEADER = None

if platform.system() == "Windows":
    NEW_LINE_MARK = "\r\n"
else:
    NEW_LINE_MARK = '\n'
    COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
    p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
    process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
    date, err = process.communicate()
    date = date.decode("utf-8").rstrip("\n")
    COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
def generate_copyright(template, lang='C'):
    if lang == 'Python':
        LANG_COMMENT_MARK = '#'
    else:
        LANG_COMMENT_MARK = "//"

    lines = template.split(NEW_LINE_MARK)
    BLANK = " "
    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
    for lino, line in enumerate(lines):
        if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
        if len(line) == 0:
            BLANK = ""
        else:
            BLANK = " "
        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK

    return ans + "\n"
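For a Python file, generate_copyright prefixes each kept line of COPYRIGHT with '#', producing exactly the kind of banner seen at the top of the files in this merge request; for C-family files it uses '//'. A quick sanity check (sketch):

```python
banner = generate_copyright(COPYRIGHT, lang='Python')
assert banner.splitlines()[0].startswith("# Copyright (c)")
assert all(line.startswith("#") for line in banner.splitlines() if line)
```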
def lang_type(filename):
    if filename.endswith(".py"):
        return "Python"
    elif filename.endswith(".h"):
        return "C"
    elif filename.endswith(".c"):
        return "C"
    elif filename.endswith(".hpp"):
        return "C"
    elif filename.endswith(".cc"):
        return "C"
    elif filename.endswith(".cpp"):
        return "C"
    elif filename.endswith(".cu"):
        return "C"
    elif filename.endswith(".cuh"):
        return "C"
    elif filename.endswith(".go"):
        return "C"
    elif filename.endswith(".proto"):
        return "C"
    else:
        print("Unsupported filetype %s" % filename)
        exit(0)
PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")


def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Checker for copyright declaration.')
    parser.add_argument('filenames', nargs='*', help='Filenames to check')
    args = parser.parse_args(argv)

    retv = 0
    for filename in args.filenames:
        fd = io.open(filename, encoding="utf-8")
        first_line = fd.readline()
        second_line = fd.readline()
        if "COPYRIGHT (C)" in first_line.upper(): continue
        if first_line.startswith("#!") or PYTHON_ENCODE.match(
                second_line) != None or PYTHON_ENCODE.match(first_line) != None:
            continue
        original_contents = io.open(filename, encoding="utf-8").read()
        new_contents = generate_copyright(
            COPYRIGHT, lang_type(filename)) + original_contents
        print('Auto Insert Copyright Header {}'.format(filename))
        retv = 1
        with io.open(filename, 'w') as output_file:
            output_file.write(new_contents)

    return retv


if __name__ == '__main__':
    exit(main())