diff --git a/examples/deepvoice3/ljspeech.yaml b/examples/deepvoice3/ljspeech.yaml index 338dd0dbbb965ae185d61c29ed3d080fdb77114f..bd17d2ecd175da75926d552178015ce1fee357c8 100644 --- a/examples/deepvoice3/ljspeech.yaml +++ b/examples/deepvoice3/ljspeech.yaml @@ -21,6 +21,7 @@ transform: # db scale min_level_db: -100 ref_level_db: 20 + clip_norm: true loss: @@ -48,20 +49,20 @@ model: embedding_weight_std: 0.1 freeze_embedding: false padding_idx: 0 - encoder_channels: 256 + encoder_channels: 512 # decoder query_position_rate: 1.0 key_position_rate: 1.29 trainable_positional_encodings: false kernel_size: 3 - decoder_channels: 512 + decoder_channels: 256 downsample_factor: 4 outputs_per_step: 1 # attention - key_position_rate: true - value_position_rate: true + key_projection: true + value_projection: true force_monotonic_attention: true window_backward: -1 window_ahead: 3 @@ -88,16 +89,3 @@ train: snap_interval: 1000 eval_interval: 10000 save_interval: 10000 - - - - - - - - - - - - - diff --git a/examples/deepvoice3/synthesis.py b/examples/deepvoice3/synthesis.py index 9841da437cb754855f7f1cf0c9d2ed0a36630f98..303c1827428088dec368caa3131815f03439b18a 100644 --- a/examples/deepvoice3/synthesis.py +++ b/examples/deepvoice3/synthesis.py @@ -1,6 +1,6 @@ import os import argparse -import ruamel.yamls +import ruamel.yaml import numpy as np import soundfile as sf @@ -22,6 +22,11 @@ if __name__ == "__main__": parser.add_argument("checkpoint", type=str, help="checkpoint to load.") parser.add_argument("text", type=str, help="text file to synthesize") parser.add_argument("output_path", type=str, help="path to save results") + parser.add_argument("-g", + "--device", + type=int, + default=-1, + help="device to use") args = parser.parse_args() with open(args.config, 'rt') as f: @@ -67,7 +72,7 @@ if __name__ == "__main__": use_memory_mask = model_config["use_memory_mask"] query_position_rate = model_config["query_position_rate"] key_position_rate = model_config["key_position_rate"] - window_behind = model_config["window_behind"] + window_backward = model_config["window_backward"] window_ahead = model_config["window_ahead"] key_projection = model_config["key_projection"] value_projection = model_config["value_projection"] @@ -76,11 +81,12 @@ if __name__ == "__main__": freeze_embedding, filter_size, encoder_channels, n_mels, decoder_channels, r, trainable_positional_encodings, use_memory_mask, - query_position_rate, key_position_rate, window_behind, - window_ahead, key_projection, value_projection, - downsample_factor, linear_dim, use_decoder_states, - converter_channels, dropout) + query_position_rate, key_position_rate, + window_backward, window_ahead, key_projection, + value_projection, downsample_factor, linear_dim, + use_decoder_states, converter_channels, dropout) + summary(dv3) state, _ = dg.load_dygraph(args.checkpoint) dv3.set_dict(state) diff --git a/examples/deepvoice3/train.py b/examples/deepvoice3/train.py index 3bd9052aafefd00debf8165c2c863192df8b5909..0c7740240171ecd7c8cc817476ca5ad7aa1f51fa 100644 --- a/examples/deepvoice3/train.py +++ b/examples/deepvoice3/train.py @@ -1,6 +1,6 @@ import os import argparse -import ruamel.yamls +import ruamel.yaml import numpy as np from matplotlib import cm import matplotlib.pyplot as plt @@ -15,10 +15,9 @@ import paddle.fluid.layers as F import paddle.fluid.dygraph as dg from parakeet.g2p import en -from parakeet.models.deepvoice3.encoder import ConvSpec from parakeet.data import FilterDataset, TransformDataset, FilterDataset from parakeet.data import DataCargo, PartialyRandomizedSimilarTimeLengthSampler, SequentialSampler -from parakeet.models.deepvoice3 import Encoder, Decoder, Converter, DeepVoice3 +from parakeet.models.deepvoice3 import Encoder, Decoder, Converter, DeepVoice3, ConvSpec from parakeet.models.deepvoice3.loss import TTSLoss from parakeet.utils.layer_tools import summary @@ -128,7 +127,7 @@ if __name__ == "__main__": use_memory_mask = model_config["use_memory_mask"] query_position_rate = model_config["query_position_rate"] key_position_rate = model_config["key_position_rate"] - window_behind = model_config["window_behind"] + window_backward = model_config["window_backward"] window_ahead = model_config["window_ahead"] key_projection = model_config["key_projection"] value_projection = model_config["value_projection"] @@ -137,10 +136,10 @@ if __name__ == "__main__": freeze_embedding, filter_size, encoder_channels, n_mels, decoder_channels, r, trainable_positional_encodings, use_memory_mask, - query_position_rate, key_position_rate, window_behind, - window_ahead, key_projection, value_projection, - downsample_factor, linear_dim, use_decoder_states, - converter_channels, dropout) + query_position_rate, key_position_rate, + window_backward, window_ahead, key_projection, + value_projection, downsample_factor, linear_dim, + use_decoder_states, converter_channels, dropout) # =========================loss========================= loss_config = config["loss"] diff --git a/parakeet/data/__init__.py b/parakeet/data/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ed86edd00910e39c84afdfbd24666932c7565ebe 100644 --- a/parakeet/data/__init__.py +++ b/parakeet/data/__init__.py @@ -0,0 +1,4 @@ +from .dataset import * +from .datacargo import * +from .sampler import * +from .batch import * diff --git a/parakeet/models/deepvoice3/__init__.py b/parakeet/models/deepvoice3/__init__.py index 4a8d30e5f469d3fab661cc23a983cee051c2f0fa..04309877028e7631b71f3121fec8d85fb9e50aed 100644 --- a/parakeet/models/deepvoice3/__init__.py +++ b/parakeet/models/deepvoice3/__init__.py @@ -1,4 +1,4 @@ -from parakeet.models.deepvoice3.encoder import Encoder -from parakeet.models.deepvoice3.decoder import Decoder +from parakeet.models.deepvoice3.encoder import Encoder, ConvSpec +from parakeet.models.deepvoice3.decoder import Decoder, WindowRange from parakeet.models.deepvoice3.converter import Converter from parakeet.models.deepvoice3.model import DeepVoice3