From 7938a5f6a47b95a9c008ec9ea8fba4b68b157a61 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Mon, 13 Jul 2020 15:19:52 +0800 Subject: [PATCH] add griffin lim as an alternative vocoder --- examples/deepvoice3/synthesize.py | 30 +++++++++++++++++++++++------- examples/deepvoice3/vocoder.py | 14 +++++++++++--- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/examples/deepvoice3/synthesize.py b/examples/deepvoice3/synthesize.py index 1fd1d95..9f0dda0 100644 --- a/examples/deepvoice3/synthesize.py +++ b/examples/deepvoice3/synthesize.py @@ -18,7 +18,7 @@ from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTime from parakeet.utils.io import save_parameters, load_parameters, add_yaml_config_to_args from parakeet.g2p import en -from vocoder import WaveflowVocoder +from vocoder import WaveflowVocoder, GriffinLimVocoder from train import create_model @@ -26,8 +26,18 @@ def main(args, config): model = create_model(config) loaded_step = load_parameters(model, checkpoint_path=args.checkpoint) model.eval() - vocoder = WaveflowVocoder() - vocoder.model.eval() + if args.vocoder == "waveflow": + vocoder = WaveflowVocoder() + vocoder.model.eval() + elif args.vocoder == "griffin-lim": + vocoder = GriffinLimVocoder( + sharpening_factor=config["sharpening_factor"], + sample_rate=config["sample_rate"], + n_fft=config["n_fft"], + win_length=config["win_length"], + hop_length=config["hop_length"]) + else: + raise ValueError("Other vocoders are not supported.") if not os.path.exists(args.output): os.makedirs(args.output) @@ -35,12 +45,12 @@ def main(args, config): with open(args.input, 'rt') as f: sentences = [line.strip() for line in f.readlines()] for i, sentence in enumerate(sentences): - wav = synthesize(config, model, vocoder, sentence, monotonic_layers) + wav = synthesize(args, config, model, vocoder, sentence, monotonic_layers) sf.write(os.path.join(args.output, "sentence{}.wav".format(i)), wav, samplerate=config["sample_rate"]) -def synthesize(config, model, vocoder, sentence, monotonic_layers): +def synthesize(args, config, model, vocoder, sentence, monotonic_layers): print("[synthesize] {}".format(sentence)) text = en.text_to_sequence(sentence, p=1.0) text = np.expand_dims(np.array(text, dtype="int64"), 0) @@ -58,11 +68,16 @@ def synthesize(config, model, vocoder, sentence, monotonic_layers): force_monotonic_attention=force_monotonic_attention, window=(config["backward_step"], config["forward_step"])) decoded, refined, attentions = outputs - wav = vocoder(F.transpose(decoded, (0, 2, 1))) - wav_np = wav.numpy()[0] + if args.vocoder == "griffin-lim": + wav_np = vocoder(refined.numpy()[0].T) + else: + wav = vocoder(F.transpose(refined, (0, 2, 1))) + wav_np = wav.numpy()[0] return wav_np + + if __name__ == "__main__": import argparse from ruamel import yaml @@ -72,6 +87,7 @@ if __name__ == "__main__": parser.add_argument("--output", type=str, required=True, help="path to save audio") parser.add_argument("--checkpoint", type=str, required=True, help="data path of the checkpoint") parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layer, index starts friom 1") + parser.add_argument("--vocoder", type=str, default="waveflow", choices=['griffin-lim', 'waveflow'], help="vocoder to use") args = parser.parse_args() with open(args.config, 'rt') as f: config = yaml.safe_load(f) diff --git a/examples/deepvoice3/vocoder.py b/examples/deepvoice3/vocoder.py index 1471260..5568394 100644 --- a/examples/deepvoice3/vocoder.py +++ b/examples/deepvoice3/vocoder.py @@ -31,13 +31,21 @@ class WaveflowVocoder(object): return audio class GriffinLimVocoder(object): - def __init__(self, sharpening_factor=1.4, win_length=1024, hop_length=256): + def __init__(self, sharpening_factor=1.4, sample_rate=22050, n_fft=1024, + win_length=1024, hop_length=256): + self.sample_rate = sample_rate + self.n_fft = n_fft self.sharpening_factor = sharpening_factor self.win_length = win_length self.hop_length = hop_length - def __call__(self, spec): - audio = librosa.core.griffinlim(np.exp(spec * self.sharpening_factor), + def __call__(self, mel): + spec = librosa.feature.inverse.mel_to_stft( + np.exp(mel), + sr=self.sample_rate, + n_fft=self.n_fft, + fmin=0, fmax=8000.0, power=1.0) + audio = librosa.core.griffinlim(spec ** self.sharpening_factor, win_length=self.win_length, hop_length=self.hop_length) return audio -- GitLab