# synthesize.py (3.9 KB)
# Committed by chenfeiyu.
import numpy as np 
from matplotlib import cm
import librosa
import os
import time
import tqdm
import argparse
from ruamel import yaml
import paddle
from paddle import fluid
from paddle.fluid import layers as F
from paddle.fluid import dygraph as dg
from paddle.fluid.io import DataLoader
import soundfile as sf

from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTimeLengthSampler, SequentialSampler
from parakeet.utils.io import save_parameters, load_parameters, add_yaml_config_to_args
from parakeet.g2p import en
19
from parakeet.models.deepvoice3.weight_norm_hook import remove_weight_norm
20
from vocoder import WaveflowVocoder, GriffinLimVocoder
C
chenfeiyu 已提交
21 22 23 24 25 26
from train import create_model


def main(args, config):
    """Synthesize one wav file per non-blank line of ``args.input``.

    Builds the model from ``config``, loads the checkpoint given by
    ``args.checkpoint``, removes weight normalization from every sublayer
    that has it, constructs the selected vocoder and writes
    ``sentence{i}.wav`` into ``args.output``, where ``i`` is the 0-based
    index of the line in the input file.

    Args:
        args: parsed command line arguments; reads ``checkpoint``,
            ``vocoder``, ``output``, ``input`` and ``monotonic_layers``
            (comma separated, 1-based decoder layer indices).
        config: configuration mapping; reads ``decoder_layers``,
            ``sample_rate`` and, for the Griffin-Lim vocoder,
            ``sharpening_factor``, ``n_fft``, ``win_length`` and
            ``hop_length``. It is also handed to ``create_model`` and
            ``synthesize``.

    Raises:
        ValueError: if a monotonic layer index is not an integer within
            ``[1, config["decoder_layers"]]``, or if ``args.vocoder`` is not
            one of the supported vocoders.
    """
    # Validate the layer indices before any expensive model loading. They are
    # 1-based on the command line; without the range check "0" would turn
    # into -1 and silently select the last decoder layer.
    decoder_layers = config["decoder_layers"]
    monotonic_layers = []
    for item in args.monotonic_layers.split(','):
        layer_index = int(item.strip())
        if not 1 <= layer_index <= decoder_layers:
            raise ValueError(
                "monotonic layer index {} is out of range [1, {}]".format(
                    layer_index, decoder_layers))
        monotonic_layers.append(layer_index - 1)

    model = create_model(config)
    # The return value of load_parameters is not needed for synthesis.
    load_parameters(model, checkpoint_path=args.checkpoint)
    for _, layer in model.named_sublayers():
        try:
            remove_weight_norm(layer)
        except ValueError:
            # this layer has no weight norm hook
            pass
    model.eval()

    if args.vocoder == "waveflow":
        vocoder = WaveflowVocoder()
        vocoder.model.eval()
    elif args.vocoder == "griffin-lim":
        vocoder = GriffinLimVocoder(
            sharpening_factor=config["sharpening_factor"],
            sample_rate=config["sample_rate"],
            n_fft=config["n_fft"],
            win_length=config["win_length"],
            hop_length=config["hop_length"])
    else:
        raise ValueError("Other vocoders are not supported.")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.output, exist_ok=True)

    with open(args.input, 'rt') as f:
        sentences = [line.strip() for line in f]

    for i, sentence in enumerate(sentences):
        if not sentence:
            # Blank line: there is no text to synthesize. The index still
            # advances, so output names keep matching input line positions.
            continue
        wav = synthesize(args, config, model, vocoder, sentence,
                         monotonic_layers)
        sf.write(os.path.join(args.output, "sentence{}.wav".format(i)),
                 wav, samplerate=config["sample_rate"])


58
def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
    """Turn a single sentence into a waveform using ``model`` and ``vocoder``.

    Args:
        args: parsed arguments; only ``args.vocoder`` is read, to decide how
            the vocoder is invoked.
        config: configuration mapping; reads ``decoder_layers``,
            ``backward_step`` and ``forward_step``.
        model: callable network; it is invoked inside ``dg.no_grad()`` and
            must return three values, of which the second is used.
        vocoder: callable that converts the model's second output to audio.
        sentence: the text to synthesize.
        monotonic_layers: indices into the per-decoder-layer flag list; the
            flagged layers get ``force_monotonic_attention=True``.

    Returns:
        For ``"griffin-lim"``, whatever the vocoder returns for the first
        batch item; otherwise the first batch item of the vocoder output
        converted with ``.numpy()``.
    """
    print("[synthesize] {}".format(sentence))

    # Text frontend output as int64 ids with a leading batch axis of size 1.
    token_ids = np.array(en.text_to_sequence(sentence, p=1.0), dtype="int64")
    batch_ids = token_ids[np.newaxis, ...]
    batch_lengths = np.array([batch_ids.size], dtype=np.int64)
    text_seqs = dg.to_variable(batch_ids)
    text_lengths = dg.to_variable(batch_lengths)

    # One flag per decoder layer; only the requested layers are switched on.
    layer_flags = [False for _ in range(config["decoder_layers"])]
    for layer_idx in monotonic_layers:
        layer_flags[layer_idx] = True

    with dg.no_grad():
        attention_window = (config["backward_step"], config["forward_step"])
        _, refined, _ = model(
            text_seqs,
            text_lengths,
            speakers=None,
            force_monotonic_attention=layer_flags,
            window=attention_window)

        if args.vocoder != "griffin-lim":
            # This path feeds the vocoder a transposed variable and converts
            # its output to numpy afterwards.
            wav = vocoder(F.transpose(refined, (0, 2, 1)))
            return wav.numpy()[0]

        # Griffin-Lim path: the vocoder receives a transposed numpy array of
        # the first batch item.
        return vocoder(refined.numpy()[0].T)


84 85


C
chenfeiyu 已提交
86 87 88 89 90 91 92 93
if __name__ == "__main__":
    # Command line entry point: parse the arguments, load the YAML config,
    # enable dygraph mode on the chosen device and run the synthesis.
    import argparse
    from ruamel import yaml

    # The text is passed as ``description``: the first positional parameter
    # of ArgumentParser is ``prog``, so passing it positionally would replace
    # the program name in the usage line.
    parser = argparse.ArgumentParser(description="synthesize from a checkpoint")
    parser.add_argument("--config", type=str, required=True, help="config file")
    parser.add_argument("--input", type=str, required=True, help="text file to synthesize")
    parser.add_argument("--output", type=str, required=True, help="path to save audio")
    parser.add_argument("--checkpoint", type=str, required=True, help="data path of the checkpoint")
    parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layers' indices(start from 1)")
    parser.add_argument("--vocoder", type=str, default="waveflow", choices=['griffin-lim', 'waveflow'], help="vocoder to use")
    args = parser.parse_args()

    with open(args.config, 'rt') as f:
        config = yaml.safe_load(f)

    # Use GPU 0 when this Paddle build has CUDA support; otherwise fall back
    # to the CPU instead of failing on CUDAPlace.
    if fluid.is_compiled_with_cuda():
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    dg.enable_dygraph(place)
    main(args, config)