提交 7938a5f6 编写于 作者: C chenfeiyu

Add Griffin-Lim as an alternative vocoder

上级 282c36c2
...@@ -18,7 +18,7 @@ from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTime ...@@ -18,7 +18,7 @@ from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTime
from parakeet.utils.io import save_parameters, load_parameters, add_yaml_config_to_args from parakeet.utils.io import save_parameters, load_parameters, add_yaml_config_to_args
from parakeet.g2p import en from parakeet.g2p import en
from vocoder import WaveflowVocoder from vocoder import WaveflowVocoder, GriffinLimVocoder
from train import create_model from train import create_model
...@@ -26,8 +26,18 @@ def main(args, config): ...@@ -26,8 +26,18 @@ def main(args, config):
model = create_model(config) model = create_model(config)
loaded_step = load_parameters(model, checkpoint_path=args.checkpoint) loaded_step = load_parameters(model, checkpoint_path=args.checkpoint)
model.eval() model.eval()
if args.vocoder == "waveflow":
vocoder = WaveflowVocoder() vocoder = WaveflowVocoder()
vocoder.model.eval() vocoder.model.eval()
elif args.vocoder == "griffin-lim":
vocoder = GriffinLimVocoder(
sharpening_factor=config["sharpening_factor"],
sample_rate=config["sample_rate"],
n_fft=config["n_fft"],
win_length=config["win_length"],
hop_length=config["hop_length"])
else:
raise ValueError("Other vocoders are not supported.")
if not os.path.exists(args.output): if not os.path.exists(args.output):
os.makedirs(args.output) os.makedirs(args.output)
...@@ -35,12 +45,12 @@ def main(args, config): ...@@ -35,12 +45,12 @@ def main(args, config):
with open(args.input, 'rt') as f: with open(args.input, 'rt') as f:
sentences = [line.strip() for line in f.readlines()] sentences = [line.strip() for line in f.readlines()]
for i, sentence in enumerate(sentences): for i, sentence in enumerate(sentences):
wav = synthesize(config, model, vocoder, sentence, monotonic_layers) wav = synthesize(args, config, model, vocoder, sentence, monotonic_layers)
sf.write(os.path.join(args.output, "sentence{}.wav".format(i)), sf.write(os.path.join(args.output, "sentence{}.wav".format(i)),
wav, samplerate=config["sample_rate"]) wav, samplerate=config["sample_rate"])
def synthesize(config, model, vocoder, sentence, monotonic_layers): def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
print("[synthesize] {}".format(sentence)) print("[synthesize] {}".format(sentence))
text = en.text_to_sequence(sentence, p=1.0) text = en.text_to_sequence(sentence, p=1.0)
text = np.expand_dims(np.array(text, dtype="int64"), 0) text = np.expand_dims(np.array(text, dtype="int64"), 0)
...@@ -58,11 +68,16 @@ def synthesize(config, model, vocoder, sentence, monotonic_layers): ...@@ -58,11 +68,16 @@ def synthesize(config, model, vocoder, sentence, monotonic_layers):
force_monotonic_attention=force_monotonic_attention, force_monotonic_attention=force_monotonic_attention,
window=(config["backward_step"], config["forward_step"])) window=(config["backward_step"], config["forward_step"]))
decoded, refined, attentions = outputs decoded, refined, attentions = outputs
wav = vocoder(F.transpose(decoded, (0, 2, 1))) if args.vocoder == "griffin-lim":
wav_np = vocoder(refined.numpy()[0].T)
else:
wav = vocoder(F.transpose(refined, (0, 2, 1)))
wav_np = wav.numpy()[0] wav_np = wav.numpy()[0]
return wav_np return wav_np
if __name__ == "__main__": if __name__ == "__main__":
import argparse import argparse
from ruamel import yaml from ruamel import yaml
...@@ -72,6 +87,7 @@ if __name__ == "__main__": ...@@ -72,6 +87,7 @@ if __name__ == "__main__":
parser.add_argument("--output", type=str, required=True, help="path to save audio") parser.add_argument("--output", type=str, required=True, help="path to save audio")
parser.add_argument("--checkpoint", type=str, required=True, help="data path of the checkpoint") parser.add_argument("--checkpoint", type=str, required=True, help="data path of the checkpoint")
parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layer, index starts friom 1") parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layer, index starts friom 1")
parser.add_argument("--vocoder", type=str, default="waveflow", choices=['griffin-lim', 'waveflow'], help="vocoder to use")
args = parser.parse_args() args = parser.parse_args()
with open(args.config, 'rt') as f: with open(args.config, 'rt') as f:
config = yaml.safe_load(f) config = yaml.safe_load(f)
......
...@@ -31,13 +31,21 @@ class WaveflowVocoder(object): ...@@ -31,13 +31,21 @@ class WaveflowVocoder(object):
return audio return audio
class GriffinLimVocoder(object): class GriffinLimVocoder(object):
def __init__(self, sharpening_factor=1.4, win_length=1024, hop_length=256): def __init__(self, sharpening_factor=1.4, sample_rate=22050, n_fft=1024,
win_length=1024, hop_length=256):
self.sample_rate = sample_rate
self.n_fft = n_fft
self.sharpening_factor = sharpening_factor self.sharpening_factor = sharpening_factor
self.win_length = win_length self.win_length = win_length
self.hop_length = hop_length self.hop_length = hop_length
def __call__(self, spec): def __call__(self, mel):
audio = librosa.core.griffinlim(np.exp(spec * self.sharpening_factor), spec = librosa.feature.inverse.mel_to_stft(
np.exp(mel),
sr=self.sample_rate,
n_fft=self.n_fft,
fmin=0, fmax=8000.0, power=1.0)
audio = librosa.core.griffinlim(spec ** self.sharpening_factor,
win_length=self.win_length, hop_length=self.hop_length) win_length=self.win_length, hop_length=self.hop_length)
return audio return audio
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册