提交 13a9fe87 编写于 作者: L liuyibing01

Merge branch 'dv3_reload' into 'master'

add griffin lim as an alternative vocoder

See merge request !64
...@@ -112,6 +112,7 @@ tensorboard --logdir=runs/ --host=$HOSTNAME --port=8000 ...@@ -112,6 +112,7 @@ tensorboard --logdir=runs/ --host=$HOSTNAME --port=8000
usage: synthesize from a checkpoint [-h] --config CONFIG --input INPUT usage: synthesize from a checkpoint [-h] --config CONFIG --input INPUT
--output OUTPUT --checkpoint CHECKPOINT --output OUTPUT --checkpoint CHECKPOINT
--monotonic_layers MONOTONIC_LAYERS --monotonic_layers MONOTONIC_LAYERS
[--vocoder {griffin-lim,waveflow}]
optional arguments: optional arguments:
-h, --help show this help message and exit -h, --help show this help message and exit
...@@ -121,11 +122,14 @@ optional arguments: ...@@ -121,11 +122,14 @@ optional arguments:
--checkpoint CHECKPOINT --checkpoint CHECKPOINT
data path of the checkpoint data path of the checkpoint
--monotonic_layers MONOTONIC_LAYERS --monotonic_layers MONOTONIC_LAYERS
monotonic decoder layer, index starts from 1 monotonic decoder layers' indices(start from 1)
--vocoder {griffin-lim,waveflow}
vocoder to use
``` ```
`synthesize.py` is used to synthesize several sentences in a text file. `synthesize.py` is used to synthesize several sentences in a text file.
`--monotonic_layers` is the indices of the decoder layers that manifest monotonic diagonal attention. You can get monotonic layers by inspecting tensorboard logs. Mind that the index starts from 1. The layers that manifest monotonic diagonal attention are stable for a model during training and synthesizing, but differ among different runs. So once you get the indices of monotonic layers by inspecting tensorboard log, you can use them at synthesizing. Note that only decoder layers that show strong diagonal attention should be considered. `--monotonic_layers` is the indices of the decoder layers that manifest monotonic diagonal attention. You can get monotonic layers by inspecting tensorboard logs. Mind that the index starts from 1. The layers that manifest monotonic diagonal attention are stable for a model during training and synthesizing, but differ among different runs. So once you get the indices of monotonic layers by inspecting tensorboard log, you can use them at synthesizing. Note that only decoder layers that show strong diagonal attention should be considered.
`--vocoder` is the vocoder to use. Current supported values are "waveflow" and "griffin-lim". Default value is "waveflow".
example code: example code:
...@@ -135,5 +139,6 @@ CUDA_VISIBLE_DEVICES=2 python synthesize.py \ ...@@ -135,5 +139,6 @@ CUDA_VISIBLE_DEVICES=2 python synthesize.py \
--input sentences.txt \ --input sentences.txt \
--output outputs/ \ --output outputs/ \
--checkpoint runs/Jul07_09-39-34_instance-mqcyj27y-4/step-1320000 \ --checkpoint runs/Jul07_09-39-34_instance-mqcyj27y-4/step-1320000 \
--monotonic_layers "5,6" --monotonic_layers "5,6" \
--vocoder waveflow
``` ```
Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition
in being comparatively modern.
For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process
produced the block books, which were the immediate predecessors of the true printed book,
the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.
...@@ -18,7 +18,7 @@ from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTime ...@@ -18,7 +18,7 @@ from parakeet.data import SliceDataset, DataCargo, PartialyRandomizedSimilarTime
from parakeet.utils.io import save_parameters, load_parameters, add_yaml_config_to_args from parakeet.utils.io import save_parameters, load_parameters, add_yaml_config_to_args
from parakeet.g2p import en from parakeet.g2p import en
from vocoder import WaveflowVocoder from vocoder import WaveflowVocoder, GriffinLimVocoder
from train import create_model from train import create_model
...@@ -26,8 +26,18 @@ def main(args, config): ...@@ -26,8 +26,18 @@ def main(args, config):
model = create_model(config) model = create_model(config)
loaded_step = load_parameters(model, checkpoint_path=args.checkpoint) loaded_step = load_parameters(model, checkpoint_path=args.checkpoint)
model.eval() model.eval()
vocoder = WaveflowVocoder() if args.vocoder == "waveflow":
vocoder.model.eval() vocoder = WaveflowVocoder()
vocoder.model.eval()
elif args.vocoder == "griffin-lim":
vocoder = GriffinLimVocoder(
sharpening_factor=config["sharpening_factor"],
sample_rate=config["sample_rate"],
n_fft=config["n_fft"],
win_length=config["win_length"],
hop_length=config["hop_length"])
else:
raise ValueError("Other vocoders are not supported.")
if not os.path.exists(args.output): if not os.path.exists(args.output):
os.makedirs(args.output) os.makedirs(args.output)
...@@ -35,12 +45,12 @@ def main(args, config): ...@@ -35,12 +45,12 @@ def main(args, config):
with open(args.input, 'rt') as f: with open(args.input, 'rt') as f:
sentences = [line.strip() for line in f.readlines()] sentences = [line.strip() for line in f.readlines()]
for i, sentence in enumerate(sentences): for i, sentence in enumerate(sentences):
wav = synthesize(config, model, vocoder, sentence, monotonic_layers) wav = synthesize(args, config, model, vocoder, sentence, monotonic_layers)
sf.write(os.path.join(args.output, "sentence{}.wav".format(i)), sf.write(os.path.join(args.output, "sentence{}.wav".format(i)),
wav, samplerate=config["sample_rate"]) wav, samplerate=config["sample_rate"])
def synthesize(config, model, vocoder, sentence, monotonic_layers): def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
print("[synthesize] {}".format(sentence)) print("[synthesize] {}".format(sentence))
text = en.text_to_sequence(sentence, p=1.0) text = en.text_to_sequence(sentence, p=1.0)
text = np.expand_dims(np.array(text, dtype="int64"), 0) text = np.expand_dims(np.array(text, dtype="int64"), 0)
...@@ -58,11 +68,16 @@ def synthesize(config, model, vocoder, sentence, monotonic_layers): ...@@ -58,11 +68,16 @@ def synthesize(config, model, vocoder, sentence, monotonic_layers):
force_monotonic_attention=force_monotonic_attention, force_monotonic_attention=force_monotonic_attention,
window=(config["backward_step"], config["forward_step"])) window=(config["backward_step"], config["forward_step"]))
decoded, refined, attentions = outputs decoded, refined, attentions = outputs
wav = vocoder(F.transpose(decoded, (0, 2, 1))) if args.vocoder == "griffin-lim":
wav_np = wav.numpy()[0] wav_np = vocoder(refined.numpy()[0].T)
else:
wav = vocoder(F.transpose(refined, (0, 2, 1)))
wav_np = wav.numpy()[0]
return wav_np return wav_np
if __name__ == "__main__": if __name__ == "__main__":
import argparse import argparse
from ruamel import yaml from ruamel import yaml
...@@ -71,7 +86,8 @@ if __name__ == "__main__": ...@@ -71,7 +86,8 @@ if __name__ == "__main__":
parser.add_argument("--input", type=str, required=True, help="text file to synthesize") parser.add_argument("--input", type=str, required=True, help="text file to synthesize")
parser.add_argument("--output", type=str, required=True, help="path to save audio") parser.add_argument("--output", type=str, required=True, help="path to save audio")
parser.add_argument("--checkpoint", type=str, required=True, help="data path of the checkpoint") parser.add_argument("--checkpoint", type=str, required=True, help="data path of the checkpoint")
parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layer, index starts friom 1") parser.add_argument("--monotonic_layers", type=str, required=True, help="monotonic decoder layers' indices(start from 1)")
parser.add_argument("--vocoder", type=str, default="waveflow", choices=['griffin-lim', 'waveflow'], help="vocoder to use")
args = parser.parse_args() args = parser.parse_args()
with open(args.config, 'rt') as f: with open(args.config, 'rt') as f:
config = yaml.safe_load(f) config = yaml.safe_load(f)
......
...@@ -31,13 +31,21 @@ class WaveflowVocoder(object): ...@@ -31,13 +31,21 @@ class WaveflowVocoder(object):
return audio return audio
class GriffinLimVocoder(object): class GriffinLimVocoder(object):
def __init__(self, sharpening_factor=1.4, win_length=1024, hop_length=256): def __init__(self, sharpening_factor=1.4, sample_rate=22050, n_fft=1024,
win_length=1024, hop_length=256):
self.sample_rate = sample_rate
self.n_fft = n_fft
self.sharpening_factor = sharpening_factor self.sharpening_factor = sharpening_factor
self.win_length = win_length self.win_length = win_length
self.hop_length = hop_length self.hop_length = hop_length
def __call__(self, spec): def __call__(self, mel):
audio = librosa.core.griffinlim(np.exp(spec * self.sharpening_factor), spec = librosa.feature.inverse.mel_to_stft(
np.exp(mel),
sr=self.sample_rate,
n_fft=self.n_fft,
fmin=0, fmax=8000.0, power=1.0)
audio = librosa.core.griffinlim(spec ** self.sharpening_factor,
win_length=self.win_length, hop_length=self.hop_length) win_length=self.win_length, hop_length=self.hop_length)
return audio return audio
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册