Parakeet model does not work
Created by: qo4on
The official PaddlePaddle pretrained model can't generate sound. This pre-trained WaveFlow model should convert a mel spectrogram of a sound into a wav
file, but it produces only noise.
You can reproduce it by copy-pasting this code into Colab:
!wget -qqq https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_ckpt_1.0.zip > /dev/null
!mkdir -p /content/downloads
!unzip -qqq /content/waveflow_res128_ljspeech_ckpt_1.0.zip -d /content/downloads > /dev/null
!rm -rf /content/sample_data /content/waveflow_res128_ljspeech_ckpt_1.0.zip
!sudo apt-get update -y -qqq --fix-missing && apt-get install -y -qqq libsndfile1 > /dev/null
!pip install -U -qqq imgaug scipy albumentations paddlepaddle-gpu > /dev/null
!git clone -qqq https://github.com/PaddlePaddle/Parakeet > /dev/null
%cd /content/Parakeet
!pip install -qqq -e . > /dev/null
import os, pathlib, nltk, sys
import numpy as np
import matplotlib.pyplot as plt
nltk.download("punkt")
nltk.download("cmudict")
pth_1 = "/content/Parakeet"
if pth_1 not in sys.path: sys.path.insert(0, pth_1)
pth_2 = "/content/Parakeet/examples/waveflow"
if pth_2 not in sys.path: sys.path.insert(0, pth_2)
%cd /content/downloads
!curl -Ls https://dl.dropboxusercontent.com/s/jj78665lrhiod97/LJ001-0001.wav -o LJ001-0001.wav
audio_pth = "/content/downloads/LJ001-0001.wav"
!export CUDA_VISIBLE_DEVICES=0
from parakeet.models.waveflow import waveflow_modules
from parakeet.modules import weight_norm
from parakeet.utils import io
import paddle.fluid.dygraph as dg
from paddle import fluid
from scipy.io.wavfile import read
from scipy.io.wavfile import write
from ruamel import yaml
import random, librosa
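
# Thin wrapper that exposes the YAML config entries as attributes.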
class Config():
    def __init__(self, **entries):
        self.__dict__.update(entries)
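
# Minimal synthesis wrapper: builds the WaveFlow module, loads the pretrained
# checkpoint, and writes synthesized audio to disk.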
class WaveFlow():
    def __init__(self,
                 config,
                 parallel=False,
                 rank=0,
                 nranks=1,
                 tb_logger=None):
        self.config = config
        self.checkpoint_dir = config.checkpoint_dir
        self.parallel = parallel
        self.rank = rank
        self.nranks = nranks
        self.tb_logger = tb_logger
        self.dtype = "float16" if config.use_fp16 else "float32"
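
    # Instantiate the WaveFlow module, load the pretrained parameters, and
    # strip weight normalization for inference.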
    def build(self):
        config = self.config
        waveflow = waveflow_modules.WaveFlowModule(config)

        # Dry run once to create and initialize all necessary parameters.
        audio = dg.to_variable(np.random.randn(1, 16000).astype(self.dtype))
        mel = dg.to_variable(
            np.random.randn(1, config.mel_bands, 63).astype(self.dtype))
        waveflow(audio, mel)

        iteration = io.load_parameters(waveflow, checkpoint_dir=self.checkpoint_dir)

        for layer in waveflow.sublayers():
            if isinstance(layer, weight_norm.WeightNormWrapper):
                layer.remove_weight_norm()

        self.waveflow = waveflow
        return iteration
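
    # Run synthesis on a mel spectrogram and write the result to test.wav as 16-bit PCM.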
    @dg.no_grad
    def infer(self, mel):
        self.waveflow.eval()
        config = self.config
        print(mel.shape, 'mel.shape')

        audio = self.waveflow.synthesize(mel, sigma=self.config.sigma)
        audio = audio[0]
        # Denormalize audio from [-1, 1] to the int16 range.
        audio = audio.numpy().astype("float32") * 32768.0
        audio = audio.astype('int16')
        filename = 'test.wav'
        print(audio.shape, 'audio.shape')
        write(filename, config.sample_rate, audio)
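
# Compute a log mel spectrogram from a raw waveform, using the STFT and mel
# settings from the config.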
def get_mel(audio):
    spectrogram = librosa.core.stft(
        audio,
        n_fft=config.fft_size,
        hop_length=config.fft_window_shift,
        win_length=config.fft_window_size)
    spectrogram_magnitude = np.abs(spectrogram)

    # mel_filter_bank shape: [n_mels, 1 + n_fft/2]
    mel_filter_bank = librosa.filters.mel(sr=config.sample_rate,
                                          n_fft=config.fft_size,
                                          n_mels=config.mel_bands,
                                          fmin=config.mel_fmin,
                                          fmax=config.mel_fmax)
    # mel shape: [n_mels, num_frames]
    mel = np.dot(mel_filter_bank, spectrogram_magnitude)

    # Normalize mel.
    clip_val = 1e-5
    ref_constant = 1
    mel = np.log(np.clip(mel, a_min=clip_val, a_max=None) * ref_constant)
    return mel
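
# Load the example YAML config and override the checkpoint/output settings
# for this Colab run.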
def get_config(pth="/content/Parakeet/examples/waveflow/configs/waveflow_ljspeech.yaml"):
    with open(pth) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config['checkpoint'] = None
    config['checkpoint_dir'] = "/content/downloads/waveflow_res128_ljspeech_ckpt_1.0"
    config['iteration'] = None
    config['name'] = ''
    config['output'] = './syn_audios'
    config['sample'] = 0
    config['use_fp16'] = True
    config['use_gpu'] = True
    return Config(**config)
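
# Build the model, load the pretrained checkpoint, and synthesize from the
# downloaded LJSpeech sample.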
config = get_config()
print(config.__dict__)

place = fluid.CUDAPlace(0) if config.use_gpu else fluid.CPUPlace()
with dg.guard(place):
    # Fix random seed.
    seed = config.seed
    random.seed(seed)
    np.random.seed(seed)
    fluid.default_startup_program().random_seed = seed
    fluid.default_main_program().random_seed = seed

    # Build model.
    model = WaveFlow(config)
    iteration = model.build()
    print(iteration, "iteration")

    # Obtain the current iteration.
    if config.checkpoint is None:
        if config.iteration is None:
            print("_load_latest_checkpoint")
            iteration = io._load_latest_checkpoint(config.checkpoint_dir)
        else:
            iteration = config.iteration
    else:
        iteration = int(config.checkpoint.split('/')[-1].split('-')[-1])
    print(config.checkpoint_dir, iteration)

    loaded_sr, audio = read(audio_pth)
    mel = dg.to_variable(np.expand_dims(get_mel(np.asarray(audio, dtype=np.float32)), axis=0))
    model.infer(mel)
The output:

{'valid_size': 16, 'segment_length': 16000, 'sample_rate': 22050, 'fft_window_shift': 256, 'fft_window_size': 1024, 'fft_size': 1024, 'mel_bands': 80, 'mel_fmin': 0.0, 'mel_fmax': 8000.0, 'seed': 1234, 'learning_rate': 0.0002, 'batch_size': 8, 'test_every': 2000, 'save_every': 10000, 'max_iterations': 3000000, 'sigma': 1.0, 'n_flows': 8, 'n_group': 16, 'n_layers': 8, 'n_channels': 64, 'kernel_h': 3, 'kernel_w': 3, 'checkpoint': None, 'checkpoint_dir': '/content/downloads/waveflow_res128_ljspeech_ckpt_1.0', 'iteration': None, 'name': '', 'output': './syn_audios', 'sample': 0, 'use_fp16': True, 'use_gpu': True}
/usr/local/lib/python3.6/dist-packages/paddle/fluid/data_feeder.py:93: UserWarning: The data type of 'input' in assign only support float16 in GPU now. (When the type of input in assign is Variable.)
(input_name, op_name, extra_message))
/usr/local/lib/python3.6/dist-packages/paddle/fluid/data_feeder.py:93: UserWarning: The data type of 'x' in cast only support float16 in GPU now.
(input_name, op_name, extra_message))
/usr/local/lib/python3.6/dist-packages/paddle/fluid/data_feeder.py:93: UserWarning: The data type of 'input' in squeeze only support float16 in GPU now.
(input_name, op_name, extra_message))
0 iteration
_load_latest_checkpoint
/content/downloads/waveflow_res128_ljspeech_ckpt_1.0 0
[1, 80, 832] mel.shape
(212720,) audio.shape
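
For completeness, this is a quick sanity check of the generated file in Colab (a minimal sketch, assuming the cell above was run unchanged so the file ends up at /content/downloads/test.wav):

import numpy as np
from scipy.io.wavfile import read
from IPython.display import Audio

# Load the synthesized wav and print its sample rate, dtype, length and value range.
sr, wav = read("/content/downloads/test.wav")
print(sr, wav.dtype, wav.shape, wav.min(), wav.max())

# Play it back; instead of speech it contains only the noise described above.
Audio(wav, rate=sr)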