Parakeet model does not work
Created by: qo4on
The official PaddlePaddle pretrained model can't generate sound. This pre-trained WaveFlow model should convert a mel spectrogram of a sound into a wav
file, but it produces only noise.
You can reproduce it by copy-pasting this code into Colab:
!wget -qqq https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_ckpt_1.0.zip > /dev/null
!mkdir -p /content/downloads
!unzip -qqq /content/waveflow_res128_ljspeech_ckpt_1.0.zip -d /content/downloads > /dev/null
!rm -rf /content/sample_data /content/waveflow_res128_ljspeech_ckpt_1.0.zip
!sudo apt-get update -y -qqq --fix-missing && apt-get install -y -qqq libsndfile1 > /dev/null
!pip install -U -qqq imgaug scipy albumentations paddlepaddle-gpu > /dev/null
!git clone -qqq https://github.com/PaddlePaddle/Parakeet > /dev/null
%cd /content/Parakeet
!pip install -qqq -e . > /dev/null
import os, pathlib, nltk, sys
import numpy as np
import matplotlib.pyplot as plt
nltk.download("punkt")
nltk.download("cmudict")
pth_1 = "/content/Parakeet"
if pth_1 not in sys.path: sys.path.insert(0, pth_1)
pth_2 = "/content/Parakeet/examples/waveflow"
if pth_2 not in sys.path: sys.path.insert(0, pth_2)
%cd /content/downloads
!curl -Ls https://dl.dropboxusercontent.com/s/jj78665lrhiod97/LJ001-0001.wav -o LJ001-0001.wav
audio_pth = "/content/downloads/LJ001-0001.wav"
!export CUDA_VISIBLE_DEVICES=0
from parakeet.models.waveflow import waveflow_modules
from parakeet.modules import weight_norm
from parakeet.utils import io
import paddle.fluid.dygraph as dg
from paddle import fluid
from scipy.io.wavfile import read
from scipy.io.wavfile import write
from ruamel import yaml
import random, librosa
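
# Thin wrapper that exposes the YAML config entries as attributes.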
class Config():
    def __init__(self, **entries):
        self.__dict__.update(entries)
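
# Minimal synthesis wrapper: builds the WaveFlow module, loads the pretrained
# checkpoint, and writes synthesized audio to disk.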
class WaveFlow():
    def __init__(self,
                 config,
                 parallel=False,
                 rank=0,
                 nranks=1,
                 tb_logger=None):
        self.config = config
        self.checkpoint_dir = config.checkpoint_dir
        self.parallel = parallel
        self.rank = rank
        self.nranks = nranks
        self.tb_logger = tb_logger
        self.dtype = "float16" if config.use_fp16 else "float32"
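
    # Instantiate the WaveFlow module, load the pretrained parameters, and
    # strip weight normalization for inference.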
    def build(self):
        config = self.config
        waveflow = waveflow_modules.WaveFlowModule(config)

        # Dry run once to create and initialize all necessary parameters.
        audio = dg.to_variable(np.random.randn(1, 16000).astype(self.dtype))
        mel = dg.to_variable(
            np.random.randn(1, config.mel_bands, 63).astype(self.dtype))
        waveflow(audio, mel)

        iteration = io.load_parameters(waveflow, checkpoint_dir=self.checkpoint_dir)

        for layer in waveflow.sublayers():
            if isinstance(layer, weight_norm.WeightNormWrapper):
                layer.remove_weight_norm()

        self.waveflow = waveflow
        return iteration
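
    # Run synthesis on a mel spectrogram and write the result to test.wav as 16-bit PCM.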
    @dg.no_grad
    def infer(self, mel):
        self.waveflow.eval()
        config = self.config
        print(mel.shape, 'mel.shape')

        audio = self.waveflow.synthesize(mel, sigma=self.config.sigma)
        audio = audio[0]
        # Denormalize audio from [-1, 1] to the int16 range.
        audio = audio.numpy().astype("float32") * 32768.0
        audio = audio.astype('int16')
        filename = 'test.wav'
        print(audio.shape, 'audio.shape')
        write(filename, config.sample_rate, audio)
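
# Compute a log mel spectrogram from a raw waveform, using the STFT and mel
# settings from the config.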
def get_mel(audio):
    spectrogram = librosa.core.stft(
        audio,
        n_fft=config.fft_size,
        hop_length=config.fft_window_shift,
        win_length=config.fft_window_size)
    spectrogram_magnitude = np.abs(spectrogram)

    # mel_filter_bank shape: [n_mels, 1 + n_fft/2]
    mel_filter_bank = librosa.filters.mel(sr=config.sample_rate,
                                          n_fft=config.fft_size,
                                          n_mels=config.mel_bands,
                                          fmin=config.mel_fmin,
                                          fmax=config.mel_fmax)
    # mel shape: [n_mels, num_frames]
    mel = np.dot(mel_filter_bank, spectrogram_magnitude)

    # Normalize mel.
    clip_val = 1e-5
    ref_constant = 1
    mel = np.log(np.clip(mel, a_min=clip_val, a_max=None) * ref_constant)
    return mel
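
# Load the example YAML config and override the checkpoint/output settings
# for this Colab run.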
def get_config(pth="/content/Parakeet/examples/waveflow/configs/waveflow_ljspeech.yaml"):
    with open(pth) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config['checkpoint'] = None
    config['checkpoint_dir'] = "/content/downloads/waveflow_res128_ljspeech_ckpt_1.0"
    config['iteration'] = None
    config['name'] = ''
    config['output'] = './syn_audios'
    config['sample'] = 0
    config['use_fp16'] = True
    config['use_gpu'] = True
    return Config(**config)
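
# Build the model, load the pretrained checkpoint, and synthesize from the
# downloaded LJSpeech sample.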
config = get_config()
print(config.__dict__)

place = fluid.CUDAPlace(0) if config.use_gpu else fluid.CPUPlace()
with dg.guard(place):
    # Fix random seed.
    seed = config.seed
    random.seed(seed)
    np.random.seed(seed)
    fluid.default_startup_program().random_seed = seed
    fluid.default_main_program().random_seed = seed

    # Build model.
    model = WaveFlow(config)
    iteration = model.build()
    print(iteration, "iteration")

    # Obtain the current iteration.
    if config.checkpoint is None:
        if config.iteration is None:
            print("_load_latest_checkpoint")
            iteration = io._load_latest_checkpoint(config.checkpoint_dir)
        else:
            iteration = config.iteration
    else:
        iteration = int(config.checkpoint.split('/')[-1].split('-')[-1])
    print(config.checkpoint_dir, iteration)

    loaded_sr, audio = read(audio_pth)
    mel = dg.to_variable(np.expand_dims(get_mel(np.asarray(audio, dtype=np.float32)), axis=0))
    model.infer(mel)
The output:

{'valid_size': 16, 'segment_length': 16000, 'sample_rate': 22050, 'fft_window_shift': 256, 'fft_window_size': 1024, 'fft_size': 1024, 'mel_bands': 80, 'mel_fmin': 0.0, 'mel_fmax': 8000.0, 'seed': 1234, 'learning_rate': 0.0002, 'batch_size': 8, 'test_every': 2000, 'save_every': 10000, 'max_iterations': 3000000, 'sigma': 1.0, 'n_flows': 8, 'n_group': 16, 'n_layers': 8, 'n_channels': 64, 'kernel_h': 3, 'kernel_w': 3, 'checkpoint': None, 'checkpoint_dir': '/content/downloads/waveflow_res128_ljspeech_ckpt_1.0', 'iteration': None, 'name': '', 'output': './syn_audios', 'sample': 0, 'use_fp16': True, 'use_gpu': True}
/usr/local/lib/python3.6/dist-packages/paddle/fluid/data_feeder.py:93: UserWarning: The data type of 'input' in assign only support float16 in GPU now. (When the type of input in assign is Variable.)
(input_name, op_name, extra_message))
/usr/local/lib/python3.6/dist-packages/paddle/fluid/data_feeder.py:93: UserWarning: The data type of 'x' in cast only support float16 in GPU now.
(input_name, op_name, extra_message))
/usr/local/lib/python3.6/dist-packages/paddle/fluid/data_feeder.py:93: UserWarning: The data type of 'input' in squeeze only support float16 in GPU now.
(input_name, op_name, extra_message))
0 iteration
_load_latest_checkpoint
/content/downloads/waveflow_res128_ljspeech_ckpt_1.0 0
[1, 80, 832] mel.shape
(212720,) audio.shape
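
For completeness, this is a quick sanity check of the generated file in Colab (a minimal sketch, assuming the cell above was run unchanged so the file ends up at /content/downloads/test.wav):

import numpy as np
from scipy.io.wavfile import read
from IPython.display import Audio

# Load the synthesized wav and print its sample rate, dtype, length and value range.
sr, wav = read("/content/downloads/test.wav")
print(sr, wav.dtype, wav.shape, wav.min(), wav.max())

# Play it back; instead of speech it contains only the noise described above.
Audio(wav, rate=sr)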