Commit 63c283fa authored by Corentin Jemine

Reorganized the vocoder project structure. Included a good vocoder model

Parent be861eae
@@ -6,6 +6,6 @@ NOTES:
- On eddard, tacotron_model.ckpt-486000 was the model used to generate GTA.
TODO:
- Begin server-side training of linear then mel and delete the local outdated GTA
- Meanwhile, work on a rough inference demo (don't forget to show side-by-side generated and original sample)
- Begin merging the three projects
- Clean up the rest of the code
\ No newline at end of file
import torch
from vlibs import fileio
from models.wavernn import WaveRNN
from utils.vocoder_dataset import VocoderDataset
from params import *
from utils import audio
from vocoder.model import WaveRNN
from vocoder.vocoder_dataset import VocoderDataset
from vocoder import audio
from vocoder.params import *
import numpy as np
run_name = 'from_synth'
# run_name = 'mu_law'
run_name = 'mu_law'
model_dir = 'checkpoints'
model_fpath = fileio.join(model_dir, run_name + '.pt')
......
import torch
import torch.nn as nn
import torch.nn.functional as F
from utils.display import *
from utils.dsp import *
class WaveRNN(nn.Module) :
    def __init__(self, hidden_size=896, quantisation=256) :
        super(WaveRNN, self).__init__()

        self.hidden_size = hidden_size
        self.split_size = hidden_size // 2

        # The main matmul
        self.R = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)

        # Output fc layers
        self.O1 = nn.Linear(self.split_size, self.split_size)
        self.O2 = nn.Linear(self.split_size, quantisation)
        self.O3 = nn.Linear(self.split_size, self.split_size)
        self.O4 = nn.Linear(self.split_size, quantisation)

        # Input fc layers
        self.I_coarse = nn.Linear(2, 3 * self.split_size, bias=False)
        self.I_fine = nn.Linear(3, 3 * self.split_size, bias=False)

        # biases for the gates
        self.bias_u = nn.Parameter(torch.zeros(self.hidden_size))
        self.bias_r = nn.Parameter(torch.zeros(self.hidden_size))
        self.bias_e = nn.Parameter(torch.zeros(self.hidden_size))

        # display num params
        self.num_params()

    def forward(self, prev_y, prev_hidden, current_coarse) :
        # Main matmul - the projection is split 3 ways
        R_hidden = self.R(prev_hidden)
        R_u, R_r, R_e = torch.split(R_hidden, self.hidden_size, dim=1)

        # Project the prev input
        coarse_input_proj = self.I_coarse(prev_y)
        I_coarse_u, I_coarse_r, I_coarse_e = \
            torch.split(coarse_input_proj, self.split_size, dim=1)

        # Project the prev input and current coarse sample
        fine_input = torch.cat([prev_y, current_coarse], dim=1)
        fine_input_proj = self.I_fine(fine_input)
        I_fine_u, I_fine_r, I_fine_e = \
            torch.split(fine_input_proj, self.split_size, dim=1)

        # concatenate for the gates
        I_u = torch.cat([I_coarse_u, I_fine_u], dim=1)
        I_r = torch.cat([I_coarse_r, I_fine_r], dim=1)
        I_e = torch.cat([I_coarse_e, I_fine_e], dim=1)

        # Compute all gates for coarse and fine
        u = torch.sigmoid(R_u + I_u + self.bias_u)
        r = torch.sigmoid(R_r + I_r + self.bias_r)
        e = torch.tanh(r * R_e + I_e + self.bias_e)
        hidden = u * prev_hidden + (1. - u) * e

        # Split the hidden state
        hidden_coarse, hidden_fine = torch.split(hidden, self.split_size, dim=1)

        # Compute outputs
        out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
        out_fine = self.O4(F.relu(self.O3(hidden_fine)))

        return out_coarse, out_fine, hidden

    def generate(self, seq_len) :
        with torch.no_grad() :
            # First split up the biases for the gates
            b_coarse_u, b_fine_u = torch.split(self.bias_u, self.split_size)
            b_coarse_r, b_fine_r = torch.split(self.bias_r, self.split_size)
            b_coarse_e, b_fine_e = torch.split(self.bias_e, self.split_size)

            # Lists for the two output seqs
            c_outputs, f_outputs = [], []

            # Some initial inputs
            out_coarse = torch.LongTensor([0]).cuda()
            out_fine = torch.LongTensor([0]).cuda()

            # We'll need a hidden state
            hidden = self.init_hidden()

            # Need a clock for display
            start = time.time()

            # Loop for generation
            for i in range(seq_len) :
                # Split into two hidden states
                hidden_coarse, hidden_fine = \
                    torch.split(hidden, self.split_size, dim=1)

                # Scale and concat previous predictions
                out_coarse = out_coarse.unsqueeze(0).float() / 127.5 - 1.
                out_fine = out_fine.unsqueeze(0).float() / 127.5 - 1.
                prev_outputs = torch.cat([out_coarse, out_fine], dim=1)

                # Project input
                coarse_input_proj = self.I_coarse(prev_outputs)
                I_coarse_u, I_coarse_r, I_coarse_e = \
                    torch.split(coarse_input_proj, self.split_size, dim=1)

                # Project hidden state and split 6 ways
                R_hidden = self.R(hidden)
                R_coarse_u, R_fine_u, \
                R_coarse_r, R_fine_r, \
                R_coarse_e, R_fine_e = torch.split(R_hidden, self.split_size, dim=1)

                # Compute the coarse gates
                u = torch.sigmoid(R_coarse_u + I_coarse_u + b_coarse_u)
                r = torch.sigmoid(R_coarse_r + I_coarse_r + b_coarse_r)
                e = torch.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e)
                hidden_coarse = u * hidden_coarse + (1. - u) * e

                # Compute the coarse output
                out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
                posterior = F.softmax(out_coarse, dim=1)
                distrib = torch.distributions.Categorical(posterior)
                out_coarse = distrib.sample()
                c_outputs.append(out_coarse)

                # Project the [prev outputs and predicted coarse sample]
                coarse_pred = out_coarse.float() / 127.5 - 1.
                fine_input = torch.cat([prev_outputs, coarse_pred.unsqueeze(0)], dim=1)
                fine_input_proj = self.I_fine(fine_input)
                I_fine_u, I_fine_r, I_fine_e = \
                    torch.split(fine_input_proj, self.split_size, dim=1)

                # Compute the fine gates
                u = torch.sigmoid(R_fine_u + I_fine_u + b_fine_u)
                r = torch.sigmoid(R_fine_r + I_fine_r + b_fine_r)
                e = torch.tanh(r * R_fine_e + I_fine_e + b_fine_e)
                hidden_fine = u * hidden_fine + (1. - u) * e

                # Compute the fine output
                out_fine = self.O4(F.relu(self.O3(hidden_fine)))
                posterior = F.softmax(out_fine, dim=1)
                distrib = torch.distributions.Categorical(posterior)
                out_fine = distrib.sample()
                f_outputs.append(out_fine)

                # Put the hidden state back together
                hidden = torch.cat([hidden_coarse, hidden_fine], dim=1)

                # Display progress
                speed = (i + 1) / (time.time() - start)
                stream('Gen: %i/%i -- Speed: %i', (i + 1, seq_len, speed))

        coarse = torch.stack(c_outputs).squeeze(1).cpu().data.numpy()
        fine = torch.stack(f_outputs).squeeze(1).cpu().data.numpy()
        output = combine_signal(coarse, fine)

        return output, coarse, fine

    def init_hidden(self, batch_size=1) :
        return torch.zeros(batch_size, self.hidden_size).cuda()

    def num_params(self) :
        parameters = filter(lambda p: p.requires_grad, self.parameters())
        parameters = sum([np.prod(p.size()) for p in parameters]) / 1000000
        print('Trainable Parameters: %.3f million' % parameters)
\ No newline at end of file
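For reference, a minimal sketch of driving this coarse/fine forward pass with dummy tensors; the shapes are inferred from the layer definitions above, and the CPU hidden state stands in for init_hidden(), which assumes CUDA:

import torch

# Dummy single step through the dual-softmax WaveRNN above (CPU, batch of 8).
model = WaveRNN(hidden_size=896, quantisation=256)
prev_y = torch.rand(8, 2) * 2 - 1            # previous (coarse, fine) pair in [-1, 1]
current_coarse = torch.rand(8, 1) * 2 - 1    # current coarse sample in [-1, 1]
hidden = torch.zeros(8, 896)                 # init_hidden() without the .cuda()

out_coarse, out_fine, hidden = model(prev_y, hidden, current_coarse)
print(out_coarse.shape, out_fine.shape)      # both (8, 256) class logits
# Cross-entropy against the quantized coarse/fine targets would train the model.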
from multiprocessing import Pool, cpu_count
import pickle
from utils.display import *
from utils.dsp import *
from vocoder.display import *
from vlibs import fileio
bits = 9
@@ -36,7 +35,7 @@ def main():
    pool = Pool(processes=cpu_count())
    for i, fname in enumerate(pool.imap_unordered(process_wav, wav_files), 1):
        dataset_ids += [fname]
        stream('Processing: %i/%i', (i, len(wav_files)))
        print('\rProcessing: %i/%i' % (i, len(wav_files)), end='')

    with open(fileio.join(out_dir, 'dataset_ids.pkl'), 'wb') as f:
        pickle.dump(dataset_ids, f)
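The body of process_wav is collapsed in this diff; a hypothetical sketch of what it might do, built only from the helpers in vocoder/audio.py (the name process_wav_sketch, the out_dir parameter, and the file layout are assumptions):

import os
import numpy as np
from vocoder import audio

def process_wav_sketch(wav_fpath, out_dir):
    # Hypothetical: load, quantize and cache one waveform, returning its id.
    wav = audio.load_wav(wav_fpath)          # float waveform in [-1, 1]
    quant = audio.quantize_signal(wav)       # ints in [0, 2 ** bits)
    fname = os.path.splitext(os.path.basename(wav_fpath))[0]
    np.save(os.path.join(out_dir, fname + '.npy'), quant)
    return fname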
......
import torch
import torch.nn as nn
from utils.display import *
from vocoder.display import *
def np_now(tensor):
@@ -153,7 +153,6 @@ class Model(nn.Module):
        self.layers2prune = [self.rnn, self.fc]
        self.pruner = Pruner(self.layers2prune, start_prune,
                             prune_steps, sparsity_target)
        num_params(self)

    def forward(self):
        # h = torch.ones(1, 2)
@@ -185,7 +184,7 @@ model = Model(in_size, model_size, start_prune, prune_steps, sparsity_target)
param_idx = [1, 2, 5]
for idx in param_idx:
    W = list(model.parameters())[idx].data
    plot_spec(W)
    # plot_spec(W)
    print(W.size(0) * W.size(1), W.shape)

sparsity = []
@@ -195,15 +194,16 @@ for step in range(num_steps):
    model()
    sparsity += [model.pruner.z]
    pruned_params += [model.pruner.num_pruned]
    if step % 100 == 0: stream('%i/%i', (step, num_steps))
    if step % 100 == 0:
        print('\r%i/%i' % (step, num_steps), end='')

plot(sparsity)
plot(pruned_params)
# plot(sparsity)
# plot(pruned_params)

param_idx = [1, 2, 5]
for idx in param_idx:
    W = list(model.parameters())[idx].data
    plot_spec(torch.abs(W))
    # plot_spec(torch.abs(W))
    print(W.size(0) * W.size(1), W.shape)

print((model.pruner.Z, model.pruner.z))
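The Pruner class tracked here is collapsed in this diff. For reference, a sketch of the cubic sparsity schedule such magnitude pruners typically follow (Zhu & Gupta, 2017); the schedule and the function name are assumptions, not the actual implementation:

def sparsity_at(step, start_prune, prune_steps, sparsity_target):
    # Cubic ramp from 0 to sparsity_target over prune_steps steps.
    if step < start_prune:
        return 0.0
    t = min((step - start_prune) / prune_steps, 1.0)
    return sparsity_target * (1 - (1 - t) ** 3)

# Halfway through pruning, sparsity already sits at 87.5% of the target:
print(sparsity_at(15000, start_prune=10000, prune_steps=10000, sparsity_target=0.9))  # 0.7875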
......
@@ -22,19 +22,18 @@ import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from utils.vocoder_dataset import VocoderDataset
from models.wavernn import WaveRNN
from utils.display import *
from vocoder.vocoder_dataset import VocoderDataset
from vocoder.model import WaveRNN
from vlibs import fileio
from params import *
from vocoder.params import *
import time
import numpy as np
run_name = 'mu_law'
# run_name = 'from_synth'
model_dir = 'checkpoints'
fileio.ensure_dir(model_dir)
model_fpath = fileio.join(model_dir, run_name + '.pt')
# data_path = r"E:\Datasets\Synthesizer"
data_path = "../data/Synthesizer"
gen_path = 'model_outputs'
fileio.ensure_dir(gen_path)
@@ -74,7 +73,6 @@ if __name__ == '__main__':
                     sample_rate=sample_rate
                     )
    model = model.cuda()
    num_params(model)

    global step
    if os.path.exists(model_fpath):
@@ -116,8 +114,8 @@
            step += 1
            k = step // 1000
            stream('Epoch: %i/%i -- Batch: %i/%i -- Loss: %.3f -- %.2f steps/sec -- Step: %ik ',
                   (e + 1, epochs, i + 1, iters, avg_loss, speed, k))
            print('\rEpoch: %i/%i -- Batch: %i/%i -- Loss: %.3f -- %.2f steps/sec -- '
                  'Step: %ik ' % (e + 1, epochs, i + 1, iters, avg_loss, speed, k), end='')

            if (i + 1) % 1000 == 0:
                torch.save({'step': step, 'model_state': model.state_dict()}, model_fpath)
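The collapsed `if os.path.exists(model_fpath):` branch above presumably restores a saved checkpoint before training resumes; a sketch of that resume logic, mirroring the loading code in the generation script further down (an assumption, since the body is not shown):

import os
import torch

if os.path.exists(model_fpath):
    checkpoint = torch.load(model_fpath)
    step = checkpoint['step']
    model.load_state_dict(checkpoint['model_state'])
else:
    step = 0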
......
import matplotlib.pyplot as plt
import time, sys, math
import numpy as np

def stream(string, variables) :
    sys.stdout.write('\r%s' % (string % variables))

def num_params(model) :
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    parameters = sum([np.prod(p.size()) for p in parameters]) / 1000000
    print('Trainable Parameters: %.3f million' % parameters)

def time_since(started) :
    elapsed = time.time() - started
    m = int(elapsed // 60)
    s = int(elapsed % 60)
    if m >= 60 :
        h = int(m // 60)
        m = m % 60
        return '%dh %dm %ds' % (h, m, s)
    else :
        return '%dm %ds' % (m, s)

def plot(array) :
    fig = plt.figure(figsize=(30, 5))
    ax = fig.add_subplot(111)
    ax.xaxis.label.set_color('grey')
    ax.yaxis.label.set_color('grey')
    ax.xaxis.label.set_fontsize(23)
    ax.yaxis.label.set_fontsize(23)
    ax.tick_params(axis='x', colors='grey', labelsize=23)
    ax.tick_params(axis='y', colors='grey', labelsize=23)
    plt.plot(array)
    plt.show()

def plot_spec(M) :
    M = np.flip(M, axis=0)
    plt.figure(figsize=(18, 4))
    plt.imshow(M, interpolation='nearest', aspect='auto')
    plt.show()
import numpy as np
import librosa
from params import *
from vocoder.params import *
from scipy.io import wavfile
def load_wav(fpath):
@@ -10,22 +10,29 @@ def save_wav(path, wav):
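    # Peak-normalize into the int16 range; the 0.01 floor keeps near-silent
    # clips from being amplified up to full scale.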
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    wavfile.write(path, sample_rate, wav.astype(np.int16))

def compand_signal(wav):
    """
    Applies mu-law companding to an audio waveform, with mu = 2^bits - 1
    """
    return np.sign(wav) * np.log(1 + (2 ** bits - 1) * np.abs(wav)) / np.log(1 + (2 ** bits - 1))

def quantize_signal(wav):
    """
    Encodes a floating point audio waveform (-1 < wav < 1) to an integer signal (0 <= wav < 2^bits)
    """
    if use_mu_law:
        wav = np.sign(wav) * np.log(1 + (2 ** bits - 1) * np.abs(wav)) / np.log(1 + (2 ** bits - 1))
    return ((wav + 1.) * (2 ** bits - 1) / 2).astype(np.int64)

def restore_signal(wav):
    """
    Decodes an integer signal (0 <= wav < 2^bits) to a floating point audio waveform (-1 < wav < 1)
    """
    wav = 2 * wav.astype(np.float32) / (2 ** bits - 1.) - 1.
    if use_mu_law:
        wav = np.sign(wav) * (1 / (2 ** bits - 1)) * ((1 + (2 ** bits - 1)) ** np.abs(wav) - 1)
    return wav
    return 2 * wav.astype(np.float32) / (2 ** bits - 1.) - 1.

def expand_signal(wav):
    """
    Applies the inverse mu-law (expansion) to an audio waveform
    """
    return np.sign(wav) * (1 / (2 ** bits - 1)) * ((1 + (2 ** bits - 1)) ** np.abs(wav) - 1)
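As a quick sanity check on the pair of companding helpers above, expanding exactly undoes companding. A standalone sketch, with bits = 9 assumed to match the preprocessing script:

import numpy as np

bits = 9                                 # assumed, matching the preprocessor
mu = 2 ** bits - 1
wav = np.linspace(-1, 1, 101)
companded = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1 + mu)
expanded = np.sign(companded) * (1 / mu) * ((1 + mu) ** np.abs(companded) - 1)
assert np.allclose(wav, expanded)        # mu-law companding is exactly invertible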
def split_signal(x):
    unsigned = x + 2 ** 15
@@ -38,42 +45,3 @@ def combine_signal(coarse, fine):
def encode_16bits(x):
    return np.clip(x * 2 ** 15, -2 ** 15, 2 ** 15 - 1).astype(np.int16)
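The bodies of split_signal and combine_signal are collapsed above; a sketch of the usual WaveRNN coarse/fine byte split they implement, an assumption consistent with the `x + 2 ** 15` line that is shown:

import numpy as np

def split_signal_sketch(x):
    unsigned = x + 2 ** 15        # map the signed 16-bit range onto [0, 2 ** 16)
    coarse = unsigned // 256      # high 8 bits
    fine = unsigned % 256         # low 8 bits
    return coarse, fine

def combine_signal_sketch(coarse, fine):
    return coarse * 256 + fine - 2 ** 15

x = np.array([-2 ** 15, 0, 2 ** 15 - 1])
assert np.array_equal(combine_signal_sketch(*split_signal_sketch(x)), x)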
# mel_basis = None
#
# def linear_to_mel(spectrogram):
# global mel_basis
# if mel_basis is None:
# mel_basis = build_mel_basis()
# return np.dot(mel_basis, spectrogram)
#
# def build_mel_basis():
# return librosa.filters.mel(sample_rate, n_fft, n_mels=num_mels, fmin=fmin)
#
# def normalize(S):
# return np.clip((S - min_level_db) / -min_level_db, 0, 1)
#
# def denormalize(S):
# return (np.clip(S, 0, 1) * -min_level_db) + min_level_db
#
# def amp_to_db(x):
# return 20 * np.log10(np.maximum(1e-5, x))
#
# def db_to_amp(x):
# return np.power(10.0, x * 0.05)
#
# def spectrogram(y):
# raise Exception()
# D = stft(y)
# S = amp_to_db(np.abs(D)) - ref_level_db
# return normalize(S)
#
# def melspectrogram(y):
# raise Exception()
# D = stft(y)
# S = amp_to_db(linear_to_mel(np.abs(D)))
# return normalize(S)
#
# def stft(y):
# raise Exception()
# return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
\ No newline at end of file
import torch
from vlibs import fileio
from vocoder.model import WaveRNN
from vocoder.vocoder_dataset import VocoderDataset
from vocoder import audio
from vocoder.params import *
import numpy as np
# run_name = 'from_synth'
run_name = 'mu_law'
model_dir = 'checkpoints'
model_fpath = fileio.join(model_dir, run_name + '.pt')
model = WaveRNN(rnn_dims=512,
                fc_dims=512,
                bits=bits,
                pad=pad,
                upsample_factors=(5, 5, 8),
                feat_dims=80,
                compute_dims=128,
                res_out_dims=128,
                res_blocks=10,
                hop_length=hop_length,
                sample_rate=sample_rate).cuda()
checkpoint = torch.load(model_fpath)
step = checkpoint['step']
model.load_state_dict(checkpoint['model_state'])
data_path = 'E:\\Datasets\\Synthesizer'
gen_path = 'model_outputs'
fileio.ensure_dir(gen_path)
dataset = VocoderDataset(data_path)
# Generate Samples
target = 11000
overlap = 550
k = step // 1000
indices = np.array(range(len(dataset)))
np.random.shuffle(indices)
for i in indices:
    print('Generating...')
    mel, wav_gt = dataset[i]
    out_gt_fpath = fileio.join(gen_path, "%s_%dk_steps_%d_gt.wav" % (run_name, k, i))
    out_pred_fpath = fileio.join(gen_path, "%s_%dk_steps_%d_pred.wav" % (run_name, k, i))
    wav_gt = audio.restore_signal(wav_gt)
    wav_pred = model.generate(mel, True, target, overlap)
    if use_mu_law:
        wav_pred = audio.expand_signal(wav_pred)
    audio.save_wav(out_pred_fpath, wav_pred)
    audio.save_wav(out_gt_fpath, wav_gt)
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from utils.display import *
from params import *
from utils import audio
import numpy as np
import time
from vocoder.params import *
class ResBlock(nn.Module):
    def __init__(self, dims):
@@ -185,13 +185,6 @@ class WaveRNN(nn.Module):
                distrib = torch.distributions.Categorical(posterior)
                sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
                # sample = distrib.sample()
                # a = sample.detach().cpu().numpy().copy()
                # sample = (sample.float() / (2 ** bits)) * 2 - 1
                # if use_mu_law:
                #     sample = torch.sign(sample) * (1 / (2 ** bits - 1)) * ((2 ** bits) **
                #                                                            torch.abs(sample) - 1)
                # assert np.allclose(audio.restore_signal(a), sample.detach().cpu().numpy())
                output.append(sample)
                x = sample.unsqueeze(-1)
@@ -214,8 +207,8 @@ class WaveRNN(nn.Module):
    def gen_display(self, i, seq_len, b_size, start):
        gen_rate = (i + 1) / (time.time() - start) * b_size / 1000
        realtime_ratio = gen_rate * 1000 / self.sample_rate
        stream('%i/%i -- batch_size: %i -- gen_rate: %.1f kHz -- x_realtime: %.1f ',
               (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio))
        print('\r%i/%i -- batch_size: %i -- gen_rate: %.1f kHz -- x_realtime: %.1f ' %
              (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio), end='')

    def get_gru_cell(self, gru):
        gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size)
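        # The collapsed remainder presumably copies the trained GRU weights into
        # the single-step cell, along these lines (a sketch, not the actual body):
        #     gru_cell.weight_hh.data = gru.weight_hh_l0.data
        #     gru_cell.weight_ih.data = gru.weight_ih_l0.data
        #     gru_cell.bias_hh.data = gru.bias_hh_l0.data
        #     gru_cell.bias_ih.data = gru.bias_ih_l0.data
        #     return gru_cell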
......
@@ -14,7 +14,7 @@ mel_max_abs_value = 4
# Whether to apply mu-law companding to the audio before quantization (and the inverse
# after restoring the quantized signal). This yields higher audio quality but requires
# more steps to reach convergence.
use_mu_law = False
use_mu_law = True
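To see what the flag buys, a small sketch comparing how many quantization levels a quiet passage occupies with and without companding (bits = 9 assumed; quantize mirrors quantize_signal from vocoder/audio.py):

import numpy as np

bits, mu = 9, 2 ** 9 - 1

def quantize(w, mu_law):
    if mu_law:
        w = np.sign(w) * np.log(1 + mu * np.abs(w)) / np.log(1 + mu)
    return np.round((w + 1) * (2 ** bits - 1) / 2).astype(np.int64)

quiet = np.random.uniform(-0.01, 0.01, 10000)
print(len(np.unique(quantize(quiet, mu_law=False))))   # a handful of levels
print(len(np.unique(quantize(quiet, mu_law=True))))    # over a hundred levels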
## Model parameters
# Number of bits for the encoding. Higher means higher quality output but longer training time
......
from torch.utils.data import Dataset
from vlibs import fileio
import numpy as np
from params import *
from utils import audio
from vocoder.params import *
from vocoder import audio
class VocoderDataset(Dataset):
    def __init__(self, data_path):
......