Commit 6ac1ff26 authored by Corentin Jemine

Included wave-rnn in the git, with trained models.

Parent bfa74de1
@@ -16,4 +16,5 @@ _old
encoder/saved_models/*_backups
tacotron2/logs-*
waveglow
torch-tacotron2
\ No newline at end of file
torch-tacotron2
wave-rnn/checkpoints
\ No newline at end of file
MIT License
Copyright (c) 2019 fatchord (https://github.com/fatchord)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# ## Alternative Model (Training)
# I've found WaveRNN quite slow to train so here's an alternative that utilises the optimised rnn
# kernels in Pytorch. The model below is much much faster to train, it will converge in 48hrs when
# training on 22.5kHz samples (or 24hrs using 16kHz samples) on a single GTX1080. It also works
# quite well with predicted GTA features.
# The model is simply two residual GRUs in sequence and then three dense layers with a 512 softmax
# output. This is supplemented with an upsampling network.
# Since the Pytorch rnn kernels are 'closed', the options for conditioning sites are greatly
# reduced. Here's the strategy I went with given that restriction:
# 1 - Upsampling: Nearest neighbour upsampling followed by 2d convolutions with 'horizontal' kernels
# to interpolate. Split up into two or three layers depending on the stft hop length.
# 2 - A 1d resnet with a 5 wide conv input and 1x1 res blocks. Not sure if this is necessary, but
# the thinking behind it is: the upsampled features give a local view of the conditioning - why not
# supplement that with a much wider view of conditioning features, including a peek at the future.
# One thing to note is that the resnet is computed only once and in parallel, so it shouldn't slow
# down training/generation much.
# Train this model to ~500k steps for 8/9bit linear samples or ~1M steps for 10bit linear or 9+bit
# mu_law.
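# Illustrative sketch of the upsampling described above (not from the original file): the per-layer
# scales must multiply to the stft hop length (assumed to be 200, as in utils/dsp.py) so that one
# mel frame expands to exactly one hop of audio samples; nearest neighbour upsampling is just
# repeating each frame along the time axis.
import numpy as np
_scales = (5, 5, 8)                           # the upsample_factors used further down
assert int(np.prod(_scales)) == 200           # product equals hop_length
_mel = np.random.rand(80, 10)                 # 80 mel bins, 10 frames
_up = np.repeat(_mel, 200, axis=1)            # nearest-neighbour expansion along time
assert _up.shape == (80, 10 * 200)            # one conditioning vector per audio sample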
import pickle, os
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from utils.display import *
from utils.dsp import *
from vlibs import fileio
import random
bits = 9
pad = 2
seq_len = hop_length * 5
class ResBlock(nn.Module):
def __init__(self, dims):
super().__init__()
self.conv1 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
self.conv2 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
self.batch_norm1 = nn.BatchNorm1d(dims)
self.batch_norm2 = nn.BatchNorm1d(dims)
def forward(self, x):
residual = x
x = self.conv1(x)
x = self.batch_norm1(x)
x = F.relu(x)
x = self.conv2(x)
x = self.batch_norm2(x)
return x + residual
class MelResNet(nn.Module):
def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims):
super().__init__()
k_size = pad * 2 + 1
self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False)
self.batch_norm = nn.BatchNorm1d(compute_dims)
self.layers = nn.ModuleList()
for i in range(res_blocks):
self.layers.append(ResBlock(compute_dims))
self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1)
def forward(self, x):
x = self.conv_in(x)
x = self.batch_norm(x)
x = F.relu(x)
for f in self.layers: x = f(x)
x = self.conv_out(x)
return x
class Stretch2d(nn.Module):
def __init__(self, x_scale, y_scale):
super().__init__()
self.x_scale = x_scale
self.y_scale = y_scale
def forward(self, x):
b, c, h, w = x.size()
x = x.unsqueeze(-1).unsqueeze(3)
x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale)
return x.view(b, c, h * self.y_scale, w * self.x_scale)
class UpsampleNetwork(nn.Module):
def __init__(self, feat_dims, upsample_scales, compute_dims,
res_blocks, res_out_dims, pad):
super().__init__()
total_scale = np.cumproduct(upsample_scales)[-1]
self.indent = pad * total_scale
self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims)
self.resnet_stretch = Stretch2d(total_scale, 1)
self.up_layers = nn.ModuleList()
for scale in upsample_scales:
k_size = (1, scale * 2 + 1)
padding = (0, scale)
stretch = Stretch2d(scale, 1)
conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False)
conv.weight.data.fill_(1. / k_size[1])
self.up_layers.append(stretch)
self.up_layers.append(conv)
def forward(self, m):
aux = self.resnet(m).unsqueeze(1)
aux = self.resnet_stretch(aux)
aux = aux.squeeze(1)
m = m.unsqueeze(1)
for f in self.up_layers: m = f(m)
m = m.squeeze(1)[:, :, self.indent:-self.indent]
return m.transpose(1, 2), aux.transpose(1, 2)
class Model(nn.Module):
def __init__(self, rnn_dims, fc_dims, bits, pad, upsample_factors,
feat_dims, compute_dims, res_out_dims, res_blocks,
hop_length, sample_rate):
super().__init__()
self.pad = pad
self.n_classes = 2 ** bits
self.rnn_dims = rnn_dims
self.aux_dims = res_out_dims // 4
self.hop_length = hop_length
self.sample_rate = sample_rate
self.upsample = UpsampleNetwork(feat_dims, upsample_factors, compute_dims,
res_blocks, res_out_dims, pad)
self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims)
self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True)
self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True)
self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims)
self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims)
self.fc3 = nn.Linear(fc_dims, self.n_classes)
num_params(self)
def forward(self, x, mels):
bsize = x.size(0)
h1 = torch.zeros(1, bsize, self.rnn_dims).cuda()
h2 = torch.zeros(1, bsize, self.rnn_dims).cuda()
mels, aux = self.upsample(mels)
aux_idx = [self.aux_dims * i for i in range(5)]
a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
a4 = aux[:, :, aux_idx[3]:aux_idx[4]]
x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2)
x = self.I(x)
res = x
x, _ = self.rnn1(x, h1)
x = x + res
res = x
x = torch.cat([x, a2], dim=2)
x, _ = self.rnn2(x, h2)
x = x + res
x = torch.cat([x, a3], dim=2)
x = F.relu(self.fc1(x))
x = torch.cat([x, a4], dim=2)
x = F.relu(self.fc2(x))
return F.log_softmax(self.fc3(x), dim=-1)
def generate(self, mels, save_path, batched, target, overlap):
self.eval()
output = []
start = time.time()
rnn1 = self.get_gru_cell(self.rnn1)
rnn2 = self.get_gru_cell(self.rnn2)
with torch.no_grad():
mels = torch.FloatTensor(mels).cuda().unsqueeze(0)
mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both')
mels, aux = self.upsample(mels.transpose(1, 2))
if batched:
mels = self.fold_with_overlap(mels, target, overlap)
aux = self.fold_with_overlap(aux, target, overlap)
b_size, seq_len, _ = mels.size()
h1 = torch.zeros(b_size, self.rnn_dims).cuda()
h2 = torch.zeros(b_size, self.rnn_dims).cuda()
x = torch.zeros(b_size, 1).cuda()
d = self.aux_dims
aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]
for i in range(seq_len):
m_t = mels[:, i, :]
a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)
x = torch.cat([x, m_t, a1_t], dim=1)
x = self.I(x)
h1 = rnn1(x, h1)
x = x + h1
inp = torch.cat([x, a2_t], dim=1)
h2 = rnn2(inp, h2)
x = x + h2
x = torch.cat([x, a3_t], dim=1)
x = F.relu(self.fc1(x))
x = torch.cat([x, a4_t], dim=1)
x = F.relu(self.fc2(x))
logits = self.fc3(x)
posterior = F.softmax(logits, dim=1)
distrib = torch.distributions.Categorical(posterior)
sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
output.append(sample)
x = sample.unsqueeze(-1)
if i % 100 == 0: self.gen_display(i, seq_len, b_size, start)
output = torch.stack(output).transpose(0, 1)
output = output.cpu().numpy()
output = output.astype(np.float64)
if batched:
output = self.xfade_and_unfold(output, target, overlap)
else:
output = output[0]
librosa.output.write_wav(save_path, output.astype(np.float32), self.sample_rate)
self.train()
return output
def gen_display(self, i, seq_len, b_size, start):
gen_rate = (i + 1) / (time.time() - start) * b_size / 1000
realtime_ratio = gen_rate * 1000 / self.sample_rate
stream('%i/%i -- batch_size: %i -- gen_rate: %.1f kHz -- x_realtime: %.1f ',
(i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio))
def get_gru_cell(self, gru):
gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size)
gru_cell.weight_hh.data = gru.weight_hh_l0.data
gru_cell.weight_ih.data = gru.weight_ih_l0.data
gru_cell.bias_hh.data = gru.bias_hh_l0.data
gru_cell.bias_ih.data = gru.bias_ih_l0.data
return gru_cell
def pad_tensor(self, x, pad, side='both'):
# NB - this is just a quick method I need right now
# i.e., it won't generalise to other shapes/dims
b, t, c = x.size()
total = t + 2 * pad if side == 'both' else t + pad
padded = torch.zeros(b, total, c).cuda()
if side == 'before' or side == 'both':
padded[:, pad:pad + t, :] = x
elif side == 'after':
padded[:, :t, :] = x
return padded
def fold_with_overlap(self, x, target, overlap):
""" Fold the tensor with overlap for quick batched inference.
Overlap will be used for crossfading in xfade_and_unfold()
Args:
x (tensor) : Upsampled conditioning features.
shape=(1, timesteps, features)
target (int) : Target timesteps for each index of batch
overlap (int) : Timesteps for both xfade and rnn warmup
Return:
(tensor) : shape=(num_folds, target + 2 * overlap, features)
Details:
x = [[h1, h2, ... hn]]
Where each h is a vector of conditioning features
Eg: target=2, overlap=1 with x.size(1)=10
folded = [[h1, h2, h3, h4],
[h4, h5, h6, h7],
[h7, h8, h9, h10]]
"""
_, total_len, features = x.size()
# Calculate variables needed
num_folds = (total_len - overlap) // (target + overlap)
extended_len = num_folds * (overlap + target) + overlap
remaining = total_len - extended_len
# Pad if some time steps poking out
if remaining != 0:
num_folds += 1
padding = target + 2 * overlap - remaining
x = self.pad_tensor(x, padding, side='after')
folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
# Get the values for the folded tensor
for i in range(num_folds):
start = i * (target + overlap)
end = start + target + 2 * overlap
folded[i] = x[:, start:end, :]
return folded
def xfade_and_unfold(self, y, target, overlap):
""" Applies a crossfade and unfolds into a 1d array.
Args:
y (ndarray) : Batched sequences of audio samples
shape=(num_folds, target + 2 * overlap)
dtype=np.float64
overlap (int) : Timesteps for both xfade and rnn warmup
Return:
(ndarray) : audio samples in a 1d array
shape=(total_len)
dtype=np.float64
Details:
y = [[seq1],
[seq2],
[seq3]]
Apply a gain envelope at both ends of the sequences
y = [[seq1_in, seq1_target, seq1_out],
[seq2_in, seq2_target, seq2_out],
[seq3_in, seq3_target, seq3_out]]
Stagger and add up the groups of samples:
[seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
"""
num_folds, length = y.shape
target = length - 2 * overlap
total_len = num_folds * (target + overlap) + overlap
# Need some silence for the rnn warmup
silence_len = overlap // 2
fade_len = overlap - silence_len
silence = np.zeros((silence_len), dtype=np.float64)
# Equal power crossfade
t = np.linspace(-1, 1, fade_len, dtype=np.float64)
fade_in = np.sqrt(0.5 * (1 + t))
fade_out = np.sqrt(0.5 * (1 - t))
# Concat the silence to the fades
fade_in = np.concatenate([silence, fade_in])
fade_out = np.concatenate([fade_out, silence])
# Apply the gain to the overlap samples
y[:, :overlap] *= fade_in
y[:, -overlap:] *= fade_out
unfolded = np.zeros((total_len), dtype=np.float64)
# Loop to add up all the samples
for i in range(num_folds):
start = i * (target + overlap)
end = start + target + 2 * overlap
unfolded[start:end] += y[i]
return unfolded
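# Illustrative sketch of the batched-inference helpers above (not from the original file):
# _fold_indices() reproduces the indexing from the fold_with_overlap() docstring example
# (target=2, overlap=1, 10 timesteps), and the equal-power crossfade used in xfade_and_unfold()
# keeps constant energy where neighbouring folds overlap.
def _fold_indices(total_len, target, overlap):
    # same start/end arithmetic as fold_with_overlap(), ignoring the padding branch
    num_folds = (total_len - overlap) // (target + overlap)
    return [(i * (target + overlap), i * (target + overlap) + target + 2 * overlap)
            for i in range(num_folds)]
print(_fold_indices(10, target=2, overlap=1))   # [(0, 4), (3, 7), (6, 10)] -- folds share `overlap` steps
_t = np.linspace(-1, 1, 100)
_fi, _fo = np.sqrt(0.5 * (1 + _t)), np.sqrt(0.5 * (1 - _t))
assert np.allclose(_fi ** 2 + _fo ** 2, 1.)     # fade_in^2 + fade_out^2 == 1 everywhere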
model = Model(rnn_dims=512,
fc_dims=512,
bits=bits,
pad=pad,
upsample_factors=(5, 5, 8),
feat_dims=80,
compute_dims=128,
res_out_dims=128,
res_blocks=10,
hop_length=hop_length,
sample_rate=sample_rate).cuda()
step = 0
if os.path.exists('checkpoints/step.npy'):
step = np.load('checkpoints/step.npy')
model.load_state_dict(torch.load('checkpoints/first.pt'))
DATA_PATH = 'E:\\Datasets\\Vocoder\\'
GEN_PATH = f'model_outputs\\'
fileio.ensure_dir(GEN_PATH)
with open(f'{DATA_PATH}dataset_ids.pkl', 'rb') as f:
dataset_ids = pickle.load(f)
# test_ids = dataset_ids[-50:]
# dataset_ids = dataset_ids[:-50]
random.shuffle(dataset_ids)
# ## Generate Samples
def generate(samples=3, batched=True, target=11_000, overlap=550):
k = step // 1000
test_mels = [np.load(f'{DATA_PATH}mel/{id}.npy') for id in dataset_ids[:samples]]
ground_truth = [np.load(f'{DATA_PATH}quant/{id}.npy') for id in dataset_ids[:samples]]
for i, (gt, mel) in enumerate(zip(ground_truth, test_mels)) :
print('\nGenerating: %i/%i' % (i+1, samples))
gt = 2 * gt.astype(np.float32) / (2**bits - 1.) - 1.
librosa.output.write_wav(f'{GEN_PATH}{k}k_steps_{i}_target.wav', gt, sr=sample_rate)
if batched :
save_str = f'{GEN_PATH}{k}k_steps_{i}_gen_batched_target{target}_overlap{overlap}.wav'
else :
save_str = f'{GEN_PATH}{k}k_steps_{i}_gen_not_batched.wav'
model.generate(mel, save_str, batched, target, overlap)
generate()
import torch
import torch.nn as nn
import torch.nn.functional as F
from utils.display import *
from utils.dsp import *
class WaveRNN(nn.Module) :
def __init__(self, hidden_size=896, quantisation=256) :
super(WaveRNN, self).__init__()
self.hidden_size = hidden_size
self.split_size = hidden_size // 2
# The main matmul
self.R = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
# Output fc layers
self.O1 = nn.Linear(self.split_size, self.split_size)
self.O2 = nn.Linear(self.split_size, quantisation)
self.O3 = nn.Linear(self.split_size, self.split_size)
self.O4 = nn.Linear(self.split_size, quantisation)
# Input fc layers
self.I_coarse = nn.Linear(2, 3 * self.split_size, bias=False)
self.I_fine = nn.Linear(3, 3 * self.split_size, bias=False)
# biases for the gates
self.bias_u = nn.Parameter(torch.zeros(self.hidden_size))
self.bias_r = nn.Parameter(torch.zeros(self.hidden_size))
self.bias_e = nn.Parameter(torch.zeros(self.hidden_size))
# display num params
self.num_params()
def forward(self, prev_y, prev_hidden, current_coarse) :
# Main matmul - the projection is split 3 ways
R_hidden = self.R(prev_hidden)
R_u, R_r, R_e = torch.split(R_hidden, self.hidden_size, dim=1)
# Project the prev input
coarse_input_proj = self.I_coarse(prev_y)
I_coarse_u, I_coarse_r, I_coarse_e = \
torch.split(coarse_input_proj, self.split_size, dim=1)
# Project the prev input and current coarse sample
fine_input = torch.cat([prev_y, current_coarse], dim=1)
fine_input_proj = self.I_fine(fine_input)
I_fine_u, I_fine_r, I_fine_e = \
torch.split(fine_input_proj, self.split_size, dim=1)
# concatenate for the gates
I_u = torch.cat([I_coarse_u, I_fine_u], dim=1)
I_r = torch.cat([I_coarse_r, I_fine_r], dim=1)
I_e = torch.cat([I_coarse_e, I_fine_e], dim=1)
# Compute all gates for coarse and fine
u = F.sigmoid(R_u + I_u + self.bias_u)
r = F.sigmoid(R_r + I_r + self.bias_r)
e = F.tanh(r * R_e + I_e + self.bias_e)
hidden = u * prev_hidden + (1. - u) * e
# Split the hidden state
hidden_coarse, hidden_fine = torch.split(hidden, self.split_size, dim=1)
# Compute outputs
out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
out_fine = self.O4(F.relu(self.O3(hidden_fine)))
return out_coarse, out_fine, hidden
def generate(self, seq_len) :
with torch.no_grad() :
# First split up the biases for the gates
b_coarse_u, b_fine_u = torch.split(self.bias_u, self.split_size)
b_coarse_r, b_fine_r = torch.split(self.bias_r, self.split_size)
b_coarse_e, b_fine_e = torch.split(self.bias_e, self.split_size)
# Lists for the two output seqs
c_outputs, f_outputs = [], []
# Some initial inputs
out_coarse = torch.LongTensor([0]).cuda()
out_fine = torch.LongTensor([0]).cuda()
# We'll need a hidden state
hidden = self.init_hidden()
# Need a clock for display
start = time.time()
# Loop for generation
for i in range(seq_len) :
# Split into two hidden states
hidden_coarse, hidden_fine = \
torch.split(hidden, self.split_size, dim=1)
# Scale and concat previous predictions
out_coarse = out_coarse.unsqueeze(0).float() / 127.5 - 1.
out_fine = out_fine.unsqueeze(0).float() / 127.5 - 1.
prev_outputs = torch.cat([out_coarse, out_fine], dim=1)
# Project input
coarse_input_proj = self.I_coarse(prev_outputs)
I_coarse_u, I_coarse_r, I_coarse_e = \
torch.split(coarse_input_proj, self.split_size, dim=1)
# Project hidden state and split 6 ways
R_hidden = self.R(hidden)
R_coarse_u , R_fine_u, \
R_coarse_r, R_fine_r, \
R_coarse_e, R_fine_e = torch.split(R_hidden, self.split_size, dim=1)
# Compute the coarse gates
u = F.sigmoid(R_coarse_u + I_coarse_u + b_coarse_u)
r = F.sigmoid(R_coarse_r + I_coarse_r + b_coarse_r)
e = F.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e)
hidden_coarse = u * hidden_coarse + (1. - u) * e
# Compute the coarse output
out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
posterior = F.softmax(out_coarse, dim=1)
distrib = torch.distributions.Categorical(posterior)
out_coarse = distrib.sample()
c_outputs.append(out_coarse)
# Project the [prev outputs and predicted coarse sample]
coarse_pred = out_coarse.float() / 127.5 - 1.
fine_input = torch.cat([prev_outputs, coarse_pred.unsqueeze(0)], dim=1)
fine_input_proj = self.I_fine(fine_input)
I_fine_u, I_fine_r, I_fine_e = \
torch.split(fine_input_proj, self.split_size, dim=1)
# Compute the fine gates
u = F.sigmoid(R_fine_u + I_fine_u + b_fine_u)
r = F.sigmoid(R_fine_r + I_fine_r + b_fine_r)
e = F.tanh(r * R_fine_e + I_fine_e + b_fine_e)
hidden_fine = u * hidden_fine + (1. - u) * e
# Compute the fine output
out_fine = self.O4(F.relu(self.O3(hidden_fine)))
posterior = F.softmax(out_fine, dim=1)
distrib = torch.distributions.Categorical(posterior)
out_fine = distrib.sample()
f_outputs.append(out_fine)
# Put the hidden state back together
hidden = torch.cat([hidden_coarse, hidden_fine], dim=1)
# Display progress
speed = (i + 1) / (time.time() - start)
stream('Gen: %i/%i -- Speed: %i', (i + 1, seq_len, speed))
coarse = torch.stack(c_outputs).squeeze(1).cpu().data.numpy()
fine = torch.stack(f_outputs).squeeze(1).cpu().data.numpy()
output = combine_signal(coarse, fine)
return output, coarse, fine
def init_hidden(self, batch_size=1) :
return torch.zeros(batch_size, self.hidden_size).cuda()
def num_params(self) :
parameters = filter(lambda p: p.requires_grad, self.parameters())
parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
print('Trainable Parameters: %.3f million' % parameters)
\ No newline at end of file
from multiprocessing import Pool, cpu_count
import pickle, glob
from utils.display import *
from utils.dsp import *
from vlibs import fileio
source_dir = r'E:\Datasets\LibriSpeech\train-clean*'
extension = 'flac'
out_dir = r'E:\Datasets\Vocoder'
quant_path = fileio.join(out_dir, 'quant')
mel_path = fileio.join(out_dir, 'mel')
fileio.ensure_dir(out_dir)
fileio.resetdir(quant_path)
fileio.resetdir(mel_path)
def convert_file(path):
wav = load_wav(path, encode=False)
mel = melspectrogram(wav)
quant = (wav + 1.) * (2 ** 9 - 1) / 2
return mel.astype(np.float32), quant.astype(np.int)
def process_wav(path):
fname = fileio.leaf(path)
fname = fname[:fname.rfind('.')]
m, x = convert_file(path)
np.save(fileio.join(mel_path, fname + ".npy"), m)
np.save(fileio.join(quant_path, fname + ".npy"), x)
return fname
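# Illustrative sketch of the 9-bit quantisation in convert_file() above (not from the original
# file): waveform samples in [-1, 1] map to class values in [0, 511], and the training/generation
# code later inverts this with 2 * q / (2 ** 9 - 1) - 1. `_w` is a made-up set of endpoint values.
_w = np.array([-1., 0., 1.])
_q = (_w + 1.) * (2 ** 9 - 1) / 2
assert np.array_equal(_q, np.array([0., 255.5, 511.]))
assert np.allclose(2 * _q / (2 ** 9 - 1.) - 1., _w)   # round trip (before the cast to int)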
def main():
def get_files(path):
filenames = []
for filename in glob.iglob(f'{path}/**/*.{extension}', recursive=True):
filenames += [filename]
return filenames
wav_files = get_files(source_dir)
# This will take a while depending on size of dataset
pool = Pool(processes=cpu_count())
dataset_ids = []
for i, fname in enumerate(pool.imap_unordered(process_wav, wav_files), 1):
dataset_ids += [fname]
stream('Processing: %i/%i', (i, len(wav_files)))
with open(fileio.join(out_dir, 'dataset_ids.pkl'), 'wb') as f:
pickle.dump(dataset_ids, f)
if __name__ == '__main__':
main()
\ No newline at end of file
# coding: utf-8
import torch
import torch.nn as nn
from utils.display import *
def np_now(tensor) : return tensor.detach().cpu().numpy()
def clamp(x, lo=0, hi=1) : return max(lo, min(hi, x))
class PruneMask() :
def __init__(self, layer, prune_rnn_input) :
self.mask = []
self.p_idx = [0]
self.total_params = 0
self.pruned_params = 0
self.split_size = 0
self.init_mask(layer, prune_rnn_input)
def init_mask(self, layer, prune_rnn_input) :
# Determine the layer type and
# num matrix splits if rnn
layer_type = str(layer).split('(')[0]
splits = {'Linear': 1, 'GRU': 3, 'LSTM': 4}
# Organise the num and indices of layer parameters
# Dense will have one index and rnns two (if pruning input)
if layer_type != 'Linear' :
self.p_idx = [0, 1] if prune_rnn_input else [1]
# Get list of parameters from layers
params = self.get_params(layer)
# For each param matrix in this layer, create a mask
for W in params :
self.mask += [torch.ones_like(W)]
self.total_params += W.size(0) * W.size(1)
# Need a split size for mask_from_matrix() later on
self.split_size = self.mask[0].size(0) // splits[layer_type]
def get_params(self, layer) :
params = []
for idx in self.p_idx :
params += [list(layer.parameters())[idx].data]
return params
def update_mask(self, layer, z) :
params = self.get_params(layer)
for i, W in enumerate(params) :
self.mask[i] = self.mask_from_matrix(W, z)
self.update_prune_count()
def apply_mask(self, layer) :
params = self.get_params(layer)
for M, W in zip(self.mask, params) : W *= M
def mask_from_matrix(self, W, z) :
# Split into gate matrices (or not)
W_split = torch.split(W, self.split_size)
M = []
# Loop through splits
for W in W_split :
# Sort the magnitudes
W_abs = torch.abs(W)
sorted_abs, _ = torch.sort(W_abs.view(-1))
# Pick k (num weights to zero)
k = int(W.size(0) * W.size(1) * z)
threshold = sorted_abs[k]
# Create the mask
M += [(W_abs >= threshold).float()]
return torch.cat(M)
def update_prune_count(self) :
self.pruned_params = 0
for M in self.mask :
self.pruned_params += int(np_now((M - 1).sum() * -1))
class Pruner() :
def __init__(self, layers, start_prune, prune_steps, target_sparsity,
prune_rnn_input=True, prune_every=500) :
self.z = 0 # Current sparsity z at step t
self.t_0 = start_prune
self.S = prune_steps
self.Z = target_sparsity
self.prune_every = prune_every
self.num_pruned = 0
self.total_params = 0
self.masks = []
for layer in layers :
self.masks += [PruneMask(layer, prune_rnn_input)]
self.count_total_params()
def update_sparsity(self, t) :
t = np_now(t)[0]
z = self.Z * (1 - (1 - (t - self.t_0) / self.S)**3)
self.z = clamp(z, 0, self.Z)
return t
def prune(self, layers, t) :
t = self.update_sparsity(t)
for (l, m) in zip(layers, self.masks) :
if self.prune_or_not(t) : m.update_mask(l, self.z)
if self.apply_or_not(t) : m.apply_mask(l)
self.count_num_pruned()
def prune_or_not(self, t) :
return True if t % self.prune_every == 0 and t > self.t_0 else False
def apply_or_not(self, t) :
return True if t >= self.t_0 else False
def restart(self, layers, t) :
# In case training is stopped
_ = self.update_sparsity(t)
for (l, m) in zip(layers, self.masks) :
m.update_mask(l, self.z)
def count_num_pruned(self) :
self.num_pruned = 0
for m in self.masks :
self.num_pruned += m.pruned_params
def count_total_params(self) :
for m in self.masks :
self.total_params += m.total_params
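# Illustrative sketch of the sparsity schedule implemented in Pruner.update_sparsity() above:
# z(t) = Z * (1 - (1 - (t - t_0) / S) ** 3), clamped to [0, Z]. The defaults below are those of
# the Model class that follows (start_prune=1000, prune_steps=200_000, sparsity_target=0.98).
def _sparsity_at(t, t_0=1000, S=200_000, Z=0.98):
    return clamp(Z * (1 - (1 - (t - t_0) / S) ** 3), 0, Z)
print(_sparsity_at(1_000))       # 0.0   -- ramp has not started
print(_sparsity_at(101_000))     # ~0.86 -- half way through the ramp
print(_sparsity_at(201_000))     # 0.98  -- target sparsity reached after S steps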
class Model(nn.Module) :
def __init__(self, in_size, model_size, start_prune=1000, prune_steps=200_000,
sparsity_target=0.98) :
super().__init__()
# Model Layers
self.rnn = nn.GRU(in_size, model_size)
self.fc = nn.Linear(model_size, model_size)
self.t = nn.Parameter(torch.zeros(1), requires_grad=False)
# Model Pruner
self.layers2prune = [self.rnn, self.fc]
self.pruner = Pruner(self.layers2prune, start_prune,
prune_steps, sparsity_target )
num_params(self)
def forward(self) :
# h = torch.ones(1, 2)
# x = self.rnn(x, h)
# x = self.fc(x)
self.prune()
self.step()
return True
def step(self) :
self.t += 1
def prune(self) :
self.pruner.prune(self.layers2prune, self.t)
def restart_pruner(self) :
self.pruner.restart(self.layers2prune, self.t)
in_size = 128
model_size = 512
start_prune = 10
prune_steps = 20000
sparsity_target = 0.9375
model = Model(in_size, model_size, start_prune, prune_steps, sparsity_target)
param_idx = [1, 2, 5]
for idx in param_idx :
W = list(model.parameters())[idx].data
plot_spec(W)
print(W.size(0) * W.size(1), W.shape)
sparsity = []
pruned_params = []
num_steps = start_prune + prune_steps + 1000
for step in range(num_steps) :
model()
sparsity += [model.pruner.z]
pruned_params += [model.pruner.num_pruned]
if step % 100 == 0 : stream('%i/%i', (step, num_steps))
plot(sparsity)
plot(pruned_params)
param_idx = [1, 2, 5]
for idx in param_idx :
W = list(model.parameters())[idx].data
plot_spec(torch.abs(W))
print(W.size(0) * W.size(1), W.shape)
model.pruner.Z, model.pruner.z
model.pruner.num_pruned, model.pruner.total_params,
model.pruner.num_pruned / model.pruner.total_params
model.pruner.total_params - model.pruner.num_pruned
model = Model(in_size=1, model_size=2)
model.t += 44_000
model.state_dict()
model.prune()
model.pruner.z
model.state_dict()
# ## Alternative Model (Training)
# I've found WaveRNN quite slow to train so here's an alternative that utilises the optimised rnn
# kernels in Pytorch. The model below is much much faster to train, it will converge in 48hrs when
# training on 22.5kHz samples (or 24hrs using 16kHz samples) on a single GTX1080. It also works
# quite well with predicted GTA features.
# The model is simply two residual GRUs in sequence and then three dense layers with a 512 softmax
# output. This is supplemented with an upsampling network.
# Since the Pytorch rnn kernels are 'closed', the options for conditioning sites are greatly
# reduced. Here's the strategy I went with given that restriction:
# 1 - Upsampling: Nearest neighbour upsampling followed by 2d convolutions with 'horizontal' kernels
# to interpolate. Split up into two or three layers depending on the stft hop length.
# 2 - A 1d resnet with a 5 wide conv input and 1x1 res blocks. Not sure if this is necessary, but
# the thinking behind it is: the upsampled features give a local view of the conditioning - why not
# supplement that with a much wider view of conditioning features, including a peek at the future.
# One thing to note is that the resnet is computed only once and in parallel, so it shouldn't slow
# down training/generation much.
# Train this model to ~500k steps for 8/9bit linear samples or ~1M steps for 10bit linear or 9+bit
# mu_law.
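# Illustrative sketch of how `pad` interacts with the upsampler in this training script (not from
# the original file): MelResNet uses a kernel of width pad * 2 + 1 = 5, so each training window
# carries `pad` extra mel frames on both sides, and UpsampleNetwork trims indent = pad * hop_length
# samples from each end so that the conditioning sequence lines up one-to-one with the seq_len
# audio samples (hop_length assumed to be 200, as in utils/dsp.py).
_pad, _hop = 2, 200
_mel_win = (_hop * 5) // _hop + 2 * _pad     # 9 mel frames fed to the network per window
_trimmed = _mel_win * _hop - 2 * _pad * _hop # samples left after trimming the indent
assert _trimmed == _hop * 5                  # equals seq_len = hop_length * 5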
import pickle, os
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from utils.display import *
from utils.dsp import *
from vlibs import fileio
bits = 9
pad = 2
seq_len = hop_length * 5
run_name = 'first'
model_dir = 'checkpoints'
fileio.ensure_dir(model_dir)
model_path = fileio.join(model_dir, run_name + '.pt')
data_path = r"E:\Datasets\Vocoder"
step_path = fileio.join(model_dir, "step.npy")
gen_path = 'model_outputs'
fileio.ensure_dir(gen_path)
with open(fileio.join(data_path, 'dataset_ids.pkl'), 'rb') as f:
dataset_ids = pickle.load(f)
test_ids = dataset_ids[-50:]
dataset_ids = dataset_ids[:-50]
class AudiobookDataset(Dataset):
def __init__(self, ids, path):
self.path = path
self.metadata = ids
def __getitem__(self, index):
file = self.metadata[index]
m = np.load(fileio.join(self.path, 'mel', file + '.npy'))
x = np.load(fileio.join(self.path, 'quant', file + '.npy'))
return m, x
def __len__(self):
return len(self.metadata)
def collate(batch) :
mel_win = seq_len // hop_length + 2 * pad
max_offsets = [x[0].shape[-1] - (mel_win + 2 * pad) for x in batch]
mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
sig_offsets = [(offset + pad) * hop_length for offset in mel_offsets]
mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win] for i, x in enumerate(batch)]
coarse = [x[1][sig_offsets[i]:sig_offsets[i] + seq_len + 1] for i, x in enumerate(batch)]
mels = np.stack(mels).astype(np.float32)
coarse = np.stack(coarse).astype(np.int64)
mels = torch.FloatTensor(mels)
coarse = torch.LongTensor(coarse)
x_input = 2 * coarse[:, :seq_len].float() / (2**bits - 1.) - 1.
y_coarse = coarse[:, 1:]
return x_input, mels, y_coarse
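# Illustrative sketch of what collate() returns (not from the original file): with bits = 9 the
# quantised samples are class indices in [0, 511]; x_input rescales them to [-1, 1] and y_coarse
# is the same window shifted one step ahead (the next-sample targets). `_toy` is a made-up window.
_toy = torch.LongTensor([[0, 255, 511, 255]])
_x_in = 2 * _toy[:, :3].float() / (2 ** bits - 1.) - 1.
print(_x_in)        # tensor([[-1.0000, -0.0020,  1.0000]]) -- class endpoints map to -1 and +1
print(_toy[:, 1:])  # tensor([[255, 511, 255]]) -- targets are the following samples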
dataset = AudiobookDataset(dataset_ids, data_path)
data_loader = DataLoader(dataset, collate_fn=collate, batch_size=32,
num_workers=0, shuffle=True)
class ResBlock(nn.Module) :
def __init__(self, dims) :
super().__init__()
self.conv1 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
self.conv2 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
self.batch_norm1 = nn.BatchNorm1d(dims)
self.batch_norm2 = nn.BatchNorm1d(dims)
def forward(self, x) :
residual = x
x = self.conv1(x)
x = self.batch_norm1(x)
x = F.relu(x)
x = self.conv2(x)
x = self.batch_norm2(x)
return x + residual
class MelResNet(nn.Module) :
def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims) :
super().__init__()
k_size = pad * 2 + 1
self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False)
self.batch_norm = nn.BatchNorm1d(compute_dims)
self.layers = nn.ModuleList()
for i in range(res_blocks) :
self.layers.append(ResBlock(compute_dims))
self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1)
def forward(self, x) :
x = self.conv_in(x)
x = self.batch_norm(x)
x = F.relu(x)
for f in self.layers : x = f(x)
x = self.conv_out(x)
return x
class Stretch2d(nn.Module) :
def __init__(self, x_scale, y_scale) :
super().__init__()
self.x_scale = x_scale
self.y_scale = y_scale
def forward(self, x) :
b, c, h, w = x.size()
x = x.unsqueeze(-1).unsqueeze(3)
x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale)
return x.view(b, c, h * self.y_scale, w * self.x_scale)
class UpsampleNetwork(nn.Module) :
def __init__(self, feat_dims, upsample_scales, compute_dims,
res_blocks, res_out_dims, pad) :
super().__init__()
total_scale = np.cumproduct(upsample_scales)[-1]
self.indent = pad * total_scale
self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims)
self.resnet_stretch = Stretch2d(total_scale, 1)
self.up_layers = nn.ModuleList()
for scale in upsample_scales :
k_size = (1, scale * 2 + 1)
padding = (0, scale)
stretch = Stretch2d(scale, 1)
conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False)
conv.weight.data.fill_(1. / k_size[1])
self.up_layers.append(stretch)
self.up_layers.append(conv)
def forward(self, m) :
aux = self.resnet(m).unsqueeze(1)
aux = self.resnet_stretch(aux)
aux = aux.squeeze(1)
m = m.unsqueeze(1)
for f in self.up_layers : m = f(m)
m = m.squeeze(1)[:, :, self.indent:-self.indent]
return m.transpose(1, 2), aux.transpose(1, 2)
class Model(nn.Module) :
def __init__(self, rnn_dims, fc_dims, bits, pad, upsample_factors,
feat_dims, compute_dims, res_out_dims, res_blocks,
hop_length, sample_rate):
super().__init__()
self.pad = pad
self.n_classes = 2**bits
self.rnn_dims = rnn_dims
self.aux_dims = res_out_dims // 4
self.hop_length = hop_length
self.sample_rate = sample_rate
self.upsample = UpsampleNetwork(feat_dims, upsample_factors, compute_dims,
res_blocks, res_out_dims, pad)
self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims)
self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True)
self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True)
self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims)
self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims)
self.fc3 = nn.Linear(fc_dims, self.n_classes)
num_params(self)
def forward(self, x, mels) :
bsize = x.size(0)
h1 = torch.zeros(1, bsize, self.rnn_dims).cuda()
h2 = torch.zeros(1, bsize, self.rnn_dims).cuda()
mels, aux = self.upsample(mels)
aux_idx = [self.aux_dims * i for i in range(5)]
a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
a4 = aux[:, :, aux_idx[3]:aux_idx[4]]
x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2)
x = self.I(x)
res = x
x, _ = self.rnn1(x, h1)
x = x + res
res = x
x = torch.cat([x, a2], dim=2)
x, _ = self.rnn2(x, h2)
x = x + res
x = torch.cat([x, a3], dim=2)
x = F.relu(self.fc1(x))
x = torch.cat([x, a4], dim=2)
x = F.relu(self.fc2(x))
return F.log_softmax(self.fc3(x), dim=-1)
def generate(self, mels, save_path, batched, target, overlap) :
self.eval()
output = []
start = time.time()
rnn1 = self.get_gru_cell(self.rnn1)
rnn2 = self.get_gru_cell(self.rnn2)
with torch.no_grad() :
mels = torch.FloatTensor(mels).cuda().unsqueeze(0)
mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both')
mels, aux = self.upsample(mels.transpose(1, 2))
if batched :
mels = self.fold_with_overlap(mels, target, overlap)
aux = self.fold_with_overlap(aux, target, overlap)
b_size, seq_len, _ = mels.size()
h1 = torch.zeros(b_size, self.rnn_dims).cuda()
h2 = torch.zeros(b_size, self.rnn_dims).cuda()
x = torch.zeros(b_size, 1).cuda()
d = self.aux_dims
aux_split = [aux[:, :, d*i:d*(i+1)] for i in range(4)]
for i in range(seq_len) :
m_t = mels[:, i, :]
a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)
x = torch.cat([x, m_t, a1_t], dim=1)
x = self.I(x)
h1 = rnn1(x, h1)
x = x + h1
inp = torch.cat([x, a2_t], dim=1)
h2 = rnn2(inp, h2)
x = x + h2
x = torch.cat([x, a3_t], dim=1)
x = F.relu(self.fc1(x))
x = torch.cat([x, a4_t], dim=1)
x = F.relu(self.fc2(x))
logits = self.fc3(x)
posterior = F.softmax(logits, dim=1)
distrib = torch.distributions.Categorical(posterior)
sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
output.append(sample)
x = sample.unsqueeze(-1)
if i % 100 == 0 : self.gen_display(i, seq_len, b_size, start)
output = torch.stack(output).transpose(0, 1)
output = output.cpu().numpy()
output = output.astype(np.float64)
if batched :
output = self.xfade_and_unfold(output, target, overlap)
else :
output = output[0]
librosa.output.write_wav(save_path, output.astype(np.float32), self.sample_rate)
self.train()
return output
def gen_display(self, i, seq_len, b_size, start) :
gen_rate = (i + 1) / (time.time() - start) * b_size / 1000
realtime_ratio = gen_rate * 1000 / self.sample_rate
stream('%i/%i -- batch_size: %i -- gen_rate: %.1f kHz -- x_realtime: %.1f ',
(i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio))
def get_gru_cell(self, gru) :
gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size)
gru_cell.weight_hh.data = gru.weight_hh_l0.data
gru_cell.weight_ih.data = gru.weight_ih_l0.data
gru_cell.bias_hh.data = gru.bias_hh_l0.data
gru_cell.bias_ih.data = gru.bias_ih_l0.data
return gru_cell
def pad_tensor(self, x, pad, side='both') :
# NB - this is just a quick method I need right now
# i.e., it won't generalise to other shapes/dims
b, t, c = x.size()
total = t + 2 * pad if side == 'both' else t + pad
padded = torch.zeros(b, total, c).cuda()
if side == 'before' or side == 'both' :
padded[:, pad:pad+t, :] = x
elif side == 'after' :
padded[:, :t, :] = x
return padded
def fold_with_overlap(self, x, target, overlap) :
""" Fold the tensor with overlap for quick batched inference.
Overlap will be used for crossfading in xfade_and_unfold()
Args:
x (tensor) : Upsampled conditioning features.
shape=(1, timesteps, features)
target (int) : Target timesteps for each index of batch
overlap (int) : Timesteps for both xfade and rnn warmup
Return:
(tensor) : shape=(num_folds, target + 2 * overlap, features)
Details:
x = [[h1, h2, ... hn]]
Where each h is a vector of conditioning features
Eg: target=2, overlap=1 with x.size(1)=10
folded = [[h1, h2, h3, h4],
[h4, h5, h6, h7],
[h7, h8, h9, h10]]
"""
_, total_len, features = x.size()
# Calculate variables needed
num_folds = (total_len - overlap) // (target + overlap)
extended_len = num_folds * (overlap + target) + overlap
remaining = total_len - extended_len
# Pad if some time steps poking out
if remaining != 0 :
num_folds += 1
padding = target + 2 * overlap - remaining
x = self.pad_tensor(x, padding, side='after')
folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
# Get the values for the folded tensor
for i in range(num_folds) :
start = i * (target + overlap)
end = start + target + 2 * overlap
folded[i] = x[:, start:end, :]
return folded
def xfade_and_unfold(self, y, target, overlap) :
""" Applies a crossfade and unfolds into a 1d array.
Args:
y (ndarray) : Batched sequences of audio samples
shape=(num_folds, target + 2 * overlap)
dtype=np.float64
overlap (int) : Timesteps for both xfade and rnn warmup
Return:
(ndarray) : audio samples in a 1d array
shape=(total_len)
dtype=np.float64
Details:
y = [[seq1],
[seq2],
[seq3]]
Apply a gain envelope at both ends of the sequences
y = [[seq1_in, seq1_target, seq1_out],
[seq2_in, seq2_target, seq2_out],
[seq3_in, seq3_target, seq3_out]]
Stagger and add up the groups of samples:
[seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
"""
num_folds, length = y.shape
target = length - 2 * overlap
total_len = num_folds * (target + overlap) + overlap
# Need some silence for the rnn warmup
silence_len = overlap // 2
fade_len = overlap - silence_len
silence = np.zeros((silence_len), dtype=np.float64)
# Equal power crossfade
t = np.linspace(-1, 1, fade_len, dtype=np.float64)
fade_in = np.sqrt(0.5 * (1 + t))
fade_out = np.sqrt(0.5 * (1 - t))
# Concat the silence to the fades
fade_in = np.concatenate([silence, fade_in])
fade_out = np.concatenate([fade_out, silence])
# Apply the gain to the overlap samples
y[:, :overlap] *= fade_in
y[:, -overlap:] *= fade_out
unfolded = np.zeros((total_len), dtype=np.float64)
# Loop to add up all the samples
for i in range(num_folds) :
start = i * (target + overlap)
end = start + target + 2 * overlap
unfolded[start:end] += y[i]
return unfolded
model = Model(rnn_dims=512,
fc_dims=512,
bits=bits,
pad=pad,
upsample_factors=(5, 5, 8),
feat_dims=80,
compute_dims=128,
res_out_dims=128,
res_blocks=10,
hop_length=hop_length,
sample_rate=sample_rate).cuda()
if os.path.exists(model_path):
model.load_state_dict(torch.load(model_path))
global step
step = 0
if os.path.exists(step_path):
step = np.load(step_path)
def train(model, optimiser, epochs, batch_size, classes, seq_len, step, lr=1e-4) :
for p in optimiser.param_groups : p['lr'] = lr
criterion = nn.NLLLoss().cuda()
for e in range(epochs) :
trn_loader = DataLoader(dataset, collate_fn=collate, batch_size=batch_size,
num_workers=2, shuffle=True, pin_memory=True)
start = time.time()
running_loss = 0.
iters = len(trn_loader)
for i, (x, m, y) in enumerate(trn_loader) :
x, m, y = x.cuda(), m.cuda(), y.cuda()
y_hat = model(x, m)
y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
y = y.unsqueeze(-1)
loss = criterion(y_hat, y)
optimiser.zero_grad()
loss.backward()
optimiser.step()
running_loss += loss.item()
speed = (i + 1) / (time.time() - start)
avg_loss = running_loss / (i + 1)
step += 1
k = step // 1000
stream('Epoch: %i/%i -- Batch: %i/%i -- Loss: %.3f -- %.2f steps/sec -- Step: %ik ',
(e + 1, epochs, i + 1, iters, avg_loss, speed, k))
torch.save(model.state_dict(), model_path)
np.save(step_path, step)
print('<saved>')
import random
if __name__ == '__main__':
optimiser = optim.Adam(model.parameters())
train(model, optimiser, epochs=60, batch_size=16, classes=2**bits,
seq_len=seq_len, step=step, lr=1e-4)
# ## Generate Samples
def generate(samples=3, batched=True, target=11_000, overlap=550):
random.shuffle(test_ids)
outputs = []
k = step // 1000
test_mels = [np.load(fileio.join(data_path, "mel", fname + ".npy"))
for fname in test_ids[:samples]]
ground_truth = [np.load(fileio.join(data_path, "quant", fname + ".npy"))
for fname in test_ids[:samples]]
for i, (gt, mel) in enumerate(zip(ground_truth, test_mels)) :
print('\nGenerating: %i/%i' % (i+1, samples))
gt = 2 * gt.astype(np.float32) / (2**bits - 1.) - 1.
librosa.output.write_wav(fileio.join(gen_path, "%dk_steps_%d_target.wav" % (k, i)),
gt, sr=sample_rate)
if batched :
save_str = fileio.join(gen_path, "%dk_steps_%d_gen_batched_target%d_overlap%d.wav"
% (k, i, target, overlap))
else :
save_str = fileio.join(gen_path, "%dk_steps_%d_gen_not_batched.wav" % (k, i))
outputs.append(model.generate(mel, save_str, batched, target, overlap))
for output in outputs:
plot(output)
generate(batched=True)
import matplotlib.pyplot as plt
import time, sys, math
import numpy as np
def stream(string, variables) :
sys.stdout.write(f'\r{string}' % variables)
def num_params(model) :
parameters = filter(lambda p: p.requires_grad, model.parameters())
parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
print('Trainable Parameters: %.3f million' % parameters)
def time_since(started) :
elapsed = time.time() - started
m = int(elapsed // 60)
s = int(elapsed % 60)
if m >= 60 :
h = int(m // 60)
m = m % 60
return f'{h}h {m}m {s}s'
else :
return f'{m}m {s}s'
def plot(array) :
fig = plt.figure(figsize=(30, 5))
ax = fig.add_subplot(111)
ax.xaxis.label.set_color('grey')
ax.yaxis.label.set_color('grey')
ax.xaxis.label.set_fontsize(23)
ax.yaxis.label.set_fontsize(23)
ax.tick_params(axis='x', colors='grey', labelsize=23)
ax.tick_params(axis='y', colors='grey', labelsize=23)
plt.plot(array)
def plot_spec(M) :
M = np.flip(M, axis=0)
plt.figure(figsize=(18,4))
plt.imshow(M, interpolation='nearest', aspect='auto')
plt.show()
import numpy as np
import librosa, math
sample_rate = 16000
n_fft = 800
fft_bins = 513
num_mels = 80
hop_length = 200
win_length = 800
fmin = 55
min_level_db = -100
ref_level_db = 20
def load_wav(filename, encode=True) :
x = librosa.load(filename, sr=sample_rate)[0]
if encode:
x = encode_16bits(x)
return x
def save_wav(y, filename) :
if y.dtype != 'int16' :
y = encode_16bits(y)
librosa.output.write_wav(filename, y.astype(np.int16), sample_rate)
def split_signal(x) :
unsigned = x + 2**15
coarse = unsigned // 256
fine = unsigned % 256
return coarse, fine
def combine_signal(coarse, fine) :
return coarse * 256 + fine - 2**15
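# Illustrative sketch (not from the original file): split_signal() offsets a signed 16-bit sample
# into an unsigned value and splits it into a coarse (high) byte and a fine (low) byte -- the
# 8-bit/8-bit factorisation the original WaveRNN model predicts -- and combine_signal() inverts it
# exactly. `_s` is a made-up set of samples for the check.
_s = np.array([-2 ** 15, -1, 0, 12345, 2 ** 15 - 1])
_c, _f = split_signal(_s)
assert np.array_equal(combine_signal(_c, _f), _s)   # lossless round trip
assert 0 <= _c.min() and _c.max() <= 255 and 0 <= _f.min() and _f.max() <= 255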
def encode_16bits(x) :
return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)
mel_basis = None
def linear_to_mel(spectrogram):
global mel_basis
if mel_basis is None:
mel_basis = build_mel_basis()
return np.dot(mel_basis, spectrogram)
def build_mel_basis():
return librosa.filters.mel(sample_rate, n_fft, n_mels=num_mels, fmin=fmin)
def normalize(S):
return np.clip((S - min_level_db) / -min_level_db, 0, 1)
def denormalize(S):
return (np.clip(S, 0, 1) * -min_level_db) + min_level_db
def amp_to_db(x):
return 20 * np.log10(np.maximum(1e-5, x))
def db_to_amp(x):
return np.power(10.0, x * 0.05)
def spectrogram(y):
D = stft(y)
S = amp_to_db(np.abs(D)) - ref_level_db
return normalize(S)
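# Illustrative sketch (not from the original file): amp_to_db()/db_to_amp() and
# normalize()/denormalize() are inverse pairs on their working ranges, so a normalised spectrogram
# value can be mapped back to the floored linear amplitude. `_a` is a made-up set of amplitudes.
_a = np.array([1e-5, 0.01, 1.0])
_d = amp_to_db(_a)                                  # [-100., -40., 0.] dB
assert np.allclose(db_to_amp(_d), _a)               # dB mapping inverts above the 1e-5 floor
assert np.allclose(denormalize(normalize(_d)), _d)  # normalisation inverts within [min_level_db, 0]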
def melspectrogram(y):
D = stft(y)
S = amp_to_db(linear_to_mel(np.abs(D)))
return normalize(S)
def stft(y):
return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
\ No newline at end of file