提交 60e96934 编写于 作者: C Corentin Jemine

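Fixed a bug with the mu-law encoding in the vocoder: when `use_mu_law` is set, waveforms are now companded before quantization in the dataset, and the restored ground-truth signal is expanded again before use. Also trying a bigger vocoder network (`mu_law_big`).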

上级 4da25d2b
......@@ -3,7 +3,7 @@ from vocoder.vocoder_dataset import VocoderDataset
from vocoder import inference
from vocoder import audio
import numpy as np
from vocoder.params import print_params, model_name
from vocoder.params import print_params, model_name, use_mu_law
print_params()
......@@ -28,6 +28,8 @@ for i in sorted(np.random.choice(len(dataset), n_samples)):
out_pred_fpath = fileio.join(gen_path, "%s_%d_pred.wav" % (model_name, i))
wav_gt = audio.restore_signal(wav_gt)
if use_mu_law:
wav_gt = audio.expand_signal(wav_gt)
wav_pred = inference.infer_waveform(mel, normalize=False) # The dataloader already normalizes
audio.save_wav(out_pred_fpath, wav_pred)
......
......@@ -34,8 +34,8 @@ model_dir = 'checkpoints'
fileio.ensure_dir(model_dir)
model_fpath = fileio.join(model_dir, model_name + '.pt')
# data_path = "../data/Synthesizer"
data_path = "E:/Datasets/Synthesizer"
data_path = "../data/Synthesizer"
# data_path = "E:/Datasets/Synthesizer"
gen_path = 'model_outputs'
fileio.ensure_dir(gen_path)
......@@ -125,6 +125,6 @@ if __name__ == '__main__':
print('<saved>')
optimizer = optim.Adam(model.parameters())
train(model, optimizer, epochs=60, batch_size=64, classes=2 ** bits,
train(model, optimizer, epochs=100, batch_size=100, classes=2 ** bits,
seq_len=seq_len, step=step, lr=1e-4)
\ No newline at end of file
......@@ -17,38 +17,38 @@ use_mu_law = True
# Minimum number of mel frames below which samples are discarded for training
min_n_frames = 10
## Model parameters
model_name = 'mu_law'
# Number of bits for the encoding. Higher means higher quality output but longer training time
# and training memory required.
bits = 9
pad = 2
seq_len = hop_length * 5
mel_win = seq_len // hop_length + 2 * pad
rnn_dims = 512
fc_dims = 512
upsample_factors = (5, 5, 8)
feat_dims = 80
compute_dims = 128
res_out_dims = 128
res_blocks = 10
# ## Model parameters
# model_name = 'mu_law_big'
# model_name = 'mu_law'
# # Number of bits for the encoding. Higher means higher quality output but longer training time
# # and training memory required.
# bits = 9
# pad = 2
# seq_len = hop_length * 5
# mel_win = seq_len // hop_length + 2 * pad
# rnn_dims = 768
# fc_dims = 768
# rnn_dims = 512
# fc_dims = 512
# upsample_factors = (5, 5, 8)
# feat_dims = 80
# compute_dims = 196
# res_out_dims = 196
# compute_dims = 128
# res_out_dims = 128
# res_blocks = 10
## Model parameters
# Active configuration: the larger 'mu_law_big' network. Its rnn/fc dims (768) and
# compute/res_out dims (196) are bigger than the 'mu_law' configuration (512 and 128).
# The name is also used to build the checkpoint file name (model_name + '.pt') and the
# names of the generated sample wavs.
model_name = 'mu_law_big'
# Number of bits for the encoding. Higher means higher quality output but longer training time
# and training memory required.
# Training is called with classes=2 ** bits, i.e. 512 classes for bits = 9.
bits = 9
# Counted twice in mel_win below -- presumably padding frames on each side of the
# window; TODO confirm against the dataset/model code.
pad = 2
# Five hops' worth of audio per training sequence (hop_length is defined earlier in this file).
seq_len = hop_length * 5
# Mel frames per training window: seq_len // hop_length frames plus 2 * pad.
# With the values above this is 5 + 4 = 9 (for a non-zero integer hop_length).
mel_win = seq_len // hop_length + 2 * pad
rnn_dims = 768
fc_dims = 768
# The factors multiply to 5 * 5 * 8 = 200.
# NOTE(review): presumably this product must equal hop_length -- confirm.
upsample_factors = (5, 5, 8)
# NOTE(review): presumably the number of mel channels in the input features -- confirm.
feat_dims = 80
compute_dims = 196
res_out_dims = 196
# NOTE(review): presumably the number of residual blocks in the model -- confirm.
res_blocks = 10
def print_params():
for param_name in sorted(globals()):
......
......@@ -22,6 +22,8 @@ class VocoderDataset(Dataset):
# Load the wav and quantize it
wav = np.load(wav_path)
if use_mu_law:
wav = audio.compand_signal(wav)
quant = audio.quantize_signal(wav)
# Load the mel spectrogram and adjust its range to [0, 1]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册