Commit f8dc5fce authored by Corentin Jemine

Included VC2 as a dataset for the speaker encoder

Parent 5396a54a
@@ -8,7 +8,7 @@ import struct
 from scipy.ndimage.morphology import binary_dilation
 from params_data import *
-int16_max = 32768
+int16_max = (2 ** 15) - 1
 def load(fpath):
     """
......
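The one-character fix above matters for normalization: an int16 sample spans -32768..32767, so dividing by 32767 rather than 32768 maps the positive peak to exactly 1.0. A minimal sketch of the idea (the helper name is hypothetical, not from the repo):

import numpy as np

int16_max = (2 ** 15) - 1  # 32767, the largest positive value an int16 can hold

def int16_to_float(samples):
    # Normalize raw int16 PCM to float32 in roughly [-1.0, 1.0].
    # The negative extreme (-32768) lands just below -1.0, which is harmless here.
    return samples.astype(np.float32) / int16_max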
@@ -6,7 +6,8 @@ project_root = fileio.abspath(fileio.leafdir(__file__))
 librispeech_root = "E://Datasets/LibriSpeech"
 librispeech_datasets = ["train-other-500"]
 voxceleb1_root = "E://Datasets/VoxCeleb1"
-voxceleb_datasets = ["voxceleb1"]
+voxceleb2_root = "E://Datasets/VoxCeleb2"
+voxceleb_datasets = ["voxceleb1", "voxceleb2"]
 anglophone_nationalites = ['australia', 'canada', 'ireland', 'uk', 'usa']
 clean_data_root = "E://Datasets//SpeakerEncoder"
 all_datasets = librispeech_datasets + voxceleb_datasets
......
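For context, the roots above pair with the names collected in all_datasets; a hypothetical helper (not in the repo) showing how a consumer might resolve a dataset name to its directory:

dataset_roots = {
    "train-other-500": librispeech_root,
    "voxceleb1": voxceleb1_root,
    "voxceleb2": voxceleb2_root,  # the root added by this commit
}

def root_for(dataset_name):
    # Map a name from all_datasets to the directory it lives under.
    return dataset_roots[dataset_name]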
@@ -7,8 +7,8 @@ from data_objects.speaker_verification_dataset import SpeakerVerificationDataset
 from config import *
 if __name__ == '__main__':
-    dataset = SpeakerVerificationDataset(all_datasets)
-    loader = SpeakerVerificationDataLoader(dataset, 3, 4, num_workers=3)
+    dataset = SpeakerVerificationDataset(['voxceleb2'])
+    loader = SpeakerVerificationDataLoader(dataset, 4, 5, num_workers=3)
     for batch in loader:
         SpeakerMatrixUI(batch.speakers, batch.partial_utterances)
\ No newline at end of file
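Judging by the call signature, the two positional arguments are presumably speakers_per_batch and utterances_per_speaker, so each batch here holds 4 * 5 = 20 partial utterances. A rough sketch of that GE2E-style sampling (names and structure are illustrative, not the repo's API):

import random

def sample_batch(utterances_by_speaker, speakers_per_batch=4, utterances_per_speaker=5):
    # Draw N distinct speakers, then M partial utterances from each,
    # giving N * M utterances per batch.
    speakers = random.sample(list(utterances_by_speaker), speakers_per_batch)
    return {s: random.choices(utterances_by_speaker[s], k=utterances_per_speaker)
            for s in speakers}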
@@ -99,16 +99,13 @@ class SpeakerEncoder(nn.Module):
         loss = self.loss_fn(sim_matrix, torch.from_numpy(ground_truth).long())

         # EER (not backpropagated)
-        sim_matrix = sim_matrix.detach().numpy()
+        with torch.no_grad():
             ## Imbalanced EER
             inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0]
             labels = np.array([inv_argmax(i) for i in ground_truth])
-            preds = sim_matrix
+            preds = sim_matrix.detach().numpy()

             # Snippet from https://yangcha.github.io/EER-ROC/
             fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
             eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
             # thresh = interp1d(fpr, thresholds)(eer)
         return loss, eer
\ No newline at end of file
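The EER logic above is self-contained enough to test in isolation. A minimal runnable sketch of the same roc_curve/brentq recipe, on toy labels and scores, assuming scikit-learn and scipy are available:

import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import brentq
from sklearn.metrics import roc_curve

def compute_eer(labels, scores):
    # The EER is the point on the ROC curve where the false-acceptance
    # rate (FPR) equals the false-rejection rate (1 - TPR).
    fpr, tpr, _ = roc_curve(labels, scores)
    return brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)

labels = np.array([1, 1, 1, 0, 0, 0])
scores = np.array([0.9, 0.8, 0.6, 0.7, 0.4, 0.2])
print("EER: %.3f" % compute_eer(labels, scores))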
+import sys
 from vlibs.ui import console
 from vlibs import fileio
 from config import *
@@ -168,6 +169,60 @@ def preprocess_voxceleb1(n_speakers=None, n_utterances=None):
     logger.finalize()

+def preprocess_voxceleb2(n_speakers=None, n_utterances=None):
+    fileio.ensure_dir(clean_data_root)
+    dataset_name = "voxceleb2"
+    out_dir = fileio.ensure_dir(fileio.join(clean_data_root, dataset_name))
+    logger = DatasetLog(clean_data_root, dataset_name)
+
+    # Get the speaker directories
+    speakers_root = fileio.join(voxceleb2_root, "dev", "aac")
+    speaker_ids = fileio.listdir(speakers_root)[:n_speakers]
+    print("Preprocessing data for %d speakers." % len(speaker_ids))
+
+    # Function to preprocess utterances for one speaker
+    def preprocess_speaker(speaker_id):
+        print("Starting speaker %s" % speaker_id)
+        speaker_name = "VoxCeleb2_%s" % speaker_id
+        speaker_in_dir = fileio.join(speakers_root, speaker_id)
+        speaker_out_dir = fileio.ensure_dir(fileio.join(out_dir, speaker_name))
+        fileio.resetdir(speaker_out_dir)
+        sources_file = open(fileio.join(speaker_out_dir, "sources.txt"), 'w')
+
+        fpaths = fileio.get_files(speaker_in_dir, r"\.m4a", recursive=True)[:n_utterances]
+        for i, in_fpath in enumerate(fpaths):
+            # Load and preprocess the waveform
+            wave = audio.load(in_fpath)
+            wave = preprocess_wave(wave)
+            if len(wave) == 0:
+                print('Warning: audio file %s is entirely silent after processing.' % in_fpath,
+                      file=sys.stderr)
+                continue
+
+            # Create and save the mel spectrogram
+            frames = audio.wave_to_mel_filterbank(wave)
+            if len(frames) < partial_utterance_length:
+                continue
+            video_id = fileio.leaf(fileio.leafdir(in_fpath))
+            fname = video_id + '_' + fileio.leaf(in_fpath).replace(".m4a", ".npy")
+            out_fpath = fileio.join(speaker_out_dir, fname)
+            np.save(out_fpath, frames)
+            logger.add_sample(duration=len(wave) / sampling_rate)
+            sources_file.write("%s %s\n" % (fname, in_fpath))
+
+        sources_file.close()
+        print("Speaker %s done!" % speaker_id)
+
+    # Process the utterances for each speaker
+    with ThreadPool(8) as pool:
+        list(pool.imap(preprocess_speaker, speaker_ids))
+    logger.finalize()

 if __name__ == '__main__':
     # preprocess_librispeech()
-    preprocess_voxceleb1()
+    # preprocess_voxceleb1()
+    preprocess_voxceleb2()
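One practical note on the new function: VoxCeleb2 ships AAC audio in .m4a containers under dev/aac/<speaker_id>/<video_id>/*.m4a, so audio.load has to go through a codec rather than a plain wave reader. A sketch of such a loader, under the assumption that librosa and an ffmpeg backend are installed (not necessarily how the repo's audio module does it):

import librosa

def load_m4a(fpath, sampling_rate=16000):
    # librosa delegates non-wav formats to audioread/ffmpeg, decoding
    # to mono float32 and resampling to the requested rate.
    wave, _ = librosa.load(fpath, sr=sampling_rate)
    return wave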
@@ -10,8 +10,8 @@ import torch

 # Specify the run ID here. Note: visdom will group together run IDs starting with the same prefix
 # followed by an underscore.
 run_id = None
 run_id = 'first_debug'
-run_id = 'debug_eer2'
+run_id = 'all'

 implementation_doc = {
     'Lr decay': None,
@@ -36,7 +36,6 @@ if __name__ == '__main__':
     # Create the model and the optimizer
     model = SpeakerEncoder()
     optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
-    # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, exponential_decay_beta)
     init_step = 1

     # Load any existing model
@@ -72,7 +71,6 @@ if __name__ == '__main__':
         loss.backward()
         model.do_gradient_ops()
         optimizer.step()
-        # scheduler.step()

         # Update visualizations
         learning_rate = optimizer.param_groups[0]['lr']
......
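The loop keeps model.do_gradient_ops() between backward() and step(); the diff doesn't show its body, but in the GE2E recipe this slot typically scales down the gradients of the learned similarity weight/bias and clips the global gradient norm. A hedged sketch of that idea (attribute names and constants are assumptions, not taken from this commit):

import torch

def do_gradient_ops(model):
    # Assumed GE2E-style treatment: damp the gradients of the learned
    # cosine-similarity scale/offset, then clip the overall gradient norm.
    model.similarity_weight.grad *= 0.01
    model.similarity_bias.grad *= 0.01
    torch.nn.utils.clip_grad_norm_(model.parameters(), 3.0, norm_type=2)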