提交 60e9d6e4 编写于 作者:Corentin Jemine

Fixed error cases of encoder inference

上级 4d21f6b0
from vlibs import fileio
from vlibs.structs.random_cycler import RandomCycler
from ..data_objects.utterance import Utterance
from .utterance import Utterance
# Contains the set of utterances of a single speaker
class Speaker:
......
from typing import List
import numpy as np
from ..data_objects.speaker import Speaker
from .speaker import Speaker
from ..params_data import mel_n_channels
class SpeakerBatch:
......
......@@ -4,8 +4,8 @@ from collections import OrderedDict
from vlibs import fileio
import numpy as np
import random
from ..data_objects.speaker_batch import SpeakerBatch
from ..data_objects.speaker import Speaker
from .speaker_batch import SpeakerBatch
from .speaker import Speaker
from ..params_data import partial_utterance_n_frames
from ..config import *
......
......@@ -43,24 +43,40 @@ def embed_frames_batch(frames_batch):
def compute_partial_splits(n_samples, partial_utterance_n_frames=partial_utterance_n_frames,
min_pad_coverage=0.75, overlap=0.5):
"""
Computes
:param n_samples:
:param partial_utterance_n_frames:
:param min_pad_coverage:
:param overlap:
:return:
Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
spectrogram splits are returned, so as to make each partial utterance waveform correspond to
its spectrogram. This function assumes that the mel spectrogram parameters used are those
defined in params_data.py.
The returned ranges may be indexing further than the length of the waveform. It is
recommended that you pad the waveform with zeros up to wave_splits[-1].stop.
:param n_samples: the number of samples in the waveform
:param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
utterance
:param min_pad_coverage: when reaching the last partial utterance, it may or may not have
enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
then the last partial utterance will be considered, as if we padded the audio. Otherwise,
it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
utterance, this parameter is ignored so that the function always returns at least 1 split.
:param overlap: by how much the partial utterance should overlap. If set to 0, the partial
utterances are entirely disjoint.
:return: the waveform splits and mel spectrogram splits as lists of array slices. Index
respectively the waveform and the mel spectrogram with these slices to obtain the partial
utterances.
"""
assert 0 <= overlap < 1
assert 0 < min_pad_coverage <= 1
samples_per_frame = int((sampling_rate * mel_window_step / 1000))
n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
frame_step = int(np.round(partial_utterance_n_frames * (1 - overlap)))
frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
# Compute the splits
wave_splits, mel_splits = [], []
for i in range(0, n_frames - partial_utterance_n_frames + frame_step + 1, frame_step):
steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
for i in range(0, steps, frame_step):
mel_range = np.array([i, i + partial_utterance_n_frames])
wave_range = mel_range * samples_per_frame
mel_splits.append(slice(*mel_range))
......@@ -69,7 +85,7 @@ def compute_partial_splits(n_samples, partial_utterance_n_frames=partial_utteran
# Evaluate whether extra padding is warranted or not
last_wave_range = wave_splits[-1]
coverage = (n_samples - last_wave_range.start) / (last_wave_range.stop - last_wave_range.start)
if coverage < min_pad_coverage:
if coverage < min_pad_coverage and len(mel_splits) > 1:
mel_splits = mel_splits[:-1]
wave_splits = wave_splits[:-1]
......@@ -122,7 +138,7 @@ def embed_utterance(wave, using_partials=True, return_partial_embeds=False,
out.append(partial_embeds)
if return_wave_splits:
out.append(wave_splits)
return tuple(out)
return out[0] if len(out) == 1 else tuple(out)
def embed_stream(stream, partial_utterance_n_frames=partial_utterance_n_frames, overlap=0.5):
    """
    Unimplemented stub.

    NOTE(review): presumably intended to embed utterances from a live audio stream in
    partial utterances of <partial_utterance_n_frames> frames with the given <overlap>,
    mirroring embed_utterance() above — confirm the intended contract before implementing.

    :param stream: the audio stream to embed (type/format not established by this code)
    :param partial_utterance_n_frames: number of mel spectrogram frames per partial
    utterance (defaults to the module-level value imported from params_data)
    :param overlap: by how much consecutive partial utterances should overlap
    :return: nothing yet — the function body is a placeholder
    """
    pass
......
......@@ -5,9 +5,9 @@ from PyQt4 import QtGui
import numpy as np
import librosa
import sys
from .. import audio
from ..params_data import sampling_rate, mel_window_step
from ..preprocess import preprocess_wave
from .. import audio
class SpeakerMatrixUI(QtGui.QDialog):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册