提交 60e9d6e4 编写于 作者:Corentin Jemine

Fixed error cases of encoder inference

上级 4d21f6b0
from vlibs import fileio
from vlibs.structs.random_cycler import RandomCycler
from ..data_objects.utterance import Utterance
from .utterance import Utterance
# Contains the set of utterances of a single speaker
class Speaker:
......
from typing import List
import numpy as np
from ..data_objects.speaker import Speaker
from .speaker import Speaker
from ..params_data import mel_n_channels
class SpeakerBatch:
......
......@@ -4,8 +4,8 @@ from collections import OrderedDict
from vlibs import fileio
import numpy as np
import random
from ..data_objects.speaker_batch import SpeakerBatch
from ..data_objects.speaker import Speaker
from .speaker_batch import SpeakerBatch
from .speaker import Speaker
from ..params_data import partial_utterance_n_frames
from ..config import *
......
......@@ -43,24 +43,40 @@ def embed_frames_batch(frames_batch):
def compute_partial_splits(n_samples, partial_utterance_n_frames=partial_utterance_n_frames,
min_pad_coverage=0.75, overlap=0.5):
"""
Computes
:param n_samples:
:param partial_utterance_n_frames:
:param min_pad_coverage:
:param overlap:
:return:
Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
spectrogram splits are returned, so as to make each partial utterance waveform correspond to
its spectrogram. This function assumes that the mel spectrogram parameters used are those
defined in params_data.py.
The returned ranges may be indexing further than the length of the waveform. It is
recommended that you pad the waveform with zeros up to wave_splits[-1].stop.
:param n_samples: the number of samples in the waveform
:param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
utterance
:param min_pad_coverage: when reaching the last partial utterance, it may or may not have
enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
then the last partial utterance will be considered, as if we padded the audio. Otherwise,
it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
utterance, this parameter is ignored so that the function always returns at least 1 split.
:param overlap: by how much the partial utterance should overlap. If set to 0, the partial
utterances are entirely disjoint.
:return: the waveform splits and mel spectrogram splits as lists of array slices. Index
respectively the waveform and the mel spectrogram with these slices to obtain the partial
utterances.
"""
assert 0 <= overlap < 1
assert 0 < min_pad_coverage <= 1
samples_per_frame = int((sampling_rate * mel_window_step / 1000))
n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
frame_step = int(np.round(partial_utterance_n_frames * (1 - overlap)))
frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
# Compute the splits
wave_splits, mel_splits = [], []
for i in range(0, n_frames - partial_utterance_n_frames + frame_step + 1, frame_step):
steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
for i in range(0, steps, frame_step):
mel_range = np.array([i, i + partial_utterance_n_frames])
wave_range = mel_range * samples_per_frame
mel_splits.append(slice(*mel_range))
......@@ -69,7 +85,7 @@ def compute_partial_splits(n_samples, partial_utterance_n_frames=partial_utteran
# Evaluate whether extra padding is warranted or not
last_wave_range = wave_splits[-1]
coverage = (n_samples - last_wave_range.start) / (last_wave_range.stop - last_wave_range.start)
if coverage < min_pad_coverage:
if coverage < min_pad_coverage and len(mel_splits) > 1:
mel_splits = mel_splits[:-1]
wave_splits = wave_splits[:-1]
......@@ -122,7 +138,7 @@ def embed_utterance(wave, using_partials=True, return_partial_embeds=False,
out.append(partial_embeds)
if return_wave_splits:
out.append(wave_splits)
return tuple(out)
return out[0] if len(out) == 1 else tuple(out)
def embed_stream(stream, partial_utterance_n_frames=partial_utterance_n_frames, overlap=0.5):
    """
    Unimplemented stub.

    NOTE(review): presumably intended to embed utterances from a live audio stream in
    partial utterances of <partial_utterance_n_frames> frames with the given <overlap>,
    mirroring embed_utterance() above — confirm the intended contract before implementing.

    :param stream: the audio stream to embed (type/format not established by this code)
    :param partial_utterance_n_frames: number of mel spectrogram frames per partial
    utterance (defaults to the module-level value imported from params_data)
    :param overlap: by how much consecutive partial utterances should overlap
    :return: nothing yet — the function body is a placeholder
    """
    pass
......
......@@ -5,9 +5,9 @@ from PyQt4 import QtGui
import numpy as np
import librosa
import sys
from .. import audio
from ..params_data import sampling_rate, mel_window_step
from ..preprocess import preprocess_wave
from .. import audio
class SpeakerMatrixUI(QtGui.QDialog):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册