diff --git a/examples/voxceleb/sv0/local/speaker_verification_cosine.py b/examples/voxceleb/sv0/local/speaker_verification_cosine.py
index 5665b5ee6550344276c9ab333349dae2cd56a4fc..1959e85c5124f2f06af64a37d7115224b8b050fb 100644
--- a/examples/voxceleb/sv0/local/speaker_verification_cosine.py
+++ b/examples/voxceleb/sv0/local/speaker_verification_cosine.py
@@ -11,21 +11,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import argparse
 import ast
 import os
 
 import numpy as np
 import paddle
+import paddle.nn.functional as F
 from paddle.io import BatchSampler
 from paddle.io import DataLoader
-import paddle.nn.functional as F
-from paddlespeech.vector.training.metrics import compute_eer
+from tqdm import tqdm
+
 from paddleaudio.datasets.voxceleb import VoxCeleb1
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
-from paddlespeech.vector.training.sid_model import SpeakerIdetification
-from tqdm import tqdm
+from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+from paddlespeech.vector.training.metrics import compute_eer
 
 
 def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
@@ -44,7 +44,7 @@ def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
     return np.pad(x, pad_width, mode=mode, **kwargs)
 
 
-def feature_normalize(batch, mean_norm: bool = True, std_norm: bool = True):
+def feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
     ids = [item['id'] for item in batch]
     lengths = np.asarray([item['feat'].shape[1] for item in batch])
     feats = list(
@@ -58,8 +58,8 @@ def feature_normalize(batch, mean_norm: bool = True, std_norm: bool = True):
         mean = feat.mean(axis=-1, keepdims=True) if mean_norm else 0
         std = feat.std(axis=-1, keepdims=True) if std_norm else 1
         feats[i][:, :lengths[i]] = (feat - mean) / std
-        assert feats[i][:, lengths[i]:].sum(
-        ) == 0  # Padding valus should all be 0.
+        assert feats[i][:, lengths[
+            i]:].sum() == 0  # Padding values should all be 0.
 
     # Converts into ratios.
     lengths = (lengths / lengths.max()).astype(np.float32)
@@ -98,16 +98,16 @@ def main(args):
     print(f'Checkpoint loaded from {args.load_checkpoint}')
 
     # stage4: construct the enroll and test dataloader
-    enrol_ds = VoxCeleb1(subset='enrol',
-                         feat_type='melspectrogram',
-                         random_chunk=False,
-                         n_mels=80,
-                         window_size=400,
-                         hop_length=160)
+    enrol_ds = VoxCeleb1(
+        subset='enrol',
+        feat_type='melspectrogram',
+        random_chunk=False,
+        n_mels=80,
+        window_size=400,
+        hop_length=160)
     enrol_sampler = BatchSampler(
-        enrol_ds,
-        batch_size=args.batch_size,
-        shuffle=True)  # Shuffle to make embedding normalization more robust.
+        enrol_ds, batch_size=args.batch_size,
+        shuffle=True)  # Shuffle to make embedding normalization more robust.
     enrol_loader = DataLoader(enrol_ds,
                               batch_sampler=enrol_sampler,
                               collate_fn=lambda x: feature_normalize(
@@ -115,16 +115,16 @@ def main(args):
         num_workers=args.num_workers,
         return_list=True,)
 
-    test_ds = VoxCeleb1(subset='test',
-                        feat_type='melspectrogram',
-                        random_chunk=False,
-                        n_mels=80,
-                        window_size=400,
-                        hop_length=160)
+    test_ds = VoxCeleb1(
+        subset='test',
+        feat_type='melspectrogram',
+        random_chunk=False,
+        n_mels=80,
+        window_size=400,
+        hop_length=160)
 
-    test_sampler = BatchSampler(test_ds,
-                                batch_size=args.batch_size,
-                                shuffle=True)
+    test_sampler = BatchSampler(
+        test_ds, batch_size=args.batch_size, shuffle=True)
 
     test_loader = DataLoader(test_ds,
                              batch_sampler=test_sampler,
                              collate_fn=lambda x: feature_normalize(
@@ -169,12 +169,13 @@ def main(args):
                     embedding_mean, embedding_std = mean, std
                 else:
                     weight = 1 / batch_count  # Weight decay by batches.
-                    embedding_mean = (
-                        1 - weight) * embedding_mean + weight * mean
-                    embedding_std = (
-                        1 - weight) * embedding_std + weight * std
+                    embedding_mean = (1 - weight
+                                      ) * embedding_mean + weight * mean
+                    embedding_std = (1 - weight
+                                     ) * embedding_std + weight * std
                 # Apply global embedding normalization.
-                embeddings = (embeddings - embedding_mean) / embedding_std
+                embeddings = (
+                    embeddings - embedding_mean) / embedding_std
 
             # Update embedding dict.
             id2embedding.update(dict(zip(ids, embeddings)))
@@ -201,38 +202,39 @@ def main(args):
         f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}'
     )
 
+
 if __name__ == "__main__":
     # yapf: disable
     parser = argparse.ArgumentParser(__doc__)
-    parser.add_argument('--device', 
-                        choices=['cpu', 'gpu'], 
-                        default="gpu", 
+    parser.add_argument('--device',
+                        choices=['cpu', 'gpu'],
+                        default="gpu",
                         help="Select which device to train model, defaults to gpu.")
-    parser.add_argument("--batch-size", 
-                        type=int, 
-                        default=16, 
+    parser.add_argument("--batch-size",
+                        type=int,
+                        default=16,
                         help="Total number of examples per batch.")
-    parser.add_argument("--num-workers", 
-                        type=int, 
-                        default=0, 
+    parser.add_argument("--num-workers",
+                        type=int,
+                        default=0,
                         help="Number of workers in dataloader.")
-    parser.add_argument("--load-checkpoint", 
-                        type=str, 
-                        default='', 
+    parser.add_argument("--load-checkpoint",
+                        type=str,
+                        default='',
                         help="Directory to load model checkpoint to continue training.")
-    parser.add_argument("--global-embedding-norm", 
-                        type=bool, 
-                        default=True, 
+    parser.add_argument("--global-embedding-norm",
+                        type=ast.literal_eval,  # bool('False') is True; parse the literal instead
+                        default=True,
                         help="Apply global normalization on speaker embeddings.")
-    parser.add_argument("--embedding-mean-norm", 
-                        type=bool, 
-                        default=True, 
+    parser.add_argument("--embedding-mean-norm",
+                        type=ast.literal_eval,
+                        default=True,
                         help="Apply mean normalization on speaker embeddings.")
-    parser.add_argument("--embedding-std-norm", 
-                        type=bool, 
-                        default=False, 
+    parser.add_argument("--embedding-std-norm",
+                        type=ast.literal_eval,
+                        default=False,
                         help="Apply std normalization on speaker embeddings.")
     args = parser.parse_args()
     # yapf: enable
 
-    main(args)
\ No newline at end of file
+    main(args)
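The enrol/test loop above normalizes embeddings against running statistics: the first batch seeds embedding_mean/embedding_std, and every later batch is folded in with weight 1/batch_count. A standalone numpy sketch of that update rule (the batch size and 192-dim embedding size here are illustrative, not values taken from the script):

    import numpy as np

    rng = np.random.default_rng(0)
    embedding_mean, embedding_std = None, None
    for batch_count in range(1, 5):
        # Hypothetical (batch, emb_dim) embeddings standing in for model output.
        embeddings = rng.normal(size=(16, 192))
        mean, std = embeddings.mean(axis=0), embeddings.std(axis=0)
        if embedding_mean is None:
            embedding_mean, embedding_std = mean, std
        else:
            weight = 1 / batch_count  # later batches contribute less
            embedding_mean = (1 - weight) * embedding_mean + weight * mean
            embedding_std = (1 - weight) * embedding_std + weight * std
        # Apply global embedding normalization, as in the script.
        embeddings = (embeddings - embedding_mean) / embedding_std

This is why the enrol sampler shuffles: batches drawn in speaker order would bias the running statistics toward early speakers.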
diff --git a/examples/voxceleb/sv0/local/train.py b/examples/voxceleb/sv0/local/train.py
index f86b0a860237d0d2d8481579c20b70e2da6fb22f..4eabf94c09f36e7995f1a82812ee19803917e88e 100644
--- a/examples/voxceleb/sv0/local/train.py
+++ b/examples/voxceleb/sv0/local/train.py
@@ -22,22 +22,23 @@ from paddle.io import DistributedBatchSampler
 
 from paddleaudio.datasets.voxceleb import VoxCeleb1
 from paddleaudio.features.core import melspectrogram
-from paddlespeech.vector.training.time import Timer
-from paddlespeech.vector.datasets.batch import feature_normalize
-from paddlespeech.vector.datasets.batch import waveform_collate_fn
-from paddlespeech.vector.layers.loss import AdditiveAngularMargin
-from paddlespeech.vector.layers.loss import LogSoftmaxWrapper
-from paddlespeech.vector.layers.lr import CyclicLRScheduler
+from paddlespeech.vector.io.batch import feature_normalize
+from paddlespeech.vector.io.batch import waveform_collate_fn
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
-from paddlespeech.vector.training.sid_model import SpeakerIdetification
+from paddlespeech.vector.modules.loss import AdditiveAngularMargin
+from paddlespeech.vector.modules.loss import LogSoftmaxWrapper
+from paddlespeech.vector.modules.lr import CyclicLRScheduler
+from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+from paddlespeech.vector.utils.time import Timer
 
 # feat configuration
 cpu_feat_conf = {
     'n_mels': 80,
-    'window_size': 400,
-    'hop_length': 160,
+    'window_size': 400,  # window length in samples (25 ms at 16 kHz)
+    'hop_length': 160,  # hop in samples (10 ms at 16 kHz)
 }
 
+
 def main(args):
     # stage0: set the training device, cpu or gpu
     paddle.set_device(args.device)
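Note that window_size=400 and hop_length=160 in cpu_feat_conf are sample counts, not milliseconds: VoxCeleb audio is sampled at 16 kHz, so they correspond to a 25 ms window and a 10 ms hop. A quick arithmetic check:

    sr = 16000  # VoxCeleb sample rate
    window_size, hop_length = 400, 160
    print(window_size / sr * 1000)  # 25.0 (ms)
    print(hop_length / sr * 1000)   # 10.0 (ms)
    # Frames in a 3-second utterance, assuming no padding:
    print(1 + (3 * sr - window_size) // hop_length)  # 298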
diff --git a/paddleaudio/datasets/voxceleb.py b/paddleaudio/datasets/voxceleb.py
index 70cf3e7a258813dd474e5860a7de56d0ae11f9f9..760db72169f0ff16f51d52372dc6d7d618b760c8 100644
--- a/paddleaudio/datasets/voxceleb.py
+++ b/paddleaudio/datasets/voxceleb.py
@@ -76,6 +76,9 @@ class VoxCeleb1(Dataset):
         'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
     base_path = os.path.join(DATA_HOME, 'vox1')
     wav_path = os.path.join(base_path, 'wav')
+    meta_path = os.path.join(base_path, 'meta')
+    veri_test_file = os.path.join(meta_path, 'veri_test2.txt')
+    csv_path = os.path.join(base_path, 'csv')
     subsets = ['train', 'dev', 'enrol', 'test']
 
     def __init__(
diff --git a/paddleaudio/utils/download.py b/paddleaudio/utils/download.py
index a0c02ee1e850e4f3b1a89b0398a56ef05b3722d1..0535249bb1fb86a9adafadf109762289c1351d25 100644
--- a/paddleaudio/utils/download.py
+++ b/paddleaudio/utils/download.py
@@ -22,30 +22,22 @@ from .log import logger
 
 download.logger = logger
 
+__all__ = [
+    'decompress',
+    'download_and_decompress',
+    'load_state_dict_from_url',
+]
 
-def decompress(file: str, path: str=os.PathLike):
+
+def decompress(file: str):
     """
-    Extracts all files from a compressed file to specific path.
+    Extracts all files from a compressed file.
     """
     assert os.path.isfile(file), "File: {} not exists.".format(file)
+    download._decompress(file)
 
-    if path is None:
-        print("decompress the data: {}".format(file))
-        download._decompress(file)
-    else:
-        print("decompress the data: {} to {}".format(file, path))
-        if not os.path.isdir(path):
-            os.makedirs(path)
-
-        tmp_file = os.path.join(path, os.path.basename(file))
-        os.rename(file, tmp_file)
-        download._decompress(tmp_file)
-        os.rename(tmp_file, file)
-
-def download_and_decompress(archives: List[Dict[str, str]],
-                            path: str,
-                            decompress: bool=True):
+
+def download_and_decompress(archives: List[Dict[str, str]], path: str):
     """
     Download archives and decompress to specific path.
     """
@@ -55,8 +47,8 @@ def download_and_decompress(archives: List[Dict[str, str]],
     for archive in archives:
         assert 'url' in archive and 'md5' in archive, \
             'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}'
-        download.get_path_from_url(
-            archive['url'], path, archive['md5'], decompress=decompress)
+
+        download.get_path_from_url(archive['url'], path, archive['md5'])
 
 
 def load_state_dict_from_url(url: str, path: str, md5: str=None):
     """
@@ -67,4 +59,4 @@ def load_state_dict_from_url(url: str, path: str, md5: str=None):
         os.makedirs(path)
 
     download.get_path_from_url(url, path, md5)
-    return load_state_dict(os.path.join(path, os.path.basename(url)))
+    return load_state_dict(os.path.join(path, os.path.basename(url)))
\ No newline at end of file
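A hypothetical call to the simplified paddleaudio download helper above; the URL and md5 are placeholders for illustration, not real release artifacts:

    from paddleaudio.utils.download import download_and_decompress

    # Each archive entry must carry 'url' and 'md5' keys (placeholders here).
    archives = [{
        'url': 'https://example.com/vox1_meta.tar.gz',
        'md5': '0123456789abcdef0123456789abcdef',
    }]
    # Downloads into path, verifies the md5, and decompresses in place.
    download_and_decompress(archives, path='./data/vox1')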
""" @@ -55,8 +47,8 @@ def download_and_decompress(archives: List[Dict[str, str]], for archive in archives: assert 'url' in archive and 'md5' in archive, \ 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' - download.get_path_from_url( - archive['url'], path, archive['md5'], decompress=decompress) + + download.get_path_from_url(archive['url'], path, archive['md5']) def load_state_dict_from_url(url: str, path: str, md5: str=None): @@ -67,4 +59,4 @@ def load_state_dict_from_url(url: str, path: str, md5: str=None): os.makedirs(path) download.get_path_from_url(url, path, md5) - return load_state_dict(os.path.join(path, os.path.basename(url))) + return load_state_dict(os.path.join(path, os.path.basename(url))) \ No newline at end of file diff --git a/paddlespeech/vector/datasets/batch.py b/paddlespeech/vector/io/batch.py similarity index 100% rename from paddlespeech/vector/datasets/batch.py rename to paddlespeech/vector/io/batch.py diff --git a/paddlespeech/vector/layers/loss.py b/paddlespeech/vector/modules/loss.py similarity index 99% rename from paddlespeech/vector/layers/loss.py rename to paddlespeech/vector/modules/loss.py index bf632b13de9edf7225d5615990c421253537df8b..1aa0599a69ced21dd0dddfc5531b0b3266475d1d 100644 --- a/paddlespeech/vector/layers/loss.py +++ b/paddlespeech/vector/modules/loss.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import math import paddle @@ -67,4 +66,4 @@ class LogSoftmaxWrapper(nn.Layer): predictions = F.log_softmax(predictions, axis=1) loss = self.criterion(predictions, targets) / targets.sum() - return loss \ No newline at end of file + return loss diff --git a/paddlespeech/vector/layers/lr.py b/paddlespeech/vector/modules/lr.py similarity index 100% rename from paddlespeech/vector/layers/lr.py rename to paddlespeech/vector/modules/lr.py diff --git a/paddlespeech/vector/training/sid_model.py b/paddlespeech/vector/modules/sid_model.py similarity index 100% rename from paddlespeech/vector/training/sid_model.py rename to paddlespeech/vector/modules/sid_model.py diff --git a/paddlespeech/vector/training/metrics.py b/paddlespeech/vector/training/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..65dc7a3c48407527a3522be6f74af355bc0464f4 --- /dev/null +++ b/paddlespeech/vector/training/metrics.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List + +import numpy as np +from sklearn.metrics import roc_curve + + +def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]: + ''' + Compute EER and return score threshold. 
diff --git a/paddlespeech/vector/utils/download.py b/paddlespeech/vector/utils/download.py
new file mode 100644
index 0000000000000000000000000000000000000000..476bfea7e24f82e5d36e78124c9d565d9a0152f3
--- /dev/null
+++ b/paddlespeech/vector/utils/download.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Dict
+from typing import List
+
+from paddle.framework import load as load_state_dict
+from paddle.utils import download
+
+__all__ = [
+    'decompress',
+    'download_and_decompress',
+    'load_state_dict_from_url',
+]
+
+
+def decompress(file: str, path: str=None):
+    """
+    Extracts all files from a compressed file to specific path.
+    """
+    assert os.path.isfile(file), "File: {} not exists.".format(file)
+
+    if path is None:
+        print("decompress the data: {}".format(file))
+        download._decompress(file)
+    else:
+        print("decompress the data: {} to {}".format(file, path))
+        if not os.path.isdir(path):
+            os.makedirs(path)
+
+        tmp_file = os.path.join(path, os.path.basename(file))
+        os.rename(file, tmp_file)
+        download._decompress(tmp_file)
+        os.rename(tmp_file, file)
+
+
+def download_and_decompress(archives: List[Dict[str, str]],
+                            path: str,
+                            decompress: bool=True):
+    """
+    Download archives and decompress to specific path.
+    """
+    if not os.path.isdir(path):
+        os.makedirs(path)
+
+    for archive in archives:
+        assert 'url' in archive and 'md5' in archive, \
+            f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
+        download.get_path_from_url(
+            archive['url'], path, archive['md5'], decompress=decompress)
+
+
+def load_state_dict_from_url(url: str, path: str, md5: str=None):
+    """
+    Download and load a state dict from url
+    """
+    if not os.path.isdir(path):
+        os.makedirs(path)
+
+    download.get_path_from_url(url, path, md5)
+    return load_state_dict(os.path.join(path, os.path.basename(url)))
diff --git a/paddlespeech/vector/training/time.py b/paddlespeech/vector/utils/time.py
similarity index 100%
rename from paddlespeech/vector/training/time.py
rename to paddlespeech/vector/utils/time.py
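For context, the verification script these changes support scores each trial as the cosine similarity between the enrol and test embeddings, then compares it against the EER threshold. A minimal paddle sketch with random embeddings (the 192-dim size is illustrative, not a value fixed by this patch):

    import paddle
    import paddle.nn.functional as F

    enrol_emb = paddle.randn([1, 192])  # hypothetical speaker embedding
    test_emb = paddle.randn([1, 192])
    # Cosine similarity in [-1, 1]; higher means more likely the same speaker.
    score = F.cosine_similarity(enrol_emb, test_emb)
    print(float(score))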