diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
index d7f663803ae45b51c6659b34a9b52cc80b0ae950..0f4bf1891dad5f7ebb00a358b6d1f6c8cbed2aad 100644
--- a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
+++ b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
@@ -1,7 +1,10 @@
 ###########################################
 #                  Data                   #
 ###########################################
-batch_size: 32
+# we must explicitly specify the path of the vox2 wav data converted from m4a
+vox2_base_path:
+augment: True
+batch_size: 16
 num_workers: 2
 num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
 shuffle: True
@@ -11,10 +14,10 @@ random_chunk: True
 #              FEATURE EXTRACTION SETTING                 #
 ###########################################################
 # currently, we only support fbank
-feature:
-  n_mels: 80
-  window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
-  hop_length: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
+sample_rate: 16000
+n_mels: 80
+window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
+hop_length: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
 ###########################################################
 #                    MODEL SETTING                        #
 ###########################################################
@@ -35,6 +38,15 @@ model:
 ###########################################
 seed: 1986 # according to the speechbrain configuration
 epochs: 10
-save_interval: 10
-log_interval: 10
+save_interval: 1
+log_interval: 1
 learning_rate: 1e-8
+
+
+###########################################
+#                 Testing                 #
+###########################################
+global_embedding_norm: True
+embedding_mean_norm: True
+embedding_std_norm: False
+
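Since the nested `feature` block is flattened into top-level keys, consumers now read the settings as plain attributes of the yacs config. A minimal sketch of how the recipe scripts load this file, mirroring data_prepare.py below (the yaml path is illustrative):

    from yacs.config import CfgNode

    # build an extensible config and overlay the recipe yaml on top of it
    config = CfgNode(new_allowed=True)
    config.merge_from_file("conf/ecapa_tdnn.yaml")
    config.freeze()

    # the feature settings are now flat keys instead of a nested `feature:` block
    print(config.sample_rate, config.n_mels, config.window_size, config.hop_length)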
diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ec9c4c58c7c6c2e50000ea4f177e33ce06fde543
--- /dev/null
+++ b/examples/voxceleb/sv0/local/data.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+stage=-1
+stop_stage=100
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
+
+dir=$1
+conf_path=$2
+mkdir -p ${dir}
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    # data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
+    # we should use local/convert.sh to convert the m4a files to wav
+    python3 local/data_prepare.py \
+        --data-dir ${dir} \
+        --config ${conf_path}
+fi
\ No newline at end of file
diff --git a/examples/voxceleb/sv0/local/data_prepare.py b/examples/voxceleb/sv0/local/data_prepare.py
index b906b5da4895559a602f94c1ac7f5e683daeea15..19ba41b8e86298ab703245c68db5b6b0829f1446 100644
--- a/examples/voxceleb/sv0/local/data_prepare.py
+++ b/examples/voxceleb/sv0/local/data_prepare.py
@@ -14,10 +14,10 @@
 import argparse
 import os
 
-import numpy as np
 import paddle
+from yacs.config import CfgNode
 
-from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb
+from paddleaudio.datasets.voxceleb import VoxCeleb
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.augment import build_augment_pipeline
 from paddlespeech.vector.training.seeding import seed_everything
@@ -25,46 +25,47 @@ from paddlespeech.vector.training.seeding import seed_everything
 logger = Log(__name__).getlog()
 
 
-def main(args):
+def main(args, config):
     # stage0: set the cpu device, all data prepare process will be done in cpu mode
     paddle.set_device("cpu")
     # set the random seed, it is a must for multiprocess training
-    seed_everything(args.seed)
+    seed_everything(config.seed)
 
     # stage 1: generate the voxceleb csv file
     # Note: this may raise a C++ exception, but the program will execute fine,
     # so we ignore the exception
     # we explicitly pass the vox2 base path to data prepare and generate the audio info
+    logger.info("start to generate the voxceleb dataset info")
     train_dataset = VoxCeleb(
-        'train', target_dir=args.data_dir, vox2_base_path=args.vox2_base_path)
-    dev_dataset = VoxCeleb(
-        'dev', target_dir=args.data_dir, vox2_base_path=args.vox2_base_path)
+        'train', target_dir=args.data_dir, vox2_base_path=config.vox2_base_path)
 
     # stage 2: generate the augment noise csv file
-    if args.augment:
+    if config.augment:
+        logger.info("start to generate the augment dataset info")
         augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
 
 
 if __name__ == "__main__":
     # yapf: disable
     parser = argparse.ArgumentParser(__doc__)
-    parser.add_argument("--seed",
-                        default=0,
-                        type=int,
-                        help="random seed for paddle, numpy and python random package")
     parser.add_argument("--data-dir",
                         default="./data/",
                         type=str,
                         help="data directory")
-    parser.add_argument("--vox2-base-path",
+    parser.add_argument("--config",
                         default=None,
                         type=str,
-                        help="vox2 base path, where is store the wav audio")
-    parser.add_argument("--augment",
-                        action="store_true",
-                        default=False,
-                        help="Apply audio augments.")
+                        help="configuration file")
     args = parser.parse_args()
     # yapf: enable
-    main(args)
+
+    # https://yaml.org/type/float.html
+    config = CfgNode(new_allowed=True)
+    if args.config:
+        config.merge_from_file(args.config)
+
+    config.freeze()
+    print(config)
+
+    main(args, config)
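Data preparation is now driven entirely by the yaml config instead of individual command-line flags. A condensed sketch of the resulting flow under that assumption (the argument values are illustrative stand-ins for config entries):

    import paddle
    from paddleaudio.datasets.voxceleb import VoxCeleb
    from paddlespeech.vector.io.augment import build_augment_pipeline
    from paddlespeech.vector.training.seeding import seed_everything

    paddle.set_device("cpu")  # all csv generation runs in cpu mode
    seed_everything(1986)     # the seed now comes from config.seed

    # generating the train split also triggers the download and csv preparation
    train_dataset = VoxCeleb('train', target_dir="./data/", vox2_base_path=None)

    # when config.augment is True, also build the noise/rir augment csv files
    augment_pipeline = build_augment_pipeline(target_dir="./data/")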
diff --git a/examples/voxceleb/sv0/local/emb.sh b/examples/voxceleb/sv0/local/emb.sh
new file mode 100755
index 0000000000000000000000000000000000000000..482e658e7aecb49dcfa645606a9709366e14b436
--- /dev/null
+++ b/examples/voxceleb/sv0/local/emb.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+. ./path.sh
+
+exp_dir=exp/ecapa-tdnn-vox12-big/epoch_10/ # experiment directory
+conf_path=conf/ecapa_tdnn.yaml
+audio_path="demo/voxceleb/00001.wav"
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+# extract the audio embedding
+python3 ${BIN_DIR}/extract_emb.py --device "gpu" \
+        --config ${conf_path} \
+        --audio-path ${audio_path} --load-checkpoint ${exp_dir}
\ No newline at end of file
diff --git a/examples/voxceleb/sv0/local/test.sh b/examples/voxceleb/sv0/local/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d8a1a0ba09152b53bf980c0f76f17a0c756ade82
--- /dev/null
+++ b/examples/voxceleb/sv0/local/test.sh
@@ -0,0 +1,8 @@
+dir=$1
+exp_dir=$2
+conf_path=$3
+
+python3 ${BIN_DIR}/test.py \
+        --config ${conf_path} \
+        --data-dir ${dir} \
+        --load-checkpoint ${exp_dir}
\ No newline at end of file
diff --git a/examples/voxceleb/sv0/local/train.sh b/examples/voxceleb/sv0/local/train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..385e8caa1d272c5293a52f50ea10392ecbd408d8
--- /dev/null
+++ b/examples/voxceleb/sv0/local/train.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+dir=$1
+exp_dir=$2
+conf_path=$3
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+# train the speaker identification task with voxceleb data
+# Note: we will store the log file in the exp/log directory
+python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \
+        ${BIN_DIR}/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \
+        --data-dir ${dir} --config ${conf_path}
+
+
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+
+exit 0
\ No newline at end of file
diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh
index c5dc3dd295abbcb5e9700916f099586868d47440..e38027e9da427a4b3ab5163530e24b8a0c9158ba 100755
--- a/examples/voxceleb/sv0/run.sh
+++ b/examples/voxceleb/sv0/run.sh
@@ -18,7 +18,7 @@ set -e
 #######################################################################
 # stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv
-# voxceleb2 data is m4a format, so we need user to convert the m4a to wav yourselves as described in Readme.md
+# voxceleb2 data is in m4a format, so you need to convert the m4a files to wav yourself, as described in README.md, with the script local/convert.sh
 # stage 1: train the speaker identification model
 # stage 2: test speaker identification
 # stage 3: extract the training embedding to train the LDA and PLDA
@@ -30,49 +30,39 @@ set -e
 # and put all of them to ${PPAUDIO_HOME}/datasets/vox2
 # we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav
 # export PPAUDIO_HOME=
-
 stage=0
+stop_stage=50
+
 # data directory
 # if we set the variable ${dir}, we will store the wav info to this directory
 # otherwise, we will store the wav info to vox1 and vox2 directory respectively
-dir=data/
-exp_dir=exp/ecapa-tdnn/ # experiment directory
-
 # vox2 wav path, we must convert the m4a format to wav format
-# and store them in the ${PPAUDIO_HOME}/datasets/vox2/wav/ directory
-vox2_base_path=${PPAUDIO_HOME}/datasets/vox2/wav/
-mkdir -p ${dir}
+# dir=data-demo/ # data info directory
+dir=demo/ # data info directory
+
+exp_dir=exp/ecapa-tdnn-vox12-big/ # experiment directory
+conf_path=conf/ecapa_tdnn.yaml
+gpus=0,1,2,3
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
 mkdir -p ${exp_dir}
 
-if [ $stage -le 0 ]; then
+if [ $stage -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
-    python3 local/data_prepare.py \
-        --data-dir ${dir} --augment --vox2-base-path ${vox2_base_path} \
-        --config conf/ecapa_tdnn.yaml
+    # and we should specify the vox2 data path in data.sh
+    bash ./local/data.sh ${dir} ${conf_path} || exit -1;
 fi
 
-if [ $stage -le 1 ]; then
+if [ $stage -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # stage 1: train the speaker identification model
-    python3 \
-    -m paddle.distributed.launch --gpus=0,1,2,3 \
-    ${BIN_DIR}/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \
-    --data-dir ${dir} --config conf/ecapa_tdnn.yaml
+    CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${dir} ${exp_dir} ${conf_path}
 fi
 
 if [ $stage -le 2 ]; then
-    # stage 1: get the speaker verification scores with cosine function
-    python3 \
-    ${BIN_DIR}/speaker_verification_cosine.py\
-    --config conf/ecapa_tdnn.yaml \
-    --data-dir ${dir} --load-checkpoint ${exp_dir}/epoch_10/
-fi
-
-if [ $stage -le 3 ]; then
-    # stage 3: extract the audio embedding
-    python3 \
-    ${BIN_DIR}/extract_speaker_embedding.py\
-    --config conf/ecapa_tdnn.yaml \
-    --audio-path "demo/csv/00001.wav" --load-checkpoint ${exp_dir}/epoch_60/
+    # stage 2: get the speaker verification scores with the cosine function
+    # now we only support cosine scoring
+    CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ${dir} ${exp_dir} ${conf_path}
 fi
 
 # if [ $stage -le 3 ]; then
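Because run.sh now sources ${MAIN_ROOT}/utils/parse_options.sh after defining its variables, the stage bounds can be overridden from the command line; for example, `bash run.sh --stage 1` should skip data preparation and go straight to training, assuming the stage-0 csv files already exist.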
diff --git a/paddleaudio/paddleaudio/datasets/rirs_noises.py b/paddleaudio/paddleaudio/datasets/rirs_noises.py
index df5dec610f61eaa82759b902918d6f11b5c49caf..80bb2d74a07ab9729bd8ff80357ea18227690804 100644
--- a/paddleaudio/paddleaudio/datasets/rirs_noises.py
+++ b/paddleaudio/paddleaudio/datasets/rirs_noises.py
@@ -25,13 +25,10 @@ from tqdm import tqdm
 
 from ..backends import load as load_audio
 from ..backends import save as save_wav
-from .dataset import feat_funcs
 from ..utils import DATA_HOME
 from ..utils import decompress
-from paddlespeech.s2t.utils.log import Log
-from paddlespeech.vector.utils.download import download_and_decompress
-
-logger = Log(__name__).getlog()
+from ..utils.download import download_and_decompress
+from .dataset import feat_funcs
 
 __all__ = ['OpenRIRNoise']
 
@@ -80,17 +77,17 @@ class OpenRIRNoise(Dataset):
 
     def _get_data(self):
         # Download audio files.
-        logger.info(f"rirs noises base path: {self.base_path}")
+        print(f"rirs noises base path: {self.base_path}")
         if not os.path.isdir(self.base_path):
             download_and_decompress(
                 self.archieves, self.base_path, decompress=True)
         else:
-            logger.info(
+            print(
                 f"{self.base_path} already exists, we will not download and decompress again"
             )
 
         # Data preparation.
-        logger.info(f"prepare the csv to {self.csv_path}")
+        print(f"prepare the csv to {self.csv_path}")
         if not os.path.isdir(self.csv_path):
             os.makedirs(self.csv_path)
         self.prepare_data()
@@ -161,7 +158,7 @@ class OpenRIRNoise(Dataset):
                       wav_files: List[str],
                       output_file: str,
                       split_chunks: bool=True):
-        logger.info(f'Generating csv: {output_file}')
+        print(f'Generating csv: {output_file}')
 
         header = ["id", "duration", "wav"]
         infos = list(
diff --git a/paddleaudio/paddleaudio/datasets/voxceleb.py b/paddleaudio/paddleaudio/datasets/voxceleb.py
index f8d634f2491938c8559289292126eef88ec8d347..b9b8c271b53fe8c1fcb973aec3fe788bad1e9cc6 100644
--- a/paddleaudio/paddleaudio/datasets/voxceleb.py
+++ b/paddleaudio/paddleaudio/datasets/voxceleb.py
@@ -28,13 +28,8 @@ from tqdm import tqdm
 
 from ..backends import load as load_audio
 from ..utils import DATA_HOME
 from ..utils import decompress
+from ..utils.download import download_and_decompress
 from .dataset import feat_funcs
-from paddlespeech.s2t.utils.log import Log
-from paddlespeech.vector.utils.download import download_and_decompress
-from utils.utility import download
-from utils.utility import unpack
-
-logger = Log(__name__).getlog()
 
 __all__ = ['VoxCeleb']
 
@@ -138,9 +133,9 @@ class VoxCeleb(Dataset):
         # Download audio files.
         # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir
         # so, we check the vox1/wav dir status
-        logger.info(f"wav base path: {self.wav_path}")
+        print(f"wav base path: {self.wav_path}")
         if not os.path.isdir(self.wav_path):
-            logger.info(f"start to download the voxceleb1 dataset")
+            print(f"start to download the voxceleb1 dataset")
             download_and_decompress(  # multi-zip parts concatenate to vox1_dev_wav.zip
                 self.archieves_audio_dev,
                 self.base_path,
@@ -152,7 +147,7 @@ class VoxCeleb(Dataset):
 
             # Download all parts and concatenate the files into one zip file.
             dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip')
-            logger.info(f'Concatenating all parts to: {dev_zipfile}')
+            print(f'Concatenating all parts to: {dev_zipfile}')
             os.system(
                 f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}'
             )
@@ -162,6 +157,7 @@ class VoxCeleb(Dataset):
 
         # Download meta files.
         if not os.path.isdir(self.meta_path):
+            print("prepare the meta data")
             download_and_decompress(
                 self.archieves_meta, self.meta_path, decompress=False)
 
@@ -171,7 +167,7 @@ class VoxCeleb(Dataset):
             self.prepare_data()
 
         data = []
-        logger.info(
+        print(
             f"read the {self.subset} from {os.path.join(self.csv_path, f'{self.subset}.csv')}"
         )
         with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf:
@@ -266,8 +262,8 @@ class VoxCeleb(Dataset):
                       wav_files: List[str],
                       output_file: str,
                       split_chunks: bool=True):
-        logger.info(f'Generating csv: {output_file}')
-        header = ["id", "duration", "wav", "start", "stop", "spk_id"]
+        print(f'Generating csv: {output_file}')
+        header = ["ID", "duration", "wav", "start", "stop", "spk_id"]
         # Note: this may raise a C++ exception, but the program will execute fine,
         # so we can ignore the exception
         with Pool(cpu_count()) as p:
@@ -290,7 +286,7 @@ class VoxCeleb(Dataset):
 
     def prepare_data(self):
         # Audio of speakers in veri_test_file should not be included in training set.
-        logger.info("start to prepare the data csv file")
+        print("start to prepare the data csv file")
         enroll_files = set()
         test_files = set()
         # get the enroll and test audio file path
@@ -311,13 +307,13 @@ class VoxCeleb(Dataset):
         # get all the train and dev audios file path
         audio_files = []
         speakers = set()
+        print("Getting file list...")
         for path in [self.wav_path, self.vox2_base_path]:
             # if the vox2 directory is not set or does not exist,
             # we will not process this directory
             if not path or not os.path.exists(path):
-                logger.warning(
-                    f"{path} is an invalid path, please check again, "
-                    "and we will ignore the vox2 base path")
+                print(f"{path} is an invalid path, please check again, "
+                      "and we will ignore the vox2 base path")
                 continue
             for file in glob.glob(
                     os.path.join(path, "**", "*.wav"), recursive=True):
@@ -327,7 +323,7 @@ class VoxCeleb(Dataset):
                 speakers.add(spk)
                 audio_files.append(file)
 
-        logger.info(
+        print(
             f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"
         )
         # encode the train and dev speakers label to spk_id2label.txt
diff --git a/paddleaudio/paddleaudio/utils/download.py b/paddleaudio/paddleaudio/utils/download.py
index 4658352f948f496a1420b76916ab5a5d15016adf..07d5eea845ff67dfd444794905d811dbf5fb8522 100644
--- a/paddleaudio/paddleaudio/utils/download.py
+++ b/paddleaudio/paddleaudio/utils/download.py
@@ -37,7 +37,9 @@ def decompress(file: str):
     download._decompress(file)
 
 
-def download_and_decompress(archives: List[Dict[str, str]], path: str):
+def download_and_decompress(archives: List[Dict[str, str]],
+                            path: str,
+                            decompress: bool=True):
     """
     Download archives and decompress them to a specific path.
     """
@@ -47,8 +49,8 @@ def download_and_decompress(archives: List[Dict[str, str]], path: str):
     for archive in archives:
         assert 'url' in archive and 'md5' in archive, \
             'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}'
-
-        download.get_path_from_url(archive['url'], path, archive['md5'])
+        download.get_path_from_url(
+            archive['url'], path, archive['md5'], decompress=decompress)
 
 
 def load_state_dict_from_url(url: str, path: str, md5: str=None):
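With the new flag, callers can fetch an archive and skip unpacking it, which is what the voxceleb meta-file download above relies on. A small usage sketch (the url and md5 values are illustrative placeholders):

    from paddleaudio.utils.download import download_and_decompress

    meta_archives = [{
        'url': 'https://example.com/veri_test2.txt',  # illustrative url
        'md5': '0123456789abcdef0123456789abcdef',    # illustrative md5
    }]
    # decompress=False: download and md5-check only, keep the file as-is
    download_and_decompress(meta_archives, path="./data/meta", decompress=False)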
diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
index 44cbd204f9955079b853f9b2b804feeee23ce10e..0d09d2113b3f0e4d94c6d3199bb6f999561a378e 100644
--- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
@@ -14,12 +14,13 @@
 import argparse
 import os
+import time
 
 import numpy as np
 import paddle
 from yacs.config import CfgNode
 
-from paddleaudio.paddleaudio.backends import load as load_audio
-from paddleaudio.paddleaudio.compliance.librosa import melspectrogram
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
@@ -39,7 +40,7 @@ def extract_audio_embedding(args, config):
     ecapa_tdnn = EcapaTdnn(**config.model)
 
     # stage4: build the speaker verification train instance with backbone model
-    model = SpeakerIdetification(backbone=ecapa_tdnn, num_class=1211)
+    model = SpeakerIdetification(backbone=ecapa_tdnn, num_class=config.num_speakers)
     # stage 2: load the pre-trained model
     args.load_checkpoint = os.path.abspath(
         os.path.expanduser(args.load_checkpoint))
@@ -60,7 +61,12 @@ def extract_audio_embedding(args, config):
     # feat type is a numpy array, whose shape is [dim, time]
     # we need to convert the audio feat to a one-batch shape [batch, dim, time], where the batch is one
     # so the final shape is [1, dim, time]
-    feat = melspectrogram(x=waveform, **config.feature)
+    start_time = time.time()
+    feat = melspectrogram(x=waveform,
+                          sr=config.sample_rate,
+                          n_mels=config.n_mels,
+                          window_size=config.window_size,
+                          hop_length=config.hop_length)
     feat = paddle.to_tensor(feat).unsqueeze(0)
 
     # in inference period, the lengths is all one without padding
@@ -71,9 +77,13 @@ def extract_audio_embedding(args, config):
     # model backbone network forward the feats and get the embedding
     embedding = model.backbone(
         feat, lengths).squeeze().numpy()  # (1, emb_size, 1) -> (emb_size)
+    elapsed_time = time.time() - start_time
+    audio_length = waveform.shape[0] / sr
+
     # stage 5: do global norm with external mean and std
-    # todo
+    rtf = elapsed_time / audio_length
+    logger.info(f"{args.device} rtf={rtf}")
 
     return embedding
 
@@ -92,10 +102,6 @@ if __name__ == "__main__":
                         type=str,
                         default='',
                         help="Directory to load model checkpoint to continue training.")
-    parser.add_argument("--global-embedding-norm",
-                        type=str,
-                        default=None,
-                        help="Apply global normalization on speaker embeddings.")
     parser.add_argument("--audio-path",
                         default="./data/demo.wav",
                         type=str,
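The timing added here reports the real-time factor (RTF): processing time divided by audio duration, where values below 1.0 mean faster than real time. The arithmetic as a standalone sketch (the numbers are illustrative):

    import time

    start_time = time.time()
    # ... feature extraction and the model forward pass would run here ...
    elapsed_time = time.time() - start_time

    audio_length = 80000 / 16000       # e.g. 80000 samples at 16 kHz = 5.0 s
    rtf = elapsed_time / audio_length  # rtf < 1.0 means faster than real time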
diff --git a/paddlespeech/vector/exps/ecapa_tdnn/speaker_verification_cosine.py b/paddlespeech/vector/exps/ecapa_tdnn/test.py
similarity index 82%
rename from paddlespeech/vector/exps/ecapa_tdnn/speaker_verification_cosine.py
rename to paddlespeech/vector/exps/ecapa_tdnn/test.py
index 781bf2a5ee413d433ddb97ab720126f51bdd2587..037570330d37c868441ba7114324dd9b3980c13d 100644
--- a/paddlespeech/vector/exps/ecapa_tdnn/speaker_verification_cosine.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/test.py
@@ -23,8 +23,8 @@ from paddle.io import DataLoader
 from tqdm import tqdm
 from yacs.config import CfgNode
 
-from paddleaudio.paddleaudio.datasets import VoxCeleb
-from paddleaudio.paddleaudio.metric import compute_eer
+from paddleaudio.datasets import VoxCeleb
+from paddleaudio.metric import compute_eer
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.batch import batch_feature_normalize
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
@@ -48,6 +48,9 @@ def main(args, config):
         backbone=ecapa_tdnn, num_class=config.num_speakers)
 
     # stage3: load the pre-trained model
+    # we get the last model from the epochs and save_interval settings
+    last_save_epoch = (config.epochs // config.save_interval) * config.save_interval
+    args.load_checkpoint = os.path.join(args.load_checkpoint, "epoch_" + str(last_save_epoch))
     args.load_checkpoint = os.path.abspath(
         os.path.expanduser(args.load_checkpoint))
 
@@ -63,7 +66,9 @@ def main(args, config):
         target_dir=args.data_dir,
         feat_type='melspectrogram',
         random_chunk=False,
-        **config.feature)
+        n_mels=config.n_mels,
+        window_size=config.window_size,
+        hop_length=config.hop_length)
     enroll_sampler = BatchSampler(
         enroll_dataset, batch_size=config.batch_size,
         shuffle=True)  # Shuffle to make embedding normalization more robust.
@@ -73,13 +78,14 @@ def main(args, config):
             x, mean_norm=True, std_norm=False),
         num_workers=config.num_workers,
         return_list=True,)
-
     test_dataset = VoxCeleb(
         subset='test',
         target_dir=args.data_dir,
         feat_type='melspectrogram',
         random_chunk=False,
-        **config.feature)
+        n_mels=config.n_mels,
+        window_size=config.window_size,
+        hop_length=config.hop_length)
 
     test_sampler = BatchSampler(
         test_dataset, batch_size=config.batch_size, shuffle=True)
@@ -89,19 +95,19 @@ def main(args, config):
             x, mean_norm=True, std_norm=False),
         num_workers=config.num_workers,
         return_list=True,)
-    # stage6: we must set the model to eval mode
+    # stage5: we must set the model to eval mode
     model.eval()
 
-    # stage7: global embedding norm to imporve the performance
-    print("global embedding norm: {}".format(args.global_embedding_norm))
-    if args.global_embedding_norm:
+    # stage6: global embedding norm to improve the performance
+    logger.info(f"global embedding norm: {config.global_embedding_norm}")
+    if config.global_embedding_norm:
         global_embedding_mean = None
         global_embedding_std = None
-        mean_norm_flag = args.embedding_mean_norm
-        std_norm_flag = args.embedding_std_norm
+        mean_norm_flag = config.embedding_mean_norm
+        std_norm_flag = config.embedding_std_norm
         batch_count = 0
 
-    # stage8: Compute embeddings of audios in enrol and test dataset from model.
+    # stage7: Compute embeddings of audios in enroll and test dataset from model.
     id2embedding = {}
     # Run multiple times to make embedding normalization more stable.
     for i in range(2):
@@ -121,7 +127,7 @@ def main(args, config):
                 # Global embedding normalization.
                 # if we use the global embedding norm,
                 # the eer can be reduced by about 10% relative
-                if args.global_embedding_norm:
+                if config.global_embedding_norm:
                     batch_count += 1
                     current_mean = embeddings.mean(
                         axis=0) if mean_norm_flag else 0
@@ -145,21 +151,22 @@ def main(args, config):
             # Update embedding dict.
             id2embedding.update(dict(zip(ids, embeddings)))
 
-    # stage 9: Compute cosine scores.
+    # stage 8: Compute cosine scores.
     labels = []
-    enrol_ids = []
+    enroll_ids = []
     test_ids = []
+    logger.info(f"read the trial from {VoxCeleb.veri_test_file}")
     with open(VoxCeleb.veri_test_file, 'r') as f:
         for line in f.readlines():
-            label, enrol_id, test_id = line.strip().split(' ')
+            label, enroll_id, test_id = line.strip().split(' ')
             labels.append(int(label))
-            enrol_ids.append(enrol_id.split('.')[0].replace('/', '--'))
-            test_ids.append(test_id.split('.')[0].replace('/', '--'))
+            enroll_ids.append(enroll_id.split('.')[0].replace('/', '-'))
+            test_ids.append(test_id.split('.')[0].replace('/', '-'))
 
     cos_sim_func = paddle.nn.CosineSimilarity(axis=1)
     enrol_embeddings, test_embeddings = map(lambda ids: paddle.to_tensor(
-        np.asarray([id2embedding[id] for id in ids], dtype='float32')),
-                                            [enrol_ids, test_ids
+        np.asarray([id2embedding[uttid] for uttid in ids], dtype='float32')),
+                                            [enroll_ids, test_ids
                                             ])  # (N, emb_size)
     scores = cos_sim_func(enrol_embeddings, test_embeddings)
     EER, threshold = compute_eer(np.asarray(labels), scores.numpy())
@@ -187,17 +194,6 @@ if __name__ == "__main__":
                         type=str,
                         default='',
                         help="Directory to load model checkpoint to continue training.")
-    parser.add_argument("--global-embedding-norm",
-                        default=False,
-                        action="store_true",
-                        help="Apply global normalization on speaker embeddings.")
-    parser.add_argument("--embedding-mean-norm",
-                        default=True,
-                        help="Apply mean normalization on speaker embeddings.")
-    parser.add_argument("--embedding-std-norm",
-                        type=bool,
-                        default=False,
-                        help="Apply std normalization on speaker embeddings.")
     args = parser.parse_args()
     # yapf: enable
 
     # https://yaml.org/type/float.html
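For reference, the scoring stage reduces to pairing each enroll embedding with its test embedding and handing the cosine similarities to the EER metric. A minimal sketch with random embeddings (shapes, labels, and values are illustrative):

    import numpy as np
    import paddle
    from paddleaudio.metric import compute_eer

    emb_size, n_trials = 192, 8
    enroll_embeddings = paddle.randn([n_trials, emb_size])
    test_embeddings = paddle.randn([n_trials, emb_size])
    labels = np.array([1, 0, 1, 0, 1, 0, 1, 0])  # 1 = same speaker

    # one cosine score per enroll/test pair, shape (n_trials,)
    cos_sim_func = paddle.nn.CosineSimilarity(axis=1)
    scores = cos_sim_func(enroll_embeddings, test_embeddings)
    EER, threshold = compute_eer(labels, scores.numpy())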
diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py
index cb20ef16702e6cc538ef23e02619d82c7a4ec203..0d62c69d03d0c2108bf02eaf6f41afc5a8735d12 100644
--- a/paddlespeech/vector/exps/ecapa_tdnn/train.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py
@@ -21,8 +21,8 @@ from paddle.io import DataLoader
 from paddle.io import DistributedBatchSampler
 from yacs.config import CfgNode
 
-from paddleaudio.paddleaudio.compliance.librosa import melspectrogram
-from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb
+from paddleaudio.compliance.librosa import melspectrogram
+from paddleaudio.datasets.voxceleb import VoxCeleb
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.augment import build_augment_pipeline
 from paddlespeech.vector.io.augment import waveform_augment
@@ -68,6 +68,8 @@ def main(args, config):
         backbone=ecapa_tdnn, num_class=VoxCeleb.num_speakers)
 
     # stage5: build the optimizer, we now only construct the AdamW optimizer
+    # 140000 is the step count for a single gpu, so in multi-gpu mode
+    # we reduce the step_size to 140000 // nranks to keep the CyclicLRScheduler cycle consistent
     lr_schedule = CyclicLRScheduler(
         base_lr=config.learning_rate, max_lr=1e-3, step_size=140000 // nranks)
     optimizer = paddle.optimizer.AdamW(
@@ -138,6 +140,10 @@ def main(args, config):
             waveforms, labels = batch['waveforms'], batch['labels']
 
             # stage 9-2: audio sample augment method, which is done on the audio sample point
+            # the original waveform and the augmented waveforms are concatenated into one batch,
+            # e.g. with five augment methods in the pipeline,
+            # the final number of samples is batch_size * (5 + 1):
+            # five augmented waveform batches plus the original batch
            if len(augment_pipeline) != 0:
                 waveforms = waveform_augment(waveforms, augment_pipeline)
                 labels = paddle.concat(
@@ -146,7 +152,11 @@ def main(args, config):
             # stage 9-3: extract the audio feats, such as fbank, mfcc, spectrogram
             feats = []
             for waveform in waveforms.numpy():
-                feat = melspectrogram(x=waveform, **config.feature)
+                feat = melspectrogram(x=waveform,
+                                      sr=config.sample_rate,
+                                      n_mels=config.n_mels,
+                                      window_size=config.window_size,
+                                      hop_length=config.hop_length)
                 feats.append(feat)
             feats = paddle.to_tensor(np.asarray(feats))
 
@@ -205,7 +215,7 @@ def main(args, config):
             # stage 9-12: construct the valid dataset dataloader
             dev_sampler = BatchSampler(
                 dev_dataset,
-                batch_size=config.batch_size // 4,
+                batch_size=config.batch_size,
                 shuffle=False,
                 drop_last=False)
             dev_loader = DataLoader(
@@ -228,8 +238,11 @@ def main(args, config):
 
                     feats = []
                     for waveform in waveforms.numpy():
-                        # feat = melspectrogram(x=waveform, **cpu_feat_conf)
-                        feat = melspectrogram(x=waveform, **config.feature)
+                        feat = melspectrogram(x=waveform,
+                                              sr=config.sample_rate,
+                                              n_mels=config.n_mels,
+                                              window_size=config.window_size,
+                                              hop_length=config.hop_length)
                         feats.append(feat)
                     feats = paddle.to_tensor(np.asarray(feats))
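Two details of the training loop are worth spelling out: waveform augmentation grows the batch to batch_size * (n_augment + 1) samples, and feature extraction now takes the flat config keys explicitly. A condensed sketch of the feature step (batch shape and parameter values are illustrative):

    import numpy as np
    import paddle
    from paddleaudio.compliance.librosa import melspectrogram

    batch_size, num_samples = 16, 32000
    waveforms = paddle.randn([batch_size, num_samples])
    # with five augment methods, waveform_augment would return
    # paddle.concat([orig, aug1, ..., aug5]) with shape [16 * 6, 32000]

    feats = []
    for waveform in waveforms.numpy():
        # same call as in train.py, with the flat config keys inlined
        feat = melspectrogram(x=waveform, sr=16000, n_mels=80,
                              window_size=400, hop_length=160)
        feats.append(feat)
    feats = paddle.to_tensor(np.asarray(feats))  # [batch, n_mels, frames]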
diff --git a/paddlespeech/vector/io/augment.py b/paddlespeech/vector/io/augment.py
index f40ce41b70482e9a74a1b6906345607e510e88a0..6e508c3737756fa4290c982ab1397d7decf6973e 100644
--- a/paddlespeech/vector/io/augment.py
+++ b/paddlespeech/vector/io/augment.py
@@ -22,8 +22,8 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
 
-from paddleaudio.paddleaudio import load as load_audio
-from paddleaudio.paddleaudio.datasets.rirs_noises import OpenRIRNoise
+from paddleaudio import load as load_audio
+from paddleaudio.datasets.rirs_noises import OpenRIRNoise
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.signal_processing import compute_amplitude
 from paddlespeech.vector.io.signal_processing import convolve1d
@@ -879,14 +879,18 @@ def waveform_augment(waveforms: paddle.Tensor,
     """process the augment pipeline and return all the waveforms
 
     Args:
-        waveforms (paddle.Tensor): _description_
-        augment_pipeline (List[paddle.nn.Layer]): _description_
+        waveforms (paddle.Tensor): the original batch of waveforms
+        augment_pipeline (List[paddle.nn.Layer]): the augment pipeline to apply
 
     Returns:
-        paddle.Tensor: _description_
+        paddle.Tensor: all the audio waveforms, including the original and the augmented ones
     """
+    # stage 0: store the original waveforms
     waveforms_aug_list = [waveforms]
+
+    # augment the original batch of waveforms
     for aug in augment_pipeline:
+        # stage 1: augment the data
         waveforms_aug = aug(waveforms)  # (N, L)
         if waveforms_aug.shape[1] >= waveforms.shape[1]:
             # Trunc
@@ -897,6 +901,8 @@ def waveform_augment(waveforms: paddle.Tensor,
             waveforms_aug = F.pad(
                 waveforms_aug.unsqueeze(-1), [0, lengths_to_pad],
                 data_format='NLC').squeeze(-1)
+        # stage 2: append the augmented waveforms to the list
         waveforms_aug_list.append(waveforms_aug)
 
+    # gather all the waveforms into one batch
     return paddle.concat(waveforms_aug_list, axis=0)
diff --git a/paddlespeech/vector/utils/download.py b/paddlespeech/vector/utils/download.py
deleted file mode 100644
index 476bfea7e24f82e5d36e78124c9d565d9a0152f3..0000000000000000000000000000000000000000
--- a/paddlespeech/vector/utils/download.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-from typing import Dict
-from typing import List
-
-from paddle.framework import load as load_state_dict
-from paddle.utils import download
-
-__all__ = [
-    'decompress',
-    'download_and_decompress',
-    'load_state_dict_from_url',
-]
-
-
-def decompress(file: str, path: str=os.PathLike):
-    """
-    Extracts all files from a compressed file to specific path.
-    """
-    assert os.path.isfile(file), "File: {} not exists.".format(file)
-
-    if path is None:
-        print("decompress the data: {}".format(file))
-        download._decompress(file)
-    else:
-        print("decompress the data: {} to {}".format(file, path))
-        if not os.path.isdir(path):
-            os.makedirs(path)
-
-        tmp_file = os.path.join(path, os.path.basename(file))
-        os.rename(file, tmp_file)
-        download._decompress(tmp_file)
-        os.rename(tmp_file, file)
-
-
-def download_and_decompress(archives: List[Dict[str, str]],
-                            path: str,
-                            decompress: bool=True):
-    """
-    Download archieves and decompress to specific path.
-    """
-    if not os.path.isdir(path):
-        os.makedirs(path)
-
-    for archive in archives:
-        assert 'url' in archive and 'md5' in archive, \
-            'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}'
-        download.get_path_from_url(
-            archive['url'], path, archive['md5'], decompress=decompress)
-
-
-def load_state_dict_from_url(url: str, path: str, md5: str=None):
-    """
-    Download and load a state dict from url
-    """
-    if not os.path.isdir(path):
-        os.makedirs(path)
-
-    download.get_path_from_url(url, path, md5)
-    return load_state_dict(os.path.join(path, os.path.basename(url)))