diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py
index 9058620083ab74dfd21f0d33d368d543e41e7744..95827f708c0a2f22133cb31995a332bcc6159b88 100644
--- a/dataset/voxceleb/voxceleb1.py
+++ b/dataset/voxceleb/voxceleb1.py
@@ -149,7 +149,7 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path,
     # we will download the voxceleb1 data to ${target_dir}/vox1/dev/ or ${target_dir}/vox1/test directory
     if not os.path.exists(os.path.join(target_dir, "wav")):
         # download all dataset part
-        print("start to download the vox1 dev zip package")
+        print(f"start to download the vox1 zip package to {target_dir}")
         for zip_part in data_list.keys():
             download_url = " --no-check-certificate " + base_url + "/" + zip_part
             download(
diff --git a/dataset/voxceleb/voxceleb2.py b/dataset/voxceleb/voxceleb2.py
index 22a2e2ffe21c1ba22c1db94af773b7a5b9938f54..fe9e8b9c83463c301e35cb2d4617f37cdecf8b38 100644
--- a/dataset/voxceleb/voxceleb2.py
+++ b/dataset/voxceleb/voxceleb2.py
@@ -22,10 +22,12 @@
 import codecs
 import glob
 import json
 import os
+import subprocess
 from pathlib import Path
 
 import soundfile
 
+from utils.utility import check_md5sum
 from utils.utility import download
 from utils.utility import unzip
 
@@ -35,12 +37,22 @@ DATA_HOME = os.path.expanduser('.')
 BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/"
 
 # dev data
-DEV_DATA_URL = BASE_URL + '/vox2_aac.zip'
-DEV_MD5SUM = "bbc063c46078a602ca71605645c2a402"
+DEV_LIST = {
+    "vox2_dev_aac_partaa": "da070494c573e5c0564b1d11c3b20577",
+    "vox2_dev_aac_partab": "17fe6dab2b32b48abaf1676429cdd06f",
+    "vox2_dev_aac_partac": "1de58e086c5edf63625af1cb6d831528",
+    "vox2_dev_aac_partad": "5a043eb03e15c5a918ee6a52aad477f9",
+    "vox2_dev_aac_partae": "cea401b624983e2d0b2a87fb5d59aa60",
+    "vox2_dev_aac_partaf": "fc886d9ba90ab88e7880ee98effd6ae9",
+    "vox2_dev_aac_partag": "d160ecc3f6ee3eed54d55349531cb42e",
+    "vox2_dev_aac_partah": "6b84a81b9af72a9d9eecbb3b1f602e65",
+}
+
+DEV_TARGET_DATA = "vox2_dev_aac_parta* vox2_dev_aac.zip bbc063c46078a602ca71605645c2a402"
 
 # test data
-TEST_DATA_URL = BASE_URL + '/vox2_test_aac.zip'
-TEST_MD5SUM = "0d2b3ea430a821c33263b5ea37ede312"
+TEST_LIST = {"vox2_test_aac.zip": "0d2b3ea430a821c33263b5ea37ede312"}
+TEST_TARGET_DATA = "vox2_test_aac.zip vox2_test_aac.zip 0d2b3ea430a821c33263b5ea37ede312"
 
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
@@ -68,6 +80,14 @@ args = parser.parse_args()
 
 
 def create_manifest(data_dir, manifest_path_prefix):
+    """Generate the voxceleb2 dataset manifest file.
+    We will create the ${manifest_path_prefix}.vox2 file as the final manifest.
+    The dev and test wav info will be put in one manifest file.
+
+    Args:
+        data_dir (str): voxceleb2 wav directory, which includes the dev and test subdatasets
+        manifest_path_prefix (str): manifest file prefix
+    """
     print("Creating manifest %s ..." % manifest_path_prefix)
     json_lines = []
     data_path = os.path.join(data_dir, "**", "*.wav")
@@ -119,7 +139,19 @@ def create_manifest(data_dir, manifest_path_prefix):
         print(f"{total_sec / total_num} sec/utt", file=f)
 
 
-def download_dataset(url, md5sum, target_dir, dataset):
+def download_dataset(base_url, data_list, target_data, target_dir, dataset):
+    """Download the voxceleb2 zip packages.
+
+    Args:
+        base_url (str): the voxceleb2 dataset download base url
+        data_list (dict): the dataset part zip packages and their md5 values
+        target_data (str): the final dataset zip info
+        target_dir (str): the directory where the dataset is stored
+        dataset (str): the dataset name, dev or test
+
+    Raises:
+        RuntimeError: if the md5sum check fails
+    """
     if not os.path.exists(target_dir):
         os.makedirs(target_dir)
 
@@ -129,9 +161,34 @@ def download_dataset(url, md5sum, target_dir, dataset):
     # but the test dataset will unzip to aac
     # so, wo create the ${target_dir}/test and unzip the m4a to test dir
     if not os.path.exists(os.path.join(target_dir, dataset)):
-        filepath = download(url, md5sum, target_dir)
+        print(f"start to download the vox2 zip package to {target_dir}")
+        for zip_part in data_list.keys():
+            download_url = " --no-check-certificate " + base_url + "/" + zip_part
+            download(
+                url=download_url,
+                md5sum=data_list[zip_part],
+                target_dir=target_dir)
+
+        # pack all the parts into the target zip file
+        all_target_part, target_name, target_md5sum = target_data.split()
+        target_name = os.path.join(target_dir, target_name)
+        if not os.path.exists(target_name):
+            pack_part_cmd = "cat {}/{} > {}".format(target_dir, all_target_part,
+                                                    target_name)
+            subprocess.call(pack_part_cmd, shell=True)
+
+        # check the target zip file md5sum
+        if not check_md5sum(target_name, target_md5sum):
+            raise RuntimeError("{} MD5 checksum failed".format(target_name))
+        else:
+            print("Check {} md5sum successfully".format(target_name))
+
     if dataset == "test":
-        unzip(filepath, os.path.join(target_dir, "test"))
+        # we need to make the test directory
+        unzip(target_name, os.path.join(target_dir, "test"))
+    else:
+        # unzip the dev zip package, which will create the dev directory
+        unzip(target_name, target_dir)
 
 
 def main():
@@ -142,14 +199,16 @@
     print("download: {}".format(args.download))
     if args.download:
         download_dataset(
-            url=DEV_DATA_URL,
-            md5sum=DEV_MD5SUM,
+            base_url=BASE_URL,
+            data_list=DEV_LIST,
+            target_data=DEV_TARGET_DATA,
             target_dir=args.target_dir,
             dataset="dev")
         download_dataset(
-            url=TEST_DATA_URL,
-            md5sum=TEST_MD5SUM,
+            base_url=BASE_URL,
+            data_list=TEST_LIST,
+            target_data=TEST_TARGET_DATA,
             target_dir=args.target_dir,
             dataset="test")
diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
index 6117b3541a08e4d8f3d9160c21664223bd387ca5..4715c5a3c0be80470b21c1c193fc5c875d8ac4cb 100644
--- a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
+++ b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
@@ -1,8 +1,6 @@
 ###########################################
 #                  Data                   #
 ###########################################
-# we should explicitly specify the wav path of vox2 audio data converted from m4a
-vox2_base_path:
 augment: True
 batch_size: 32
 num_workers: 2
@@ -30,7 +28,6 @@ hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
 # if we want use another model, please choose another configuration yaml file
 model:
   input_size: 80
-  # "channels": [512, 512, 512, 512, 1536],
   channels: [1024, 1024, 1024, 1024, 3072]
   kernel_sizes: [5, 3, 3, 3, 1]
   dilations: [1, 2, 3, 4, 1]
@@ -42,8 +39,8 @@
 ###########################################
 seed: 1986 # according from speechbrain configuration
 epochs: 10
-save_interval: 1
-log_interval: 1
+save_interval: 10
+log_interval: 10
 learning_rate: 1e-8
 
 
diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5ad5ea285368ff0180d18eb4ef3ba44d2e17ad98
--- /dev/null
+++ b/examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml
@@ -0,0 +1,53 @@
+###########################################
+#                  Data                   #
+###########################################
+augment: True
+batch_size: 16
+num_workers: 2
+num_speakers: 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
+shuffle: True
+skip_prep: False
+split_ratio: 0.9
+chunk_duration: 3.0 # seconds
+random_chunk: True
+verification_file: data/vox1/veri_test2.txt
+
+###########################################################
+#              FEATURE EXTRACTION SETTING                 #
+###########################################################
+# currently, we only support fbank
+sr: 16000 # sample rate
+n_mels: 80
+window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
+hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
+
+###########################################################
+#                      MODEL SETTING                      #
+###########################################################
+# currently, we only support the ecapa-tdnn model
+# if we want to use another model, please choose another configuration yaml file
+model:
+  input_size: 80
+  channels: [512, 512, 512, 512, 1536]
+  kernel_sizes: [5, 3, 3, 3, 1]
+  dilations: [1, 2, 3, 4, 1]
+  attention_channels: 128
+  lin_neurons: 192
+
+###########################################
+#                 Training                #
+###########################################
+seed: 1986 # following the speechbrain configuration
+epochs: 100
+save_interval: 10
+log_interval: 10
+learning_rate: 1e-8
+
+
+###########################################
+#                 Testing                 #
+###########################################
+global_embedding_norm: True
+embedding_mean_norm: True
+embedding_std_norm: False
+
diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh
index 3f41b9c878e0b815ccc64f68cd8a1eb370edd268..da44d431ba775ce351a1f71548732074e5fbe4b6 100755
--- a/examples/voxceleb/sv0/local/data.sh
+++ b/examples/voxceleb/sv0/local/data.sh
@@ -38,7 +38,10 @@ mkdir -p ${TARGET_DIR}
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # download data, generate manifests
-    # we will generate the manifest.{dev, test} file in ${dir}/vox1/ directory
+    # we will generate the manifest.{dev,test} files from the ${TARGET_DIR}/voxceleb/vox1/{dev,test} directories,
+    # generate the meta info, and download the trial file
+    # manifest.dev: 148642
+    # manifest.test: 4847
     echo "Start to download vox1 dataset and generate the manifest files "
     python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \
         --manifest_prefix="${dir}/vox1/manifest" \
@@ -53,6 +56,8 @@ fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # download voxceleb2 data
+    # we will download the data and unzip the package
+    # and we will store the m4a files in ${TARGET_DIR}/voxceleb/vox2/{dev,test}
    echo "start to download vox2 dataset"
    python3 ${TARGET_DIR}/voxceleb/voxceleb2.py \
        --download \
@@ -99,7 +104,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # Currently, our training system use csv file for dataset
     echo "convert the json format to csv format to be compatible with training process"
     python3 local/make_vox_csv_dataset_from_json.py\
-        --train "${dir}/vox1/manifest.dev" \
+        --train "${dir}/vox1/manifest.dev" "${dir}/vox2/manifest.vox2" \
         --test "${dir}/vox1/manifest.test" \
         --target_dir "${dir}/vox/" \
         --config ${conf_path}
diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh
index bbc9e3dbb66b4dc18a1fc54d0bd0808f01169ea8..e1dccf2ae9e359a3dcaa3e6a2cecde79b4af8a24 100755
--- a/examples/voxceleb/sv0/run.sh
+++ b/examples/voxceleb/sv0/run.sh
@@ -18,24 +18,22 @@ set -e
 
 #######################################################################
 # stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv
-# voxceleb2 data is m4a format, so we need user to convert the m4a to wav yourselves as described in Readme.md with the script local/convert.sh
+# voxceleb2 data is m4a format, so we need to convert the m4a to wav ourselves with the script local/convert.sh
 # stage 1: train the speaker identification model
 # stage 2: test speaker identification
-# stage 3: extract the training embeding to train the LDA and PLDA
+# stage 3: (todo) extract the training embedding to train the LDA and PLDA
 ######################################################################
 
-# we can set the variable PPAUDIO_HOME to specifiy the root directory of the downloaded vox1 and vox2 dataset
-# default the dataset will be stored in the ~/.paddleaudio/
 # the vox2 dataset is stored in m4a format, we need to convert the audio from m4a to wav yourself
-# and put all of them to ${PPAUDIO_HOME}/datasets/vox2
-# we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav
-# export PPAUDIO_HOME=
+# and put all of them to ${MAIN_ROOT}/datasets/vox2
+# we will find the wav from ${MAIN_ROOT}/datasets/vox1/{dev,test}/wav and ${MAIN_ROOT}/datasets/vox2/wav
+
 stage=0
 stop_stage=50
 
 # data directory
 # if we set the variable ${dir}, we will store the wav info to this directory
-# otherwise, we will store the wav info to vox1 and vox2 directory respectively
+# otherwise, we will store the wav info to the data/vox1 and data/vox2 directories respectively
 # vox2 wav path, we must convert the m4a format to wav format
 dir=data/ # data info directory
 
@@ -64,6 +62,6 @@ if [ $stage -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 fi
 
 # if [ $stage -le 3 ]; then
-#     # stage 2: extract the training embeding to train the LDA and PLDA
+#     # stage 3: extract the training embedding to train the LDA and PLDA
 #     # todo: extract the training embedding
 # fi