PaddlePaddle / DeepSpeech
Commit 38e4e9c8
Authored Apr 06, 2022 by xiongxinlei

refactor voxceleb2 data download, test=doc

Parent: ebfe3e6b
Showing 6 changed files with 140 additions and 28 deletions:
dataset/voxceleb/voxceleb1.py                      +1   -1
dataset/voxceleb/voxceleb2.py                      +70  -11
examples/voxceleb/sv0/conf/ecapa_tdnn.yaml         +2   -5
examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml   +53  -0
examples/voxceleb/sv0/local/data.sh                +7   -2
examples/voxceleb/sv0/run.sh                       +7   -9
dataset/voxceleb/voxceleb1.py
@@ -149,7 +149,7 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path,
     # we will download the voxceleb1 data to ${target_dir}/vox1/dev/ or ${target_dir}/vox1/test directory
     if not os.path.exists(os.path.join(target_dir, "wav")):
         # download all dataset part
-        print("start to download the vox1 dev zip package")
+        print(f"start to download the vox1 zip package to {target_dir}")
         for zip_part in data_list.keys():
             download_url = " --no-check-certificate " + base_url + "/" + zip_part
             download(
dataset/voxceleb/voxceleb2.py
@@ -22,10 +22,12 @@ import codecs
 import glob
 import json
 import os
+import subprocess
 from pathlib import Path

 import soundfile

+from utils.utility import check_md5sum
 from utils.utility import download
 from utils.utility import unzip

@@ -35,12 +37,22 @@ DATA_HOME = os.path.expanduser('.')
 BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/"

 # dev data
-DEV_DATA_URL = BASE_URL + '/vox2_aac.zip'
-DEV_MD5SUM = "bbc063c46078a602ca71605645c2a402"
+DEV_LIST = {
+    "vox2_dev_aac_partaa": "da070494c573e5c0564b1d11c3b20577",
+    "vox2_dev_aac_partab": "17fe6dab2b32b48abaf1676429cdd06f",
+    "vox2_dev_aac_partac": "1de58e086c5edf63625af1cb6d831528",
+    "vox2_dev_aac_partad": "5a043eb03e15c5a918ee6a52aad477f9",
+    "vox2_dev_aac_partae": "cea401b624983e2d0b2a87fb5d59aa60",
+    "vox2_dev_aac_partaf": "fc886d9ba90ab88e7880ee98effd6ae9",
+    "vox2_dev_aac_partag": "d160ecc3f6ee3eed54d55349531cb42e",
+    "vox2_dev_aac_partah": "6b84a81b9af72a9d9eecbb3b1f602e65",
+}
+DEV_TARGET_DATA = "vox2_dev_aac_parta* vox2_dev_aac.zip bbc063c46078a602ca71605645c2a402"

 # test data
-TEST_DATA_URL = BASE_URL + '/vox2_test_aac.zip'
-TEST_MD5SUM = "0d2b3ea430a821c33263b5ea37ede312"
+TEST_LIST = {"vox2_test_aac.zip": "0d2b3ea430a821c33263b5ea37ede312"}
+TEST_TARGET_DATA = "vox2_test_aac.zip vox2_test_aac.zip 0d2b3ea430a821c33263b5ea37ede312"

 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
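Each *_TARGET_DATA string packs three space-separated fields: the shell glob that matches the downloaded parts, the name of the reassembled zip, and the expected md5 of that zip. A minimal sketch of how such a string decomposes, mirroring the target_data.split() call used later in this file:

```python
# Illustration only: unpack a TARGET_DATA-style string into
# (part glob, final zip name, expected md5), as download_dataset() does.
DEV_TARGET_DATA = "vox2_dev_aac_parta* vox2_dev_aac.zip bbc063c46078a602ca71605645c2a402"

all_target_part, target_name, target_md5sum = DEV_TARGET_DATA.split()
print(all_target_part)   # vox2_dev_aac_parta*  -> glob for the downloaded parts
print(target_name)       # vox2_dev_aac.zip     -> name of the concatenated archive
print(target_md5sum)     # expected md5 of the concatenated archive
```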
@@ -68,6 +80,14 @@ args = parser.parse_args()


 def create_manifest(data_dir, manifest_path_prefix):
+    """Generate the voxceleb2 dataset manifest file.
+       We will create the ${manifest_path_prefix}.vox2 as the final manifest file
+       The dev and test wav info will be put in one manifest file.
+
+    Args:
+        data_dir (str): voxceleb2 wav directory, which include dev and test subdataset
+        manifest_path_prefix (str): manifest file prefix
+    """
     print("Creating manifest %s ..." % manifest_path_prefix)
     json_lines = []
     data_path = os.path.join(data_dir, "**", "*.wav")
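The body of create_manifest is largely unchanged and not shown in full in this diff; it scans data_path recursively and accumulates per-utterance durations (the total_sec / total_num average printed in the next hunk). A minimal sketch of that scan pattern, assuming only the glob pattern and the soundfile import visible above:

```python
import glob
import os

import soundfile


def scan_durations(data_dir):
    """Sum wav durations under data_dir, hypothetical stand-in for the manifest scan."""
    # Recursively match every wav, as the manifest code does with
    # os.path.join(data_dir, "**", "*.wav").
    data_path = os.path.join(data_dir, "**", "*.wav")
    total_sec, total_num = 0.0, 0
    for wav_file in glob.glob(data_path, recursive=True):
        audio, sample_rate = soundfile.read(wav_file)
        total_sec += len(audio) / sample_rate
        total_num += 1
    return total_sec, total_num
```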
@@ -119,7 +139,19 @@ def create_manifest(data_dir, manifest_path_prefix):
         print(f"{total_sec / total_num} sec/utt", file=f)


-def download_dataset(url, md5sum, target_dir, dataset):
+def download_dataset(base_url, data_list, target_data, target_dir, dataset):
+    """Download the voxceleb2 zip package
+
+    Args:
+        base_url (str): the voxceleb2 dataset download baseline url
+        data_list (dict): the dataset part zip package and the md5 value
+        target_data (str): the final dataset zip info
+        target_dir (str): the dataset stored directory
+        dataset (str): the dataset name, dev or test
+
+    Raises:
+        RuntimeError: the md5sum occurs error
+    """
     if not os.path.exists(target_dir):
         os.makedirs(target_dir)
@@ -129,9 +161,34 @@ def download_dataset(url, md5sum, target_dir, dataset):
     # but the test dataset will unzip to aac
     # so, wo create the ${target_dir}/test and unzip the m4a to test dir
     if not os.path.exists(os.path.join(target_dir, dataset)):
-        filepath = download(url, md5sum, target_dir)
+        print(f"start to download the vox2 zip package to {target_dir}")
+        for zip_part in data_list.keys():
+            download_url = " --no-check-certificate " + base_url + "/" + zip_part
+            download(
+                url=download_url,
+                md5sum=data_list[zip_part],
+                target_dir=target_dir)
+
+        # pack the all part to target zip file
+        all_target_part, target_name, target_md5sum = target_data.split()
+        target_name = os.path.join(target_dir, target_name)
+        if not os.path.exists(target_name):
+            pack_part_cmd = "cat {}/{} > {}".format(target_dir, all_target_part,
+                                                    target_name)
+            subprocess.call(pack_part_cmd, shell=True)
+
+        # check the target zip file md5sum
+        if not check_md5sum(target_name, target_md5sum):
+            raise RuntimeError("{} MD5 checkssum failed".format(target_name))
+        else:
+            print("Check {} md5sum successfully".format(target_name))
+
         if dataset == "test":
-            unzip(filepath, os.path.join(target_dir, "test"))
+            # we need make the test directory
+            unzip(target_name, os.path.join(target_dir, "test"))
         else:
-            unzip(filepath, target_dir)
+            # upzip dev zip pacakge and will create the dev directory
+            unzip(target_name, target_dir)


 def main():
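check_md5sum, download, and unzip are imported from utils.utility, which is outside this diff. As an orientation for the verification step above, here is a hypothetical, self-contained equivalent of an md5 check (not the project's actual helper), using only the standard library:

```python
import hashlib


def md5_matches(path, expected_md5, chunk_size=1 << 20):
    """Hash the file in chunks and compare with the expected hex digest."""
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5.update(chunk)
    return md5.hexdigest() == expected_md5
```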
@@ -142,14 +199,16 @@ def main():
     print("download: {}".format(args.download))

     if args.download:
         download_dataset(
-            url=DEV_DATA_URL,
-            md5sum=DEV_MD5SUM,
+            base_url=BASE_URL,
+            data_list=DEV_LIST,
+            target_data=DEV_TARGET_DATA,
             target_dir=args.target_dir,
             dataset="dev")

         download_dataset(
-            url=TEST_DATA_URL,
-            md5sum=TEST_MD5SUM,
+            base_url=BASE_URL,
+            data_list=TEST_LIST,
+            target_data=TEST_TARGET_DATA,
             target_dir=args.target_dir,
             dataset="test")
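The packing step in download_dataset above shells out to `cat` via subprocess to reassemble the split archive. A pure-Python alternative with the same effect would look roughly like the sketch below (an illustration of the design choice, not code from this commit); the shell version is presumably kept for brevity on multi-gigabyte archives:

```python
import glob
import os
import shutil


def concat_parts(target_dir, part_glob, target_name):
    """Concatenate split zip parts (e.g. vox2_dev_aac_parta*) into one archive."""
    parts = sorted(glob.glob(os.path.join(target_dir, part_glob)))
    with open(target_name, "wb") as out_f:
        for part in parts:
            with open(part, "rb") as in_f:
                shutil.copyfileobj(in_f, out_f)
```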
examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
 ###########################################
 # Data #
 ###########################################
-# we should explicitly specify the wav path of vox2 audio data converted from m4a
-vox2_base_path:
 augment: True
 batch_size: 32
 num_workers: 2

@@ -30,7 +28,6 @@ hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
 # if we want use another model, please choose another configuration yaml file
 model:
   input_size: 80
-  # "channels": [512, 512, 512, 512, 1536],
   channels: [1024, 1024, 1024, 1024, 3072]
   kernel_sizes: [5, 3, 3, 3, 1]
   dilations: [1, 2, 3, 4, 1]

@@ -42,8 +39,8 @@ model:
 ###########################################
 seed: 1986  # according from speechbrain configuration
 epochs: 10
-save_interval: 1
-log_interval: 1
+save_interval: 10
+log_interval: 10
 learning_rate: 1e-8
examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml
new file mode 100644

###########################################
# Data #
###########################################
augment: True
batch_size: 16
num_workers: 2
num_speakers: 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
shuffle: True
skip_prep: False
split_ratio: 0.9
chunk_duration: 3.0 # seconds
random_chunk: True
verification_file: data/vox1/veri_test2.txt

###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
# currently, we only support fbank
sr: 16000 # sample rate
n_mels: 80
window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160

###########################################################
# MODEL SETTING #
###########################################################
# currently, we only support ecapa-tdnn in the ecapa_tdnn.yaml
# if we want use another model, please choose another configuration yaml file
model:
  input_size: 80
  channels: [512, 512, 512, 512, 1536]
  kernel_sizes: [5, 3, 3, 3, 1]
  dilations: [1, 2, 3, 4, 1]
  attention_channels: 128
  lin_neurons: 192

###########################################
# Training #
###########################################
seed: 1986  # according from speechbrain configuration
epochs: 100
save_interval: 10
log_interval: 10
learning_rate: 1e-8

###########################################
# Testing #
###########################################
global_embedding_norm: True
embedding_mean_norm: True
embedding_std_norm: False
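The training entry point that consumes these YAML files is not part of this diff. As a rough illustration of how a flat config with a nested model block like the one above can be read (assuming PyYAML; the project may use its own config loader):

```python
import yaml

# Path from this commit's file list; adjust to wherever the repo is checked out.
with open("examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml") as f:
    config = yaml.safe_load(f)

print(config["batch_size"])         # 16
print(config["model"]["channels"])  # [512, 512, 512, 512, 1536]
```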
examples/voxceleb/sv0/local/data.sh
@@ -38,7 +38,10 @@ mkdir -p ${TARGET_DIR}
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # download data, generate manifests
-    # we will generate the manifest.{dev, test} file in ${dir}/vox1/ directory
+    # we will generate the manifest.{dev,test} file from ${TARGET_DIR}/voxceleb/vox1/{dev,test} directory
+    # and generate the meta info and download the trial file
+    # manifest.dev: 148642
+    # manifest.test: 4847
     echo "Start to download vox1 dataset and generate the manifest files "
     python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \
     --manifest_prefix="${dir}/vox1/manifest" \

@@ -53,6 +56,8 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # download voxceleb2 data
+    # we will download the data and unzip the package
+    # and we will store the m4a file in ${TARGET_DIR}/voxceleb/vox2/{dev,test}
     echo "start to download vox2 dataset"
     python3 ${TARGET_DIR}/voxceleb/voxceleb2.py \
     --download \

@@ -99,7 +104,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # Currently, our training system use csv file for dataset
     echo "convert the json format to csv format to be compatible with training process"
     python3 local/make_vox_csv_dataset_from_json.py\
-        --train "${dir}/vox1/manifest.dev" \
+        --train "${dir}/vox1/manifest.dev" "${dir}/vox2/manifest.vox2" \
         --test "${dir}/vox1/manifest.test" \
         --target_dir "${dir}/vox/" \
         --config ${conf_path}
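local/make_vox_csv_dataset_from_json.py itself is not shown in this commit. Conceptually, the stage-5 step reads JSON-lines manifests (now both vox1 and vox2) and flattens them into one CSV for the training pipeline; a hypothetical sketch of that shape, with illustrative field names rather than the script's actual schema:

```python
import csv
import json


def json_manifests_to_csv(manifest_paths, csv_path):
    """Flatten JSON-lines manifests into a single CSV file (illustrative fields)."""
    with open(csv_path, "w", newline="") as out_f:
        writer = csv.writer(out_f)
        writer.writerow(["utt_id", "duration", "wav", "speaker"])  # hypothetical header
        for manifest in manifest_paths:
            with open(manifest) as in_f:
                for line in in_f:
                    record = json.loads(line)
                    writer.writerow([
                        record.get("utt_id"),
                        record.get("duration"),
                        record.get("wav"),
                        record.get("speaker"),
                    ])
```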
examples/voxceleb/sv0/run.sh
@@ -18,24 +18,22 @@ set -e
 #######################################################################
 # stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv
-#          voxceleb2 data is m4a format, so we need user to convert the m4a to wav yourselves as described in Readme.md with the script local/convert.sh
+#          voxceleb2 data is m4a format, so we need convert the m4a to wav yourselves with the script local/convert.sh
 # stage 1: train the speaker identification model
 # stage 2: test speaker identification
-# stage 3: extract the training embeding to train the LDA and PLDA
+# stage 3: (todo) extract the training embeding to train the LDA and PLDA
 ######################################################################

 # we can set the variable PPAUDIO_HOME to specifiy the root directory of the downloaded vox1 and vox2 dataset
 # default the dataset will be stored in the ~/.paddleaudio/
 # the vox2 dataset is stored in m4a format, we need to convert the audio from m4a to wav yourself
-# and put all of them to ${PPAUDIO_HOME}/datasets/vox2
-# we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav
-# export PPAUDIO_HOME=
+# and put all of them to ${MAIN_ROOT}/datasets/vox2
+# we will find the wav from ${MAIN_ROOT}/datasets/vox1/{dev,test}/wav and ${MAIN_ROOT}/datasets/vox2/wav
 stage=0
 stop_stage=50

 # data directory
 # if we set the variable ${dir}, we will store the wav info to this directory
-# otherwise, we will store the wav info to vox1 and vox2 directory respectively
+# otherwise, we will store the wav info to data/vox1 and data/vox2 directory respectively
 # vox2 wav path, we must convert the m4a format to wav format
 dir=data/ # data info directory
@@ -64,6 +62,6 @@ if [ $stage -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 fi

 # if [ $stage -le 3 ]; then
-#     # stage 2: extract the training embeding to train the LDA and PLDA
+#     # stage 3: extract the training embeding to train the LDA and PLDA
 #     # todo: extract the training embedding
 # fi
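local/convert.sh, which the comments above point to for the m4a-to-wav conversion, is not part of this diff. For orientation, the conversion it has to accomplish is roughly the following (a hedged sketch assuming ffmpeg is installed and 16 kHz mono output to match the configs; the real script may use different options):

```python
import pathlib
import subprocess


def convert_m4a_to_wav(vox2_dir):
    """Convert every VoxCeleb2 m4a file under vox2_dir to a 16 kHz mono wav alongside it."""
    for m4a in pathlib.Path(vox2_dir).rglob("*.m4a"):
        wav = m4a.with_suffix(".wav")
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(m4a), "-ar", "16000", "-ac", "1", str(wav)],
            check=True)
```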