PaddlePaddle / DeepSpeech
Commit 38e4e9c8
Authored Apr 06, 2022 by xiongxinlei

refactor voxceleb2 data download, test=doc

Parent: ebfe3e6b
Showing 6 changed files with 140 additions and 28 deletions:
dataset/voxceleb/voxceleb1.py                      +1   -1
dataset/voxceleb/voxceleb2.py                      +70  -11
examples/voxceleb/sv0/conf/ecapa_tdnn.yaml         +2   -5
examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml   +53  -0
examples/voxceleb/sv0/local/data.sh                +7   -2
examples/voxceleb/sv0/run.sh                       +7   -9
dataset/voxceleb/voxceleb1.py
@@ -149,7 +149,7 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path,
     # we will download the voxceleb1 data to ${target_dir}/vox1/dev/ or ${target_dir}/vox1/test directory
     if not os.path.exists(os.path.join(target_dir, "wav")):
         # download all dataset part
-        print("start to download the vox1 dev zip package")
+        print(f"start to download the vox1 zip package to {target_dir}")
         for zip_part in data_list.keys():
             download_url = " --no-check-certificate " + base_url + "/" + zip_part
             download(
dataset/voxceleb/voxceleb2.py
@@ -22,10 +22,12 @@ import codecs
 import glob
 import json
 import os
+import subprocess
 from pathlib import Path

 import soundfile

+from utils.utility import check_md5sum
 from utils.utility import download
 from utils.utility import unzip

@@ -35,12 +37,22 @@ DATA_HOME = os.path.expanduser('.')
 BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/"

 # dev data
-DEV_DATA_URL = BASE_URL + '/vox2_aac.zip'
-DEV_MD5SUM = "bbc063c46078a602ca71605645c2a402"
+DEV_LIST = {
+    "vox2_dev_aac_partaa": "da070494c573e5c0564b1d11c3b20577",
+    "vox2_dev_aac_partab": "17fe6dab2b32b48abaf1676429cdd06f",
+    "vox2_dev_aac_partac": "1de58e086c5edf63625af1cb6d831528",
+    "vox2_dev_aac_partad": "5a043eb03e15c5a918ee6a52aad477f9",
+    "vox2_dev_aac_partae": "cea401b624983e2d0b2a87fb5d59aa60",
+    "vox2_dev_aac_partaf": "fc886d9ba90ab88e7880ee98effd6ae9",
+    "vox2_dev_aac_partag": "d160ecc3f6ee3eed54d55349531cb42e",
+    "vox2_dev_aac_partah": "6b84a81b9af72a9d9eecbb3b1f602e65",
+}
+DEV_TARGET_DATA = "vox2_dev_aac_parta* vox2_dev_aac.zip bbc063c46078a602ca71605645c2a402"

 # test data
-TEST_DATA_URL = BASE_URL + '/vox2_test_aac.zip'
-TEST_MD5SUM = "0d2b3ea430a821c33263b5ea37ede312"
+TEST_LIST = {"vox2_test_aac.zip": "0d2b3ea430a821c33263b5ea37ede312"}
+TEST_TARGET_DATA = "vox2_test_aac.zip vox2_test_aac.zip 0d2b3ea430a821c33263b5ea37ede312"

 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
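Each *_TARGET_DATA string packs three space-separated fields: the shell glob that matches the downloaded parts, the name of the reassembled zip, and the expected md5 of that zip. A minimal sketch of how such a string decomposes, mirroring the target_data.split() call used later in this file:

```python
# Illustration only: unpack a TARGET_DATA-style string into
# (part glob, final zip name, expected md5), as download_dataset() does.
DEV_TARGET_DATA = "vox2_dev_aac_parta* vox2_dev_aac.zip bbc063c46078a602ca71605645c2a402"

all_target_part, target_name, target_md5sum = DEV_TARGET_DATA.split()
print(all_target_part)   # vox2_dev_aac_parta*  -> glob for the downloaded parts
print(target_name)       # vox2_dev_aac.zip     -> name of the concatenated archive
print(target_md5sum)     # expected md5 of the concatenated archive
```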
@@ -68,6 +80,14 @@ args = parser.parse_args()


 def create_manifest(data_dir, manifest_path_prefix):
+    """Generate the voxceleb2 dataset manifest file.
+       We will create the ${manifest_path_prefix}.vox2 as the final manifest file
+       The dev and test wav info will be put in one manifest file.
+
+    Args:
+        data_dir (str): voxceleb2 wav directory, which include dev and test subdataset
+        manifest_path_prefix (str): manifest file prefix
+    """
     print("Creating manifest %s ..." % manifest_path_prefix)
     json_lines = []
     data_path = os.path.join(data_dir, "**", "*.wav")
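The body of create_manifest is largely unchanged and not shown in full in this diff; it scans data_path recursively and accumulates per-utterance durations (the total_sec / total_num average printed in the next hunk). A minimal sketch of that scan pattern, assuming only the glob pattern and the soundfile import visible above:

```python
import glob
import os

import soundfile


def scan_durations(data_dir):
    """Sum wav durations under data_dir, hypothetical stand-in for the manifest scan."""
    # Recursively match every wav, as the manifest code does with
    # os.path.join(data_dir, "**", "*.wav").
    data_path = os.path.join(data_dir, "**", "*.wav")
    total_sec, total_num = 0.0, 0
    for wav_file in glob.glob(data_path, recursive=True):
        audio, sample_rate = soundfile.read(wav_file)
        total_sec += len(audio) / sample_rate
        total_num += 1
    return total_sec, total_num
```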
@@ -119,7 +139,19 @@ def create_manifest(data_dir, manifest_path_prefix):
         print(f"{total_sec / total_num} sec/utt", file=f)


-def download_dataset(url, md5sum, target_dir, dataset):
+def download_dataset(base_url, data_list, target_data, target_dir, dataset):
+    """Download the voxceleb2 zip package
+
+    Args:
+        base_url (str): the voxceleb2 dataset download baseline url
+        data_list (dict): the dataset part zip package and the md5 value
+        target_data (str): the final dataset zip info
+        target_dir (str): the dataset stored directory
+        dataset (str): the dataset name, dev or test
+
+    Raises:
+        RuntimeError: the md5sum occurs error
+    """
     if not os.path.exists(target_dir):
         os.makedirs(target_dir)
@@ -129,9 +161,34 @@ def download_dataset(url, md5sum, target_dir, dataset):
     # but the test dataset will unzip to aac
     # so, wo create the ${target_dir}/test and unzip the m4a to test dir
     if not os.path.exists(os.path.join(target_dir, dataset)):
-        filepath = download(url, md5sum, target_dir)
+        print(f"start to download the vox2 zip package to {target_dir}")
+        for zip_part in data_list.keys():
+            download_url = " --no-check-certificate " + base_url + "/" + zip_part
+            download(
+                url=download_url,
+                md5sum=data_list[zip_part],
+                target_dir=target_dir)
+
+        # pack the all part to target zip file
+        all_target_part, target_name, target_md5sum = target_data.split()
+        target_name = os.path.join(target_dir, target_name)
+        if not os.path.exists(target_name):
+            pack_part_cmd = "cat {}/{} > {}".format(target_dir, all_target_part,
+                                                    target_name)
+            subprocess.call(pack_part_cmd, shell=True)
+
+        # check the target zip file md5sum
+        if not check_md5sum(target_name, target_md5sum):
+            raise RuntimeError("{} MD5 checkssum failed".format(target_name))
+        else:
+            print("Check {} md5sum successfully".format(target_name))
+
         if dataset == "test":
-            unzip(filepath, os.path.join(target_dir, "test"))
+            # we need make the test directory
+            unzip(target_name, os.path.join(target_dir, "test"))
         else:
-            unzip(filepath, target_dir)
+            # upzip dev zip pacakge and will create the dev directory
+            unzip(target_name, target_dir)


 def main():
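check_md5sum, download, and unzip are imported from utils.utility, which is outside this diff. As an orientation for the verification step above, here is a hypothetical, self-contained equivalent of an md5 check (not the project's actual helper), using only the standard library:

```python
import hashlib


def md5_matches(path, expected_md5, chunk_size=1 << 20):
    """Hash the file in chunks and compare with the expected hex digest."""
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5.update(chunk)
    return md5.hexdigest() == expected_md5
```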
@@ -142,14 +199,16 @@ def main():
     print("download: {}".format(args.download))

     if args.download:
         download_dataset(
-            url=DEV_DATA_URL,
-            md5sum=DEV_MD5SUM,
+            base_url=BASE_URL,
+            data_list=DEV_LIST,
+            target_data=DEV_TARGET_DATA,
             target_dir=args.target_dir,
             dataset="dev")

         download_dataset(
-            url=TEST_DATA_URL,
-            md5sum=TEST_MD5SUM,
+            base_url=BASE_URL,
+            data_list=TEST_LIST,
+            target_data=TEST_TARGET_DATA,
             target_dir=args.target_dir,
             dataset="test")
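The packing step in download_dataset above shells out to `cat` via subprocess to reassemble the split archive. A pure-Python alternative with the same effect would look roughly like the sketch below (an illustration of the design choice, not code from this commit); the shell version is presumably kept for brevity on multi-gigabyte archives:

```python
import glob
import os
import shutil


def concat_parts(target_dir, part_glob, target_name):
    """Concatenate split zip parts (e.g. vox2_dev_aac_parta*) into one archive."""
    parts = sorted(glob.glob(os.path.join(target_dir, part_glob)))
    with open(target_name, "wb") as out_f:
        for part in parts:
            with open(part, "rb") as in_f:
                shutil.copyfileobj(in_f, out_f)
```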
examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
 ###########################################
 # Data #
 ###########################################
-# we should explicitly specify the wav path of vox2 audio data converted from m4a
-vox2_base_path:
 augment: True
 batch_size: 32
 num_workers: 2

@@ -30,7 +28,6 @@ hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
 # if we want use another model, please choose another configuration yaml file
 model:
   input_size: 80
-  # "channels": [512, 512, 512, 512, 1536],
   channels: [1024, 1024, 1024, 1024, 3072]
   kernel_sizes: [5, 3, 3, 3, 1]
   dilations: [1, 2, 3, 4, 1]

@@ -42,8 +39,8 @@ model:
 ###########################################
 seed: 1986  # according from speechbrain configuration
 epochs: 10
-save_interval: 1
-log_interval: 1
+save_interval: 10
+log_interval: 10
 learning_rate: 1e-8
examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml
new file mode 100644

###########################################
# Data #
###########################################
augment: True
batch_size: 16
num_workers: 2
num_speakers: 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
shuffle: True
skip_prep: False
split_ratio: 0.9
chunk_duration: 3.0 # seconds
random_chunk: True
verification_file: data/vox1/veri_test2.txt

###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
# currently, we only support fbank
sr: 16000 # sample rate
n_mels: 80
window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160

###########################################################
# MODEL SETTING #
###########################################################
# currently, we only support ecapa-tdnn in the ecapa_tdnn.yaml
# if we want use another model, please choose another configuration yaml file
model:
  input_size: 80
  channels: [512, 512, 512, 512, 1536]
  kernel_sizes: [5, 3, 3, 3, 1]
  dilations: [1, 2, 3, 4, 1]
  attention_channels: 128
  lin_neurons: 192

###########################################
# Training #
###########################################
seed: 1986  # according from speechbrain configuration
epochs: 100
save_interval: 10
log_interval: 10
learning_rate: 1e-8

###########################################
# Testing #
###########################################
global_embedding_norm: True
embedding_mean_norm: True
embedding_std_norm: False
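The training entry point that consumes these YAML files is not part of this diff. As a rough illustration of how a flat config with a nested model block like the one above can be read (assuming PyYAML; the project may use its own config loader):

```python
import yaml

# Path from this commit's file list; adjust to wherever the repo is checked out.
with open("examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml") as f:
    config = yaml.safe_load(f)

print(config["batch_size"])         # 16
print(config["model"]["channels"])  # [512, 512, 512, 512, 1536]
```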
examples/voxceleb/sv0/local/data.sh
@@ -38,7 +38,10 @@ mkdir -p ${TARGET_DIR}
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # download data, generate manifests
-    # we will generate the manifest.{dev, test} file in ${dir}/vox1/ directory
+    # we will generate the manifest.{dev,test} file from ${TARGET_DIR}/voxceleb/vox1/{dev,test} directory
+    # and generate the meta info and download the trial file
+    # manifest.dev: 148642
+    # manifest.test: 4847
     echo "Start to download vox1 dataset and generate the manifest files "
     python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \
     --manifest_prefix="${dir}/vox1/manifest" \

@@ -53,6 +56,8 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # download voxceleb2 data
+    # we will download the data and unzip the package
+    # and we will store the m4a file in ${TARGET_DIR}/voxceleb/vox2/{dev,test}
     echo "start to download vox2 dataset"
     python3 ${TARGET_DIR}/voxceleb/voxceleb2.py \
     --download \

@@ -99,7 +104,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # Currently, our training system use csv file for dataset
     echo "convert the json format to csv format to be compatible with training process"
     python3 local/make_vox_csv_dataset_from_json.py\
-        --train "${dir}/vox1/manifest.dev" \
+        --train "${dir}/vox1/manifest.dev" "${dir}/vox2/manifest.vox2" \
         --test "${dir}/vox1/manifest.test" \
         --target_dir "${dir}/vox/" \
         --config ${conf_path}
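local/make_vox_csv_dataset_from_json.py itself is not shown in this commit. Conceptually, the stage-5 step reads JSON-lines manifests (now both vox1 and vox2) and flattens them into one CSV for the training pipeline; a hypothetical sketch of that shape, with illustrative field names rather than the script's actual schema:

```python
import csv
import json


def json_manifests_to_csv(manifest_paths, csv_path):
    """Flatten JSON-lines manifests into a single CSV file (illustrative fields)."""
    with open(csv_path, "w", newline="") as out_f:
        writer = csv.writer(out_f)
        writer.writerow(["utt_id", "duration", "wav", "speaker"])  # hypothetical header
        for manifest in manifest_paths:
            with open(manifest) as in_f:
                for line in in_f:
                    record = json.loads(line)
                    writer.writerow([
                        record.get("utt_id"),
                        record.get("duration"),
                        record.get("wav"),
                        record.get("speaker"),
                    ])
```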
examples/voxceleb/sv0/run.sh
@@ -18,24 +18,22 @@ set -e
 #######################################################################
 # stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv
-#          voxceleb2 data is m4a format, so we need user to convert the m4a to wav yourselves as described in Readme.md with the script local/convert.sh
+#          voxceleb2 data is m4a format, so we need convert the m4a to wav yourselves with the script local/convert.sh
 # stage 1: train the speaker identification model
 # stage 2: test speaker identification
-# stage 3: extract the training embeding to train the LDA and PLDA
+# stage 3: (todo) extract the training embeding to train the LDA and PLDA
 ######################################################################

 # we can set the variable PPAUDIO_HOME to specifiy the root directory of the downloaded vox1 and vox2 dataset
 # default the dataset will be stored in the ~/.paddleaudio/
 # the vox2 dataset is stored in m4a format, we need to convert the audio from m4a to wav yourself
-# and put all of them to ${PPAUDIO_HOME}/datasets/vox2
-# we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav
-# export PPAUDIO_HOME=
+# and put all of them to ${MAIN_ROOT}/datasets/vox2
+# we will find the wav from ${MAIN_ROOT}/datasets/vox1/{dev,test}/wav and ${MAIN_ROOT}/datasets/vox2/wav
 stage=0
 stop_stage=50

 # data directory
 # if we set the variable ${dir}, we will store the wav info to this directory
-# otherwise, we will store the wav info to vox1 and vox2 directory respectively
+# otherwise, we will store the wav info to data/vox1 and data/vox2 directory respectively
 # vox2 wav path, we must convert the m4a format to wav format
 dir=data/ # data info directory
@@ -64,6 +62,6 @@ if [ $stage -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 fi

 # if [ $stage -le 3 ]; then
-#     # stage 2: extract the training embeding to train the LDA and PLDA
+#     # stage 3: extract the training embeding to train the LDA and PLDA
 #     # todo: extract the training embedding
 # fi
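local/convert.sh, which the comments above point to for the m4a-to-wav conversion, is not part of this diff. For orientation, the conversion it has to accomplish is roughly the following (a hedged sketch assuming ffmpeg is installed and 16 kHz mono output to match the configs; the real script may use different options):

```python
import pathlib
import subprocess


def convert_m4a_to_wav(vox2_dir):
    """Convert every VoxCeleb2 m4a file under vox2_dir to a 16 kHz mono wav alongside it."""
    for m4a in pathlib.Path(vox2_dir).rglob("*.m4a"):
        wav = m4a.with_suffix(".wav")
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(m4a), "-ar", "16000", "-ac", "1", str(wav)],
            check=True)
```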