PaddlePaddle / DeepSpeech

Commit 97ec0126
Authored Mar 04, 2022 by xiongxinlei
Parent: 1f74af11

add speaker verification using cosine score, test=doc

Showing 2 changed files with 267 additions and 5 deletions (+267 -5):

    examples/voxceleb/sv0/local/speaker_verification_cosine.py   +238  -0
    examples/voxceleb/sv0/run.sh                                   +29  -5
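The idea behind the change: each enrolment and test utterance is mapped to a fixed-size speaker embedding (192-dim here), paired trials are scored with cosine similarity, and the equal error rate (EER) is read off the score distribution. The snippet below is a minimal, self-contained sketch of that scoring step on synthetic embeddings; it is not the PaddleSpeech implementation. The cosine_score helper and the synthetic data are made up for illustration, and scikit-learn's roc_curve stands in for paddlespeech's compute_eer (assumes scikit-learn is installed).

import numpy as np
from sklearn.metrics import roc_curve  # assumption: scikit-learn is available

rng = np.random.default_rng(0)
emb_size = 192  # matches "lin_neurons" in the model config of the script below

# Synthetic trials: label 1 = same speaker (target), 0 = different speakers (non-target).
labels = np.array([1] * 500 + [0] * 500)
base = rng.normal(size=(1000, emb_size))
noise = rng.normal(size=(1000, emb_size))
enrol = base
test = np.where(labels[:, None] == 1, base + 0.5 * noise, noise)


def cosine_score(a, b):
    # Row-wise cosine similarity between paired embeddings: (N, D), (N, D) -> (N,)
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return np.sum(a * b, axis=1)


scores = cosine_score(enrol, test)

# EER: the operating point where the false accept rate equals the false reject rate.
fpr, tpr, thresholds = roc_curve(labels, scores)
fnr = 1 - tpr
idx = np.nanargmin(np.abs(fnr - fpr))
eer = (fpr[idx] + fnr[idx]) / 2
print(f'EER: {eer * 100:.2f}%, score threshold: {thresholds[idx]:.3f}')

The committed script follows the same recipe, except that the embeddings come from the ECAPA-TDNN backbone and the trial list comes from the VoxCeleb1 verification file.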
examples/voxceleb/sv0/local/speaker_verification_cosine.py  (new file, mode 0 → 100644)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os

import numpy as np
import paddle
from paddle.io import BatchSampler
from paddle.io import DataLoader
import paddle.nn.functional as F

from paddlespeech.vector.training.metrics import compute_eer
from paddleaudio.datasets.voxceleb import VoxCeleb1
from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
from paddlespeech.vector.training.sid_model import SpeakerIdetification
from tqdm import tqdm
def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
    x = np.asarray(x)
    assert len(x.shape) == 2, \
        f'Only 2D arrays supported, but got shape: {x.shape}'

    w = target_length - x.shape[axis]
    assert w >= 0, \
        f'Target length {target_length} is less than origin length {x.shape[axis]}'

    if axis == 0:
        pad_width = [[0, w], [0, 0]]
    else:
        pad_width = [[0, 0], [0, w]]

    return np.pad(x, pad_width, mode=mode, **kwargs)
def feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
    ids = [item['id'] for item in batch]
    lengths = np.asarray([item['feat'].shape[1] for item in batch])
    feats = list(
        map(lambda x: pad_right_2d(x, lengths.max()),
            [item['feat'] for item in batch]))
    feats = np.stack(feats)

    # Features normalization if needed
    for i in range(len(feats)):
        feat = feats[i][:, :lengths[i]]  # Excluding pad values.
        mean = feat.mean(axis=-1, keepdims=True) if mean_norm else 0
        std = feat.std(axis=-1, keepdims=True) if std_norm else 1
        feats[i][:, :lengths[i]] = (feat - mean) / std
        assert feats[i][:, lengths[i]:].sum() == 0  # Padding values should all be 0.

    # Convert lengths into ratios of the max length.
    lengths = (lengths / lengths.max()).astype(np.float32)

    return {'ids': ids, 'feats': feats, 'lengths': lengths}
def main(args):
    # stage0: set the device, cpu or gpu
    paddle.set_device(args.device)

    # stage1: build the dnn backbone model network
    ##"channels": [1024, 1024, 1024, 1024, 3072],
    model_conf = {
        "input_size": 80,
        "channels": [512, 512, 512, 512, 1536],
        "kernel_sizes": [5, 3, 3, 3, 1],
        "dilations": [1, 2, 3, 4, 1],
        "attention_channels": 128,
        "lin_neurons": 192,
    }
    ecapa_tdnn = EcapaTdnn(**model_conf)

    # stage2: build the speaker verification eval instance with backbone model
    model = SpeakerIdetification(
        backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers)

    # stage3: load the pre-trained model
    args.load_checkpoint = os.path.abspath(
        os.path.expanduser(args.load_checkpoint))

    # load model checkpoint to sid model
    state_dict = paddle.load(
        os.path.join(args.load_checkpoint, 'model.pdparams'))
    model.set_state_dict(state_dict)
    print(f'Checkpoint loaded from {args.load_checkpoint}')

    # stage4: construct the enroll and test dataloader
    enrol_ds = VoxCeleb1(
        subset='enrol',
        feat_type='melspectrogram',
        random_chunk=False,
        n_mels=80,
        window_size=400,
        hop_length=160)
    enrol_sampler = BatchSampler(
        enrol_ds, batch_size=args.batch_size,
        shuffle=True)  # Shuffle to make embedding normalization more robust.
    enrol_loader = DataLoader(
        enrol_ds,
        batch_sampler=enrol_sampler,
        collate_fn=lambda x: feature_normalize(x, mean_norm=True, std_norm=False),
        num_workers=args.num_workers,
        return_list=True)

    test_ds = VoxCeleb1(
        subset='test',
        feat_type='melspectrogram',
        random_chunk=False,
        n_mels=80,
        window_size=400,
        hop_length=160)
    test_sampler = BatchSampler(
        test_ds, batch_size=args.batch_size, shuffle=True)
    test_loader = DataLoader(
        test_ds,
        batch_sampler=test_sampler,
        collate_fn=lambda x: feature_normalize(x, mean_norm=True, std_norm=False),
        num_workers=args.num_workers,
        return_list=True)

    # stage6: we must set the model to eval mode
    model.eval()

    # stage7: global embedding norm to improve the performance
    if args.global_embedding_norm:
        embedding_mean = None
        embedding_std = None
        mean_norm = args.embedding_mean_norm
        std_norm = args.embedding_std_norm
        batch_count = 0

    # stage8: compute embeddings of the enrol and test audios with the model
    id2embedding = {}
    # Run multiple times to make embedding normalization more stable.
    for i in range(2):
        for dl in [enrol_loader, test_loader]:
            print(f'Loop {i + 1}: Computing embeddings on {dl.dataset.subset} dataset')
            with paddle.no_grad():
                for batch_idx, batch in enumerate(tqdm(dl)):
                    # stage 8-1: extract the audio embedding
                    ids, feats, lengths = batch['ids'], batch['feats'], batch['lengths']
                    embeddings = model.backbone(feats, lengths).squeeze(
                        -1).numpy()  # (N, emb_size, 1) -> (N, emb_size)

                    # Global embedding normalization.
                    if args.global_embedding_norm:
                        batch_count += 1
                        mean = embeddings.mean(axis=0) if mean_norm else 0
                        std = embeddings.std(axis=0) if std_norm else 1
                        # Update global mean and std.
                        if embedding_mean is None and embedding_std is None:
                            embedding_mean, embedding_std = mean, std
                        else:
                            weight = 1 / batch_count  # Weight decay by batches.
                            embedding_mean = (1 - weight) * embedding_mean + weight * mean
                            embedding_std = (1 - weight) * embedding_std + weight * std
                        # Apply global embedding normalization.
                        embeddings = (embeddings - embedding_mean) / embedding_std

                    # Update embedding dict.
                    id2embedding.update(dict(zip(ids, embeddings)))

    # stage 9: Compute cosine scores.
    labels = []
    enrol_ids = []
    test_ids = []
    with open(VoxCeleb1.veri_test_file, 'r') as f:
        for line in f.readlines():
            label, enrol_id, test_id = line.strip().split(' ')
            labels.append(int(label))
            enrol_ids.append(enrol_id.split('.')[0].replace('/', '-'))
            test_ids.append(test_id.split('.')[0].replace('/', '-'))

    cos_sim_func = paddle.nn.CosineSimilarity(axis=1)
    enrol_embeddings, test_embeddings = map(
        lambda ids: paddle.to_tensor(
            np.asarray([id2embedding[id] for id in ids], dtype='float32')),
        [enrol_ids, test_ids])  # (N, emb_size)
    scores = cos_sim_func(enrol_embeddings, test_embeddings)
    EER, threshold = compute_eer(np.asarray(labels), scores.numpy())
    print(f'EER of verification test: {EER * 100:.4f}%, score threshold: {threshold:.5f}')
if __name__ == "__main__":
    # yapf: disable
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument('--device',
                        choices=['cpu', 'gpu'],
                        default="gpu",
                        help="Select which device to run the model, defaults to gpu.")
    parser.add_argument("--batch-size",
                        type=int,
                        default=16,
                        help="Number of examples per batch.")
    parser.add_argument("--num-workers",
                        type=int,
                        default=0,
                        help="Number of workers in dataloader.")
    parser.add_argument("--load-checkpoint",
                        type=str,
                        default='',
                        help="Directory to load the model checkpoint from.")
    parser.add_argument("--global-embedding-norm",
                        type=ast.literal_eval,
                        default=True,
                        help="Apply global normalization on speaker embeddings.")
    parser.add_argument("--embedding-mean-norm",
                        type=ast.literal_eval,
                        default=True,
                        help="Apply mean normalization on speaker embeddings.")
    parser.add_argument("--embedding-std-norm",
                        type=ast.literal_eval,
                        default=False,
                        help="Apply std normalization on speaker embeddings.")
    args = parser.parse_args()
    # yapf: enable

    main(args)
\ No newline at end of file
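One detail worth calling out in the script above: because batch_count grows by one per batch and the update weight is 1 / batch_count, the "weight decay by batches" recursion for the global embedding normalization is just a cumulative average of the per-batch statistics. The check below is a small illustration with made-up batch means (the data and the running variable are invented for this sketch, not taken from the script):

import numpy as np

rng = np.random.default_rng(0)
batch_means = rng.normal(size=(10, 192))  # made-up per-batch embedding means

running = None
for batch_count, mean in enumerate(batch_means, start=1):
    if running is None:
        running = mean
    else:
        weight = 1 / batch_count          # same weighting as in the script above
        running = (1 - weight) * running + weight * mean

# The recursion reproduces the plain average of all batch means seen so far.
assert np.allclose(running, batch_means.mean(axis=0))

Since the estimate keeps moving during the first pass over the data, the script loops over the enrol and test loaders twice so that the second pass normalizes every embedding with a settled mean/std, as its "run multiple times" comment notes.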
examples/voxceleb/sv0/run.sh
@@ -2,9 +2,33 @@
 . ./path.sh
 set -e
 
-dir=./data/
-mkdir -p ${dir}
+#######################################################################
+# stage 1: train the speaker identification model
+# stage 2: test the speaker verification with cosine scoring
+# stage 3: extract the training embeddings to train the LDA and PLDA
+######################################################################
 
 # you can set the variable PPAUDIO_HOME to specify where the downloaded vox1 and vox2 datasets are stored
-python3 \
-local/train.py \
---data-dir ${dir}
+# by default the dataset directory is ~/.paddleaudio/
+# export PPAUDIO_HOME=
+
+stage=2
+
+dir=data/                  # data directory
+exp_dir=exp/ecapa-tdnn/    # experiment directory
+mkdir -p ${dir}
+
+if [ $stage -le 1 ]; then
+    # stage 1: train the speaker identification model
+    python3 \
+        -m paddle.distributed.launch --gpus=0,1,2,3 \
+        local/train.py --device "gpu" --checkpoint-dir ${exp_dir} \
+        --save-freq 10 --data-dir ${dir} --batch-size 256 --epochs 60
+fi
+
+if [ $stage -le 2 ]; then
+    # stage 2: score the enrol/test trials with cosine similarity
+    python3 \
+        local/speaker_verification_cosine.py \
+        --load-checkpoint ${exp_dir}/epoch_40/
+fi