Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
14efbf5b
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
14efbf5b
编写于
3月 08, 2022
作者:
X
xiongxinlei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
check extract embedding result, test=doc
上级
386ef3f1
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
76 addition
and
31 deletion
+76
-31
examples/voxceleb/sv0/local/extract_speaker_embedding.py
examples/voxceleb/sv0/local/extract_speaker_embedding.py
+21
-7
examples/voxceleb/sv0/local/speaker_verification_cosine.py
examples/voxceleb/sv0/local/speaker_verification_cosine.py
+2
-2
examples/voxceleb/sv0/local/train.py
examples/voxceleb/sv0/local/train.py
+7
-7
examples/voxceleb/sv0/run.sh
examples/voxceleb/sv0/run.sh
+28
-7
paddleaudio/datasets/voxceleb.py
paddleaudio/datasets/voxceleb.py
+5
-4
paddlespeech/vector/io/batch.py
paddlespeech/vector/io/batch.py
+13
-4
未找到文件。
examples/voxceleb/sv0/local/extract_speaker_embedding.py
浏览文件 @
14efbf5b
...
@@ -22,11 +22,11 @@ from paddle.io import BatchSampler
...
@@ -22,11 +22,11 @@ from paddle.io import BatchSampler
from
paddle.io
import
DataLoader
from
paddle.io
import
DataLoader
from
tqdm
import
tqdm
from
tqdm
import
tqdm
from
paddleaudio.backends
import
load
as
load_audio
from
paddleaudio.datasets.voxceleb
import
VoxCeleb1
from
paddleaudio.datasets.voxceleb
import
VoxCeleb1
from
paddleaudio.features.core
import
melspectrogram
from
paddleaudio.features.core
import
melspectrogram
from
paddleaudio.backends
import
load
as
load_audio
from
paddlespeech.vector.io.batch
import
feature_normalize
from
paddlespeech.s2t.utils.log
import
Log
from
paddlespeech.s2t.utils.log
import
Log
from
paddlespeech.vector.io.batch
import
feature_normalize
from
paddlespeech.vector.models.ecapa_tdnn
import
EcapaTdnn
from
paddlespeech.vector.models.ecapa_tdnn
import
EcapaTdnn
from
paddlespeech.vector.modules.sid_model
import
SpeakerIdetification
from
paddlespeech.vector.modules.sid_model
import
SpeakerIdetification
from
paddlespeech.vector.training.metrics
import
compute_eer
from
paddlespeech.vector.training.metrics
import
compute_eer
...
@@ -41,6 +41,7 @@ cpu_feat_conf = {
...
@@ -41,6 +41,7 @@ cpu_feat_conf = {
'hop_length'
:
160
,
#ms
'hop_length'
:
160
,
#ms
}
}
def
extract_audio_embedding
(
args
):
def
extract_audio_embedding
(
args
):
# stage 0: set the training device, cpu or gpu
# stage 0: set the training device, cpu or gpu
paddle
.
set_device
(
args
.
device
)
paddle
.
set_device
(
args
.
device
)
...
@@ -59,6 +60,8 @@ def extract_audio_embedding(args):
...
@@ -59,6 +60,8 @@ def extract_audio_embedding(args):
}
}
ecapa_tdnn
=
EcapaTdnn
(
**
model_conf
)
ecapa_tdnn
=
EcapaTdnn
(
**
model_conf
)
# stage4: build the speaker verification train instance with backbone model
model
=
SpeakerIdetification
(
backbone
=
ecapa_tdnn
,
num_class
=
1211
)
# stage 2: load the pre-trained model
# stage 2: load the pre-trained model
args
.
load_checkpoint
=
os
.
path
.
abspath
(
args
.
load_checkpoint
=
os
.
path
.
abspath
(
os
.
path
.
expanduser
(
args
.
load_checkpoint
))
os
.
path
.
expanduser
(
args
.
load_checkpoint
))
...
@@ -71,18 +74,29 @@ def extract_audio_embedding(args):
...
@@ -71,18 +74,29 @@ def extract_audio_embedding(args):
# stage 3: we must set the model to eval mode
# stage 3: we must set the model to eval mode
model
.
eval
()
model
.
eval
()
# stage 4: read the audio data and extract the embedding
# stage 4: read the audio data and extract the embedding
# wavform is one dimension numpy array
waveform
,
sr
=
load_audio
(
args
.
audio_path
)
waveform
,
sr
=
load_audio
(
args
.
audio_path
)
# feat type is numpy array, whose shape is [dim, time]
# we need convert the audio feat to one-batch shape [batch, dim, time], where the batch is one
# so the final shape is [1, dim, time]
feat
=
melspectrogram
(
x
=
waveform
,
**
cpu_feat_conf
)
feat
=
melspectrogram
(
x
=
waveform
,
**
cpu_feat_conf
)
feat
=
paddle
.
to_tensor
(
feat
).
unsqueeze
(
0
)
feat
=
paddle
.
to_tensor
(
feat
).
unsqueeze
(
0
)
lengths
=
paddle
.
ones
([
1
])
# in paddle inference model, the lengths is all one without padding
feat
=
feature_normalize
(
feat
,
mean_norm
=
True
,
std_norm
=
False
)
# in inference period, the lengths is all one without padding
embedding
=
ecapa_tdnn
(
feat
,
lengths
lengths
=
paddle
.
ones
([
1
])
).
squeeze
().
numpy
()
# (1, emb_size, 1) -> (emb_size)
feat
=
feature_normalize
(
feat
,
mean_norm
=
True
,
std_norm
=
False
,
convert_to_numpy
=
True
)
# model backbone network forward the feats and get the embedding
embedding
=
model
.
backbone
(
feat
,
lengths
).
squeeze
().
numpy
()
# (1, emb_size, 1) -> (emb_size)
# stage 5: do global norm with external mean and std
# stage 5: do global norm with external mean and std
# todo
# todo
# np.save("audio-embedding", embedding)
return
embedding
return
embedding
...
...
examples/voxceleb/sv0/local/speaker_verification_cosine.py
浏览文件 @
14efbf5b
...
@@ -120,7 +120,7 @@ def main(args):
...
@@ -120,7 +120,7 @@ def main(args):
**
cpu_feat_conf
)
**
cpu_feat_conf
)
enrol_sampler
=
BatchSampler
(
enrol_sampler
=
BatchSampler
(
enrol_ds
,
batch_size
=
args
.
batch_size
,
enrol_ds
,
batch_size
=
args
.
batch_size
,
shuffle
=
Fals
e
)
# Shuffle to make embedding normalization more robust.
shuffle
=
Tru
e
)
# Shuffle to make embedding normalization more robust.
enrol_loader
=
DataLoader
(
enrol_ds
,
enrol_loader
=
DataLoader
(
enrol_ds
,
batch_sampler
=
enrol_sampler
,
batch_sampler
=
enrol_sampler
,
collate_fn
=
lambda
x
:
feature_normalize
(
collate_fn
=
lambda
x
:
feature_normalize
(
...
@@ -136,7 +136,7 @@ def main(args):
...
@@ -136,7 +136,7 @@ def main(args):
**
cpu_feat_conf
)
**
cpu_feat_conf
)
test_sampler
=
BatchSampler
(
test_sampler
=
BatchSampler
(
test_ds
,
batch_size
=
args
.
batch_size
,
shuffle
=
Fals
e
)
test_ds
,
batch_size
=
args
.
batch_size
,
shuffle
=
Tru
e
)
test_loader
=
DataLoader
(
test_ds
,
test_loader
=
DataLoader
(
test_ds
,
batch_sampler
=
test_sampler
,
batch_sampler
=
test_sampler
,
collate_fn
=
lambda
x
:
feature_normalize
(
collate_fn
=
lambda
x
:
feature_normalize
(
...
...
examples/voxceleb/sv0/local/train.py
浏览文件 @
14efbf5b
...
@@ -56,10 +56,10 @@ def main(args):
...
@@ -56,10 +56,10 @@ def main(args):
# set the random seed, it is a must for multiprocess training
# set the random seed, it is a must for multiprocess training
seed_everything
(
args
.
seed
)
seed_everything
(
args
.
seed
)
# stage2: data prepare, such vox1 and vox2 data, and augment data and pipline
# stage2: data prepare, such vox1 and vox2 data, and augment
noise
data and pipline
# note: some cmd must do in rank==0, so wo will refactor the data prepare code
# note: some cmd must do in rank==0, so wo will refactor the data prepare code
train_d
s
=
VoxCeleb1
(
'train'
,
target_dir
=
args
.
data_dir
)
train_d
ataset
=
VoxCeleb1
(
'train'
,
target_dir
=
args
.
data_dir
)
dev_d
s
=
VoxCeleb1
(
'dev'
,
target_dir
=
args
.
data_dir
)
dev_d
ataset
=
VoxCeleb1
(
'dev'
,
target_dir
=
args
.
data_dir
)
if
args
.
augment
:
if
args
.
augment
:
augment_pipeline
=
build_augment_pipeline
(
target_dir
=
args
.
data_dir
)
augment_pipeline
=
build_augment_pipeline
(
target_dir
=
args
.
data_dir
)
...
@@ -123,9 +123,9 @@ def main(args):
...
@@ -123,9 +123,9 @@ def main(args):
# stage8: we build the batch sampler for paddle.DataLoader
# stage8: we build the batch sampler for paddle.DataLoader
train_sampler
=
DistributedBatchSampler
(
train_sampler
=
DistributedBatchSampler
(
train_d
s
,
batch_size
=
args
.
batch_size
,
shuffle
=
True
,
drop_last
=
False
)
train_d
ataset
,
batch_size
=
args
.
batch_size
,
shuffle
=
True
,
drop_last
=
False
)
train_loader
=
DataLoader
(
train_loader
=
DataLoader
(
train_d
s
,
train_d
ataset
,
batch_sampler
=
train_sampler
,
batch_sampler
=
train_sampler
,
num_workers
=
args
.
num_workers
,
num_workers
=
args
.
num_workers
,
collate_fn
=
waveform_collate_fn
,
collate_fn
=
waveform_collate_fn
,
...
@@ -216,12 +216,12 @@ def main(args):
...
@@ -216,12 +216,12 @@ def main(args):
# stage 9-12: construct the valid dataset dataloader
# stage 9-12: construct the valid dataset dataloader
dev_sampler
=
BatchSampler
(
dev_sampler
=
BatchSampler
(
dev_d
s
,
dev_d
ataset
,
batch_size
=
args
.
batch_size
//
4
,
batch_size
=
args
.
batch_size
//
4
,
shuffle
=
False
,
shuffle
=
False
,
drop_last
=
False
)
drop_last
=
False
)
dev_loader
=
DataLoader
(
dev_loader
=
DataLoader
(
dev_d
s
,
dev_d
ataset
,
batch_sampler
=
dev_sampler
,
batch_sampler
=
dev_sampler
,
collate_fn
=
waveform_collate_fn
,
collate_fn
=
waveform_collate_fn
,
num_workers
=
args
.
num_workers
,
num_workers
=
args
.
num_workers
,
...
...
examples/voxceleb/sv0/run.sh
浏览文件 @
14efbf5b
...
@@ -3,6 +3,8 @@
...
@@ -3,6 +3,8 @@
set
-e
set
-e
#######################################################################
#######################################################################
# stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv
# voxceleb2 data is m4a format, so we need user to convert the m4a to wav yourselves as described in Readme.md
# stage 1: train the speaker identification model
# stage 1: train the speaker identification model
# stage 2: test speaker identification
# stage 2: test speaker identification
# stage 3: extract the training embeding to train the LDA and PLDA
# stage 3: extract the training embeding to train the LDA and PLDA
...
@@ -12,23 +14,42 @@ set -e
...
@@ -12,23 +14,42 @@ set -e
# default the dataset is the ~/.paddleaudio/
# default the dataset is the ~/.paddleaudio/
# export PPAUDIO_HOME=
# export PPAUDIO_HOME=
stage
=
2
stage
=
0
dir
=
data/
# data directory
dir
=
data
.bak
/
# data directory
exp_dir
=
exp/ecapa-tdnn/
# experiment directory
exp_dir
=
exp/ecapa-tdnn/
# experiment directory
mkdir
-p
${
dir
}
mkdir
-p
${
dir
}
mkdir
-p
${
exp_dir
}
# if [ $stage -le 0 ]; then
# # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
# # todo
# fi
if
[
$stage
-le
1
]
;
then
if
[
$stage
-le
1
]
;
then
# stage 1: train the speaker identification model
# stage 1: train the speaker identification model
python3
\
python3
\
-m
paddle.distributed.launch
--gpus
=
0,1,2,3
\
-m
paddle.distributed.launch
--gpus
=
0,1,2,3
\
local
/train.py
--device
"gpu"
--checkpoint-dir
${
exp_dir
}
\
local
/train.py
--device
"gpu"
--checkpoint-dir
${
exp_dir
}
--augment
\
--save-freq
10
--data-dir
${
dir
}
--batch-size
256
--epochs
6
0
--save-freq
10
--data-dir
${
dir
}
--batch-size
64
--epochs
10
0
fi
fi
if
[
$stage
-le
2
]
;
then
if
[
$stage
-le
2
]
;
then
# stage 1: train the speaker identification model
# stage 1: train the speaker identification model
# you can set the variable PPAUDIO_HOME to specifiy the downloaded the vox1 and vox2 dataset
python3
\
local
/speaker_verification_cosine.py
\
--batch-size
4
--data-dir
${
dir
}
--load-checkpoint
${
exp_dir
}
/epoch_10/
fi
if
[
$stage
-le
3
]
;
then
# stage 1: train the speaker identification model
# you can set the variable PPAUDIO_HOME to specifiy the downloaded the vox1 and vox2 dataset
python3
\
python3
\
local
/
speaker_verification_cosine.py
\
local
/
extract_speaker_embedding.py
\
--
load-checkpoint
${
exp_dir
}
/epoch_4
0/
--
audio-path
"demo/csv/00001.wav"
--load-checkpoint
${
exp_dir
}
/epoch_6
0/
fi
fi
# if [ $stage -le 3 ]; then
# # stage 2: extract the training embeding to train the LDA and PLDA
# # todo: extract the training embedding
# fi
paddleaudio/datasets/voxceleb.py
浏览文件 @
14efbf5b
...
@@ -28,7 +28,7 @@ from paddleaudio.backends import load as load_audio
...
@@ -28,7 +28,7 @@ from paddleaudio.backends import load as load_audio
from
paddleaudio.datasets.dataset
import
feat_funcs
from
paddleaudio.datasets.dataset
import
feat_funcs
from
paddleaudio.utils
import
DATA_HOME
from
paddleaudio.utils
import
DATA_HOME
from
paddleaudio.utils
import
decompress
from
paddleaudio.utils
import
decompress
from
paddle
audio.utils
import
download_and_decompress
from
paddle
speech.vector.utils.download
import
download_and_decompress
from
paddlespeech.s2t.utils.log
import
Log
from
paddlespeech.s2t.utils.log
import
Log
from
utils.utility
import
download
from
utils.utility
import
download
from
utils.utility
import
unpack
from
utils.utility
import
unpack
...
@@ -106,13 +106,14 @@ class VoxCeleb1(Dataset):
...
@@ -106,13 +106,14 @@ class VoxCeleb1(Dataset):
self
.
chunk_duration
=
chunk_duration
self
.
chunk_duration
=
chunk_duration
self
.
split_ratio
=
split_ratio
self
.
split_ratio
=
split_ratio
self
.
target_dir
=
target_dir
if
target_dir
else
self
.
base_path
self
.
target_dir
=
target_dir
if
target_dir
else
self
.
base_path
self
.
csv_path
=
os
.
path
.
join
(
VoxCeleb1
.
csv_path
=
os
.
path
.
join
(
target_dir
,
'csv'
)
if
target_dir
else
os
.
path
.
join
(
self
.
base_path
,
target_dir
,
'csv'
)
if
target_dir
else
os
.
path
.
join
(
self
.
base_path
,
'csv'
)
'csv'
)
self
.
meta_path
=
os
.
path
.
join
(
VoxCeleb1
.
meta_path
=
os
.
path
.
join
(
target_dir
,
'meta'
)
if
target_dir
else
os
.
path
.
join
(
self
.
base_path
,
target_dir
,
'meta'
)
if
target_dir
else
os
.
path
.
join
(
self
.
base_path
,
'meta'
)
'meta'
)
self
.
veri_test_file
=
os
.
path
.
join
(
self
.
meta_path
,
'veri_test2.txt'
)
VoxCeleb1
.
veri_test_file
=
os
.
path
.
join
(
self
.
meta_path
,
'veri_test2.txt'
)
# self._data = self._get_data()[:1000] # KP: Small dataset test.
# self._data = self._get_data()[:1000] # KP: Small dataset test.
self
.
_data
=
self
.
_get_data
()
self
.
_data
=
self
.
_get_data
()
super
(
VoxCeleb1
,
self
).
__init__
()
super
(
VoxCeleb1
,
self
).
__init__
()
...
...
paddlespeech/vector/io/batch.py
浏览文件 @
14efbf5b
...
@@ -24,10 +24,19 @@ def waveform_collate_fn(batch):
...
@@ -24,10 +24,19 @@ def waveform_collate_fn(batch):
def
feature_normalize
(
feats
:
paddle
.
Tensor
,
def
feature_normalize
(
feats
:
paddle
.
Tensor
,
mean_norm
:
bool
=
True
,
mean_norm
:
bool
=
True
,
std_norm
:
bool
=
True
):
std_norm
:
bool
=
True
,
convert_to_numpy
:
bool
=
False
):
# Features normalization if needed
# Features normalization if needed
mean
=
feats
.
mean
(
axis
=-
1
,
keepdim
=
True
)
if
mean_norm
else
0
# numpy.mean is a little with paddle.mean about 1e-6
std
=
feats
.
std
(
axis
=-
1
,
keepdim
=
True
)
if
std_norm
else
1
if
convert_to_numpy
:
feats
=
(
feats
-
mean
)
/
std
feats_np
=
feats
.
numpy
()
mean
=
feats_np
.
mean
(
axis
=-
1
,
keepdims
=
True
)
if
mean_norm
else
0
std
=
feats_np
.
std
(
axis
=-
1
,
keepdims
=
True
)
if
std_norm
else
1
feats_np
=
(
feats_np
-
mean
)
/
std
feats
=
paddle
.
to_tensor
(
feats_np
,
dtype
=
feats
.
dtype
)
else
:
mean
=
feats
.
mean
(
axis
=-
1
,
keepdim
=
True
)
if
mean_norm
else
0
std
=
feats
.
std
(
axis
=-
1
,
keepdim
=
True
)
if
std_norm
else
1
feats
=
(
feats
-
mean
)
/
std
return
feats
return
feats
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录