PaddlePaddle / DeepSpeech

Commit acebfad7
Authored Apr 03, 2022 by xiongxinlei
Parent: 57c11dca

    change the vector csv.spk_id to csv.label, test=doc

Showing 6 changed files with 68 additions and 64 deletions (+68 -64)
examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py   +5  -3
examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py          +30 -22
paddlespeech/vector/exps/ecapa_tdnn/test.py                            +11 -13
paddlespeech/vector/exps/ecapa_tdnn/train.py                           +2  -5
paddlespeech/vector/io/dataset.py                                      +16 -17
paddlespeech/vector/utils/vector_utils.py                              +4  -4
examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py

@@ -25,7 +25,7 @@ from yacs.config import CfgNode
 from paddleaudio import load as load_audio
 from paddlespeech.s2t.utils.log import Log
-from paddlespeech.vector.utils.utils import get_chunks
+from paddlespeech.vector.utils.vector_utils import get_chunks

 logger = Log(__name__).getlog()

@@ -57,7 +57,9 @@ def get_chunks_list(wav_file: str,
         end_sample = int(float(e) * sr)
         # currently, all vector csv data format use one representation
-        # id, duration, wav, start, stop, spk_id
+        # id, duration, wav, start, stop, label
+        # in rirs noise, all the label name is 'noise'
+        # the label is string type and we will convert it to integer type in training
         ret.append([
             chunk, audio_duration, wav_file, start_sample, end_sample, "noise"

@@ -81,7 +83,7 @@ def generate_csv(wav_files,
         split_chunks (bool): audio split flag
     """
     logger.info(f'Generating csv: {output_file}')
-    header = ["utt_id", "duration", "wav", "start", "stop", "lab_id"]
+    header = ["utt_id", "duration", "wav", "start", "stop", "label"]
     csv_lines = []
     for item in tqdm.tqdm(wav_files):
         csv_lines.extend(
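After this change every vector CSV shares the same six columns, and the last column holds a string label (always "noise" for this data) rather than a speaker id. A minimal sketch of such a file, with made-up chunk ids, paths and sample points; the real script builds its rows from the RIRS/noise JSON manifests:

import csv

header = ["utt_id", "duration", "wav", "start", "stop", "label"]
rows = [
    # chunk id,             duration, wav path,                     start, stop,  label
    ["noise-room1_0.0_3.0", 9.0, "/data/rirs_noises/room1.wav", 0,     48000, "noise"],
    ["noise-room1_3.0_6.0", 9.0, "/data/rirs_noises/room1.wav", 48000, 96000, "noise"],
]

with open("rir.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(header)   # utt_id,duration,wav,start,stop,label
    writer.writerows(rows)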
examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py

@@ -26,7 +26,7 @@ from yacs.config import CfgNode
 from paddleaudio import load as load_audio
 from paddlespeech.s2t.utils.log import Log
-from paddlespeech.vector.utils.utils import get_chunks
+from paddlespeech.vector.utils.vector_utils import get_chunks

 logger = Log(__name__).getlog()

@@ -38,28 +38,31 @@ def prepare_csv(wav_files, output_file, config, split_chunks=True):
         wav_files (list): all the audio list to prepare the csv file
         output_file (str): the output csv file
         config (CfgNode): yaml configuration content
-        split_chunks (bool): audio split flag
+        split_chunks (bool, optional): audio split flag. Defaults to True.
     """
     if not os.path.exists(os.path.dirname(output_file)):
         os.makedirs(os.path.dirname(output_file))
     csv_lines = []
-    header = ["utt_id", "duration", "wav", "start", "stop", "lab_id"]
+    header = ["utt_id", "duration", "wav", "start", "stop", "label"]
     # voxceleb meta info for each training utterance segment
     # we extract a segment from a utterance to train
     # and the segment' period is between start and stop time point in the original wav file
-    # each field in the meta means as follows:
-    # utt_id: the utterance segment name
-    # duration: utterance segment time
-    # wav: utterance file path
-    # start: start point in the original wav file
-    # stop: stop point in the original wav file
-    # lab_id: the utterance segment's speaker name
+    # each field in the meta info means as follows:
+    # utt_id: the utterance segment name, which is uniq in training dataset
+    # duration: the total utterance time
+    # wav: utterance file path, which should be absoulute path
+    # start: start point in the original wav file sample point range
+    # stop: stop point in the original wav file sample point range
+    # label: the utterance segment's label name,
+    #        which is speaker name in speaker verification domain
     for item in tqdm.tqdm(wav_files, total=len(wav_files)):
         item = json.loads(item.strip())
-        audio_id = item['utt'].replace(".wav", "")
+        audio_id = item['utt'].replace(".wav", "")  # we remove the wav suffix name
         audio_duration = item['feat_shape'][0]
         wav_file = item['feat']
-        spk_id = audio_id.split('-')[0]
+        label = audio_id.split('-')[0]  # speaker name in speaker verification domain
         waveform, sr = load_audio(wav_file)
         if split_chunks:
             uniq_chunks_list = get_chunks(config.chunk_duration, audio_id,

@@ -68,14 +71,15 @@ def prepare_csv(wav_files, output_file, config, split_chunks=True):
                 s, e = chunk.split("_")[-2:]  # Timestamps of start and end
                 start_sample = int(float(s) * sr)
                 end_sample = int(float(e) * sr)
-                # id, duration, wav, start, stop, spk_id
+                # id, duration, wav, start, stop, label
+                # in vector, the label in speaker id
                 csv_lines.append([
                     chunk, audio_duration, wav_file, start_sample, end_sample,
-                    spk_id
+                    label
                 ])
         else:
             csv_lines.append([
-                audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id
+                audio_id, audio_duration, wav_file, 0, waveform.shape[0], label
             ])

     with open(output_file, mode="w") as csv_f:

@@ -113,6 +117,9 @@ def get_enroll_test_list(dataset_list, verification_file):
     for dataset in dataset_list:
         with open(dataset, 'r') as f:
             for line in f:
+                # audio_id may be in enroll and test at the same time
+                # eg: 1 a.wav a.wav
+                # the audio a.wav is enroll and test file at the same time
                 audio_id = json.loads(line.strip())['utt']
                 if audio_id in enroll_audios:
                     enroll_files.append(line)

@@ -145,17 +152,18 @@ def get_train_dev_list(dataset_list, target_dir, split_ratio):
     for dataset in dataset_list:
         with open(dataset, 'r') as f:
             for line in f:
-                spk_id = json.loads(line.strip())['utt2spk']
-                speakers.add(spk_id)
+                # the label is speaker name
+                label_name = json.loads(line.strip())['utt2spk']
+                speakers.add(label_name)
                 audio_files.append(line.strip())

     speakers = sorted(speakers)
     logger.info(f"we get {len(speakers)} speakers from all the train dataset")
-    with open(os.path.join(target_dir, "meta", "spk_id2label.txt"), 'w') as f:
-        for label, spk_id in enumerate(speakers):
-            f.write(f'{spk_id} {label}\n')
+    with open(os.path.join(target_dir, "meta", "label2id.txt"), 'w') as f:
+        for label_id, label_name in enumerate(speakers):
+            f.write(f'{label_name} {label_id}\n')
     logger.info(
-        f'we store the speakers to {os.path.join(target_dir, "meta", "spk_id2label.txt")}'
+        f'we store the speakers to {os.path.join(target_dir, "meta", "label2id.txt")}'
     )

     # the split_ratio is for train dataset

@@ -185,7 +193,7 @@ def prepare_data(args, config):
         return

     # stage 1: prepare the enroll and test csv file
-    # And we generate the speaker to label file spk_id2label.txt
+    # And we generate the speaker to label file label2id.txt
     logger.info("start to prepare the data csv file")
     enroll_files, test_files = get_enroll_test_list(
         [args.test], verification_file=config.verification_file)
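The speaker-to-id map is now written to meta/label2id.txt with one "<label_name> <label_id>" pair per line. A short sketch of exactly that loop, run over a few hypothetical speaker labels, and the file it produces:

speakers = sorted({"id10001", "id10002", "id10003"})   # hypothetical speaker labels

with open("label2id.txt", 'w') as f:
    for label_id, label_name in enumerate(speakers):
        f.write(f'{label_name} {label_id}\n')

# label2id.txt now contains:
# id10001 0
# id10002 1
# id10003 2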
paddlespeech/vector/exps/ecapa_tdnn/test.py

@@ -21,10 +21,10 @@ from paddle.io import DataLoader
 from tqdm import tqdm
 from yacs.config import CfgNode

-from paddleaudio.datasets import VoxCeleb
 from paddleaudio.metric import compute_eer
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.batch import batch_feature_normalize
+from paddlespeech.vector.io.dataset import CSVDataset
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
 from paddlespeech.vector.training.seeding import seed_everything

@@ -58,9 +58,8 @@ def main(args, config):
     # stage4: construct the enroll and test dataloader
-    enroll_dataset = VoxCeleb(
-        subset='enroll',
-        target_dir=args.data_dir,
+    enroll_dataset = CSVDataset(
+        os.path.join(args.data_dir, "vox/csv/enroll.csv"),
         feat_type='melspectrogram',
         random_chunk=False,
         n_mels=config.n_mels,

@@ -69,15 +68,14 @@ def main(args, config):
     enroll_sampler = BatchSampler(
         enroll_dataset,
         batch_size=config.batch_size,
         shuffle=True)  # Shuffle to make embedding normalization more robust.
-    enrol_loader = DataLoader(enroll_dataset,
+    enroll_loader = DataLoader(enroll_dataset,
                               batch_sampler=enroll_sampler,
                               collate_fn=lambda x: batch_feature_normalize(
                                   x, mean_norm=True, std_norm=False),
                               num_workers=config.num_workers,
                               return_list=True,)

-    test_dataset = VoxCeleb(
-        subset='test',
-        target_dir=args.data_dir,
+    test_dataset = CSVDataset(
+        os.path.join(args.data_dir, "vox/csv/test.csv"),
         feat_type='melspectrogram',
         random_chunk=False,
         n_mels=config.n_mels,

@@ -108,9 +106,9 @@ def main(args, config):
     id2embedding = {}
     # Run multi times to make embedding normalization more stable.
     for i in range(2):
-        for dl in [enrol_loader, test_loader]:
+        for dl in [enroll_loader, test_loader]:
             logger.info(
-                f'Loop {[i + 1]}: Computing embeddings on {dl.dataset.subset} dataset'
+                f'Loop {[i + 1]}: Computing embeddings on {dl.dataset.csv_path} dataset'
             )
             with paddle.no_grad():
                 for batch_idx, batch in enumerate(tqdm(dl)):

@@ -152,8 +150,8 @@ def main(args, config):
     labels = []
     enroll_ids = []
     test_ids = []
-    logger.info(f"read the trial from {VoxCeleb.veri_test_file}")
-    with open(VoxCeleb.veri_test_file, 'r') as f:
+    logger.info(f"read the trial from {config.verification_file}")
+    with open(config.verification_file, 'r') as f:
         for line in f.readlines():
             label, enroll_id, test_id = line.strip().split(' ')
             labels.append(int(label))
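Since test.py no longer depends on the VoxCeleb dataset class, the enroll and test splits come straight from the CSV files prepared above. A condensed sketch of that construction, simplified to the CSVDataset signature shown in dataset.py (the full script also passes feature options such as n_mels); the build_loader helper name is made up for illustration:

import os
from paddle.io import BatchSampler, DataLoader

from paddlespeech.vector.io.batch import batch_feature_normalize
from paddlespeech.vector.io.dataset import CSVDataset


def build_loader(data_dir, csv_name, config):
    # read one prepared split (enroll.csv or test.csv) from the data directory
    dataset = CSVDataset(os.path.join(data_dir, f"vox/csv/{csv_name}"), config=config)
    sampler = BatchSampler(dataset, batch_size=config.batch_size, shuffle=True)
    return DataLoader(
        dataset,
        batch_sampler=sampler,
        collate_fn=lambda x: batch_feature_normalize(
            x, mean_norm=True, std_norm=False),
        num_workers=config.num_workers,
        return_list=True)

# enroll_loader = build_loader(args.data_dir, "enroll.csv", config)
# test_loader = build_loader(args.data_dir, "test.csv", config)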
paddlespeech/vector/exps/ecapa_tdnn/train.py

@@ -57,12 +57,10 @@ def main(args, config):
     # note: some cmd must do in rank==0, so wo will refactor the data prepare code
     train_dataset = CSVDataset(
         csv_path=os.path.join(args.data_dir, "vox/csv/train.csv"),
-        spk_id2label_path=os.path.join(args.data_dir, "vox/meta/spk_id2label.txt"))
+        label2id_path=os.path.join(args.data_dir, "vox/meta/label2id.txt"))
     dev_dataset = CSVDataset(
         csv_path=os.path.join(args.data_dir, "vox/csv/dev.csv"),
-        spk_id2label_path=os.path.join(args.data_dir, "vox/meta/spk_id2label.txt"))
+        label2id_path=os.path.join(args.data_dir, "vox/meta/label2id.txt"))

     if config.augment:
         augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)

@@ -148,7 +146,6 @@ def main(args, config):
             train_reader_cost = 0.0
             train_feat_cost = 0.0
             train_run_cost = 0.0
-            train_misce_cost = 0.0

             reader_start = time.time()
             for batch_idx, batch in enumerate(train_loader):
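train.py now points both splits at vox/meta/label2id.txt. A small, optional sanity check one could run on the prepared metadata before training; count_classes is a hypothetical helper, not part of the repository:

import os


def count_classes(data_dir):
    # label2id.txt stores one "<label_name> <label_id>" pair per line
    label2id_path = os.path.join(data_dir, "vox/meta/label2id.txt")
    with open(label2id_path) as f:
        mapping = dict(line.strip().split(' ') for line in f if line.strip())
    ids = sorted(int(i) for i in mapping.values())
    # ids written by get_train_dev_list are contiguous and start at 0
    assert ids == list(range(len(ids))), "label ids are not contiguous"
    return len(mapping)

# num_classes = count_classes(args.data_dir)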
paddlespeech/vector/io/dataset.py

@@ -25,7 +25,7 @@ logger = Log(__name__).getlog()
 # wav: utterance file path
 # start: start point in the original wav file
 # stop: stop point in the original wav file
-# lab_id: the utterance segment's label id
+# label: the utterance segment's label id
 @dataclass

@@ -45,24 +45,24 @@ class meta_info:
     wav: str
     start: int
     stop: int
-    lab_id: str
+    label: str


 class CSVDataset(Dataset):
-    def __init__(self, csv_path, spk_id2label_path=None, config=None):
+    def __init__(self, csv_path, label2id_path=None, config=None):
         """Implement the CSV Dataset

         Args:
             csv_path (str): csv dataset file path
-            spk_id2label_path (str): the utterance label to integer id map file path
+            label2id_path (str): the utterance label to integer id map file path
             config (CfgNode): yaml config
         """
         super().__init__()
         self.csv_path = csv_path
-        self.spk_id2label_path = spk_id2label_path
+        self.label2id_path = label2id_path
         self.config = config
-        self.spk_id2label = {}
-        self.label2spk_id = {}
+        self.id2label = {}
+        self.label2id = {}
         self.data = self.load_data_csv()
         self.load_speaker_to_label()

@@ -71,7 +71,7 @@ class CSVDataset(Dataset):
         the csv dataset's format has six fields,
         that is audio_id or utt_id, audio duration, segment start point, segment stop point
         and utterance label.
-        Note in training period, the utterance label must has a map to integer id in spk_id2label_path
+        Note in training period, the utterance label must has a map to integer id in label2id_path
         """
         data = []

@@ -91,16 +91,15 @@ class CSVDataset(Dataset):
         The speaker label is real speaker label in speaker verification domain,
         and in language identification is language label.
         """
-        if not self.spk_id2label_path:
+        if not self.label2id_path:
             logger.warning("No speaker id to label file")
             return

-        self.spk_id2label = {}
-        self.label2spk_id = {}
-        with open(self.spk_id2label_path, 'r') as f:
+        with open(self.label2id_path, 'r') as f:
             for line in f.readlines():
-                spk_id, label = line.strip().split(' ')
-                self.spk_id2label[spk_id] = int(label)
-                self.label2spk_id[int(label)] = spk_id
+                label_name, label_id = line.strip().split(' ')
+                self.label2id[label_name] = int(label_id)
+                self.id2label[int(label_id)] = label_name

     def convert_to_record(self, idx: int):
         """convert the dataset sample to training record the CSV Dataset

@@ -130,8 +129,8 @@ class CSVDataset(Dataset):
         # we only return the waveform as feat
         waveform = waveform[start:stop]
         record.update({'feat': waveform})
-        if self.spk_id2label:
-            record.update({'label': self.spk_id2label[record['lab_id']]})
+        if self.label2id:
+            record.update({'label': self.label2id[record['label']]})

         return record
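Net effect of the dataset.py changes: load_speaker_to_label fills label2id/id2label from the map file, and convert_to_record replaces a record's string label with its integer id. A toy walk-through with hypothetical values (not the class itself):

# build the maps the same way load_speaker_to_label does
label2id, id2label = {}, {}
for line in ["id10001 0", "id10002 1"]:          # hypothetical label2id.txt content
    label_name, label_id = line.strip().split(' ')
    label2id[label_name] = int(label_id)
    id2label[int(label_id)] = label_name

# one parsed csv row: utt_id, duration, wav, start, stop, label
record = {'utt_id': 'id10001-rec1_0.0_3.0', 'duration': 8.2,
          'wav': '/data/vox/id10001/rec1.wav', 'start': 0, 'stop': 48000,
          'label': 'id10001'}

# convert_to_record swaps the string label for the integer id used in training
if label2id:
    record.update({'label': label2id[record['label']]})
print(record['label'])  # -> 0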
paddlespeech/vector/utils/utils.py → paddlespeech/vector/utils/vector_utils.py

@@ -17,14 +17,14 @@ def get_chunks(seg_dur, audio_id, audio_duration):
     """Get all chunk segments from a utterance

     Args:
-        seg_dur (float): segment chunk duration
-        audio_id (str): utterance name
-        audio_duration (float): utterance duration
+        seg_dur (float): segment chunk duration, seconds
+        audio_id (str): utterance name,
+        audio_duration (float): utterance duration, seconds

     Returns:
         List: all the chunk segments
     """
-    num_chunks = int(audio_duration / seg_dur)  # all in milliseconds
+    num_chunks = int(audio_duration / seg_dur)  # all in seconds
     chunk_lst = [
         audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
         for i in range(num_chunks)
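For reference, a worked example of the chunk names get_chunks returns: the same logic as the function, inlined with made-up arguments (a 9-second utterance split into 3-second chunks):

seg_dur, audio_id, audio_duration = 3.0, "id10001-rec1", 9.0

num_chunks = int(audio_duration / seg_dur)  # all in seconds
chunk_lst = [
    audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
    for i in range(num_chunks)
]
print(chunk_lst)
# ['id10001-rec1_0.0_3.0', 'id10001-rec1_3.0_6.0', 'id10001-rec1_6.0_9.0']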