Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
9874fb7d
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
9874fb7d
编写于
3月 22, 2022
作者:
X
xiongxinlei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add some comments in code
上级
b9eafddd
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
39 addition
and
47 deletion
+39
-47
paddlespeech/cli/vector/infer.py
paddlespeech/cli/vector/infer.py
+21
-25
paddlespeech/vector/io/batch.py
paddlespeech/vector/io/batch.py
+12
-17
paddlespeech/vector/modules/sid_model.py
paddlespeech/vector/modules/sid_model.py
+6
-5
未找到文件。
paddlespeech/cli/vector/infer.py
浏览文件 @
9874fb7d
...
...
@@ -19,34 +19,28 @@ from typing import List
from
typing
import
Optional
from
typing
import
Union
import
librosa
import
numpy
as
np
import
paddle
import
soundfile
from
yacs.config
import
CfgNode
from
paddleaudio.backends
import
load
as
load_audio
from
paddleaudio.compliance.librosa
import
melspectrogram
from
..download
import
get_path_from_url
from
..executor
import
BaseExecutor
from
..log
import
logger
from
..utils
import
cli_register
from
..utils
import
download_and_decompress
from
..utils
import
MODEL_HOME
from
..utils
import
stats_wrapper
from
paddlespeech.vector.io.batch
import
feature_normalize
from
paddlespeech.s2t.frontend.featurizer.text_featurizer
import
TextFeaturizer
from
paddlespeech.s2t.transform.transformation
import
Transformation
from
paddleaudio.backends
import
load
as
load_audio
from
paddleaudio.compliance.librosa
import
melspectrogram
from
paddlespeech.s2t.utils.dynamic_import
import
dynamic_import
from
paddlespeech.
s2t.utils.utility
import
UpdateConfig
from
paddlespeech.
vector.io.batch
import
feature_normalize
from
paddlespeech.vector.modules.sid_model
import
SpeakerIdetification
pretrained_models
=
{
# The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
# e.g. "
EcapaT
dnn_voxceleb12-16k".
# e.g. "
ecapat
dnn_voxceleb12-16k".
# Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
# "paddlespeech vector --task spk --model
EcapaT
dnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
"
EcapaT
dnn_voxceleb12-16k"
:
{
# "paddlespeech vector --task spk --model
ecapat
dnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
"
ecapat
dnn_voxceleb12-16k"
:
{
'url'
:
'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz'
,
'md5'
:
...
...
@@ -59,7 +53,7 @@ pretrained_models = {
}
model_alias
=
{
"
EcapaT
dnn"
:
"paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn"
,
"
ecapat
dnn"
:
"paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn"
,
}
...
...
@@ -75,8 +69,8 @@ class VectorExecutor(BaseExecutor):
self
.
parser
.
add_argument
(
"--model"
,
type
=
str
,
default
=
"
EcapaT
dnn_voxceleb12"
,
choices
=
[
"
EcapaT
dnn_voxceleb12"
],
default
=
"
ecapat
dnn_voxceleb12"
,
choices
=
[
"
ecapat
dnn_voxceleb12"
],
help
=
"Choose model type of asr task."
)
self
.
parser
.
add_argument
(
"--task"
,
...
...
@@ -90,7 +84,7 @@ class VectorExecutor(BaseExecutor):
"--sample_rate"
,
type
=
int
,
default
=
16000
,
choices
=
[
16000
,
8000
],
choices
=
[
16000
],
help
=
"Choose the audio sample rate of the model. 8000 or 16000"
)
self
.
parser
.
add_argument
(
"--ckpt_path"
,
...
...
@@ -175,7 +169,7 @@ class VectorExecutor(BaseExecutor):
@
stats_wrapper
def
__call__
(
self
,
audio_file
:
os
.
PathLike
,
model
:
str
=
'
EcapaT
dnn-voxceleb12'
,
model
:
str
=
'
ecapat
dnn-voxceleb12'
,
sample_rate
:
int
=
16000
,
config
:
os
.
PathLike
=
None
,
ckpt_path
:
os
.
PathLike
=
None
,
...
...
@@ -197,9 +191,9 @@ class VectorExecutor(BaseExecutor):
def
_get_pretrained_path
(
self
,
tag
:
str
)
->
os
.
PathLike
:
support_models
=
list
(
pretrained_models
.
keys
())
assert
tag
in
pretrained_models
,
\
'The model "{}" you want to use has not been supported,
\
please choose other models.
\n
\
The support models includes
\n\t\t
{}'
.
format
(
tag
,
"
\n\t\t
"
.
join
(
support_models
))
'The model "{}" you want to use has not been supported,
'
\
'please choose other models.
\n
'
\
'The support models includes
\n\t\t
{}'
.
format
(
tag
,
"
\n\t\t
"
.
join
(
support_models
))
res_path
=
os
.
path
.
join
(
MODEL_HOME
,
tag
)
decompressed_path
=
download_and_decompress
(
pretrained_models
[
tag
],
...
...
@@ -212,7 +206,7 @@ class VectorExecutor(BaseExecutor):
return
decompressed_path
def
_init_from_path
(
self
,
model_type
:
str
=
'
EcapaT
dnn_voxceleb12'
,
model_type
:
str
=
'
ecapat
dnn_voxceleb12'
,
sample_rate
:
int
=
16000
,
cfg_path
:
Optional
[
os
.
PathLike
]
=
None
,
ckpt_path
:
Optional
[
os
.
PathLike
]
=
None
):
...
...
@@ -228,8 +222,10 @@ class VectorExecutor(BaseExecutor):
res_path
=
self
.
_get_pretrained_path
(
tag
)
self
.
res_path
=
res_path
self
.
cfg_path
=
os
.
path
.
join
(
res_path
,
pretrained_models
[
tag
][
'cfg_path'
])
self
.
ckpt_path
=
os
.
path
.
join
(
res_path
,
pretrained_models
[
tag
][
'ckpt_path'
]
+
'.pdparams'
)
self
.
cfg_path
=
os
.
path
.
join
(
res_path
,
pretrained_models
[
tag
][
'cfg_path'
])
self
.
ckpt_path
=
os
.
path
.
join
(
res_path
,
pretrained_models
[
tag
][
'ckpt_path'
]
+
'.pdparams'
)
else
:
self
.
cfg_path
=
os
.
path
.
abspath
(
cfg_path
)
self
.
ckpt_path
=
os
.
path
.
abspath
(
ckpt_path
+
".pdparams"
)
...
...
@@ -239,7 +235,7 @@ class VectorExecutor(BaseExecutor):
logger
.
info
(
f
"start to read the ckpt from
{
self
.
ckpt_path
}
"
)
logger
.
info
(
f
"read the config from
{
self
.
cfg_path
}
"
)
logger
.
info
(
f
"get the res path
{
self
.
res_path
}
"
)
# stage 2: read and config and init the model body
self
.
config
=
CfgNode
(
new_allowed
=
True
)
self
.
config
.
merge_from_file
(
self
.
cfg_path
)
...
...
@@ -269,7 +265,7 @@ class VectorExecutor(BaseExecutor):
feats
=
self
.
_inputs
[
"feats"
]
lengths
=
self
.
_inputs
[
"lengths"
]
logger
.
info
(
f
"start to do backbone network model forward"
)
logger
.
info
(
"start to do backbone network model forward"
)
logger
.
info
(
f
"feats shape:
{
feats
.
shape
}
, lengths shape:
{
lengths
.
shape
}
"
)
# embedding from (1, emb_size, 1) -> (emb_size)
...
...
paddlespeech/vector/io/batch.py
浏览文件 @
9874fb7d
...
...
@@ -11,9 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
import
numpy
as
np
import
paddle
import
numpy
def
waveform_collate_fn
(
batch
):
waveforms
=
np
.
stack
([
item
[
'feat'
]
for
item
in
batch
])
...
...
@@ -57,6 +58,7 @@ def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
return
np
.
pad
(
x
,
pad_width
,
mode
=
mode
,
**
kwargs
)
def
batch_feature_normalize
(
batch
,
mean_norm
:
bool
=
True
,
std_norm
:
bool
=
True
):
ids
=
[
item
[
'id'
]
for
item
in
batch
]
lengths
=
np
.
asarray
([
item
[
'feat'
].
shape
[
1
]
for
item
in
batch
])
...
...
@@ -100,12 +102,11 @@ def pad_right_to(array, target_shape, mode="constant", value=0):
"""
assert
len
(
target_shape
)
==
array
.
ndim
pads
=
[]
# this contains the abs length of the padding for each dimension.
valid_vals
=
[]
# thi
c
contains the relative lengths for each dimension.
i
=
0
# iterating over target_shape ndims
valid_vals
=
[]
# thi
s
contains the relative lengths for each dimension.
i
=
0
# iterating over target_shape ndims
while
i
<
len
(
target_shape
):
assert
(
target_shape
[
i
]
>=
array
.
shape
[
i
]
),
"Target shape must be >= original shape for every dim"
assert
(
target_shape
[
i
]
>=
array
.
shape
[
i
]
),
"Target shape must be >= original shape for every dim"
pads
.
append
([
0
,
target_shape
[
i
]
-
array
.
shape
[
i
]])
valid_vals
.
append
(
array
.
shape
[
i
]
/
target_shape
[
i
])
i
+=
1
...
...
@@ -136,11 +137,8 @@ def batch_pad_right(arrays, mode="constant", value=0):
# if there is only one array in the batch we simply unsqueeze it.
return
numpy
.
expand_dims
(
arrays
[
0
],
axis
=
0
),
numpy
.
array
([
1.0
])
if
not
(
any
(
[
arrays
[
i
].
ndim
==
arrays
[
0
].
ndim
for
i
in
range
(
1
,
len
(
arrays
))]
)
):
if
not
(
any
(
[
arrays
[
i
].
ndim
==
arrays
[
0
].
ndim
for
i
in
range
(
1
,
len
(
arrays
))])):
raise
IndexError
(
"All arrays must have same number of dimensions"
)
# FIXME we limit the support here: we allow padding of only the last dimension
...
...
@@ -149,11 +147,9 @@ def batch_pad_right(arrays, mode="constant", value=0):
for
dim
in
range
(
arrays
[
0
].
ndim
):
if
dim
!=
(
arrays
[
0
].
ndim
-
1
):
if
not
all
(
[
x
.
shape
[
dim
]
==
arrays
[
0
].
shape
[
dim
]
for
x
in
arrays
[
1
:]]
):
[
x
.
shape
[
dim
]
==
arrays
[
0
].
shape
[
dim
]
for
x
in
arrays
[
1
:]]):
raise
EnvironmentError
(
"arrays should have same dimensions except for last one"
)
"arrays should have same dimensions except for last one"
)
max_shape
.
append
(
max
([
x
.
shape
[
dim
]
for
x
in
arrays
]))
batched
=
[]
...
...
@@ -161,8 +157,7 @@ def batch_pad_right(arrays, mode="constant", value=0):
for
t
in
arrays
:
# for each array we apply pad_right_to
padded
,
valid_percent
=
pad_right_to
(
t
,
max_shape
,
mode
=
mode
,
value
=
value
)
t
,
max_shape
,
mode
=
mode
,
value
=
value
)
batched
.
append
(
padded
)
valid
.
append
(
valid_percent
[
-
1
])
...
...
paddlespeech/vector/modules/sid_model.py
浏览文件 @
9874fb7d
...
...
@@ -24,7 +24,8 @@ class SpeakerIdetification(nn.Layer):
lin_blocks
=
0
,
lin_neurons
=
192
,
dropout
=
0.1
,
):
"""_summary_
"""The speaker identification model, which includes the speaker backbone network
and the a linear transform to speaker class num in training
Args:
backbone (Paddle.nn.Layer class): the speaker identification backbone network model
...
...
@@ -41,7 +42,7 @@ class SpeakerIdetification(nn.Layer):
self
.
dropout
=
nn
.
Dropout
(
dropout
)
else
:
self
.
dropout
=
None
# construct the speaker classifer
input_size
=
self
.
backbone
.
emb_size
self
.
blocks
=
nn
.
LayerList
()
...
...
@@ -63,14 +64,14 @@ class SpeakerIdetification(nn.Layer):
including the speaker embedding model and the classifier model network
Args:
x (
P
addle.Tensor): input audio feats,
x (
p
addle.Tensor): input audio feats,
shape=[batch, dimension, times]
lengths (
_type_
, optional): input audio length.
lengths (
paddle.Tensor
, optional): input audio length.
shape=[batch, times]
Defaults to None.
Returns:
_type_: _description_
paddle.Tensor: return the logits of the feats
"""
# x.shape: (N, C, L)
x
=
self
.
backbone
(
x
,
lengths
).
squeeze
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录