Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
e01abc50
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e01abc50
编写于
4月 07, 2022
作者:
K
KP
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add KWS example.
上级
521e222d
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
119 additions
and
5 deletions
+119
-5
audio/paddleaudio/datasets/__init__.py
audio/paddleaudio/datasets/__init__.py
+1
-0
audio/paddleaudio/datasets/dataset.py
audio/paddleaudio/datasets/dataset.py
+17
-5
audio/paddleaudio/datasets/hey_snips.py
audio/paddleaudio/datasets/hey_snips.py
+72
-0
paddlespeech/kws/__init__.py
paddlespeech/kws/__init__.py
+1
-0
paddlespeech/kws/models/mdtc.py
paddlespeech/kws/models/mdtc.py
+28
-0
未找到文件。
audio/paddleaudio/datasets/__init__.py
浏览文件 @
e01abc50
...
...
@@ -13,6 +13,7 @@
# limitations under the License.
from
.esc50
import
ESC50
from
.gtzan
import
GTZAN
from
.hey_snips
import
HeySnips
from
.rirs_noises
import
OpenRIRNoise
from
.tess
import
TESS
from
.urban_sound
import
UrbanSound8K
...
...
audio/paddleaudio/datasets/dataset.py
浏览文件 @
e01abc50
...
...
@@ -17,6 +17,8 @@ import numpy as np
import
paddle
from
..backends
import
load
as
load_audio
from
..compliance.kaldi
import
fbank
as
kaldi_fbank
from
..compliance.kaldi
import
mfcc
as
kaldi_mfcc
from
..compliance.librosa
import
melspectrogram
from
..compliance.librosa
import
mfcc
...
...
@@ -24,6 +26,8 @@ feat_funcs = {
'raw'
:
None
,
'melspectrogram'
:
melspectrogram
,
'mfcc'
:
mfcc
,
'kaldi_fbank'
:
kaldi_fbank
,
'kaldi_mfcc'
:
kaldi_mfcc
,
}
...
...
@@ -73,16 +77,24 @@ class AudioClassificationDataset(paddle.io.Dataset):
feat_func
=
feat_funcs
[
self
.
feat_type
]
record
=
{}
record
[
'feat'
]
=
feat_func
(
waveform
,
sample_rate
,
**
self
.
feat_config
)
if
feat_func
else
waveform
if
self
.
feat_type
in
[
'kaldi_fbank'
,
'kaldi_mfcc'
]:
waveform
=
paddle
.
to_tensor
(
waveform
).
unsqueeze
(
0
)
# (C, T)
record
[
'feat'
]
=
feat_func
(
waveform
=
waveform
,
sr
=
self
.
sample_rate
,
**
self
.
feat_config
)
else
:
record
[
'feat'
]
=
feat_func
(
waveform
,
sample_rate
,
**
self
.
feat_config
)
if
feat_func
else
waveform
record
[
'label'
]
=
label
return
record
def __getitem__(self, idx):
    """Return the sample stored at position ``idx``.

    For the Kaldi feature types (``'kaldi_fbank'``, ``'kaldi_mfcc'``) the
    result is ``(key, feat, label)``, with the feature and label passed
    through exactly as ``_convert_to_record`` produced them. For every other
    feature type the result is ``(feat, label)``, where the feature is a
    transposed NumPy array and the label an ``int64`` NumPy array.

    NOTE(review): the Kaldi branch reads ``self.keys``, which must be set by
    the concrete dataset -- confirm every subclass used with Kaldi features
    provides it.
    """
    record = self._convert_to_record(idx)
    # The branch must come first: an unconditional return ahead of it would
    # make the Kaldi path unreachable.
    if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
        return self.keys[idx], record['feat'], record['label']
    return np.array(record['feat']).transpose(), np.array(
        record['label'], dtype=np.int64)
def __len__(self):
    """Return how many audio files this dataset holds."""
    file_count = len(self.files)
    return file_count
audio/paddleaudio/datasets/hey_snips.py
0 → 100644
浏览文件 @
e01abc50
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
collections
import
json
import
os
from
typing
import
List
from
typing
import
Tuple
from
.dataset
import
AudioClassificationDataset
__all__
=
[
'HeySnips'
]
class HeySnips(AudioClassificationDataset):
    """Keyword-spotting dataset driven by per-split JSON manifests.

    Each split is described by ``<data_dir>/<mode>.json``, a JSON list whose
    entries are read for the fields ``id``, ``is_hotword``, ``duration`` and
    ``audio_file_path``. Entries whose ``duration`` is not positive are
    skipped.

    Labels are ``0`` for hotword entries and ``-1`` for all other entries.
    The entry ids are kept, in manifest order, in ``self.keys``.

    Args:
        data_dir: Directory holding the manifests; audio paths in the
            manifest are joined onto it.
        mode: Manifest name without the ``.json`` suffix.
        feat_type: Feature type forwarded to the base dataset.
        sample_rate: Sample rate forwarded to the base dataset.
        **kwargs: Extra options forwarded to the base dataset.
    """

    # One manifest entry: id, integer label, duration, joined audio path.
    meta_info = collections.namedtuple('META_INFO',
                                       ('key', 'label', 'duration', 'wav'))

    def __init__(self,
                 data_dir: os.PathLike,
                 mode: str = 'train',
                 feat_type: str = 'kaldi_fbank',
                 sample_rate: int = 16000,
                 **kwargs):
        # data_dir must be set before _get_data(), which reads it.
        self.data_dir = data_dir
        files, labels = self._get_data(mode)
        super(HeySnips, self).__init__(
            files=files,
            labels=labels,
            feat_type=feat_type,
            sample_rate=sample_rate,
            **kwargs)

    def _get_meta_info(self, mode) -> List[collections.namedtuple]:
        """Load ``<data_dir>/<mode>.json`` and return its usable entries.

        Returns:
            One ``meta_info`` tuple per manifest entry with a positive
            ``duration``, in manifest order.
        """
        manifest_path = os.path.join(self.data_dir, '{}.json'.format(mode))
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(manifest_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        return [
            self.meta_info(
                key=item['id'],
                label=0 if item['is_hotword'] == 1 else -1,
                duration=item['duration'],
                wav=os.path.join(self.data_dir, item['audio_file_path']))
            for item in data if item['duration'] > 0
        ]

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        """Return ``(files, labels)`` for ``mode`` and record ``self.keys``.

        The three sequences are index-aligned: ``self.keys[i]`` is the id of
        the entry whose audio path is ``files[i]`` and label ``labels[i]``.
        """
        meta_info = self._get_meta_info(mode)

        files = []
        labels = []
        self.keys = []
        for sample in meta_info:
            files.append(sample.wav)
            labels.append(int(sample.label))
            self.keys.append(sample.key)

        return files, labels
paddlespeech/kws/__init__.py
浏览文件 @
e01abc50
...
...
@@ -11,3 +11,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.models.mdtc
import
MDTC
paddlespeech/kws/models/mdtc.py
浏览文件 @
e01abc50
...
...
@@ -179,6 +179,7 @@ class MDTC(nn.Layer):
causal
))
self
.
receptive_fields
+=
self
.
blocks
[
-
1
].
receptive_fields
self
.
half_receptive_fields
=
self
.
receptive_fields
//
2
self
.
hidden_dim
=
res_channels
def
forward
(
self
,
x
:
paddle
.
Tensor
):
if
self
.
causal
:
...
...
@@ -216,3 +217,30 @@ class MDTC(nn.Layer):
outputs
+=
x
outputs
=
outputs
.
transpose
([
0
,
2
,
1
])
return
outputs
,
None
class KWSModel(nn.Layer):
    """Keyword-spotting head: backbone -> linear projection -> sigmoid.

    Args:
        backbone: Feature-extracting layer. It must expose a ``hidden_dim``
            attribute, which is used as the input size of the linear layer.
        num_keywords: Output size of the linear layer.
    """

    def __init__(self, backbone, num_keywords):
        super(KWSModel, self).__init__()
        self.backbone = backbone
        self.linear = nn.Linear(self.backbone.hidden_dim, num_keywords)
        self.activation = nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` through the backbone, the linear layer and the sigmoid."""
        outputs = self.backbone(x)
        # MDTC.forward returns ``(outputs, None)``; keep only the feature
        # tensor so the linear layer is not handed a tuple. Backbones that
        # return a bare tensor are passed through unchanged.
        if isinstance(outputs, (tuple, list)):
            outputs = outputs[0]
        outputs = self.linear(outputs)
        return self.activation(outputs)
if __name__ == '__main__':
    # Smoke test: push random audio through the log-mel front end and an
    # MDTC backbone, printing the shape after each stage.
    paddle.set_device('cpu')
    from paddleaudio.features import LogMelSpectrogram

    model = MDTC(3, 4, 80, 32, 5, causal=True)

    batch_size, sr, seconds = 32, 16000, 5
    waveforms = paddle.randn(shape=(batch_size, sr * seconds))
    to_logmel = LogMelSpectrogram(sr=sr, n_fft=512, n_mels=80)
    logmel = to_logmel(waveforms)
    # Swap the last two axes before handing the features to the model.
    feats = logmel.transpose([0, 2, 1])
    print(feats.shape)

    # MDTC.forward returns a pair; only the first element is inspected here.
    res, _ = model(feats)
    print(res.shape)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录