Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
a1c6ee5c
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
a1c6ee5c
编写于
6月 10, 2021
作者:
H
Haoxin Ma
浏览文件
操作
浏览文件
下载
差异文件
merge
上级
3d5f2943
875139ca
变更
21
展开全部
隐藏空白更改
内联
并排
Showing
21 changed file
with
4442 addition
and
44 deletion
+4442
-44
deepspeech/__init__.py
deepspeech/__init__.py
+9
-0
deepspeech/exps/u2/model.py
deepspeech/exps/u2/model.py
+1
-4
deepspeech/io/dataset.py
deepspeech/io/dataset.py
+1
-25
deepspeech/modules/conformer_convolution.py
deepspeech/modules/conformer_convolution.py
+1
-1
deepspeech/modules/crf.py
deepspeech/modules/crf.py
+370
-0
deepspeech/modules/encoder.py
deepspeech/modules/encoder.py
+3
-1
deepspeech/modules/mask.py
deepspeech/modules/mask.py
+5
-3
examples/aishell/s0/README.md
examples/aishell/s0/README.md
+2
-1
examples/aishell/s0/conf/augmentation.json
examples/aishell/s0/conf/augmentation.json
+15
-0
examples/aishell/s1/conf/chunk_conformer.yaml
examples/aishell/s1/conf/chunk_conformer.yaml
+114
-0
examples/aishell/s1/conf/conformer.yaml
examples/aishell/s1/conf/conformer.yaml
+1
-1
examples/librispeech/s1/conf/chunk_confermer.yaml
examples/librispeech/s1/conf/chunk_confermer.yaml
+2
-2
examples/tiny/s1/conf/transformer.yaml
examples/tiny/s1/conf/transformer.yaml
+2
-2
examples/tiny/s1/run.sh
examples/tiny/s1/run.sh
+2
-4
third_party/nnAudio/nnAudio/Spectrogram.py
third_party/nnAudio/nnAudio/Spectrogram.py
+2440
-0
third_party/nnAudio/nnAudio/__init__.py
third_party/nnAudio/nnAudio/__init__.py
+1
-0
third_party/nnAudio/nnAudio/librosa_functions.py
third_party/nnAudio/nnAudio/librosa_functions.py
+490
-0
third_party/nnAudio/nnAudio/utils.py
third_party/nnAudio/nnAudio/utils.py
+535
-0
third_party/nnAudio/setup.py
third_party/nnAudio/setup.py
+37
-0
third_party/nnAudio/tests/parameters.py
third_party/nnAudio/tests/parameters.py
+38
-0
third_party/nnAudio/tests/test_spectrogram.py
third_party/nnAudio/tests/test_spectrogram.py
+373
-0
未找到文件。
deepspeech/__init__.py
浏览文件 @
a1c6ee5c
...
...
@@ -345,6 +345,15 @@ if not hasattr(paddle.Tensor, 'float'):
setattr
(
paddle
.
Tensor
,
'float'
,
func_float
)
def
func_int
(
x
:
paddle
.
Tensor
)
->
paddle
.
Tensor
:
return
x
.
astype
(
paddle
.
int
)
if
not
hasattr
(
paddle
.
Tensor
,
'int'
):
logger
.
warn
(
"register user int to paddle.Tensor, remove this when fixed!"
)
setattr
(
paddle
.
Tensor
,
'int'
,
func_int
)
def
tolist
(
x
:
paddle
.
Tensor
)
->
List
[
Any
]:
return
x
.
numpy
().
tolist
()
...
...
deepspeech/exps/u2/model.py
浏览文件 @
a1c6ee5c
...
...
@@ -368,7 +368,7 @@ class U2Tester(U2Trainer):
trans
.
append
(
''
.
join
([
chr
(
i
)
for
i
in
ids
]))
return
trans
def
compute_metrics
(
self
,
utts
,
audio
,
audio_len
,
texts
,
texts_len
,
fout
=
None
,
fref
=
None
):
def
compute_metrics
(
self
,
utts
,
audio
,
audio_len
,
texts
,
texts_len
,
fout
=
None
):
cfg
=
self
.
config
.
decoding
errors_sum
,
len_refs
,
num_ins
=
0.0
,
0
,
0
errors_func
=
error_rate
.
char_errors
if
cfg
.
error_rate_type
==
'cer'
else
error_rate
.
word_errors
...
...
@@ -402,8 +402,6 @@ class U2Tester(U2Trainer):
num_ins
+=
1
if
fout
:
fout
.
write
(
utt
+
" "
+
result
+
"
\n
"
)
if
fref
:
fref
.
write
(
utt
+
" "
+
target
+
"
\n
"
)
logger
.
info
(
"
\n
Target Transcription: %s
\n
Output Transcription: %s"
%
(
target
,
result
))
logger
.
info
(
"One example error rate [%s] = %f"
%
...
...
@@ -432,7 +430,6 @@ class U2Tester(U2Trainer):
num_time
=
0.0
with
open
(
self
.
args
.
result_file
,
'w'
)
as
fout
:
for
i
,
batch
in
enumerate
(
self
.
test_loader
):
# utt, audio, audio_len, text, text_len = batch
metrics
=
self
.
compute_metrics
(
*
batch
,
fout
=
fout
)
num_frames
+=
metrics
[
'num_frames'
]
num_time
+=
metrics
[
"decode_time"
]
...
...
deepspeech/io/dataset.py
浏览文件 @
a1c6ee5c
...
...
@@ -223,33 +223,9 @@ class ManifestDataset(Dataset):
def
manifest
(
self
):
return
self
.
_manifest
@
property
def
vocab_size
(
self
):
return
self
.
_speech_featurizer
.
vocab_size
@
property
def
vocab_list
(
self
):
return
self
.
_speech_featurizer
.
vocab_list
@
property
def
vocab_dict
(
self
):
return
self
.
_speech_featurizer
.
vocab_dict
@
property
def
text_feature
(
self
):
return
self
.
_speech_featurizer
.
text_feature
@
property
def
feature_size
(
self
):
return
self
.
_speech_featurizer
.
feature_size
@
property
def
stride_ms
(
self
):
return
self
.
_speech_featurizer
.
stride_ms
def
__len__
(
self
):
return
len
(
self
.
_manifest
)
def
__getitem__
(
self
,
idx
):
instance
=
self
.
_manifest
[
idx
]
return
(
instance
[
"utt"
],
instance
[
"feat"
],
instance
[
"text"
])
return
instance
[
"utt"
],
instance
[
"feat"
],
instance
[
"text"
]
deepspeech/modules/conformer_convolution.py
浏览文件 @
a1c6ee5c
...
...
@@ -126,7 +126,7 @@ class ConvolutionModule(nn.Layer):
if
self
.
lorder
>
0
:
if
cache
is
None
:
x
=
nn
.
functional
.
pad
(
x
,
(
self
.
lorder
,
0
)
,
'constant'
,
0.0
,
data_format
=
'NCL'
)
x
,
[
self
.
lorder
,
0
]
,
'constant'
,
0.0
,
data_format
=
'NCL'
)
else
:
assert
cache
.
shape
[
0
]
==
x
.
shape
[
0
]
# B
assert
cache
.
shape
[
1
]
==
x
.
shape
[
1
]
# C
...
...
deepspeech/modules/crf.py
0 → 100644
浏览文件 @
a1c6ee5c
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle
from
paddle
import
nn
from
deepspeech.utils.log
import
Log
logger
=
Log
(
__name__
).
getlog
()
__all__
=
[
'CRF'
]
class
CRF
(
nn
.
Layer
):
"""
Linear-chain Conditional Random Field (CRF).
Args:
nb_labels (int): number of labels in your tagset, including special symbols.
bos_tag_id (int): integer representing the beginning of sentence symbol in
your tagset.
eos_tag_id (int): integer representing the end of sentence symbol in your tagset.
pad_tag_id (int, optional): integer representing the pad symbol in your tagset.
If None, the model will treat the PAD as a normal tag. Otherwise, the model
will apply constraints for PAD transitions.
batch_first (bool): Whether the first dimension represents the batch dimension.
"""
def
__init__
(
self
,
nb_labels
:
int
,
bos_tag_id
:
int
,
eos_tag_id
:
int
,
pad_tag_id
:
int
=
None
,
batch_first
:
bool
=
True
):
super
().
__init__
()
self
.
nb_labels
=
nb_labels
self
.
BOS_TAG_ID
=
bos_tag_id
self
.
EOS_TAG_ID
=
eos_tag_id
self
.
PAD_TAG_ID
=
pad_tag_id
self
.
batch_first
=
batch_first
# initialize transitions from a random uniform distribution between -0.1 and 0.1
self
.
transitions
=
self
.
create_parameter
(
[
self
.
nb_labels
,
self
.
nb_labels
],
default_initializer
=
nn
.
initializer
.
Uniform
(
-
0.1
,
0.1
))
self
.
init_weights
()
def
init_weights
(
self
):
# enforce contraints (rows=from, columns=to) with a big negative number
# so exp(-10000) will tend to zero
# no transitions allowed to the beginning of sentence
self
.
transitions
[:,
self
.
BOS_TAG_ID
]
=
-
10000.0
# no transition alloed from the end of sentence
self
.
transitions
[
self
.
EOS_TAG_ID
,
:]
=
-
10000.0
if
self
.
PAD_TAG_ID
is
not
None
:
# no transitions from padding
self
.
transitions
[
self
.
PAD_TAG_ID
,
:]
=
-
10000.0
# no transitions to padding
self
.
transitions
[:,
self
.
PAD_TAG_ID
]
=
-
10000.0
# except if the end of sentence is reached
# or we are already in a pad position
self
.
transitions
[
self
.
PAD_TAG_ID
,
self
.
EOS_TAG_ID
]
=
0.0
self
.
transitions
[
self
.
PAD_TAG_ID
,
self
.
PAD_TAG_ID
]
=
0.0
def
forward
(
self
,
emissions
:
paddle
.
Tensor
,
tags
:
paddle
.
Tensor
,
mask
:
paddle
.
Tensor
=
None
)
->
paddle
.
Tensor
:
"""Compute the negative log-likelihood. See `log_likelihood` method."""
nll
=
-
self
.
log_likelihood
(
emissions
,
tags
,
mask
=
mask
)
return
nll
def
log_likelihood
(
self
,
emissions
,
tags
,
mask
=
None
):
"""Compute the probability of a sequence of tags given a sequence of
emissions scores.
Args:
emissions (paddle.Tensor): Sequence of emissions for each label.
Shape of (batch_size, seq_len, nb_labels) if batch_first is True,
(seq_len, batch_size, nb_labels) otherwise.
tags (paddle.LongTensor): Sequence of labels.
Shape of (batch_size, seq_len) if batch_first is True,
(seq_len, batch_size) otherwise.
mask (paddle.FloatTensor, optional): Tensor representing valid positions.
If None, all positions are considered valid.
Shape of (batch_size, seq_len) if batch_first is True,
(seq_len, batch_size) otherwise.
Returns:
paddle.Tensor: sum of the log-likelihoods for each sequence in the batch.
Shape of ()
"""
# fix tensors order by setting batch as the first dimension
if
not
self
.
batch_first
:
emissions
=
emissions
.
transpose
(
0
,
1
)
tags
=
tags
.
transpose
(
0
,
1
)
if
mask
is
None
:
mask
=
paddle
.
ones
(
emissions
.
shape
[:
2
],
dtype
=
paddle
.
float
)
scores
=
self
.
_compute_scores
(
emissions
,
tags
,
mask
=
mask
)
partition
=
self
.
_compute_log_partition
(
emissions
,
mask
=
mask
)
return
paddle
.
sum
(
scores
-
partition
)
def
decode
(
self
,
emissions
,
mask
=
None
):
"""Find the most probable sequence of labels given the emissions using
the Viterbi algorithm.
Args:
emissions (paddle.Tensor): Sequence of emissions for each label.
Shape (batch_size, seq_len, nb_labels) if batch_first is True,
(seq_len, batch_size, nb_labels) otherwise.
mask (paddle.FloatTensor, optional): Tensor representing valid positions.
If None, all positions are considered valid.
Shape (batch_size, seq_len) if batch_first is True,
(seq_len, batch_size) otherwise.
Returns:
paddle.Tensor: the viterbi score for the for each batch.
Shape of (batch_size,)
list of lists: the best viterbi sequence of labels for each batch. [B, T]
"""
# fix tensors order by setting batch as the first dimension
if
not
self
.
batch_first
:
emissions
=
emissions
.
transpose
(
0
,
1
)
tags
=
tags
.
transpose
(
0
,
1
)
if
mask
is
None
:
mask
=
paddle
.
ones
(
emissions
.
shape
[:
2
],
dtype
=
paddle
.
float
)
scores
,
sequences
=
self
.
_viterbi_decode
(
emissions
,
mask
)
return
scores
,
sequences
def
_compute_scores
(
self
,
emissions
,
tags
,
mask
):
"""Compute the scores for a given batch of emissions with their tags.
Args:
emissions (paddle.Tensor): (batch_size, seq_len, nb_labels)
tags (Paddle.LongTensor): (batch_size, seq_len)
mask (Paddle.FloatTensor): (batch_size, seq_len)
Returns:
paddle.Tensor: Scores for each batch.
Shape of (batch_size,)
"""
batch_size
,
seq_length
=
tags
.
shape
scores
=
paddle
.
zeros
([
batch_size
])
# save first and last tags to be used later
first_tags
=
tags
[:,
0
]
last_valid_idx
=
mask
.
int
().
sum
(
1
)
-
1
# TODO(Hui Zhang): not support fancy index.
# last_tags = tags.gather(last_valid_idx.unsqueeze(1), axis=1).squeeze()
batch_idx
=
paddle
.
arange
(
batch_size
,
dtype
=
last_valid_idx
.
dtype
)
gather_last_valid_idx
=
paddle
.
stack
(
[
batch_idx
,
last_valid_idx
],
axis
=-
1
)
last_tags
=
tags
.
gather_nd
(
gather_last_valid_idx
)
# add the transition from BOS to the first tags for each batch
# t_scores = self.transitions[self.BOS_TAG_ID, first_tags]
t_scores
=
self
.
transitions
[
self
.
BOS_TAG_ID
].
gather
(
first_tags
)
# add the [unary] emission scores for the first tags for each batch
# for all batches, the first word, see the correspondent emissions
# for the first tags (which is a list of ids):
# emissions[:, 0, [tag_1, tag_2, ..., tag_nblabels]]
# e_scores = emissions[:, 0].gather(1, first_tags.unsqueeze(1)).squeeze()
gather_first_tags_idx
=
paddle
.
stack
([
batch_idx
,
first_tags
],
axis
=-
1
)
e_scores
=
emissions
[:,
0
].
gather_nd
(
gather_first_tags_idx
)
# the scores for a word is just the sum of both scores
scores
+=
e_scores
+
t_scores
# now lets do this for each remaining word
for
i
in
range
(
1
,
seq_length
):
# we could: iterate over batches, check if we reached a mask symbol
# and stop the iteration, but vecotrizing is faster due to gpu,
# so instead we perform an element-wise multiplication
is_valid
=
mask
[:,
i
]
previous_tags
=
tags
[:,
i
-
1
]
current_tags
=
tags
[:,
i
]
# calculate emission and transition scores as we did before
# e_scores = emissions[:, i].gather(1, current_tags.unsqueeze(1)).squeeze()
gather_current_tags_idx
=
paddle
.
stack
(
[
batch_idx
,
current_tags
],
axis
=-
1
)
e_scores
=
emissions
[:,
i
].
gather_nd
(
gather_current_tags_idx
)
# t_scores = self.transitions[previous_tags, current_tags]
gather_transitions_idx
=
paddle
.
stack
(
[
previous_tags
,
current_tags
],
axis
=-
1
)
t_scores
=
self
.
transitions
.
gather_nd
(
gather_transitions_idx
)
# apply the mask
e_scores
=
e_scores
*
is_valid
t_scores
=
t_scores
*
is_valid
scores
+=
e_scores
+
t_scores
# add the transition from the end tag to the EOS tag for each batch
# scores += self.transitions[last_tags, self.EOS_TAG_ID]
scores
+=
self
.
transitions
.
gather
(
last_tags
)[:,
self
.
EOS_TAG_ID
]
return
scores
def
_compute_log_partition
(
self
,
emissions
,
mask
):
"""Compute the partition function in log-space using the forward-algorithm.
Args:
emissions (paddle.Tensor): (batch_size, seq_len, nb_labels)
mask (Paddle.FloatTensor): (batch_size, seq_len)
Returns:
paddle.Tensor: the partition scores for each batch.
Shape of (batch_size,)
"""
batch_size
,
seq_length
,
nb_labels
=
emissions
.
shape
# in the first iteration, BOS will have all the scores
alphas
=
self
.
transitions
[
self
.
BOS_TAG_ID
,
:].
unsqueeze
(
0
)
+
emissions
[:,
0
]
for
i
in
range
(
1
,
seq_length
):
# (bs, nb_labels) -> (bs, 1, nb_labels)
e_scores
=
emissions
[:,
i
].
unsqueeze
(
1
)
# (nb_labels, nb_labels) -> (bs, nb_labels, nb_labels)
t_scores
=
self
.
transitions
.
unsqueeze
(
0
)
# (bs, nb_labels) -> (bs, nb_labels, 1)
a_scores
=
alphas
.
unsqueeze
(
2
)
scores
=
e_scores
+
t_scores
+
a_scores
new_alphas
=
paddle
.
logsumexp
(
scores
,
axis
=
1
)
# set alphas if the mask is valid, otherwise keep the current values
is_valid
=
mask
[:,
i
].
unsqueeze
(
-
1
)
alphas
=
is_valid
*
new_alphas
+
(
1
-
is_valid
)
*
alphas
# add the scores for the final transition
last_transition
=
self
.
transitions
[:,
self
.
EOS_TAG_ID
]
end_scores
=
alphas
+
last_transition
.
unsqueeze
(
0
)
# return a *log* of sums of exps
return
paddle
.
logsumexp
(
end_scores
,
axis
=
1
)
def
_viterbi_decode
(
self
,
emissions
,
mask
):
"""Compute the viterbi algorithm to find the most probable sequence of labels
given a sequence of emissions.
Args:
emissions (paddle.Tensor): (batch_size, seq_len, nb_labels)
mask (Paddle.FloatTensor): (batch_size, seq_len)
Returns:
paddle.Tensor: the viterbi score for the for each batch.
Shape of (batch_size,)
list of lists of ints: the best viterbi sequence of labels for each batch
"""
batch_size
,
seq_length
,
nb_labels
=
emissions
.
shape
# in the first iteration, BOS will have all the scores and then, the max
alphas
=
self
.
transitions
[
self
.
BOS_TAG_ID
,
:].
unsqueeze
(
0
)
+
emissions
[:,
0
]
backpointers
=
[]
for
i
in
range
(
1
,
seq_length
):
# (bs, nb_labels) -> (bs, 1, nb_labels)
e_scores
=
emissions
[:,
i
].
unsqueeze
(
1
)
# (nb_labels, nb_labels) -> (bs, nb_labels, nb_labels)
t_scores
=
self
.
transitions
.
unsqueeze
(
0
)
# (bs, nb_labels) -> (bs, nb_labels, 1)
a_scores
=
alphas
.
unsqueeze
(
2
)
# combine current scores with previous alphas
scores
=
e_scores
+
t_scores
+
a_scores
# so far is exactly like the forward algorithm,
# but now, instead of calculating the logsumexp,
# we will find the highest score and the tag associated with it
# max_scores, max_score_tags = paddle.max(scores, axis=1)
max_scores
=
paddle
.
max
(
scores
,
axis
=
1
)
max_score_tags
=
paddle
.
argmax
(
scores
,
axis
=
1
)
# set alphas if the mask is valid, otherwise keep the current values
is_valid
=
mask
[:,
i
].
unsqueeze
(
-
1
)
alphas
=
is_valid
*
max_scores
+
(
1
-
is_valid
)
*
alphas
# add the max_score_tags for our list of backpointers
# max_scores has shape (batch_size, nb_labels) so we transpose it to
# be compatible with our previous loopy version of viterbi
backpointers
.
append
(
max_score_tags
.
t
())
# add the scores for the final transition
last_transition
=
self
.
transitions
[:,
self
.
EOS_TAG_ID
]
end_scores
=
alphas
+
last_transition
.
unsqueeze
(
0
)
# get the final most probable score and the final most probable tag
# max_final_scores, max_final_tags = paddle.max(end_scores, axis=1)
max_final_scores
=
paddle
.
max
(
end_scores
,
axis
=
1
)
max_final_tags
=
paddle
.
argmax
(
end_scores
,
axis
=
1
)
# find the best sequence of labels for each sample in the batch
best_sequences
=
[]
emission_lengths
=
mask
.
int
().
sum
(
axis
=
1
)
for
i
in
range
(
batch_size
):
# recover the original sentence length for the i-th sample in the batch
sample_length
=
emission_lengths
[
i
].
item
()
# recover the max tag for the last timestep
sample_final_tag
=
max_final_tags
[
i
].
item
()
# limit the backpointers until the last but one
# since the last corresponds to the sample_final_tag
sample_backpointers
=
backpointers
[:
sample_length
-
1
]
# follow the backpointers to build the sequence of labels
sample_path
=
self
.
_find_best_path
(
i
,
sample_final_tag
,
sample_backpointers
)
# add this path to the list of best sequences
best_sequences
.
append
(
sample_path
)
return
max_final_scores
,
best_sequences
def
_find_best_path
(
self
,
sample_id
,
best_tag
,
backpointers
):
"""Auxiliary function to find the best path sequence for a specific sample.
Args:
sample_id (int): sample index in the range [0, batch_size)
best_tag (int): tag which maximizes the final score
backpointers (list of lists of tensors): list of pointers with
shape (seq_len_i-1, nb_labels, batch_size) where seq_len_i
represents the length of the ith sample in the batch
Returns:
list of ints: a list of tag indexes representing the bast path
"""
# add the final best_tag to our best path
best_path
=
[
best_tag
]
# traverse the backpointers in backwards
for
backpointers_t
in
reversed
(
backpointers
):
# recover the best_tag at this timestep
best_tag
=
backpointers_t
[
best_tag
][
sample_id
].
item
()
# append to the beginning of the list so we don't need to reverse it later
best_path
.
insert
(
0
,
best_tag
)
return
best_path
deepspeech/modules/encoder.py
浏览文件 @
a1c6ee5c
...
...
@@ -209,7 +209,9 @@ class BaseEncoder(nn.Layer):
"""
assert
xs
.
size
(
0
)
==
1
# batch size must be one
# tmp_masks is just for interface compatibility
tmp_masks
=
paddle
.
ones
([
1
,
xs
.
size
(
1
)],
dtype
=
paddle
.
bool
)
# TODO(Hui Zhang): stride_slice not support bool tensor
# tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
tmp_masks
=
paddle
.
ones
([
1
,
xs
.
size
(
1
)],
dtype
=
paddle
.
int32
)
tmp_masks
=
tmp_masks
.
unsqueeze
(
1
)
#[B=1, C=1, T]
if
self
.
global_cmvn
is
not
None
:
...
...
deepspeech/modules/mask.py
浏览文件 @
a1c6ee5c
...
...
@@ -121,7 +121,7 @@ def subsequent_chunk_mask(
[1, 1, 1, 1],
[1, 1, 1, 1]]
"""
ret
=
torch
.
zeros
([
size
,
size
],
dtype
=
paddle
.
bool
)
ret
=
paddle
.
zeros
([
size
,
size
],
dtype
=
paddle
.
bool
)
for
i
in
range
(
size
):
if
num_left_chunks
<
0
:
start
=
0
...
...
@@ -186,13 +186,15 @@ def add_optional_chunk_mask(xs: paddle.Tensor,
chunk_masks
=
subsequent_chunk_mask
(
xs
.
shape
[
1
],
chunk_size
,
num_left_chunks
)
# (L, L)
chunk_masks
=
chunk_masks
.
unsqueeze
(
0
)
# (1, L, L)
chunk_masks
=
masks
&
chunk_masks
# (B, L, L)
# chunk_masks = masks & chunk_masks # (B, L, L)
chunk_masks
=
masks
.
logical_and
(
chunk_masks
)
# (B, L, L)
elif
static_chunk_size
>
0
:
num_left_chunks
=
num_decoding_left_chunks
chunk_masks
=
subsequent_chunk_mask
(
xs
.
shape
[
1
],
static_chunk_size
,
num_left_chunks
)
# (L, L)
chunk_masks
=
chunk_masks
.
unsqueeze
(
0
)
# (1, L, L)
chunk_masks
=
masks
&
chunk_masks
# (B, L, L)
# chunk_masks = masks & chunk_masks # (B, L, L)
chunk_masks
=
masks
.
logical_and
(
chunk_masks
)
# (B, L, L)
else
:
chunk_masks
=
masks
return
chunk_masks
...
...
examples/aishell/s0/README.md
浏览文件 @
a1c6ee5c
...
...
@@ -4,6 +4,7 @@
| Model | release | Config | Test set | Loss | CER |
| --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
| DeepSpeech2 | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |
| DeepSpeech2 | 1.8.5 | - | test | - | 0.080447 |
examples/aishell/s0/conf/augmentation.json
浏览文件 @
a1c6ee5c
...
...
@@ -15,5 +15,20 @@
"max_shift_ms"
:
5
},
"prob"
:
1.0
},
{
"type"
:
"specaug"
,
"params"
:
{
"F"
:
10
,
"T"
:
50
,
"n_freq_masks"
:
2
,
"n_time_masks"
:
2
,
"p"
:
1.0
,
"W"
:
80
,
"adaptive_number_ratio"
:
0
,
"adaptive_size_ratio"
:
0
,
"max_n_time_masks"
:
20
},
"prob"
:
1.0
}
]
examples/aishell/s1/conf/chunk_conformer.yaml
0 → 100644
浏览文件 @
a1c6ee5c
# https://yaml.org/type/float.html
data
:
train_manifest
:
data/manifest.train
dev_manifest
:
data/manifest.dev
test_manifest
:
data/manifest.test
vocab_filepath
:
data/vocab.txt
unit_type
:
'
char'
spm_model_prefix
:
'
'
augmentation_config
:
conf/augmentation.json
batch_size
:
32
min_input_len
:
0.5
max_input_len
:
20.0
# second
min_output_len
:
0.0
max_output_len
:
400.0
min_output_input_ratio
:
0.05
max_output_input_ratio
:
10.0
raw_wav
:
True
# use raw_wav or kaldi feature
specgram_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
delta_delta
:
False
dither
:
1.0
target_sample_rate
:
16000
max_freq
:
None
n_fft
:
None
stride_ms
:
10.0
window_ms
:
25.0
use_dB_normalization
:
True
target_dB
:
-20
random_seed
:
0
keep_transcription_text
:
False
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
0
# network architecture
model
:
cmvn_file
:
"
data/mean_std.json"
cmvn_file_type
:
"
json"
# encoder related
encoder
:
conformer
encoder_conf
:
output_size
:
256
# dimension of attention
attention_heads
:
4
linear_units
:
2048
# the number of units of position-wise feed forward
num_blocks
:
12
# the number of encoder blocks
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
attention_dropout_rate
:
0.0
input_layer
:
conv2d
# encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before
:
True
use_cnn_module
:
True
cnn_module_kernel
:
15
activation_type
:
'
swish'
pos_enc_layer_type
:
'
rel_pos'
selfattention_layer_type
:
'
rel_selfattn'
causal
:
true
use_dynamic_chunk
:
true
cnn_module_norm
:
'
layer_norm'
# using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk
:
false
# decoder related
decoder
:
transformer
decoder_conf
:
attention_heads
:
4
linear_units
:
2048
num_blocks
:
6
dropout_rate
:
0.1
positional_dropout_rate
:
0.1
self_attention_dropout_rate
:
0.0
src_attention_dropout_rate
:
0.0
# hybrid CTC/attention
model_conf
:
ctc_weight
:
0.3
lsm_weight
:
0.1
# label smoothing option
length_normalized_loss
:
false
training
:
n_epoch
:
180
accum_grad
:
4
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
lr
:
0.001
weight_decay
:
1e-6
scheduler
:
warmuplr
# pytorch v1.1.0+ required
scheduler_conf
:
warmup_steps
:
25000
lr_decay
:
1.0
log_interval
:
100
decoding
:
batch_size
:
128
error_rate_type
:
cer
decoding_method
:
attention
# 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path
:
data/lm/common_crawl_00.prune01111.trie.klm
alpha
:
2.5
beta
:
0.3
beam_size
:
10
cutoff_prob
:
1.0
cutoff_top_n
:
0
num_proc_bsearch
:
8
ctc_weight
:
0.5
# ctc weight for attention rescoring decode mode.
decoding_chunk_size
:
-1
# decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
true
# simulate streaming inference. Defaults to False.
examples/aishell/s1/conf/conformer.yaml
浏览文件 @
a1c6ee5c
...
...
@@ -76,7 +76,7 @@ model:
training
:
n_epoch
:
240
accum_grad
:
2
global_grad_clip
:
3
.0
global_grad_clip
:
5
.0
optim
:
adam
optim_conf
:
lr
:
0.002
...
...
examples/librispeech/s1/conf/chunk_confermer.yaml
浏览文件 @
a1c6ee5c
...
...
@@ -56,7 +56,7 @@ model:
pos_enc_layer_type
:
'
rel_pos'
selfattention_layer_type
:
'
rel_selfattn'
causal
:
True
use_dynamic_chunk
:
T
rue
use_dynamic_chunk
:
t
rue
cnn_module_norm
:
'
layer_norm'
# using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk
:
false
...
...
@@ -110,6 +110,6 @@ decoding:
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
Fals
e
# simulate streaming inference. Defaults to False.
simulate_streaming
:
tru
e
# simulate streaming inference. Defaults to False.
examples/tiny/s1/conf/transformer.yaml
浏览文件 @
a1c6ee5c
...
...
@@ -8,7 +8,7 @@ data:
spm_model_prefix
:
'
data/bpe_unigram_200'
mean_std_filepath
:
"
"
augmentation_config
:
conf/augmentation.json
batch_size
:
2
#
4
batch_size
:
4
min_input_len
:
0.5
# second
max_input_len
:
20.0
# second
min_output_len
:
0.0
# tokens
...
...
@@ -31,7 +31,7 @@ data:
keep_transcription_text
:
False
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
0
#
2
num_workers
:
2
# network architecture
...
...
examples/tiny/s1/run.sh
浏览文件 @
a1c6ee5c
...
...
@@ -30,12 +30,10 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# test ckpt avg_n
# CUDA_VISIBLE_DEVICES=7
./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
7 ./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
# export ckpt avg_n
# CUDA_VISIBLE_DEVICES=
./local/export.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
.jit
CUDA_VISIBLE_DEVICES
=
./local/export.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
.jit
fi
third_party/nnAudio/nnAudio/Spectrogram.py
0 → 100755
浏览文件 @
a1c6ee5c
此差异已折叠。
点击以展开。
third_party/nnAudio/nnAudio/__init__.py
0 → 100755
浏览文件 @
a1c6ee5c
__version__
=
"0.2.2"
\ No newline at end of file
third_party/nnAudio/nnAudio/librosa_functions.py
0 → 100755
浏览文件 @
a1c6ee5c
"""
Module containing functions cloned from librosa
To make sure nnAudio would not become broken when updating librosa
"""
import
numpy
as
np
import
warnings
### ----------------Functions for generating kenral for Mel Spectrogram------------ ###
# This code is equalvant to from librosa.filters import mel
# By doing so, we can run nnAudio without installing librosa
def
fft2gammatonemx
(
sr
=
20000
,
n_fft
=
2048
,
n_bins
=
64
,
width
=
1.0
,
fmin
=
0.0
,
fmax
=
11025
,
maxlen
=
1024
):
"""
# Ellis' description in MATLAB:
# [wts,cfreqa] = fft2gammatonemx(nfft, sr, nfilts, width, minfreq, maxfreq, maxlen)
# Generate a matrix of weights to combine FFT bins into
# Gammatone bins. nfft defines the source FFT size at
# sampling rate sr. Optional nfilts specifies the number of
# output bands required (default 64), and width is the
# constant width of each band in Bark (default 1).
# minfreq, maxfreq specify range covered in Hz (100, sr/2).
# While wts has nfft columns, the second half are all zero.
# Hence, aud spectrum is
# fft2gammatonemx(nfft,sr)*abs(fft(xincols,nfft));
# maxlen truncates the rows to this many bins.
# cfreqs returns the actual center frequencies of each
# gammatone band in Hz.
#
# 2009/02/22 02:29:25 Dan Ellis dpwe@ee.columbia.edu based on rastamat/audspec.m
# Sat May 27 15:37:50 2017 Maddie Cusimano, mcusi@mit.edu 27 May 2017: convert to python
"""
wts
=
np
.
zeros
([
n_bins
,
n_fft
],
dtype
=
np
.
float32
)
# after Slaney's MakeERBFilters
EarQ
=
9.26449
;
minBW
=
24.7
;
order
=
1
;
nFr
=
np
.
array
(
range
(
n_bins
))
+
1
em
=
EarQ
*
minBW
cfreqs
=
(
fmax
+
em
)
*
np
.
exp
(
nFr
*
(
-
np
.
log
(
fmax
+
em
)
+
np
.
log
(
fmin
+
em
))
/
n_bins
)
-
em
cfreqs
=
cfreqs
[::
-
1
]
GTord
=
4
ucircArray
=
np
.
array
(
range
(
int
(
n_fft
/
2
+
1
)))
ucirc
=
np
.
exp
(
1j
*
2
*
np
.
pi
*
ucircArray
/
n_fft
);
# justpoles = 0 :taking out the 'if' corresponding to this.
ERB
=
width
*
np
.
power
(
np
.
power
(
cfreqs
/
EarQ
,
order
)
+
np
.
power
(
minBW
,
order
),
1
/
order
);
B
=
1.019
*
2
*
np
.
pi
*
ERB
;
r
=
np
.
exp
(
-
B
/
sr
)
theta
=
2
*
np
.
pi
*
cfreqs
/
sr
pole
=
r
*
np
.
exp
(
1j
*
theta
)
T
=
1
/
sr
ebt
=
np
.
exp
(
B
*
T
);
cpt
=
2
*
cfreqs
*
np
.
pi
*
T
;
ccpt
=
2
*
T
*
np
.
cos
(
cpt
);
scpt
=
2
*
T
*
np
.
sin
(
cpt
);
A11
=
-
np
.
divide
(
np
.
divide
(
ccpt
,
ebt
)
+
np
.
divide
(
np
.
sqrt
(
3
+
2
**
1.5
)
*
scpt
,
ebt
),
2
);
A12
=
-
np
.
divide
(
np
.
divide
(
ccpt
,
ebt
)
-
np
.
divide
(
np
.
sqrt
(
3
+
2
**
1.5
)
*
scpt
,
ebt
),
2
);
A13
=
-
np
.
divide
(
np
.
divide
(
ccpt
,
ebt
)
+
np
.
divide
(
np
.
sqrt
(
3
-
2
**
1.5
)
*
scpt
,
ebt
),
2
);
A14
=
-
np
.
divide
(
np
.
divide
(
ccpt
,
ebt
)
-
np
.
divide
(
np
.
sqrt
(
3
-
2
**
1.5
)
*
scpt
,
ebt
),
2
);
zros
=
-
np
.
array
([
A11
,
A12
,
A13
,
A14
])
/
T
;
wIdx
=
range
(
int
(
n_fft
/
2
+
1
))
gain
=
np
.
abs
((
-
2
*
np
.
exp
(
4
*
1j
*
cfreqs
*
np
.
pi
*
T
)
*
T
+
2
*
np
.
exp
(
-
(
B
*
T
)
+
2
*
1j
*
cfreqs
*
np
.
pi
*
T
)
*
T
*
(
np
.
cos
(
2
*
cfreqs
*
np
.
pi
*
T
)
-
np
.
sqrt
(
3
-
2
**
(
3
/
2
))
*
np
.
sin
(
2
*
cfreqs
*
np
.
pi
*
T
)))
*
(
-
2
*
np
.
exp
(
4
*
1j
*
cfreqs
*
np
.
pi
*
T
)
*
T
+
2
*
np
.
exp
(
-
(
B
*
T
)
+
2
*
1j
*
cfreqs
*
np
.
pi
*
T
)
*
T
*
(
np
.
cos
(
2
*
cfreqs
*
np
.
pi
*
T
)
+
np
.
sqrt
(
3
-
2
**
(
3
/
2
))
*
np
.
sin
(
2
*
cfreqs
*
np
.
pi
*
T
)))
*
(
-
2
*
np
.
exp
(
4
*
1j
*
cfreqs
*
np
.
pi
*
T
)
*
T
+
2
*
np
.
exp
(
-
(
B
*
T
)
+
2
*
1j
*
cfreqs
*
np
.
pi
*
T
)
*
T
*
(
np
.
cos
(
2
*
cfreqs
*
np
.
pi
*
T
)
-
np
.
sqrt
(
3
+
2
**
(
3
/
2
))
*
np
.
sin
(
2
*
cfreqs
*
np
.
pi
*
T
)))
*
(
-
2
*
np
.
exp
(
4
*
1j
*
cfreqs
*
np
.
pi
*
T
)
*
T
+
2
*
np
.
exp
(
-
(
B
*
T
)
+
2
*
1j
*
cfreqs
*
np
.
pi
*
T
)
*
T
*
(
np
.
cos
(
2
*
cfreqs
*
np
.
pi
*
T
)
+
np
.
sqrt
(
3
+
2
**
(
3
/
2
))
*
np
.
sin
(
2
*
cfreqs
*
np
.
pi
*
T
)))
/
(
-
2
/
np
.
exp
(
2
*
B
*
T
)
-
2
*
np
.
exp
(
4
*
1j
*
cfreqs
*
np
.
pi
*
T
)
+
2
*
(
1
+
np
.
exp
(
4
*
1j
*
cfreqs
*
np
.
pi
*
T
))
/
np
.
exp
(
B
*
T
))
**
4
);
# in MATLAB, there used to be 64 where here it says n_bins:
wts
[:,
wIdx
]
=
((
T
**
4
)
/
np
.
reshape
(
gain
,
(
n_bins
,
1
)))
*
np
.
abs
(
ucirc
-
np
.
reshape
(
zros
[
0
],
(
n_bins
,
1
)))
*
np
.
abs
(
ucirc
-
np
.
reshape
(
zros
[
1
],
(
n_bins
,
1
)))
*
np
.
abs
(
ucirc
-
np
.
reshape
(
zros
[
2
],
(
n_bins
,
1
)))
*
np
.
abs
(
ucirc
-
np
.
reshape
(
zros
[
3
],
(
n_bins
,
1
)))
*
(
np
.
abs
(
np
.
power
(
np
.
multiply
(
np
.
reshape
(
pole
,
(
n_bins
,
1
))
-
ucirc
,
np
.
conj
(
np
.
reshape
(
pole
,
(
n_bins
,
1
)))
-
ucirc
),
-
GTord
)));
wts
=
wts
[:,
range
(
maxlen
)];
return
wts
,
cfreqs
def gammatone(sr, n_fft, n_bins=64, fmin=20.0, fmax=None, htk=False,
              norm=1, dtype=np.float32):
    """Create a Filterbank matrix to combine FFT bins into Gammatone bins.

    Parameters
    ----------
    sr : number > 0 [scalar]
        sampling rate of the incoming signal
    n_fft : int > 0 [scalar]
        number of FFT components
    n_bins : int > 0 [scalar]
        number of Gammatone bands to generate
    fmin : float >= 0 [scalar]
        lowest frequency (in Hz)
    fmax : float >= 0 [scalar]
        highest frequency (in Hz).
        If `None`, use `fmax = sr / 2.0`
    htk : bool [scalar]
        accepted for signature symmetry with `mel` but currently unused
        (not forwarded to `fft2gammatonemx`)
    norm : {None, 1, np.inf} [scalar]
        accepted for signature symmetry with `mel` but currently unused
    dtype : np.dtype
        accepted for signature symmetry with `mel` but currently unused;
        the output dtype is whatever `fft2gammatonemx` produces

    Returns
    -------
    G : np.ndarray [shape=(n_bins, 1 + n_fft/2)]
        Gammatone transform matrix
    """
    if fmax is None:
        fmax = float(sr) / 2
    n_bins = int(n_bins)

    # Only the first `n_fft // 2 + 1` (one-sided) FFT bins are kept.
    weights, _ = fft2gammatonemx(sr=sr, n_fft=n_fft, n_bins=n_bins,
                                 fmin=fmin, fmax=fmax,
                                 maxlen=int(n_fft // 2 + 1))

    # Scale by 1/n_fft so the overall gain does not grow with FFT length.
    return (1 / n_fft) * weights
def mel_to_hz(mels, htk=False):
    """Convert mel bin numbers back to frequencies in Hz.

    Examples
    --------
    >>> mel_to_hz(3)
    200.

    Parameters
    ----------
    mels : np.ndarray [shape=(n,)], float
        mel bins to convert
    htk : bool
        use the HTK formula instead of Slaney's Auditory Toolbox scale

    Returns
    -------
    frequencies : np.ndarray [shape=(n,)]
        input mels converted to Hz

    See Also
    --------
    hz_to_mel
    """
    mels = np.asanyarray(mels)

    if htk:
        # HTK formula: inverse of 2595 * log10(1 + f / 700).
        return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)

    # Slaney scale: linear below 1 kHz, logarithmic above.
    lin_slope = 200.0 / 3              # Hz per mel on the linear segment
    break_hz = 1000.0                  # start of the log region (Hz)
    break_mel = break_hz / lin_slope   # same break point, in mels
    log_step = np.log(6.4) / 27.0      # growth rate of the log region

    freqs = lin_slope * mels
    if mels.ndim:
        # Vector input: patch the log-region entries in place.
        above = mels >= break_mel
        freqs[above] = break_hz * np.exp(log_step * (mels[above] - break_mel))
    elif mels >= break_mel:
        # Scalar input in the log region.
        freqs = break_hz * np.exp(log_step * (mels - break_mel))
    return freqs
def hz_to_mel(frequencies, htk=False):
    """Convert frequencies in Hz to mel bin numbers.

    Examples
    --------
    >>> hz_to_mel(60)
    0.9

    Parameters
    ----------
    frequencies : number or np.ndarray [shape=(n,)], float
        scalar or array of frequencies in Hz
    htk : bool
        use the HTK formula instead of Slaney's Auditory Toolbox scale

    Returns
    -------
    mels : number or np.ndarray [shape=(n,)]
        input frequencies converted to mels

    See Also
    --------
    mel_to_hz
    """
    frequencies = np.asanyarray(frequencies)

    if htk:
        return 2595.0 * np.log10(1.0 + frequencies / 700.0)

    # Slaney scale: linear below 1 kHz, logarithmic above.
    lin_slope = 200.0 / 3              # Hz per mel on the linear segment
    break_hz = 1000.0                  # start of the log region (Hz)
    break_mel = break_hz / lin_slope   # same break point, in mels
    log_step = np.log(6.4) / 27.0      # growth rate of the log region

    mels = frequencies / lin_slope
    if frequencies.ndim:
        # Vector input: patch the log-region entries in place.
        above = frequencies >= break_hz
        mels[above] = break_mel + np.log(frequencies[above] / break_hz) / log_step
    elif frequencies >= break_hz:
        # Scalar input in the log region: check directly.
        mels = break_mel + np.log(frequencies / break_hz) / log_step
    return mels
def fft_frequencies(sr=22050, n_fft=2048):
    '''Alternative implementation of `np.fft.fftfreq` (one-sided, positive bins).

    Parameters
    ----------
    sr : number > 0 [scalar]
        Audio sampling rate
    n_fft : int > 0 [scalar]
        FFT window size

    Returns
    -------
    freqs : np.ndarray [shape=(1 + n_fft/2,)]
        Frequencies `(0, sr/n_fft, 2*sr/n_fft, ..., sr/2)`

    Examples
    --------
    >>> fft_frequencies(sr=22050, n_fft=16)
    array([  0.   ,  1378.125,  2756.25 ,  4134.375,
            5512.5 ,  6890.625,  8268.75 ,  9646.875, 11025.   ])
    '''
    # Evenly spaced bin centers from DC up to (and including) Nyquist.
    n_bins = int(1 + n_fft // 2)
    return np.linspace(0, float(sr) / 2, n_bins, endpoint=True)
def mel_frequencies(n_mels=128, fmin=0.0, fmax=11025.0, htk=False):
    """Compute the center frequencies of mel bands (clone of librosa 0.7).

    See the original
    `documentation <https://librosa.org/doc/latest/generated/librosa.mel_frequencies.html?highlight=mel_frequencies#librosa.mel_frequencies>`__
    for more info.

    Parameters
    ----------
    n_mels : int > 0 [scalar]
        Number of mel bins.
    fmin : float >= 0 [scalar]
        Minimum frequency (Hz).
    fmax : float >= 0 [scalar]
        Maximum frequency (Hz).
    htk : bool
        If True, use the HTK formula to convert Hz to mel;
        otherwise use Slaney's Auditory Toolbox.

    Returns
    -------
    bin_frequencies : ndarray [shape=(n_mels,)]
        Vector of n_mels frequencies in Hz, uniformly spaced on the mel axis.
    """
    # Convert the Hz limits to mels, sample uniformly there, convert back.
    lo_mel = hz_to_mel(fmin, htk=htk)
    hi_mel = hz_to_mel(fmax, htk=htk)
    return mel_to_hz(np.linspace(lo_mel, hi_mel, n_mels), htk=htk)
def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False,
        norm=1, dtype=np.float32):
    """Create a Filterbank matrix to combine FFT bins into Mel-frequency bins.

    Cloned from librosa 0.7; see the original
    `documentation <https://librosa.org/doc/latest/generated/librosa.filters.mel.html>`__
    for more info.

    Parameters
    ----------
    sr : number > 0 [scalar]
        sampling rate of the incoming signal
    n_fft : int > 0 [scalar]
        number of FFT components
    n_mels : int > 0 [scalar]
        number of Mel bands to generate
    fmin : float >= 0 [scalar]
        lowest frequency (in Hz)
    fmax : float >= 0 [scalar]
        highest frequency (in Hz); if `None`, use `fmax = sr / 2.0`
    htk : bool [scalar]
        use HTK formula instead of Slaney
    norm : {None, 1, np.inf} [scalar]
        if 1, divide the triangular mel weights by the width of the mel
        band (area normalization); otherwise leave all triangles aiming
        for a peak value of 1.0
    dtype : np.dtype
        the data type of the output basis
        (32-bit single-precision float by default)

    Returns
    -------
    M : np.ndarray [shape=(n_mels, 1 + n_fft/2)]
        Mel transform matrix

    Raises
    ------
    ParameterError
        if `norm` is not one of {None, 1, np.inf}
    """
    if fmax is None:
        fmax = float(sr) / 2
    if norm is not None and norm != 1 and norm != np.inf:
        raise ParameterError('Unsupported norm: {}'.format(repr(norm)))

    n_mels = int(n_mels)
    n_freqs = int(1 + n_fft // 2)
    weights = np.zeros((n_mels, n_freqs), dtype=dtype)

    # Center frequency of every FFT bin, and of every mel band edge
    # (n_mels + 2 edges: each band spans three consecutive edges).
    bin_freqs = fft_frequencies(sr=sr, n_fft=n_fft)
    band_edges = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)

    band_widths = np.diff(band_edges)
    # gaps[e, b] = band_edges[e] - bin_freqs[b]
    gaps = np.subtract.outer(band_edges, bin_freqs)

    for band in range(n_mels):
        # Rising and falling slopes of the triangular filter for this band,
        # intersected with each other and clipped at zero.
        rising = -gaps[band] / band_widths[band]
        falling = gaps[band + 2] / band_widths[band + 1]
        weights[band] = np.maximum(0, np.minimum(rising, falling))

    if norm == 1:
        # Slaney-style mel is scaled to be approx constant energy per channel.
        area_norm = 2.0 / (band_edges[2:n_mels + 2] - band_edges[:n_mels])
        weights *= area_norm[:, np.newaxis]

    # Only check weights of bands whose lower edge is positive.
    if not np.all((band_edges[:-2] == 0) | (weights.max(axis=1) > 0)):
        # Some channel ended up with no FFT bins under its triangle.
        warnings.warn('Empty filters detected in mel frequency basis. '
                      'Some channels will produce empty responses. '
                      'Try increasing your sampling rate (and fmax) or '
                      'reducing n_mels.')

    return weights
### ------------------End of Functions for generating kenral for Mel Spectrogram ----------------###
### ------------------Functions for making STFT same as librosa ---------------------------------###
def pad_center(data, size, axis=-1, **kwargs):
    '''Wrapper for np.pad that centers an array before padding
    (analogous to `str.center()`).

    Examples
    --------
    >>> pad_center(np.ones(5), 10, mode='constant')
    array([ 0.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.])

    Parameters
    ----------
    data : np.ndarray
        Vector to be padded and centered
    size : int >= len(data) [scalar]
        Length to pad `data` to
    axis : int
        Axis along which to pad and center the data
    kwargs : additional keyword arguments
        passed on to `np.pad()` (``mode`` defaults to ``'constant'``)

    Returns
    -------
    data_padded : np.ndarray
        `data` centered and padded to length `size` along `axis`

    Raises
    ------
    ParameterError
        If `size < data.shape[axis]`

    See Also
    --------
    numpy.pad
    '''
    kwargs.setdefault('mode', 'constant')

    current = data.shape[axis]
    # Left pad gets the floor of the slack; the remainder goes on the right.
    pad_left = int((size - current) // 2)

    pad_widths = [(0, 0)] * data.ndim
    pad_widths[axis] = (pad_left, int(size - current - pad_left))

    if pad_left < 0:
        raise ParameterError(('Target size ({:d}) must be '
                              'at least input size ({:d})').format(size, current))

    return np.pad(data, pad_widths, **kwargs)
### ------------------End of functions for making STFT same as librosa ---------------------------###
third_party/nnAudio/nnAudio/utils.py
0 → 100644
浏览文件 @
a1c6ee5c
此差异已折叠。
点击以展开。
third_party/nnAudio/setup.py
0 → 100755
浏览文件 @
a1c6ee5c
import
setuptools
import
codecs
import
os.path
# Use the project README verbatim as the PyPI long description.
with open("README.md", "r") as readme:
    long_description = readme.read()
def read(rel_path):
    """Return the contents of *rel_path*, resolved relative to this file."""
    base_dir = os.path.abspath(os.path.dirname(__file__))
    with codecs.open(os.path.join(base_dir, rel_path), 'r') as handle:
        return handle.read()
def get_version(rel_path):
    """Extract the ``__version__`` string from the given file.

    Raises RuntimeError when no ``__version__`` assignment is found.
    """
    for line in read(rel_path).splitlines():
        if line.startswith('__version__'):
            # The version may be quoted with either " or '.
            quote = '"' if '"' in line else "'"
            return line.split(quote)[1]
    raise RuntimeError("Unable to find version string.")
# Package metadata for nnAudio. The version is read out of the package's
# __init__.py so it is defined in exactly one place.
setuptools.setup(
    name="nnAudio",
    version=get_version("nnAudio/__init__.py"),
    author="KinWaiCheuk",
    author_email="u3500684@connect.hku.hk",
    description="A fast GPU audio processing toolbox with 1D convolutional neural network",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/KinWaiCheuk/nnAudio",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
)
third_party/nnAudio/tests/parameters.py
0 → 100644
浏览文件 @
a1c6ee5c
# Creating parameters for STFT test
"""
It is equivalent to
[(1024, 128, 'ones'),
(1024, 128, 'hann'),
(1024, 128, 'hamming'),
(2048, 128, 'ones'),
(2048, 512, 'ones'),
(2048, 128, 'hann'),
(2048, 512, 'hann'),
(2048, 128, 'hamming'),
(2048, 512, 'hamming'),
(None, None, None)]
"""
stft_parameters = []
n_fft = [1024, 2048]
hop_length = {128, 512, 1024}
window = ['ones', 'hann', 'hamming']

# Keep only hop lengths strictly below half the FFT size.
for size in n_fft:
    for win in window:
        for hop in hop_length:
            if hop < (size / 2):
                stft_parameters.append((size, hop, win))
# One case with a default (None) hop length.
stft_parameters.append((256, None, 'hann'))

stft_with_win_parameters = []
n_fft = [512, 1024]
win_length = [400, 900]
hop_length = {128, 256}

# Window length must fit inside the FFT; hop below half the FFT size.
for size in n_fft:
    for length in win_length:
        if length < size:
            for hop in hop_length:
                if hop < (size / 2):
                    stft_with_win_parameters.append((size, length, hop))

mel_win_parameters = [(512, 400), (1024, 1000)]
\ No newline at end of file
third_party/nnAudio/tests/test_spectrogram.py
0 → 100644
浏览文件 @
a1c6ee5c
import
pytest
import
librosa
import
torch
import
matplotlib.pyplot
as
plt
from
scipy.signal
import
chirp
,
sweep_poly
from
nnAudio.Spectrogram
import
*
from
parameters
import
*
# Index of the CUDA device used by the GPU-parametrized tests below.
gpu_idx = 0

# librosa example audio for testing
example_y, example_sr = librosa.load(librosa.util.example_audio_file())
@pytest.mark.parametrize("n_fft, hop_length, window", stft_parameters)
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_inverse2(n_fft, hop_length, window, device):
    # Round trip through separate STFT and iSTFT layers should
    # reconstruct the original waveform.
    signal = torch.tensor(example_y, device=device)
    fwd = STFT(n_fft=n_fft, hop_length=hop_length, window=window).to(device)
    bwd = iSTFT(n_fft=n_fft, hop_length=hop_length, window=window).to(device)
    spec = fwd(signal.unsqueeze(0), output_format="Complex")
    recon = bwd(spec, length=signal.shape[0], onesided=True).squeeze()
    assert np.allclose(signal.cpu(), recon.cpu(), rtol=1e-5, atol=1e-3)
@pytest.mark.parametrize("n_fft, hop_length, window", stft_parameters)
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_inverse(n_fft, hop_length, window, device):
    # Round trip using a single STFT layer built with its own inverse.
    signal = torch.tensor(example_y, device=device)
    layer = STFT(n_fft=n_fft, hop_length=hop_length, window=window,
                 iSTFT=True).to(device)
    spec = layer(signal.unsqueeze(0), output_format="Complex")
    recon = layer.inverse(spec, length=signal.shape[0]).squeeze()
    assert np.allclose(signal.cpu(), recon.cpu(), rtol=1e-3, atol=1)
# @pytest.mark.parametrize("n_fft, hop_length, window", stft_parameters)
# def test_inverse_GPU(n_fft, hop_length, window):
# x = torch.tensor(example_y,device=f'cuda:{gpu_idx}')
# stft = STFT(n_fft=n_fft, hop_length=hop_length, window=window, device=f'cuda:{gpu_idx}')
# X = stft(x.unsqueeze(0), output_format="Complex")
# x_recon = stft.inverse(X, num_samples=x.shape[0]).squeeze()
# assert np.allclose(x.cpu(), x_recon.cpu(), rtol=1e-3, atol=1)
@pytest.mark.parametrize("n_fft, hop_length, window", stft_parameters)
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_stft_complex(n_fft, hop_length, window, device):
    # The complex STFT output should match librosa's within tolerance.
    signal = example_y
    layer = STFT(n_fft=n_fft, hop_length=hop_length, window=window).to(device)
    spec = layer(torch.tensor(signal, device=device).unsqueeze(0),
                 output_format="Complex")
    re_part = spec[:, :, :, 0].squeeze()
    im_part = spec[:, :, :, 1].squeeze()
    reference = librosa.stft(signal, n_fft=n_fft,
                             hop_length=hop_length, window=window)
    re_ok = np.allclose(re_part.cpu(), reference.real, rtol=1e-3, atol=1e-3)
    im_ok = np.allclose(im_part.cpu(), reference.imag, rtol=1e-3, atol=1e-3)
    assert re_ok and im_ok
# @pytest.mark.parametrize("n_fft, hop_length, window", stft_parameters)
# def test_stft_complex_GPU(n_fft, hop_length, window):
# x = example_y
# stft = STFT(n_fft=n_fft, hop_length=hop_length, window=window, device=f'cuda:{gpu_idx}')
# X = stft(torch.tensor(x,device=f'cuda:{gpu_idx}').unsqueeze(0), output_format="Complex")
# X_real, X_imag = X[:, :, :, 0].squeeze().detach().cpu(), X[:, :, :, 1].squeeze().detach().cpu()
# X_librosa = librosa.stft(x, n_fft=n_fft, hop_length=hop_length, window=window)
# real_diff, imag_diff = np.allclose(X_real, X_librosa.real, rtol=1e-3, atol=1e-3), \
# np.allclose(X_imag, X_librosa.imag, rtol=1e-3, atol=1e-3)
# assert real_diff and imag_diff
@pytest.mark.parametrize("n_fft, win_length, hop_length", stft_with_win_parameters)
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_stft_complex_winlength(n_fft, win_length, hop_length, device):
    # Same as test_stft_complex, but with an explicit window length.
    signal = example_y
    layer = STFT(n_fft=n_fft, win_length=win_length,
                 hop_length=hop_length).to(device)
    spec = layer(torch.tensor(signal, device=device).unsqueeze(0),
                 output_format="Complex")
    re_part = spec[:, :, :, 0].squeeze()
    im_part = spec[:, :, :, 1].squeeze()
    reference = librosa.stft(signal, n_fft=n_fft,
                             win_length=win_length, hop_length=hop_length)
    re_ok = np.allclose(re_part.cpu(), reference.real, rtol=1e-3, atol=1e-3)
    im_ok = np.allclose(im_part.cpu(), reference.imag, rtol=1e-3, atol=1e-3)
    assert re_ok and im_ok
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_stft_magnitude(device):
    # Magnitude output should match librosa's magphase magnitude.
    signal = example_y
    layer = STFT(n_fft=2048, hop_length=512).to(device)
    spec = layer(torch.tensor(signal, device=device).unsqueeze(0),
                 output_format="Magnitude").squeeze()
    reference, _ = librosa.core.magphase(
        librosa.stft(signal, n_fft=2048, hop_length=512))
    assert np.allclose(spec.cpu(), reference, rtol=1e-3, atol=1e-3)
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_stft_phase(device):
    # Compare cos/sin of the predicted phase against librosa's phasor.
    signal = example_y
    layer = STFT(n_fft=2048, hop_length=512).to(device)
    phase = layer(torch.tensor(signal, device=device).unsqueeze(0),
                  output_format="Phase")
    cos_part = torch.cos(phase).squeeze()
    sin_part = torch.sin(phase).squeeze()
    _, reference = librosa.core.magphase(
        librosa.stft(signal, n_fft=2048, hop_length=512))

    # np.allclose is too strict for phase agreement with librosa, so the
    # test metric is the average element-wise distance instead.
    re_err = np.mean(np.abs(cos_part.cpu().numpy() - reference.real))
    im_err = np.mean(np.abs(sin_part.cpu().numpy() - reference.imag))
    assert re_err < 2e-4 and im_err < 2e-4
@pytest.mark.parametrize("n_fft, win_length", mel_win_parameters)
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_mel_spectrogram(n_fft, win_length, device):
    # MelSpectrogram should match librosa's melspectrogram output.
    signal = example_y
    layer = MelSpectrogram(n_fft=n_fft, win_length=win_length,
                           hop_length=512).to(device)
    spec = layer(torch.tensor(signal, device=device).unsqueeze(0)).squeeze()
    reference = librosa.feature.melspectrogram(
        signal, n_fft=n_fft, win_length=win_length, hop_length=512)
    assert np.allclose(spec.cpu(), reference, rtol=1e-3, atol=1e-3)
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_cqt_1992(device):
    # Smoke test: CQT1992 should run for every output format on a log sweep.
    fs = 44100
    duration = 1
    timeline = np.linspace(0, duration, fs * duration)
    sweep = chirp(timeline, 55, 1, 22050, method='logarithmic')
    sweep = sweep.astype(dtype=np.float32)
    batch = torch.tensor(sweep, device=device).unsqueeze(0)

    # Magnitude and Complex use 80 bins; Phase uses 160.
    for fmt, bins in [("Magnitude", 80), ("Complex", 80), ("Phase", 160)]:
        layer = CQT1992(sr=fs, fmin=220, output_format=fmt,
                        n_bins=bins, bins_per_octave=24).to(device)
        layer(batch)
    assert True
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_cqt_2010(device):
    # Smoke test: CQT2010 should run for every output format on a log sweep.
    fs = 44100
    duration = 1
    timeline = np.linspace(0, duration, fs * duration)
    sweep = chirp(timeline, 55, 1, 22050, method='logarithmic')
    sweep = sweep.astype(dtype=np.float32)
    batch = torch.tensor(sweep, device=device).unsqueeze(0)

    for fmt in ("Magnitude", "Complex", "Phase"):
        layer = CQT2010(sr=fs, fmin=110, output_format=fmt,
                        n_bins=160, bins_per_octave=24).to(device)
        layer(batch)
    assert True
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_cqt_1992_v2_log(device):
    # CQT1992v2 on a logarithmic sweep, checked against stored ground truth.
    fs = 44100
    duration = 1
    timeline = np.linspace(0, duration, fs * duration)
    sweep = chirp(timeline, 55, 1, 22050, method='logarithmic')
    sweep = sweep.astype(dtype=np.float32)
    batch = torch.tensor(sweep, device=device).unsqueeze(0)

    # Magnitude (compared in log domain)
    layer = CQT1992v2(sr=fs, fmin=55, output_format="Magnitude",
                      n_bins=207, bins_per_octave=24).to(device)
    spec = torch.log(layer(batch) + 1e-5)
    expected = np.load("tests/ground-truths/log-sweep-cqt-1992-mag-ground-truth.npy")
    assert np.allclose(spec.cpu(), expected, rtol=1e-3, atol=1e-3)

    # Complex
    layer = CQT1992v2(sr=fs, fmin=55, output_format="Complex",
                      n_bins=207, bins_per_octave=24).to(device)
    spec = layer(batch)
    expected = np.load("tests/ground-truths/log-sweep-cqt-1992-complex-ground-truth.npy")
    assert np.allclose(spec.cpu(), expected, rtol=1e-3, atol=1e-3)

    # Phase
    layer = CQT1992v2(sr=fs, fmin=55, output_format="Phase",
                      n_bins=207, bins_per_octave=24).to(device)
    spec = layer(batch)
    expected = np.load("tests/ground-truths/log-sweep-cqt-1992-phase-ground-truth.npy")
    assert np.allclose(spec.cpu(), expected, rtol=1e-3, atol=1e-3)
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_cqt_1992_v2_linear(device):
    # CQT1992v2 on a linear sweep, checked against stored ground truth.
    fs = 44100
    duration = 1
    timeline = np.linspace(0, duration, fs * duration)
    sweep = chirp(timeline, 55, 1, 22050, method='linear')
    sweep = sweep.astype(dtype=np.float32)
    batch = torch.tensor(sweep, device=device).unsqueeze(0)

    # Magnitude (compared in log domain)
    layer = CQT1992v2(sr=fs, fmin=55, output_format="Magnitude",
                      n_bins=207, bins_per_octave=24).to(device)
    spec = torch.log(layer(batch) + 1e-5)
    expected = np.load("tests/ground-truths/linear-sweep-cqt-1992-mag-ground-truth.npy")
    assert np.allclose(spec.cpu(), expected, rtol=1e-3, atol=1e-3)

    # Complex
    layer = CQT1992v2(sr=fs, fmin=55, output_format="Complex",
                      n_bins=207, bins_per_octave=24).to(device)
    spec = layer(batch)
    expected = np.load("tests/ground-truths/linear-sweep-cqt-1992-complex-ground-truth.npy")
    assert np.allclose(spec.cpu(), expected, rtol=1e-3, atol=1e-3)

    # Phase
    layer = CQT1992v2(sr=fs, fmin=55, output_format="Phase",
                      n_bins=207, bins_per_octave=24).to(device)
    spec = layer(batch)
    expected = np.load("tests/ground-truths/linear-sweep-cqt-1992-phase-ground-truth.npy")
    assert np.allclose(spec.cpu(), expected, rtol=1e-3, atol=1e-3)
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_cqt_2010_v2_log(device):
    # CQT2010v2 on a logarithmic sweep, checked against stored ground truth.
    fs = 44100
    duration = 1
    timeline = np.linspace(0, duration, fs * duration)
    sweep = chirp(timeline, 55, 1, 22050, method='logarithmic')
    sweep = sweep.astype(dtype=np.float32)
    batch = torch.tensor(sweep, device=device).unsqueeze(0)

    # Magnitude (compared in log domain)
    layer = CQT2010v2(sr=fs, fmin=55, output_format="Magnitude",
                      n_bins=207, bins_per_octave=24).to(device)
    spec = torch.log(layer(batch) + 1e-2)
    # np.save("tests/ground-truths/log-sweep-cqt-2010-mag-ground-truth", spec.cpu())
    expected = np.load("tests/ground-truths/log-sweep-cqt-2010-mag-ground-truth.npy")
    assert np.allclose(spec.cpu(), expected, rtol=1e-3, atol=1e-3)

    # Complex
    layer = CQT2010v2(sr=fs, fmin=55, output_format="Complex",
                      n_bins=207, bins_per_octave=24).to(device)
    spec = layer(batch)
    # np.save("tests/ground-truths/log-sweep-cqt-2010-complex-ground-truth", spec.cpu())
    expected = np.load("tests/ground-truths/log-sweep-cqt-2010-complex-ground-truth.npy")
    assert np.allclose(spec.cpu(), expected, rtol=1e-3, atol=1e-3)

    # NOTE: the Phase comparison is disabled upstream
    # (the stored phase ground truth is too strict to match reliably).
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_cqt_2010_v2_linear(device):
    # CQT2010v2 on a linear sweep, checked against stored ground truth.
    fs = 44100
    duration = 1
    timeline = np.linspace(0, duration, fs * duration)
    sweep = chirp(timeline, 55, 1, 22050, method='linear')
    sweep = sweep.astype(dtype=np.float32)
    batch = torch.tensor(sweep, device=device).unsqueeze(0)

    # Magnitude (compared in log domain)
    layer = CQT2010v2(sr=fs, fmin=55, output_format="Magnitude",
                      n_bins=207, bins_per_octave=24).to(device)
    spec = torch.log(layer(batch) + 1e-2)
    # np.save("tests/ground-truths/linear-sweep-cqt-2010-mag-ground-truth", spec.cpu())
    expected = np.load("tests/ground-truths/linear-sweep-cqt-2010-mag-ground-truth.npy")
    assert np.allclose(spec.cpu(), expected, rtol=1e-3, atol=1e-3)

    # Complex
    layer = CQT2010v2(sr=fs, fmin=55, output_format="Complex",
                      n_bins=207, bins_per_octave=24).to(device)
    spec = layer(batch)
    # np.save("tests/ground-truths/linear-sweep-cqt-2010-complex-ground-truth", spec.cpu())
    expected = np.load("tests/ground-truths/linear-sweep-cqt-2010-complex-ground-truth.npy")
    assert np.allclose(spec.cpu(), expected, rtol=1e-3, atol=1e-3)

    # NOTE: the Phase comparison is disabled upstream
    # (the stored phase ground truth is too strict to match reliably).
@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
def test_mfcc(device):
    # MFCC output should match librosa's mfcc within tolerance.
    signal = example_y
    layer = MFCC(sr=example_sr).to(device)
    spec = layer(torch.tensor(signal, device=device).unsqueeze(0)).squeeze()
    reference = librosa.feature.mfcc(signal, sr=example_sr)
    assert np.allclose(spec.cpu(), reference, rtol=1e-3, atol=1e-3)
# Create a batch of input for the following DataParallel tests
# (4 one-second signals at 44.1 kHz).
x = torch.randn((4, 44100))
@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])
def test_STFT_Parallel(device):
    # STFT/iSTFT round trip through torch.nn.DataParallel wrappers.
    fwd = STFT(hop_length=512, n_fft=2048, window='hann',
               freq_scale='no', output_format='Complex').to(device)
    bwd = iSTFT(hop_length=512, n_fft=2048, window='hann',
                freq_scale='no').to(device)
    fwd_parallel = torch.nn.DataParallel(fwd)
    bwd_parallel = torch.nn.DataParallel(bwd)

    spec = fwd_parallel(x)
    recon = bwd_parallel(spec, onesided=True, length=x.shape[-1])
    assert np.allclose(recon.detach().cpu(), x.detach().cpu(),
                       rtol=1e-3, atol=1e-3)
@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])
def test_MelSpectrogram_Parallel(device):
    # Smoke test: MelSpectrogram forward pass under DataParallel.
    layer = MelSpectrogram(sr=22050, n_fft=2048, n_mels=128, hop_length=512,
                           window='hann', center=True, pad_mode='reflect',
                           power=2.0, htk=False, fmin=0.0, fmax=None,
                           norm=1, verbose=True).to(device)
    parallel = torch.nn.DataParallel(layer)
    parallel(x)
@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])
def test_MFCC_Parallel(device):
    # Smoke test: MFCC forward pass under DataParallel.
    layer = MFCC().to(device)
    parallel = torch.nn.DataParallel(layer)
    parallel(x)
@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])
def test_CQT1992_Parallel(device):
    # Smoke test: CQT1992 forward pass under DataParallel.
    layer = CQT1992(fmin=110, n_bins=60, bins_per_octave=12).to(device)
    parallel = torch.nn.DataParallel(layer)
    parallel(x)
@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])
def test_CQT1992v2_Parallel(device):
    # Smoke test: CQT1992v2 (default settings) under DataParallel.
    layer = CQT1992v2().to(device)
    parallel = torch.nn.DataParallel(layer)
    parallel(x)
@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])
def test_CQT2010_Parallel(device):
    # Smoke test: CQT2010 (default settings) under DataParallel.
    layer = CQT2010().to(device)
    parallel = torch.nn.DataParallel(layer)
    parallel(x)
@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])
def test_CQT2010v2_Parallel(device):
    # Smoke test: CQT2010v2 (default settings) under DataParallel.
    layer = CQT2010v2().to(device)
    parallel = torch.nn.DataParallel(layer)
    parallel(x)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录