Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
44ec1931
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
44ec1931
编写于
8月 17, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refactor io
上级
ecfac8ca
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
489 addition
and
459 deletion
+489
-459
deepspeech/io/dataloader.py
deepspeech/io/dataloader.py
+3
-63
deepspeech/io/dataset.py
deepspeech/io/dataset.py
+76
-7
deepspeech/io/reader.py
deepspeech/io/reader.py
+409
-0
deepspeech/io/utility.py
deepspeech/io/utility.py
+1
-389
未找到文件。
deepspeech/io/dataloader.py
浏览文件 @
44ec1931
...
...
@@ -11,80 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
from
paddle.io
import
DataLoader
from
deepspeech.frontend.utility
import
read_manifest
from
deepspeech.io.batchfy
import
make_batchset
from
deepspeech.io.dataset
import
CustomConverter
from
deepspeech.io.dataset
import
TransformDataset
from
deepspeech.io.utility
import
LoadInputsAndTargets
from
deepspeech.io.utility
import
pad_list
from
deepspeech.io.reader
import
LoadInputsAndTargets
from
deepspeech.utils.log
import
Log
__all__
=
[
"
CustomConverter"
,
"
BatchDataLoader"
]
__all__
=
[
"BatchDataLoader"
]
logger
=
Log
(
__name__
).
getlog
()
class CustomConverter():
    """Batch converter that pads variable-length inputs and targets.

    Args:
        subsampling_factor (int): Keep every ``subsampling_factor``-th frame
            of each input; values not greater than 1 leave inputs untouched.
        dtype (np.dtype): Data type the padded inputs are cast to.
    """

    def __init__(self, subsampling_factor=1, dtype=np.float32):
        """Store the conversion settings."""
        self.subsampling_factor = subsampling_factor
        # Padding value used for the target sequences.
        self.ignore_id = -1
        self.dtype = dtype

    def __call__(self, batch):
        """Pad one mini-batch and compute its sequence lengths.

        Args:
            batch (list): A one-element list holding ``((xs, ys), utts)``.

        Returns:
            tuple: ``(utts, xs_pad, ilens, ys_pad, olens)``. ``xs_pad`` is a
            dict with ``"real"`` and ``"imag"`` entries when the first input
            has a complex dtype, otherwise a single padded array.
        """
        # The whole mini-batch is expected to be wrapped in a 1-element list.
        assert len(batch) == 1
        (xs, ys), utts = batch[0]

        # Optional frame subsampling along the first axis.
        step = self.subsampling_factor
        if step > 1:
            xs = [feat[::step, :] for feat in xs]

        # Input lengths are taken after subsampling.
        ilens = np.array([feat.shape[0] for feat in xs])

        # Complex inputs are split into separately padded real/imag parts.
        is_complex = xs[0].dtype.kind == "c"
        if not is_complex:
            xs_pad = pad_list(xs, 0).astype(self.dtype)
        else:
            real_part = pad_list([feat.real for feat in xs], 0)
            xs_pad_real = real_part.astype(self.dtype)
            imag_part = pad_list([feat.imag for feat in xs], 0)
            xs_pad_imag = imag_part.astype(self.dtype)
            # Note(kamo):
            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
            # Don't create ComplexTensor and give it E2E here
            # because torch.nn.DataParellel can't handle it.
            xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}

        # NOTE: tuple targets occur for multi-output tasks (e.g. speech
        # translation); only the first element of the tuple is used.
        targets = [
            np.array(tgt[0][:]) if isinstance(tgt, tuple) else tgt
            for tgt in ys
        ]
        ys_pad = pad_list(targets, self.ignore_id)

        target_lengths = [
            tgt[0].shape[0] if isinstance(tgt, tuple) else tgt.shape[0]
            for tgt in ys
        ]
        olens = np.array(target_lengths)
        return utts, xs_pad, ilens, ys_pad, olens
class
BatchDataLoader
():
def
__init__
(
self
,
json_file
:
str
,
...
...
deepspeech/io/dataset.py
浏览文件 @
44ec1931
...
...
@@ -17,9 +17,13 @@ from paddle.io import Dataset
from
yacs.config
import
CfgNode
from
deepspeech.frontend.utility
import
read_manifest
from
deepspeech.io.utility
import
pad_list
from
deepspeech.utils.log
import
Log
__all__
=
[
"ManifestDataset"
,
"TripletManifestDataset"
,
"TransformDataset"
]
__all__
=
[
"ManifestDataset"
,
"TripletManifestDataset"
,
"TransformDataset"
,
"CustomConverter"
]
logger
=
Log
(
__name__
).
getlog
()
...
...
@@ -76,12 +80,18 @@ class ManifestDataset(Dataset):
Args:
manifest_path (str): manifest josn file path
max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0.
min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0.
max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0.
min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05.
max_input_len ([type], optional): maximum output seq length,
in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
min_input_len (float, optional): minimum input seq length,
in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
max_output_len (float, optional): maximum input seq length,
in modeling units. Defaults to 500.0.
min_output_len (float, optional): minimum input seq length,
in modeling units. Defaults to 0.0.
max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio.
Defaults to 10.0.
min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio.
Defaults to 0.05.
"""
super
().
__init__
()
...
...
@@ -118,6 +128,65 @@ class TripletManifestDataset(ManifestDataset):
"text1"
]
class CustomConverter():
    """Batch converter that pads variable-length inputs and targets.

    Args:
        subsampling_factor (int): Keep every ``subsampling_factor``-th frame
            of each input; values not greater than 1 leave inputs untouched.
        dtype (np.dtype): Data type the padded inputs are cast to.
    """

    def __init__(self, subsampling_factor=1, dtype=np.float32):
        """Store the conversion settings."""
        self.subsampling_factor = subsampling_factor
        # Padding value used for the target sequences.
        self.ignore_id = -1
        self.dtype = dtype

    def __call__(self, batch):
        """Pad one mini-batch and compute its sequence lengths.

        Args:
            batch (list): A one-element list holding ``((xs, ys), utts)``.

        Returns:
            tuple: ``(utts, xs_pad, ilens, ys_pad, olens)``. ``xs_pad`` is a
            dict with ``"real"`` and ``"imag"`` entries when the first input
            has a complex dtype, otherwise a single padded array.
        """
        # The whole mini-batch is expected to be wrapped in a 1-element list.
        assert len(batch) == 1
        (xs, ys), utts = batch[0]

        # Optional frame subsampling along the first axis.
        step = self.subsampling_factor
        if step > 1:
            xs = [feat[::step, :] for feat in xs]

        # Input lengths are taken after subsampling.
        ilens = np.array([feat.shape[0] for feat in xs])

        # Complex inputs are split into separately padded real/imag parts.
        is_complex = xs[0].dtype.kind == "c"
        if not is_complex:
            xs_pad = pad_list(xs, 0).astype(self.dtype)
        else:
            real_part = pad_list([feat.real for feat in xs], 0)
            xs_pad_real = real_part.astype(self.dtype)
            imag_part = pad_list([feat.imag for feat in xs], 0)
            xs_pad_imag = imag_part.astype(self.dtype)
            # Note(kamo):
            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
            # Don't create ComplexTensor and give it E2E here
            # because torch.nn.DataParellel can't handle it.
            xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}

        # NOTE: tuple targets occur for multi-output tasks (e.g. speech
        # translation); only the first element of the tuple is used.
        targets = [
            np.array(tgt[0][:]) if isinstance(tgt, tuple) else tgt
            for tgt in ys
        ]
        ys_pad = pad_list(targets, self.ignore_id)

        target_lengths = [
            tgt[0].shape[0] if isinstance(tgt, tuple) else tgt.shape[0]
            for tgt in ys
        ]
        olens = np.array(target_lengths)
        return utts, xs_pad, ilens, ys_pad, olens
class
TransformDataset
(
Dataset
):
"""Transform Dataset.
...
...
deepspeech/io/reader.py
0 → 100644
浏览文件 @
44ec1931
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
collections
import
OrderedDict
import
kaldiio
import
numpy
as
np
import
soundfile
from
deepspeech.frontend.augmentor.augmentation
import
AugmentationPipeline
from
deepspeech.utils.log
import
Log
__all__
=
[
"LoadInputsAndTargets"
]
logger
=
Log
(
__name__
).
getlog
()
class LoadInputsAndTargets():
    """Create a mini-batch from a list of ``(uttid, info)`` pairs.

    Example of the expected batch layout::

        batch = [('utt1',
                  dict(input=[dict(feat='some.ark:123',
                                   filetype='mat',
                                   name='input1',
                                   shape=[100, 80])],
                       output=[dict(tokenid='1 2 3 4',
                                    name='target1',
                                    shape=[4, 31])]))]
        l = LoadInputsAndTargets()
        feat, target = l(batch)

    :param: str mode: Task mode; only "asr" is accepted
    :param: str preprocess_conf: Configuration handed to AugmentationPipeline;
        None disables pre-processing
    :param: bool load_input: If False, not to load the input data
    :param: bool load_output: If False, not to load the output data
    :param: bool sort_in_input_length: Sort the mini-batch in descending order
        of the input length
    :param: Optional[dict] preprocess_args: Extra keyword arguments passed to
        the pre-processing call
    :param: bool keep_all_data_on_mem: If True, arrays loaded from "sound",
        "npy", "mat" and "vec" entries are cached in memory by file path
    """

    def __init__(
            self,
            mode="asr",
            preprocess_conf=None,
            load_input=True,
            load_output=True,
            sort_in_input_length=True,
            preprocess_args=None,
            keep_all_data_on_mem=False, ):
        # Cache of opened loaders / loaded arrays, keyed by file path.
        self._loaders = {}

        if mode not in ["asr"]:
            raise ValueError("Only asr are allowed: mode={}".format(mode))

        if preprocess_conf is not None:
            self.preprocessing = AugmentationPipeline(preprocess_conf)
            # Uses the module-level ``logger``; the previous ``logging.warning``
            # referenced a module that is not imported here.
            logger.warning(
                "[Experimental feature] Some preprocessing will be done "
                "for the mini-batch creation using {}".format(
                    self.preprocessing))
        else:
            # If conf doesn't exist, this function don't touch anything.
            self.preprocessing = None

        self.mode = mode
        self.load_output = load_output
        self.load_input = load_input
        self.sort_in_input_length = sort_in_input_length
        if preprocess_args is None:
            self.preprocess_args = {}
        else:
            assert isinstance(preprocess_args, dict), type(preprocess_args)
            # Copy so later changes by the caller do not leak in.
            self.preprocess_args = dict(preprocess_args)

        self.keep_all_data_on_mem = keep_all_data_on_mem

    def __call__(self, batch, return_uttid=False):
        """Load inputs and targets for a list of ``(uttid, info)`` pairs.

        :param List[Tuple[str, dict]] batch: list of dict which is subset of
            loaded data.json
        :param bool return_uttid: also return the utterance IDs, in the same
            order as the returned mini-batch
        :return: tuple with one list of arrays per input name followed by one
            list of arrays per target name; when ``return_uttid`` is True the
            result is ``(that tuple, uttid_list)``
        """
        x_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
        y_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
        uttid_list = []  # List[str]

        for uttid, info in batch:
            uttid_list.append(uttid)

            if self.load_input:
                # Note(kamo): This for-loop is for multiple inputs
                for inp in info["input"]:
                    # {"input":
                    #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
                    #    "filetype": "hdf5",
                    #    "name": "input1", ...}], ...}
                    x = self._get_from_loader(
                        filepath=inp["feat"],
                        filetype=inp.get("filetype", "mat"))
                    x_feats_dict.setdefault(inp["name"], []).append(x)

            if self.load_output:
                for out in info["output"]:
                    if "tokenid" in out:
                        # ======= Legacy format for output =======
                        # {"output": [{"tokenid": "1 2 3 4"}])
                        y = np.fromiter(
                            map(int, out["tokenid"].split()), dtype=np.int64)
                    else:
                        # ======= New format =======
                        # {"output":
                        #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
                        #    "filetype": "hdf5",
                        #    "name": "target1", ...}], ...}
                        y = self._get_from_loader(
                            filepath=out["feat"],
                            filetype=out.get("filetype", "mat"))
                    y_feats_dict.setdefault(out["name"], []).append(y)

        if self.mode == "asr":
            return_batch, uttid_list = self._create_batch_asr(
                x_feats_dict, y_feats_dict, uttid_list)
        else:
            raise NotImplementedError(self.mode)

        if self.preprocessing is not None:
            # Apply pre-processing all input features
            for x_name in return_batch.keys():
                if x_name.startswith("input"):
                    return_batch[x_name] = self.preprocessing(
                        return_batch[x_name], uttid_list,
                        **self.preprocess_args)

        # Doesn't return the names now.
        if return_uttid:
            return tuple(return_batch.values()), uttid_list
        return tuple(return_batch.values())

    def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):
        """Create a OrderedDict for the mini-batch

        :param OrderedDict x_feats_dict:
            e.g. {"input1": [ndarray, ndarray, ...],
                  "input2": [ndarray, ndarray, ...]}
        :param OrderedDict y_feats_dict:
            e.g. {"target1": [ndarray, ndarray, ...],
                  "target2": [ndarray, ndarray, ...]}
        :param: List[str] uttid_list:
            Give uttid_list to sort in the same order as the mini-batch
        :return: batch, uttid_list
        :rtype: Tuple[OrderedDict, List[str]]
        """
        # handle single-input and multi-input (parallel) asr mode
        xs = list(x_feats_dict.values())

        if self.load_output:
            ys = list(y_feats_dict.values())
            assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))

            # Keep only utterances whose every target is non-empty. This is
            # built eagerly as a list: the previous chained ``filter`` objects
            # had no ``len()`` and evaluated their lambdas lazily with a
            # late-bound target index.
            nonzero_idx = [
                i for i in range(len(ys[0]))
                if all(len(y[i]) > 0 for y in ys)
            ]
        else:
            # Note(kamo): Be careful not to make nonzero_idx to a generator
            nonzero_idx = list(range(len(xs[0])))

        if self.sort_in_input_length:
            # sort in input lengths based on the first input
            nonzero_sorted_idx = sorted(
                nonzero_idx, key=lambda i: -len(xs[0][i]))
        else:
            nonzero_sorted_idx = nonzero_idx

        if len(nonzero_sorted_idx) != len(xs[0]):
            logger.warning(
                "Target sequences include empty tokenid (batch {} -> {}).".
                format(len(xs[0]), len(nonzero_sorted_idx)))

        # remove zero-length samples
        xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]
        uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]

        # Keeping x_name and y_name, e.g. input1, for future extension
        return_batch = OrderedDict(zip(x_feats_dict.keys(), xs))
        if self.load_output:
            ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]
            return_batch.update(zip(y_feats_dict.keys(), ys))
        return return_batch, uttid_list

    def _cached_loader(self, filepath, opener):
        """Return the loader cached for ``filepath``, opening it on first use."""
        loader = self._loaders.get(filepath)
        if loader is None:
            # To avoid disk access, create loader only for the first time
            loader = opener(filepath)
            self._loaders[filepath] = loader
        return loader

    def _cached_array(self, filepath, reader):
        """Read ``filepath``; cache the result only if keep_all_data_on_mem."""
        if not self.keep_all_data_on_mem:
            return reader(filepath)
        if filepath not in self._loaders:
            self._loaders[filepath] = reader(filepath)
        return self._loaders[filepath]

    def _get_from_loader(self, filepath, filetype):
        """Return ndarray

        In order to make the fds to be opened only at the first referring,
        the loader are stored in self._loaders

        e.g. ``self._get_from_loader(
        'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')``

        :param: str filepath: path, or "path:key" for the container formats
            ("hdf5", "sound.hdf5", "npz", "scp")
        :param: str filetype: one of "hdf5", "sound.hdf5", "sound", "npz",
            "npy", "mat", "vec", "scp"
        :return: the loaded data
        :rtype: np.ndarray
        :raises NotImplementedError: for any other ``filetype``
        """
        if filetype == "hdf5":
            # e.g.
            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
            #                "filetype": "hdf5",
            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
            # h5py is imported lazily: the visible module-level imports do
            # not provide it, and only this branch needs it.
            import h5py

            filepath, key = filepath.split(":", 1)
            loader = self._cached_loader(filepath,
                                         lambda path: h5py.File(path, "r"))
            return loader[key][()]
        elif filetype == "sound.hdf5":
            # e.g.
            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
            #                "filetype": "sound.hdf5",
            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
            filepath, key = filepath.split(":", 1)
            loader = self._cached_loader(
                filepath,
                lambda path: SoundHDF5File(path, "r", dtype="int16"))
            array, _ = loader[key]
            return array
        elif filetype == "sound":
            # e.g.
            #    {"input": [{"feat": "some/path.wav",
            #                "filetype": "sound"},
            # Assume PCM16
            return self._cached_array(
                filepath, lambda path: soundfile.read(path, dtype="int16")[0])
        elif filetype == "npz":
            # e.g.
            #    {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL",
            #                "filetype": "npz",
            filepath, key = filepath.split(":", 1)
            loader = self._cached_loader(filepath, np.load)
            return loader[key]
        elif filetype == "npy":
            # e.g.
            #    {"input": [{"feat": "some/path.npy",
            #                "filetype": "npy"},
            return self._cached_array(filepath, np.load)
        elif filetype in ["mat", "vec"]:
            # e.g.
            #    {"input": [{"feat": "some/path.ark:123",
            #                "filetype": "mat"}]},
            # In this case, "123" indicates the starting points of the matrix
            # load_mat can load both matrix and vector
            return self._cached_array(filepath, kaldiio.load_mat)
        elif filetype == "scp":
            # e.g.
            #    {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL",
            #                "filetype": "scp",
            filepath, key = filepath.split(":", 1)
            loader = self._cached_loader(filepath, kaldiio.load_scp)
            return loader[key]
        else:
            raise NotImplementedError(
                "Not supported: loader_type={}".format(filetype))
class SoundHDF5File():
    """Collecting sound files to a HDF5 file

    Usage sketch::

        f = SoundHDF5File('a.flac.h5', mode='a')
        array = np.random.randint(0, 100, 100, dtype=np.int16)
        f['id'] = (array, 16000)
        array, rate = f['id']

    :param: str filepath: path of the HDF5 file
    :param: str mode: mode passed to ``h5py.File``
    :param: str format: The type used when saving wav. flac, nist, htk, etc.
        When None it is taken from the second extension of ``filepath``
        (``a.flac.h5`` -> ``flac``), falling back to ``flac``
    :param: str dtype: dtype requested from ``soundfile.read`` when loading
    """

    def __init__(self,
                 filepath,
                 mode="r+",
                 format=None,
                 dtype="int16",
                 **kwargs):
        # Imported locally: the visible module-level imports provide neither
        # ``os`` nor ``h5py``, which previously made this constructor fail
        # with NameError.
        import os

        import h5py

        self.filepath = filepath
        self.mode = mode
        self.dtype = dtype

        self.file = h5py.File(filepath, mode, **kwargs)
        if format is None:
            # filepath = a.flac.h5 -> format = flac
            second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1]
            format = second_ext[1:]
            if format.upper() not in soundfile.available_formats():
                # If not found, flac is selected
                format = "flac"

        # This format affects only saving
        self.format = format

    def __repr__(self):
        return '<SoundHDF5 file "{}" (mode {}, format {}, type {})>'.format(
            self.filepath, self.mode, self.format, self.dtype)

    def create_dataset(self, name, shape=None, data=None, **kwds):
        """Encode ``data = (array, rate)`` with soundfile and store the bytes."""
        import io  # stdlib; not among the visible module-level imports

        buffer = io.BytesIO()
        array, rate = data
        soundfile.write(buffer, array, rate, format=self.format)
        self.file.create_dataset(
            name, shape=shape, data=np.void(buffer.getvalue()), **kwds)

    def __setitem__(self, name, data):
        self.create_dataset(name, data=data)

    def __getitem__(self, key):
        """Decode the stored bytes for ``key`` and return ``(array, rate)``."""
        import io  # stdlib; not among the visible module-level imports

        data = self.file[key][()]
        buffer = io.BytesIO(data.tobytes())
        array, rate = soundfile.read(buffer, dtype=self.dtype)
        return array, rate

    def keys(self):
        return self.file.keys()

    def values(self):
        for k in self.file:
            yield self[k]

    def items(self):
        for k in self.file:
            yield k, self[k]

    def __iter__(self):
        return iter(self.file)

    def __contains__(self, item):
        return item in self.file

    def __len__(self, item=None):
        # ``item`` is unused; it now has a default so that the built-in
        # ``len(obj)``, which passes no argument, works. Explicit calls that
        # still pass a value keep working.
        return len(self.file)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.file.close()

    def close(self):
        self.file.close()
deepspeech/io/utility.py
浏览文件 @
44ec1931
...
...
@@ -11,17 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
collections
import
OrderedDict
from
typing
import
List
import
kaldiio
import
numpy
as
np
import
soundfile
from
deepspeech.frontend.augmentor.augmentation
import
AugmentationPipeline
from
deepspeech.utils.log
import
Log
__all__
=
[
"pad_list"
,
"pad_sequence"
,
"LoadInputsAndTargets"
]
__all__
=
[
"pad_list"
,
"pad_sequence"
]
logger
=
Log
(
__name__
).
getlog
()
...
...
@@ -89,387 +85,3 @@ def pad_sequence(sequences: List[np.ndarray],
out_tensor
[:
length
,
i
,
...]
=
tensor
return
out_tensor
class LoadInputsAndTargets():
    """Create a mini-batch from a list of ``(uttid, info)`` pairs.

    Example of the expected batch layout::

        batch = [('utt1',
                  dict(input=[dict(feat='some.ark:123',
                                   filetype='mat',
                                   name='input1',
                                   shape=[100, 80])],
                       output=[dict(tokenid='1 2 3 4',
                                    name='target1',
                                    shape=[4, 31])]))]
        l = LoadInputsAndTargets()
        feat, target = l(batch)

    :param: str mode: Task mode; only "asr" is accepted
    :param: str preprocess_conf: Configuration handed to AugmentationPipeline;
        None disables pre-processing
    :param: bool load_input: If False, not to load the input data
    :param: bool load_output: If False, not to load the output data
    :param: bool sort_in_input_length: Sort the mini-batch in descending order
        of the input length
    :param: Optional[dict] preprocess_args: Extra keyword arguments passed to
        the pre-processing call
    :param: bool keep_all_data_on_mem: If True, arrays loaded from "sound",
        "npy", "mat" and "vec" entries are cached in memory by file path
    """

    def __init__(
            self,
            mode="asr",
            preprocess_conf=None,
            load_input=True,
            load_output=True,
            sort_in_input_length=True,
            preprocess_args=None,
            keep_all_data_on_mem=False, ):
        # Cache of opened loaders / loaded arrays, keyed by file path.
        self._loaders = {}

        if mode not in ["asr"]:
            raise ValueError("Only asr are allowed: mode={}".format(mode))

        if preprocess_conf is not None:
            self.preprocessing = AugmentationPipeline(preprocess_conf)
            # Uses the module-level ``logger``; the previous ``logging.warning``
            # referenced a module that is not imported here.
            logger.warning(
                "[Experimental feature] Some preprocessing will be done "
                "for the mini-batch creation using {}".format(
                    self.preprocessing))
        else:
            # If conf doesn't exist, this function don't touch anything.
            self.preprocessing = None

        self.mode = mode
        self.load_output = load_output
        self.load_input = load_input
        self.sort_in_input_length = sort_in_input_length
        if preprocess_args is None:
            self.preprocess_args = {}
        else:
            assert isinstance(preprocess_args, dict), type(preprocess_args)
            # Copy so later changes by the caller do not leak in.
            self.preprocess_args = dict(preprocess_args)

        self.keep_all_data_on_mem = keep_all_data_on_mem

    def __call__(self, batch, return_uttid=False):
        """Load inputs and targets for a list of ``(uttid, info)`` pairs.

        :param List[Tuple[str, dict]] batch: list of dict which is subset of
            loaded data.json
        :param bool return_uttid: also return the utterance IDs, in the same
            order as the returned mini-batch
        :return: tuple with one list of arrays per input name followed by one
            list of arrays per target name; when ``return_uttid`` is True the
            result is ``(that tuple, uttid_list)``
        """
        x_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
        y_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
        uttid_list = []  # List[str]

        for uttid, info in batch:
            uttid_list.append(uttid)

            if self.load_input:
                # Note(kamo): This for-loop is for multiple inputs
                for inp in info["input"]:
                    # {"input":
                    #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
                    #    "filetype": "hdf5",
                    #    "name": "input1", ...}], ...}
                    x = self._get_from_loader(
                        filepath=inp["feat"],
                        filetype=inp.get("filetype", "mat"))
                    x_feats_dict.setdefault(inp["name"], []).append(x)

            if self.load_output:
                for out in info["output"]:
                    if "tokenid" in out:
                        # ======= Legacy format for output =======
                        # {"output": [{"tokenid": "1 2 3 4"}])
                        y = np.fromiter(
                            map(int, out["tokenid"].split()), dtype=np.int64)
                    else:
                        # ======= New format =======
                        # {"output":
                        #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
                        #    "filetype": "hdf5",
                        #    "name": "target1", ...}], ...}
                        y = self._get_from_loader(
                            filepath=out["feat"],
                            filetype=out.get("filetype", "mat"))
                    y_feats_dict.setdefault(out["name"], []).append(y)

        if self.mode == "asr":
            return_batch, uttid_list = self._create_batch_asr(
                x_feats_dict, y_feats_dict, uttid_list)
        else:
            raise NotImplementedError(self.mode)

        if self.preprocessing is not None:
            # Apply pre-processing all input features
            for x_name in return_batch.keys():
                if x_name.startswith("input"):
                    return_batch[x_name] = self.preprocessing(
                        return_batch[x_name], uttid_list,
                        **self.preprocess_args)

        # Doesn't return the names now.
        if return_uttid:
            return tuple(return_batch.values()), uttid_list
        return tuple(return_batch.values())

    def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):
        """Create a OrderedDict for the mini-batch

        :param OrderedDict x_feats_dict:
            e.g. {"input1": [ndarray, ndarray, ...],
                  "input2": [ndarray, ndarray, ...]}
        :param OrderedDict y_feats_dict:
            e.g. {"target1": [ndarray, ndarray, ...],
                  "target2": [ndarray, ndarray, ...]}
        :param: List[str] uttid_list:
            Give uttid_list to sort in the same order as the mini-batch
        :return: batch, uttid_list
        :rtype: Tuple[OrderedDict, List[str]]
        """
        # handle single-input and multi-input (parallel) asr mode
        xs = list(x_feats_dict.values())

        if self.load_output:
            ys = list(y_feats_dict.values())
            assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))

            # Keep only utterances whose every target is non-empty. This is
            # built eagerly as a list: the previous chained ``filter`` objects
            # had no ``len()`` and evaluated their lambdas lazily with a
            # late-bound target index.
            nonzero_idx = [
                i for i in range(len(ys[0]))
                if all(len(y[i]) > 0 for y in ys)
            ]
        else:
            # Note(kamo): Be careful not to make nonzero_idx to a generator
            nonzero_idx = list(range(len(xs[0])))

        if self.sort_in_input_length:
            # sort in input lengths based on the first input
            nonzero_sorted_idx = sorted(
                nonzero_idx, key=lambda i: -len(xs[0][i]))
        else:
            nonzero_sorted_idx = nonzero_idx

        if len(nonzero_sorted_idx) != len(xs[0]):
            logger.warning(
                "Target sequences include empty tokenid (batch {} -> {}).".
                format(len(xs[0]), len(nonzero_sorted_idx)))

        # remove zero-length samples
        xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]
        uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]

        # Keeping x_name and y_name, e.g. input1, for future extension
        return_batch = OrderedDict(zip(x_feats_dict.keys(), xs))
        if self.load_output:
            ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]
            return_batch.update(zip(y_feats_dict.keys(), ys))
        return return_batch, uttid_list

    def _cached_loader(self, filepath, opener):
        """Return the loader cached for ``filepath``, opening it on first use."""
        loader = self._loaders.get(filepath)
        if loader is None:
            # To avoid disk access, create loader only for the first time
            loader = opener(filepath)
            self._loaders[filepath] = loader
        return loader

    def _cached_array(self, filepath, reader):
        """Read ``filepath``; cache the result only if keep_all_data_on_mem."""
        if not self.keep_all_data_on_mem:
            return reader(filepath)
        if filepath not in self._loaders:
            self._loaders[filepath] = reader(filepath)
        return self._loaders[filepath]

    def _get_from_loader(self, filepath, filetype):
        """Return ndarray

        In order to make the fds to be opened only at the first referring,
        the loader are stored in self._loaders

        e.g. ``self._get_from_loader(
        'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')``

        :param: str filepath: path, or "path:key" for the container formats
            ("hdf5", "sound.hdf5", "npz", "scp")
        :param: str filetype: one of "hdf5", "sound.hdf5", "sound", "npz",
            "npy", "mat", "vec", "scp"
        :return: the loaded data
        :rtype: np.ndarray
        :raises NotImplementedError: for any other ``filetype``
        """
        if filetype == "hdf5":
            # e.g.
            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
            #                "filetype": "hdf5",
            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
            # h5py is imported lazily: the visible module-level imports do
            # not provide it, and only this branch needs it.
            import h5py

            filepath, key = filepath.split(":", 1)
            loader = self._cached_loader(filepath,
                                         lambda path: h5py.File(path, "r"))
            return loader[key][()]
        elif filetype == "sound.hdf5":
            # e.g.
            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
            #                "filetype": "sound.hdf5",
            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
            filepath, key = filepath.split(":", 1)
            loader = self._cached_loader(
                filepath,
                lambda path: SoundHDF5File(path, "r", dtype="int16"))
            array, _ = loader[key]
            return array
        elif filetype == "sound":
            # e.g.
            #    {"input": [{"feat": "some/path.wav",
            #                "filetype": "sound"},
            # Assume PCM16
            return self._cached_array(
                filepath, lambda path: soundfile.read(path, dtype="int16")[0])
        elif filetype == "npz":
            # e.g.
            #    {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL",
            #                "filetype": "npz",
            filepath, key = filepath.split(":", 1)
            loader = self._cached_loader(filepath, np.load)
            return loader[key]
        elif filetype == "npy":
            # e.g.
            #    {"input": [{"feat": "some/path.npy",
            #                "filetype": "npy"},
            return self._cached_array(filepath, np.load)
        elif filetype in ["mat", "vec"]:
            # e.g.
            #    {"input": [{"feat": "some/path.ark:123",
            #                "filetype": "mat"}]},
            # In this case, "123" indicates the starting points of the matrix
            # load_mat can load both matrix and vector
            return self._cached_array(filepath, kaldiio.load_mat)
        elif filetype == "scp":
            # e.g.
            #    {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL",
            #                "filetype": "scp",
            filepath, key = filepath.split(":", 1)
            loader = self._cached_loader(filepath, kaldiio.load_scp)
            return loader[key]
        else:
            raise NotImplementedError(
                "Not supported: loader_type={}".format(filetype))
class SoundHDF5File():
    """Collecting sound files to a HDF5 file

    Usage sketch::

        f = SoundHDF5File('a.flac.h5', mode='a')
        array = np.random.randint(0, 100, 100, dtype=np.int16)
        f['id'] = (array, 16000)
        array, rate = f['id']

    :param: str filepath: path of the HDF5 file
    :param: str mode: mode passed to ``h5py.File``
    :param: str format: The type used when saving wav. flac, nist, htk, etc.
        When None it is taken from the second extension of ``filepath``
        (``a.flac.h5`` -> ``flac``), falling back to ``flac``
    :param: str dtype: dtype requested from ``soundfile.read`` when loading
    """

    def __init__(self,
                 filepath,
                 mode="r+",
                 format=None,
                 dtype="int16",
                 **kwargs):
        # Imported locally: the visible module-level imports provide neither
        # ``os`` nor ``h5py``, which previously made this constructor fail
        # with NameError.
        import os

        import h5py

        self.filepath = filepath
        self.mode = mode
        self.dtype = dtype

        self.file = h5py.File(filepath, mode, **kwargs)
        if format is None:
            # filepath = a.flac.h5 -> format = flac
            second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1]
            format = second_ext[1:]
            if format.upper() not in soundfile.available_formats():
                # If not found, flac is selected
                format = "flac"

        # This format affects only saving
        self.format = format

    def __repr__(self):
        return '<SoundHDF5 file "{}" (mode {}, format {}, type {})>'.format(
            self.filepath, self.mode, self.format, self.dtype)

    def create_dataset(self, name, shape=None, data=None, **kwds):
        """Encode ``data = (array, rate)`` with soundfile and store the bytes."""
        import io  # stdlib; not among the visible module-level imports

        buffer = io.BytesIO()
        array, rate = data
        soundfile.write(buffer, array, rate, format=self.format)
        self.file.create_dataset(
            name, shape=shape, data=np.void(buffer.getvalue()), **kwds)

    def __setitem__(self, name, data):
        self.create_dataset(name, data=data)

    def __getitem__(self, key):
        """Decode the stored bytes for ``key`` and return ``(array, rate)``."""
        import io  # stdlib; not among the visible module-level imports

        data = self.file[key][()]
        buffer = io.BytesIO(data.tobytes())
        array, rate = soundfile.read(buffer, dtype=self.dtype)
        return array, rate

    def keys(self):
        return self.file.keys()

    def values(self):
        for k in self.file:
            yield self[k]

    def items(self):
        for k in self.file:
            yield k, self[k]

    def __iter__(self):
        return iter(self.file)

    def __contains__(self, item):
        return item in self.file

    def __len__(self, item=None):
        # ``item`` is unused; it now has a default so that the built-in
        # ``len(obj)``, which passes no argument, works. Explicit calls that
        # still pass a value keep working.
        return len(self.file)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.file.close()

    def close(self):
        self.file.close()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录