Commit 564b6b68 authored by: H huangyuxin

fix conflict

......@@ -42,6 +42,10 @@ ignore =
# these ignores are from flake8-comprehensions; please fix!
C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415
per-file-ignores =
*/__init__.py: F401
# Specify the list of error codes you wish Flake8 to report.
select =
E,
......
This diff has been collapsed.
......@@ -83,8 +83,8 @@
"from deepspeech.frontend.utility import read_manifest\n",
"from deepspeech.utils.utility import add_arguments, print_arguments\n",
"\n",
"from deepspeech.models.deepspeech2 import DeepSpeech2Model\n",
"from deepspeech.models.deepspeech2 import DeepSpeech2InferModel\n",
"from deepspeech.models.ds2 import DeepSpeech2Model\n",
"from deepspeech.models.ds2 import DeepSpeech2InferModel\n",
"from deepspeech.io.dataset import ManifestDataset\n",
"\n",
"\n",
......@@ -669,4 +669,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
\ No newline at end of file
}
......@@ -3431,7 +3431,7 @@
" convolution_layer_args = (output_size, cnn_module_kernel, activation,\n",
" cnn_module_norm, causal)\n",
"\n",
" self.encoders = nn.ModuleList([\n",
" self.encoders = nn.LayerList([\n",
" ConformerEncoderLayer(\n",
" size=output_size,\n",
" self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),\n",
......
[中文版](README_cn.md)
# PaddlePaddle ASR toolkit
# PaddlePaddle Speech to Any toolkit
![License](https://img.shields.io/badge/license-Apache%202-red.svg)
![python version](https://img.shields.io/badge/python-3.7+-orange.svg)
![support os](https://img.shields.io/badge/os-linux-yellow.svg)
*PaddleASR* is an open-source implementation of an end-to-end Automatic Speech Recognition (ASR) engine, based on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform. Our vision is to empower both industrial application and academic research on speech recognition, via an easy-to-use, efficient, smaller and scalable implementation, including training, inference & testing modules, and deployment.
*DeepSpeech* is an open-source implementation of an end-to-end Automatic Speech Recognition engine, based on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform. Our vision is to empower both industrial application and academic research on speech recognition, via an easy-to-use, efficient, smaller and scalable implementation, including training, inference & testing modules, and deployment.
## Features
......@@ -15,6 +15,8 @@
## Setup
All tested under:
* Ubuntu 16.04
* python>=3.7
* paddlepaddle>=2.1.2
......
[English](README.md)
# PaddlePaddle ASR toolkit
# PaddlePaddle Speech to Any toolkit
![License](https://img.shields.io/badge/license-Apache%202-red.svg)
![python version](https://img.shields.io/badge/python-3.7+-orange.svg)
![support os](https://img.shields.io/badge/os-linux-yellow.svg)
*PaddleASR* is an open-source project for an end-to-end Automatic Speech Recognition (ASR) engine based on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform,
*DeepSpeech* is an open-source project for an end-to-end Automatic Speech Recognition engine based on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform,
and our vision is to provide easy-to-use, efficient, compact, and scalable tools for speech recognition in both industrial applications and academic research, covering training, inference, and deployment.
## Features
......@@ -16,6 +16,9 @@
## Setup
Tested and verified in the following environments:
* Ubuntu 16.04
* python>=3.7
* paddlepaddle>=2.1.2
......
......@@ -30,24 +30,13 @@ logger = Log(__name__).getlog()
logger.warn = logger.warning
########### hack paddle #############
paddle.bool = 'bool'
paddle.float16 = 'float16'
paddle.half = 'float16'
paddle.float32 = 'float32'
paddle.float = 'float32'
paddle.float64 = 'float64'
paddle.double = 'float64'
paddle.int8 = 'int8'
paddle.int16 = 'int16'
paddle.short = 'int16'
paddle.int32 = 'int32'
paddle.int = 'int32'
paddle.int64 = 'int64'
paddle.long = 'int64'
paddle.uint8 = 'uint8'
paddle.uint16 = 'uint16'
paddle.complex64 = 'complex64'
paddle.complex128 = 'complex128'
paddle.cdouble = 'complex128'
......@@ -363,85 +352,8 @@ if not hasattr(paddle.Tensor, 'tolist'):
"register user tolist to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'tolist', tolist)
########### hack paddle.nn.functional #############
def glu(x: paddle.Tensor, axis=-1) -> paddle.Tensor:
"""The gated linear unit (GLU) activation."""
a, b = x.split(2, axis=axis)
act_b = F.sigmoid(b)
return a * act_b
if not hasattr(paddle.nn.functional, 'glu'):
logger.warn(
"register user glu to paddle.nn.functional, remove this when fixed!")
setattr(paddle.nn.functional, 'glu', glu)
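A minimal usage sketch of the glu hack above (the shape values are illustrative; assumes a working Paddle 2.x install):
import paddle
import paddle.nn.functional as F  # F.glu is the function registered above
x = paddle.randn([4, 8])  # the gated axis must have an even size
y = F.glu(x, axis=-1)     # split x into (a, b), return a * sigmoid(b)
assert y.shape == [4, 4]  # the gated axis is halved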
# def softplus(x):
# """Softplus function."""
# if hasattr(paddle.nn.functional, 'softplus'):
# #return paddle.nn.functional.softplus(x.float()).type_as(x)
# return paddle.nn.functional.softplus(x)
# else:
# raise NotImplementedError
# def gelu_accurate(x):
# """Gaussian Error Linear Units (GELU) activation."""
# # [reference] https://github.com/pytorch/fairseq/blob/e75cff5f2c1d62f12dc911e0bf420025eb1a4e33/fairseq/modules/gelu.py
# if not hasattr(gelu_accurate, "_a"):
# gelu_accurate._a = math.sqrt(2 / math.pi)
# return 0.5 * x * (1 + paddle.tanh(gelu_accurate._a *
# (x + 0.044715 * paddle.pow(x, 3))))
# def gelu(x):
# """Gaussian Error Linear Units (GELU) activation."""
# if hasattr(nn.functional, 'gelu'):
# #return nn.functional.gelu(x.float()).type_as(x)
# return nn.functional.gelu(x)
# else:
# return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
# hack loss
def ctc_loss(logits,
labels,
input_lengths,
label_lengths,
blank=0,
reduction='mean',
norm_by_times=True):
#logger.info("my ctc loss with norm by times")
## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
input_lengths, label_lengths)
loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
assert reduction in ['mean', 'sum', 'none']
if reduction == 'mean':
loss_out = paddle.mean(loss_out / label_lengths)
elif reduction == 'sum':
loss_out = paddle.sum(loss_out)
return loss_out
logger.warn(
"override ctc_loss of paddle.nn.functional if exists, remove this when fixed!"
)
F.ctc_loss = ctc_loss
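A hypothetical call sketch for the ctc_loss override above (shapes follow the warpctc convention: logits (Tmax, batch, num_classes), labels (batch, Lmax); assumes a Paddle 2.x release that still ships paddle.fluid.layers.warpctc):
import paddle
import paddle.nn.functional as F  # F.ctc_loss is the override above
logits = paddle.randn([50, 2, 10])
labels = paddle.randint(1, 10, [2, 12], dtype='int32')  # 0 is the blank id
input_lengths = paddle.to_tensor([50, 40], dtype='int64')
label_lengths = paddle.to_tensor([12, 9], dtype='int64')
loss = F.ctc_loss(logits, labels, input_lengths, label_lengths,
                  blank=0, reduction='mean', norm_by_times=True)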
########### hack paddle.nn #############
if not hasattr(paddle.nn, 'Module'):
logger.warn("register user Module to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'Module', paddle.nn.Layer)
# without this, code may hit: assert isinstance(sublayer, core.Layer)
if not hasattr(paddle.nn, 'ModuleList'):
logger.warn(
"register user ModuleList to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'ModuleList', paddle.nn.LayerList)
class GLU(nn.Layer):
"""Gated Linear Units (GLU) Layer"""
......@@ -450,48 +362,9 @@ class GLU(nn.Layer):
self.dim = dim
def forward(self, xs):
return glu(xs, dim=self.dim)
return F.glu(xs, axis=self.dim)
if not hasattr(paddle.nn, 'GLU'):
logger.warn("register user GLU to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'GLU', GLU)
# TODO(Hui Zhang): remove this Layer
class ConstantPad2d(nn.Layer):
"""Pads the input tensor boundaries with a constant value.
For N-dimensional padding, use paddle.nn.functional.pad().
"""
def __init__(self, padding: Union[tuple, list, int], value: float):
    """
    Args:
        padding (int|tuple|list): the size of the padding.
            If is int, uses the same padding in all boundaries.
            If a 4-tuple, uses (padding_left, padding_right, padding_top, padding_bottom)
        value (float): pad value
    """
    super().__init__()
    self.padding = padding if isinstance(padding,
                                         (tuple, list)) else [padding] * 4
    self.value = value
def forward(self, xs: paddle.Tensor) -> paddle.Tensor:
return nn.functional.pad(
xs,
self.padding,
mode='constant',
value=self.value,
data_format='NCHW')
if not hasattr(paddle.nn, 'ConstantPad2d'):
logger.warn(
"register user ConstantPad2d to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'ConstantPad2d', ConstantPad2d)
########### hack paddle.jit #############
if not hasattr(paddle.jit, 'export'):
logger.warn("register user export to paddle.jit, remove this when fixed!")
setattr(paddle.jit, 'export', paddle.jit.to_static)
......@@ -84,8 +84,9 @@ FILES = glob.glob('kenlm/util/*.cc') \
FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
FILES = [
fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
or fn.endswith('unittest.cc'))
fn for fn in FILES
if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
'unittest.cc'))
]
LIBS = ['stdc++']
......
......@@ -23,7 +23,7 @@ from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.deepspeech2 import DeepSpeech2Model
from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.socket_server import AsrRequestHandler
from deepspeech.utils.socket_server import AsrTCPServer
......
......@@ -21,7 +21,7 @@ from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.deepspeech2 import DeepSpeech2Model
from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.socket_server import AsrRequestHandler
from deepspeech.utils.socket_server import AsrTCPServer
......
......@@ -30,6 +30,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save jit model to
parser.add_argument(
"--export_path", type=str, help="path of the jit model to save")
parser.add_argument("--model_type")
args = parser.parse_args()
if args.model_type is None:
......
......@@ -31,6 +31,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
parser.add_argument("--model_type")
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
if args.model_type is None:
......
......@@ -30,6 +30,12 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path to save the asr result")
# load jit model from
parser.add_argument(
"--export_path", type=str, help="path of the jit model to load")
parser.add_argument("--model_type")
args = parser.parse_args()
print_arguments(args, globals())
......
......@@ -21,7 +21,7 @@ from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.deepspeech2 import DeepSpeech2Model
from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils import error_rate
from deepspeech.utils.utility import add_arguments
......
......@@ -30,6 +30,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
......
......@@ -30,6 +30,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save jit model to
parser.add_argument(
"--export_path", type=str, help="path of the jit model to save")
args = parser.parse_args()
print_arguments(args, globals())
......
......@@ -34,6 +34,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
......
......@@ -264,12 +264,12 @@ class U2Trainer(Trainer):
config.data.manifest = config.data.test_manifest
# filtering test examples yields fewer examples but no mismatch with training,
# and allows a large batch size to save training time, so filter test egs now.
# config.data.min_input_len = 0.0 # second
# config.data.max_input_len = float('inf') # second
# config.data.min_output_len = 0.0 # tokens
# config.data.max_output_len = float('inf') # tokens
# config.data.min_output_input_ratio = 0.00
# config.data.max_output_input_ratio = float('inf')
config.data.min_input_len = 0.0 # second
config.data.max_input_len = float('inf') # second
config.data.min_output_len = 0.0 # tokens
config.data.max_output_len = float('inf') # tokens
config.data.min_output_input_ratio = 0.00
config.data.max_output_input_ratio = float('inf')
test_dataset = ManifestDataset.from_config(config)
# return text ord id
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation for U2 model."""
import cProfile
from yacs.config import CfgNode
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.dynamic_import import dynamic_import
from deepspeech.utils.utility import print_arguments
model_test_alias = {
"u2": "deepspeech.exps.u2.model:U2Tester",
"u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Tester",
}
def main_sp(config, args):
class_obj = dynamic_import(args.model_name, model_test_alias)
exp = class_obj(config, args)
exp.setup()
if args.run_mode == 'test':
exp.run_test()
elif args.run_mode == 'export':
exp.run_export()
elif args.run_mode == 'align':
exp.run_align()
def main(config, args):
main_sp(config, args)
if __name__ == "__main__":
parser = default_argument_parser()
parser.add_argument(
'--model-name',
type=str,
default='u2_kaldi',
help='model name, e.g. deepspeech2, u2, u2_kaldi, u2_st')
parser.add_argument(
'--run-mode',
type=str,
default='test',
help='run mode, e.g. test, align, export')
parser.add_argument(
'--dict-path', type=str, default=None, help='dict path.')
# save asr result to
parser.add_argument(
"--result-file", type=str, help="path of save the asr result")
# save jit model to
parser.add_argument(
"--export-path", type=str, help="path of the jit model to save")
args = parser.parse_args()
print_arguments(args, globals())
config = CfgNode()
config.set_new_allowed(True)
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
if args.dump_config:
with open(args.dump_config, 'w') as f:
print(config, file=f)
# Setting for profiling
pr = cProfile.Profile()
pr.runcall(main, config, args)
pr.dump_stats('test.profile')
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Trainer for U2 model."""
import cProfile
import os
from paddle import distributed as dist
from yacs.config import CfgNode
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.dynamic_import import dynamic_import
from deepspeech.utils.utility import print_arguments
model_train_alias = {
"u2": "deepspeech.exps.u2.model:U2Trainer",
"u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Trainer",
}
def main_sp(config, args):
class_obj = dynamic_import(args.model_name, model_train_alias)
exp = class_obj(config, args)
exp.setup()
exp.run()
def main(config, args):
if args.device == "gpu" and args.nprocs > 1:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else:
main_sp(config, args)
if __name__ == "__main__":
parser = default_argument_parser()
parser.add_argument(
'--model-name',
type=str,
default='u2_kaldi',
help='model name, e.g. deepspeech2, u2, u2_kaldi, u2_st')
args = parser.parse_args()
print_arguments(args, globals())
config = CfgNode()
config.set_new_allowed(True)
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
if args.dump_config:
with open(args.dump_config, 'w') as f:
print(config, file=f)
# Setting for profiling
pr = cProfile.Profile()
pr.runcall(main, config, args)
pr.dump_stats(os.path.join(args.output, 'train.profile'))
This diff has been collapsed.
......@@ -30,6 +30,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save jit model to
parser.add_argument(
"--export_path", type=str, help="path of the jit model to save")
args = parser.parse_args()
print_arguments(args, globals())
......
......@@ -34,6 +34,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
......
......@@ -13,18 +13,28 @@
# limitations under the License.
"""Contains the data augmentation pipeline."""
import json
from collections.abc import Sequence
from inspect import signature
import numpy as np
from deepspeech.frontend.augmentor.impulse_response import ImpulseResponseAugmentor
from deepspeech.frontend.augmentor.noise_perturb import NoisePerturbAugmentor
from deepspeech.frontend.augmentor.online_bayesian_normalization import \
OnlineBayesianNormalizationAugmentor
from deepspeech.frontend.augmentor.resample import ResampleAugmentor
from deepspeech.frontend.augmentor.shift_perturb import ShiftPerturbAugmentor
from deepspeech.frontend.augmentor.spec_augment import SpecAugmentor
from deepspeech.frontend.augmentor.speed_perturb import SpeedPerturbAugmentor
from deepspeech.frontend.augmentor.volume_perturb import VolumePerturbAugmentor
from deepspeech.frontend.augmentor.base import AugmentorBase
from deepspeech.utils.dynamic_import import dynamic_import
from deepspeech.utils.log import Log
__all__ = ["AugmentationPipeline"]
logger = Log(__name__).getlog()
import_alias = dict(
volume="deepspeech.frontend.augmentor.impulse_response:VolumePerturbAugmentor",
shift="deepspeech.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor",
speed="deepspeech.frontend.augmentor.speed_perturb:SpeedPerturbAugmentor",
resample="deepspeech.frontend.augmentor.resample:ResampleAugmentor",
bayesian_normal="deepspeech.frontend.augmentor.online_bayesian_normalization:OnlineBayesianNormalizationAugmentor",
noise="deepspeech.frontend.augmentor.noise_perturb:NoisePerturbAugmentor",
impulse="deepspeech.frontend.augmentor.impulse_response:ImpulseResponseAugmentor",
specaug="deepspeech.frontend.augmentor.spec_augment:SpecAugmentor", )
class AugmentationPipeline():
......@@ -78,20 +88,74 @@ class AugmentationPipeline():
augmentor to take effect. If "prob" is zero, the augmentor does not take
effect.
:param augmentation_config: Augmentation configuration in json string.
:type augmentation_config: str
:param random_seed: Random seed.
:type random_seed: int
:raises ValueError: If the augmentation json config is in an incorrect format.
Params:
augmentation_config(str): Augmentation configuration in json string.
random_seed(int): Random seed.
train(bool): whether in train mode.
Raises:
ValueError: If the augmentation json config is in an incorrect format.
"""
def __init__(self, augmentation_config: str, random_seed=0):
SPEC_TYPES = {'specaug'}
def __init__(self, augmentation_config: str, random_seed: int=0):
self._rng = np.random.RandomState(random_seed)
self._spec_types = ('specaug')
self._augmentors, self._rates = self._parse_pipeline_from(
augmentation_config, 'audio')
self.conf = {'mode': 'sequential', 'process': []}
if augmentation_config:
process = json.loads(augmentation_config)
self.conf['process'] += process
self._augmentors, self._rates = self._parse_pipeline_from('all')
self._audio_augmentors, self._audio_rates = self._parse_pipeline_from(
'audio')
self._spec_augmentors, self._spec_rates = self._parse_pipeline_from(
augmentation_config, 'feature')
'feature')
def __call__(self, xs, uttid_list=None, **kwargs):
if not isinstance(xs, Sequence):
is_batch = False
xs = [xs]
else:
is_batch = True
if isinstance(uttid_list, str):
uttid_list = [uttid_list for _ in range(len(xs))]
if self.conf.get("mode", "sequential") == "sequential":
for idx, (func, rate) in enumerate(
zip(self._augmentors, self._rates), 0):
if self._rng.uniform(0., 1.) >= rate:
continue
# Pass through only the kwargs that the func accepts
try:
param = signature(func).parameters
except ValueError:
# signature() fails for some functions, e.g. built-ins
param = {}
_kwargs = {k: v for k, v in kwargs.items() if k in param}
try:
if uttid_list is not None and "uttid" in param:
xs = [
func(x, u, **_kwargs)
for x, u in zip(xs, uttid_list)
]
else:
xs = [func(x, **_kwargs) for x in xs]
except Exception:
logger.fatal("Catch a exception from {}th func: {}".format(
idx, func))
raise
else:
raise NotImplementedError(
"Not supporting mode={}".format(self.conf["mode"]))
if is_batch:
return xs
else:
return xs[0]
def transform_audio(self, audio_segment):
"""Run the pre-processing pipeline for data augmentation.
......@@ -101,7 +165,7 @@ class AugmentationPipeline():
:param audio_segment: Audio segment to process.
:type audio_segment: AudioSegmenet|SpeechSegment
"""
for augmentor, rate in zip(self._augmentors, self._rates):
for augmentor, rate in zip(self._audio_augmentors, self._audio_rates):
if self._rng.uniform(0., 1.) < rate:
augmentor.transform_audio(audio_segment)
......@@ -116,52 +180,39 @@ class AugmentationPipeline():
spec_segment = augmentor.transform_feature(spec_segment)
return spec_segment
def _parse_pipeline_from(self, config_json, aug_type='audio'):
def _parse_pipeline_from(self, aug_type='all'):
"""Parse the config json to build a augmentation pipelien."""
assert aug_type in ('audio', 'feature'), aug_type
try:
configs = json.loads(config_json)
audio_confs = []
feature_confs = []
for config in configs:
if config["type"] in self._spec_types:
feature_confs.append(config)
else:
audio_confs.append(config)
if aug_type == 'audio':
aug_confs = audio_confs
elif aug_type == 'feature':
aug_confs = feature_confs
augmentors = [
self._get_augmentor(config["type"], config["params"])
for config in aug_confs
]
rates = [config["prob"] for config in aug_confs]
except Exception as e:
raise ValueError("Failed to parse the augmentation config json: "
"%s" % str(e))
assert aug_type in ('audio', 'feature', 'all'), aug_type
audio_confs = []
feature_confs = []
all_confs = []
for config in self.conf['process']:
all_confs.append(config)
if config["type"] in self.SPEC_TYPES:
feature_confs.append(config)
else:
audio_confs.append(config)
if aug_type == 'audio':
aug_confs = audio_confs
elif aug_type == 'feature':
aug_confs = feature_confs
else:
aug_confs = all_confs
augmentors = [
self._get_augmentor(config["type"], config["params"])
for config in aug_confs
]
rates = [config["prob"] for config in aug_confs]
return augmentors, rates
def _get_augmentor(self, augmentor_type, params):
"""Return an augmentation model by the type name, and pass in params."""
if augmentor_type == "volume":
return VolumePerturbAugmentor(self._rng, **params)
elif augmentor_type == "shift":
return ShiftPerturbAugmentor(self._rng, **params)
elif augmentor_type == "speed":
return SpeedPerturbAugmentor(self._rng, **params)
elif augmentor_type == "resample":
return ResampleAugmentor(self._rng, **params)
elif augmentor_type == "bayesian_normal":
return OnlineBayesianNormalizationAugmentor(self._rng, **params)
elif augmentor_type == "noise":
return NoisePerturbAugmentor(self._rng, **params)
elif augmentor_type == "impulse":
return ImpulseResponseAugmentor(self._rng, **params)
elif augmentor_type == "specaug":
return SpecAugmentor(self._rng, **params)
else:
class_obj = dynamic_import(augmentor_type, import_alias)
assert issubclass(class_obj, AugmentorBase)
try:
obj = class_obj(self._rng, **params)
except Exception:
raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
return obj
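A minimal usage sketch of the pipeline above (the config values are hypothetical; the "type"/"params"/"prob" JSON schema is the one parsed by _parse_pipeline_from):
import json
config = json.dumps([
    {"type": "shift", "params": {"min_shift_ms": -5, "max_shift_ms": 5}, "prob": 0.5},
    {"type": "specaug", "params": {"F": 10, "T": 50, "n_freq_masks": 2,
                                   "n_time_masks": 2, "p": 1.0, "W": 40}, "prob": 1.0},
])
pipeline = AugmentationPipeline(augmentation_config=config, random_seed=0)
# audio-domain augmentors run on AudioSegment/SpeechSegment objects;
# spec-domain augmentors ("specaug") run on feature arrays:
# pipeline.transform_audio(audio_segment)
# spec = pipeline.transform_feature(spec)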
......@@ -28,6 +28,10 @@ class AugmentorBase():
def __init__(self):
pass
@abstractmethod
def __call__(self, xs):
raise NotImplementedError("AugmentorBase: Not impl __call__")
@abstractmethod
def transform_audio(self, audio_segment):
"""Adds various effects to the input audio segment. Such effects
......@@ -40,7 +44,7 @@ class AugmentorBase():
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegmenet|SpeechSegment
"""
raise NotImplementedError
raise NotImplementedError("AugmentorBase: Not impl transform_audio")
@abstractmethod
def transform_feature(self, spec_segment):
......@@ -52,4 +56,4 @@ class AugmentorBase():
Args:
spec_segment (Spectrogram): Spectrogram segment to add effects to.
"""
raise NotImplementedError
raise NotImplementedError("AugmentorBase: Not impl transform_feature")
......@@ -30,6 +30,12 @@ class ImpulseResponseAugmentor(AugmentorBase):
self._rng = rng
self._impulse_manifest = read_manifest(impulse_manifest_path)
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Add impulse response effect.
......
......@@ -36,6 +36,12 @@ class NoisePerturbAugmentor(AugmentorBase):
self._rng = rng
self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Add background noise audio.
......
......@@ -44,6 +44,12 @@ class OnlineBayesianNormalizationAugmentor(AugmentorBase):
self._rng = rng
self._startup_delay = startup_delay
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Normalizes the input audio using the online Bayesian approach.
......
......@@ -31,6 +31,12 @@ class ResampleAugmentor(AugmentorBase):
self._new_sample_rate = new_sample_rate
self._rng = rng
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Resamples the input audio to a target sample rate.
......
......@@ -31,6 +31,12 @@ class ShiftPerturbAugmentor(AugmentorBase):
self._max_shift_ms = max_shift_ms
self._rng = rng
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Shift audio.
......
......@@ -12,7 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the volume perturb augmentation model."""
import random
import numpy as np
from PIL import Image
from PIL.Image import BICUBIC
from deepspeech.frontend.augmentor.base import AugmentorBase
from deepspeech.utils.log import Log
......@@ -41,7 +45,9 @@ class SpecAugmentor(AugmentorBase):
W=40,
adaptive_number_ratio=0,
adaptive_size_ratio=0,
max_n_time_masks=20):
max_n_time_masks=20,
replace_with_zero=True,
warp_mode='PIL'):
"""SpecAugment class.
Args:
rng (random.Random): random generator object.
......@@ -54,17 +60,22 @@ class SpecAugmentor(AugmentorBase):
adaptive_number_ratio (float): adaptive multiplicity ratio for time masking
adaptive_size_ratio (float): adaptive size ratio for time masking
max_n_time_masks (int): maximum number of time masking
replace_with_zero (bool): pad zero on mask if true else use mean
warp_mode (str): "PIL" (default, fast, not differentiable)
or "sparse_image_warp" (slow, differentiable)
"""
super().__init__()
self._rng = rng
self.inplace = True
self.replace_with_zero = replace_with_zero
self.mode = warp_mode
self.W = W
self.F = F
self.T = T
self.n_freq_masks = n_freq_masks
self.n_time_masks = n_time_masks
self.p = p
#logger.info(f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}")
# adaptive SpecAugment
self.adaptive_number_ratio = adaptive_number_ratio
......@@ -121,21 +132,86 @@ class SpecAugmentor(AugmentorBase):
def time_mask(self):
return self._time_mask
def time_warp(xs, W=40):
raise NotImplementedError
def __repr__(self):
    return f"specaug: F-{self.F}, T-{self.T}, F-n-{self.n_freq_masks}, T-n-{self.n_time_masks}"
def time_warp(self, x, mode='PIL'):
"""time warp for spec augment
move random center frame by the random width ~ uniform(-window, window)
Args:
x (np.ndarray): spectrogram (time, freq)
mode (str): PIL or sparse_image_warp
Raises:
    NotImplementedError: if mode == "sparse_image_warp" (not implemented yet)
    NotImplementedError: if mode is not one of "PIL", "sparse_image_warp"
Returns:
np.ndarray: time warped spectrogram (time, freq)
"""
window = max_time_warp = self.W
if window == 0:
return x
if mode == "PIL":
t = x.shape[0]
if t - window <= window:
return x
# NOTE: randrange(a, b) emits a, a + 1, ..., b - 1
center = random.randrange(window, t - window)
warped = random.randrange(center - window, center +
window) + 1 # 1 ... t - 1
left = Image.fromarray(x[:center]).resize((x.shape[1], warped),
BICUBIC)
right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped),
BICUBIC)
if self.inplace:
x[:warped] = left
x[warped:] = right
return x
return np.concatenate((left, right), 0)
elif mode == "sparse_image_warp":
raise NotImplementedError('sparse_image_warp')
else:
raise NotImplementedError(
"unknown resize mode: " + mode +
", choose one from (PIL, sparse_image_warp).")
def mask_freq(self, xs, replace_with_zero=False):
    n_bins = xs.shape[0]
def mask_freq(self, x, replace_with_zero=False):
    """freq mask
    Args:
        x (np.ndarray): spectrogram (time, freq)
        replace_with_zero (bool, optional): Defaults to False.
    Returns:
        np.ndarray: freq mask spectrogram (time, freq)
    """
    n_bins = x.shape[1]
for i in range(0, self.n_freq_masks):
f = int(self._rng.uniform(low=0, high=self.F))
f_0 = int(self._rng.uniform(low=0, high=n_bins - f))
xs[f_0:f_0 + f, :] = 0
assert f_0 <= f_0 + f
if replace_with_zero:
x[:, f_0:f_0 + f] = 0
else:
x[:, f_0:f_0 + f] = x.mean()
self._freq_mask = (f_0, f_0 + f)
return xs
return x
def mask_time(self, xs, replace_with_zero=False):
n_frames = xs.shape[1]
def mask_time(self, x, replace_with_zero=False):
"""time mask
Args:
x (np.ndarray): spectrogram (time, freq)
replace_with_zero (bool, optional): Defaults to False.
Returns:
np.ndarray: time mask spectrogram (time, freq)
"""
n_frames = x.shape[0]
if self.adaptive_number_ratio > 0:
n_masks = int(n_frames * self.adaptive_number_ratio)
......@@ -152,19 +228,29 @@ class SpecAugmentor(AugmentorBase):
t = int(self._rng.uniform(low=0, high=T))
t = min(t, int(n_frames * self.p))
t_0 = int(self._rng.uniform(low=0, high=n_frames - t))
xs[:, t_0:t_0 + t] = 0
assert t_0 <= t_0 + t
if replace_with_zero:
x[t_0:t_0 + t, :] = 0
else:
x[t_0:t_0 + t, :] = x.mean()
self._time_mask = (t_0, t_0 + t)
return xs
return x
def __call__(self, x, train=True):
if not train:
return x
return self.transform_feature(x)
def transform_feature(self, xs: np.ndarray):
def transform_feature(self, x: np.ndarray):
"""
Args:
xs (FloatTensor): `[F, T]`
x (np.ndarray): `[T, F]`
Returns:
xs (FloatTensor): `[F, T]`
x (np.ndarray): `[T, F]`
"""
# xs = self.time_warp(xs)
xs = self.mask_freq(xs)
xs = self.mask_time(xs)
return xs
assert isinstance(x, np.ndarray)
assert x.ndim == 2
x = self.time_warp(x, self.mode)
x = self.mask_freq(x, self.replace_with_zero)
x = self.mask_time(x, self.replace_with_zero)
return x
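A minimal sketch of applying the augmentor above to a (time, freq) feature array (the constructor keyword values are illustrative):
import numpy as np
aug = SpecAugmentor(rng=np.random.RandomState(0), F=10, T=50,
                    n_freq_masks=2, n_time_masks=2, p=1.0, W=40)
spec = np.random.randn(200, 80).astype('float32')
spec = aug(spec, train=True)  # time warp, then freq and time masking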
......@@ -79,6 +79,12 @@ class SpeedPerturbAugmentor(AugmentorBase):
self._rates = np.linspace(
self._min_rate, self._max_rate, self._num_rates, endpoint=True)
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Sample a new speed rate from the given range and
changes the speed of the given audio clip.
......
......@@ -37,6 +37,12 @@ class VolumePerturbAugmentor(AugmentorBase):
self._max_gain_dBFS = max_gain_dBFS
self._rng = rng
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Change audio loadness.
......
......@@ -11,3 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio_featurizer import AudioFeaturizer #noqa: F401
from .speech_featurizer import SpeechFeaturizer
from .text_featurizer import TextFeaturizer
......@@ -18,7 +18,7 @@ from python_speech_features import logfbank
from python_speech_features import mfcc
class AudioFeaturizer(object):
class AudioFeaturizer():
"""Audio featurizer, for extracting features from audio contents of
AudioSegment or SpeechSegment.
......@@ -167,32 +167,6 @@ class AudioFeaturizer(object):
raise ValueError("Unknown specgram_type %s. "
"Supported values: linear." % self._specgram_type)
def _compute_linear_specgram(self,
samples,
sample_rate,
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
eps=1e-14):
"""Compute the linear spectrogram from FFT energy."""
if max_freq is None:
max_freq = sample_rate / 2
if max_freq > sample_rate / 2:
raise ValueError("max_freq must not be greater than half of "
"sample rate.")
if stride_ms > window_ms:
raise ValueError("Stride size must not be greater than "
"window size.")
stride_size = int(0.001 * sample_rate * stride_ms)
window_size = int(0.001 * sample_rate * window_ms)
specgram, freqs = self._specgram_real(
samples,
window_size=window_size,
stride_size=stride_size,
sample_rate=sample_rate)
ind = np.where(freqs <= max_freq)[0][-1] + 1
return np.log(specgram[:ind, :] + eps)
def _specgram_real(self, samples, window_size, stride_size, sample_rate):
"""Compute the spectrogram for samples from a real signal."""
# extract strided windows
......@@ -217,26 +191,65 @@ class AudioFeaturizer(object):
freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
return fft, freqs
def _compute_linear_specgram(self,
samples,
sample_rate,
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
eps=1e-14):
"""Compute the linear spectrogram from FFT energy.
Args:
    samples (np.ndarray): 1-D audio samples.
    sample_rate (int): audio sample rate, in Hz.
    stride_ms (float, optional): stride size in milliseconds. Defaults to 10.0.
    window_ms (float, optional): window size in milliseconds. Defaults to 20.0.
    max_freq (float, optional): highest frequency to keep; None means half the
        sample rate. Defaults to None.
    eps (float, optional): small constant added before taking the log. Defaults to 1e-14.
Raises:
    ValueError: if max_freq is greater than half of the sample rate.
    ValueError: if stride_ms is greater than window_ms.
Returns:
    np.ndarray: log spectrogram, (time, freq)
"""
if max_freq is None:
max_freq = sample_rate / 2
if max_freq > sample_rate / 2:
raise ValueError("max_freq must not be greater than half of "
"sample rate.")
if stride_ms > window_ms:
raise ValueError("Stride size must not be greater than "
"window size.")
stride_size = int(0.001 * sample_rate * stride_ms)
window_size = int(0.001 * sample_rate * window_ms)
specgram, freqs = self._specgram_real(
samples,
window_size=window_size,
stride_size=stride_size,
sample_rate=sample_rate)
ind = np.where(freqs <= max_freq)[0][-1] + 1
# (freq, time)
spec = np.log(specgram[:ind, :] + eps)
return np.transpose(spec)
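A small usage sketch (featurizer stands for a hypothetical AudioFeaturizer instance; samples are 16 kHz mono):
import numpy as np
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
samples = np.sin(2 * np.pi * 440 * t).astype('float32')
feat = featurizer._compute_linear_specgram(samples, sr)  # -> (time, freq)
# with the 10 ms stride / 20 ms window defaults: stride_size = 160,
# window_size = 320, giving about (16000 - 320) // 160 + 1 = 99 frames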
def _concat_delta_delta(self, feat):
"""append delat, delta-delta feature.
Args:
feat (np.ndarray): (D, T)
feat (np.ndarray): (T, D)
Returns:
np.ndarray: feat with delta-delta, (3*D, T)
np.ndarray: feat with delta-delta, (T, 3*D)
"""
feat = np.transpose(feat)
# Deltas
d_feat = delta(feat, 2)
# Deltas-Deltas
dd_feat = delta(d_feat, 2)
# transpose
feat = np.transpose(feat)
d_feat = np.transpose(d_feat)
dd_feat = np.transpose(dd_feat)
# concat above three features
concat_feat = np.concatenate((feat, d_feat, dd_feat))
concat_feat = np.concatenate((feat, d_feat, dd_feat), axis=1)
return concat_feat
def _compute_mfcc(self,
......@@ -292,7 +305,6 @@ class AudioFeaturizer(object):
ceplifter=22,
useEnergy=True,
winfunc='povey')
mfcc_feat = np.transpose(mfcc_feat)
if delta_delta:
mfcc_feat = self._concat_delta_delta(mfcc_feat)
return mfcc_feat
......@@ -346,8 +358,6 @@ class AudioFeaturizer(object):
remove_dc_offset=True,
preemph=0.97,
wintype='povey')
fbank_feat = np.transpose(fbank_feat)
if delta_delta:
fbank_feat = self._concat_delta_delta(fbank_feat)
return fbank_feat
......@@ -16,7 +16,7 @@ from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
class SpeechFeaturizer(object):
class SpeechFeaturizer():
"""Speech featurizer, for extracting features from both audio and transcript
contents of SpeechSegment.
......
......@@ -14,12 +14,19 @@
"""Contains the text featurizer class."""
import sentencepiece as spm
from deepspeech.frontend.utility import EOS
from deepspeech.frontend.utility import UNK
from ..utility import EOS
from ..utility import load_dict
from ..utility import UNK
__all__ = ["TextFeaturizer"]
class TextFeaturizer(object):
def __init__(self, unit_type, vocab_filepath, spm_model_prefix=None):
class TextFeaturizer():
def __init__(self,
unit_type,
vocab_filepath,
spm_model_prefix=None,
maskctc=False):
"""Text featurizer, for processing or extracting features from text.
Currently, it supports char/word/sentence-piece level tokenizing and conversion into
......@@ -34,11 +41,12 @@ class TextFeaturizer(object):
assert unit_type in ('char', 'spm', 'word')
self.unit_type = unit_type
self.unk = UNK
self.maskctc = maskctc
if vocab_filepath:
self._vocab_dict, self._id2token, self._vocab_list = self._load_vocabulary_from_file(
vocab_filepath)
self.unk_id = self._vocab_list.index(self.unk)
self.eos_id = self._vocab_list.index(EOS)
self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id = self._load_vocabulary_from_file(
vocab_filepath, maskctc)
self.vocab_size = len(self.vocab_list)
if unit_type == 'spm':
spm_model = spm_model_prefix + '.model'
......@@ -67,7 +75,7 @@ class TextFeaturizer(object):
"""Convert text string to a list of token indices.
Args:
text (str): Text to process.
text (str): Text.
Returns:
List[int]: List of token indices.
......@@ -75,8 +83,8 @@ class TextFeaturizer(object):
tokens = self.tokenize(text)
ids = []
for token in tokens:
token = token if token in self._vocab_dict else self.unk
ids.append(self._vocab_dict[token])
token = token if token in self.vocab_dict else self.unk
ids.append(self.vocab_dict[token])
return ids
def defeaturize(self, idxs):
......@@ -87,7 +95,7 @@ class TextFeaturizer(object):
idxs (List[int]): List of token indices.
Returns:
str: Text to process.
str: Text.
"""
tokens = []
for idx in idxs:
......@@ -97,33 +105,6 @@ class TextFeaturizer(object):
text = self.detokenize(tokens)
return text
@property
def vocab_size(self):
"""Return the vocabulary size.
:return: Vocabulary size.
:rtype: int
"""
return len(self._vocab_list)
@property
def vocab_list(self):
"""Return the vocabulary in list.
Returns:
List[str]: tokens.
"""
return self._vocab_list
@property
def vocab_dict(self):
"""Return the vocabulary in dict.
Returns:
Dict[str, int]: token str -> int
"""
return self._vocab_dict
def char_tokenize(self, text):
"""Character tokenizer.
......@@ -206,14 +187,16 @@ class TextFeaturizer(object):
return decode(tokens)
def _load_vocabulary_from_file(self, vocab_filepath):
def _load_vocabulary_from_file(self, vocab_filepath: str, maskctc: bool):
"""Load vocabulary from file."""
vocab_lines = []
with open(vocab_filepath, 'r', encoding='utf-8') as file:
vocab_lines.extend(file.readlines())
vocab_list = [line[:-1] for line in vocab_lines]
vocab_list = load_dict(vocab_filepath, maskctc)
assert vocab_list is not None
id2token = dict(
[(idx, token) for (idx, token) in enumerate(vocab_list)])
token2id = dict(
[(token, idx) for (idx, token) in enumerate(vocab_list)])
return token2id, id2token, vocab_list
unk_id = vocab_list.index(UNK)
eos_id = vocab_list.index(EOS)
return token2id, id2token, vocab_list, unk_id, eos_id
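A minimal round-trip sketch for the featurizer above (the vocab path is hypothetical; char-level units):
featurizer = TextFeaturizer(unit_type='char', vocab_filepath='data/vocab.txt')
ids = featurizer.featurize("hello")  # out-of-vocab tokens map to <unk>
text = featurizer.defeaturize(ids)   # decoding stops at <eos>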
......@@ -40,21 +40,21 @@ class CollateFunc(object):
number = 0
for item in batch:
audioseg = AudioSegment.from_file(item['feat'])
feat = self.feature_func(audioseg) #(D, T)
feat = self.feature_func(audioseg) #(T, D)
sums = np.sum(feat, axis=1)
sums = np.sum(feat, axis=0)
if mean_stat is None:
mean_stat = sums
else:
mean_stat += sums
square_sums = np.sum(np.square(feat), axis=1)
square_sums = np.sum(np.square(feat), axis=0)
if var_stat is None:
var_stat = square_sums
else:
var_stat += square_sums
number += feat.shape[1]
number += feat.shape[0]
return number, mean_stat, var_stat
......@@ -120,7 +120,7 @@ class FeatureNormalizer(object):
"""Normalize features to be of zero mean and unit stddev.
:param features: Input features to be normalized.
:type features: ndarray, shape (D, T)
:type features: ndarray, shape (T, D)
:param eps: added to stddev to provide numerical stability.
:type eps: float
:return: Normalized features.
......@@ -131,8 +131,8 @@ class FeatureNormalizer(object):
def _read_mean_std_from_file(self, filepath, eps=1e-20):
"""Load mean and std from file."""
mean, istd = load_cmvn(filepath, filetype='json')
self._mean = np.expand_dims(mean, axis=-1)
self._istd = np.expand_dims(istd, axis=-1)
self._mean = np.expand_dims(mean, axis=0)
self._istd = np.expand_dims(istd, axis=0)
def write_to_file(self, filepath):
"""Write the mean and stddev to the file.
......
......@@ -15,6 +15,9 @@
import codecs
import json
import math
from typing import List
from typing import Optional
from typing import Text
import numpy as np
......@@ -23,16 +26,35 @@ from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
__all__ = [
"load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", "max_dbfs",
"mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", "EOS", "UNK",
"BLANK"
"load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
"max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
"EOS", "UNK", "BLANK", "MASKCTC"
]
IGNORE_ID = -1
SOS = "<sos/eos>"
# `sos` and `eos` use the same token
SOS = "<eos>"
EOS = SOS
UNK = "<unk>"
BLANK = "<blank>"
MASKCTC = "<mask>"
def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
if dict_path is None:
return None
with open(dict_path, "r") as f:
dictionary = f.readlines()
char_list = [entry.strip().split(" ")[0] for entry in dictionary]
if BLANK not in char_list:
char_list.insert(0, BLANK)
if EOS not in char_list:
char_list.append(EOS)
# for non-autoregressive maskctc model
if maskctc and MASKCTC not in char_list:
char_list.append(MASKCTC)
return char_list
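A small sketch of load_dict above on a hypothetical unit-dict file (one "token id" pair per line; only the token field is kept):
with open("vocab.txt", "w") as f:
    f.write("a 1\nb 2\n")
chars = load_dict("vocab.txt", maskctc=True)
# BLANK is prepended and EOS/MASKCTC appended when missing:
# ['<blank>', 'a', 'b', '<eos>', '<mask>']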
def read_manifest(
......@@ -47,12 +69,20 @@ def read_manifest(
Args:
manifest_path ([type]): Manifest file to load and parse.
max_input_len ([type], optional): maximum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
max_output_len (float, optional): maximum output seq length, in modeling units. Defaults to 500.0.
min_output_len (float, optional): minimum output seq length, in modeling units. Defaults to 0.0.
max_output_input_ratio (float, optional): maximum output seq length / input seq length ratio. Defaults to 10.0.
min_output_input_ratio (float, optional): minimum output seq length / input seq length ratio. Defaults to 0.05.
max_input_len ([type], optional): maximum input seq length,
in seconds for raw wav, in frame numbers for feature data.
Defaults to float('inf').
min_input_len (float, optional): minimum input seq length,
in seconds for raw wav, in frame numbers for feature data.
Defaults to 0.0.
max_output_len (float, optional): maximum output seq length,
in modeling units. Defaults to 500.0.
min_output_len (float, optional): minimum output seq length,
in modeling units. Defaults to 0.0.
max_output_input_ratio (float, optional):
maximum output seq length / input seq length ratio. Defaults to 10.0.
min_output_input_ratio (float, optional):
minimum output seq length / input seq length ratio. Defaults to 0.05.
Raises:
IOError: If failed to parse the manifest.
......
......@@ -11,139 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle.io import DataLoader
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.sampler import SortagradDistributedBatchSampler
def create_dataloader(manifest_path,
unit_type,
vocab_filepath,
mean_std_filepath,
spm_model_prefix,
augmentation_config='{}',
max_input_len=float('inf'),
min_input_len=0.0,
max_output_len=float('inf'),
min_output_len=0.0,
max_output_input_ratio=float('inf'),
min_output_input_ratio=0.0,
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
specgram_type='linear',
feat_dim=None,
delta_delta=False,
use_dB_normalization=True,
random_seed=0,
keep_transcription_text=False,
is_training=False,
batch_size=1,
num_workers=0,
sortagrad=False,
shuffle_method=None,
dist=False):
dataset = ManifestDataset(
manifest_path=manifest_path,
unit_type=unit_type,
vocab_filepath=vocab_filepath,
mean_std_filepath=mean_std_filepath,
spm_model_prefix=spm_model_prefix,
augmentation_config=augmentation_config,
max_input_len=max_input_len,
min_input_len=min_input_len,
max_output_len=max_output_len,
min_output_len=min_output_len,
max_output_input_ratio=max_output_input_ratio,
min_output_input_ratio=min_output_input_ratio,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq,
specgram_type=specgram_type,
feat_dim=feat_dim,
delta_delta=delta_delta,
use_dB_normalization=use_dB_normalization,
random_seed=random_seed,
keep_transcription_text=keep_transcription_text)
if dist:
batch_sampler = SortagradDistributedBatchSampler(
dataset,
batch_size,
num_replicas=None,
rank=None,
shuffle=is_training,
drop_last=is_training,
sortagrad=is_training,
shuffle_method=shuffle_method)
else:
batch_sampler = SortagradBatchSampler(
dataset,
shuffle=is_training,
batch_size=batch_size,
drop_last=is_training,
sortagrad=is_training,
shuffle_method=shuffle_method)
def padding_batch(batch,
padding_to=-1,
flatten=False,
keep_transcription_text=True):
"""
Padding audio features with zeros to make them have the same shape (or
a user-defined shape) within one batch.
If ``padding_to`` is -1, the maximum shape in the batch will be used
as the target shape for padding. Otherwise, `padding_to` will be the
target shape (only refers to the second axis).
If `flatten` is True, features will be flattened to 1-D arrays.
"""
new_batch = []
# get target shape
max_length = max([audio.shape[1] for audio, text in batch])
if padding_to != -1:
if padding_to < max_length:
raise ValueError("If padding_to is not -1, it should be larger "
"than any instance's shape in the batch")
max_length = padding_to
max_text_length = max([len(text) for audio, text in batch])
# padding
padded_audios = []
audio_lens = []
texts, text_lens = [], []
for audio, text in batch:
padded_audio = np.zeros([audio.shape[0], max_length])
padded_audio[:, :audio.shape[1]] = audio
if flatten:
padded_audio = padded_audio.flatten()
padded_audios.append(padded_audio)
audio_lens.append(audio.shape[1])
padded_text = np.zeros([max_text_length])
if keep_transcription_text:
padded_text[:len(text)] = [ord(t) for t in text] # string
else:
padded_text[:len(text)] = text # ids
texts.append(padded_text)
text_lens.append(len(text))
padded_audios = np.array(padded_audios).astype('float32')
audio_lens = np.array(audio_lens).astype('int64')
texts = np.array(texts).astype('int32')
text_lens = np.array(text_lens).astype('int64')
return padded_audios, audio_lens, texts, text_lens
# collate_fn=functools.partial(padding_batch, keep_transcription_text=keep_transcription_text),
collate_fn = SpeechCollator(keep_transcription_text=keep_transcription_text)
loader = DataLoader(
dataset,
batch_sampler=batch_sampler,
collate_fn=collate_fn,
num_workers=num_workers)
return loader
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import numpy as np
from deepspeech.utils.log import Log
__all__ = ["make_batchset"]
logger = Log(__name__).getlog()
def batchfy_by_seq(
sorted_data,
batch_size,
max_length_in,
max_length_out,
min_batch_size=1,
shortest_first=False,
ikey="input",
iaxis=0,
okey="output",
oaxis=0, ):
"""Make batch set from json dictionary
:param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
:param int batch_size: batch size
:param int max_length_in: maximum length of input to decide adaptive batch size
:param int max_length_out: maximum length of output to decide adaptive batch size
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str ikey: key to access input
(for ASR ikey="input", for TTS, MT ikey="output".)
:param int iaxis: dimension to access input
(for ASR, TTS iaxis=0, for MT iaxis=1.)
:param str okey: key to access output
(for ASR, MT okey="output". for TTS okey="input".)
:param int oaxis: dimension to access output
(for ASR, TTS, MT oaxis=0, reserved for future research, -1 means all axis.)
:return: List[List[Tuple[str, dict]]] list of batches
"""
if batch_size <= 0:
raise ValueError(f"Invalid batch_size={batch_size}")
# check #utts is more than min_batch_size
if len(sorted_data) < min_batch_size:
raise ValueError(
f"#utts({len(sorted_data)}) is less than min_batch_size({min_batch_size})."
)
# make list of minibatches
minibatches = []
start = 0
while True:
_, info = sorted_data[start]
ilen = int(info[ikey][iaxis]["shape"][0])
olen = (int(info[okey][oaxis]["shape"][0]) if oaxis >= 0 else
max(map(lambda x: int(x["shape"][0]), info[okey])))
factor = max(int(ilen / max_length_in), int(olen / max_length_out))
# change batchsize depending on the input and output length
# if ilen = 1000 and max_length_in = 800
# then b = batchsize / 2
# and max(min_batches, .) avoids batchsize = 0
bs = max(min_batch_size, int(batch_size / (1 + factor)))
end = min(len(sorted_data), start + bs)
minibatch = sorted_data[start:end]
if shortest_first:
minibatch.reverse()
# check each batch is more than minimum batchsize
if len(minibatch) < min_batch_size:
mod = min_batch_size - len(minibatch) % min_batch_size
additional_minibatch = [
sorted_data[i] for i in np.random.randint(0, start, mod)
]
if shortest_first:
additional_minibatch.reverse()
minibatch.extend(additional_minibatch)
minibatches.append(minibatch)
if end == len(sorted_data):
break
start = end
# batch: List[List[Tuple[str, dict]]]
return minibatches
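A worked instance of the adaptive rule above (numbers are illustrative): with batch_size=32, max_length_in=800, max_length_out=150, and a leading utterance of ilen=1600 and olen=100, factor = max(int(1600 / 800), int(100 / 150)) = 2, so bs = max(min_batch_size, int(32 / (1 + 2))) = 10 sequences in that minibatch.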
def batchfy_by_bin(
sorted_data,
batch_bins,
num_batches=0,
min_batch_size=1,
shortest_first=False,
ikey="input",
okey="output", ):
"""Make variably sized batch set, which maximizes
the number of bins up to `batch_bins`.
:param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
:param int batch_bins: Maximum frames of a batch
:param int num_batches: # number of batches to use (for debug)
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
:param str okey: key to access output (for ASR okey="output". for TTS okey="input".)
:return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches
"""
if batch_bins <= 0:
raise ValueError(f"invalid batch_bins={batch_bins}")
length = len(sorted_data)
idim = int(sorted_data[0][1][ikey][0]["shape"][1])
odim = int(sorted_data[0][1][okey][0]["shape"][1])
logger.info("# utts: " + str(len(sorted_data)))
minibatches = []
start = 0
n = 0
while True:
# Dynamic batch size depending on size of samples
b = 0
next_size = 0
max_olen = 0
while next_size < batch_bins and (start + b) < length:
ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0]) * idim
olen = int(sorted_data[start + b][1][okey][0]["shape"][0]) * odim
if olen > max_olen:
max_olen = olen
next_size = (max_olen + ilen) * (b + 1)
if next_size <= batch_bins:
b += 1
elif next_size == 0:
raise ValueError(
f"Can't fit one sample in batch_bins ({batch_bins}): "
f"Please increase the value")
end = min(length, start + max(min_batch_size, b))
batch = sorted_data[start:end]
if shortest_first:
batch.reverse()
minibatches.append(batch)
# Check for min_batch_size and fixes the batches if needed
i = -1
while len(minibatches[i]) < min_batch_size:
missing = min_batch_size - len(minibatches[i])
if -i == len(minibatches):
minibatches[i + 1].extend(minibatches[i])
minibatches = minibatches[1:]
break
else:
minibatches[i].extend(minibatches[i - 1][:missing])
minibatches[i - 1] = minibatches[i - 1][missing:]
i -= 1
if end == length:
break
start = end
n += 1
if num_batches > 0:
minibatches = minibatches[:num_batches]
lengths = [len(x) for x in minibatches]
logger.info(
str(len(minibatches)) + " batches containing from " + str(min(lengths))
+ " to " + str(max(lengths)) + " samples " + "(avg " + str(
int(np.mean(lengths))) + " samples).")
return minibatches
def batchfy_by_frame(
sorted_data,
max_frames_in,
max_frames_out,
max_frames_inout,
num_batches=0,
min_batch_size=1,
shortest_first=False,
ikey="input",
okey="output", ):
"""Make variable batch set, which maximizes the number of frames to max_batch_frame.
:param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
:param int max_frames_in: Maximum input frames of a batch
:param int max_frames_out: Maximum output frames of a batch
:param int max_frames_inout: Maximum input+output frames of a batch
:param int num_batches: # number of batches to use (for debug)
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
:param str okey: key to access output (for ASR okey="output". for TTS okey="input".)
:return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches
"""
if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0:
raise ValueError(
"At least, one of `--batch-frames-in`, `--batch-frames-out` or "
"`--batch-frames-inout` should be > 0")
length = len(sorted_data)
minibatches = []
start = 0
end = 0
while end != length:
# Dynamic batch size depending on size of samples
b = 0
max_olen = 0
max_ilen = 0
while (start + b) < length:
ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0])
if ilen > max_frames_in and max_frames_in != 0:
raise ValueError(
f"Can't fit one sample in --batch-frames-in ({max_frames_in}): "
f"Please increase the value")
olen = int(sorted_data[start + b][1][okey][0]["shape"][0])
if olen > max_frames_out and max_frames_out != 0:
raise ValueError(
f"Can't fit one sample in --batch-frames-out ({max_frames_out}): "
f"Please increase the value")
if ilen + olen > max_frames_inout and max_frames_inout != 0:
raise ValueError(
f"Can't fit one sample in --batch-frames-out ({max_frames_inout}): "
f"Please increase the value")
max_olen = max(max_olen, olen)
max_ilen = max(max_ilen, ilen)
in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0
out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0
inout_ok = (max_ilen + max_olen) * (
b + 1) <= max_frames_inout or max_frames_inout == 0
if in_ok and out_ok and inout_ok:
# add more seq in the minibatch
b += 1
else:
# no more seq in the minibatch
break
end = min(length, start + b)
batch = sorted_data[start:end]
if shortest_first:
batch.reverse()
minibatches.append(batch)
    # Check for min_batch_size and fix the batches if needed
i = -1
while len(minibatches[i]) < min_batch_size:
missing = min_batch_size - len(minibatches[i])
if -i == len(minibatches):
minibatches[i + 1].extend(minibatches[i])
minibatches = minibatches[1:]
break
else:
minibatches[i].extend(minibatches[i - 1][:missing])
minibatches[i - 1] = minibatches[i - 1][missing:]
i -= 1
start = end
if num_batches > 0:
minibatches = minibatches[:num_batches]
lengths = [len(x) for x in minibatches]
logger.info(
str(len(minibatches)) + " batches containing from " + str(min(lengths))
+ " to " + str(max(lengths)) + " samples" + "(avg " + str(
int(np.mean(lengths))) + " samples).")
return minibatches
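A minimal usage sketch of batchfy_by_frame under an input-frame budget; the utterance names and shapes below are hypothetical, chosen only to make the packing arithmetic visible:
# Hedged sketch: utterances are packed so that max_ilen * batch_size stays
# within --batch-frames-in; "shape" fields are [length, dim] as in data.json.
sorted_data = [
    ("utt1", {"input": [{"shape": [200, 83]}], "output": [{"shape": [20, 5002]}]}),
    ("utt2", {"input": [{"shape": [180, 83]}], "output": [{"shape": [18, 5002]}]}),
    ("utt3", {"input": [{"shape": [150, 83]}], "output": [{"shape": [15, 5002]}]}),
]
batches = batchfy_by_frame(sorted_data, max_frames_in=400,
                           max_frames_out=0, max_frames_inout=0)
# -> [[utt1, utt2], [utt3]]: with max_ilen=200, a third sample would need
#    200 * 3 = 600 > 400 input frames, so utt3 starts a new minibatch.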
def batchfy_shuffle(data, batch_size, min_batch_size, num_batches,
shortest_first):
import random
logger.info("use shuffled batch.")
    sorted_data = random.sample(list(data.items()), len(data))
logger.info("# utts: " + str(len(sorted_data)))
# make list of minibatches
minibatches = []
start = 0
while True:
end = min(len(sorted_data), start + batch_size)
        # check each batch is not smaller than the minimum batch size
minibatch = sorted_data[start:end]
if shortest_first:
minibatch.reverse()
if len(minibatch) < min_batch_size:
mod = min_batch_size - len(minibatch) % min_batch_size
additional_minibatch = [
sorted_data[i] for i in np.random.randint(0, start, mod)
]
if shortest_first:
additional_minibatch.reverse()
minibatch.extend(additional_minibatch)
minibatches.append(minibatch)
if end == len(sorted_data):
break
start = end
# for debugging
if num_batches > 0:
minibatches = minibatches[:num_batches]
logger.info("# minibatches: " + str(len(minibatches)))
return minibatches
BATCH_COUNT_CHOICES = ["auto", "seq", "bin", "frame"]
BATCH_SORT_KEY_CHOICES = ["input", "output", "shuffle"]
def make_batchset(
data,
batch_size=0,
max_length_in=float("inf"),
max_length_out=float("inf"),
num_batches=0,
min_batch_size=1,
shortest_first=False,
batch_sort_key="input",
count="auto",
batch_bins=0,
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
iaxis=0,
oaxis=0, ):
"""Make batch set from json dictionary
    If utts have a "category" value, minibatches are made per category:
>>> data = [{'category': 'A', 'input': ..., 'utt':'utt1'},
... {'category': 'B', 'input': ..., 'utt':'utt2'},
... {'category': 'B', 'input': ..., 'utt':'utt3'},
... {'category': 'A', 'input': ..., 'utt':'utt4'}]
    >>> make_batchset(data, batch_size=2, ...)
    [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3', ...)]]
    Note that if any utt doesn't have "category",
    it performs the same as batchfy_by_{count}.
:param List[Dict[str, Any]] data: dictionary loaded from data.json
:param int batch_size: maximum number of sequences in a minibatch.
:param int batch_bins: maximum number of bins (frames x dim) in a minibatch.
:param int batch_frames_in: maximum number of input frames in a minibatch.
:param int batch_frames_out: maximum number of output frames in a minibatch.
    :param int batch_frames_inout: maximum number of input+output frames in a minibatch.
:param str count: strategy to count maximum size of batch.
For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES
:param int max_length_in: maximum length of input to decide adaptive batch size
:param int max_length_out: maximum length of output to decide adaptive batch size
    :param int num_batches: number of batches to use (for debug)
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str batch_sort_key: how to sort data before creating minibatches
["input", "output", "shuffle"]
    :param int iaxis: dimension to access input
        (for ASR, TTS iaxis=0, for MT iaxis=1.)
:param int oaxis: dimension to access output (for ASR, TTS, MT oaxis=0,
reserved for future research, -1 means all axis.)
:return: List[List[Tuple[str, dict]]] list of batches
"""
# check args
if count not in BATCH_COUNT_CHOICES:
raise ValueError(
f"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}")
if batch_sort_key not in BATCH_SORT_KEY_CHOICES:
raise ValueError(f"arg 'batch_sort_key' ({batch_sort_key}) should be "
f"one of {BATCH_SORT_KEY_CHOICES}")
ikey = "input"
okey = "output"
batch_sort_axis = 0 # index of list
if count == "auto":
if batch_size != 0:
count = "seq"
elif batch_bins != 0:
count = "bin"
elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0:
count = "frame"
else:
raise ValueError(
f"cannot detect `count` manually set one of {BATCH_COUNT_CHOICES}"
)
logger.info(f"count is auto detected as {count}")
if count != "seq" and batch_sort_key == "shuffle":
raise ValueError(
"batch_sort_key=shuffle is only available if batch_count=seq")
category2data = {} # Dict[str, dict]
for v in data:
k = v['utt']
category2data.setdefault(v.get("category"), {})[k] = v
batches_list = [] # List[List[List[Tuple[str, dict]]]]
for d in category2data.values():
if batch_sort_key == "shuffle":
batches = batchfy_shuffle(d, batch_size, min_batch_size,
num_batches, shortest_first)
batches_list.append(batches)
continue
# sort it by input lengths (long to short)
sorted_data = sorted(
d.items(),
key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]),
reverse=not shortest_first, )
logger.info("# utts: " + str(len(sorted_data)))
if count == "seq":
batches = batchfy_by_seq(
sorted_data,
batch_size=batch_size,
max_length_in=max_length_in,
max_length_out=max_length_out,
min_batch_size=min_batch_size,
shortest_first=shortest_first,
ikey=ikey,
iaxis=iaxis,
okey=okey,
oaxis=oaxis, )
if count == "bin":
batches = batchfy_by_bin(
sorted_data,
batch_bins=batch_bins,
min_batch_size=min_batch_size,
shortest_first=shortest_first,
ikey=ikey,
okey=okey, )
if count == "frame":
batches = batchfy_by_frame(
sorted_data,
max_frames_in=batch_frames_in,
max_frames_out=batch_frames_out,
max_frames_inout=batch_frames_inout,
min_batch_size=min_batch_size,
shortest_first=shortest_first,
ikey=ikey,
okey=okey, )
batches_list.append(batches)
if len(batches_list) == 1:
batches = batches_list[0]
else:
# Concat list. This way is faster than "sum(batch_list, [])"
batches = list(itertools.chain(*batches_list))
# for debugging
if num_batches > 0:
batches = batches[:num_batches]
logger.info("# minibatches: " + str(len(batches)))
# batch: List[List[Tuple[str, dict]]]
return batches
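A hedged usage sketch of make_batchset with count auto-detected as "seq"; the data entries are hypothetical and carry only the fields the function reads:
# Each entry mimics a data.json record: "shape" is [length, dim].
data = [
    {"utt": "utt1", "input": [{"shape": [120, 83]}], "output": [{"shape": [12, 5002]}]},
    {"utt": "utt2", "input": [{"shape": [300, 83]}], "output": [{"shape": [30, 5002]}]},
]
batches = make_batchset(data, batch_size=2)  # batch_size != 0 -> count="seq"
# batches is List[List[Tuple[str, dict]]]; with shortest_first=False the
# utterances are sorted long-to-short, so utt2 precedes utt1 in the batch.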
......@@ -23,7 +23,7 @@ from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.normalizer import FeatureNormalizer
from deepspeech.frontend.speech import SpeechSegment
from deepspeech.frontend.utility import IGNORE_ID
from deepspeech.io.utility import pad_sequence
from deepspeech.io.utility import pad_list
from deepspeech.utils.log import Log
__all__ = ["SpeechCollator"]
......@@ -242,7 +242,6 @@ class SpeechCollator():
# specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram)
specgram = specgram.transpose([1, 0])
return specgram, transcript_part
def __call__(self, batch):
......@@ -250,7 +249,7 @@ class SpeechCollator():
Args:
batch ([List]): batch is (audio, text)
audio (np.ndarray) shape (D, T)
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
Returns:
......@@ -286,13 +285,12 @@ class SpeechCollator():
texts.append(tokens)
text_lens.append(tokens.shape[0])
padded_audios = pad_sequence(
audios, padding_value=0.0).astype(np.float32) #[B, T, D]
audio_lens = np.array(audio_lens).astype(np.int64)
padded_texts = pad_sequence(
texts, padding_value=IGNORE_ID).astype(np.int64)
text_lens = np.array(text_lens).astype(np.int64)
return utts, padded_audios, audio_lens, padded_texts, text_lens
#[B, T, D]
xs_pad = pad_list(audios, 0.0).astype(np.float32)
ilens = np.array(audio_lens).astype(np.int64)
ys_pad = pad_list(texts, IGNORE_ID).astype(np.int64)
olens = np.array(text_lens).astype(np.int64)
return utts, xs_pad, ilens, ys_pad, olens
@property
def manifest(self):
......
......@@ -217,6 +217,34 @@ class SpeechCollator():
return self._local_data.tar2object[tarpath].extractfile(
self._local_data.tar2info[tarpath][filename])
@property
def manifest(self):
return self._manifest
@property
def vocab_size(self):
return self._speech_featurizer.vocab_size
@property
def vocab_list(self):
return self._speech_featurizer.vocab_list
@property
def vocab_dict(self):
return self._speech_featurizer.vocab_dict
@property
def text_feature(self):
return self._speech_featurizer.text_feature
@property
def feature_size(self):
return self._speech_featurizer.feature_size
@property
def stride_ms(self):
return self._speech_featurizer.stride_ms
def process_utterance(self, audio_file, translation):
"""Load, augment, featurize and normalize for speech data.
......@@ -244,7 +272,6 @@ class SpeechCollator():
# specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram)
specgram = specgram.transpose([1, 0])
return specgram, translation_part
def __call__(self, batch):
......@@ -252,7 +279,7 @@ class SpeechCollator():
Args:
batch ([List]): batch is (audio, text)
audio (np.ndarray) shape (D, T)
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
Returns:
......@@ -296,34 +323,6 @@ class SpeechCollator():
text_lens = np.array(text_lens).astype(np.int64)
return utts, padded_audios, audio_lens, padded_texts, text_lens
@property
def manifest(self):
return self._manifest
@property
def vocab_size(self):
return self._speech_featurizer.vocab_size
@property
def vocab_list(self):
return self._speech_featurizer.vocab_list
@property
def vocab_dict(self):
return self._speech_featurizer.vocab_dict
@property
def text_feature(self):
return self._speech_featurizer.text_feature
@property
def feature_size(self):
return self._speech_featurizer.feature_size
@property
def stride_ms(self):
return self._speech_featurizer.stride_ms
class TripletSpeechCollator(SpeechCollator):
def process_utterance(self, audio_file, translation, transcript):
......@@ -355,7 +354,6 @@ class TripletSpeechCollator(SpeechCollator):
# specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram)
specgram = specgram.transpose([1, 0])
return specgram, translation_part, transcript_part
def __call__(self, batch):
......@@ -363,7 +361,7 @@ class TripletSpeechCollator(SpeechCollator):
Args:
batch ([List]): batch is (audio, text)
audio (np.ndarray) shape (D, T)
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
Returns:
......@@ -524,49 +522,19 @@ class KaldiPrePorocessedCollator(SpeechCollator):
:rtype: tuple of (2darray, list)
"""
specgram = kaldiio.load_mat(audio_file)
specgram = specgram.transpose([1, 0])
assert specgram.shape[
0] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
self._feat_dim, specgram.shape[0])
1] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
self._feat_dim, specgram.shape[1])
# specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram)
specgram = specgram.transpose([1, 0])
if self._keep_transcription_text:
return specgram, translation
else:
text_ids = self._text_featurizer.featurize(translation)
return specgram, text_ids
@property
def manifest(self):
return self._manifest
@property
def vocab_size(self):
return self._text_featurizer.vocab_size
@property
def vocab_list(self):
return self._text_featurizer.vocab_list
@property
def vocab_dict(self):
return self._text_featurizer.vocab_dict
@property
def text_feature(self):
return self._text_featurizer
@property
def feature_size(self):
return self._feat_dim
@property
def stride_ms(self):
return self._stride_ms
class TripletKaldiPrePorocessedCollator(KaldiPrePorocessedCollator):
def process_utterance(self, audio_file, translation, transcript):
......@@ -583,15 +551,13 @@ class TripletKaldiPrePorocessedCollator(KaldiPrePorocessedCollator):
:rtype: tuple of (2darray, (list, list))
"""
specgram = kaldiio.load_mat(audio_file)
specgram = specgram.transpose([1, 0])
assert specgram.shape[
0] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
self._feat_dim, specgram.shape[0])
1] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
self._feat_dim, specgram.shape[1])
# specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram)
specgram = specgram.transpose([1, 0])
if self._keep_transcription_text:
return specgram, translation, transcript
else:
......@@ -604,7 +570,7 @@ class TripletKaldiPrePorocessedCollator(KaldiPrePorocessedCollator):
Args:
batch ([List]): batch is (audio, text)
audio (np.ndarray) shape (D, T)
audio (np.ndarray) shape (T, D)
translation (List[int] or str): shape (U,)
transcription (List[int] or str): shape (V,)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from deepspeech.io.utility import pad_list
from deepspeech.utils.log import Log
__all__ = ["CustomConverter"]
logger = Log(__name__).getlog()
class CustomConverter():
"""Custom batch converter.
Args:
subsampling_factor (int): The subsampling factor.
dtype (np.dtype): Data type to convert.
"""
def __init__(self, subsampling_factor=1, dtype=np.float32):
"""Construct a CustomConverter object."""
self.subsampling_factor = subsampling_factor
self.ignore_id = -1
self.dtype = dtype
def __call__(self, batch):
"""Transform a batch and send it to a device.
Args:
batch (list): The batch to transform.
Returns:
tuple(paddle.Tensor, paddle.Tensor, paddle.Tensor)
"""
        # the whole minibatch is wrapped in a one-element list (DataLoader batch_size=1)
assert len(batch) == 1
(xs, ys), utts = batch[0]
assert xs[0] is not None, "please check Reader and Augmentation impl."
# perform subsampling
if self.subsampling_factor > 1:
xs = [x[::self.subsampling_factor, :] for x in xs]
# get batch of lengths of input sequences
ilens = np.array([x.shape[0] for x in xs])
# perform padding and convert to tensor
# currently only support real number
if xs[0].dtype.kind == "c":
xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)
xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)
# Note(kamo):
# {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
            # Don't create ComplexTensor and give it to E2E here
            # because torch.nn.DataParallel can't handle it.
xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
else:
xs_pad = pad_list(xs, 0).astype(self.dtype)
# NOTE: this is for multi-output (e.g., speech translation)
ys_pad = pad_list(
[np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],
self.ignore_id)
olens = np.array(
[y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])
return utts, xs_pad, ilens, ys_pad, olens
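A hedged usage sketch of CustomConverter; the feature dim, lengths, and token ids are invented. Note that the converter expects a one-element list wrapping ((xs, ys), utts):
converter = CustomConverter(subsampling_factor=1, dtype=np.float32)
xs = [np.random.randn(100, 83), np.random.randn(80, 83)]  # two feature matrices (T, D)
ys = [np.array([1, 2, 3]), np.array([4, 5])]              # token id sequences
utts = ["utt1", "utt2"]
utts, xs_pad, ilens, ys_pad, olens = converter([((xs, ys), utts)])
# xs_pad: (2, 100, 83) float32, zero-padded; ys_pad padded with ignore_id=-1;
# ilens == [100, 80], olens == [3, 2]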
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any
from typing import Dict
from typing import List
from typing import Text
import numpy as np
from paddle.io import DataLoader
from deepspeech.frontend.utility import read_manifest
from deepspeech.io.batchfy import make_batchset
from deepspeech.io.converter import CustomConverter
from deepspeech.io.dataset import TransformDataset
from deepspeech.io.reader import LoadInputsAndTargets
from deepspeech.utils.log import Log
__all__ = ["BatchDataLoader"]
logger = Log(__name__).getlog()
def feat_dim_and_vocab_size(data_json: List[Dict[Text, Any]],
mode: Text="asr",
iaxis=0,
oaxis=0):
if mode == 'asr':
        feat_dim = data_json[0]['input'][iaxis]['shape'][1]
vocab_size = data_json[0]['output'][oaxis]['shape'][1]
else:
raise ValueError(f"{mode} mode not support!")
return feat_dim, vocab_size
class BatchDataLoader():
def __init__(self,
json_file: str,
train_mode: bool,
                 sortagrad: int=0,
batch_size: int=0,
maxlen_in: float=float('inf'),
maxlen_out: float=float('inf'),
minibatches: int=0,
mini_batch_size: int=1,
batch_count: str='auto',
batch_bins: int=0,
batch_frames_in: int=0,
batch_frames_out: int=0,
batch_frames_inout: int=0,
preprocess_conf=None,
n_iter_processes: int=1,
subsampling_factor: int=1,
num_encs: int=1):
self.json_file = json_file
self.train_mode = train_mode
self.use_sortagrad = sortagrad == -1 or sortagrad > 0
self.batch_size = batch_size
self.maxlen_in = maxlen_in
self.maxlen_out = maxlen_out
self.batch_count = batch_count
self.batch_bins = batch_bins
self.batch_frames_in = batch_frames_in
self.batch_frames_out = batch_frames_out
self.batch_frames_inout = batch_frames_inout
self.subsampling_factor = subsampling_factor
self.num_encs = num_encs
self.preprocess_conf = preprocess_conf
self.n_iter_processes = n_iter_processes
# read json data
self.data_json = read_manifest(json_file)
self.feat_dim, self.vocab_size = feat_dim_and_vocab_size(
self.data_json, mode='asr')
# make minibatch list (variable length)
        self.minibatches = make_batchset(
self.data_json,
batch_size,
maxlen_in,
maxlen_out,
minibatches, # for debug
min_batch_size=mini_batch_size,
shortest_first=self.use_sortagrad,
count=batch_count,
batch_bins=batch_bins,
batch_frames_in=batch_frames_in,
batch_frames_out=batch_frames_out,
batch_frames_inout=batch_frames_inout,
iaxis=0,
oaxis=0, )
# data reader
self.reader = LoadInputsAndTargets(
mode="asr",
load_output=True,
preprocess_conf=preprocess_conf,
preprocess_args={"train":
train_mode}, # Switch the mode of preprocessing
)
# Setup a converter
if num_encs == 1:
self.converter = CustomConverter(
subsampling_factor=subsampling_factor, dtype=np.float32)
else:
            raise NotImplementedError("not impl CustomConverterMulEnc.")
        # hack to make batchsize argument as 1
        # actual batchsize is included in a list
        # default collate function converts numpy array to paddle tensor
        # we use an empty collate function instead which returns list
self.dataset = TransformDataset(
            self.minibatches,
lambda data: self.converter([self.reader(data, return_uttid=True)]))
self.dataloader = DataLoader(
dataset=self.dataset,
batch_size=1,
shuffle=not self.use_sortagrad if train_mode else False,
collate_fn=lambda x: x[0],
num_workers=n_iter_processes, )
def __repr__(self):
echo = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> "
echo += f"train_mode: {self.train_mode}, "
echo += f"sortagrad: {self.use_sortagrad}, "
echo += f"batch_size: {self.batch_size}, "
echo += f"maxlen_in: {self.maxlen_in}, "
echo += f"maxlen_out: {self.maxlen_out}, "
echo += f"batch_count: {self.batch_count}, "
echo += f"batch_bins: {self.batch_bins}, "
echo += f"batch_frames_in: {self.batch_frames_in}, "
echo += f"batch_frames_out: {self.batch_frames_out}, "
echo += f"batch_frames_inout: {self.batch_frames_inout}, "
echo += f"subsampling_factor: {self.subsampling_factor}, "
echo += f"num_encs: {self.num_encs}, "
echo += f"num_workers: {self.n_iter_processes}, "
echo += f"file: {self.json_file}"
return echo
def __len__(self):
return len(self.dataloader)
def __iter__(self):
return self.dataloader.__iter__()
def __call__(self):
return self.__iter__()
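A hedged construction sketch for BatchDataLoader; the manifest path is hypothetical, and only sequence counting (non-zero batch_size) is exercised:
loader = BatchDataLoader(
    json_file="data/manifest.train",  # hypothetical path
    train_mode=True,
    batch_size=32,  # non-zero, so batch_count is auto-detected as "seq"
    n_iter_processes=2)
for utts, xs_pad, ilens, ys_pad, olens in loader:
    # padded features (B, Tmax, D) and targets (B, Umax) ready for the model
    break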
......@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from deepspeech.frontend.utility import read_manifest
from deepspeech.utils.log import Log
__all__ = ["ManifestDataset", "TripletManifestDataset"]
__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
logger = Log(__name__).getlog()
......@@ -76,12 +76,18 @@ class ManifestDataset(Dataset):
Args:
            manifest_path (str): manifest json file path
max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0.
min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0.
max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0.
min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05.
            max_input_len (float, optional): maximum input seq length,
                in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
            min_input_len (float, optional): minimum input seq length,
                in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
            max_output_len (float, optional): maximum output seq length,
                in modeling units. Defaults to 500.0.
            min_output_len (float, optional): minimum output seq length,
                in modeling units. Defaults to 0.0.
            max_output_input_ratio (float, optional): maximum output seq length/input seq length ratio.
                Defaults to 10.0.
            min_output_input_ratio (float, optional): minimum output seq length/input seq length ratio.
                Defaults to 0.05.
"""
super().__init__()
......@@ -116,3 +122,27 @@ class TripletManifestDataset(ManifestDataset):
instance = self._manifest[idx]
return instance["utt"], instance["feat"], instance["text"], instance[
"text1"]
class TransformDataset(Dataset):
"""Transform Dataset.
Args:
data: list object from make_batchset
        transform: transform function
"""
def __init__(self, data, transform):
"""Init function."""
super().__init__()
self.data = data
self.transform = transform
def __len__(self):
"""Len function."""
return len(self.data)
def __getitem__(self, idx):
"""[] operator."""
return self.transform(self.data[idx])
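TransformDataset simply defers work to the supplied callable, so batching and conversion happen lazily per index; a tiny hedged sketch:
ds = TransformDataset([1, 2, 3], lambda x: x * 10)
assert len(ds) == 3 and ds[1] == 20  # transform applied on access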
......@@ -17,11 +17,16 @@ import numpy as np
from deepspeech.utils.log import Log
__all__ = ["pad_sequence"]
__all__ = ["pad_list", "pad_sequence"]
logger = Log(__name__).getlog()
def pad_list(sequences: List[np.ndarray],
padding_value: float=0.0) -> np.ndarray:
return pad_sequence(sequences, True, padding_value)
def pad_sequence(sequences: List[np.ndarray],
batch_first: bool=True,
padding_value: float=0.0) -> np.ndarray:
......
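A hedged sketch of the new pad_list helper, which is just batch-first pad_sequence under another name:
import numpy as np
a, b = np.ones((3, 2)), np.ones((5, 2))
padded = pad_list([a, b], padding_value=0.0)
# padded.shape == (2, 5, 2); the shorter array is zero-padded along axis 0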
......@@ -297,7 +297,7 @@ class RNNStack(nn.Layer):
share_weights=share_rnn_weights))
i_size = h_size * 2
self.rnn_stacks = nn.ModuleList(rnn_stacks)
self.rnn_stacks = nn.LayerList(rnn_stacks)
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
"""
......
......@@ -54,7 +54,7 @@ __all__ = ["U2Model", "U2InferModel"]
logger = Log(__name__).getlog()
class U2BaseModel(nn.Module):
class U2BaseModel(nn.Layer):
"""CTC-Attention hybrid Encoder-Decoder model"""
@classmethod
......@@ -612,32 +612,32 @@ class U2BaseModel(nn.Module):
best_index = i
return hyps[best_index][0]
#@jit.export
#@jit.to_static
def subsampling_rate(self) -> int:
""" Export interface for c++ call, return subsampling_rate of the
model
"""
return self.encoder.embed.subsampling_rate
#@jit.export
#@jit.to_static
def right_context(self) -> int:
""" Export interface for c++ call, return right_context of the model
"""
return self.encoder.embed.right_context
#@jit.export
#@jit.to_static
def sos_symbol(self) -> int:
""" Export interface for c++ call, return sos symbol id of the model
"""
return self.sos
#@jit.export
#@jit.to_static
def eos_symbol(self) -> int:
""" Export interface for c++ call, return eos symbol id of the model
"""
return self.eos
@jit.export
@jit.to_static
def forward_encoder_chunk(
self,
xs: paddle.Tensor,
......@@ -667,7 +667,7 @@ class U2BaseModel(nn.Module):
xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache)
# @jit.export([
# @jit.to_static([
# paddle.static.InputSpec(shape=[1, None, feat_dim],dtype='float32'), # audio feat, [B,T,D]
# ])
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
......@@ -680,7 +680,7 @@ class U2BaseModel(nn.Module):
"""
return self.ctc.log_softmax(xs)
@jit.export
@jit.to_static
def forward_attention_decoder(
self,
hyps: paddle.Tensor,
......
......@@ -48,7 +48,7 @@ __all__ = ["U2STModel", "U2STInferModel"]
logger = Log(__name__).getlog()
class U2STBaseModel(nn.Module):
class U2STBaseModel(nn.Layer):
"""CTC-Attention hybrid Encoder-Decoder model"""
@classmethod
......@@ -417,32 +417,32 @@ class U2STBaseModel(nn.Module):
best_hyps = best_hyps[:, 1:]
return best_hyps
@jit.export
@jit.to_static
def subsampling_rate(self) -> int:
""" Export interface for c++ call, return subsampling_rate of the
model
"""
return self.encoder.embed.subsampling_rate
@jit.export
@jit.to_static
def right_context(self) -> int:
""" Export interface for c++ call, return right_context of the model
"""
return self.encoder.embed.right_context
@jit.export
@jit.to_static
def sos_symbol(self) -> int:
""" Export interface for c++ call, return sos symbol id of the model
"""
return self.sos
@jit.export
@jit.to_static
def eos_symbol(self) -> int:
""" Export interface for c++ call, return eos symbol id of the model
"""
return self.eos
@jit.export
@jit.to_static
def forward_encoder_chunk(
self,
xs: paddle.Tensor,
......@@ -472,7 +472,7 @@ class U2STBaseModel(nn.Module):
xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache)
@jit.export
@jit.to_static
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
""" Export interface for c++ call, apply linear transform and log
softmax before ctc
......@@ -483,7 +483,7 @@ class U2STBaseModel(nn.Module):
"""
return self.ctc.log_softmax(xs)
@jit.export
@jit.to_static
def forward_attention_decoder(
self,
hyps: paddle.Tensor,
......
......@@ -69,7 +69,7 @@ class ConvGLUBlock(nn.Layer):
dim=0)
self.dropout_residual = nn.Dropout(p=dropout)
self.pad_left = ConstantPad2d((0, 0, kernel_size - 1, 0), 0)
        self.pad_left = nn.Pad2D((0, 0, kernel_size - 1, 0), value=0.)
layers = OrderedDict()
if bottlececk_dim == 0:
......
......@@ -33,7 +33,7 @@ logger = Log(__name__).getlog()
__all__ = ["TransformerDecoder"]
class TransformerDecoder(nn.Module):
class TransformerDecoder(nn.Layer):
"""Base class of Transfomer decoder module.
Args:
vocab_size: output dim
......@@ -86,7 +86,7 @@ class TransformerDecoder(nn.Module):
self.use_output_layer = use_output_layer
self.output_layer = nn.Linear(attention_dim, vocab_size)
self.decoders = nn.ModuleList([
self.decoders = nn.LayerList([
DecoderLayer(
size=attention_dim,
self_attn=MultiHeadedAttention(attention_heads, attention_dim,
......
......@@ -25,15 +25,15 @@ logger = Log(__name__).getlog()
__all__ = ["DecoderLayer"]
class DecoderLayer(nn.Module):
class DecoderLayer(nn.Layer):
"""Single decoder layer module.
Args:
size (int): Input dimension.
self_attn (nn.Module): Self-attention module instance.
self_attn (nn.Layer): Self-attention module instance.
`MultiHeadedAttention` instance can be used as the argument.
src_attn (nn.Module): Self-attention module instance.
src_attn (nn.Layer): Self-attention module instance.
`MultiHeadedAttention` instance can be used as the argument.
feed_forward (nn.Module): Feed-forward module instance.
feed_forward (nn.Layer): Feed-forward module instance.
`PositionwiseFeedForward` instance can be used as the argument.
dropout_rate (float): Dropout rate.
normalize_before (bool):
......@@ -48,9 +48,9 @@ class DecoderLayer(nn.Module):
def __init__(
self,
size: int,
self_attn: nn.Module,
src_attn: nn.Module,
feed_forward: nn.Module,
self_attn: nn.Layer,
src_attn: nn.Layer,
feed_forward: nn.Layer,
dropout_rate: float,
normalize_before: bool=True,
concat_after: bool=False, ):
......
......@@ -358,7 +358,7 @@ class TransformerEncoder(BaseEncoder):
pos_enc_layer_type, normalize_before, concat_after,
static_chunk_size, use_dynamic_chunk, global_cmvn,
use_dynamic_left_chunk)
self.encoders = nn.ModuleList([
self.encoders = nn.LayerList([
TransformerEncoderLayer(
size=output_size,
self_attn=MultiHeadedAttention(attention_heads, output_size,
......@@ -438,7 +438,7 @@ class ConformerEncoder(BaseEncoder):
convolution_layer_args = (output_size, cnn_module_kernel, activation,
cnn_module_norm, causal)
self.encoders = nn.ModuleList([
self.encoders = nn.LayerList([
ConformerEncoderLayer(
size=output_size,
self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
......
......@@ -48,7 +48,8 @@ class CTCLoss(nn.Layer):
logits = logits.transpose([1, 0, 2])
# (TODO:Hui Zhang) ctc loss does not support int64 labels
ys_pad = ys_pad.astype(paddle.int32)
loss = self.loss(logits, ys_pad, hlens, ys_lens)
loss = self.loss(
logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average)
if self.batch_average:
# Batch-size average
loss = loss / B
......
......@@ -297,7 +297,7 @@ class RNNStack(nn.Layer):
share_weights=share_rnn_weights))
i_size = h_size * 2
self.rnn_stacks = nn.ModuleList(rnn_stacks)
self.rnn_stacks = nn.LayerList(rnn_stacks)
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
"""
......
......@@ -47,18 +47,11 @@ def default_argument_parser():
# data and output
parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.")
parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.")
# parser.add_argument("--data", metavar="DATA_DIR", help="path to the datatset.")
parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.")
# load from saved checkpoint
parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
# save jit model to
parser.add_argument("--export_path", type=str, help="path of the jit model to save")
# save asr result to
parser.add_argument("--result_file", type=str, help="path of save the asr result")
# running
parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"],
help="device type to use, cpu and gpu are supported.")
......
# Featrues
# Features
### Speech Recognition
......
export MAIN_ROOT=${PWD}
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:/usr/local/bin:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
......
......@@ -26,7 +26,7 @@ python3 -u ${BIN_DIR}/test_export.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--result_file ${jit_model_export_path}.rsl \
--export_path ${jit_model_export_path} \
--model_type ${model_type}
......
......@@ -25,6 +25,7 @@ if [ ${seed} ]; then
fi
python3 -u ${BIN_DIR}/train.py \
--model-name u2_kaldi \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
......