diff --git a/deepspeech/frontend/augmentor/augmentation.py b/deepspeech/frontend/augmentor/augmentation.py index 17eba5b5248d79941e088fa9604ccbaba9ea9999..d2316ab17ef6af9d1706b27431d45a36d1a8cb1e 100644 --- a/deepspeech/frontend/augmentor/augmentation.py +++ b/deepspeech/frontend/augmentor/augmentation.py @@ -227,4 +227,4 @@ class AugmentationPipeline(): obj = class_obj(self._rng, **params) except Exception: raise ValueError("Unknown augmentor type [%s]." % augmentor_type) - return obj \ No newline at end of file + return obj diff --git a/deepspeech/io/reader.py b/deepspeech/io/reader.py index f65120536b8a4c24e7cbc11fd81cbd6c61b87350..590987522e6025839ecfa995b2fc013ffdd8fa8c 100644 --- a/deepspeech/io/reader.py +++ b/deepspeech/io/reader.py @@ -24,6 +24,7 @@ __all__ = ["LoadInputsAndTargets"] logger = Log(__name__).getlog() + class LoadInputsAndTargets(): """Create a mini-batch from a list of dicts diff --git a/deepspeech/models/lm/transformer.py b/deepspeech/models/lm/transformer.py index 72082e522deb9de86268098c11e307165c1407a4..b5f7580a20277fd483d17c22735cd2d65455181c 100644 --- a/deepspeech/models/lm/transformer.py +++ b/deepspeech/models/lm/transformer.py @@ -24,11 +24,11 @@ from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface from deepspeech.models.lm_interface import LMInterface from deepspeech.modules.encoder import TransformerEncoder from deepspeech.modules.mask import subsequent_mask - from deepspeech.utils.log import Log logger = Log(__name__).getlog() + class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface): def __init__( self, diff --git a/deepspeech/modules/embedding.py b/deepspeech/modules/embedding.py index e154e434f866362f02b45c7bca65578ce2de6966..64d594c29f875da931bd51572daaf1d4a871d886 100644 --- a/deepspeech/modules/embedding.py +++ b/deepspeech/modules/embedding.py @@ -23,12 +23,14 @@ from deepspeech.utils.log import Log logger = Log(__name__).getlog() __all__ = [ - "PositionalEncodingInterface", "NoPositionalEncoding", "PositionalEncoding", "RelPositionalEncoding" + "PositionalEncodingInterface", "NoPositionalEncoding", "PositionalEncoding", + "RelPositionalEncoding" ] -class PositionalEncodingInterface: - def forward(self, x:paddle.Tensor, offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]: +class PositionalEncodingInterface: + def forward(self, x: paddle.Tensor, + offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute positional encoding. Args: x (paddle.Tensor): Input tensor (batch, time, `*`). @@ -37,8 +39,8 @@ class PositionalEncodingInterface: paddle.Tensor: Positional embedding tensor (1, time, `*`). """ raise NotImplementedError("forward method is not implemented") - - def position_encoding(self, offset:int, size:int) -> paddle.Tensor: + + def position_encoding(self, offset: int, size: int) -> paddle.Tensor: """ For getting encoding in a streaming fashion Args: offset (int): start offset diff --git a/deepspeech/modules/encoder.py b/deepspeech/modules/encoder.py index 7941177123314ba0daee77bf63e95ada41500f4e..435b68949da47cb89eb206d472f9d0bc06410bfe 100644 --- a/deepspeech/modules/encoder.py +++ b/deepspeech/modules/encoder.py @@ -32,7 +32,6 @@ from deepspeech.modules.encoder_layer import TransformerEncoderLayer from deepspeech.modules.mask import add_optional_chunk_mask from deepspeech.modules.mask import make_non_pad_mask from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward -from deepspeech.modules.subsampling import Conv2dSubsampling from deepspeech.modules.subsampling import Conv2dSubsampling4 from deepspeech.modules.subsampling import Conv2dSubsampling6 from deepspeech.modules.subsampling import Conv2dSubsampling8 @@ -394,13 +393,8 @@ class TransformerEncoder(BaseEncoder): if self.global_cmvn is not None: xs = self.global_cmvn(xs) - if isinstance(self.embed, Conv2dSubsampling): - #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor - xs, pos_emb, masks = self.embed( - xs, masks.astype(xs.dtype), offset=0) - else: - xs, pos_emb, masks = self.embed( - xs, masks.astype(xs.dtype), offset=0) + #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor + xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0) #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor masks = masks.astype(paddle.bool) diff --git a/deepspeech/transform/__init__.py b/deepspeech/transform/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..185a92b8d94d3426d616c0624f0f2ee04339349e 100644 --- a/deepspeech/transform/__init__.py +++ b/deepspeech/transform/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/deepspeech/transform/cmvn.py b/deepspeech/transform/cmvn.py index 094882297f81b40114ed4faec671f56f71d54e13..5d3185906f65b507ec6afcbec2ca4b2767b83164 100644 --- a/deepspeech/transform/cmvn.py +++ b/deepspeech/transform/cmvn.py @@ -1,3 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import io import h5py @@ -9,16 +22,15 @@ class CMVN(): "Apply Global/Spk CMVN/iverserCMVN." def __init__( - self, - stats, - norm_means=True, - norm_vars=False, - filetype="mat", - utt2spk=None, - spk2utt=None, - reverse=False, - std_floor=1.0e-20, - ): + self, + stats, + norm_means=True, + norm_vars=False, + filetype="mat", + utt2spk=None, + spk2utt=None, + reverse=False, + std_floor=1.0e-20, ): self.stats_file = stats self.norm_means = norm_means self.norm_vars = norm_vars @@ -84,17 +96,14 @@ class CMVN(): self.scale[spk] = 1 / std def __repr__(self): - return ( - "{name}(stats_file={stats_file}, " - "norm_means={norm_means}, norm_vars={norm_vars}, " - "reverse={reverse})".format( - name=self.__class__.__name__, - stats_file=self.stats_file, - norm_means=self.norm_means, - norm_vars=self.norm_vars, - reverse=self.reverse, - ) - ) + return ("{name}(stats_file={stats_file}, " + "norm_means={norm_means}, norm_vars={norm_vars}, " + "reverse={reverse})".format( + name=self.__class__.__name__, + stats_file=self.stats_file, + norm_means=self.norm_means, + norm_vars=self.norm_vars, + reverse=self.reverse, )) def __call__(self, x, uttid=None): if self.utt2spk is not None: @@ -121,6 +130,7 @@ class CMVN(): class UtteranceCMVN(): "Apply Utterance CMVN" + def __init__(self, norm_means=True, norm_vars=False, std_floor=1.0e-20): self.norm_means = norm_means self.norm_vars = norm_vars @@ -130,20 +140,19 @@ class UtteranceCMVN(): return "{name}(norm_means={norm_means}, norm_vars={norm_vars})".format( name=self.__class__.__name__, norm_means=self.norm_means, - norm_vars=self.norm_vars, - ) + norm_vars=self.norm_vars, ) def __call__(self, x, uttid=None): # x: [Time, Dim] - square_sums = (x ** 2).sum(axis=0) + square_sums = (x**2).sum(axis=0) mean = x.mean(axis=0) if self.norm_means: x = np.subtract(x, mean) if self.norm_vars: - var = square_sums / x.shape[0] - mean ** 2 + var = square_sums / x.shape[0] - mean**2 std = np.maximum(np.sqrt(var), self.std_floor) x = np.divide(x, std) - return x \ No newline at end of file + return x diff --git a/deepspeech/utils/cli_readers.py b/deepspeech/utils/cli_readers.py index d744c0d390014d9d006e4b35ea2d9549b1201e31..72aa2bdb73dd9747eccf762c90d7fb34ea4d1997 100644 --- a/deepspeech/utils/cli_readers.py +++ b/deepspeech/utils/cli_readers.py @@ -1,3 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import io import logging import sys @@ -10,11 +23,10 @@ from deepspeech.io.reader import SoundHDF5File def file_reader_helper( - rspecifier: str, - filetype: str = "mat", - return_shape: bool = False, - segments: str = None, -): + rspecifier: str, + filetype: str="mat", + return_shape: bool=False, + segments: str=None, ): """Read uttid and array in kaldi style This function might be a bit confusing as "ark" is used @@ -44,7 +56,8 @@ def file_reader_helper( """ if filetype == "mat": - return KaldiReader(rspecifier, return_shape=return_shape, segments=segments) + return KaldiReader( + rspecifier, return_shape=return_shape, segments=segments) elif filetype == "hdf5": return HDF5Reader(rspecifier, return_shape=return_shape) elif filetype == "sound.hdf5": @@ -62,7 +75,8 @@ class KaldiReader: self.segments = segments def __iter__(self): - with kaldiio.ReadHelper(self.rspecifier, segments=self.segments) as reader: + with kaldiio.ReadHelper( + self.rspecifier, segments=self.segments) as reader: for key, array in reader: if self.return_shape: array = array.shape @@ -72,9 +86,8 @@ class KaldiReader: class HDF5Reader: def __init__(self, rspecifier, return_shape=False): if ":" not in rspecifier: - raise ValueError( - 'Give "rspecifier" such as "ark:some.ark: {}"'.format(self.rspecifier) - ) + raise ValueError('Give "rspecifier" such as "ark:some.ark: {}"'. + format(self.rspecifier)) self.rspecifier = rspecifier self.ark_or_scp, self.filepath = self.rspecifier.split(":", 1) if self.ark_or_scp not in ["ark", "scp"]: @@ -93,9 +106,7 @@ class HDF5Reader: raise RuntimeError( "scp file for hdf5 should be like: " '"uttid filepath.h5:key": {}({})'.format( - line, self.filepath - ) - ) + line, self.filepath)) path, h5_key = value.split(":", 1) hdf5_file = hdf5_dict.get(path) @@ -110,9 +121,8 @@ class HDF5Reader: try: data = hdf5_file[h5_key] except Exception: - logging.error( - "Error when loading {} with key={}".format(path, h5_key) - ) + logging.error("Error when loading {} with key={}". + format(path, h5_key)) raise if self.return_shape: @@ -144,9 +154,8 @@ class HDF5Reader: class SoundHDF5Reader: def __init__(self, rspecifier, return_shape=False): if ":" not in rspecifier: - raise ValueError( - 'Give "rspecifier" such as "ark:some.ark: {}"'.format(rspecifier) - ) + raise ValueError('Give "rspecifier" such as "ark:some.ark: {}"'. + format(rspecifier)) self.ark_or_scp, self.filepath = rspecifier.split(":", 1) if self.ark_or_scp not in ["ark", "scp"]: raise ValueError(f"Must be scp or ark: {self.ark_or_scp}") @@ -163,9 +172,7 @@ class SoundHDF5Reader: raise RuntimeError( "scp file for hdf5 should be like: " '"uttid filepath.h5:key": {}({})'.format( - line, self.filepath - ) - ) + line, self.filepath)) path, h5_key = value.split(":", 1) hdf5_file = hdf5_dict.get(path) @@ -180,9 +187,8 @@ class SoundHDF5Reader: try: data = hdf5_file[h5_key] except Exception: - logging.error( - "Error when loading {} with key={}".format(path, h5_key) - ) + logging.error("Error when loading {} with key={}". + format(path, h5_key)) raise # Change Tuple[ndarray, int] -> Tuple[int, ndarray] @@ -214,14 +220,12 @@ class SoundHDF5Reader: class SoundReader: def __init__(self, rspecifier, return_shape=False): if ":" not in rspecifier: - raise ValueError( - 'Give "rspecifier" such as "scp:some.scp: {}"'.format(rspecifier) - ) + raise ValueError('Give "rspecifier" such as "scp:some.scp: {}"'. + format(rspecifier)) self.ark_or_scp, self.filepath = rspecifier.split(":", 1) if self.ark_or_scp != "scp": - raise ValueError( - 'Only supporting "scp" for sound file: {}'.format(self.ark_or_scp) - ) + raise ValueError('Only supporting "scp" for sound file: {}'.format( + self.ark_or_scp)) self.return_shape = return_shape def __iter__(self): diff --git a/deepspeech/utils/cli_utils.py b/deepspeech/utils/cli_utils.py index c4a4cd15b72f832d9118aa7a7377a13de16c329b..f8e1d60bfe605001410869124b307f3d149fda68 100644 --- a/deepspeech/utils/cli_utils.py +++ b/deepspeech/utils/cli_utils.py @@ -1,6 +1,19 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys from collections.abc import Sequence from distutils.util import strtobool as dist_strtobool -import sys import numpy @@ -36,10 +49,9 @@ def get_commandline_args(): # Escape the extra characters for shell argv = [ - arg.replace("'", "'\\''") - if all(char not in arg for char in extra_chars) - else "'" + arg.replace("'", "'\\''") + "'" - for arg in sys.argv + arg.replace("'", "'\\''") if all(char not in arg + for char in extra_chars) else + "'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv ] return sys.executable + " " + " ".join(argv) @@ -47,19 +59,12 @@ def get_commandline_args(): def is_scipy_wav_style(value): # If Tuple[int, numpy.ndarray] or not - return ( - isinstance(value, Sequence) - and len(value) == 2 - and isinstance(value[0], int) - and isinstance(value[1], numpy.ndarray) - ) + return (isinstance(value, Sequence) and len(value) == 2 and + isinstance(value[0], int) and isinstance(value[1], numpy.ndarray)) def assert_scipy_wav_style(value): assert is_scipy_wav_style( - value - ), "Must be Tuple[int, numpy.ndarray], but got {}".format( - type(value) - if not isinstance(value, Sequence) - else "{}[{}]".format(type(value), ", ".join(str(type(v)) for v in value)) - ) + value), "Must be Tuple[int, numpy.ndarray], but got {}".format( + type(value) if not isinstance(value, Sequence) else "{}[{}]".format( + type(value), ", ".join(str(type(v)) for v in value))) diff --git a/deepspeech/utils/cli_writers.py b/deepspeech/utils/cli_writers.py index 41e667d37a9b15924db3d7688ead9c0082027c4c..e07371934f346d5899447bdf2178f94f992200f4 100644 --- a/deepspeech/utils/cli_writers.py +++ b/deepspeech/utils/cli_writers.py @@ -1,3 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from pathlib import Path from typing import Dict @@ -6,18 +19,17 @@ import kaldiio import numpy import soundfile -from deepspeech.utils.cli_utils import assert_scipy_wav_style from deepspeech.io.reader import SoundHDF5File +from deepspeech.utils.cli_utils import assert_scipy_wav_style def file_writer_helper( - wspecifier: str, - filetype: str = "mat", - write_num_frames: str = None, - compress: bool = False, - compression_method: int = 2, - pcm_format: str = "wav", -): + wspecifier: str, + filetype: str="mat", + write_num_frames: str=None, + compress: bool=False, + compression_method: int=2, + pcm_format: str="wav", ): """Write matrices in kaldi style Args: @@ -61,20 +73,20 @@ def file_writer_helper( wspecifier, write_num_frames=write_num_frames, compress=compress, - compression_method=compression_method, - ) + compression_method=compression_method, ) elif filetype == "hdf5": return HDF5Writer( - wspecifier, write_num_frames=write_num_frames, compress=compress - ) + wspecifier, write_num_frames=write_num_frames, compress=compress) elif filetype == "sound.hdf5": return SoundHDF5Writer( - wspecifier, write_num_frames=write_num_frames, pcm_format=pcm_format - ) + wspecifier, + write_num_frames=write_num_frames, + pcm_format=pcm_format) elif filetype == "sound": return SoundWriter( - wspecifier, write_num_frames=write_num_frames, pcm_format=pcm_format - ) + wspecifier, + write_num_frames=write_num_frames, + pcm_format=pcm_format) else: raise NotImplementedError(f"filetype={filetype}") @@ -116,29 +128,27 @@ def get_num_frames_writer(write_num_frames: str): """ if write_num_frames is not None: if ":" not in write_num_frames: - raise ValueError( - 'Must include ":", write_num_frames={}'.format(write_num_frames) - ) + raise ValueError('Must include ":", write_num_frames={}'.format( + write_num_frames)) nframes_type, nframes_file = write_num_frames.split(":", 1) if nframes_type != "ark,t": - raise ValueError( - "Only supporting text mode. " - "e.g. --write-num-frames=ark,t:foo.txt :" - "{}".format(nframes_type) - ) + raise ValueError("Only supporting text mode. " + "e.g. --write-num-frames=ark,t:foo.txt :" + "{}".format(nframes_type)) return open(nframes_file, "w", encoding="utf-8") class KaldiWriter(BaseWriter): - def __init__( - self, wspecifier, write_num_frames=None, compress=False, compression_method=2 - ): + def __init__(self, + wspecifier, + write_num_frames=None, + compress=False, + compression_method=2): if compress: self.writer = kaldiio.WriteHelper( - wspecifier, compression_method=compression_method - ) + wspecifier, compression_method=compression_method) else: self.writer = kaldiio.WriteHelper(wspecifier) self.writer_scp = None @@ -220,7 +230,8 @@ class SoundHDF5Writer(BaseWriter): self.pcm_format = pcm_format spec_dict = parse_wspecifier(wspecifier) self.filename = spec_dict["ark"] - self.writer = SoundHDF5File(spec_dict["ark"], "w", format=self.pcm_format) + self.writer = SoundHDF5File( + spec_dict["ark"], "w", format=self.pcm_format) if "scp" in spec_dict: self.writer_scp = open(spec_dict["scp"], "w", encoding="utf-8") else: diff --git a/requirements.txt b/requirements.txt index a7310a0245245beb4a0930c28537ae4b61bdd59a..d654ef3d87d3b5769359a3aff8490e3316446620 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,6 +23,7 @@ praatio~=4.1 pre-commit pybind11 pypinyin +python-dateutil pyworld resampy==0.2.2 sacrebleu diff --git a/utils/apply-cmvn.py b/utils/apply-cmvn.py index 2b6631c24b6922c65828221c3f7444a6d7c5c560..f80053fbefb2895c7eb45097be650a9b7165d52f 100755 --- a/utils/apply-cmvn.py +++ b/utils/apply-cmvn.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import argparse -from distutils.util import strtobool import logging +from distutils.util import strtobool import kaldiio import numpy @@ -16,86 +16,81 @@ from deepspeech.utils.cli_writers import file_writer_helper def get_parser(): parser = argparse.ArgumentParser( description="apply mean-variance normalization to files", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") + parser.add_argument( + "--verbose", "-V", default=0, type=int, help="Verbose option") parser.add_argument( "--in-filetype", type=str, default="mat", choices=["mat", "hdf5", "sound.hdf5", "sound"], help="Specify the file format for the rspecifier. " - '"mat" is the matrix format in kaldi', - ) + '"mat" is the matrix format in kaldi', ) parser.add_argument( "--stats-filetype", type=str, default="mat", choices=["mat", "hdf5", "npy"], help="Specify the file format for the rspecifier. " - '"mat" is the matrix format in kaldi', - ) + '"mat" is the matrix format in kaldi', ) parser.add_argument( "--out-filetype", type=str, default="mat", choices=["mat", "hdf5"], help="Specify the file format for the wspecifier. " - '"mat" is the matrix format in kaldi', - ) + '"mat" is the matrix format in kaldi', ) parser.add_argument( "--norm-means", type=strtobool, default=True, - help="Do variance normalization or not.", - ) + help="Do variance normalization or not.", ) parser.add_argument( "--norm-vars", type=strtobool, default=False, - help="Do variance normalization or not.", - ) + help="Do variance normalization or not.", ) parser.add_argument( - "--reverse", type=strtobool, default=False, help="Do reverse mode or not" - ) + "--reverse", + type=strtobool, + default=False, + help="Do reverse mode or not") parser.add_argument( "--spk2utt", type=str, help="A text file of speaker to utterance-list map. " "(Don't give rspecifier format, such as " - '"ark:spk2utt")', - ) + '"ark:spk2utt")', ) parser.add_argument( "--utt2spk", type=str, help="A text file of utterance to speaker map. " "(Don't give rspecifier format, such as " - '"ark:utt2spk")', - ) + '"ark:utt2spk")', ) parser.add_argument( - "--write-num-frames", type=str, help="Specify wspecifer for utt2num_frames" - ) + "--write-num-frames", + type=str, + help="Specify wspecifer for utt2num_frames") parser.add_argument( - "--compress", type=strtobool, default=False, help="Save in compressed format" - ) + "--compress", + type=strtobool, + default=False, + help="Save in compressed format") parser.add_argument( "--compression-method", type=int, default=2, - help="Specify the method(if mat) or " "gzip-level(if hdf5)", - ) + help="Specify the method(if mat) or " + "gzip-level(if hdf5)", ) parser.add_argument( "stats_rspecifier_or_rxfilename", - help="Input stats. e.g. ark:stats.ark or stats.mat", - ) + help="Input stats. e.g. ark:stats.ark or stats.mat", ) parser.add_argument( - "rspecifier", type=str, help="Read specifier id. e.g. ark:some.ark" - ) + "rspecifier", type=str, help="Read specifier id. e.g. ark:some.ark") parser.add_argument( - "wspecifier", type=str, help="Write specifier id. e.g. ark:some.ark" - ) + "wspecifier", type=str, help="Write specifier id. e.g. ark:some.ark") return parser @@ -118,8 +113,8 @@ def main(): stats_filetype = args.stats_filetype stats_dict = dict( - file_reader_helper(args.stats_rspecifier_or_rxfilename, stats_filetype) - ) + file_reader_helper(args.stats_rspecifier_or_rxfilename, + stats_filetype)) else: is_rspcifier = False if args.stats_filetype == "mat": @@ -134,16 +129,14 @@ def main(): norm_vars=args.norm_vars, utt2spk=args.utt2spk, spk2utt=args.spk2utt, - reverse=args.reverse, - ) + reverse=args.reverse, ) with file_writer_helper( - args.wspecifier, - filetype=args.out_filetype, - write_num_frames=args.write_num_frames, - compress=args.compress, - compression_method=args.compression_method, - ) as writer: + args.wspecifier, + filetype=args.out_filetype, + write_num_frames=args.write_num_frames, + compress=args.compress, + compression_method=args.compression_method, ) as writer: for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype): if is_scipy_wav_style(mat): # If data is sound file, then got as Tuple[int, ndarray] diff --git a/utils/caculate_rtf.py b/utils/caculate_rtf.py index 6be8dffd8eb80f84635453cf29effae9d356328b..fcc155edc3251962d57dd1c0cd77a8ce76f8d696 100755 --- a/utils/caculate_rtf.py +++ b/utils/caculate_rtf.py @@ -1,24 +1,23 @@ #!/usr/bin/env python3 # encoding: utf-8 - # Copyright 2021 Kyoto University (Hirofumi Inaguma) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - import argparse import codecs -from dateutil import parser import glob import os +from dateutil import parser + def get_parser(): - parser = argparse.ArgumentParser(description="calculate real time factor (RTF)") + parser = argparse.ArgumentParser( + description="calculate real time factor (RTF)") parser.add_argument( "--log-dir", type=str, default=None, - help="path to logging directory", - ) + help="path to logging directory", ) return parser @@ -37,23 +36,21 @@ def main(): with codecs.open(x, "r", "utf-8") as f: for line in f: x = line.strip() - if "INFO: input lengths" in x: - audio_durations += [int(x.split("input lengths: ")[1])] - start_times += [parser.parse(x.split("(")[0])] - elif "INFO: prediction" in x: - end_times += [parser.parse(x.split("(")[0])] - assert len(audio_durations) == len(end_times), ( - len(audio_durations), - len(end_times), - ) - assert len(start_times) == len(end_times), (len(start_times), len(end_times)) + # 2021-10-25 08:22:04.052 | INFO | xxx:recog_v2:188 - feat: (1570, 83) + if "feat:" in x: + dur = int(x.split("(")[1].split(',')[0]) + audio_durations += [dur] + start_times += [parser.parse(x.split("|")[0])] + elif "total log probability:" in x: + end_times += [parser.parse(x.split("|")[0])] + assert len(audio_durations) == len(end_times), (len(audio_durations), + len(end_times), ) + assert len(start_times) == len(end_times), (len(start_times), + len(end_times)) + audio_sec += sum(audio_durations) / 100 # [sec] - decode_sec += sum( - [ - (end - start).total_seconds() - for start, end in zip(start_times, end_times) - ] - ) + decode_sec += sum([(end - start).total_seconds() + for start, end in zip(start_times, end_times)]) n_utt += len(audio_durations) print("Total audio duration: %.3f [sec]" % audio_sec) diff --git a/utils/compute-cmvn-stats.py b/utils/compute-cmvn-stats.py index d239d21de05871a7242ac6dc11c5c0a6275ee60a..706d8cd53b24cbe7c4a1243b416cee0bc0e2203e 100755 --- a/utils/compute-cmvn-stats.py +++ b/utils/compute-cmvn-stats.py @@ -19,44 +19,42 @@ def get_parser(): "If wspecifier provided: per-utterance by default, " "or per-speaker if" "spk2utt option provided; if wxfilename: global", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( "--spk2utt", type=str, help="A text file of speaker to utterance-list map. " "(Don't give rspecifier format, such as " - '"ark:utt2spk")', - ) - parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") + '"ark:utt2spk")', ) + parser.add_argument( + "--verbose", "-V", default=0, type=int, help="Verbose option") parser.add_argument( "--in-filetype", type=str, default="mat", choices=["mat", "hdf5", "sound.hdf5", "sound"], help="Specify the file format for the rspecifier. " - '"mat" is the matrix format in kaldi', - ) + '"mat" is the matrix format in kaldi', ) parser.add_argument( "--out-filetype", type=str, default="mat", choices=["mat", "hdf5", "npy"], help="Specify the file format for the wspecifier. " - '"mat" is the matrix format in kaldi', - ) + '"mat" is the matrix format in kaldi', ) parser.add_argument( "--preprocess-conf", type=str, default=None, - help="The configuration file for the pre-processing", - ) + help="The configuration file for the pre-processing", ) parser.add_argument( - "rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark" - ) + "rspecifier", + type=str, + help="Read specifier for feats. e.g. ark:some.ark") parser.add_argument( - "wspecifier_or_wxfilename", type=str, help="Write specifier. e.g. ark:some.ark" - ) + "wspecifier_or_wxfilename", + type=str, + help="Write specifier. e.g. ark:some.ark") return parser @@ -92,10 +90,8 @@ def main(): return x if args.out_filetype == "npy": - logging.warning( - "--out-filetype npy is allowed only for " - "Global CMVN mode, changing to hdf5" - ) + logging.warning("--out-filetype npy is allowed only for " + "Global CMVN mode, changing to hdf5") args.out_filetype = "hdf5" else: @@ -107,10 +103,8 @@ def main(): return None if args.out_filetype == "hdf5": - logging.warning( - "--out-filetype hdf5 is not allowed for " - "Global CMVN mode, changing to npy" - ) + logging.warning("--out-filetype hdf5 is not allowed for " + "Global CMVN mode, changing to npy") args.out_filetype = "npy" if args.preprocess_conf is not None: @@ -126,8 +120,7 @@ def main(): idx = 0 for idx, (utt, matrix) in enumerate( - file_reader_helper(args.rspecifier, args.in_filetype), 1 - ): + file_reader_helper(args.rspecifier, args.in_filetype), 1): if is_scipy_wav_style(matrix): # If data is sound file, then got as Tuple[int, ndarray] rate, matrix = matrix @@ -146,7 +139,7 @@ def main(): counts[spk] += matrix.shape[0] sum_feats[spk] += matrix.sum(axis=0) - square_sum_feats[spk] += (matrix ** 2).sum(axis=0) + square_sum_feats[spk] += (matrix**2).sum(axis=0) logging.info("Processed {} utterances".format(idx)) assert idx > 0, idx @@ -171,8 +164,8 @@ def main(): # Per utterance or speaker CMVN if is_wspecifier: with file_writer_helper( - args.wspecifier_or_wxfilename, filetype=args.out_filetype - ) as writer: + args.wspecifier_or_wxfilename, + filetype=args.out_filetype) as writer: for spk, mat in cmvn_stats.items(): writer[spk] = mat @@ -186,8 +179,7 @@ def main(): kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix) else: raise RuntimeError( - "Not supporting: --out-filetype {}".format(args.out_filetype) - ) + "Not supporting: --out-filetype {}".format(args.out_filetype)) if __name__ == "__main__": diff --git a/utils/copy-feats.py b/utils/copy-feats.py index 13c6b5433984183371d2c27d3a281c993f4b649f..7d1b858969b7016714f1de0f131947e161a0b506 100755 --- a/utils/copy-feats.py +++ b/utils/copy-feats.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import argparse -from distutils.util import strtobool import logging +from distutils.util import strtobool from deepspeech.transform.transformation import Transformation from deepspeech.utils.cli_readers import file_reader_helper @@ -13,50 +13,50 @@ from deepspeech.utils.cli_writers import file_writer_helper def get_parser(): parser = argparse.ArgumentParser( description="copy feature with preprocessing", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") + parser.add_argument( + "--verbose", "-V", default=0, type=int, help="Verbose option") parser.add_argument( "--in-filetype", type=str, default="mat", choices=["mat", "hdf5", "sound.hdf5", "sound"], help="Specify the file format for the rspecifier. " - '"mat" is the matrix format in kaldi', - ) + '"mat" is the matrix format in kaldi', ) parser.add_argument( "--out-filetype", type=str, default="mat", choices=["mat", "hdf5", "sound.hdf5", "sound"], help="Specify the file format for the wspecifier. " - '"mat" is the matrix format in kaldi', - ) + '"mat" is the matrix format in kaldi', ) parser.add_argument( - "--write-num-frames", type=str, help="Specify wspecifer for utt2num_frames" - ) + "--write-num-frames", + type=str, + help="Specify wspecifer for utt2num_frames") parser.add_argument( - "--compress", type=strtobool, default=False, help="Save in compressed format" - ) + "--compress", + type=strtobool, + default=False, + help="Save in compressed format") parser.add_argument( "--compression-method", type=int, default=2, - help="Specify the method(if mat) or " "gzip-level(if hdf5)", - ) + help="Specify the method(if mat) or " + "gzip-level(if hdf5)", ) parser.add_argument( "--preprocess-conf", type=str, default=None, - help="The configuration file for the pre-processing", - ) + help="The configuration file for the pre-processing", ) parser.add_argument( - "rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark" - ) + "rspecifier", + type=str, + help="Read specifier for feats. e.g. ark:some.ark") parser.add_argument( - "wspecifier", type=str, help="Write specifier. e.g. ark:some.ark" - ) + "wspecifier", type=str, help="Write specifier. e.g. ark:some.ark") return parser @@ -79,12 +79,11 @@ def main(): preprocessing = None with file_writer_helper( - args.wspecifier, - filetype=args.out_filetype, - write_num_frames=args.write_num_frames, - compress=args.compress, - compression_method=args.compression_method, - ) as writer: + args.wspecifier, + filetype=args.out_filetype, + write_num_frames=args.write_num_frames, + compress=args.compress, + compression_method=args.compression_method, ) as writer: for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype): if is_scipy_wav_style(mat): # If data is sound file, then got as Tuple[int, ndarray] diff --git a/utils/data2json.py b/utils/data2json.sh similarity index 100% rename from utils/data2json.py rename to utils/data2json.sh diff --git a/utils/merge_scp2json.py b/utils/merge_scp2json.py index 02d912a55bbff2bcdb5d89f05bdb73017d849fb6..b724a7dd988a9726da33808f69121a46a8a3f5c1 100755 --- a/utils/merge_scp2json.py +++ b/utils/merge_scp2json.py @@ -1,14 +1,12 @@ #!/usr/bin/env python3 # encoding: utf-8 - - import argparse import codecs -from distutils.util import strtobool -from io import open import json import logging import sys +from distutils.util import strtobool +from io import open from deepspeech.utils.cli_utils import get_commandline_args @@ -47,45 +45,41 @@ def get_parser(): "--input-scps feat:data/feats2.scp shape:data/utt2feat2_shape:shape " "--output-scps text:data/text shape:data/utt2text_shape:shape " "--scps utt2spk:data/utt2spk".format(sys.argv[0]), - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( "--input-scps", type=str, nargs="*", action="append", default=[], - help="Json files for the inputs", - ) + help="Json files for the inputs", ) parser.add_argument( "--output-scps", type=str, nargs="*", action="append", default=[], - help="Json files for the outputs", - ) + help="Json files for the outputs", ) parser.add_argument( "--scps", type=str, nargs="+", default=[], - help="The json files except for the input and outputs", - ) - parser.add_argument("--verbose", "-V", default=1, type=int, help="Verbose option") + help="The json files except for the input and outputs", ) + parser.add_argument( + "--verbose", "-V", default=1, type=int, help="Verbose option") parser.add_argument( "--allow-one-column", type=strtobool, default=False, help="Allow one column in input scp files. " - "In this case, the value will be empty string.", - ) + "In this case, the value will be empty string.", ) parser.add_argument( "--out", "-O", type=str, - help="The output filename. " "If omitted, then output to sys.stdout", - ) + help="The output filename. " + "If omitted, then output to sys.stdout", ) return parser @@ -128,37 +122,33 @@ if __name__ == "__main__": # e.g. type_func_str = "int" -> type_func = int type_func = eval(type_func_str) except Exception: - raise RuntimeError("Unknown type: {}".format(type_func_str)) + raise RuntimeError( + "Unknown type: {}".format(type_func_str)) if not callable(type_func): - raise RuntimeError("Unknown type: {}".format(type_func_str)) + raise RuntimeError( + "Unknown type: {}".format(type_func_str)) else: raise RuntimeError( "Format : " "or :: " "e.g. feat:data/feat.scp " - "or shape:data/feat.scp:shape: {}".format(key_scp) - ) + "or shape:data/feat.scp:shape: {}".format(key_scp)) for item in lis: if key == item[0]: - raise RuntimeError( - 'The key "{}" is duplicated: {} {}'.format( - key, item[3], key_scp - ) - ) + raise RuntimeError('The key "{}" is duplicated: {} {}'. + format(key, item[3], key_scp)) lis.append((key, scp, type_func, key_scp, type_func_str)) lis_list.append(lis) # Open scp files - input_fscps = [ - [open(i[1], "r", encoding="utf-8") for i in il] for il in input_infos - ] - output_fscps = [ - [open(i[1], "r", encoding="utf-8") for i in il] for il in output_infos - ] + input_fscps = [[open(i[1], "r", encoding="utf-8") for i in il] + for il in input_infos] + output_fscps = [[open(i[1], "r", encoding="utf-8") for i in il] + for il in output_infos] fscps = [[open(i[1], "r", encoding="utf-8") for i in il] for il in infos] # Note(kamo): What is done here? @@ -200,12 +190,10 @@ if __name__ == "__main__": if line == "" or first == "": if line != first: concat = sum(input_infos + output_infos + infos, []) - raise RuntimeError( - "The number of lines mismatch " - 'between: "{}" and "{}"'.format( - concat[0][1], concat[count][1] - ) - ) + raise RuntimeError("The number of lines mismatch " + 'between: "{}" and "{}"'.format( + concat[0][1], + concat[count][1])) elif line.split()[0] != first.split()[0]: concat = sum(input_infos + output_infos + infos, []) @@ -216,9 +204,7 @@ if __name__ == "__main__": concat[0][1], concat[count][1], first.rstrip(), - line.rstrip(), - ) - ) + line.rstrip(), )) count += 1 # The end of file @@ -237,7 +223,8 @@ if __name__ == "__main__": ]: lis = [] - for idx, (line_list, info_list) in enumerate(zip(_lines, _infos), 1): + for idx, (line_list, info_list) in enumerate( + zip(_lines, _infos), 1): if inout == "input": d = {"name": "input{}".format(idx)} elif inout == "output": @@ -254,9 +241,7 @@ if __name__ == "__main__": raise RuntimeError( "Format error {}th line in {}: " ' Expecting " ":\n>>> {}'.format( - nutt, info[1], line - ) - ) + nutt, info[1], line)) uttid = sps[0] value = "" else: @@ -274,9 +259,7 @@ if __name__ == "__main__": logging.error( '"{}" is an invalid function ' "for the {} th line in {}: \n>>> {}".format( - info[4], nutt, info[1], line - ) - ) + info[4], nutt, info[1], line)) raise d[key] = value @@ -289,8 +272,11 @@ if __name__ == "__main__": entry.update(lis[0]) entry = json.dumps( - entry, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ") - ) + entry, + indent=4, + ensure_ascii=False, + sort_keys=True, + separators=(",", ": ")) # Add indent indent = " " * 2 entry = ("\n" + indent).join(entry.split("\n")) diff --git a/utils/text2token.py b/utils/text2token.py index 56c39138ffc30cc431576796f9666b599f907295..4b25612ec96e1cfc613a5417941b109a06b41cc6 100755 --- a/utils/text2token.py +++ b/utils/text2token.py @@ -1,9 +1,6 @@ #!/usr/bin/env python3 - # Copyright 2017 Johns Hopkins University (Shinji Watanabe) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - - import argparse import codecs import re @@ -27,28 +24,26 @@ def exist_or_not(i, match_pos): def get_parser(): parser = argparse.ArgumentParser( description="convert raw text to tokenized text", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( "--nchar", "-n", default=1, type=int, help="number of characters to split, i.e., \ - aabb -> a a b b with -n 1 and aa bb with -n 2", - ) + aabb -> a a b b with -n 1 and aa bb with -n 2", ) parser.add_argument( - "--skip-ncols", "-s", default=0, type=int, help="skip first n columns" - ) - parser.add_argument("--space", default="", type=str, help="space symbol") + "--skip-ncols", "-s", default=0, type=int, help="skip first n columns") + parser.add_argument( + "--space", default="", type=str, help="space symbol") parser.add_argument( "--non-lang-syms", "-l", default=None, type=str, - help="list of non-linguistic symobles, e.g., etc.", - ) - parser.add_argument("text", type=str, default=False, nargs="?", help="input text") + help="list of non-linguistic symobles, e.g., etc.", ) + parser.add_argument( + "text", type=str, default=False, nargs="?", help="input text") parser.add_argument( "--trans_type", "-t", @@ -60,8 +55,7 @@ def get_parser(): read from SI1279.WRD file -> "bricks are an alternative" Else if trans_type is phn, read from SI1279.PHN file -> "sil b r ih sil k s aa r er n aa l - sil t er n ih sil t ih v sil" """, - ) + sil t er n ih sil t ih v sil" """, ) return parser @@ -78,17 +72,17 @@ def main(): if args.text: f = codecs.open(args.text, encoding="utf-8") else: - f = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer) + f = codecs.getreader("utf-8")(sys.stdin + if is_python2 else sys.stdin.buffer) - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer - ) + sys.stdout = codecs.getwriter("utf-8")(sys.stdout + if is_python2 else sys.stdout.buffer) line = f.readline() n = args.nchar while line: x = line.split() - print(" ".join(x[: args.skip_ncols]), end=" ") - a = " ".join(x[args.skip_ncols :]) + print(" ".join(x[:args.skip_ncols]), end=" ") + a = " ".join(x[args.skip_ncols:]) # get all matched positions match_pos = [] @@ -118,7 +112,7 @@ def main(): i += 1 a = chars - a = [a[j : j + n] for j in range(0, len(a), n)] + a = [a[j:j + n] for j in range(0, len(a), n)] a_flat = [] for z in a: